Skip to content

Commit

Permalink
Add service and instance annotations to tron pods (#3967)
Browse files Browse the repository at this point in the history
We can currently only figure out what service/instance a log belongs to
by looking at the k8s labels for the emitting pod, but label values are
quite limited in length and we've got some pretty large job and/or
action names, which means that the instance label for a large chunk of
tronjobs ends up getting truncated.

Solution: annotations! these have a significantly higher limit (256kb)
and they can still be read by our otel collector - the only downside is
that annotations cannot be used for filtering, but that's fine :)
  • Loading branch information
nemacysts authored Sep 25, 2024
1 parent d403b82 commit 1fb6560
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
9 changes: 7 additions & 2 deletions paasta_tools/tron_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,10 +1000,15 @@ def format_tron_action_dict(action_config: TronActionConfig):
"app.kubernetes.io/managed-by": "tron",
}

# we can hardcode this for now as batches really shouldn't
# need routable IPs and we know that Spark probably does.
result["annotations"] = {
# we can hardcode this for now as batches really shouldn't
# need routable IPs and we know that Spark does.
"paasta.yelp.com/routable_ip": "true" if executor == "spark" else "false",
# we have a large amount of tron pods whose instance names are too long for a k8s label
# ...so let's toss them into an annotation so that tooling can read them (since the length
# limit is much higher (256kb))
"paasta.yelp.com/service": action_config.get_service(),
"paasta.yelp.com/instance": action_config.get_instance(),
}

result["labels"]["yelp.com/owner"] = "compute_infra_platform_experience"
Expand Down
20 changes: 17 additions & 3 deletions tests/test_tron_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,11 @@ def test_format_tron_action_dict_paasta(self):
"yelp.com/owner": "compute_infra_platform_experience",
"app.kubernetes.io/managed-by": "tron",
},
"annotations": {"paasta.yelp.com/routable_ip": "false"},
"annotations": {
"paasta.yelp.com/routable_ip": "false",
"paasta.yelp.com/service": "my_service",
"paasta.yelp.com/instance": "my_job.do_something",
},
"cap_drop": CAPS_DROP,
"cap_add": [],
"secret_env": {},
Expand Down Expand Up @@ -1374,6 +1378,8 @@ def test_format_tron_action_dict_spark(
},
"annotations": {
"paasta.yelp.com/routable_ip": "true",
"paasta.yelp.com/service": "my_service",
"paasta.yelp.com/instance": "my_job.do_something",
"prometheus.io/port": "39091",
"prometheus.io/path": "/metrics/prometheus",
},
Expand Down Expand Up @@ -1463,6 +1469,8 @@ def test_format_tron_action_dict_paasta_k8s_service_account(self):
},
"annotations": {
"paasta.yelp.com/routable_ip": "false",
"paasta.yelp.com/service": "my_service",
"paasta.yelp.com/instance": "job_name.instance_name",
},
"node_selectors": {"yelp.com/pool": "default"},
"env": mock.ANY,
Expand Down Expand Up @@ -1601,6 +1609,8 @@ def test_format_tron_action_dict_paasta_k8s(
},
"annotations": {
"paasta.yelp.com/routable_ip": "false",
"paasta.yelp.com/service": "my_service",
"paasta.yelp.com/instance": instance_name,
},
"node_selectors": {"yelp.com/pool": "special_pool"},
"node_affinities": [
Expand Down Expand Up @@ -1736,7 +1746,11 @@ def test_format_tron_action_dict_paasta_no_branch_dict(self):
"yelp.com/owner": "compute_infra_platform_experience",
"app.kubernetes.io/managed-by": "tron",
},
"annotations": {"paasta.yelp.com/routable_ip": "false"},
"annotations": {
"paasta.yelp.com/routable_ip": "false",
"paasta.yelp.com/service": "my_service",
"paasta.yelp.com/instance": "my_job.do_something",
},
"cap_drop": CAPS_DROP,
"cap_add": [],
"secret_env": {},
Expand Down Expand Up @@ -1882,7 +1896,7 @@ def test_create_complete_config_e2e(self, tmpdir):
# that are not static, this will cause continuous reconfiguration, which
# will add significant load to the Tron API, which happened in DAR-1461.
# but if this is intended, just change the hash.
assert hasher.hexdigest() == "26dae1d70ae0b937706e3de597cc07e8"
assert hasher.hexdigest() == "0585c2049596190f5510f8ce4fc3a9ac"

def test_override_default_pool_override(self, tmpdir):
soa_dir = tmpdir.mkdir("test_create_complete_config_soa")
Expand Down

0 comments on commit 1fb6560

Please sign in to comment.