Skip to content

Commit 48b51a8

Browse files
committed
feat: log debug pod information before deleting pods by agent
1 parent f799bb2 commit 48b51a8

File tree

3 files changed

+77
-3
lines changed

3 files changed

+77
-3
lines changed

clearml_agent/glue/definitions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,8 @@
3030
ENV_DEFAULT_SCHEDULER_QUEUE_TAGS = EnvEntry(
3131
"K8S_GLUE_DEFAULT_SCHEDULER_QUEUE_TAGS", default=["k8s-glue"], converter=shlex.split
3232
)
33+
34+
ENV_LOG_POD_STATUS_BEFORE_DELETING = EnvEntry("K8S_GLUE_LOG_POD_STATUS_BEFORE_DELETING", default=False, converter=bool)
35+
"""
36+
If True, log pod status and exit code before deleting it.
37+
"""

clearml_agent/glue/k8s.py

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
ENV_POD_USE_IMAGE_ENTRYPOINT,
4343
ENV_KUBECTL_IGNORE_ERROR,
4444
ENV_DEFAULT_SCHEDULER_QUEUE_TAGS,
45+
ENV_LOG_POD_STATUS_BEFORE_DELETING,
4546
)
4647
from .._vendor import pyyaml as yaml
4748

@@ -220,6 +221,7 @@ def __init__(
220221
self.ignore_kubectl_errors_re = (
221222
re.compile(ENV_KUBECTL_IGNORE_ERROR.get()) if ENV_KUBECTL_IGNORE_ERROR.get() else None
222223
)
224+
self.log_pod_status_before_deleting = ENV_LOG_POD_STATUS_BEFORE_DELETING.get()
223225

224226
@property
225227
def agent_label(self):
@@ -431,7 +433,7 @@ def get_pods(self, filters: List[str] = None, debug_msg: str = None):
431433
output_config = json.loads(output)
432434
except Exception as ex:
433435
self.log.warning('Failed parsing kubectl output:\n{}\nEx: {}'.format(output, ex))
434-
return
436+
return []
435437
return output_config.get('items', [])
436438

437439
def _get_pod_count(self, extra_labels: List[str] = None, msg: str = None):
@@ -490,7 +492,6 @@ def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_sessio
490492
"task": task_id,
491493
"queue": self.k8s_pending_queue_id,
492494
"status_reason": "k8s pending scheduler",
493-
"update_execution_queue": False,
494495
}
495496
)
496497
if res.ok:
@@ -1037,7 +1038,74 @@ def _process_bash_lines_response(self, bash_cmd: str, raise_error=True):
10371038
]
10381039
return lines
10391040

1041+
def _log_pod_statuses(self, pods: List[Dict]):
1042+
"""
1043+
Log pod status and exit codes to the task's console log.
1044+
:param pods: A list of pod dictionaries to log.
1045+
:param msg: A message to log.
1046+
"""
1047+
if not pods:
1048+
return
1049+
for pod in pods:
1050+
pod_name = get_path(pod, "metadata", "name")
1051+
task_id = pod_name[len(self.pod_name_prefix) :]
1052+
1053+
# Log pod status and exit codes
1054+
pod_status = get_path(pod, "status", "phase")
1055+
ctr_statuses = get_path(pod, "status", "containerStatuses") or []
1056+
1057+
if pod_status.lower() == "succeeded":
1058+
continue
1059+
1060+
log_lines = []
1061+
log_lines.append(
1062+
"Pod '{pod_name}' resulted in status '{pod_status}'".format(
1063+
pod_name=pod_name, pod_status=pod_status
1064+
)
1065+
)
1066+
1067+
for ctr_status in ctr_statuses:
1068+
ctr_name = ctr_status.get("name")
1069+
ctr_state = get_path(ctr_status, "state")
1070+
ctr_reason = get_path(ctr_state, "terminated", "reason")
1071+
ctr_exit_code = get_path(ctr_state, "terminated", "exitCode")
1072+
log_lines.append(
1073+
"Container '{name}' resulted with reason '{reason}' and exit code '{exit_code}'".format(
1074+
name=ctr_name, reason=ctr_reason, exit_code=ctr_exit_code
1075+
)
1076+
)
1077+
1078+
try:
1079+
self.send_logs(task_id, [os.linesep.join(log_lines)])
1080+
except Exception as ex:
1081+
self.log.warning(f"Failed sending pod status logs for task {task_id}: {ex}")
1082+
1083+
def _delete_pods_by_names(self, names: List[str], namespace: str, msg: str = None) -> List[str]:
1084+
if not names:
1085+
return []
1086+
kubectl_cmd = "kubectl delete pod --namespace={ns} {names} --output=name".format(
1087+
ns=namespace, names=" ".join(names)
1088+
)
1089+
self.log.debug("Deleting pods by name {}: {}".format(
1090+
msg or "", kubectl_cmd
1091+
))
1092+
lines = self._process_bash_lines_response(kubectl_cmd)
1093+
self.log.debug(" - deleted pods by name %s", ", ".join(lines))
1094+
return lines
1095+
10401096
def _delete_pods(self, selectors: List[str], namespace: str, msg: str = None) -> List[str]:
1097+
if self.log_pod_status_before_deleting:
1098+
pods_to_delete = self.get_pods(
1099+
filters=selectors,
1100+
debug_msg="Getting pods to delete: {cmd}",
1101+
)
1102+
self._log_pod_statuses(pods_to_delete)
1103+
return self._delete_pods_by_names(
1104+
[get_path(p, "metadata", "name") for p in pods_to_delete if get_path(p, "metadata", "name")],
1105+
namespace,
1106+
msg=msg
1107+
)
1108+
10411109
kubectl_cmd = \
10421110
"kubectl delete pod -l={agent_label} " \
10431111
"--namespace={namespace} --field-selector={selector} --output name".format(
@@ -1089,6 +1157,8 @@ def _delete_completed_or_failed_pods(self, namespace, msg: str = None):
10891157
debug_msg="Deleting failed pods: {cmd}"
10901158
)
10911159
if failed_pods:
1160+
if self.log_pod_status_before_deleting:
1161+
self._log_pod_statuses(failed_pods)
10921162
job_names_to_delete = {
10931163
get_path(pod, "metadata", "labels", "job-name"): get_path(pod, "metadata", "namespace")
10941164
for pod in failed_pods

clearml_agent/helper/docker_args.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,6 @@ class CustomTemplate(Template):
294294
${TASK.hyperparams.properties.user_key.value}
295295
"""
296296

297-
idpattern = r'(?a:[_a-z][_a-z0-9|.|:]*)'
298297
prefix = "CLEARML_"
299298
queue_id_to_name_map = {}
300299

0 commit comments

Comments
 (0)