@@ -342,10 +342,15 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type,
342342 if container_obj is not None :
343343 out_logs , err_logs = None , None
344344 try :
345- out_logs = container_obj .logs (stdout = True , stderr = False , stream = False , follow = False ,
346- since = last_log_time )
347- err_logs = container_obj .logs (stdout = False , stderr = True , stream = False , follow = False ,
348- since = last_log_time )
345+ if container_obj .status == "exited" :
346+ # The container has exited, so fetch its complete logs rather than only those since last_log_time
347+ out_logs = container_obj .logs (stdout = True , stderr = False , stream = False , follow = False )
348+ err_logs = container_obj .logs (stdout = False , stderr = True , stream = False , follow = False )
349+ else :
350+ out_logs = container_obj .logs (stdout = True , stderr = False , stream = False , follow = False ,
351+ since = last_log_time )
352+ err_logs = container_obj .logs (stdout = False , stderr = True , stream = False , follow = False ,
353+ since = last_log_time )
349354 except Exception as e :
350355 logging .error (f"Failed to get the logs from the container with exception { e } " )
351356 pass
@@ -355,16 +360,29 @@ def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type,
355360 if err_logs is not None :
356361 err_logs = sys_utils .decode_our_err_result (err_logs )
357362 if len (err_logs ) > 0 :
358- logging .error (f"{ format (err_logs )} " )
363+ logging .error (f"[-- Container Error Logs Start --] \n { format (err_logs )} \n [-- Container Error Logs End --] " )
359364
360365 if out_logs is not None :
361366 out_logs = sys_utils .decode_our_err_result (out_logs )
362367 if len (out_logs ) > 0 :
363- logging .info (f"{ format (out_logs )} " )
368+ logging .info (f"[-- Container Stdout Logs Start --] \n { format (out_logs )} \n [-- Container Stdout Logs End --] " )
364369
365370 if container_obj .status == "exited" :
366371 logging .info ("Container {} has exited, automatically remove it" .format (cmd_container_name ))
367372
373+ # Fall back to reading the container's JSON log file from the filesystem if either log stream is missing
374+ if out_logs is None or err_logs is None :
375+ try :
376+ logs_path = f"/var/lib/docker/containers/{ container_obj .id } /{ container_obj .id } -json.log"
377+ if os .path .exists (logs_path ):
378+ with open (logs_path , 'r' ) as f :
379+ raw_logs = f .readlines ()
380+ out_logs = '\n ' .join ([line for line in raw_logs if '"stream":"stdout"' in line ])
381+ err_logs = '\n ' .join ([line for line in raw_logs if '"stream":"stderr"' in line ])
382+ logging .error (f"read Container Error Logs from log file: { err_logs } " )
383+ except Exception as e :
384+ logging .warning (f"Failed to read logs from filesystem: { str (e )} " )
385+
368386 # Save the failed log into ~/.fedml/fedml-model-client/fedml/logs/failed_logs/
369387 # $run_id/$container_name.log
370388 try :
0 commit comments