diff --git a/README.md b/README.md index ebf39e3..88dce3c 100644 --- a/README.md +++ b/README.md @@ -299,6 +299,41 @@ Grafana model performance example: - Multi-Model ASync Pipeline [example](examples/pipeline/async_preprocess.py) - multiple models - Custom Model [example](examples/custom/readme.md) - custom data +## Health Check Endpoints + +ClearML Serving provides standard health check endpoints for monitoring and orchestration: + +- `GET /health` - Basic service health check + - Returns service status, version, timestamp, and instance ID + - Response example: + ```json + { + "status": "healthy", + "service": "clearml-serving", + "version": "1.5.0", + "timestamp": 1729700000.0, + "instance_id": "a1b2c3d4" + } + ``` + +- `GET /readiness` - Service readiness check + - Verifies if the service is ready to accept traffic + - Checks model loading status and reports GPU availability + - Returns 200 OK when ready, 503 Service Unavailable if not + +- `GET /liveness` - Simple liveness check + - Lightweight endpoint for container orchestration + - Returns 200 OK if the service process is responsive + +- `GET /metrics` - Service metrics + - Returns service metrics as a JSON object (not the Prometheus text format), including: + - Uptime + - Request counts + - Model loading status + - GPU memory usage (if available) + +These endpoints are automatically enabled and require no additional configuration.
+ ### :pray: Status - [x] FastAPI integration for inference service @@ -330,6 +365,7 @@ Grafana model performance example: - [x] Prometheus install instructions - [x] Grafana install instructions - [x] Kubernetes Helm Chart + - [x] Standard health check endpoints (`/health`, `/readiness`, `/liveness`, `/metrics`) - [ ] Intel optimized container (python, numpy, daal, scikit-learn) ## Contributing diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 9f36c0f..bc52f1e 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -3,10 +3,12 @@ import traceback import gzip import asyncio +import time +import uuid from fastapi import FastAPI, Request, Response, APIRouter, HTTPException, Depends from fastapi.routing import APIRoute -from fastapi.responses import PlainTextResponse +from fastapi.responses import PlainTextResponse, JSONResponse from grpc.aio import AioRpcError from http import HTTPStatus @@ -57,6 +59,11 @@ async def custom_route_handler(request: Request) -> Response: # create clearml Task and load models serving_service_task_id, session_logger, instance_id = setup_task() + +# Health check tracking variables +startup_time = time.time() +service_instance_id = str(uuid.uuid4())[:8] + # polling frequency model_sync_frequency_secs = 5 try: @@ -180,6 +187,151 @@ async def process_with_exceptions( return return_value +# ============================================================================ +# HEALTH CHECK ENDPOINTS +# ============================================================================ + +@app.get("/health") +async def health_check(): + """ + Basic health check endpoint. + Returns 200 OK when service is running. 
+    """
+    return JSONResponse(
+        status_code=200,
+        content={
+            "status": "healthy",
+            "service": "clearml-serving",
+            "version": __version__,
+            "timestamp": time.time(),
+            "instance_id": service_instance_id,
+        },
+    )
+
+
+@app.get("/readiness")
+async def readiness_check():
+    """
+    Readiness check endpoint.
+    Returns 200 if ready to serve requests, 503 if not ready.
+    Ready means the processor exists and reports at least one endpoint; GPU state is reported, not required.
+    """
+    global processor
+
+    if not processor:
+        raise HTTPException(
+            status_code=503,
+            detail={
+                "status": "not_ready",
+                "reason": "Processor not initialized",
+                "timestamp": time.time(),
+            },
+        )
+
+    try:
+        # Check if models are loaded
+        models_loaded = processor.get_loaded_endpoints()
+        if not models_loaded:
+            raise HTTPException(
+                status_code=503,
+                detail={
+                    "status": "not_ready",
+                    "reason": "No models loaded",
+                    "timestamp": time.time(),
+                },
+            )
+
+        # Report GPU availability; a missing GPU does not make the service not-ready
+        gpu_available = False
+        try:
+            import torch
+
+            gpu_available = torch.cuda.is_available()
+        except (ImportError, AttributeError, OSError):
+            # torch missing, lacks CUDA support, or its native libraries failed to load
+            pass
+
+        return JSONResponse(
+            status_code=200,
+            content={
+                "status": "ready",
+                "models_loaded": len(models_loaded),
+                "gpu_available": gpu_available,
+                "timestamp": time.time(),
+            },
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=503,
+            detail={
+                "status": "not_ready",
+                "reason": f"Error checking readiness: {str(e)}",
+                "timestamp": time.time(),
+            },
+        ) from e
+
+
+@app.get("/liveness")
+async def liveness_check():
+    """
+    Liveness check endpoint.
+    Lightweight check for container orchestration.
+    Returns 200 OK if process is responsive.
+    """
+    return JSONResponse(
+        status_code=200, content={"status": "alive", "timestamp": time.time()}
+    )
+
+
+@app.get("/metrics")
+async def metrics_endpoint():
+    """
+    Detailed metrics endpoint.
+    Returns a JSON object with uptime, request count, loaded endpoints and GPU memory usage.
+    """
+    global processor
+
+    uptime_seconds = time.time() - startup_time
+
+    metrics = {
+        "uptime_seconds": round(uptime_seconds, 2),
+        "total_requests": 0,
+        "last_prediction_timestamp": None,
+        "models": [],
+    }
+
+    if processor:
+        try:
+            metrics["total_requests"] = processor.get_request_count()
+            metrics["last_prediction_timestamp"] = processor.get_last_prediction_time()
+
+            # Get loaded models info
+            loaded_endpoints = processor.get_loaded_endpoints()
+            for endpoint_name in loaded_endpoints:
+                metrics["models"].append({"endpoint": endpoint_name, "loaded": True})
+        except AttributeError:
+            # If methods don't exist yet, continue with basic metrics
+            pass
+
+    # Try to get GPU metrics
+    try:
+        import pynvml
+
+        pynvml.nvmlInit()
+        try:
+            info = pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(0))
+            metrics["gpu_memory_used_mb"] = round(info.used / 1024 / 1024, 2)
+            metrics["gpu_memory_total_mb"] = round(info.total / 1024 / 1024, 2)
+        finally:
+            pynvml.nvmlShutdown()  # always pair with nvmlInit(), even if a query fails
+    except Exception:
+        # Best effort: pynvml raises NVMLError (not OSError) for a missing driver/GPU; never fail /metrics over it
+        metrics["gpu_memory_used_mb"] = metrics["gpu_memory_total_mb"] = None
+
+    return JSONResponse(status_code=200, content=metrics)
+
 router = APIRouter(
     prefix=f"/{os.environ.get("CLEARML_DEFAULT_SERVE_SUFFIX", "serve")}",
     tags=["models"],
diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py
index 60c256d..2b7b27c 100644
--- a/clearml_serving/serving/model_request_processor.py
+++ b/clearml_serving/serving/model_request_processor.py
@@ -161,6 +161,9 @@ def __init__(
         self._metric_log_freq = None
         self._endpoint_telemetry = {}
         self._enable_endpoint_telemetry = os.environ.get("CLEARML_ENABLE_ENDPOINT_TELEMETRY", "1") != "0"
+        # Health check tracking variables
+        self._request_count = 0
+
self._last_prediction_time = None
     def on_request_endpoint_telemetry(self, base_url=None, version=None):
         try:
@@ -255,6 +258,10 @@ async def process_request(self, base_url: str, version: str, request_body: dict,
         Process request coming in,
         Raise Value error if url does not match existing endpoints
         """
+        # Health metrics: count this request and record its arrival time
+        self._request_count += 1
+        self._last_prediction_time = time()
+
         self._request_processing_state.inc()
         # check if we need to stall
         if self._update_lock_flag:
@@ -1567,3 +1574,23 @@ def _remove_registered_input_model(self, endpoint_url: str) -> bool:
             return False
         return True
+
+    def get_loaded_endpoints(self) -> List[str]:
+        """
+        Return the keys of ``self._endpoints`` as a list (empty when unset or empty).
+        """
+        registered = getattr(self, "_endpoints", None)
+        # A missing, None or empty mapping all yield an empty list.
+        return list(registered.keys()) if registered else []
+
+    def get_request_count(self) -> int:
+        """
+        Return the ``_request_count`` counter, or 0 if the attribute is missing.
+        """
+        return getattr(self, "_request_count", 0)
+
+    def get_last_prediction_time(self) -> Optional[float]:
+        """
+        Return the time stamped by the latest process_request() call, or None if there was none.
+        """
+        return getattr(self, "_last_prediction_time", None)