36 changes: 36 additions & 0 deletions README.md
@@ -299,6 +299,41 @@ Grafana model performance example:
- Multi-Model ASync Pipeline [example](examples/pipeline/async_preprocess.py) - multiple models
- Custom Model [example](examples/custom/readme.md) - custom data

## Health Check Endpoints

ClearML Serving provides standard health check endpoints for monitoring and orchestration:

- `GET /health` - Basic service health check
  - Returns service status, version, and timestamp
  - Response example:
    ```json
    {
      "status": "healthy",
      "service": "clearml-serving",
      "version": "1.5.0",
      "timestamp": 1729700000.0,
      "instance_id": "a1b2c3d4"
    }
    ```

- `GET /readiness` - Service readiness check
  - Verifies that the service is ready to accept traffic
  - Checks model loading status and GPU availability
  - Returns 200 OK when ready, 503 Service Unavailable if not

- `GET /liveness` - Simple liveness check
  - Lightweight endpoint for container orchestration
  - Returns 200 OK if the service process is responsive

- `GET /metrics` - Service metrics
  - Returns a JSON document of service metrics, including:
    - Uptime
    - Request counts
    - Model loading status
    - GPU memory usage (if available)

These endpoints are automatically enabled and require no additional configuration.
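
For a quick smoke test you can poll these endpoints directly. Below is a minimal sketch using the `requests` library; the base URL is an assumption and should point at your serving container (the default docker-compose setup exposes the inference service on port 8080):

```python
# Minimal health-probe sketch; BASE_URL is an assumption -- adjust to your deployment.
import sys

import requests

BASE_URL = "http://localhost:8080"


def check(path: str) -> bool:
    """Return True if the endpoint answers with HTTP 200."""
    try:
        resp = requests.get(f"{BASE_URL}{path}", timeout=5)
    except requests.RequestException as exc:
        print(f"{path}: unreachable ({exc})")
        return False
    # All four endpoints return JSON bodies, including the 503 readiness response
    print(f"{path}: {resp.status_code} {resp.json()}")
    return resp.status_code == 200


if __name__ == "__main__":
    results = [check(p) for p in ("/health", "/readiness", "/liveness", "/metrics")]
    sys.exit(0 if all(results) else 1)
```

In Kubernetes, the same paths can be wired to `httpGet` liveness and readiness probes, so no sidecar or custom probe script is required.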

### :pray: Status

- [x] FastAPI integration for inference service
@@ -330,6 +365,7 @@ Grafana model performance example:
- [x] Prometheus install instructions
- [x] Grafana install instructions
- [x] Kubernetes Helm Chart
- [x] Standard health check endpoints (`/health`, `/readiness`, `/liveness`, `/metrics`)
- [ ] Intel optimized container (python, numpy, daal, scikit-learn)

## Contributing
154 changes: 153 additions & 1 deletion clearml_serving/serving/main.py
@@ -3,10 +3,12 @@
import traceback
import gzip
import asyncio
import time
import uuid

from fastapi import FastAPI, Request, Response, APIRouter, HTTPException, Depends
from fastapi.routing import APIRoute
from fastapi.responses import PlainTextResponse
from fastapi.responses import PlainTextResponse, JSONResponse
from grpc.aio import AioRpcError

from http import HTTPStatus
@@ -57,6 +59,11 @@ async def custom_route_handler(request: Request) -> Response:

# create clearml Task and load models
serving_service_task_id, session_logger, instance_id = setup_task()

# Health check tracking variables
startup_time = time.time()
service_instance_id = str(uuid.uuid4())[:8]

# polling frequency
model_sync_frequency_secs = 5
try:
@@ -180,6 +187,151 @@ async def process_with_exceptions(
    return return_value


# ============================================================================
# HEALTH CHECK ENDPOINTS
# ============================================================================

@app.get("/health")
async def health_check():
"""
Basic health check endpoint.
Returns 200 OK when service is running.
"""
return JSONResponse(
status_code=200,
content={
"status": "healthy",
"service": "clearml-serving",
"version": __version__,
"timestamp": time.time(),
"instance_id": service_instance_id,
},
)


@app.get("/readiness")
async def readiness_check():
"""
Readiness check endpoint.
Returns 200 if ready to serve requests, 503 if not ready.
Checks if ModelRequestProcessor is initialized and models are loaded.
"""
global processor

if not processor:
raise HTTPException(
status_code=503,
detail={
"status": "not_ready",
"reason": "Processor not initialized",
"timestamp": time.time(),
},
)

try:
# Check if models are loaded
models_loaded = processor.get_loaded_endpoints()
if not models_loaded:
raise HTTPException(
status_code=503,
detail={
"status": "not_ready",
"reason": "No models loaded",
"timestamp": time.time(),
},
)

# Check GPU availability if applicable
gpu_available = False
try:
import torch

gpu_available = torch.cuda.is_available()
except (ImportError, ModuleNotFoundError, AttributeError):
# torch not installed or CUDA not available
pass

return JSONResponse(
status_code=200,
content={
"status": "ready",
"models_loaded": len(models_loaded),
"gpu_available": gpu_available,
"timestamp": time.time(),
},
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=503,
detail={
"status": "not_ready",
"reason": f"Error checking readiness: {str(e)}",
"timestamp": time.time(),
},
)


@app.get("/liveness")
async def liveness_check():
"""
Liveness check endpoint.
Lightweight check for container orchestration.
Returns 200 OK if process is responsive.
"""
return JSONResponse(
status_code=200, content={"status": "alive", "timestamp": time.time()}
)


@app.get("/metrics")
async def metrics_endpoint():
"""
Detailed metrics endpoint.
Returns service metrics including uptime, request count, GPU usage, etc.
"""
global processor

uptime_seconds = time.time() - startup_time

metrics = {
"uptime_seconds": round(uptime_seconds, 2),
"total_requests": 0,
"last_prediction_timestamp": None,
"models": [],
}

if processor:
try:
metrics["total_requests"] = processor.get_request_count()
metrics["last_prediction_timestamp"] = processor.get_last_prediction_time()

# Get loaded models info
loaded_endpoints = processor.get_loaded_endpoints()
for endpoint_name in loaded_endpoints:
metrics["models"].append({"endpoint": endpoint_name, "loaded": True})
except AttributeError:
# If methods don't exist yet, continue with basic metrics
pass

# Try to get GPU metrics
try:
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
metrics["gpu_memory_used_mb"] = round(info.used / 1024 / 1024, 2)
metrics["gpu_memory_total_mb"] = round(info.total / 1024 / 1024, 2)
pynvml.nvmlShutdown()
except (ImportError, ModuleNotFoundError, AttributeError, OSError):
# GPU metrics not available (pynvml not installed, no GPU, or driver issues)
metrics["gpu_memory_used_mb"] = None
metrics["gpu_memory_total_mb"] = None

return JSONResponse(status_code=200, content=metrics)

router = APIRouter(
    prefix=f"/{os.environ.get('CLEARML_DEFAULT_SERVE_SUFFIX', 'serve')}",
    tags=["models"],
27 changes: 27 additions & 0 deletions clearml_serving/serving/model_request_processor.py
@@ -161,6 +161,9 @@ def __init__(
        self._metric_log_freq = None
        self._endpoint_telemetry = {}
        self._enable_endpoint_telemetry = os.environ.get("CLEARML_ENABLE_ENDPOINT_TELEMETRY", "1") != "0"
        # Health check tracking variables
        self._request_count = 0
        self._last_prediction_time = None

    def on_request_endpoint_telemetry(self, base_url=None, version=None):
        try:
@@ -255,6 +258,10 @@ async def process_request(self, base_url: str, version: str, request_body: dict,
        Process request coming in,
        Raise Value error if url does not match existing endpoints
        """
        # Track request for health metrics
        self._request_count += 1
        self._last_prediction_time = time()

        self._request_processing_state.inc()
        # check if we need to stall
        if self._update_lock_flag:
@@ -1567,3 +1574,23 @@ def _remove_registered_input_model(self, endpoint_url: str) -> bool:
            return False

        return True

    def get_loaded_endpoints(self) -> List[str]:
        """
        Return list of loaded endpoint names for health checks.
        """
        if not hasattr(self, "_endpoints") or not self._endpoints:
            return []
        return list(self._endpoints.keys())

    def get_request_count(self) -> int:
        """
        Return total requests processed for health metrics.
        """
        return getattr(self, "_request_count", 0)

    def get_last_prediction_time(self) -> Optional[float]:
        """
        Return timestamp of last prediction for health metrics.
        """
        return getattr(self, "_last_prediction_time", None)