36 changes: 36 additions & 0 deletions README.md
@@ -299,6 +299,41 @@ Grafana model performance example:
- Multi-Model ASync Pipeline [example](examples/pipeline/async_preprocess.py) - multiple models
- Custom Model [example](examples/custom/readme.md) - custom data

## Health Check Endpoints

ClearML Serving provides standard health check endpoints for monitoring and orchestration:

- `GET /health` - Basic service health check
  - Returns service status, version, and timestamp
  - Response example:
    ```json
    {
      "status": "healthy",
      "service": "clearml-serving",
      "version": "1.5.0",
      "timestamp": 1729700000.0,
      "instance_id": "a1b2c3d4"
    }
    ```

- `GET /readiness` - Service readiness check
  - Verifies that the service is ready to accept traffic
  - Checks model loading status and GPU availability
  - Returns 200 OK when ready, 503 Service Unavailable if not

- `GET /liveness` - Simple liveness check
  - Lightweight endpoint for container orchestration
  - Returns 200 OK if the service process is responsive

- `GET /metrics` - Service metrics
  - Returns a JSON document of service metrics, including:
    - Uptime
    - Request counts
    - Model loading status
    - GPU memory usage (if available)

These endpoints are automatically enabled and require no additional configuration.
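
For a quick smoke test you can poll these endpoints directly. Below is a minimal sketch using the `requests` library; the base URL is an assumption and should point at your serving container (the default docker-compose setup exposes the inference service on port 8080):

```python
# Minimal health-probe sketch; BASE_URL is an assumption -- adjust to your deployment.
import sys

import requests

BASE_URL = "http://localhost:8080"


def check(path: str) -> bool:
    """Return True if the endpoint answers with HTTP 200."""
    try:
        resp = requests.get(f"{BASE_URL}{path}", timeout=5)
    except requests.RequestException as exc:
        print(f"{path}: unreachable ({exc})")
        return False
    # All four endpoints return JSON bodies, including the 503 readiness response
    print(f"{path}: {resp.status_code} {resp.json()}")
    return resp.status_code == 200


if __name__ == "__main__":
    results = [check(p) for p in ("/health", "/readiness", "/liveness", "/metrics")]
    sys.exit(0 if all(results) else 1)
```

In Kubernetes, the same paths can be wired to `httpGet` liveness and readiness probes, so no sidecar or custom probe script is required.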

### :pray: Status

- [x] FastAPI integration for inference service
@@ -330,6 +365,7 @@ Grafana model performance example:
- [x] Prometheus install instructions
- [x] Grafana install instructions
- [x] Kubernetes Helm Chart
- [x] Standard health check endpoints (`/health`, `/readiness`, `/liveness`, `/metrics`)
- [ ] Intel optimized container (python, numpy, daal, scikit-learn)

## Contributing
154 changes: 153 additions & 1 deletion clearml_serving/serving/main.py
@@ -3,10 +3,12 @@
import traceback
import gzip
import asyncio
import time
import uuid

from fastapi import FastAPI, Request, Response, APIRouter, HTTPException, Depends
from fastapi.routing import APIRoute
from fastapi.responses import PlainTextResponse
from fastapi.responses import PlainTextResponse, JSONResponse
from grpc.aio import AioRpcError

from http import HTTPStatus
@@ -57,6 +59,11 @@ async def custom_route_handler(request: Request) -> Response:

# create clearml Task and load models
serving_service_task_id, session_logger, instance_id = setup_task()

# Health check tracking variables
startup_time = time.time()
service_instance_id = str(uuid.uuid4())[:8]

# polling frequency
model_sync_frequency_secs = 5
try:
@@ -180,6 +187,151 @@ async def process_with_exceptions(
    return return_value


# ============================================================================
# HEALTH CHECK ENDPOINTS
# ============================================================================

@app.get("/health")
async def health_check():
"""
Basic health check endpoint.
Returns 200 OK when service is running.
"""
return JSONResponse(
status_code=200,
content={
"status": "healthy",
"service": "clearml-serving",
"version": __version__,
"timestamp": time.time(),
"instance_id": service_instance_id,
},
)


@app.get("/readiness")
async def readiness_check():
"""
Readiness check endpoint.
Returns 200 if ready to serve requests, 503 if not ready.
Checks if ModelRequestProcessor is initialized and models are loaded.
"""
global processor

if not processor:
raise HTTPException(
status_code=503,
detail={
"status": "not_ready",
"reason": "Processor not initialized",
"timestamp": time.time(),
},
)

try:
# Check if models are loaded
models_loaded = processor.get_loaded_endpoints()
if not models_loaded:
raise HTTPException(
status_code=503,
detail={
"status": "not_ready",
"reason": "No models loaded",
"timestamp": time.time(),
},
)

# Check GPU availability if applicable
gpu_available = False
try:
import torch

gpu_available = torch.cuda.is_available()
except (ImportError, ModuleNotFoundError, AttributeError):
# torch not installed or CUDA not available
pass

return JSONResponse(
status_code=200,
content={
"status": "ready",
"models_loaded": len(models_loaded),
"gpu_available": gpu_available,
"timestamp": time.time(),
},
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=503,
detail={
"status": "not_ready",
"reason": f"Error checking readiness: {str(e)}",
"timestamp": time.time(),
},
)


@app.get("/liveness")
async def liveness_check():
"""
Liveness check endpoint.
Lightweight check for container orchestration.
Returns 200 OK if process is responsive.
"""
return JSONResponse(
status_code=200, content={"status": "alive", "timestamp": time.time()}
)


@app.get("/metrics")
async def metrics_endpoint():
"""
Detailed metrics endpoint.
Returns service metrics including uptime, request count, GPU usage, etc.
"""
global processor

uptime_seconds = time.time() - startup_time

metrics = {
"uptime_seconds": round(uptime_seconds, 2),
"total_requests": 0,
"last_prediction_timestamp": None,
"models": [],
}

if processor:
try:
metrics["total_requests"] = processor.get_request_count()
metrics["last_prediction_timestamp"] = processor.get_last_prediction_time()

# Get loaded models info
loaded_endpoints = processor.get_loaded_endpoints()
for endpoint_name in loaded_endpoints:
metrics["models"].append({"endpoint": endpoint_name, "loaded": True})
except AttributeError:
# If methods don't exist yet, continue with basic metrics
pass

# Try to get GPU metrics
try:
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
info = pynvml.nvmlDeviceGetMemoryInfo(handle)
metrics["gpu_memory_used_mb"] = round(info.used / 1024 / 1024, 2)
metrics["gpu_memory_total_mb"] = round(info.total / 1024 / 1024, 2)
pynvml.nvmlShutdown()
except (ImportError, ModuleNotFoundError, AttributeError, OSError):
# GPU metrics not available (pynvml not installed, no GPU, or driver issues)
metrics["gpu_memory_used_mb"] = None
metrics["gpu_memory_total_mb"] = None

return JSONResponse(status_code=200, content=metrics)

router = APIRouter(
    prefix=f"/{os.environ.get('CLEARML_DEFAULT_SERVE_SUFFIX', 'serve')}",
    tags=["models"],
27 changes: 27 additions & 0 deletions clearml_serving/serving/model_request_processor.py
@@ -161,6 +161,9 @@ def __init__(
        self._metric_log_freq = None
        self._endpoint_telemetry = {}
        self._enable_endpoint_telemetry = os.environ.get("CLEARML_ENABLE_ENDPOINT_TELEMETRY", "1") != "0"
        # Health check tracking variables
        self._request_count = 0
        self._last_prediction_time = None

    def on_request_endpoint_telemetry(self, base_url=None, version=None):
        try:
@@ -255,6 +258,10 @@ async def process_request(self, base_url: str, version: str, request_body: dict,
        Process request coming in,
        Raise Value error if url does not match existing endpoints
        """
        # Track request for health metrics
        self._request_count += 1
        self._last_prediction_time = time()

        self._request_processing_state.inc()
        # check if we need to stall
        if self._update_lock_flag:
@@ -1567,3 +1574,23 @@ def _remove_registered_input_model(self, endpoint_url: str) -> bool:
            return False

        return True

    def get_loaded_endpoints(self) -> List[str]:
        """
        Return list of loaded endpoint names for health checks.
        """
        if not hasattr(self, "_endpoints") or not self._endpoints:
            return []
        return list(self._endpoints.keys())

    def get_request_count(self) -> int:
        """
        Return total requests processed for health metrics.
        """
        return getattr(self, "_request_count", 0)

    def get_last_prediction_time(self) -> Optional[float]:
        """
        Return timestamp of last prediction for health metrics.
        """
        return getattr(self, "_last_prediction_time", None)