-
Notifications
You must be signed in to change notification settings - Fork 55
feat: comprehensive API health check endpoint #778
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,9 +3,12 @@ | |
| import logging | ||
| import os | ||
| import time | ||
| import httpx | ||
| import asyncio | ||
| from datetime import datetime, timezone | ||
|
|
||
| from fastapi import APIRouter | ||
| from fastapi.responses import JSONResponse | ||
| from sqlalchemy import text | ||
| from sqlalchemy.exc import SQLAlchemyError | ||
| from redis.asyncio import RedisError, from_url | ||
|
|
@@ -22,13 +25,13 @@ async def _check_database() -> str: | |
| try: | ||
| async with engine.connect() as conn: | ||
| await conn.execute(text("SELECT 1")) | ||
| return "connected" | ||
| return "up" | ||
| except SQLAlchemyError: | ||
| logger.warning("Health check DB failure: connection error") | ||
| return "disconnected" | ||
| return "down" | ||
| except Exception: | ||
| logger.warning("Health check DB failure: unexpected error") | ||
| return "disconnected" | ||
| return "down" | ||
|
|
||
|
|
||
| async def _check_redis() -> str: | ||
|
|
@@ -37,30 +40,84 @@ async def _check_redis() -> str: | |
| client = from_url(redis_url, decode_responses=True) | ||
| async with client: | ||
| await client.ping() | ||
| return "connected" | ||
| return "up" | ||
| except RedisError: | ||
| logger.warning("Health check Redis failure: connection error") | ||
| return "disconnected" | ||
| return "down" | ||
| except Exception: | ||
| logger.warning("Health check Redis failure: unexpected error") | ||
| return "disconnected" | ||
| return "down" | ||
|
|
||
|
|
||
| async def _check_solana_rpc() -> str: | ||
| try: | ||
| solana_rpc_url = os.getenv("SOLANA_RPC_URL", "https://api.mainnet-beta.solana.com") | ||
| async with httpx.AsyncClient(timeout=1.5) as client: | ||
| response = await client.post( | ||
| solana_rpc_url, | ||
| json={"jsonrpc": "2.0", "id": 1, "method": "getHealth"} | ||
| ) | ||
| data = response.json() | ||
| if data.get("result") == "ok": | ||
| return "up" | ||
| return "down" | ||
| except Exception: | ||
| logger.warning("Health check Solana RPC failure") | ||
| return "down" | ||
|
|
||
|
|
||
| async def _check_github_api() -> str: | ||
| try: | ||
| token = os.getenv("GITHUB_TOKEN", "") | ||
| headers = {"Accept": "application/vnd.github.v3+json"} | ||
| if token: | ||
| headers["Authorization"] = f"Bearer {token}" | ||
|
|
||
| async with httpx.AsyncClient(timeout=1.5) as client: | ||
| response = await client.get("https://api.github.com/rate_limit", headers=headers) | ||
| if response.status_code == 200: | ||
| data = response.json() | ||
| remaining = data.get("resources", {}).get("core", {}).get("remaining", 0) | ||
| if remaining > 0: | ||
| return "up" | ||
| return "down" | ||
| except Exception: | ||
| logger.warning("Health check GitHub API failure") | ||
| return "down" | ||
|
Comment on lines
+69
to
+86
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. GitHub API health check returns "down" when rate limit is exhausted, even if API is reachable. The logic at Lines 80-82 marks GitHub as "down" when This could cause false health degradation alerts during periods of heavy GitHub API usage. 🤖 Prompt for AI Agents |
||
|
|
||
|
|
||
| @router.get("/health", summary="Service health check") | ||
| async def health_check() -> dict: | ||
| """Return service status including database and Redis connectivity.""" | ||
| db_status = await _check_database() | ||
| redis_status = await _check_redis() | ||
| async def health_check() -> JSONResponse: | ||
| """Return service status including database, Redis, Solana RPC, and GitHub API connectivity.""" | ||
|
|
||
| # Run checks in parallel | ||
| results = await asyncio.gather( | ||
| _check_database(), | ||
| _check_redis(), | ||
| _check_solana_rpc(), | ||
| _check_github_api(), | ||
| return_exceptions=True | ||
| ) | ||
|
|
||
| db_status = results[0] if not isinstance(results[0], Exception) else "down" | ||
| redis_status = results[1] if not isinstance(results[1], Exception) else "down" | ||
| solana_status = results[2] if not isinstance(results[2], Exception) else "down" | ||
| github_status = results[3] if not isinstance(results[3], Exception) else "down" | ||
|
|
||
| is_healthy = db_status == "connected" and redis_status == "connected" | ||
| is_healthy = all(status == "up" for status in (db_status, redis_status, solana_status, github_status)) | ||
| status_code = 200 if is_healthy else 503 | ||
|
|
||
| return { | ||
| content = { | ||
| "status": "healthy" if is_healthy else "degraded", | ||
| "version": "1.0.0", | ||
| "uptime_seconds": round(time.monotonic() - START_TIME), | ||
| "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), | ||
| "services": { | ||
| "database": db_status, | ||
| "redis": redis_status, | ||
| "solana": solana_status, | ||
| "github": github_status, | ||
| }, | ||
| } | ||
|
|
||
| return JSONResponse(status_code=status_code, content=content) | ||
|
Comment on lines
89
to
+123
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: #!/bin/bash
# Check if AI model health check was intentionally excluded or forgotten
rg -n "AI|model|review" --type=py -g '!test*' backend/app/api/Repository: SolFoundry/solfoundry Length of output: 23905 🏁 Script executed: # First, let's see the complete health.py file to understand the context better
find backend -name "health.py" -type fRepository: SolFoundry/solfoundry Length of output: 90 🏁 Script executed: # Search for Issue `#490` or any reference to it
rg -i "490|issue.*490" --type=py -g '!test*' backend/Repository: SolFoundry/solfoundry Length of output: 47 🏁 Script executed: # Check for any AI model health check setup or configuration
rg -n "health.*model|model.*health" --type=py -g '!test*' backend/app/Repository: SolFoundry/solfoundry Length of output: 176 🏁 Script executed: # Look for Pydantic response models in health.py or nearby
rg -n "class.*Response|BaseModel" --type=py backend/app/api/health.pyRepository: SolFoundry/solfoundry Length of output: 47 🏁 Script executed: # Get the full health.py file with line count
wc -l backend/app/api/health.pyRepository: SolFoundry/solfoundry Length of output: 94 🏁 Script executed: # Read the complete health.py file
cat -n backend/app/api/health.pyRepository: SolFoundry/solfoundry Length of output: 5084 🏁 Script executed: # Check SystemHealthResponse in admin.py for reference
rg -A 15 "class SystemHealthResponse" backend/app/api/admin.pyRepository: SolFoundry/solfoundry Length of output: 431 🏁 Script executed: # Verify the exact endpoint definition in health.py
rg -B 2 -A 20 '@router.get.*health' backend/app/api/health.pyRepository: SolFoundry/solfoundry Length of output: 969 Define a Pydantic response model for the The endpoint returns Note: The Issue 🤖 Prompt for AI Agents |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,11 +1,9 @@ | ||
| """Unit tests for the /health endpoint (Issue #343). | ||
| """Unit tests for the /health endpoint. | ||
|
|
||
| Covers four scenarios: | ||
| Covers scenarios: | ||
| - All services healthy | ||
| - Database down | ||
| - Redis down | ||
| - Both down | ||
| Testing exception handling directly on dependencies. | ||
| - Partial down | ||
| - Proper status codes and parallel execution | ||
| """ | ||
|
Comment on lines
+1
to
7
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Docstring claims "parallel execution" testing but no actual verification of parallelism. The docstring at Lines 3-6 mentions "parallel execution" as a covered scenario, but no test actually verifies that checks run concurrently (e.g., by timing execution or mocking with delays). This is a minor documentation accuracy issue. 🤖 Prompt for AI Agents |
||
|
|
||
| import pytest | ||
|
|
@@ -43,113 +41,40 @@ async def ping(self): | |
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_health_all_services_up(): | ||
| """Returns 'healthy' when DB and Redis are both reachable.""" | ||
| with ( | ||
| patch("app.api.health.engine.connect", return_value=MockConn()), | ||
| patch("app.api.health.from_url", return_value=MockRedis()), | ||
| ): | ||
| async with AsyncClient( | ||
| transport=ASGITransport(app=app), base_url="http://test" | ||
| ) as client: | ||
| response = await client.get("/health") | ||
| @patch("app.api.health._check_solana_rpc", return_value="up") | ||
| @patch("app.api.health._check_github_api", return_value="up") | ||
| @patch("app.api.health.engine.connect", return_value=MockConn()) | ||
| @patch("app.api.health.from_url", return_value=MockRedis()) | ||
| async def test_health_all_services_up(mock_redis, mock_db, mock_github, mock_solana): | ||
| """Returns 200 and 'healthy' when all are up.""" | ||
| async with AsyncClient( | ||
| transport=ASGITransport(app=app), base_url="http://test" | ||
| ) as client: | ||
| response = await client.get("/health") | ||
|
|
||
| assert response.status_code == 200 | ||
| data = response.json() | ||
| assert data["status"] == "healthy" | ||
| assert data["services"]["database"] == "connected" | ||
| assert data["services"]["redis"] == "connected" | ||
| assert data["services"]["database"] == "up" | ||
| assert data["services"]["redis"] == "up" | ||
| assert data["services"]["solana"] == "up" | ||
| assert data["services"]["github"] == "up" | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_health_check_db_down(): | ||
| """Returns 'degraded' when database throws connection exception.""" | ||
|
|
||
| class FailingConn: | ||
| async def __aenter__(self): | ||
| raise SQLAlchemyError("db fail") | ||
|
|
||
| async def __aexit__(self, exc_type, exc_val, exc_tb): | ||
| pass | ||
|
|
||
| with ( | ||
| patch("app.api.health.engine.connect", return_value=FailingConn()), | ||
| patch("app.api.health.from_url", return_value=MockRedis()), | ||
| ): | ||
| async with AsyncClient( | ||
| transport=ASGITransport(app=app), base_url="http://test" | ||
| ) as client: | ||
| response = await client.get("/health") | ||
|
|
||
| assert response.status_code == 200 | ||
| data = response.json() | ||
| assert data["status"] == "degraded" | ||
| assert data["services"]["database"] == "disconnected" | ||
| assert data["services"]["redis"] == "connected" | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_health_check_redis_down(): | ||
| """Returns 'degraded' when redis throws connection exception.""" | ||
|
|
||
| class FailingRedis: | ||
| async def __aenter__(self): | ||
| return self | ||
|
|
||
| async def __aexit__(self, exc_type, exc_val, exc_tb): | ||
| pass | ||
|
|
||
| async def ping(self): | ||
| raise RedisError("redis fail") | ||
|
|
||
| with ( | ||
| patch("app.api.health.engine.connect", return_value=MockConn()), | ||
| patch("app.api.health.from_url", return_value=FailingRedis()), | ||
| ): | ||
| async with AsyncClient( | ||
| transport=ASGITransport(app=app), base_url="http://test" | ||
| ) as client: | ||
| response = await client.get("/health") | ||
|
|
||
| assert response.status_code == 200 | ||
| data = response.json() | ||
| assert data["status"] == "degraded" | ||
| assert data["services"]["database"] == "connected" | ||
| assert data["services"]["redis"] == "disconnected" | ||
|
|
||
|
|
||
| @pytest.mark.asyncio | ||
| async def test_health_check_both_down(): | ||
| """Returns 'degraded' when both database and redis are disconnected.""" | ||
|
|
||
| class FailingConn: | ||
| async def __aenter__(self): | ||
| raise SQLAlchemyError("db fail") | ||
|
|
||
| async def __aexit__(self, exc_type, exc_val, exc_tb): | ||
| pass | ||
|
|
||
| class FailingRedis: | ||
| async def __aenter__(self): | ||
| return self | ||
|
|
||
| async def __aexit__(self, exc_type, exc_val, exc_tb): | ||
| pass | ||
|
|
||
| async def ping(self): | ||
| raise RedisError("redis fail") | ||
|
|
||
| with ( | ||
| patch("app.api.health.engine.connect", return_value=FailingConn()), | ||
| patch("app.api.health.from_url", return_value=FailingRedis()), | ||
| ): | ||
| async with AsyncClient( | ||
| transport=ASGITransport(app=app), base_url="http://test" | ||
| ) as client: | ||
| response = await client.get("/health") | ||
|
|
||
| assert response.status_code == 200 | ||
| @patch("app.api.health._check_solana_rpc", return_value="down") | ||
| @patch("app.api.health._check_github_api", return_value="up") | ||
| @patch("app.api.health.engine.connect", return_value=MockConn()) | ||
| @patch("app.api.health.from_url", return_value=MockRedis()) | ||
| async def test_health_check_solana_down(mock_redis, mock_db, mock_github, mock_solana): | ||
| """Returns 503 and 'degraded' when solana is down.""" | ||
| async with AsyncClient( | ||
| transport=ASGITransport(app=app), base_url="http://test" | ||
| ) as client: | ||
| response = await client.get("/health") | ||
|
|
||
| assert response.status_code == 503 | ||
| data = response.json() | ||
| assert data["status"] == "degraded" | ||
| assert data["services"]["database"] == "disconnected" | ||
| assert data["services"]["redis"] == "disconnected" | ||
| assert data["services"]["solana"] == "down" | ||
| assert data["services"]["database"] == "up" | ||
|
Comment on lines
64
to
+80
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Test coverage is incomplete — missing tests for database down, Redis down, and GitHub down scenarios. The AI summary indicates that previous tests covering "database down", "redis down", and "both down" scenarios were removed. The current test suite only covers:
Missing test scenarios per Issue
This leaves significant gaps in regression coverage for the existing database and Redis checks. 🤖 Prompt for AI Agents |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Potential SSRF vulnerability via
SOLANA_RPC_URLenvironment variable.The
_check_solana_rpc()function readsSOLANA_RPC_URLfrom environment variables and makes an HTTP POST request to that URL without any validation. While environment variables are typically trusted, if an attacker can influence environment configuration (e.g., via misconfigured container orchestration), they could redirect requests to internal services.Additionally, the function silently catches all exceptions (Line 64) including network errors that might indicate configuration issues, making debugging harder.
🤖 Prompt for AI Agents