Skip to content

Commit b28dc4b

Browse files
committed
Add startup health module and readiness endpoint from PR #434
- Add services/startup_health.py with health check functions: - get_startup_status(): Returns current startup status - readiness_under_auth_context(): Validates tenant DB under auth context - run_startup_health_routine(): Runs all startup health checks - Add /health/readiness endpoint for tenant DB validation - Update startup_event() to use run_startup_health_routine() - Add raise to startup_event to fail fast on errors
1 parent 70d3677 commit b28dc4b

2 files changed

Lines changed: 235 additions & 6 deletions

File tree

backend/app.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,12 @@
9898
from api.content_planning.strategy_copilot import router as strategy_copilot_router
9999

100100
# Import database service
101-
from services.database import init_database, close_database
101+
from services.database import close_database
102+
from services.startup_health import (
103+
get_startup_status,
104+
readiness_under_auth_context,
105+
run_startup_health_routine,
106+
)
102107

103108
# Trigger reload for monitoring fix
104109

@@ -213,6 +218,14 @@ async def comprehensive_health():
213218
"""Comprehensive health check endpoint."""
214219
return health_checker.comprehensive_health_check()
215220

221+
@app.get("/health/readiness")
222+
async def readiness(current_user: dict = Depends(get_current_user)):
223+
"""Readiness check that validates tenant DB resolution/session under auth context."""
224+
return {
225+
"startup": get_startup_status(),
226+
"tenant": readiness_under_auth_context(current_user),
227+
}
228+
216229
# Rate limiting management endpoints
217230
@app.get("/api/rate-limit/status")
218231
async def rate_limit_status(request: Request):
@@ -449,23 +462,25 @@ async def serve_frontend():
449462
async def startup_event():
450463
"""Initialize services on startup."""
451464
try:
452-
# Initialize database
453-
init_database()
454-
465+
startup_report = run_startup_health_routine()
466+
if startup_report.get("status") != "healthy":
467+
logger.error(f"Startup readiness finished with failures: {startup_report.get('errors', [])}")
468+
455469
# Start task scheduler
456470
from services.scheduler import get_scheduler
457471
await get_scheduler().start()
458-
472+
459473
# Check Wix API key configuration
460474
wix_api_key = os.getenv('WIX_API_KEY')
461475
if wix_api_key:
462476
logger.warning(f"✅ WIX_API_KEY loaded ({len(wix_api_key)} chars, starts with '{wix_api_key[:10]}...')")
463477
else:
464478
logger.warning("⚠️ WIX_API_KEY not found in environment - Wix publishing may fail")
465-
479+
466480
logger.info("ALwrity backend started successfully")
467481
except Exception as e:
468482
logger.error(f"Error during startup: {e}")
483+
raise
469484

470485
# Shutdown event
471486
@app.on_event("shutdown")

backend/services/startup_health.py

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
import os
2+
from datetime import datetime, timezone
3+
from pathlib import Path
4+
from typing import Any, Dict, List, Optional
5+
6+
from loguru import logger
7+
from sqlalchemy import inspect, text
8+
9+
from services.database import (
10+
WORKSPACE_DIR,
11+
get_all_user_ids,
12+
get_engine_for_user,
13+
get_session_for_user,
14+
get_user_db_path,
15+
init_database,
16+
default_engine,
17+
)
18+
19+
_REQUIRED_SCHEMA: Dict[str, List[str]] = {
20+
"onboarding_sessions": ["id", "user_id", "updated_at"],
21+
"daily_workflow_plans": ["id", "user_id", "generation_mode", "fallback_used"],
22+
}
23+
24+
_STARTUP_STATUS: Dict[str, Any] = {
25+
"status": "unknown",
26+
"mode": "multi_tenant" if default_engine is None else "single_tenant",
27+
"checks": [],
28+
"errors": [],
29+
"warnings": [],
30+
"checked_at": None,
31+
}
32+
33+
34+
def _env_true(name: str, default: bool = False) -> bool:
35+
raw = os.getenv(name)
36+
if raw is None:
37+
return default
38+
return raw.strip().lower() in {"1", "true", "yes", "y", "on"}
39+
40+
41+
def should_fail_fast() -> bool:
42+
if os.getenv("ALWRITY_FAIL_FAST_STARTUP") is not None:
43+
return _env_true("ALWRITY_FAIL_FAST_STARTUP", default=False)
44+
app_env = os.getenv("APP_ENV", os.getenv("ENV", "")).strip().lower()
45+
return app_env in {"prod", "production"}
46+
47+
48+
def _record_check(checks: List[Dict[str, Any]], name: str, ok: bool, detail: str) -> None:
49+
checks.append({"name": name, "ok": ok, "detail": detail})
50+
51+
52+
def _check_workspace_root(checks: List[Dict[str, Any]], errors: List[str]) -> None:
53+
workspace = Path(WORKSPACE_DIR)
54+
if not workspace.exists():
55+
errors.append(f"Workspace root does not exist: {workspace}")
56+
_record_check(checks, "workspace_root_exists", False, str(workspace))
57+
return
58+
59+
_record_check(checks, "workspace_root_exists", True, str(workspace))
60+
61+
if not os.access(workspace, os.W_OK):
62+
errors.append(f"Workspace root is not writable: {workspace}")
63+
_record_check(checks, "workspace_root_writable", False, str(workspace))
64+
return
65+
66+
probe_file = workspace / ".startup_health_write_probe"
67+
try:
68+
probe_file.write_text("ok", encoding="utf-8")
69+
probe_file.unlink(missing_ok=True)
70+
_record_check(checks, "workspace_root_writable", True, "write probe passed")
71+
except Exception as exc:
72+
errors.append(f"Workspace root write probe failed: {exc}")
73+
_record_check(checks, "workspace_root_writable", False, f"write probe failed: {exc}")
74+
75+
76+
def _check_schema_for_user(user_id: str, checks: List[Dict[str, Any]], errors: List[str]) -> None:
77+
engine = get_engine_for_user(user_id)
78+
inspector = inspect(engine)
79+
80+
for table, columns in _REQUIRED_SCHEMA.items():
81+
if not inspector.has_table(table):
82+
errors.append(f"Missing required table '{table}' in tenant DB for user '{user_id}'")
83+
_record_check(checks, f"schema_{table}", False, f"table missing for {user_id}")
84+
continue
85+
86+
existing_columns = {col["name"] for col in inspector.get_columns(table)}
87+
missing_columns = [col for col in columns if col not in existing_columns]
88+
if missing_columns:
89+
errors.append(
90+
f"Missing required columns in '{table}' for user '{user_id}': {', '.join(missing_columns)}"
91+
)
92+
_record_check(
93+
checks,
94+
f"schema_{table}",
95+
False,
96+
f"missing columns for {user_id}: {', '.join(missing_columns)}",
97+
)
98+
else:
99+
_record_check(checks, f"schema_{table}", True, f"schema ok for {user_id}")
100+
101+
102+
def _check_db_access(checks: List[Dict[str, Any]], errors: List[str], warnings: List[str]) -> Optional[str]:
103+
if default_engine is not None:
104+
try:
105+
init_database()
106+
with default_engine.connect() as conn:
107+
conn.execute(text("SELECT 1"))
108+
_record_check(checks, "single_tenant_db_connectivity", True, "SELECT 1 succeeded")
109+
return "single_tenant"
110+
except Exception as exc:
111+
errors.append(f"Single-tenant database check failed: {exc}")
112+
_record_check(checks, "single_tenant_db_connectivity", False, str(exc))
113+
return None
114+
115+
user_ids = get_all_user_ids()
116+
candidate_user = user_ids[0] if user_ids else "startup_synthetic"
117+
118+
try:
119+
db_path = get_user_db_path(candidate_user)
120+
_record_check(checks, "tenant_db_path_resolution", True, f"{candidate_user} -> {db_path}")
121+
except Exception as exc:
122+
errors.append(f"Tenant DB path resolution failed: {exc}")
123+
_record_check(checks, "tenant_db_path_resolution", False, str(exc))
124+
return None
125+
126+
try:
127+
session = get_session_for_user(candidate_user)
128+
if not session:
129+
raise RuntimeError("session creation returned None")
130+
session.execute(text("SELECT 1"))
131+
_record_check(checks, "tenant_session_create", True, f"session opened for {candidate_user}")
132+
session.close()
133+
except Exception as exc:
134+
errors.append(f"Tenant DB open/create check failed for '{candidate_user}': {exc}")
135+
_record_check(checks, "tenant_session_create", False, str(exc))
136+
return None
137+
138+
if not user_ids:
139+
warnings.append(
140+
"No existing tenant workspace found during startup; synthetic tenant DB path was used for readiness validation."
141+
)
142+
143+
_check_schema_for_user(candidate_user, checks, errors)
144+
return candidate_user
145+
146+
147+
def run_startup_health_routine() -> Dict[str, Any]:
148+
checks: List[Dict[str, Any]] = []
149+
errors: List[str] = []
150+
warnings: List[str] = []
151+
152+
_check_workspace_root(checks, errors)
153+
if not errors:
154+
_check_db_access(checks, errors, warnings)
155+
156+
status = "healthy" if not errors else "failed"
157+
report = {
158+
"status": status,
159+
"mode": "multi_tenant" if default_engine is None else "single_tenant",
160+
"checks": checks,
161+
"errors": errors,
162+
"warnings": warnings,
163+
"checked_at": datetime.now(timezone.utc).isoformat(),
164+
}
165+
166+
_STARTUP_STATUS.update(report)
167+
168+
if errors:
169+
for message in errors:
170+
logger.error(f"Startup readiness check failed: {message}")
171+
for warning in warnings:
172+
logger.warning(f"Startup readiness warning: {warning}")
173+
174+
if errors and should_fail_fast():
175+
raise RuntimeError("Startup readiness checks failed")
176+
177+
return report
178+
179+
180+
def get_startup_status() -> Dict[str, Any]:
181+
return dict(_STARTUP_STATUS)
182+
183+
184+
def readiness_under_auth_context(current_user: Dict[str, Any]) -> Dict[str, Any]:
185+
user_id = (current_user or {}).get("id") or (current_user or {}).get("clerk_user_id")
186+
if not user_id:
187+
return {
188+
"ready": False,
189+
"reason": "missing_user_context",
190+
"detail": "No authenticated user id was provided in auth context.",
191+
}
192+
193+
try:
194+
db_path = get_user_db_path(user_id)
195+
session = get_session_for_user(user_id)
196+
if not session:
197+
raise RuntimeError("Session creation returned None")
198+
session.execute(text("SELECT 1"))
199+
session.close()
200+
return {
201+
"ready": True,
202+
"user_id": user_id,
203+
"tenant_db_path": db_path,
204+
"db_session": "ok",
205+
}
206+
except Exception as exc:
207+
logger.error(f"Readiness auth-context DB check failed for user '{user_id}': {exc}")
208+
return {
209+
"ready": False,
210+
"user_id": user_id,
211+
"tenant_db_path": get_user_db_path(user_id),
212+
"db_session": "failed",
213+
"reason": str(exc),
214+
}

0 commit comments

Comments
 (0)