@app.get("/health/readiness")
async def readiness(current_user: dict = Depends(get_current_user)):
    """Report startup health plus tenant DB readiness for the authenticated user.

    Args:
        current_user: Auth context resolved by the ``get_current_user``
            dependency; passed through unchanged to the tenant probe.

    Returns:
        A dict with two keys: ``"startup"`` (the report cached by the startup
        health routine) and ``"tenant"`` (the result of the per-user DB
        path/session probe).
    """
    # readiness_under_auth_context() opens a synchronous DB session and runs a
    # query. Calling it directly inside an ``async def`` handler would block the
    # event loop for every other request, so it is pushed to the worker pool.
    # Imported locally so this handler does not depend on a new file-level import.
    from fastapi.concurrency import run_in_threadpool

    tenant_report = await run_in_threadpool(readiness_under_auth_context, current_user)

    # NOTE(review): the endpoint still answers HTTP 200 when the tenant probe
    # reports ``ready: False``; callers must inspect the body. Confirm whether
    # a 503 is wanted before changing the status code.
    return {
        "startup": get_startup_status(),
        "tenant": tenant_report,
    }
@app.get("/health/readiness")
async def readiness(current_user: dict = Depends(get_current_user)):
    """Report startup health plus tenant DB readiness for the authenticated user.

    Args:
        current_user: Auth context resolved by the ``get_current_user``
            dependency; passed through unchanged to the tenant probe.

    Returns:
        A dict with two keys: ``"startup"`` (the report cached by the startup
        health routine) and ``"tenant"`` (the result of the per-user DB
        path/session probe).
    """
    # readiness_under_auth_context() opens a synchronous DB session and runs a
    # query. Calling it directly inside an ``async def`` handler would block the
    # event loop for every other request, so it is pushed to the worker pool.
    # Imported locally so this handler does not depend on a new file-level import.
    from fastapi.concurrency import run_in_threadpool

    tenant_report = await run_in_threadpool(readiness_under_auth_context, current_user)

    # NOTE(review): the endpoint still answers HTTP 200 when the tenant probe
    # reports ``ready: False``; callers must inspect the body. Confirm whether
    # a 503 is wanted before changing the status code.
    return {
        "startup": get_startup_status(),
        "tenant": tenant_report,
    }
"""Startup health routine and per-tenant readiness probes.

``run_startup_health_routine`` validates the workspace root, database access
and the required schema, caches the resulting report, and optionally raises
when fail-fast is enabled. ``readiness_under_auth_context`` repeats the DB
path/session probe for one authenticated user.
"""

import copy
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

from loguru import logger
from sqlalchemy import inspect, text

from services.database import (
    WORKSPACE_DIR,
    get_all_user_ids,
    get_engine_for_user,
    get_session_for_user,
    get_user_db_path,
    init_database,
    default_engine,
)

# Tables (and the columns within them) that must exist in a tenant database.
_REQUIRED_SCHEMA: Dict[str, List[str]] = {
    "onboarding_sessions": ["id", "user_id", "updated_at"],
    "daily_workflow_plans": ["id", "user_id", "generation_mode", "fallback_used"],
}

# Last report produced by run_startup_health_routine(); "unknown" until it runs.
_STARTUP_STATUS: Dict[str, Any] = {
    "status": "unknown",
    "mode": "multi_tenant" if default_engine is None else "single_tenant",
    "checks": [],
    "errors": [],
    "warnings": [],
    "checked_at": None,
}


def _env_true(name: str, default: bool = False) -> bool:
    """Return True when env var *name* holds a truthy token, *default* if unset."""
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in {"1", "true", "yes", "y", "on"}


def should_fail_fast() -> bool:
    """Decide whether failed startup checks must abort startup.

    ``ALWRITY_FAIL_FAST_STARTUP`` wins when it is set. Otherwise fail-fast is
    on only when the environment name (``APP_ENV``, then ``ENV``) is
    ``prod``/``production``.
    """
    if os.getenv("ALWRITY_FAIL_FAST_STARTUP") is not None:
        return _env_true("ALWRITY_FAIL_FAST_STARTUP", default=False)

    # Take the first variable with a non-blank value, so an empty APP_ENV
    # does not hide a populated ENV.
    for name in ("APP_ENV", "ENV"):
        app_env = (os.getenv(name) or "").strip().lower()
        if app_env:
            return app_env in {"prod", "production"}
    return False


def _record_check(checks: List[Dict[str, Any]], name: str, ok: bool, detail: str) -> None:
    """Append one check result entry to *checks*."""
    checks.append({"name": name, "ok": ok, "detail": detail})


def _close_session_quietly(session: Any) -> None:
    """Close *session* if one was opened; log instead of raising on failure."""
    if session is None:
        return
    try:
        session.close()
    except Exception as exc:
        logger.warning(f"Failed to close readiness probe session: {exc}")


def _check_workspace_root(checks: List[Dict[str, Any]], errors: List[str]) -> None:
    """Verify the workspace root exists and accepts a real write/delete probe.

    Results are appended to *checks*; failure messages are appended to *errors*.
    """
    workspace = Path(WORKSPACE_DIR)
    if not workspace.exists():
        errors.append(f"Workspace root does not exist: {workspace}")
        _record_check(checks, "workspace_root_exists", False, str(workspace))
        return

    _record_check(checks, "workspace_root_exists", True, str(workspace))

    if not os.access(workspace, os.W_OK):
        errors.append(f"Workspace root is not writable: {workspace}")
        _record_check(checks, "workspace_root_writable", False, str(workspace))
        return

    # os.access() can be optimistic, so confirm with an actual write + delete.
    probe_file = workspace / ".startup_health_write_probe"
    try:
        probe_file.write_text("ok", encoding="utf-8")
        probe_file.unlink(missing_ok=True)
        _record_check(checks, "workspace_root_writable", True, "write probe passed")
    except Exception as exc:
        errors.append(f"Workspace root write probe failed: {exc}")
        _record_check(checks, "workspace_root_writable", False, f"write probe failed: {exc}")


def _check_schema_for_user(user_id: str, checks: List[Dict[str, Any]], errors: List[str]) -> None:
    """Compare the tenant DB of *user_id* against ``_REQUIRED_SCHEMA``.

    Records one ``schema_<table>`` check per required table and appends an
    error for every missing table or missing column set. Exceptions raised by
    engine creation or inspection are not handled here.
    """
    engine = get_engine_for_user(user_id)
    inspector = inspect(engine)

    for table, columns in _REQUIRED_SCHEMA.items():
        if not inspector.has_table(table):
            errors.append(f"Missing required table '{table}' in tenant DB for user '{user_id}'")
            _record_check(checks, f"schema_{table}", False, f"table missing for {user_id}")
            continue

        existing_columns = {col["name"] for col in inspector.get_columns(table)}
        missing_columns = [col for col in columns if col not in existing_columns]
        if missing_columns:
            errors.append(
                f"Missing required columns in '{table}' for user '{user_id}': {', '.join(missing_columns)}"
            )
            _record_check(
                checks,
                f"schema_{table}",
                False,
                f"missing columns for {user_id}: {', '.join(missing_columns)}",
            )
        else:
            _record_check(checks, f"schema_{table}", True, f"schema ok for {user_id}")


def _check_db_access(checks: List[Dict[str, Any]], errors: List[str], warnings: List[str]) -> Optional[str]:
    """Validate database access for the active mode.

    Single-tenant (``default_engine`` set): initialise the database and run
    ``SELECT 1``. Multi-tenant: resolve a DB path, open a session, run
    ``SELECT 1`` and check the schema for the first known tenant, or for the
    synthetic id ``startup_synthetic`` when no tenant exists yet.

    Every failure is recorded in *checks*/*errors* rather than raised, so the
    caller alone applies the fail-fast policy.

    Returns:
        ``"single_tenant"``, the probed tenant id, or ``None`` when a check
        could not be completed.
    """
    if default_engine is not None:
        try:
            init_database()
            with default_engine.connect() as conn:
                conn.execute(text("SELECT 1"))
        except Exception as exc:
            errors.append(f"Single-tenant database check failed: {exc}")
            _record_check(checks, "single_tenant_db_connectivity", False, str(exc))
            return None
        _record_check(checks, "single_tenant_db_connectivity", True, "SELECT 1 succeeded")
        return "single_tenant"

    # Tenant discovery can itself fail; report it like any other check instead
    # of letting the exception escape the routine.
    try:
        user_ids = get_all_user_ids()
    except Exception as exc:
        errors.append(f"Tenant discovery failed: {exc}")
        _record_check(checks, "tenant_discovery", False, str(exc))
        return None

    candidate_user = user_ids[0] if user_ids else "startup_synthetic"

    try:
        db_path = get_user_db_path(candidate_user)
        _record_check(checks, "tenant_db_path_resolution", True, f"{candidate_user} -> {db_path}")
    except Exception as exc:
        errors.append(f"Tenant DB path resolution failed: {exc}")
        _record_check(checks, "tenant_db_path_resolution", False, str(exc))
        return None

    session = None
    try:
        session = get_session_for_user(candidate_user)
        if session is None:
            raise RuntimeError("session creation returned None")
        session.execute(text("SELECT 1"))
        _record_check(checks, "tenant_session_create", True, f"session opened for {candidate_user}")
    except Exception as exc:
        errors.append(f"Tenant DB open/create check failed for '{candidate_user}': {exc}")
        _record_check(checks, "tenant_session_create", False, str(exc))
        return None
    finally:
        # Release the probe session on the failure path as well.
        _close_session_quietly(session)

    if not user_ids:
        warnings.append(
            "No existing tenant workspace found during startup; synthetic tenant DB path was used for readiness validation."
        )

    try:
        _check_schema_for_user(candidate_user, checks, errors)
    except Exception as exc:
        errors.append(f"Schema inspection failed for '{candidate_user}': {exc}")
        _record_check(checks, "schema_inspection", False, str(exc))
        return None
    return candidate_user


def run_startup_health_routine() -> Dict[str, Any]:
    """Run all startup checks, cache the report and return it.

    Database checks are skipped when the workspace check already failed.

    Returns:
        Report dict with ``status`` (``"healthy"``/``"failed"``), ``mode``,
        ``checks``, ``errors``, ``warnings`` and ``checked_at`` (UTC ISO-8601).

    Raises:
        RuntimeError: when any check failed and ``should_fail_fast()`` is true.
    """
    checks: List[Dict[str, Any]] = []
    errors: List[str] = []
    warnings: List[str] = []

    _check_workspace_root(checks, errors)
    if not errors:
        _check_db_access(checks, errors, warnings)

    status = "healthy" if not errors else "failed"
    report = {
        "status": status,
        "mode": "multi_tenant" if default_engine is None else "single_tenant",
        "checks": checks,
        "errors": errors,
        "warnings": warnings,
        "checked_at": datetime.now(timezone.utc).isoformat(),
    }

    # Cache an independent copy so later mutation of the returned report
    # cannot alter what the readiness endpoint serves.
    _STARTUP_STATUS.update(copy.deepcopy(report))

    for message in errors:
        logger.error(f"Startup readiness check failed: {message}")
    for warning in warnings:
        logger.warning(f"Startup readiness warning: {warning}")

    if errors and should_fail_fast():
        raise RuntimeError("Startup readiness checks failed")

    return report


def get_startup_status() -> Dict[str, Any]:
    """Return an independent snapshot of the most recent startup report."""
    # Deep copy: the report holds nested lists that a shallow copy would share.
    return copy.deepcopy(_STARTUP_STATUS)


def readiness_under_auth_context(current_user: Dict[str, Any]) -> Dict[str, Any]:
    """Probe tenant DB path resolution and session creation for one user.

    Args:
        current_user: Auth context; the user id is read from ``"id"`` and,
            failing that, ``"clerk_user_id"``. ``None`` is tolerated.

    Returns:
        A dict whose ``"ready"`` flag tells whether the probe succeeded. On
        failure ``"reason"`` carries the cause; ``"tenant_db_path"`` is
        ``None`` when the path itself could not be resolved. This function
        does not raise for probe failures.
    """
    auth_context = current_user or {}
    user_id = auth_context.get("id") or auth_context.get("clerk_user_id")
    if not user_id:
        return {
            "ready": False,
            "reason": "missing_user_context",
            "detail": "No authenticated user id was provided in auth context.",
        }

    db_path = None
    session = None
    try:
        db_path = get_user_db_path(user_id)
        session = get_session_for_user(user_id)
        if session is None:
            raise RuntimeError("Session creation returned None")
        session.execute(text("SELECT 1"))
        return {
            "ready": True,
            "user_id": user_id,
            "tenant_db_path": db_path,
            "db_session": "ok",
        }
    except Exception as exc:
        logger.error(f"Readiness auth-context DB check failed for user '{user_id}': {exc}")
        # Reuse the path resolved above; resolving it again here could raise
        # a second time and escape this handler.
        return {
            "ready": False,
            "user_id": user_id,
            "tenant_db_path": db_path,
            "db_session": "failed",
            "reason": str(exc),
        }
    finally:
        _close_session_quietly(session)
b/docs/STARTUP_READINESS_BEHAVIOR.md new file mode 100644 index 00000000..e97cf832 --- /dev/null +++ b/docs/STARTUP_READINESS_BEHAVIOR.md @@ -0,0 +1,65 @@ +# Backend Startup Readiness Behavior + +This document describes the startup/readiness checks now performed by the backend in both `backend/main.py` and `backend/app.py`. + +## What startup validates + +At startup, the backend now runs a dedicated health routine before scheduler startup: + +1. **Workspace root check** + - Verifies the workspace root directory exists. + - Verifies it is writable using an actual write/delete probe. + +2. **Database open/create check** + - **Single-tenant mode** (`default_engine` enabled): validates global DB initialization and a `SELECT 1` connectivity query. + - **Multi-tenant mode** (`default_engine` disabled): resolves at least one tenant DB path and validates DB session creation/query using: + - the first discovered tenant workspace (preferred), or + - a synthetic startup tenant (`startup_synthetic`) when no tenant exists yet. + +3. **Schema compatibility check** + - Confirms presence of required tables/columns for baseline migration compatibility: + - `onboarding_sessions`: `id`, `user_id`, `updated_at` + - `daily_workflow_plans`: `id`, `user_id`, `generation_mode`, `fallback_used` + +## Warning vs failure conditions + +### Multi-tenant mode + +- **Warning** + - No existing tenant workspace at startup. A synthetic tenant path/session check is used. +- **Failure** + - Workspace root missing or not writable. + - Tenant DB path resolution fails. + - Tenant DB session/query fails. + - Required schema tables/columns missing. + +### Single-tenant mode + +- **Failure** + - Global DB initialization/connectivity fails. + - Workspace root missing or not writable. + +## Fail-fast behavior + +Failures are always logged at **error** level. 
+ +Startup fail-fast is controlled by: + +- `ALWRITY_FAIL_FAST_STARTUP=true|false` (explicit override), or +- if unset, defaults to **true in production** (`APP_ENV` or `ENV` is `production`/`prod`), and **false otherwise**. + +When fail-fast is active and startup checks fail, startup raises and the process exits instead of running in degraded mode. + +## Readiness endpoint + +`GET /health/readiness` + +- Requires authenticated context. +- Returns: + - latest startup check report, and + - auth-context tenant readiness validation (user DB path resolution + session/query check). + +This helps operators distinguish: + +- platform startup health, vs +- per-tenant readiness under real auth context.