fix: Refactor incident processing to use IncidentBl with sessions (#4013)

VladimirFilonov · web-flow · commit d025ba8b62d8 · 2025-03-13T17:11:23.000+04:00
diff --git a/keep/api/bl/incidents_bl.py b/keep/api/bl/incidents_bl.py
@@ -6,6 +6,7 @@
 from typing import List, Optional
 from uuid import UUID
 
+import asyncio
 from fastapi import HTTPException
 from pusher import Pusher
 from sqlalchemy.orm.exc import StaleDataError
@@ -55,6 +56,7 @@
 
 
 class IncidentBl:
+
     def __init__(
         self,
         tenant_id: str,
@@ -71,13 +73,13 @@ def __init__(
         self.redis = os.environ.get("REDIS", "false") == "true"
 
     def create_incident(
-        self, incident_dto: IncidentDtoIn, generated_from_ai: bool = False
+        self, incident_dto: [IncidentDtoIn | IncidentDto], generated_from_ai: bool = False
     ) -> IncidentDto:
         """
         Creates a new incident.
 
         Args:
-            incident_dto (IncidentDtoIn): The data transfer object containing the details of the incident to be created.
+            incident_dto (IncidentDtoIn | IncidentDto): The data transfer object containing the details of the incident to be created.
             generated_from_ai (bool, optional): Indicates if the incident was generated by Keep's AI. Defaults to False.
 
         Returns:
@@ -111,6 +113,12 @@ def create_incident(
         )
         return new_incident_dto
 
+    def sync_add_alerts_to_incident(self, *args, **kwargs) -> None:
+        """
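+        Synchronous wrapper: runs the async add_alerts_to_incident to completion via asyncio.run (which raises RuntimeError if an event loop is already running in this thread).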
+        """
+        asyncio.run(self.add_alerts_to_incident(*args, **kwargs))
+
     async def add_alerts_to_incident(
         self,
         incident_id: UUID,
diff --git a/keep/api/core/db.py b/keep/api/core/db.py
@@ -3717,8 +3717,12 @@ def update_incident_from_dto_by_id(
         return incident
 
 
-def get_incident_by_fingerprint(tenant_id: str, fingerprint: str) -> Optional[Incident]:
-    with Session(engine) as session:
+def get_incident_by_fingerprint(
+    tenant_id: str,
+    fingerprint: str,
+    session: Optional[Session] = None
+) -> Optional[Incident]:
+    with existed_or_new_session(session) as session:
         return session.exec(
             select(Incident).where(
                 Incident.tenant_id == tenant_id, Incident.fingerprint == fingerprint
@@ -3729,10 +3733,11 @@ def get_incident_by_fingerprint(tenant_id: str, fingerprint: str) -> Optional[In
 def delete_incident_by_id(
     tenant_id: str,
     incident_id: UUID,
+    session: Optional[Session] = None
 ) -> bool:
     if isinstance(incident_id, str):
         incident_id = __convert_to_uuid(incident_id)
-    with Session(engine) as session:
+    with existed_or_new_session(session) as session:
         incident = session.exec(
             select(Incident).filter(
                 Incident.tenant_id == tenant_id,
diff --git a/keep/api/tasks/process_incident_task.py b/keep/api/tasks/process_incident_task.py
@@ -1,15 +1,14 @@
 import logging
 
 from arq import Retry
+from sqlmodel import Session
 
+from keep.api.bl.incidents_bl import IncidentBl
 from keep.api.core.db import (
-    add_alerts_to_incident,
-    create_incident_from_dto,
     get_incident_by_fingerprint,
     get_incident_by_id,
-    update_incident_from_dto_by_id,
+    engine,
 )
-from keep.api.core.dependencies import get_pusher_client
 from keep.api.models.incident import IncidentDto
 from keep.api.tasks.process_event_task import process_event
 
@@ -32,121 +31,111 @@ def process_incident(
         "trace_id": trace_id,
     }
 
-    if ctx and isinstance(ctx, dict):
-        extra["job_try"] = ctx.get("job_try", 0)
-        extra["job_id"] = ctx.get("job_id", None)
+    with Session(engine) as session:
 
-    if isinstance(incidents, IncidentDto):
-        incidents = [incidents]
+        if ctx and isinstance(ctx, dict):
+            extra["job_try"] = ctx.get("job_try", 0)
+            extra["job_id"] = ctx.get("job_id", None)
 
-    logger.info(f"Processing {len(incidents)} incidents", extra=extra)
+        if isinstance(incidents, IncidentDto):
+            incidents = [incidents]
 
-    if logger.getEffectiveLevel() == logging.DEBUG:
-        # Lets log the incidents in debug mode
-        extra["incident"] = [i.dict() for i in incidents]
+        logger.info(f"Processing {len(incidents)} incidents", extra=extra)
 
-    try:
-        for incident in incidents:
-            logger.info(
-                f"Processing incident: {incident.id}",
-                extra={**extra, "fingerprint": incident.fingerprint},
-            )
-
-            incident_from_db = get_incident_by_id(
-                tenant_id=tenant_id, incident_id=incident.id
-            )
+        if logger.getEffectiveLevel() == logging.DEBUG:
+            # Let's log the incidents in debug mode
+            extra["incident"] = [i.dict() for i in incidents]
 
-            # Try to get by fingerprint if no incident was found by id
-            if incident_from_db is None and incident.fingerprint:
-                incident_from_db = get_incident_by_fingerprint(
-                    tenant_id=tenant_id, fingerprint=incident.fingerprint
-                )
+        incident_bl = IncidentBl(tenant_id, session)
 
-            if incident_from_db:
-                logger.info(
-                    f"Updating incident: {incident.id}",
-                    extra={**extra, "fingerprint": incident.fingerprint},
-                )
-                incident_from_db = update_incident_from_dto_by_id(
-                    tenant_id=tenant_id,
-                    incident_id=incident_from_db.id,
-                    updated_incident_dto=incident,
-                )
-                logger.info(
-                    f"Updated incident: {incident.id}",
-                    extra={**extra, "fingerprint": incident.fingerprint},
-                )
-            else:
+        try:
+            for incident in incidents:
                 logger.info(
-                    f"Creating incident: {incident.id}",
+                    f"Processing incident: {incident.id}",
                     extra={**extra, "fingerprint": incident.fingerprint},
                 )
-                incident_from_db = create_incident_from_dto(
-                    tenant_id=tenant_id,
-                    incident_dto=incident,
-                )
-                logger.info(
-                    f"Created incident: {incident.id}",
-                    extra={**extra, "fingerprint": incident.fingerprint},
+
+                incident_from_db = get_incident_by_id(
+                    tenant_id=tenant_id, incident_id=incident.id, session=session
                 )
 
-            try:
-                if incident.alerts:
-                    logger.info("Adding incident alerts", extra=extra)
-                    processed_alerts = process_event(
-                        {},
-                        tenant_id,
-                        provider_type,
-                        provider_id,
-                        None,
-                        None,
-                        trace_id,
-                        incident.alerts,
+                # Try to get by fingerprint if no incident was found by id
+                if incident_from_db is None and incident.fingerprint:
+                    incident_from_db = get_incident_by_fingerprint(
+                        tenant_id=tenant_id, fingerprint=incident.fingerprint, session=session
+                    )
+
+                if incident_from_db:
+                    logger.info(
+                        f"Updating incident: {incident.id}",
+                        extra={**extra, "fingerprint": incident.fingerprint},
+                    )
+                    incident_from_db = incident_bl.update_incident(
+                        incident_id=incident_from_db.id,
+                        updated_incident_dto=incident,
+                        generated_by_ai=False,
+                    )
+                    logger.info(
+                        f"Updated incident: {incident.id}",
+                        extra={**extra, "fingerprint": incident.fingerprint},
+                    )
+                else:
+                    logger.info(
+                        f"Creating incident: {incident.id}",
+                        extra={**extra, "fingerprint": incident.fingerprint},
+                    )
+                    incident_from_db = incident_bl.create_incident(
+                        incident_dto=incident,
+                    )
+                    logger.info(
+                        f"Created incident: {incident.id}",
+                        extra={**extra, "fingerprint": incident.fingerprint},
                     )
-                    if processed_alerts:
-                        add_alerts_to_incident(
+
+                try:
+                    if incident.alerts:
+                        logger.info("Adding incident alerts", extra=extra)
+                        processed_alerts = process_event(
+                            {},
                             tenant_id,
-                            incident_from_db,
-                            [
-                                processed_alert.event_id
-                                for processed_alert in processed_alerts
-                            ],
-                            # Because the incident was created with the alerts count, we need to override it
-                            # otherwise it will be the sum of the previous count + the newly attached alerts count
-                            override_count=True,
-                        )
-                        logger.info("Added incident alerts", extra=extra)
-                    else:
-                        logger.info(
-                            "No alerts to add to incident, probably deduplicated",
-                            extra=extra,
+                            provider_type,
+                            provider_id,
+                            None,
+                            None,
+                            trace_id,
+                            incident.alerts,
                         )
-            except Exception:
-                logger.exception("Error adding incident alerts", extra=extra)
-            logger.info("Processed incident", extra=extra)
-
-        pusher_client = get_pusher_client()
-        if not pusher_client:
-            pass
-        try:
-            pusher_client.trigger(
-                f"private-{tenant_id}",
-                "incident-change",
-                {},
-            )
+                        if processed_alerts:
+                            incident_bl.sync_add_alerts_to_incident(
+                                incident_from_db.id,
+                                [
+                                    processed_alert.fingerprint
+                                    for processed_alert in processed_alerts
+                                ],
+                                # Because the incident was created with the alerts count, we need to override it
+                                # otherwise it will be the sum of the previous count + the newly attached alerts count
+                                override_count=True,
+                            )
+                            logger.info("Added incident alerts", extra=extra)
+                        else:
+                            logger.info(
+                                "No alerts to add to incident, probably deduplicated",
+                                extra=extra,
+                            )
+                except Exception:
+                    logger.exception("Error adding incident alerts", extra=extra)
+                logger.info("Processed incident", extra=extra)
+
+            logger.info("Processed all incidents", extra=extra)
         except Exception:
-            logger.exception("Failed to push incidents to the client")
-
-        logger.info("Processed all incidents", extra=extra)
-    except Exception:
-        logger.exception(
-            "Error processing incidents",
-            extra=extra,
-        )
-
-        # Retrying only if context is present (running the job in arq worker)
-        if bool(ctx):
-            raise Retry(defer=ctx["job_try"] * TIMES_TO_RETRY_JOB)
+            logger.exception(
+                "Error processing incidents",
+                extra=extra,
+            )
+
+            # Retrying only if context is present (running the job in arq worker)
+            if bool(ctx):
+                raise Retry(defer=ctx["job_try"] * TIMES_TO_RETRY_JOB)
 
 
 async def async_process_incident(*args, **kwargs):