RolnickLab
diff --git a/‎.agents/AGENTS.md‎
Lines changed: 10 additions & 0 deletions b/‎.agents/AGENTS.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎ami/jobs/management/commands/chaos_monkey.py‎
Lines changed: 97 additions & 0 deletions b/‎ami/jobs/management/commands/chaos_monkey.py‎
Lines changed: 97 additions & 0 deletions
diff --git a/‎ami/jobs/management/commands/test_ml_job_e2e.py‎
Lines changed: 5 additions & 1 deletion b/‎ami/jobs/management/commands/test_ml_job_e2e.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎ami/jobs/models.py‎
Lines changed: 40 additions & 19 deletions b/‎ami/jobs/models.py‎
Lines changed: 40 additions & 19 deletions
diff --git a/‎ami/jobs/tasks.py‎
Lines changed: 27 additions & 9 deletions b/‎ami/jobs/tasks.py‎
Lines changed: 27 additions & 9 deletions
diff --git a/‎ami/jobs/tests.py‎
Lines changed: 3 additions & 1 deletion b/‎ami/jobs/tests.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎ami/jobs/views.py‎
Lines changed: 2 additions & 2 deletions b/‎ami/jobs/views.py‎
Lines changed: 2 additions & 2 deletions
@@ -650,6 +650,16 @@ images = SourceImage.objects.annotate(det_count=Count('detections'))
 - Use `@shared_task` decorator for all tasks
 - Check Flower UI for debugging: http://localhost:5555
 
+### E2E Testing & Monitoring Async Jobs
+
+Run an end-to-end ML job test (replace the project, collection, and pipeline values with ones that exist in your environment):
+```bash
+docker compose run --rm django python manage.py test_ml_job_e2e \
+  --project 18 --dispatch-mode async_api --collection 142 --pipeline "global_moths_2024"
+```
+
+For monitoring running jobs (Django ORM, REST API, NATS consumer state, Redis counters, worker logs, etc.), see `docs/claude/reference/monitoring-async-jobs.md`.
+
 ### Running a Single Test
 
 ```bash
 
@@ -0,0 +1,97 @@
+"""
+Fault injection utility for manual chaos testing of ML async jobs.
+
+Use alongside `test_ml_job_e2e` to verify job behaviour when Redis or NATS
+becomes unavailable or loses state mid-processing.
+
+Usage examples:
+
+    # Flush all Redis state immediately (simulates FLUSHDB mid-job)
+    python manage.py chaos_monkey flush redis
+
+    # Flush all NATS JetStream streams (simulates broker state loss)
+    python manage.py chaos_monkey flush nats
+"""
+
+from asgiref.sync import async_to_sync
+from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
+from django_redis import get_redis_connection
+
+NATS_URL = getattr(settings, "NATS_URL", "nats://nats:4222")
+
+
+class Command(BaseCommand):
+    help = "Inject faults into Redis or NATS for chaos/resilience testing"
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "action",
+            choices=["flush"],
+            help="flush: wipe all state.",
+        )
+        parser.add_argument(
+            "service",
+            choices=["redis", "nats"],
+            help="Target service to fault.",
+        )
+
+    def handle(self, *args, **options):
+        action = options["action"]
+        service = options["service"]
+
+        if action == "flush" and service == "redis":
+            self._flush_redis()
+        elif action == "flush" and service == "nats":
+            self._flush_nats()
+
+    # ------------------------------------------------------------------
+    # Redis
+    # ------------------------------------------------------------------
+
+    def _flush_redis(self):
+        self.stdout.write("Flushing Redis database (FLUSHDB)...")
+        try:
+            redis = get_redis_connection("default")
+            redis.flushdb()
+            self.stdout.write(self.style.SUCCESS("Redis flushed."))
+        except Exception as e:
+            raise CommandError(f"Failed to flush Redis: {e}") from e
+
+    # ------------------------------------------------------------------
+    # NATS
+    # ------------------------------------------------------------------
+
+    def _flush_nats(self):
+        """Delete all JetStream streams via the NATS Python client."""
+        self.stdout.write("Flushing all NATS JetStream streams...")
+
+        async def _delete_all_streams():
+            import nats
+
+            nc = await nats.connect(NATS_URL, connect_timeout=5, allow_reconnect=False)
+            js = nc.jetstream()
+            try:
+                streams = await js.streams_info()
+                if not streams:
+                    return []
+                deleted = []
+                for stream in streams:
+                    name = stream.config.name
+                    await js.delete_stream(name)
+                    deleted.append(name)
+                return deleted
+            finally:
+                await nc.close()
+
+        try:
+            deleted = async_to_sync(_delete_all_streams)()
+        except Exception as e:
+            raise CommandError(f"Failed to flush NATS: {e}") from e
+
+        if deleted:
+            for name in deleted:
+                self.stdout.write(f"  Deleted stream: {name}")
+            self.stdout.write(self.style.SUCCESS(f"Deleted {len(deleted)} stream(s)."))
+        else:
+            self.stdout.write("No streams found — NATS already empty.")
@@ -10,7 +10,11 @@
 
 
 class Command(BaseCommand):
-    help = "Run end-to-end test of ML job processing"
+    help = (
+        "Run end-to-end test of ML job processing.\n\n"
+        "For monitoring and debugging running jobs, see:\n"
+        "  docs/claude/reference/monitoring-async-jobs.md"
+    )
 
     def add_arguments(self, parser):
         parser.add_argument("--project", type=int, required=True, help="Project ID")
 
@@ -15,7 +15,7 @@
 
 from ami.base.models import BaseModel
 from ami.base.schemas import ConfigurableStage, ConfigurableStageParam
-from ami.jobs.tasks import run_job
+from ami.jobs.tasks import cleanup_async_job_if_needed, run_job
 from ami.main.models import Deployment, Project, SourceImage, SourceImageCollection
 from ami.ml.models import Pipeline
 from ami.ml.post_processing.registry import get_postprocessing_task
@@ -88,6 +88,11 @@ def final_states(cls):
     def failed_states(cls):
         return [cls.FAILURE, cls.REVOKED, cls.UNKNOWN]
 
+    @classmethod
+    def active_states(cls):
+        """Return the in-progress states (STARTED, RETRY) in which a job may serve tasks to workers."""
+        return [cls.STARTED, cls.RETRY]
+
 
 def get_status_label(status: JobState, progress: float) -> str:
     """
@@ -331,26 +336,29 @@ def emit(self, record: logging.LogRecord):
         # Log to the current app logger
         logger.log(record.levelno, self.format(record))
 
-        # Write to the logs field on the job instance
-        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        msg = f"[{timestamp}] {record.levelname} {self.format(record)}"
-        if msg not in self.job.logs.stdout:
-            self.job.logs.stdout.insert(0, msg)
+        # Write to the logs field on the job instance.
+        # Refresh from DB first to reduce the window for concurrent overwrites — each
+        # worker holds its own stale in-memory copy of `logs`, so without a refresh the
+        # last writer always wins and earlier entries are silently dropped.
+        # @TODO consider saving logs to the database periodically rather than on every log
+        try:
+            self.job.refresh_from_db(fields=["logs"])
+            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+            msg = f"[{timestamp}] {record.levelname} {self.format(record)}"
+            if msg not in self.job.logs.stdout:
+                self.job.logs.stdout.insert(0, msg)
 
-        # Write a simpler copy of any errors to the errors field
-        if record.levelno >= logging.ERROR:
-            if record.message not in self.job.logs.stderr:
-                self.job.logs.stderr.insert(0, record.message)
+            # Write a simpler copy of any errors to the errors field
+            if record.levelno >= logging.ERROR:
+                if record.message not in self.job.logs.stderr:
+                    self.job.logs.stderr.insert(0, record.message)
 
-        if len(self.job.logs.stdout) > self.max_log_length:
-            self.job.logs.stdout = self.job.logs.stdout[: self.max_log_length]
+            if len(self.job.logs.stdout) > self.max_log_length:
+                self.job.logs.stdout = self.job.logs.stdout[: self.max_log_length]
 
-        # @TODO consider saving logs to the database periodically rather than on every log
-        try:
             self.job.save(update_fields=["logs"], update_progress=False)
         except Exception as e:
             logger.error(f"Failed to save logs for job #{self.job.pk}: {e}")
-            pass
 
 
 @dataclass
@@ -966,19 +974,28 @@ def retry(self, async_task=True):
 
     def cancel(self):
         """
-        Terminate the celery task.
+        Cancel a job. For async_api jobs, clean up NATS/Redis resources
+        and transition through CANCELING → REVOKED. For other jobs,
+        revoke the Celery task.
         """
         self.status = JobState.CANCELING
         self.save()
+
         if self.task_id:
             task = run_job.AsyncResult(self.task_id)
             if task:
                 task.revoke(terminate=True)
+            if self.dispatch_mode == JobDispatchMode.ASYNC_API:
+                # For async jobs we need to set the status to revoked here since the task already
+                # finished (it only queues the images).
+                self.status = JobState.REVOKED
                 self.save()
         else:
             self.status = JobState.REVOKED
             self.save()
 
+        cleanup_async_job_if_needed(self)
+
     def update_status(self, status=None, save=True):
         """
         Update the status of the job based on the status of the celery task.
@@ -1084,11 +1101,15 @@ def get_default_progress(cls) -> JobProgress:
     def logger(self) -> logging.Logger:
         _logger = logging.getLogger(f"ami.jobs.{self.pk}")
 
-        # Only add JobLogHandler if not already present
-        if not any(isinstance(h, JobLogHandler) for h in _logger.handlers):
-            # Also log output to a field on thie model instance
+        # Update or add JobLogHandler, always pointing to the current instance.
+        # The logger is a process-level singleton so its handler may reference a stale
+        # job instance from a previous task execution in this worker process.
+        handler = next((h for h in _logger.handlers if isinstance(h, JobLogHandler)), None)
+        if handler is None:
             logger.info("Adding JobLogHandler to logger for job %s", self.pk)
             _logger.addHandler(JobLogHandler(self))
+        else:
+            handler.job = self
         _logger.propagate = False
         return _logger
 
 
@@ -86,10 +86,9 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
 
     progress_info = state_manager.update_state(processed_image_ids, stage="process", failed_image_ids=failed_image_ids)
     if not progress_info:
-        logger.error(f"Redis state missing for job {job_id} — job may have been cleaned up prematurely.")
         # Acknowledge the task to prevent retries, since we don't know the state
         _ack_task_via_nats(reply_subject, logger)
-        # TODO: cancel the job to fail fast once PR #1144 is merged
+        _fail_job(job_id, "Redis state missing for job")
         return
 
     try:
@@ -153,8 +152,7 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
         )
 
         if not progress_info:
-            job.logger.error(f"Redis state missing for job {job_id} — job may have been cleaned up prematurely.")
-            # TODO: cancel the job to fail fast once PR #1144 is merged
+            _fail_job(job_id, "Redis state missing for job")
             return
 
         # update complete state based on latest progress info after saving results
@@ -180,6 +178,26 @@ def process_nats_pipeline_result(self, job_id: int, result_data: dict, reply_sub
         job.logger.error(error)
 
 
+def _fail_job(job_id: int, reason: str) -> None:
+    from ami.jobs.models import Job, JobState
+    from ami.ml.orchestration.jobs import cleanup_async_job_resources
+
+    try:
+        with transaction.atomic():
+            job = Job.objects.select_for_update().get(pk=job_id)
+            if job.status in (JobState.CANCELING, *JobState.final_states()):
+                return
+            job.update_status(JobState.FAILURE, save=False)
+            job.finished_at = datetime.datetime.now()
+            job.save(update_fields=["status", "progress", "finished_at"])
+
+        job.logger.error(f"Job {job_id} marked as FAILURE: {reason}")
+        cleanup_async_job_resources(job.pk, job.logger)
+    except Job.DoesNotExist:
+        logger.error(f"Cannot fail job {job_id}: not found")
+        cleanup_async_job_resources(job_id, logger)
+
+
 def _ack_task_via_nats(reply_subject: str, job_logger: logging.Logger) -> None:
     try:
 
@@ -295,10 +313,10 @@ def _update_job_progress(
     # Clean up async resources for completed jobs that use NATS/Redis
     if job.progress.is_complete():
         job = Job.objects.get(pk=job_id)  # Re-fetch outside transaction
-        _cleanup_job_if_needed(job)
+        cleanup_async_job_if_needed(job)
 
 
-def _cleanup_job_if_needed(job) -> None:
+def cleanup_async_job_if_needed(job) -> None:
     """
     Clean up async resources (NATS/Redis) if this job uses them.
 
@@ -314,7 +332,7 @@ def _cleanup_job_if_needed(job) -> None:
         # import here to avoid circular imports
         from ami.ml.orchestration.jobs import cleanup_async_job_resources
 
-        cleanup_async_job_resources(job)
+        cleanup_async_job_resources(job.pk, job.logger)
 
 
 @task_prerun.connect(sender=run_job)
@@ -353,7 +371,7 @@ def update_job_status(sender, task_id, task, state: str, retval=None, **kwargs):
 
     # Clean up async resources for revoked jobs
     if state == JobState.REVOKED:
-        _cleanup_job_if_needed(job)
+        cleanup_async_job_if_needed(job)
 
 
 @task_failure.connect(sender=run_job, retry=False)
@@ -368,7 +386,7 @@ def update_job_failure(sender, task_id, exception, *args, **kwargs):
     job.save()
 
     # Clean up async resources for failed jobs
-    _cleanup_job_if_needed(job)
+    cleanup_async_job_if_needed(job)
 
 
 def log_time(start: float = 0, msg: str | None = None) -> tuple[float, Callable]:
 
@@ -445,7 +445,8 @@ def _task_batch_helper(self, value: Any, expected_status: int):
         pipeline = self._create_pipeline()
         job = self._create_ml_job("Job for batch test", pipeline)
         job.dispatch_mode = JobDispatchMode.ASYNC_API
-        job.save(update_fields=["dispatch_mode"])
+        job.status = JobState.STARTED
+        job.save(update_fields=["dispatch_mode", "status"])
         images = [
             SourceImage.objects.create(
                 path=f"image_{i}.jpg",
@@ -487,6 +488,7 @@ def test_tasks_endpoint_without_pipeline(self):
             name="Job without pipeline",
             source_image_collection=self.source_image_collection,
             dispatch_mode=JobDispatchMode.ASYNC_API,
+            status=JobState.STARTED,
         )
 
         self.client.force_authenticate(user=self.user)
 
@@ -237,8 +237,8 @@ def tasks(self, request, pk=None):
         if job.dispatch_mode != JobDispatchMode.ASYNC_API:
             raise ValidationError("Only async_api jobs have fetchable tasks")
 
-        # Don't fetch tasks from completed/failed/revoked jobs
-        if job.status in JobState.final_states():
+        # Only serve tasks for actively processing jobs
+        if job.status not in JobState.active_states():
             return Response({"tasks": []})
 
         # Validate that the job has a pipeline