flexcompute · yaugenst-flex · Jan 28, 2026 · marcorudolphflex · Jan 28, 2026 · marcorudolphflex
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fixed frequency accumulation of gradients for custom dispersive media.
 - Fixed `snap_box_to_grid` producing zero-size boxes when using `Expand` behavior with very small intervals centered on a grid point.
 - Fixed sliver polygon artifacts in 2D material subdivision by filtering polygons based on grid cell size, preventing numerical issues with large-coordinate geometries.
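+- Fixed CLI monitoring (`web.monitor()`) raising fatal errors on transient backend error states during automatic retries; it now waits up to `config.web.monitor_error_grace_period` seconds (default 60) for the task to recover before raising.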
 
 ## [2.10.2] - 2026-01-21
 

diff --git a/tests/test_web/test_webapi.py b/tests/test_web/test_webapi.py
@@ -1,6 +1,7 @@
 # Tests webapi and things that depend on it
 from __future__ import annotations
 
+import json
 import os
 import posixpath
 from concurrent.futures import Future
@@ -25,7 +26,7 @@
 from tidy3d.components.monitor import FieldMonitor
 from tidy3d.components.source.current import PointDipole
 from tidy3d.components.source.time import GaussianPulse
-from tidy3d.exceptions import SetupError
+from tidy3d.exceptions import SetupError, WebError
 from tidy3d.web import common
 from tidy3d.web.api.asynchronous import run_async
 from tidy3d.web.api.container import Batch, Job, WebContainer
@@ -41,6 +42,7 @@
     estimate_cost,
     get_info,
     get_run_info,
+    get_status,
     get_tasks,
     load,
     load_simulation,
@@ -51,6 +53,7 @@
 )
 from tidy3d.web.core.environment import Env
 from tidy3d.web.core.exceptions import WebNotFoundError
+from tidy3d.web.core.task_info import TaskInfo
 from tidy3d.web.core.types import PayType, TaskType
 
 TASK_NAME = "task_name_test"
@@ -272,7 +275,7 @@ def mock_monitor(monkeypatch):
     status_count = [0]
     statuses = ("upload", "running", "running", "running", "running", "running", "success")
 
-    def mock_get_status(task_id):
+    def mock_get_status(task_id, **_kwargs):
         current_count = min(status_count[0], len(statuses) - 1)
         current_status = statuses[current_count]
         status_count[0] += 1
@@ -419,6 +422,61 @@ def test_get_run_info(mock_get_run_info, mock_get_info):
     assert get_run_info(TASK_ID) == (100, 0)
 
 
+def test_get_status_grace_period_recovers(monkeypatch):
+    statuses = iter(["run_error", "run_error", "running"])
+
+    def mock_get_info(task_id):
+        status = next(statuses, "running")
+        return TaskInfo(taskId=task_id, status=status, taskType=TaskType.MODE.name)
+
+    time_state = {"t": 0.0}
+
+    def fake_monotonic():
+        return time_state["t"]
+
+    def fake_sleep(seconds):
+        time_state["t"] += seconds
+
+    monkeypatch.setattr(f"{api_path}.TaskFactory.get", lambda *_args, **_kwargs: None)
+    monkeypatch.setattr(f"{api_path}.get_info", mock_get_info)
+    monkeypatch.setattr(f"{api_path}.REFRESH_TIME", 0.01)
+    monkeypatch.setattr(f"{api_path}.time.sleep", fake_sleep)
+    monkeypatch.setattr(f"{api_path}.time.monotonic", fake_monotonic)
+
+    assert get_status(TASK_ID, error_grace_period=0.05) == "running"
+
+
+def test_get_status_grace_period_expires(monkeypatch):
+    statuses = iter(["run_error", "run_error", "run_error"])
+
+    def mock_get_info(task_id):
+        status = next(statuses, "run_error")
+        return TaskInfo(taskId=task_id, status=status, taskType=TaskType.MODE.name)
+
+    def mock_get_error_json(self, to_file, **_kwargs):
+        with open(to_file, "w", encoding="utf8") as handle:
+            json.dump({"msg": "boom"}, handle)
+        return Path(to_file)
+
+    time_state = {"t": 0.0}
+
+    def fake_monotonic():
+        return time_state["t"]
+
+    def fake_sleep(seconds):
+        time_state["t"] += seconds
+
+    monkeypatch.setattr(f"{api_path}.TaskFactory.get", lambda *_args, **_kwargs: None)
+    monkeypatch.setattr(f"{api_path}.get_info", mock_get_info)
+    monkeypatch.setattr(f"{api_path}.SimulationTask.get_error_json", mock_get_error_json)
+    monkeypatch.setattr(f"{api_path}.REFRESH_TIME", 0.01)
+    monkeypatch.setattr(f"{api_path}.time.sleep", fake_sleep)
+    monkeypatch.setattr(f"{api_path}.time.monotonic", fake_monotonic)
+
+    with pytest.raises(WebError, match="boom"):
+        get_status(TASK_ID, error_grace_period=0.02)
+
+
 @responses.activate
 def test_download(mock_download, tmp_path):
     download(TASK_ID, str(tmp_path / "web_test_tmp.json"))

diff --git a/tests/test_web/test_webapi_eme.py b/tests/test_web/test_webapi_eme.py
@@ -151,7 +151,7 @@ def mock_monitor(monkeypatch):
     status_count = [0]
     statuses = ("upload", "running", "running", "running", "running", "running", "success")
 
-    def mock_get_status(task_id):
+    def mock_get_status(task_id, **_kwargs):
         current_count = min(status_count[0], len(statuses) - 1)
         current_status = statuses[current_count]
         status_count[0] += 1

diff --git a/tests/test_web/test_webapi_heat.py b/tests/test_web/test_webapi_heat.py
@@ -148,7 +148,7 @@ def mock_monitor(monkeypatch):
     status_count = [0]
     statuses = ("upload", "running", "running", "running", "running", "running", "success")
 
-    def mock_get_status(task_id):
+    def mock_get_status(task_id, **_kwargs):
         current_count = min(status_count[0], len(statuses) - 1)
         current_status = statuses[current_count]
         status_count[0] += 1

diff --git a/tests/test_web/test_webapi_mode.py b/tests/test_web/test_webapi_mode.py
@@ -184,7 +184,7 @@ def mock_monitor(monkeypatch):
     status_count = [0]
     statuses = ("upload", "running", "running", "running", "running", "running", "success")
 
-    def mock_get_status(task_id):
+    def mock_get_status(task_id, **_kwargs):
         current_count = min(status_count[0], len(statuses) - 1)
         current_status = statuses[current_count]
         status_count[0] += 1

diff --git a/tests/test_web/test_webapi_mode_sim.py b/tests/test_web/test_webapi_mode_sim.py
@@ -180,7 +180,7 @@ def mock_monitor(monkeypatch):
     status_count = [0]
     statuses = ("upload", "running", "running", "running", "running", "running", "success")
 
-    def mock_get_status(task_id):
+    def mock_get_status(task_id, **_kwargs):
         current_count = min(status_count[0], len(statuses) - 1)
         current_status = statuses[current_count]
         status_count[0] += 1

@@ -50,7 +50,7 @@ flowchart LR
 
 ## Module Reference
 
-- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`.
+- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`. `web.monitor_error_grace_period` sets how many seconds (default 60; 0 raises immediately) `web.monitor()` waits through transient error states before raising.
 - `registry.py` - Stores section and handler registries and notifies the attached manager so new entries appear immediately.
 - `manager.py` - `ConfigManager` caches validated models, tracks runtime overrides per profile, filters persisted fields, exposes helpers such as `plugins`, `profiles`, and `format`. `SectionAccessor` routes attribute access to `update_section`.
 - `loader.py` - Resolves the config directory, loads `config.toml` and `profiles/<name>.toml`, parses environment overrides, and writes atomically through `serializer.build_document`.

@@ -363,6 +363,15 @@ class WebConfig(ConfigSection):
         le=300,
     )
 
+    monitor_error_grace_period: NonNegativeFloat = Field(
+        60.0,
+        title="Monitor error grace period",
+        description=(
+            "Seconds to wait out transient error statuses during web.monitor() "
+            "before raising an error."
+        ),
+    )
+
     ssl_version: Optional[str] = Field(
         None,
         title="SSL/TLS version",

diff --git a/tidy3d/web/api/webapi.py b/tidy3d/web/api/webapi.py
@@ -732,22 +732,50 @@ def _get_batch_detail_handle_error_status(batch: BatchTask) -> BatchDetail:
     return detail
 
 
-def get_status(task_id: TaskId) -> str:
+def get_status(task_id: TaskId, *, error_grace_period: float = 0.0) -> str:
     """Get the status of a task. Raises an error if status is "error".
 
     Parameters
     ----------
     task_id : str
         Unique identifier of task on server.  Returned by :meth:`upload`.
+    error_grace_period : float = 0.0
+        Seconds to wait out transient error statuses before raising an error.
     """
+
+    def _wait_out_error(fetch_status: Callable[[], str], raw_status: str | None) -> str | None:
+        if error_grace_period <= 0:
+            return raw_status
+        deadline = time.monotonic() + error_grace_period
+        status = (raw_status or "").lower()
+        while status in ERROR_STATES and time.monotonic() < deadline:
+            time.sleep(REFRESH_TIME)
+            raw_status = fetch_status()
+            status = (raw_status or "").lower()
+        return raw_status
+
     task = TaskFactory.get(task_id)
     if isinstance(task, BatchTask):
-        return _get_batch_detail_handle_error_status(task).status
+        detail = task.detail()
+        raw_status = detail.status
+        status = (raw_status or "").lower()
+        if status in ERROR_STATES:
+            raw_status = _wait_out_error(lambda: task.detail().status, raw_status)
+            status = (raw_status or "").lower()
+        if status in ERROR_STATES:
+            _batch_detail_error(task.task_id)
+        return raw_status
     else:
         task_info = get_info(task_id)
-        status = task_info.status
+        raw_status = task_info.status
+        status = (raw_status or "").lower()
         if status == "visualize":
             return "success"
+        if status in ERROR_STATES:
+            raw_status = _wait_out_error(lambda: get_info(task_id).status, raw_status)
+            status = (raw_status or "").lower()
+            if status == "visualize":
+                return "success"
         if status in ERROR_STATES:
             try:
                 # Try to obtain the error message
@@ -762,7 +790,7 @@ def get_status(task_id: TaskId) -> str:
                 error_msg = "Error message could not be obtained, please contact customer support."
 
             raise WebError(f"Error running task {task_id}! {error_msg}")
-    return status
+    return raw_status
 
 
 def monitor(task_id: TaskId, verbose: bool = True, worker_group: Optional[str] = None) -> None:
@@ -823,18 +851,21 @@ def get_estimated_cost() -> float:
             est_flex_unit = task_info.estFlexUnit
         return est_flex_unit
 
+    def _get_status() -> str:
+        return get_status(task_id, error_grace_period=config.web.monitor_error_grace_period)
+
     def monitor_preprocess() -> None:
         """Periodically check the status."""
-        status = get_status(task_id)
+        status = _get_status()
         while status not in END_STATES and status != "running":
-            new_status = get_status(task_id)
+            new_status = _get_status()
             if new_status != status:
                 status = new_status
                 if verbose and status != "running":
                     console.log(f"status = {status}")
             time.sleep(REFRESH_TIME)
 
-    status = get_status(task_id)
+    status = _get_status()
 
     if verbose:
         console.log(f"status = {status}")
@@ -861,7 +892,7 @@ def monitor_preprocess() -> None:
         console.log("starting up solver")
 
     # while running but before the percentage done is available, keep waiting
-    while get_run_info(task_id)[0] is None and get_status(task_id) == "running":
+    while get_run_info(task_id)[0] is None and _get_status() == "running":
         time.sleep(REFRESH_TIME)
 
     # while running but percentage done is available
@@ -873,9 +904,7 @@ def monitor_preprocess() -> None:
                 pbar_pd = progress.add_task("% done", total=100)
                 perc_done, _ = get_run_info(task_id)
 
-                while (
-                    perc_done is not None and perc_done < 100 and get_status(task_id) == "running"
-                ):
+                while perc_done is not None and perc_done < 100 and _get_status() == "running":
                     perc_done, field_decay = get_run_info(task_id)
                     new_description = f"solver progress (field decay = {field_decay:.2e})"
                     progress.update(pbar_pd, completed=perc_done, description=new_description)
@@ -892,9 +921,7 @@ def monitor_preprocess() -> None:
                 pbar_pd = progress.add_task("% done", total=100)
                 perc_done, _ = get_run_info(task_id)
 
-                while (
-                    perc_done is not None and perc_done < 100 and get_status(task_id) == "running"
-                ):
+                while perc_done is not None and perc_done < 100 and _get_status() == "running":
                     perc_done, _ = get_run_info(task_id)
                     new_description = "solver progress"
                     progress.update(pbar_pd, completed=perc_done, description=new_description)
@@ -904,26 +931,26 @@ def monitor_preprocess() -> None:
                 new_description = "solver progress"
                 progress.update(pbar_pd, completed=100, refresh=True, description=new_description)
         else:
-            while get_status(task_id) == "running":
+            while _get_status() == "running":
                 perc_done, _ = get_run_info(task_id)
                 time.sleep(RUN_REFRESH_TIME)
 
     else:
         # non-verbose case, just keep checking until status is not running or perc_done >= 100
         perc_done, _ = get_run_info(task_id)
-        while perc_done is not None and perc_done < 100 and get_status(task_id) == "running":
+        while perc_done is not None and perc_done < 100 and _get_status() == "running":
             perc_done, field_decay = get_run_info(task_id)
             time.sleep(RUN_REFRESH_TIME)
 
     # post processing
     if verbose:
-        status = get_status(task_id)
+        status = _get_status()
         if status != "running":
             console.log(f"status = {status}")
 
         with console.status(f"[bold green]Finishing '{task_name}'...", spinner="runner"):
             while status not in END_STATES:
-                new_status = get_status(task_id)
+                new_status = _get_status()
                 if new_status != status:
                     status = new_status
                     console.log(f"status = {status}")
@@ -933,7 +960,7 @@ def monitor_preprocess() -> None:
             url = _get_url(task_id)
             console.log(f"View simulation result at [blue underline][link={url}]'{url}'[/link].")
     else:
-        while get_status(task_id) not in END_STATES:
+        while _get_status() not in END_STATES:
             time.sleep(REFRESH_TIME)