1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fixed frequency accumulation of gradients for custom dispersive media.
- Fixed `snap_box_to_grid` producing zero-size boxes when using `Expand` behavior with very small intervals centered on a grid point.
- Fixed sliver polygon artifacts in 2D material subdivision by filtering polygons based on grid cell size, preventing numerical issues with large-coordinate geometries.
- Fixed CLI monitoring raising fatal errors on transient backend error states during automatic retries.

## [2.10.2] - 2026-01-21

62 changes: 60 additions & 2 deletions tests/test_web/test_webapi.py
@@ -1,6 +1,7 @@
# Tests webapi and things that depend on it
from __future__ import annotations

import json
import os
import posixpath
from concurrent.futures import Future
@@ -25,7 +26,7 @@
from tidy3d.components.monitor import FieldMonitor
from tidy3d.components.source.current import PointDipole
from tidy3d.components.source.time import GaussianPulse
from tidy3d.exceptions import SetupError
from tidy3d.exceptions import SetupError, WebError
from tidy3d.web import common
from tidy3d.web.api.asynchronous import run_async
from tidy3d.web.api.container import Batch, Job, WebContainer
@@ -41,6 +42,7 @@
estimate_cost,
get_info,
get_run_info,
get_status,
get_tasks,
load,
load_simulation,
@@ -51,6 +53,7 @@
)
from tidy3d.web.core.environment import Env
from tidy3d.web.core.exceptions import WebNotFoundError
from tidy3d.web.core.task_info import TaskInfo
from tidy3d.web.core.types import PayType, TaskType

TASK_NAME = "task_name_test"
@@ -272,7 +275,7 @@ def mock_monitor(monkeypatch):
status_count = [0]
statuses = ("upload", "running", "running", "running", "running", "running", "success")

def mock_get_status(task_id):
def mock_get_status(task_id, **_kwargs):
current_count = min(status_count[0], len(statuses) - 1)
current_status = statuses[current_count]
status_count[0] += 1
@@ -419,6 +422,61 @@ def test_get_run_info(mock_get_run_info, mock_get_info):
assert get_run_info(TASK_ID) == (100, 0)


def test_get_status_grace_period_recovers(monkeypatch):
statuses = iter(["run_error", "run_error", "running"])

def mock_get_info(task_id):
status = next(statuses, "running")
return TaskInfo(taskId=task_id, status=status, taskType=TaskType.MODE.name)

time_state = {"t": 0.0}

def fake_monotonic():
return time_state["t"]

def fake_sleep(seconds):
time_state["t"] += seconds

monkeypatch.setattr(f"{api_path}.TaskFactory.get", lambda *_args, **_kwargs: None)
monkeypatch.setattr(f"{api_path}.get_info", mock_get_info)
monkeypatch.setattr(f"{api_path}.REFRESH_TIME", 0.01)
monkeypatch.setattr(f"{api_path}.time.sleep", fake_sleep)
monkeypatch.setattr(f"{api_path}.time.monotonic", fake_monotonic)

assert get_status(TASK_ID, error_grace_period=0.05) == "running"


def test_get_status_grace_period_expires(monkeypatch):
statuses = iter(["run_error", "run_error", "run_error"])

def mock_get_info(task_id):
status = next(statuses, "run_error")
return TaskInfo(taskId=task_id, status=status, taskType=TaskType.MODE.name)

def mock_get_error_json(self, to_file, **_kwargs):
with open(to_file, "w", encoding="utf8") as handle:
json.dump({"msg": "boom"}, handle)
return Path(to_file)

time_state = {"t": 0.0}

def fake_monotonic():
return time_state["t"]

def fake_sleep(seconds):
time_state["t"] += seconds

monkeypatch.setattr(f"{api_path}.TaskFactory.get", lambda *_args, **_kwargs: None)
monkeypatch.setattr(f"{api_path}.get_info", mock_get_info)
monkeypatch.setattr(f"{api_path}.SimulationTask.get_error_json", mock_get_error_json)
monkeypatch.setattr(f"{api_path}.REFRESH_TIME", 0.01)
monkeypatch.setattr(f"{api_path}.time.sleep", fake_sleep)
monkeypatch.setattr(f"{api_path}.time.monotonic", fake_monotonic)

with pytest.raises(WebError, match="boom"):
get_status(TASK_ID, error_grace_period=0.02)


@responses.activate
def test_download(mock_download, tmp_path):
download(TASK_ID, str(tmp_path / "web_test_tmp.json"))
2 changes: 1 addition & 1 deletion tests/test_web/test_webapi_eme.py
@@ -151,7 +151,7 @@ def mock_monitor(monkeypatch):
status_count = [0]
statuses = ("upload", "running", "running", "running", "running", "running", "success")

def mock_get_status(task_id):
def mock_get_status(task_id, **_kwargs):
current_count = min(status_count[0], len(statuses) - 1)
current_status = statuses[current_count]
status_count[0] += 1
2 changes: 1 addition & 1 deletion tests/test_web/test_webapi_heat.py
@@ -148,7 +148,7 @@ def mock_monitor(monkeypatch):
status_count = [0]
statuses = ("upload", "running", "running", "running", "running", "running", "success")

def mock_get_status(task_id):
def mock_get_status(task_id, **_kwargs):
current_count = min(status_count[0], len(statuses) - 1)
current_status = statuses[current_count]
status_count[0] += 1
2 changes: 1 addition & 1 deletion tests/test_web/test_webapi_mode.py
@@ -184,7 +184,7 @@ def mock_monitor(monkeypatch):
status_count = [0]
statuses = ("upload", "running", "running", "running", "running", "running", "success")

def mock_get_status(task_id):
def mock_get_status(task_id, **_kwargs):
current_count = min(status_count[0], len(statuses) - 1)
current_status = statuses[current_count]
status_count[0] += 1
2 changes: 1 addition & 1 deletion tests/test_web/test_webapi_mode_sim.py
@@ -180,7 +180,7 @@ def mock_monitor(monkeypatch):
status_count = [0]
statuses = ("upload", "running", "running", "running", "running", "running", "success")

def mock_get_status(task_id):
def mock_get_status(task_id, **_kwargs):
current_count = min(status_count[0], len(statuses) - 1)
current_status = statuses[current_count]
status_count[0] += 1
2 changes: 1 addition & 1 deletion tidy3d/config/README.md
@@ -50,7 +50,7 @@ flowchart LR

## Module Reference

- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`.
- `sections.py` - Pydantic models for built-in sections (logging, simulation, microwave, adjoint, web, local cache, plugin container) registered via `register_section`. The bundled models inherit from the internal `ConfigSection` helper, but external code can use plain `BaseModel` subclasses. Optional handlers perform side effects. Fields mark persistence with `json_schema_extra={"persist": True}`. `web.monitor_error_grace_period` controls how long `web.monitor()` waits through transient error states before raising.
- `registry.py` - Stores section and handler registries and notifies the attached manager so new entries appear immediately.
- `manager.py` - `ConfigManager` caches validated models, tracks runtime overrides per profile, filters persisted fields, exposes helpers such as `plugins`, `profiles`, and `format`. `SectionAccessor` routes attribute access to `update_section`.
- `loader.py` - Resolves the config directory, loads `config.toml` and `profiles/<name>.toml`, parses environment overrides, and writes atomically through `serializer.build_document`.
9 changes: 9 additions & 0 deletions tidy3d/config/sections.py
@@ -363,6 +363,15 @@ class WebConfig(ConfigSection):
le=300,
)

monitor_error_grace_period: NonNegativeFloat = Field(
60.0,
Reviewer (Contributor): lower this default? what's more realistic and user-friendly - 20-30s?
title="Monitor error grace period",
description=(
"Seconds to wait out transient error statuses during web.monitor() "
"before raising an error."
),
)

ssl_version: Optional[str] = Field(
None,
title="SSL/TLS version",
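
The monitor path reads this field as `config.web.monitor_error_grace_period` (see `webapi.py` below). A minimal sketch of adjusting it at runtime, assuming the top-level `tidy3d.config` accessor described in `tidy3d/config/README.md`; the exact import path is an assumption, not part of this diff:

```python
# Minimal sketch (import path assumed): shorten the grace period so that
# web.monitor() tolerates transient backend error states for 30 s instead of
# the default 60 s. Setting it to 0.0 restores the previous fail-fast behavior.
from tidy3d.config import config  # assumed accessor; webapi.py reads config.web.*

config.web.monitor_error_grace_period = 30.0
```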
65 changes: 46 additions & 19 deletions tidy3d/web/api/webapi.py
@@ -732,22 +732,50 @@ def _get_batch_detail_handle_error_status(batch: BatchTask) -> BatchDetail:
return detail


def get_status(task_id: TaskId) -> str:
def get_status(task_id: TaskId, *, error_grace_period: float = 0.0) -> str:
"""Get the status of a task. Raises an error if status is "error".

Parameters
----------
task_id : str
Unique identifier of task on server. Returned by :meth:`upload`.
error_grace_period : float = 0.0
Seconds to wait out transient error statuses before raising an error.
"""

def _wait_out_error(fetch_status: Callable[[], str], raw_status: str | None) -> str | None:
if error_grace_period <= 0:
return raw_status
deadline = time.monotonic() + error_grace_period
status = (raw_status or "").lower()
while status in ERROR_STATES and time.monotonic() < deadline:
time.sleep(REFRESH_TIME)
raw_status = fetch_status()
status = (raw_status or "").lower()
return raw_status

task = TaskFactory.get(task_id)
if isinstance(task, BatchTask):
return _get_batch_detail_handle_error_status(task).status
detail = task.detail()
raw_status = detail.status
status = (raw_status or "").lower()
if status in ERROR_STATES:
raw_status = _wait_out_error(lambda: task.detail().status, raw_status)
status = (raw_status or "").lower()
if status in ERROR_STATES:
_batch_detail_error(task.task_id)
return raw_status
else:
task_info = get_info(task_id)
status = task_info.status
raw_status = task_info.status
status = (raw_status or "").lower()
if status == "visualize":
return "success"
if status in ERROR_STATES:
raw_status = _wait_out_error(lambda: get_info(task_id).status, raw_status)
status = (raw_status or "").lower()
if status == "visualize":
return "success"
if status in ERROR_STATES:
try:
# Try to obtain the error message
@@ -762,7 +790,7 @@ def get_status(task_id: TaskId) -> str:
error_msg = "Error message could not be obtained, please contact customer support."

raise WebError(f"Error running task {task_id}! {error_msg}")
return status
return raw_status
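
Callers that poll status directly rather than through `monitor()` can opt into the same tolerance by passing the new keyword. An illustrative call with a placeholder task id:

```python
# Illustrative only: tolerate transient error states (e.g. "run_error") for up
# to 30 s; if the task is still in an error state afterwards, get_status raises
# WebError with the task's error message.
from tidy3d.web.api.webapi import get_status

status = get_status("<task-id>", error_grace_period=30.0)
```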


def monitor(task_id: TaskId, verbose: bool = True, worker_group: Optional[str] = None) -> None:
@@ -823,18 +851,21 @@ def get_estimated_cost() -> float:
est_flex_unit = task_info.estFlexUnit
return est_flex_unit

def _get_status() -> str:
return get_status(task_id, error_grace_period=config.web.monitor_error_grace_period)

def monitor_preprocess() -> None:
"""Periodically check the status."""
status = get_status(task_id)
status = _get_status()
while status not in END_STATES and status != "running":
new_status = get_status(task_id)
new_status = _get_status()
if new_status != status:
status = new_status
if verbose and status != "running":
console.log(f"status = {status}")
time.sleep(REFRESH_TIME)

status = get_status(task_id)
status = _get_status()

if verbose:
console.log(f"status = {status}")
@@ -861,7 +892,7 @@ def monitor_preprocess() -> None:
console.log("starting up solver")

# while running but before the percentage done is available, keep waiting
while get_run_info(task_id)[0] is None and get_status(task_id) == "running":
while get_run_info(task_id)[0] is None and _get_status() == "running":
Reviewer (Contributor): should the grace period also apply to intermediate hiccups of get_run_info?
time.sleep(REFRESH_TIME)

# while running but percentage done is available
@@ -873,9 +904,7 @@ def monitor_preprocess() -> None:
pbar_pd = progress.add_task("% done", total=100)
perc_done, _ = get_run_info(task_id)

while (
perc_done is not None and perc_done < 100 and get_status(task_id) == "running"
):
while perc_done is not None and perc_done < 100 and _get_status() == "running":
perc_done, field_decay = get_run_info(task_id)
new_description = f"solver progress (field decay = {field_decay:.2e})"
progress.update(pbar_pd, completed=perc_done, description=new_description)
@@ -892,9 +921,7 @@ def monitor_preprocess() -> None:
pbar_pd = progress.add_task("% done", total=100)
perc_done, _ = get_run_info(task_id)

while (
perc_done is not None and perc_done < 100 and get_status(task_id) == "running"
):
while perc_done is not None and perc_done < 100 and _get_status() == "running":
perc_done, _ = get_run_info(task_id)
new_description = "solver progress"
progress.update(pbar_pd, completed=perc_done, description=new_description)
@@ -904,26 +931,26 @@ def monitor_preprocess() -> None:
new_description = "solver progress"
progress.update(pbar_pd, completed=100, refresh=True, description=new_description)
else:
while get_status(task_id) == "running":
while _get_status() == "running":
perc_done, _ = get_run_info(task_id)
time.sleep(RUN_REFRESH_TIME)

else:
# non-verbose case, just keep checking until status is not running or perc_done >= 100
perc_done, _ = get_run_info(task_id)
while perc_done is not None and perc_done < 100 and get_status(task_id) == "running":
while perc_done is not None and perc_done < 100 and _get_status() == "running":
perc_done, field_decay = get_run_info(task_id)
time.sleep(RUN_REFRESH_TIME)

# post processing
if verbose:
status = get_status(task_id)
status = _get_status()
if status != "running":
console.log(f"status = {status}")

with console.status(f"[bold green]Finishing '{task_name}'...", spinner="runner"):
while status not in END_STATES:
new_status = get_status(task_id)
new_status = _get_status()
if new_status != status:
status = new_status
console.log(f"status = {status}")
Expand All @@ -933,7 +960,7 @@ def monitor_preprocess() -> None:
url = _get_url(task_id)
console.log(f"View simulation result at [blue underline][link={url}]'{url}'[/link].")
else:
while get_status(task_id) not in END_STATES:
while _get_status() not in END_STATES:
time.sleep(REFRESH_TIME)

