Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DEPENDENCIES.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
| `httpx` | `>=0, <1` | The next generation HTTP client. | [https://pypi.org/project/httpx/](https://pypi.org/project/httpx/) |
| `pandas` | `>=2, <3` | Powerful data structures for data analysis, time series, and statistics | [https://pandas.pydata.org](https://pandas.pydata.org) |
| `nest-asyncio2` | `>=1.6, <2.0` | Patch asyncio to allow nested event loops | [https://github.com/Chaoses-Ib/nest-asyncio2](https://github.com/Chaoses-Ib/nest-asyncio2) |
| `rich` | `>=13.6, <14.0` | Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal | [https://github.com/Textualize/rich](https://github.com/Textualize/rich) |
| `rich` | `>=13.6, <14.0` | Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal | [https://pypi.org/project/rich/](https://pypi.org/project/rich/) |
| `pytket` | `>=2.3.1, <3.0` | Quantum computing toolkit and interface to the TKET compiler | [https://pypi.org/project/pytket/](https://pypi.org/project/pytket/) |
| `websockets` | `>11, <16` | An implementation of the WebSocket Protocol (RFC 6455 & 7692) | [https://pypi.org/project/websockets/](https://pypi.org/project/websockets/) |
| `pydantic-settings` | `>=2, <3.0` | Settings management using Pydantic | [https://pypi.org/project/pydantic-settings/](https://pypi.org/project/pydantic-settings/) |
Expand Down
59 changes: 59 additions & 0 deletions integration/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import qnexus as qnx
import qnexus.exceptions as qnx_exc
from qnexus.client.jobs import WaitStrategy
from qnexus.models.job_status import JobStatusEnum
from qnexus.models.references import (
CircuitRef,
Expand Down Expand Up @@ -708,3 +709,61 @@ def test_job_cost_confidence(
cost_confidence = qnx.jobs.cost_confidence(execute_job_ref)
assert isinstance(cost_confidence, list)
assert len(cost_confidence) > 0


def test_wait_for_with_polling_strategy(
    create_execute_job_in_project: Callable[..., ContextManager[ExecuteJobRef]],
    test_circuit: Circuit,
) -> None:
    """Wait on an execute job with ``WaitStrategy.POLLING`` and check its results.

    The job is expected to reach COMPLETED within the 120 second overall
    timeout and to expose exactly one execution result reference.
    """

    job_context = create_execute_job_in_project(
        project_name=project_name,
        job_name=execute_job_name,
        circuit=test_circuit,
        circuit_name=circuit_name,
    )
    with job_context as execute_job_ref:
        assert isinstance(execute_job_ref, ExecuteJobRef)

        # Block until the job finishes, using the polling strategy only.
        final_status = qnx.jobs.wait_for(
            execute_job_ref,
            strategy=WaitStrategy.POLLING,
            timeout=120.0,
        )
        assert final_status.status == JobStatusEnum.COMPLETED

        # A single-circuit job should yield a single result reference.
        result_refs = qnx.jobs.results(execute_job_ref)
        assert len(result_refs) == 1
        first_result = result_refs[0]
        assert isinstance(first_result, ExecutionResultRef)


def test_wait_for_with_auto_strategy(
    create_execute_job_in_project: Callable[..., ContextManager[ExecuteJobRef]],
    test_circuit: Circuit,
) -> None:
    """Wait on an execute job with ``WaitStrategy.AUTO`` and check its results.

    AUTO is the hybrid strategy: it listens on the websocket first and
    switches to polling once ``websocket_timeout`` elapses.
    """

    job_context = create_execute_job_in_project(
        project_name=project_name,
        job_name=execute_job_name,
        circuit=test_circuit,
        circuit_name=circuit_name,
    )
    with job_context as execute_job_ref:
        assert isinstance(execute_job_ref, ExecuteJobRef)

        # A short websocket_timeout is used so the polling fallback can be hit,
        # although the job will likely complete before that timeout expires.
        final_status = qnx.jobs.wait_for(
            execute_job_ref,
            strategy=WaitStrategy.AUTO,
            websocket_timeout=5.0,
            timeout=120.0,
        )
        assert final_status.status == JobStatusEnum.COMPLETED

        # A single-circuit job should yield a single result reference.
        result_refs = qnx.jobs.results(execute_job_ref)
        assert len(result_refs) == 1
        first_result = result_refs[0]
        assert isinstance(first_result, ExecutionResultRef)
16 changes: 16 additions & 0 deletions qnexus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""The qnexus package."""

import logging
import warnings

import nest_asyncio2 # type: ignore
Expand Down Expand Up @@ -39,9 +40,24 @@
from qnexus.client.jobs import compile, execute
from qnexus.client.jobs._compile import start_compile_job
from qnexus.client.jobs._execute import start_execute_job
from qnexus.config import CONFIG

warnings.filterwarnings("default", category=DeprecationWarning, module=__name__)

# Configure library logging: silent by default, let applications configure handlers.
# The NullHandler gives the "qnexus" logger a no-op handler, so its records are
# not routed to logging's last-resort stderr handler when the application has
# not configured logging itself.
logging.getLogger("qnexus").addHandler(logging.NullHandler())
# Convenience logger, can be enabled via CONFIG.log_level: when a level is set,
# a stderr StreamHandler with a timestamped format is attached to the "qnexus"
# logger and the logger's level is set to that value.
if CONFIG.log_level is not None:
    import sys

    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    )
    qnexus_logger = logging.getLogger("qnexus")
    qnexus_logger.addHandler(handler)
    qnexus_logger.setLevel(CONFIG.log_level)

# This is necessary for use in Jupyter notebooks to allow for nested asyncio loops
try:
nest_asyncio2.apply()
Expand Down
2 changes: 1 addition & 1 deletion qnexus/client/hugr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
uploaded to Nexus before stability is achieved might not work in the future.
"""

import warnings
import base64
import warnings
from datetime import datetime
from typing import Any, Literal, Union, cast
from uuid import UUID
Expand Down
197 changes: 183 additions & 14 deletions qnexus/client/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import asyncio
import json
import logging
import ssl
from datetime import datetime, timezone
from enum import Enum
Expand Down Expand Up @@ -60,10 +61,11 @@
SystemRef,
WasmModuleRef,
)

from qnexus.models.scope import ScopeFilterEnum
from qnexus.models.utils import assert_never

logger = logging.getLogger(__name__)

EPOCH_START = datetime(1970, 1, 1, tzinfo=timezone.utc)


Expand All @@ -80,6 +82,22 @@ class RemoteRetryStrategy(str, Enum):
FULL_RESTART = "FULL_RESTART"


class WaitStrategy(str, Enum):
    """Strategy for waiting on job completion.

    WEBSOCKET: Use a websocket connection for real-time updates.
        Best for short-running jobs (<10 minutes).
    POLLING: Use exponential backoff polling.
        More robust for long-running jobs.
    AUTO: Start with websocket, then fall back to polling once the
        websocket phase times out. The length of that phase is set by the
        ``websocket_timeout`` argument (600 seconds, i.e. 10 minutes, by
        default). Recommended for most use cases.
    """

    WEBSOCKET = "websocket"
    POLLING = "polling"
    AUTO = "auto"


class Params(
CreatorFilter,
PropertiesFilter,
Expand Down Expand Up @@ -338,19 +356,168 @@ def _fetch_by_id(
)


def wait_for(
async def poll_job_status(
job: JobRef,
wait_for_status: JobStatusEnum = JobStatusEnum.COMPLETED,
initial_interval: float = 1.0,
max_interval_queued: float = 1200.0,
max_interval_running: float = 180.0,
backoff_factor: float = 2.0,
Copy link
Copy Markdown
Contributor

@quantinuum-richard-morrison quantinuum-richard-morrison Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(related to my other comment about using subclasses for different strategies)

The implementation of wait_for gives the user no way to choose different values for these params. They have to pass in an enum member and the code in wait_for picks a function, calling it with just the required args, so these'll always get defaults.

With a class-based approach, it could be:

class PollingStrategy(BaseStrategy):
    ...
    backoff_factor: float = 2.0
    ...

and then a user could (if they chose) do:

class MyCustomPollingStrategy(PollingStrategy):
    backoff_factor: float = 1.5

and they could pass that to wait_for

) -> JobStatus:
"""Poll job status with exponential backoff and adaptive intervals.

Uses different maximum poll intervals based on job state:
- QUEUED: Polls less frequently (default 20 min) since queue position changes slowly
- RUNNING/SUBMITTED: Polls more frequently (default 3 min) for responsiveness
Comment on lines +397 to +398
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great idea


Args:
job: The job to monitor.
wait_for_status: The status to wait for.
initial_interval: Starting poll interval in seconds.
max_interval_queued: Maximum poll interval when job is queued (default: 1200s).
max_interval_running: Maximum poll interval when job is running (default: 180s).
backoff_factor: Multiplier for interval after each poll.

Returns:
The final JobStatus when the target status is reached or job terminates.
"""
interval = initial_interval
logger.debug(
"Starting polling for job %s (target: %s, interval: %.1fs, "
"max queued: %.1fs, max running: %.1fs)",
job.id,
wait_for_status.value,
initial_interval,
max_interval_queued,
max_interval_running,
)

while True:
job_status = status(job)

# Adapt max interval based on job state
if job_status.status == JobStatusEnum.QUEUED:
max_interval = max_interval_queued
else:
max_interval = max_interval_running

# Clamp interval to current max (allows faster polling when transitioning
# from QUEUED to RUNNING)
interval = min(interval, max_interval)

logger.debug(
"Job %s status: %s (next poll in %.1fs, max: %.1fs)",
job.id,
job_status.status.value,
interval,
max_interval,
)

if (
job_status.status not in WAITING_STATUS
or job_status.status == wait_for_status
):
logger.debug("Job %s reached status: %s", job.id, job_status.status.value)
return job_status

await asyncio.sleep(interval)
interval = min(interval * backoff_factor, max_interval)


async def hybrid_wait(
    job: JobRef,
    wait_for_status: JobStatusEnum = JobStatusEnum.COMPLETED,
    websocket_timeout: float = 600.0,
) -> JobStatus:
    """Use websocket for initial period, then fall back to polling.

    The websocket listener is awaited for at most ``websocket_timeout``
    seconds. If it returns within that window its status is returned
    directly; if it times out, monitoring continues with
    ``poll_job_status`` using that function's default intervals.

    Args:
        job: The job to monitor.
        wait_for_status: The status to wait for.
        websocket_timeout: How long (in seconds) to use the websocket
            before switching to polling.

    Returns:
        The JobStatus returned by whichever phase finishes: the websocket
        listener, or the polling loop after the websocket phase timed out.
    """
    logger.debug(
        "Using hybrid strategy for job %s (websocket timeout: %.1fs)",
        job.id,
        websocket_timeout,
    )
    try:
        # Try websocket first with a timeout
        return await asyncio.wait_for(
            listen_job_status(job=job, wait_for_status=wait_for_status),
            timeout=websocket_timeout,
        )
    except asyncio.TimeoutError:
        # Websocket phase timed out, switch to polling
        logger.debug(
            "Job %s: websocket timeout after %.1fs, switching to polling",
            job.id,
            websocket_timeout,
        )

    # Poll outside the except block so that an error raised while polling is
    # not reported as having occurred "during handling of" the TimeoutError.
    return await poll_job_status(job=job, wait_for_status=wait_for_status)


def wait_for(
job: JobRef,
wait_for_status: JobStatusEnum = JobStatusEnum.COMPLETED,
timeout: float | None = None,
strategy: WaitStrategy = WaitStrategy.AUTO,
websocket_timeout: float = 600.0,
) -> JobStatus:
"""Check job status until the job is complete (or a specified status).

Args:
job: The job to monitor.
wait_for_status: The status to wait for (default: COMPLETED).
timeout: Overall timeout in seconds. None for no timeout (default: None).
strategy: How to monitor the job:
- WEBSOCKET: Real-time updates via websocket. Best for short jobs (<10 minutes).
- POLLING: Exponential backoff polling. Robust for long jobs (>10 minutes).
- AUTO: Websocket first, then polling fallback (default).
Recommended for most use cases.
websocket_timeout: For AUTO strategy, how long to use websocket
before switching to polling (default: 600 seconds).

Returns:
The final JobStatus.

Raises:
JobError: If the job errors, is cancelled, depleted, or terminated
(unless that was the status being waited for).
asyncio.TimeoutError: If the overall timeout is exceeded.
"""
logger.debug(
"Waiting for job %s with strategy=%s, timeout=%s, target=%s",
job.id,
strategy.value,
timeout,
wait_for_status.value,
)

match strategy:
case WaitStrategy.WEBSOCKET:
coro = listen_job_status(job=job, wait_for_status=wait_for_status)
case WaitStrategy.POLLING:
coro = poll_job_status(job=job, wait_for_status=wait_for_status)
case WaitStrategy.AUTO:
coro = hybrid_wait(
job=job,
wait_for_status=wait_for_status,
websocket_timeout=websocket_timeout,
)
Copy link
Copy Markdown
Contributor

@quantinuum-richard-morrison quantinuum-richard-morrison Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of an enum, a case and three functions, these could be BaseStrategy (perhaps an ABC) and three subclasses: WebsocketStrategy, PollingStrategy and HybridStrategy. The advantages would be:

  • users could implement their own variations easily (if they wanted to customise)
  • HybridStrategy could be based on PollingStrategy with WebsocketStrategy pulled in as a mixin. Fewer lines of code (maybe) (not that that's a great metric). DRYer (hopefully)
  • arguably more idiomatic python, easier to maintain and reason about

case _:
assert_never(strategy)

if timeout is not None:
coro = asyncio.wait_for(coro, timeout=timeout)

job_status = asyncio.run(coro)
logger.info("Job %s finished with status: %s", job.id, job_status.status.value)

if (
job_status.status == JobStatusEnum.ERROR
and wait_for_status != JobStatusEnum.ERROR
Expand Down Expand Up @@ -387,7 +554,6 @@ def status(job: JobRef, scope: ScopeFilterEnum = ScopeFilterEnum.USER) -> JobSta
message=resp.text, status_code=resp.status_code
)
job_status = JobStatus.from_dict(resp.json())
# job.last_status = job_status.status
return job_status


Expand All @@ -397,7 +563,7 @@ async def listen_job_status(
"""Check the Status of a Job via a websocket connection.
Will use SSO tokens."""
job_status = status(job)
# logger.debug("Current job status: %s", job_status.status)
logger.debug("Job %s initial status: %s", job.id, job_status.status.value)
if job_status.status not in WAITING_STATUS or job_status.status == wait_for_status:
return job_status

Expand All @@ -418,17 +584,20 @@ def _process_exception(exc: Exception) -> Exception | None:
# TODO, this cookie will expire frequently
"Cookie": f"myqos_id={get_nexus_client().auth.cookies.get('myqos_id')}" # type: ignore
}
logger.debug("Job %s: opening websocket connection", job.id)
async for websocket in connect(
f"{CONFIG.websockets_url}/api/jobs/v1beta3/{job.id}/attributes/status/ws",
ssl=ssl_context,
additional_headers=additional_headers,
process_exception=_process_exception,
# logger=logger,
logger=logger,
):
try:
async for status_json in websocket:
# logger.debug("New status: %s", status_json)
job_status = JobStatus.from_dict(json.loads(status_json))
logger.debug(
"Job %s websocket update: %s", job.id, job_status.status.value
)

if (
job_status.status not in WAITING_STATUS
Expand All @@ -437,9 +606,9 @@ def _process_exception(exc: Exception) -> Exception | None:
break
break
except ConnectionClosed:
# logger.debug(
# "Websocket connection closed... attempting to reconnect..."
# )
logger.debug(
"Job %s: websocket connection closed, attempting to reconnect", job.id
)
continue
finally:
try:
Expand Down
Loading