Features/scheduler refactor cursor #283

Open · wants to merge 4 commits into base: features/scheduler_refactor
3 changes: 3 additions & 0 deletions .gitignore
@@ -230,3 +230,6 @@ src/ui/next-env.d.ts
!src/ui/public/manifest.json
!src/ui/serve.json
.eslintcache

# vllm-sim
bin/
17 changes: 17 additions & 0 deletions src/guidellm/__main__.py
@@ -180,6 +180,17 @@ def benchmark():
"If None, will run until max_seconds or the data is exhausted."
),
)
# TODO: Review Cursor generated code (start)
@click.option(
"--max-error-rate",
type=float,
default=None,
help=(
"The maximum error rate allowed (0.0 to 1.0) before stopping the benchmark. "
"If None, no error rate constraint will be applied."
),
)
# TODO: Review Cursor generated code (end)
@click.option(
"--warmup-percent",
type=float,
@@ -261,6 +272,9 @@ def run(
    rate,
    max_seconds,
    max_requests,
    # TODO: Review Cursor generated code (start)
    max_error_rate,
    # TODO: Review Cursor generated code (end)
    warmup_percent,
    cooldown_percent,
    disable_progress,
@@ -288,6 +302,9 @@ def run(
        rate=rate,
        max_seconds=max_seconds,
        max_requests=max_requests,
        # TODO: Review Cursor generated code (start)
        max_error_rate=max_error_rate,
        # TODO: Review Cursor generated code (end)
        warmup_percent=warmup_percent,
        cooldown_percent=cooldown_percent,
        output_sampling=output_sampling,
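For reviewers: a minimal sketch of the semantics the new `--max-error-rate` option implies, i.e. stop the benchmark once the running error rate exceeds the threshold. `ErrorRateGuard`, `min_samples`, and the `record`/`should_stop` API below are illustrative assumptions, not the actual guidellm scheduler interface:

```python
# Hypothetical sketch only: ErrorRateGuard is not part of guidellm; it
# illustrates --max-error-rate semantics (0.0 to 1.0, None = disabled).
from dataclasses import dataclass
from typing import Optional


@dataclass
class ErrorRateGuard:
    max_error_rate: Optional[float] = None  # mirrors the CLI option
    min_samples: int = 10  # assumed floor so one early failure cannot trip it
    total: int = 0
    errored: int = 0

    def record(self, success: bool) -> None:
        self.total += 1
        if not success:
            self.errored += 1

    def should_stop(self) -> bool:
        if self.max_error_rate is None or self.total < max(1, self.min_samples):
            return False
        return (self.errored / self.total) > self.max_error_rate
```

Under this sketch, a scheduler loop would call `guard.record(...)` after each completed request and stop issuing new requests once `guard.should_stop()` returns True.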
41 changes: 41 additions & 0 deletions src/guidellm/backend/openai.py
@@ -354,12 +354,53 @@ async def resolve(
        request_info.request_timings.request_end = time.time()
        response.request_output_tokens = usage_stats.output_tokens
        response.request_prompt_tokens = usage_stats.prompt_tokens
        # TODO: Review Cursor generated code (start)
        logger.debug(
            f"OpenAI Backend: Got usage_stats - "
            f"prompt_tokens={usage_stats.prompt_tokens}, "
            f"output_tokens={usage_stats.output_tokens}"
        )
        # TODO: Review Cursor generated code (end)

        # TODO: Review Cursor generated code (start)
        # Debug what we're actually yielding. Note: logger is already used
        # above, so this local import is redundant if loguru's logger is
        # imported at module level.
        from loguru import logger
        # TODO: Review Cursor generated code (end)

        # TODO: Review Cursor generated code (start)
        logger.debug("OpenAI Backend: About to yield response, request_info")
        logger.debug(
            f"OpenAI Backend: request_info.request_timings id: "
            f"{id(request_info.request_timings)}"
        )
        if request_info.request_timings:
            logger.debug(
                f"OpenAI Backend: Yielding with "
                f"first_iteration={request_info.request_timings.first_iteration}, "
                f"last_iteration={request_info.request_timings.last_iteration}"
            )
        else:
            logger.debug("OpenAI Backend: Yielding with request_timings=None")
        # TODO: Review Cursor generated code (end)

        yield response, request_info

        if request_info.request_timings.request_end is None:
            request_info.request_timings.request_end = time.time()
        response.delta = None

        # TODO: Review Cursor generated code (start)
        # Debug final yield (redundant re-import: logger is already in scope)
        from loguru import logger
        # TODO: Review Cursor generated code (end)

        # TODO: Review Cursor generated code (start)
        logger.debug(
            f"OpenAI Backend: Final yield - request_info.request_timings id: "
            f"{id(request_info.request_timings)}"
        )
        if request_info.request_timings:
            logger.debug(
                f"OpenAI Backend: Final yield with "
                f"first_iteration={request_info.request_timings.first_iteration}, "
                f"last_iteration={request_info.request_timings.last_iteration}"
            )
        else:
            logger.debug("OpenAI Backend: Final yield with request_timings=None")
        # TODO: Review Cursor generated code (end)

        yield response, request_info

    async def text_completions(
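Reading the hunk above: `resolve()` yields `(response, request_info)` during streaming (with `response.delta` set) and then a final time with `delta=None` after `request_end` is stamped, and the added debug lines trace whether `request_timings` keeps the same object identity across those yields. A minimal consumer sketch, assuming only the names visible in the diff (the `resolve()` call signature and the loop body are illustrative):

```python
# Hypothetical consumer of the resolve() async generator; only
# response.delta, request_info.request_timings, and request_end are
# taken from the diff above -- the call signature is assumed.
async def consume(backend, request) -> None:
    async for response, request_info in backend.resolve(request):
        if response.delta is not None:
            # Intermediate yield: a streamed chunk arrived.
            print("chunk:", response.delta)
        else:
            # Final yield: delta is None and request_end has been stamped.
            timings = request_info.request_timings
            if timings is not None:
                print("request_end:", timings.request_end)
```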