
Commit 8311c92

Merge branch 'main' into spolisetty/tri-231-backend-endpoint-reports-to-be-healthy-when-using
2 parents: 25072c1 + dcf92e1

File tree: 12 files changed (+1142, -825 lines)

build.py

Lines changed: 16 additions & 18 deletions
@@ -77,7 +77,7 @@
     "ort_version": "1.23.2",
     "ort_openvino_version": "2025.3.0",
     "standalone_openvino_version": "2025.3.0",
-    "dcgm_version": "4.4.0-1",
+    "dcgm_version": "4.4.2-1",
     "vllm_version": "0.11.0",
     "rhel_py_version": "3.12.3",
 }
@@ -862,9 +862,10 @@ def install_dcgm_libraries(dcgm_version, target_machine):
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
 RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\
     && dnf clean expire-cache \\
+    && dnf makecache --refresh \\
     && dnf install --assumeyes \\
-        datacenter-gpu-manager-4-core=1:{} \\
-        datacenter-gpu-manager-4-devel=1:{}
+        datacenter-gpu-manager-4-core-1:{} \\
+        datacenter-gpu-manager-4-devel-1:{}
 """.format(
     dcgm_version, dcgm_version, dcgm_version
 )
@@ -874,9 +875,10 @@ def install_dcgm_libraries(dcgm_version, target_machine):
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
 RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
     && dnf clean expire-cache \\
+    && dnf makecache --refresh \\
     && dnf install --assumeyes \\
-        datacenter-gpu-manager-4-core=1:{} \\
-        datacenter-gpu-manager-4-devel=1:{}
+        datacenter-gpu-manager-4-core-1:{} \\
+        datacenter-gpu-manager-4-devel-1:{}
 """.format(
     dcgm_version, dcgm_version, dcgm_version
 )
@@ -889,7 +891,7 @@ def install_dcgm_libraries(dcgm_version, target_machine):
         https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \\
     && apt install /tmp/cuda-keyring.deb \\
     && rm /tmp/cuda-keyring.deb \\
-    && apt update \\
+    && apt update -qq \\
     && apt install --yes --no-install-recommends \\
         datacenter-gpu-manager-4-core=1:{} \\
         datacenter-gpu-manager-4-dev=1:{}
@@ -904,7 +906,7 @@ def install_dcgm_libraries(dcgm_version, target_machine):
         https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb \\
     && apt install /tmp/cuda-keyring.deb \\
     && rm /tmp/cuda-keyring.deb \\
-    && apt update \\
+    && apt update -qq \\
     && apt install --yes --no-install-recommends \\
         datacenter-gpu-manager-4-core=1:{} \\
         datacenter-gpu-manager-4-dev=1:{}
@@ -1517,18 +1519,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
 ENV PYTHONPATH=/opt/tritonserver/backends/dali/wheel/dali:$PYTHONPATH
 """
 
-    if target_platform() not in ["igpu", "windows", "rhel"]:
+    if target_platform() == "rhel":
         repo_arch = "sbsa" if target_machine == "aarch64" else "x86_64"
-        df += f"""
-RUN curl -o /tmp/cuda-keyring.deb \\
-        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/{repo_arch}/cuda-keyring_1.1-1_all.deb \\
-    && apt install /tmp/cuda-keyring.deb \\
-    && rm /tmp/cuda-keyring.deb \\
-    && apt update -qq \\
-    && apt install --yes --no-install-recommends libnvshmem3-cuda-13 \\
-    && rm -rf /var/lib/apt/lists/* \\
-    && dpkg -L libnvshmem3-cuda-13 | grep libnvshmem_host.so | sed -e 's/libnvshmem_host.*//g' | sort -u > /etc/ld.so.conf.d/libnvshmem3-cuda-13.conf \\
-    && ldconfig
+        df += """
+RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/{repo_arch}/cuda-rhel8.repo \\
+    && dnf clean expire-cache \\
+    && dnf install --assumeyes libnvshmem3-cuda-13
+
+RUN dirname $(find /usr -name "libcudart*.so" -o -name "libnvinf*.so" -o -name "libnvshm*" -type f) | sort -u > /etc/ld.so.conf.d/triton-cuda-libs.conf && ldconfig
 """.format(
             repo_arch=repo_arch
         )
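
For reference, here is a minimal standalone sketch (the variable names are mine; this is not part of build.py) that renders the updated RHEL DCGM fragment with the bumped version. It highlights that dnf uses the <name>-<epoch>:<version> spec (datacenter-gpu-manager-4-core-1:4.4.2-1), while the Ubuntu branches above keep apt's "=1:<version>" pin.

# Standalone sketch: render the updated RHEL DCGM install fragment.
dcgm_version = "4.4.2-1"  # value bumped in this commit

fragment = """
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
    && dnf clean expire-cache \\
    && dnf makecache --refresh \\
    && dnf install --assumeyes \\
        datacenter-gpu-manager-4-core-1:{} \\
        datacenter-gpu-manager-4-devel-1:{}
""".format(dcgm_version, dcgm_version)

print(fragment)
# dnf resolves "datacenter-gpu-manager-4-core-1:4.4.2-1" as <name>-<epoch>:<version>;
# apt keeps the "=1:<version>" pin seen in the Ubuntu hunks above.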

docs/user_guide/model_configuration.md

Lines changed: 204 additions & 395 deletions
Large diffs are not rendered by default.

docs/user_guide/model_management.md

Lines changed: 111 additions & 193 deletions
Large diffs are not rendered by default.

docs/user_guide/model_repository.md

Lines changed: 89 additions & 190 deletions
Large diffs are not rendered by default.

python/openai/openai_frontend/engine/triton_engine.py

Lines changed: 73 additions & 8 deletions
@@ -57,6 +57,8 @@
     _create_trtllm_generate_request,
     _create_vllm_embedding_request,
     _create_vllm_generate_request,
+    _get_openai_chat_format_logprobs_from_vllm_response,
+    _get_openai_completion_format_logprobs_from_vllm_response,
     _get_output,
     _get_usage_from_response,
     _get_vllm_lora_names,
@@ -66,6 +68,7 @@
 from schemas.openai import (
     ChatCompletionChoice,
     ChatCompletionFinishReason,
+    ChatCompletionLogprobs,
     ChatCompletionMessageToolCall,
     ChatCompletionMessageToolCallChunk,
     ChatCompletionNamedToolChoice,
@@ -255,13 +258,22 @@ async def chat(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs:
+            openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                response
+            )
+            if openai_logprobs:
+                logprobs_data = ChatCompletionLogprobs(content=openai_logprobs)
+
         return CreateChatCompletionResponse(
             id=request_id,
             choices=[
                 ChatCompletionChoice(
                     index=0,
                     message=response_message,
-                    logprobs=None,
+                    logprobs=logprobs_data,
                     finish_reason=finish_reason,
                 )
             ],
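
A hypothetical client-side sketch of how this new chat logprobs path is exercised; the server URL, port, and model name are assumptions, and the response is populated only when the model runs on the vLLM backend. Streaming requests (stream=True) attach the same per-chunk logprobs to each delta, as the _streaming_chat_iterator hunk further down shows.

# Client sketch using the official openai Python package (assumed endpoint/model).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

completion = client.chat.completions.create(
    model="llama-3.1-8b-instruct",  # hypothetical vLLM-backed model name
    messages=[{"role": "user", "content": "What is Triton Inference Server?"}],
    logprobs=True,    # now parsed into ChatCompletionLogprobs instead of None
    top_logprobs=2,   # only valid together with logprobs=True (see _validate_chat_request)
    max_tokens=16,
)

choice = completion.choices[0]
if choice.logprobs:  # None for non-vLLM backends
    for token_info in choice.logprobs.content:
        print(token_info.token, token_info.logprob)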
@@ -360,10 +372,17 @@ async def completion(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs is not None and request.logprobs > 0:
+            logprobs_data = _get_openai_completion_format_logprobs_from_vllm_response(
+                response
+            )
+
         choice = Choice(
             finish_reason=FinishReason.stop,
             index=0,
-            logprobs=None,
+            logprobs=logprobs_data,
             text=text,
         )
         return CreateCompletionResponse(
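
The legacy /v1/completions path uses an integer logprobs count rather than a boolean. A hypothetical sketch (endpoint and model name are assumptions) of what the new response shape exposes:

# Client sketch for the completions endpoint (assumed endpoint/model).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

response = client.completions.create(
    model="llama-3.1-8b-instruct",  # hypothetical vLLM-backed model name
    prompt="Triton Inference Server is",
    logprobs=3,      # request top-3 logprobs per sampled token
    max_tokens=8,
)

lp = response.choices[0].logprobs  # None unless the backend is vLLM
if lp:
    print(lp.tokens)
    print(lp.token_logprobs)
    print(lp.text_offset)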
@@ -605,6 +624,15 @@ async def _streaming_chat_iterator(
             )
             previous_text = current_text
 
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs:
+                openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                    response
+                )
+                if openai_logprobs:
+                    chunk_logprobs = ChatCompletionLogprobs(content=openai_logprobs)
+
             # if the response delta is None (e.g. because it was a
             # "control token" for tool calls or the parser otherwise
             # wasn't ready to send a token, then
@@ -618,7 +646,7 @@ async def _streaming_chat_iterator(
             choice = ChatCompletionStreamingResponseChoice(
                 index=0,
                 delta=response_delta,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 finish_reason=finish_reason,
             )
 
@@ -791,8 +819,19 @@ def _validate_chat_request(
                 f"Received n={request.n}, but only single choice (n=1) is currently supported"
             )
 
-        if request.logit_bias is not None or request.logprobs:
-            raise ClientError("logit bias and log probs not currently supported")
+        if request.logit_bias is not None:
+            raise ClientError("logit bias is not currently supported")
+
+        # Logprobs are only supported for vLLM backend currently
+        if metadata.backend != "vllm" and (
+            request.logprobs is not None or request.top_logprobs is not None
+        ):
+            raise ClientError(
+                "logprobs are currently available only for the vLLM backend"
+            )
+
+        if request.top_logprobs is not None and not request.logprobs:
+            raise ClientError("`top_logprobs` can only be used when `logprobs` is True")
 
         self._verify_chat_tool_call_settings(request=request)
 
@@ -847,16 +886,32 @@ async def _streaming_completion_iterator(
         model = request.model
         include_usage = request.stream_options and request.stream_options.include_usage
         usage_accumulator = _StreamingUsageAccumulator(backend)
+        current_offset = 0
 
         async for response in responses:
             if include_usage:
                 usage_accumulator.update(response)
 
             text = _get_output(response)
+
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs is not None and request.logprobs > 0:
+                chunk_logprobs = (
+                    _get_openai_completion_format_logprobs_from_vllm_response(response)
+                )
+                # Adjust text offsets based on accumulated output
+                if chunk_logprobs and chunk_logprobs.text_offset:
+                    chunk_logprobs.text_offset = [
+                        offset + current_offset for offset in chunk_logprobs.text_offset
+                    ]
+
+            current_offset += len(text)
+
             choice = Choice(
                 finish_reason=FinishReason.stop if response.final else None,
                 index=0,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 text=text,
            )
             chunk = CreateCompletionResponse(
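
The offset bookkeeping above exists because each streamed chunk reports text_offset values relative to its own text; shifting them by the length of everything streamed so far yields offsets into the full concatenated completion. A self-contained sketch with toy data (no Triton imports) of that accumulation:

# Toy illustration of the current_offset logic in _streaming_completion_iterator.
chunks = [
    {"text": "Hello", "text_offset": [0]},
    {"text": " world", "text_offset": [0]},
    {"text": "!", "text_offset": [0]},
]

current_offset = 0
absolute_offsets = []
for chunk in chunks:
    # Shift chunk-local offsets into the coordinate space of the full output.
    absolute_offsets.extend(off + current_offset for off in chunk["text_offset"])
    current_offset += len(chunk["text"])

print(absolute_offsets)  # [0, 5, 11] -- offsets into "Hello world!"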
@@ -942,8 +997,18 @@ def _validate_completion_request(
                 f"Received best_of={request.best_of}, but only single choice (best_of=1) is currently supported"
            )
 
-        if request.logit_bias is not None or request.logprobs is not None:
-            raise ClientError("logit bias and log probs not supported")
+        if request.logit_bias is not None:
+            raise ClientError("logit bias is not supported")
+
+        # Logprobs are only supported for vLLM backend currently
+        if (
+            request.logprobs is not None
+            and request.logprobs > 0
+            and metadata.backend != "vllm"
+        ):
+            raise ClientError(
+                "logprobs are currently available only for the vLLM backend"
+            )
 
         if request.stream_options and not request.stream:
             raise ClientError("`stream_options` can only be used when `stream` is True")
