
Commit 8311c92

Merge branch 'main' into spolisetty/tri-231-backend-endpoint-reports-to-be-healthy-when-using
2 parents: 25072c1 + dcf92e1

File tree: 12 files changed (+1142, -825 lines)

build.py

Lines changed: 16 additions & 18 deletions
@@ -77,7 +77,7 @@
     "ort_version": "1.23.2",
     "ort_openvino_version": "2025.3.0",
     "standalone_openvino_version": "2025.3.0",
-    "dcgm_version": "4.4.0-1",
+    "dcgm_version": "4.4.2-1",
     "vllm_version": "0.11.0",
     "rhel_py_version": "3.12.3",
 }
@@ -862,9 +862,10 @@ def install_dcgm_libraries(dcgm_version, target_machine):
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
 RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo \\
     && dnf clean expire-cache \\
+    && dnf makecache --refresh \\
     && dnf install --assumeyes \\
-        datacenter-gpu-manager-4-core=1:{} \\
-        datacenter-gpu-manager-4-devel=1:{}
+        datacenter-gpu-manager-4-core-1:{} \\
+        datacenter-gpu-manager-4-devel-1:{}
 """.format(
     dcgm_version, dcgm_version, dcgm_version
 )
@@ -874,9 +875,10 @@ def install_dcgm_libraries(dcgm_version, target_machine):
 # Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
 RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
     && dnf clean expire-cache \\
+    && dnf makecache --refresh \\
     && dnf install --assumeyes \\
-        datacenter-gpu-manager-4-core=1:{} \\
-        datacenter-gpu-manager-4-devel=1:{}
+        datacenter-gpu-manager-4-core-1:{} \\
+        datacenter-gpu-manager-4-devel-1:{}
 """.format(
     dcgm_version, dcgm_version, dcgm_version
 )
@@ -889,7 +891,7 @@ def install_dcgm_libraries(dcgm_version, target_machine):
         https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb \\
     && apt install /tmp/cuda-keyring.deb \\
     && rm /tmp/cuda-keyring.deb \\
-    && apt update \\
+    && apt update -qq \\
     && apt install --yes --no-install-recommends \\
         datacenter-gpu-manager-4-core=1:{} \\
         datacenter-gpu-manager-4-dev=1:{}
@@ -904,7 +906,7 @@ def install_dcgm_libraries(dcgm_version, target_machine):
         https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb \\
     && apt install /tmp/cuda-keyring.deb \\
     && rm /tmp/cuda-keyring.deb \\
-    && apt update \\
+    && apt update -qq \\
     && apt install --yes --no-install-recommends \\
         datacenter-gpu-manager-4-core=1:{} \\
         datacenter-gpu-manager-4-dev=1:{}
@@ -1517,18 +1519,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
 ENV PYTHONPATH=/opt/tritonserver/backends/dali/wheel/dali:$PYTHONPATH
 """
 
-    if target_platform() not in ["igpu", "windows", "rhel"]:
+    if target_platform() == "rhel":
         repo_arch = "sbsa" if target_machine == "aarch64" else "x86_64"
-        df += f"""
-RUN curl -o /tmp/cuda-keyring.deb \\
-        https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/{repo_arch}/cuda-keyring_1.1-1_all.deb \\
-    && apt install /tmp/cuda-keyring.deb \\
-    && rm /tmp/cuda-keyring.deb \\
-    && apt update -qq \\
-    && apt install --yes --no-install-recommends libnvshmem3-cuda-13 \\
-    && rm -rf /var/lib/apt/lists/* \\
-    && dpkg -L libnvshmem3-cuda-13 | grep libnvshmem_host.so | sed -e 's/libnvshmem_host.*//g' | sort -u > /etc/ld.so.conf.d/libnvshmem3-cuda-13.conf \\
-    && ldconfig
+        df += """
+RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/{repo_arch}/cuda-rhel8.repo \\
+    && dnf clean expire-cache \\
+    && dnf install --assumeyes libnvshmem3-cuda-13
+
+RUN dirname $(find /usr -name "libcudart*.so" -o -name "libnvinf*.so" -o -name "libnvshm*" -type f) | sort -u > /etc/ld.so.conf.d/triton-cuda-libs.conf && ldconfig
 """.format(
             repo_arch=repo_arch
         )
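
For reference, here is a minimal standalone sketch (the variable names are mine; this is not part of build.py) that renders the updated RHEL DCGM fragment with the bumped version. It highlights that dnf uses the <name>-<epoch>:<version> spec (datacenter-gpu-manager-4-core-1:4.4.2-1), while the Ubuntu branches above keep apt's "=1:<version>" pin.

# Standalone sketch: render the updated RHEL DCGM install fragment.
dcgm_version = "4.4.2-1"  # value bumped in this commit

fragment = """
RUN dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo \\
    && dnf clean expire-cache \\
    && dnf makecache --refresh \\
    && dnf install --assumeyes \\
        datacenter-gpu-manager-4-core-1:{} \\
        datacenter-gpu-manager-4-devel-1:{}
""".format(dcgm_version, dcgm_version)

print(fragment)
# dnf resolves "datacenter-gpu-manager-4-core-1:4.4.2-1" as <name>-<epoch>:<version>;
# apt keeps the "=1:<version>" pin seen in the Ubuntu hunks above.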

docs/user_guide/model_configuration.md

Lines changed: 204 additions & 395 deletions
Large diffs are not rendered by default.

docs/user_guide/model_management.md

Lines changed: 111 additions & 193 deletions
Large diffs are not rendered by default.

docs/user_guide/model_repository.md

Lines changed: 89 additions & 190 deletions
Large diffs are not rendered by default.

python/openai/openai_frontend/engine/triton_engine.py

Lines changed: 73 additions & 8 deletions
@@ -57,6 +57,8 @@
     _create_trtllm_generate_request,
     _create_vllm_embedding_request,
     _create_vllm_generate_request,
+    _get_openai_chat_format_logprobs_from_vllm_response,
+    _get_openai_completion_format_logprobs_from_vllm_response,
     _get_output,
     _get_usage_from_response,
     _get_vllm_lora_names,
@@ -66,6 +68,7 @@
 from schemas.openai import (
     ChatCompletionChoice,
     ChatCompletionFinishReason,
+    ChatCompletionLogprobs,
     ChatCompletionMessageToolCall,
     ChatCompletionMessageToolCallChunk,
     ChatCompletionNamedToolChoice,
@@ -255,13 +258,22 @@ async def chat(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs:
+            openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                response
+            )
+            if openai_logprobs:
+                logprobs_data = ChatCompletionLogprobs(content=openai_logprobs)
+
         return CreateChatCompletionResponse(
             id=request_id,
             choices=[
                 ChatCompletionChoice(
                     index=0,
                     message=response_message,
-                    logprobs=None,
+                    logprobs=logprobs_data,
                     finish_reason=finish_reason,
                 )
             ],
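
A hypothetical client-side sketch of how this new chat logprobs path is exercised; the server URL, port, and model name are assumptions, and the response is populated only when the model runs on the vLLM backend. Streaming requests (stream=True) attach the same per-chunk logprobs to each delta, as the _streaming_chat_iterator hunk further down shows.

# Client sketch using the official openai Python package (assumed endpoint/model).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

completion = client.chat.completions.create(
    model="llama-3.1-8b-instruct",  # hypothetical vLLM-backed model name
    messages=[{"role": "user", "content": "What is Triton Inference Server?"}],
    logprobs=True,    # now parsed into ChatCompletionLogprobs instead of None
    top_logprobs=2,   # only valid together with logprobs=True (see _validate_chat_request)
    max_tokens=16,
)

choice = completion.choices[0]
if choice.logprobs:  # None for non-vLLM backends
    for token_info in choice.logprobs.content:
        print(token_info.token, token_info.logprob)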
@@ -360,10 +372,17 @@ async def completion(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs is not None and request.logprobs > 0:
+            logprobs_data = _get_openai_completion_format_logprobs_from_vllm_response(
+                response
+            )
+
         choice = Choice(
             finish_reason=FinishReason.stop,
             index=0,
-            logprobs=None,
+            logprobs=logprobs_data,
             text=text,
         )
         return CreateCompletionResponse(
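
The legacy /v1/completions path uses an integer logprobs count rather than a boolean. A hypothetical sketch (endpoint and model name are assumptions) of what the new response shape exposes:

# Client sketch for the completions endpoint (assumed endpoint/model).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

response = client.completions.create(
    model="llama-3.1-8b-instruct",  # hypothetical vLLM-backed model name
    prompt="Triton Inference Server is",
    logprobs=3,      # request top-3 logprobs per sampled token
    max_tokens=8,
)

lp = response.choices[0].logprobs  # None unless the backend is vLLM
if lp:
    print(lp.tokens)
    print(lp.token_logprobs)
    print(lp.text_offset)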
@@ -605,6 +624,15 @@ async def _streaming_chat_iterator(
             )
             previous_text = current_text
 
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs:
+                openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                    response
+                )
+                if openai_logprobs:
+                    chunk_logprobs = ChatCompletionLogprobs(content=openai_logprobs)
+
             # if the response delta is None (e.g. because it was a
             # "control token" for tool calls or the parser otherwise
             # wasn't ready to send a token, then
@@ -618,7 +646,7 @@ async def _streaming_chat_iterator(
             choice = ChatCompletionStreamingResponseChoice(
                 index=0,
                 delta=response_delta,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 finish_reason=finish_reason,
             )
 
@@ -791,8 +819,19 @@ def _validate_chat_request(
                 f"Received n={request.n}, but only single choice (n=1) is currently supported"
             )
 
-        if request.logit_bias is not None or request.logprobs:
-            raise ClientError("logit bias and log probs not currently supported")
+        if request.logit_bias is not None:
+            raise ClientError("logit bias is not currently supported")
+
+        # Logprobs are only supported for vLLM backend currently
+        if metadata.backend != "vllm" and (
+            request.logprobs is not None or request.top_logprobs is not None
+        ):
+            raise ClientError(
+                "logprobs are currently available only for the vLLM backend"
+            )
+
+        if request.top_logprobs is not None and not request.logprobs:
+            raise ClientError("`top_logprobs` can only be used when `logprobs` is True")
 
         self._verify_chat_tool_call_settings(request=request)
 
@@ -847,16 +886,32 @@ async def _streaming_completion_iterator(
         model = request.model
         include_usage = request.stream_options and request.stream_options.include_usage
         usage_accumulator = _StreamingUsageAccumulator(backend)
+        current_offset = 0
 
         async for response in responses:
             if include_usage:
                 usage_accumulator.update(response)
 
             text = _get_output(response)
+
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs is not None and request.logprobs > 0:
+                chunk_logprobs = (
+                    _get_openai_completion_format_logprobs_from_vllm_response(response)
+                )
+                # Adjust text offsets based on accumulated output
+                if chunk_logprobs and chunk_logprobs.text_offset:
+                    chunk_logprobs.text_offset = [
+                        offset + current_offset for offset in chunk_logprobs.text_offset
+                    ]
+
+            current_offset += len(text)
+
             choice = Choice(
                 finish_reason=FinishReason.stop if response.final else None,
                 index=0,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 text=text,
            )
             chunk = CreateCompletionResponse(
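
The offset bookkeeping above exists because each streamed chunk reports text_offset values relative to its own text; shifting them by the length of everything streamed so far yields offsets into the full concatenated completion. A self-contained sketch with toy data (no Triton imports) of that accumulation:

# Toy illustration of the current_offset logic in _streaming_completion_iterator.
chunks = [
    {"text": "Hello", "text_offset": [0]},
    {"text": " world", "text_offset": [0]},
    {"text": "!", "text_offset": [0]},
]

current_offset = 0
absolute_offsets = []
for chunk in chunks:
    # Shift chunk-local offsets into the coordinate space of the full output.
    absolute_offsets.extend(off + current_offset for off in chunk["text_offset"])
    current_offset += len(chunk["text"])

print(absolute_offsets)  # [0, 5, 11] -- offsets into "Hello world!"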
@@ -942,8 +997,18 @@ def _validate_completion_request(
                 f"Received best_of={request.best_of}, but only single choice (best_of=1) is currently supported"
            )
 
-        if request.logit_bias is not None or request.logprobs is not None:
-            raise ClientError("logit bias and log probs not supported")
+        if request.logit_bias is not None:
+            raise ClientError("logit bias is not supported")
+
+        # Logprobs are only supported for vLLM backend currently
+        if (
+            request.logprobs is not None
+            and request.logprobs > 0
+            and metadata.backend != "vllm"
+        ):
+            raise ClientError(
+                "logprobs are currently available only for the vLLM backend"
+            )
 
         if request.stream_options and not request.stream:
             raise ClientError("`stream_options` can only be used when `stream` is True")
