     _create_trtllm_generate_request,
     _create_vllm_embedding_request,
     _create_vllm_generate_request,
+    _get_openai_chat_format_logprobs_from_vllm_response,
+    _get_openai_completion_format_logprobs_from_vllm_response,
     _get_output,
     _get_usage_from_response,
     _get_vllm_lora_names,
 from schemas.openai import (
     ChatCompletionChoice,
     ChatCompletionFinishReason,
+    ChatCompletionLogprobs,
     ChatCompletionMessageToolCall,
     ChatCompletionMessageToolCallChunk,
     ChatCompletionNamedToolChoice,
@@ -255,13 +258,22 @@ async def chat(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs:
+            openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                response
+            )
+            if openai_logprobs:
+                logprobs_data = ChatCompletionLogprobs(content=openai_logprobs)
+
         return CreateChatCompletionResponse(
             id=request_id,
             choices=[
                 ChatCompletionChoice(
                     index=0,
                     message=response_message,
-                    logprobs=None,
+                    logprobs=logprobs_data,
                     finish_reason=finish_reason,
                 )
             ],
@@ -360,10 +372,17 @@ async def completion(
             response, metadata.backend, RequestKind.GENERATION
         )
 
+        # Parse logprobs if requested
+        logprobs_data = None
+        if request.logprobs is not None and request.logprobs > 0:
+            logprobs_data = _get_openai_completion_format_logprobs_from_vllm_response(
+                response
+            )
+
         choice = Choice(
             finish_reason=FinishReason.stop,
             index=0,
-            logprobs=None,
+            logprobs=logprobs_data,
             text=text,
         )
         return CreateCompletionResponse(
@@ -605,6 +624,15 @@ async def _streaming_chat_iterator(
             )
             previous_text = current_text
 
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs:
+                openai_logprobs = _get_openai_chat_format_logprobs_from_vllm_response(
+                    response
+                )
+                if openai_logprobs:
+                    chunk_logprobs = ChatCompletionLogprobs(content=openai_logprobs)
+
             # if the response delta is None (e.g. because it was a
             # "control token" for tool calls or the parser otherwise
             # wasn't ready to send a token, then
@@ -618,7 +646,7 @@ async def _streaming_chat_iterator(
             choice = ChatCompletionStreamingResponseChoice(
                 index=0,
                 delta=response_delta,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 finish_reason=finish_reason,
             )
 
@@ -791,8 +819,19 @@ def _validate_chat_request(
791819 f"Received n={ request .n } , but only single choice (n=1) is currently supported"
792820 )
793821
794- if request .logit_bias is not None or request .logprobs :
795- raise ClientError ("logit bias and log probs not currently supported" )
822+ if request .logit_bias is not None :
823+ raise ClientError ("logit bias is not currently supported" )
824+
825+ # Logprobs are only supported for vLLM backend currently
826+ if metadata .backend != "vllm" and (
827+ request .logprobs is not None or request .top_logprobs is not None
828+ ):
829+ raise ClientError (
830+ "logprobs are currently available only for the vLLM backend"
831+ )
832+
833+ if request .top_logprobs is not None and not request .logprobs :
834+ raise ClientError ("`top_logprobs` can only be used when `logprobs` is True" )
796835
797836 self ._verify_chat_tool_call_settings (request = request )
798837
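The chat-side validation above follows the OpenAI chat schema, where `logprobs` is a boolean and `top_logprobs` is an integer that is only meaningful when `logprobs` is enabled. A minimal client-side sketch of exercising this path against a vLLM-backed deployment of this frontend; the base URL and model name below are placeholders, not values taken from this change:

# Hedged usage sketch: endpoint URL and model name are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

chat = client.chat.completions.create(
    model="example-vllm-model",  # placeholder model name
    messages=[{"role": "user", "content": "Say hello"}],
    logprobs=True,     # boolean on the chat endpoint
    top_logprobs=2,    # requires logprobs=True, as enforced above
)

# Per-token entries are returned in choices[0].logprobs.content
for entry in chat.choices[0].logprobs.content:
    print(entry.token, entry.logprob)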
@@ -847,16 +886,32 @@ async def _streaming_completion_iterator(
         model = request.model
         include_usage = request.stream_options and request.stream_options.include_usage
         usage_accumulator = _StreamingUsageAccumulator(backend)
+        current_offset = 0
 
         async for response in responses:
             if include_usage:
                 usage_accumulator.update(response)
 
             text = _get_output(response)
+
+            # Parse logprobs for this chunk if requested
+            chunk_logprobs = None
+            if request.logprobs is not None and request.logprobs > 0:
+                chunk_logprobs = (
+                    _get_openai_completion_format_logprobs_from_vllm_response(response)
+                )
+                # Adjust text offsets based on accumulated output
+                if chunk_logprobs and chunk_logprobs.text_offset:
+                    chunk_logprobs.text_offset = [
+                        offset + current_offset for offset in chunk_logprobs.text_offset
+                    ]
+
+            current_offset += len(text)
+
             choice = Choice(
                 finish_reason=FinishReason.stop if response.final else None,
                 index=0,
-                logprobs=None,
+                logprobs=chunk_logprobs,
                 text=text,
             )
             chunk = CreateCompletionResponse(
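The `text_offset` rebasing in this hunk is needed because each streamed chunk reports offsets relative to its own text; adding `current_offset` (the number of characters already streamed) turns them into offsets into the full completion. A small self-contained illustration of that bookkeeping (the chunk texts and per-chunk offsets are made-up values):

# Toy illustration of the offset rebasing above; data is invented.
chunk_texts = ["Hello", " world", "!"]
local_offsets = [[0], [0], [0]]  # offsets as each chunk would report them

current_offset = 0
global_offsets = []
for text, offsets in zip(chunk_texts, local_offsets):
    global_offsets.append([o + current_offset for o in offsets])
    current_offset += len(text)

print(global_offsets)  # [[0], [5], [11]] -- offsets into "Hello world!"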
@@ -942,8 +997,18 @@ def _validate_completion_request(
942997 f"Received best_of={ request .best_of } , but only single choice (best_of=1) is currently supported"
943998 )
944999
945- if request .logit_bias is not None or request .logprobs is not None :
946- raise ClientError ("logit bias and log probs not supported" )
1000+ if request .logit_bias is not None :
1001+ raise ClientError ("logit bias is not supported" )
1002+
1003+ # Logprobs are only supported for vLLM backend currently
1004+ if (
1005+ request .logprobs is not None
1006+ and request .logprobs > 0
1007+ and metadata .backend != "vllm"
1008+ ):
1009+ raise ClientError (
1010+ "logprobs are currently available only for the vLLM backend"
1011+ )
9471012
9481013 if request .stream_options and not request .stream :
9491014 raise ClientError ("`stream_options` can only be used when `stream` is True" )
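On the legacy completions endpoint, `logprobs` is an integer (the number of top alternatives to return per token), which is why the checks in this file test `request.logprobs > 0` rather than truthiness alone. A hedged client-side sketch, with the same placeholder base URL and model name as above:

# Hedged usage sketch for the completions logprobs path; values are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

completion = client.completions.create(
    model="example-vllm-model",  # placeholder model name
    prompt="The capital of France is",
    max_tokens=4,
    logprobs=2,  # integer on the completions endpoint
)

lp = completion.choices[0].logprobs
# OpenAI completions-format logprobs are parallel lists:
print(lp.tokens)          # generated tokens
print(lp.token_logprobs)  # log-probability of each generated token
print(lp.text_offset)     # character offset of each token in the output text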