65 changes: 35 additions & 30 deletions litellm/integrations/opentelemetry.py
@@ -59,6 +59,7 @@ def _get_litellm_resource():
while maintaining backward compatibility with LiteLLM-specific environment variables.
"""
from opentelemetry.sdk.resources import OTELResourceDetector, Resource
from opentelemetry.semconv.schemas import Schemas

# Create base resource attributes with LiteLLM-specific defaults
# These will be overridden by OTEL_RESOURCE_ATTRIBUTES if present
@@ -72,7 +73,7 @@ def _get_litellm_resource():
}

# Create base resource with LiteLLM-specific defaults
base_resource = Resource.create(base_attributes) # type: ignore
base_resource = Resource.create(base_attributes, Schemas.V1_25_0.value) # type: ignore

# Create resource from OTEL_RESOURCE_ATTRIBUTES using the detector
otel_resource_detector = OTELResourceDetector()
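For context, a minimal sketch (not part of the diff, assuming the standard `opentelemetry-sdk` and `opentelemetry-semantic-conventions` packages) of what the new second argument to `Resource.create` does — it pins the resource to a semantic-conventions schema URL:

```python
# Standalone sketch: Schemas.V1_25_0.value is the schema URL string shipped
# with opentelemetry-semantic-conventions; Resource.create accepts it as the
# optional second (schema_url) argument.
from opentelemetry.sdk.resources import Resource
from opentelemetry.semconv.schemas import Schemas

resource = Resource.create(
    {"service.name": "litellm", "deployment.environment": "production"},
    Schemas.V1_25_0.value,
)
print(resource.schema_url)  # e.g. "https://opentelemetry.io/schemas/1.25.0"
```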
@@ -557,9 +558,9 @@ def get_tracer_to_use_for_request(self, kwargs: dict) -> Tracer:

def _get_dynamic_otel_headers_from_kwargs(self, kwargs) -> Optional[dict]:
"""Extract dynamic headers from kwargs if available."""
standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = (
kwargs.get("standard_callback_dynamic_params")
)
standard_callback_dynamic_params: Optional[
StandardCallbackDynamicParams
] = kwargs.get("standard_callback_dynamic_params")

if not standard_callback_dynamic_params:
return None
@@ -764,34 +765,34 @@ def _to_timestamp(val: Optional[Union[datetime, float, str]]) -> Optional[float]:
return float(val)
# isinstance(val, str) - parse datetime string (with or without microseconds)
try:
return datetime.strptime(val, '%Y-%m-%d %H:%M:%S.%f').timestamp()
return datetime.strptime(val, "%Y-%m-%d %H:%M:%S.%f").timestamp()
except ValueError:
try:
return datetime.strptime(val, '%Y-%m-%d %H:%M:%S').timestamp()
return datetime.strptime(val, "%Y-%m-%d %H:%M:%S").timestamp()
except ValueError:
return None

def _record_time_to_first_token_metric(self, kwargs: dict, common_attrs: dict):
"""Record Time to First Token (TTFT) metric for streaming requests."""
optional_params = kwargs.get("optional_params", {})
is_streaming = optional_params.get("stream", False)

if not (self._time_to_first_token_histogram and is_streaming):
return

# Use api_call_start_time for precision (matches Prometheus implementation)
# This excludes LiteLLM overhead and measures pure LLM API latency
api_call_start_time = kwargs.get("api_call_start_time", None)
completion_start_time = kwargs.get("completion_start_time", None)

if api_call_start_time is not None and completion_start_time is not None:
# Convert to timestamps if needed (handles datetime, float, and string)
api_call_start_ts = self._to_timestamp(api_call_start_time)
completion_start_ts = self._to_timestamp(completion_start_time)

if api_call_start_ts is None or completion_start_ts is None:
return # Skip recording if conversion failed

time_to_first_token_seconds = completion_start_ts - api_call_start_ts
self._time_to_first_token_histogram.record(
time_to_first_token_seconds, attributes=common_attrs
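Standalone illustration (not from the PR) of the timestamp-normalization pattern above: `_to_timestamp` accepts a datetime, a float, or a formatted string, and TTFT is then the difference between the API-call start and the first-token arrival. The helper below mirrors that behavior under those assumptions:

```python
# Sketch of the conversion logic the diff relies on; handles datetime, float,
# and string inputs, returning None when a string cannot be parsed.
from datetime import datetime
from typing import Optional, Union


def to_timestamp(val: Optional[Union[datetime, float, str]]) -> Optional[float]:
    if val is None:
        return None
    if isinstance(val, datetime):
        return val.timestamp()
    if isinstance(val, (int, float)):
        return float(val)
    for fmt in ("%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(val, fmt).timestamp()
        except ValueError:
            continue
    return None


# TTFT = first-token time minus API-call start time, in seconds.
api_call_start = to_timestamp("2024-01-01 12:00:00")
first_token = to_timestamp("2024-01-01 12:00:00.850000")
if api_call_start is not None and first_token is not None:
    print(first_token - api_call_start)  # ~0.85
```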
@@ -806,38 +807,40 @@ def _record_time_per_output_token_metric(
common_attrs: dict,
):
"""Record Time Per Output Token (TPOT) metric.

Calculated as: generation_time / completion_tokens
- For streaming: uses end_time - completion_start_time (time to generate all tokens after first)
- For non-streaming: uses end_time - api_call_start_time (total generation time)
"""
if not self._time_per_output_token_histogram:
return

# Get completion tokens from response_obj
completion_tokens = None
if response_obj and (usage := response_obj.get("usage")):
completion_tokens = usage.get("completion_tokens")

if completion_tokens is None or completion_tokens <= 0:
return

# Calculate generation time
completion_start_time = kwargs.get("completion_start_time", None)
api_call_start_time = kwargs.get("api_call_start_time", None)

# Convert end_time to timestamp (handles datetime, float, and string)
end_time_ts = self._to_timestamp(end_time)
if end_time_ts is None:
# Fallback to duration_s if conversion failed
generation_time_seconds = duration_s
if generation_time_seconds > 0:
time_per_output_token_seconds = generation_time_seconds / completion_tokens
time_per_output_token_seconds = (
generation_time_seconds / completion_tokens
)
self._time_per_output_token_histogram.record(
time_per_output_token_seconds, attributes=common_attrs
)
return

if completion_start_time is not None:
# Streaming: use completion_start_time (when first token arrived)
# This measures time to generate all tokens after the first one
@@ -858,7 +861,7 @@ def _record_time_per_output_token_metric(
else:
# Fallback: use duration_s (already calculated as (end_time - start_time).total_seconds())
generation_time_seconds = duration_s

if generation_time_seconds > 0:
time_per_output_token_seconds = generation_time_seconds / completion_tokens
self._time_per_output_token_histogram.record(
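A worked example (hypothetical numbers, not from the PR) of the TPOT arithmetic described in the docstring above:

```python
# Hypothetical streaming case: 20 completion tokens produced in the 2.0 s
# between the first token arriving and the stream completing.
completion_start_ts = 100.0   # when the first token arrived
end_ts = 102.0                # when the stream completed
completion_tokens = 20

generation_time_seconds = end_ts - completion_start_ts            # 2.0
time_per_output_token_seconds = generation_time_seconds / completion_tokens
print(time_per_output_token_seconds)  # 0.1 seconds per output token
```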
@@ -872,37 +875,37 @@ def _record_response_duration_metric(
common_attrs: dict,
):
"""Record Total Generation Time (response duration) metric.

Measures pure LLM API generation time: end_time - api_call_start_time
This excludes LiteLLM overhead and measures only the LLM provider's response time.
Works for both streaming and non-streaming requests.

Mirrors Prometheus's litellm_llm_api_latency_metric.
Uses kwargs.get("end_time") with fallback to parameter for consistency with Prometheus.
"""
if not self._response_duration_histogram:
return

api_call_start_time = kwargs.get("api_call_start_time", None)
if api_call_start_time is None:
return

# Use end_time from kwargs if available (matches Prometheus), otherwise use parameter
# For streaming: end_time is when the stream completes (final chunk received)
# For non-streaming: end_time is when the response is received
_end_time = kwargs.get("end_time") or end_time
if _end_time is None:
_end_time = datetime.now()

# Convert to timestamps if needed (handles datetime, float, and string)
api_call_start_ts = self._to_timestamp(api_call_start_time)
end_time_ts = self._to_timestamp(_end_time)

if api_call_start_ts is None or end_time_ts is None:
return # Skip recording if conversion failed

response_duration_seconds = end_time_ts - api_call_start_ts

if response_duration_seconds > 0:
self._response_duration_histogram.record(
response_duration_seconds, attributes=common_attrs
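For reference, a minimal sketch (assumed instrument name and attributes, not LiteLLM's actual identifiers) of how an OpenTelemetry histogram such as `_response_duration_histogram` is created and recorded with attributes via the public metrics API:

```python
# Sketch using the public OpenTelemetry metrics API; without a configured
# MeterProvider these calls go to a no-op meter, so the snippet is safe to run.
from opentelemetry import metrics

meter = metrics.get_meter("litellm.sketch")
response_duration_histogram = meter.create_histogram(
    name="llm.response.duration",  # assumed name for illustration
    unit="s",
    description="Pure LLM API generation time (end_time - api_call_start_time)",
)

response_duration_histogram.record(
    1.42,  # seconds
    attributes={"gen_ai.request.model": "gpt-4o"},  # illustrative attributes
)
```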
@@ -1263,7 +1266,9 @@ def set_attributes( # noqa: PLR0915
)
return
elif self.callback_name == "weave_otel":
from litellm.integrations.weave.weave_otel import set_weave_otel_attributes
from litellm.integrations.weave.weave_otel import (
set_weave_otel_attributes,
)

set_weave_otel_attributes(span, kwargs, response_obj)
return
@@ -1407,7 +1412,7 @@ def set_attributes( # noqa: PLR0915
value=usage.get("prompt_tokens"),
)

########################################################################
########################################################################
########## LLM Request Medssages / tools / content Attributes ###########
#########################################################################

@@ -1994,7 +1999,7 @@ def create_litellm_proxy_request_started_span(
"""
Create a span for the received proxy server request.
"""

return self.tracer.start_span(
name="Received Proxy Server Request",
start_time=self._to_ns(start_time),
7 changes: 4 additions & 3 deletions tests/test_litellm/integrations/test_opentelemetry.py
@@ -15,6 +15,7 @@
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from opentelemetry.semconv.schemas import Schemas

from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
from litellm.litellm_core_utils.safe_json_dumps import safe_dumps
@@ -448,7 +449,7 @@ def test_get_litellm_resource_with_defaults(
"deployment.environment": "production",
"model_id": "litellm",
}
mock_resource_create.assert_called_once_with(expected_attributes)
mock_resource_create.assert_called_once_with(expected_attributes, Schemas.V1_25_0.value)
mock_detector.detect.assert_called_once()
mock_base_resource.merge.assert_called_once_with(mock_env_resource)
self.assertEqual(result, mock_merged_resource)
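The test updates above all follow one pattern: the mocked `Resource.create` must now be asserted with the schema URL as a second positional argument. A trimmed sketch of that assertion shape (illustrative names, not the real test fixtures):

```python
# Minimal shape of the updated assertion; the real tests patch the SDK symbols
# imported inside _get_litellm_resource and run under unittest.TestCase.
from unittest.mock import MagicMock

from opentelemetry.semconv.schemas import Schemas

mock_resource_create = MagicMock()

# ... the code under test would call the (patched) Resource.create here ...
mock_resource_create({"service.name": "litellm"}, Schemas.V1_25_0.value)

mock_resource_create.assert_called_once_with(
    {"service.name": "litellm"}, Schemas.V1_25_0.value
)
```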
@@ -493,7 +494,7 @@ def test_get_litellm_resource_with_litellm_env_vars(
"deployment.environment": "staging",
"model_id": "test-model",
}
mock_resource_create.assert_called_once_with(expected_attributes)
mock_resource_create.assert_called_once_with(expected_attributes, Schemas.V1_25_0.value)
mock_detector.detect.assert_called_once()
mock_base_resource.merge.assert_called_once_with(mock_env_resource)
self.assertEqual(result, mock_merged_resource)
@@ -539,7 +540,7 @@ def test_get_litellm_resource_with_otel_resource_attributes(
"deployment.environment": "production",
"model_id": "should-be-overridden",
}
mock_resource_create.assert_called_once_with(expected_attributes)
mock_resource_create.assert_called_once_with(expected_attributes, Schemas.V1_25_0.value)
mock_detector.detect.assert_called_once()
mock_base_resource.merge.assert_called_once_with(mock_env_resource)
self.assertEqual(result, mock_merged_resource)