Merge branch 'main' into romain.komorn/chore/enable_pytest_plugin_v2
romainkomorndatadog authored Nov 12, 2024
2 parents 3ed6550 + 1aac18b commit 49d8b55
Showing 12 changed files with 428 additions and 48 deletions.
@@ -5,7 +5,7 @@ endif()

include(ExternalProject)
set(TAG_LIBDATADOG
"v14.0.0"
"v14.1.0"
CACHE STRING "libdatadog github tag")

set(Datadog_BUILD_DIR ${CMAKE_BINARY_DIR}/libdatadog)
@@ -1,5 +1,5 @@
-6aa3a1dd9664f1bb51aa64e647344f48deb0b07a2c0c95cfa40af0fd0463cb08 libdatadog-aarch64-alpine-linux-musl.tar.gz
-fa29ac61904b0481bcaaf2cc3aff844ac058ce92d0a4d7cfed25e4f178442359 libdatadog-aarch64-apple-darwin.tar.gz
-44cde6f2b406842e9e94b36cc04aadfcc628242c634cf103bde2f4907640d39a libdatadog-aarch64-unknown-linux-gnu.tar.gz
-0aaed4bbbd30dc77c9e2cd5c9bbc011d101086eb6eada6332f0a8276cd67b691 libdatadog-x86_64-alpine-linux-musl.tar.gz
-c88fa1f191637e7e42776d2139721294cebc697d3cc951b972f677bb08d641fd libdatadog-x86_64-unknown-linux-gnu.tar.gz
+fc6be3383d3a115804c43e2c66dd176c63f33b362d987d9b1211034e2b549c2d libdatadog-aarch64-alpine-linux-musl.tar.gz
+b9c972afea19696ee6a459d2fa65563b738baf77dcb12739c8e4ae44d1c975fb libdatadog-aarch64-apple-darwin.tar.gz
+1a9bc4d99d23f7baf403b6b7527f9b9d76bdb166dc34656150561dcb148cc90b libdatadog-aarch64-unknown-linux-gnu.tar.gz
+8244831681332dfa939eefe6923fe6a8beaffff48cb336f836b55a438078add1 libdatadog-x86_64-alpine-linux-musl.tar.gz
+76fcb3bfe3b3971d77f6dd4968ffe6bd5f6a1ada82e2e990a78919107dc2ee40 libdatadog-x86_64-unknown-linux-gnu.tar.gz
@@ -74,6 +74,12 @@ void
Datadog::SampleManager::postfork_child()
{
Datadog::Sample::postfork_child();
if (sample_pool != nullptr) {
// Clear the pool to make sure it's in a consistent state: a thread may
// have been adding or removing a sample from the pool when the fork
// happened in the middle of that operation.
sample_pool = std::make_unique<SynchronizedSamplePool>(sample_pool_capacity);
}
}

void
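
The C++ change above follows the standard fork-safety pattern: after fork() only the calling thread survives in the child, so a pool whose lock was held mid-operation by another thread must be rebuilt rather than reused. A minimal Python sketch of the same idea (the queue-based pool and handler name are illustrative, not the actual SynchronizedSamplePool API):

import os
import queue

POOL_CAPACITY = 4
sample_pool = queue.Queue(maxsize=POOL_CAPACITY)

def _postfork_child():
    # Only the forking thread survives in the child; another thread may have
    # been mid put()/get(), leaving the pool's internal lock held forever.
    # Recreating the pool restores a consistent, unlocked state.
    global sample_pool
    sample_pool = queue.Queue(maxsize=POOL_CAPACITY)

os.register_at_fork(after_in_child=_postfork_child)
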
4 changes: 4 additions & 0 deletions ddtrace/llmobs/_constants.py
@@ -56,3 +56,7 @@
ANNOTATIONS_CONTEXT_ID = "annotations_context_id"
INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys"
INTERNAL_QUERY_VARIABLE_KEYS = "_dd_query_variable_keys"

FAITHFULNESS_DISAGREEMENTS_METADATA = "_dd.faithfulness_disagreements"
EVALUATION_KIND_METADATA = "_dd.evaluation_kind"
EVALUATION_SPAN_METADATA = "_dd.evaluation_span"
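
These three keys travel as metadata on evaluation metric events. A hypothetical payload using them (the exported-span shape is an assumption for illustration, not pinned down by this diff):

# Hypothetical metadata dict built from the new constants above.
metadata = {
    "_dd.evaluation_kind": "faithfulness",
    "_dd.evaluation_span": {"span_id": "123", "trace_id": "456"},  # assumed shape
    "_dd.faithfulness_disagreements": [{"answer_quote": "the sky is green"}],
}
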
53 changes: 40 additions & 13 deletions ddtrace/llmobs/_evaluators/ragas/faithfulness.py
@@ -3,13 +3,19 @@
import traceback
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
from ddtrace.internal.utils.version import parse_version
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA
from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX


@@ -163,7 +169,7 @@ def __init__(self, llmobs_service):
def run_and_submit_evaluation(self, span_event: dict):
if not span_event:
return
score_result_or_failure = self.evaluate(span_event)
score_result_or_failure, metric_metadata = self.evaluate(span_event)
telemetry_writer.add_count_metric(
TELEMETRY_APM_PRODUCT.LLMOBS,
"evaluators.run",
@@ -179,9 +185,10 @@ def run_and_submit_evaluation(self, span_event: dict):
label=RagasFaithfulnessEvaluator.LABEL,
metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE,
value=score_result_or_failure,
metadata=metric_metadata,
)

def evaluate(self, span_event: dict) -> Union[float, str]:
def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
"""
Performs a faithfulness evaluation on a span event, returning either
- faithfulness score (float) OR
@@ -191,20 +198,34 @@ def evaluate(self, span_event: dict) -> Union[float, str]:
"""
self.ragas_faithfulness_instance = _get_faithfulness_instance()
if not self.ragas_faithfulness_instance:
return "fail_faithfulness_is_none"

score, question, answer, context, statements, faithfulness_list = math.nan, None, None, None, None, None
return "fail_faithfulness_is_none", {}

evaluation_metadata = {EVALUATION_KIND_METADATA: "faithfulness"} # type: dict[str, Union[str, dict, list]]

# initialize data we annotate for tracing ragas
score, question, answer, context, statements, faithfulness_list = (
math.nan,
None,
None,
None,
None,
None,
)

with self.llmobs_service.workflow(
"dd-ragas.faithfulness", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_faithfulness_workflow:
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(
span=ragas_faithfulness_workflow
)

faithfulness_inputs = self._extract_faithfulness_inputs(span_event)
if faithfulness_inputs is None:
logger.debug(
"Failed to extract question and context from span sampled for ragas_faithfulness evaluation"
)
return "fail_extract_faithfulness_inputs"
return "fail_extract_faithfulness_inputs", evaluation_metadata

question = faithfulness_inputs["question"]
answer = faithfulness_inputs["answer"]
@@ -213,19 +234,23 @@ def evaluate(self, span_event: dict) -> Union[float, str]:
statements = self._create_statements(question, answer)
if statements is None:
logger.debug("Failed to create statements from answer for `ragas_faithfulness` evaluator")
return "statements_is_none"
return "statements_is_none", evaluation_metadata

faithfulness_list = self._create_verdicts(context, statements)
if faithfulness_list is None:
logger.debug("Failed to create faithfulness list `ragas_faithfulness` evaluator")
return "statements_create_faithfulness_list"
return "statements_create_faithfulness_list", evaluation_metadata

evaluation_metadata[FAITHFULNESS_DISAGREEMENTS_METADATA] = [
{"answer_quote": answer.statement} for answer in faithfulness_list.__root__ if answer.verdict == 0
]

score = self._compute_score(faithfulness_list)
if math.isnan(score):
logger.debug("Score computation returned NaN for `ragas_faithfulness` evaluator")
return "statements_compute_score"
return "statements_compute_score", evaluation_metadata

return score
return score, evaluation_metadata
finally:
self.llmobs_service.annotate(
span=ragas_faithfulness_workflow,
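
The disagreements list built above can be illustrated standalone: only statements the judge marked verdict 0 (unfaithful) contribute an answer_quote entry. A runnable sketch with plain dicts standing in for the ragas statement objects (their shape is assumed):

# Plain-dict stand-ins for the entries of faithfulness_list.__root__.
statements = [
    {"statement": "The sky is green.", "verdict": 0},
    {"statement": "Water is wet.", "verdict": 1},
]
# Mirrors the list comprehension in the diff: keep only the disagreements.
disagreements = [{"answer_quote": s["statement"]} for s in statements if s["verdict"] == 0]
print(disagreements)  # [{'answer_quote': 'The sky is green.'}]
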
@@ -341,10 +366,12 @@ def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]:
answer = messages[-1].get("content")

if prompt_variables:
question = prompt_variables.get("question")
context = prompt_variables.get("context")
context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"])
question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"])
context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)])
question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)])

if not question and len(input_messages) > 0:
if not question and input_messages is not None and len(input_messages) > 0:
question = input_messages[-1].get("content")

self.llmobs_service.annotate(
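
The rewritten extraction above no longer hard-codes the "question" and "context" variable names: it reads key lists from the prompt (falling back to the old defaults) and space-joins every matching, non-empty variable. A runnable sketch with hypothetical prompt variables:

# Hypothetical prompt variables; the key lists mimic what the new code
# reads from _dd_context_variable_keys / _dd_query_variable_keys.
prompt_variables = {
    "user_input": "What is RAG?",
    "docs": "RAG combines retrieval with generation.",
    "notes": "See the 2020 paper.",
}
context_keys = ["docs", "notes"]
question_keys = ["user_input"]
context = " ".join([prompt_variables.get(k) for k in context_keys if prompt_variables.get(k)])
question = " ".join([prompt_variables.get(k) for k in question_keys if prompt_variables.get(k)])
print(question)  # What is RAG?
print(context)   # RAG combines retrieval with generation. See the 2020 paper.
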
2 changes: 2 additions & 0 deletions ddtrace/profiling/profiler.py
@@ -16,6 +16,7 @@
from ddtrace.internal import service
from ddtrace.internal import uwsgi
from ddtrace.internal import writer
from ddtrace.internal.core import crashtracking
from ddtrace.internal.datadog.profiling import ddup
from ddtrace.internal.module import ModuleWatchdog
from ddtrace.internal.telemetry import telemetry_writer
@@ -223,6 +224,7 @@ def _build_default_exporters(self):
configured_features.append("CAP" + str(profiling_config.capture_pct))
configured_features.append("MAXF" + str(profiling_config.max_frames))
self.tags.update({"profiler_config": "_".join(configured_features)})
crashtracking.add_tag("profiler_config", self.tags["profiler_config"])

endpoint_call_counter_span_processor = self.tracer._endpoint_call_counter_span_processor
if self.endpoint_collection_enabled:
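
The value mirrored to crash tracking is simply the underscore-joined feature list assembled a few lines earlier; a small sketch of the construction (the feature names are illustrative):

# Illustrative feature list; the real one is derived from profiling_config.
configured_features = ["stack_v2", "CAP100", "MAXF64"]
tags = {"profiler_config": "_".join(configured_features)}
# Mirroring this tag onto crash reports lets a crash be correlated with the
# profiler configuration that was active when it happened.
print(tags["profiler_config"])  # stack_v2_CAP100_MAXF64
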
6 changes: 6 additions & 0 deletions docs/configuration.rst
@@ -681,3 +681,9 @@ Exception Replay
----------------

.. ddtrace-envier-configuration:: ddtrace.settings.exception_replay:ExceptionReplayConfig


Code Origin
-----------

.. ddtrace-envier-configuration:: ddtrace.settings.code_origin:CodeOriginConfig
@@ -0,0 +1,6 @@
---
fixes:
- |
profiling: fixes an issue where the sample pool could deadlock after ``fork()``
by clearing it in the child process.
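
The failure mode being fixed is the classic fork/lock interaction: a lock held by another thread at fork() time is never released in the child. A POSIX-only sketch of the general pattern (not ddtrace code; the timeout stands in for the hang):

import os
import threading
import time

lock = threading.Lock()

def hold_lock_repeatedly():
    while True:
        with lock:
            time.sleep(0.001)

threading.Thread(target=hold_lock_repeatedly, daemon=True).start()
time.sleep(0.05)

pid = os.fork()
if pid == 0:
    # If the fork landed while the helper thread held the lock, this acquire
    # blocks until the timeout -- without one it would hang forever, which is
    # the deadlock the release note describes. Resetting the pool in an
    # after-fork hook, as this commit does, avoids the stale lock entirely.
    acquired = lock.acquire(timeout=1)
    print("child acquired lock:", acquired)
    os._exit(0)
os.waitpid(pid, 0)
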
14 changes: 14 additions & 0 deletions tests/llmobs/_utils.py
@@ -12,6 +12,7 @@
from ddtrace._trace.span import Span
from ddtrace.ext import SpanTypes
from ddtrace.llmobs._utils import _get_span_name
from ddtrace.llmobs._writer import LLMObsEvaluationMetricEvent


if vcr:
@@ -508,6 +509,19 @@ def run_and_submit_evaluation(self, span):
)


def _dummy_evaluator_eval_metric_event(span_id, trace_id):
return LLMObsEvaluationMetricEvent(
span_id=span_id,
trace_id=trace_id,
score_value=1.0,
ml_app="unnamed-ml-app",
timestamp_ms=mock.ANY,
metric_type="score",
label=DummyEvaluator.LABEL,
tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:unnamed-ml-app"],
)
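
A hypothetical assertion using this helper (the mock writer name is assumed, not part of this diff):

# Compare the event the evaluator enqueued against the expected payload.
mock_eval_metric_writer.enqueue.assert_called_with(
    _dummy_evaluator_eval_metric_event(span_id="123", trace_id="456")
)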


def _expected_ragas_spans(ragas_inputs=None):
if not ragas_inputs:
ragas_inputs = default_ragas_inputs