From 69026ee6533917537fa453b5c2d906843e15ce2d Mon Sep 17 00:00:00 2001 From: "Gabriele N. Tornetta" Date: Mon, 11 Nov 2024 19:25:16 +0000 Subject: [PATCH 1/5] docs: code origin configuration (#11324) We add the code origin configuration to the documentation. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [ ] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- docs/configuration.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 45673747b72..3afe7ee817f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -681,3 +681,9 @@ Exception Replay ---------------- .. ddtrace-envier-configuration:: ddtrace.settings.exception_replay:ExceptionReplayConfig + + +Code Origin +----------- + +.. ddtrace-envier-configuration:: ddtrace.settings.code_origin:CodeOriginConfig From 9de85e35dc95608be7b586739f7d8fe33a7c62bd Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Mon, 11 Nov 2024 16:23:02 -0500 Subject: [PATCH 2/5] fix(profiling): clear sample pool after fork (#11350) Tested with native tests. `crossbeam::ArrayQueue` uses Rust `std::sync::atomic`, which, like `std::mutex` from C++, is not guaranteed to be in a consistent state across `fork()` calls. So we need to make sure an equivalent of `std::mutex::unlock()` happens after fork.
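The fix follows the usual post-fork pattern: instead of trying to unlock or repair a lock-free structure that a now-dead thread may have left mid-operation, the child simply throws it away and re-creates it. Below is a minimal sketch of that pattern (the `SamplePool` type and the `pthread_atfork` registration are illustrative assumptions, not the library's API; the actual change re-creates the pool inside `Datadog::SampleManager::postfork_child()`, as the diff below shows):

```
// Hypothetical sketch: reset a pool in the child after fork() rather than
// trying to repair whatever state the parent's threads left behind.
#include <cstddef>
#include <memory>
#include <pthread.h>

struct SamplePool
{
    explicit SamplePool(std::size_t capacity_)
      : capacity(capacity_)
    {
    }
    std::size_t capacity;
    // Backing lock-free queue elided for brevity.
};

static std::unique_ptr<SamplePool> sample_pool = std::make_unique<SamplePool>(64);

static void
postfork_child()
{
    if (sample_pool != nullptr) {
        // Only the forking thread survives in the child, so a concurrent
        // push/pop may have been interrupted mid-operation; rather than try
        // to recover that state, start over with a fresh pool.
        sample_pool = std::make_unique<SamplePool>(sample_pool->capacity);
    }
}

int
main()
{
    // Register once; fork() then runs postfork_child() in every child process.
    pthread_atfork(nullptr, nullptr, postfork_child);
    return 0;
}
```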
## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- .../datadog/profiling/dd_wrapper/src/sample_manager.cpp | 6 ++++++ .../notes/profiling-sample-pool-461a108e068dea5b.yaml | 6 ++++++ 2 files changed, 12 insertions(+) create mode 100644 releasenotes/notes/profiling-sample-pool-461a108e068dea5b.yaml diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample_manager.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample_manager.cpp index 310bf3b95bd..ca355cac97c 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample_manager.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/sample_manager.cpp @@ -74,6 +74,12 @@ void Datadog::SampleManager::postfork_child() { Datadog::Sample::postfork_child(); + if (sample_pool != nullptr) { + // Clear the pool to make sure it's in a consistent state. + // Suppose a thread was adding/removing a sample from the pool + // and the fork happened in the middle of that operation. + sample_pool = std::make_unique(sample_pool_capacity); + } } void diff --git a/releasenotes/notes/profiling-sample-pool-461a108e068dea5b.yaml b/releasenotes/notes/profiling-sample-pool-461a108e068dea5b.yaml new file mode 100644 index 00000000000..2038245d266 --- /dev/null +++ b/releasenotes/notes/profiling-sample-pool-461a108e068dea5b.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + profiling: fixes an issue where the sample pool could deadlock after ``fork()`` + by clearing it in the child process. + From 498e125e927d6ab5449f6dd9aa0325ba70cfffa1 Mon Sep 17 00:00:00 2001 From: David Sanchez <838104+sanchda@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:49:23 -0600 Subject: [PATCH 3/5] chore(profiling): bump libdatadog version (#11337) This is a minor version bump for libdatadog. It doesn't really affect anything, except it fixes an issue with the crashtracker altstack creation not being sufficiently granular. This isn't a problem for Python, since we just create and use a sane default. Still, if we ever need this to work during an investigation, it should be properly configurable.
## Checklist - [X] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) Co-authored-by: Taegyun Kim --- .../datadog/profiling/cmake/FindLibdatadog.cmake | 2 +- .../profiling/cmake/tools/libdatadog_checksums.txt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ddtrace/internal/datadog/profiling/cmake/FindLibdatadog.cmake b/ddtrace/internal/datadog/profiling/cmake/FindLibdatadog.cmake index e713722698b..c74851b9e65 100644 --- a/ddtrace/internal/datadog/profiling/cmake/FindLibdatadog.cmake +++ b/ddtrace/internal/datadog/profiling/cmake/FindLibdatadog.cmake @@ -5,7 +5,7 @@ endif() include(ExternalProject) set(TAG_LIBDATADOG - "v14.0.0" + "v14.1.0" CACHE STRING "libdatadog github tag") set(Datadog_BUILD_DIR ${CMAKE_BINARY_DIR}/libdatadog) diff --git a/ddtrace/internal/datadog/profiling/cmake/tools/libdatadog_checksums.txt b/ddtrace/internal/datadog/profiling/cmake/tools/libdatadog_checksums.txt index d2e19b88f78..a6a65ce0f90 100644 --- a/ddtrace/internal/datadog/profiling/cmake/tools/libdatadog_checksums.txt +++ b/ddtrace/internal/datadog/profiling/cmake/tools/libdatadog_checksums.txt @@ -1,5 +1,5 @@ -6aa3a1dd9664f1bb51aa64e647344f48deb0b07a2c0c95cfa40af0fd0463cb08 libdatadog-aarch64-alpine-linux-musl.tar.gz -fa29ac61904b0481bcaaf2cc3aff844ac058ce92d0a4d7cfed25e4f178442359 libdatadog-aarch64-apple-darwin.tar.gz -44cde6f2b406842e9e94b36cc04aadfcc628242c634cf103bde2f4907640d39a libdatadog-aarch64-unknown-linux-gnu.tar.gz -0aaed4bbbd30dc77c9e2cd5c9bbc011d101086eb6eada6332f0a8276cd67b691 libdatadog-x86_64-alpine-linux-musl.tar.gz -c88fa1f191637e7e42776d2139721294cebc697d3cc951b972f677bb08d641fd libdatadog-x86_64-unknown-linux-gnu.tar.gz +fc6be3383d3a115804c43e2c66dd176c63f33b362d987d9b1211034e2b549c2d libdatadog-aarch64-alpine-linux-musl.tar.gz +b9c972afea19696ee6a459d2fa65563b738baf77dcb12739c8e4ae44d1c975fb libdatadog-aarch64-apple-darwin.tar.gz +1a9bc4d99d23f7baf403b6b7527f9b9d76bdb166dc34656150561dcb148cc90b libdatadog-aarch64-unknown-linux-gnu.tar.gz +8244831681332dfa939eefe6923fe6a8beaffff48cb336f836b55a438078add1 libdatadog-x86_64-alpine-linux-musl.tar.gz +76fcb3bfe3b3971d77f6dd4968ffe6bd5f6a1ada82e2e990a78919107dc2ee40 
libdatadog-x86_64-unknown-linux-gnu.tar.gz From 7ceea025f75ada875434ebde0501f598a763fd54 Mon Sep 17 00:00:00 2001 From: David Sanchez <838104+sanchda@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:57:58 -0600 Subject: [PATCH 4/5] chore(profiling): add profiling configuration to crashtracking tags (#11336) This patch adds the profiling configuration to the crashtracking tags. Crashtracking already has tests for this, so it's a simple matter of actually enabling the propagation. ## Checklist - [X] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Co-authored-by: Taegyun Kim --- ddtrace/profiling/profiler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ddtrace/profiling/profiler.py b/ddtrace/profiling/profiler.py index a74136912cc..48789200d7f 100644 --- a/ddtrace/profiling/profiler.py +++ b/ddtrace/profiling/profiler.py @@ -16,6 +16,7 @@ from ddtrace.internal import service from ddtrace.internal import uwsgi from ddtrace.internal import writer +from ddtrace.internal.core import crashtracking from ddtrace.internal.datadog.profiling import ddup from ddtrace.internal.module import ModuleWatchdog from ddtrace.internal.telemetry import telemetry_writer @@ -223,6 +224,7 @@ def _build_default_exporters(self): configured_features.append("CAP" + str(profiling_config.capture_pct)) configured_features.append("MAXF" + str(profiling_config.max_frames)) self.tags.update({"profiler_config": "_".join(configured_features)}) + crashtracking.add_tag("profiler_config", self.tags["profiler_config"]) endpoint_call_counter_span_processor = self.tracer._endpoint_call_counter_span_processor if self.endpoint_collection_enabled: From 1aac18bd67a581a01af5d7894d398c6ea6ce87f9 Mon Sep 17 00:00:00 2001 From: lievan <42917263+lievan@users.noreply.github.com> Date: Mon, 11 Nov 2024 21:20:43 -0500 Subject: [PATCH 5/5] chore(llmobs): add metadata to the ragas evaluation metric (#11271) This PR addresses some final UX quirks for ragas faithfulness evaluations. - Add metadata to the faithfulness evaluation metric event. This metadata includes a list of flagged segments from the answer (enables better trace view UX) and the exported span of the faithfulness trace (enables trace link).
```
"metadata": {
  "_dd": {
    "evaluation_kind": "faithfulness",
    "evaluation_span": {
      "trace_id": "672d238e000000003a5fc64e5f9a9d6f",
      "span_id": "17204623770592671651"
    },
    "disagreements_list": [
      {"answer_quote": "..."}
    ]
  }
},
```
- Support the ability for users to specify their own custom context and query keys that are used as inputs for the faithfulness evaluation. With these changes, ragas faithfulness is aligned with our OOTB hallucination metric. This PR also includes some small refactors to tests: it decouples the ragas faithfulness evaluator from the evaluator runner tests (we now rely only on the dummy evaluator in utils). ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Co-authored-by: lievan --- ddtrace/llmobs/_constants.py | 4 + .../llmobs/_evaluators/ragas/faithfulness.py | 53 +++- tests/llmobs/_utils.py | 14 + ...s_evaluation_on_span_with_custom_keys.yaml | 279 ++++++++++++++++++ tests/llmobs/test_llmobs_evaluator_runner.py | 33 +-- ...est_llmobs_ragas_faithfulness_evaluator.py | 61 +++- 6 files changed, 402 insertions(+), 42 deletions(-) create mode 100644 tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 5f33349e938..7c295835e54 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -56,3 +56,7 @@ ANNOTATIONS_CONTEXT_ID = "annotations_context_id" INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys" INTERNAL_QUERY_VARIABLE_KEYS = "_dd_query_variable_keys" + +FAITHFULNESS_DISAGREEMENTS_METADATA = "_dd.faithfulness_disagreements" +EVALUATION_KIND_METADATA = "_dd.evaluation_kind" +EVALUATION_SPAN_METADATA = "_dd.evaluation_span" diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index 9b0abbd8953..d651c2443a4 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -3,6 +3,7 @@ import traceback from typing import List from typing import Optional +from typing import Tuple from typing import Union from ddtrace.internal.logger
import get_logger @@ -10,6 +11,11 @@ from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL from ddtrace.internal.utils.version import parse_version +from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA +from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA +from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA +from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS +from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX @@ -163,7 +169,7 @@ def __init__(self, llmobs_service): def run_and_submit_evaluation(self, span_event: dict): if not span_event: return - score_result_or_failure = self.evaluate(span_event) + score_result_or_failure, metric_metadata = self.evaluate(span_event) telemetry_writer.add_count_metric( TELEMETRY_APM_PRODUCT.LLMOBS, "evaluators.run", @@ -179,9 +185,10 @@ def run_and_submit_evaluation(self, span_event: dict): label=RagasFaithfulnessEvaluator.LABEL, metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, value=score_result_or_failure, + metadata=metric_metadata, ) - def evaluate(self, span_event: dict) -> Union[float, str]: + def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]: """ Performs a faithfulness evaluation on a span event, returning either - faithfulness score (float) OR @@ -191,20 +198,34 @@ def evaluate(self, span_event: dict) -> Union[float, str]: """ self.ragas_faithfulness_instance = _get_faithfulness_instance() if not self.ragas_faithfulness_instance: - return "fail_faithfulness_is_none" - - score, question, answer, context, statements, faithfulness_list = math.nan, None, None, None, None, None + return "fail_faithfulness_is_none", {} + + evaluation_metadata = {EVALUATION_KIND_METADATA: "faithfulness"} # type: dict[str, Union[str, dict, list]] + + # initialize data we annotate for tracing ragas + score, question, answer, context, statements, faithfulness_list = ( + math.nan, + None, + None, + None, + None, + None, + ) with self.llmobs_service.workflow( "dd-ragas.faithfulness", ml_app=_get_ml_app_for_ragas_trace(span_event) ) as ragas_faithfulness_workflow: try: + evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span( + span=ragas_faithfulness_workflow + ) + faithfulness_inputs = self._extract_faithfulness_inputs(span_event) if faithfulness_inputs is None: logger.debug( "Failed to extract question and context from span sampled for ragas_faithfulness evaluation" ) - return "fail_extract_faithfulness_inputs" + return "fail_extract_faithfulness_inputs", evaluation_metadata question = faithfulness_inputs["question"] answer = faithfulness_inputs["answer"] @@ -213,19 +234,23 @@ def evaluate(self, span_event: dict) -> Union[float, str]: statements = self._create_statements(question, answer) if statements is None: logger.debug("Failed to create statements from answer for `ragas_faithfulness` evaluator") - return "statements_is_none" + return "statements_is_none", evaluation_metadata faithfulness_list = self._create_verdicts(context, statements) if faithfulness_list is None: logger.debug("Failed to create faithfulness list `ragas_faithfulness` evaluator") - return "statements_create_faithfulness_list" + return "statements_create_faithfulness_list", evaluation_metadata + + evaluation_metadata[FAITHFULNESS_DISAGREEMENTS_METADATA] = [ + {"answer_quote": answer.statement} for answer in 
faithfulness_list.__root__ if answer.verdict == 0 + ] score = self._compute_score(faithfulness_list) if math.isnan(score): logger.debug("Score computation returned NaN for `ragas_faithfulness` evaluator") - return "statements_compute_score" + return "statements_compute_score", evaluation_metadata - return score + return score, evaluation_metadata finally: self.llmobs_service.annotate( span=ragas_faithfulness_workflow, @@ -341,10 +366,12 @@ def _extract_faithfulness_inputs(self, span_event: dict) -> Optional[dict]: answer = messages[-1].get("content") if prompt_variables: - question = prompt_variables.get("question") - context = prompt_variables.get("context") + context_keys = prompt.get(INTERNAL_CONTEXT_VARIABLE_KEYS, ["context"]) + question_keys = prompt.get(INTERNAL_QUERY_VARIABLE_KEYS, ["question"]) + context = " ".join([prompt_variables.get(key) for key in context_keys if prompt_variables.get(key)]) + question = " ".join([prompt_variables.get(key) for key in question_keys if prompt_variables.get(key)]) - if not question and len(input_messages) > 0: + if not question and input_messages is not None and len(input_messages) > 0: question = input_messages[-1].get("content") self.llmobs_service.annotate( diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index afafdaf4aab..fb48d15431b 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -12,6 +12,7 @@ from ddtrace._trace.span import Span from ddtrace.ext import SpanTypes from ddtrace.llmobs._utils import _get_span_name +from ddtrace.llmobs._writer import LLMObsEvaluationMetricEvent if vcr: @@ -508,6 +509,19 @@ def run_and_submit_evaluation(self, span): ) +def _dummy_evaluator_eval_metric_event(span_id, trace_id): + return LLMObsEvaluationMetricEvent( + span_id=span_id, + trace_id=trace_id, + score_value=1.0, + ml_app="unnamed-ml-app", + timestamp_ms=mock.ANY, + metric_type="score", + label=DummyEvaluator.LABEL, + tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:unnamed-ml-app"], + ) + + def _expected_ragas_spans(ragas_inputs=None): if not ragas_inputs: ragas_inputs = default_ragas_inputs diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml new file mode 100644 index 00000000000..1301513e8aa --- /dev/null +++ b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml @@ -0,0 +1,279 @@ +interactions: +- request: + body: '{"messages": [{"content": "Given a question, an answer, and sentences from + the answer analyze the complexity of each sentence given under ''sentences'' + and break down each sentence into one or more fully understandable statements + while also ensuring no pronouns are used in each statement. Format the outputs + in JSON.\n\nThe output should be a well-formatted JSON instance that conforms + to the JSON schema below.\n\nAs an example, for the schema {\"properties\": + {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": + \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\nthe + object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. 
+ The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n\nHere + is the output JSON schema:\n```\n{\"type\": \"array\", \"items\": {\"$ref\": + \"#/definitions/Statements\"}, \"definitions\": {\"Statements\": {\"title\": + \"Statements\", \"type\": \"object\", \"properties\": {\"sentence_index\": {\"title\": + \"Sentence Index\", \"description\": \"Index of the sentence from the statement + list\", \"type\": \"integer\"}, \"simpler_statements\": {\"title\": \"Simpler + Statements\", \"description\": \"the simpler statements\", \"type\": \"array\", + \"items\": {\"type\": \"string\"}}}, \"required\": [\"sentence_index\", \"simpler_statements\"]}}}\n```\n\nDo + not return any preamble or explanations, return only a pure JSON string surrounded + by triple backticks (```).\n\nExamples:\n\nquestion: \"Who was Albert Einstein + and what is he best known for?\"\nanswer: \"He was a German-born theoretical + physicist, widely acknowledged to be one of the greatest and most influential + physicists of all time. He was best known for developing the theory of relativity, + he also made important contributions to the development of the theory of quantum + mechanics.\"\nsentences: \"\\n 0:He was a German-born theoretical physicist, + widely acknowledged to be one of the greatest and most influential physicists + of all time. \\n 1:He was best known for developing the theory of relativity, + he also made important contributions to the development of the theory of quantum + mechanics.\\n \"\nanalysis: ```[{\"sentence_index\": 0, \"simpler_statements\": + [\"Albert Einstein was a German-born theoretical physicist.\", \"Albert Einstein + is recognized as one of the greatest and most influential physicists of all + time.\"]}, {\"sentence_index\": 1, \"simpler_statements\": [\"Albert Einstein + was best known for developing the theory of relativity.\", \"Albert Einstein + also made important contributions to the development of the theory of quantum + mechanics.\"]}]```\n\nYour actual task:\n\nquestion: \"Is france part of europe?\"\nanswer: + \"France is indeed part of europe\"\nsentences: \"\"\nanalysis: \n", "role": + "user"}], "model": "gpt-4o-mini", "n": 1, "stream": false, "temperature": 1e-08}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '2915' + content-type: + - application/json + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.52.0 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.52.0 + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.10.13 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4ySS2/bMBCE7/oVxJ6tQFb8qm89pAWCFn2k6MUyJIZayWwkkuWu0Ifh/x5QViQb + bYFedNiPM5pZ8hgJAbqErQB1kKxa18SvP6cPH7/+WGzow/397fe3T1/STfruU/tQvr/7DbOgsI/f + UPGL6kbZ1jXI2pozVh4lY3Cdr2/nSbJerZY9aG2JTZDVjuOFjVttdJwm6SJO1vF8M6gPVisk2Ipd + JIQQx/4bcpoSf8JWJLOXSYtEskbYjoeEAG+bMAFJpImlYZhNUFnDaProRVHsjhkQhonCvLfPen+R + AenQyefEkrFFwxTQLoM3XhqFQpNw0rOwlbjrvHV4k8H+tC+K4vJ3HquOZKhsuqYZ5qcxf2Nr5+0j + DXycV9poOuQeJVkTshJbBz09RULs+z11V9XBeds6ztk+oQmGq2TYE0zXM9F0OUC2LJsL1Qiu/PIS + WeqGLjYNSqoDlpN0uhbZldpegOii9Z9p/uZ9bq5N/T/2E1AKHWOZO4+lVteNp2Mew+v917Fxy31g + oF/E2OaVNjV65/X57VQur5av5Ga+SNcVRKfoGQAA//8DAN2uIFBJAwAA + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 
8defb8f19b1c3992-IAD + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Thu, 07 Nov 2024 19:27:45 GMT + Server: + - cloudflare + Set-Cookie: + - __cf_bm=GMAym2MnyZWZw_ORQOfRH7m0vuREaeNFf6gzJiHGy1k-1731007665-1.0.1.1-nArkb5npme6zrdpZ1L0Fho.0C5Glt5LEHaKERjf0koaMgNHOFfv34RUUXMfeQlNRjorW8a21hX7CcW0rBlX22w; + path=/; expires=Thu, 07-Nov-24 19:57:45 GMT; domain=.api.openai.com; HttpOnly; + Secure; SameSite=None + - _cfuvid=2UlWnMHqlwf2iyqQoi_qr_fVNbYS7TQkpnENDD0iUfk-1731007665474-0.0.1.1-604800000; + path=/; domain=.api.openai.com; HttpOnly; Secure; SameSite=None + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - datadog-staging + openai-processing-ms: + - '458' + openai-version: + - '2020-10-01' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999324' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_a8016b4905b963ff5fd373af0e60e956 + status: + code: 200 + message: OK +- request: + body: '{"messages": [{"content": "Your task is to judge the faithfulness of a + series of statements based on a given context. For each statement you must return + verdict as 1 if the statement can be directly inferred based on the context + or 0 if the statement can not be directly inferred based on the context.\n\nThe + output should be a well-formatted JSON instance that conforms to the JSON schema + below.\n\nAs an example, for the schema {\"properties\": {\"foo\": {\"title\": + \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": + {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\nthe object {\"foo\": [\"bar\", + \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": + {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n\nHere is the output + JSON schema:\n```\n{\"type\": \"array\", \"items\": {\"$ref\": \"#/definitions/StatementFaithfulnessAnswer\"}, + \"definitions\": {\"StatementFaithfulnessAnswer\": {\"title\": \"StatementFaithfulnessAnswer\", + \"type\": \"object\", \"properties\": {\"statement\": {\"title\": \"Statement\", + \"description\": \"the original statement, word-by-word\", \"type\": \"string\"}, + \"reason\": {\"title\": \"Reason\", \"description\": \"the reason of the verdict\", + \"type\": \"string\"}, \"verdict\": {\"title\": \"Verdict\", \"description\": + \"the verdict(0/1) of the faithfulness.\", \"type\": \"integer\"}}, \"required\": + [\"statement\", \"reason\", \"verdict\"]}}}\n```\n\nDo not return any preamble + or explanations, return only a pure JSON string surrounded by triple backticks + (```).\n\nExamples:\n\ncontext: \"John is a student at XYZ University. He is + pursuing a degree in Computer Science. He is enrolled in several courses this + semester, including Data Structures, Algorithms, and Database Management. John + is a diligent student and spends a significant amount of time studying and completing + assignments. 
He often stays late in the library to work on his projects.\"\nstatements: + ```[\"John is majoring in Biology.\", \"John is taking a course on Artificial + Intelligence.\", \"John is a dedicated student.\", \"John has a part-time job.\"]```\nanswer: + ```[{\"statement\": \"John is majoring in Biology.\", \"reason\": \"John''s + major is explicitly mentioned as Computer Science. There is no information suggesting + he is majoring in Biology.\", \"verdict\": 0}, {\"statement\": \"John is taking + a course on Artificial Intelligence.\", \"reason\": \"The context mentions the + courses John is currently enrolled in, and Artificial Intelligence is not mentioned. + Therefore, it cannot be deduced that John is taking a course on AI.\", \"verdict\": + 0}, {\"statement\": \"John is a dedicated student.\", \"reason\": \"The context + states that he spends a significant amount of time studying and completing assignments. + Additionally, it mentions that he often stays late in the library to work on + his projects, which implies dedication.\", \"verdict\": 1}, {\"statement\": + \"John has a part-time job.\", \"reason\": \"There is no information given in + the context about John having a part-time job.\", \"verdict\": 0}]```\n\ncontext: + \"Photosynthesis is a process used by plants, algae, and certain bacteria to + convert light energy into chemical energy.\"\nstatements: ```[\"Albert Einstein + was a genius.\"]```\nanswer: ```[{\"statement\": \"Albert Einstein was a genius.\", + \"reason\": \"The context and statement are unrelated\", \"verdict\": 0}]```\n\nYour + actual task:\n\ncontext: \"hello, france is part of europe\"\nstatements: + \"[\\\"France is part of Europe.\\\"]\"\nanswer: \n", "role": "user"}], "model": + "gpt-4o-mini", "n": 1, "stream": false, "temperature": 1e-08}' + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '3657' + content-type: + - application/json + cookie: + - __cf_bm=GMAym2MnyZWZw_ORQOfRH7m0vuREaeNFf6gzJiHGy1k-1731007665-1.0.1.1-nArkb5npme6zrdpZ1L0Fho.0C5Glt5LEHaKERjf0koaMgNHOFfv34RUUXMfeQlNRjorW8a21hX7CcW0rBlX22w; + _cfuvid=2UlWnMHqlwf2iyqQoi_qr_fVNbYS7TQkpnENDD0iUfk-1731007665474-0.0.1.1-604800000 + host: + - api.openai.com + user-agent: + - OpenAI/Python 1.52.0 + x-stainless-arch: + - arm64 + x-stainless-async: + - 'false' + x-stainless-lang: + - python + x-stainless-os: + - MacOS + x-stainless-package-version: + - 1.52.0 + x-stainless-retry-count: + - '0' + x-stainless-runtime: + - CPython + x-stainless-runtime-version: + - 3.10.13 + method: POST + uri: https://api.openai.com/v1/chat/completions + response: + body: + string: !!binary | + H4sIAAAAAAAAA4xSwW6bQBC98xWjOZsIHBJbvvmQHCJVqprm0MYRrJcBtl12V7tD68jyv1dgbGy1 + VXvh8N68x5s3u48AUJW4ApSNYNk6Ha8/zZ8/rj+wWSc/u8UTPWXt12z9Qtn8y/MLznqF3X4jySfV + jbSt08TKmiMtPQmm3jVd3KZJsri/vxuI1pake1ntOM5s3Cqj4nkyz+JkEafLUd1YJSngCl4jAID9 + 8O1zmpJ2uIJkdkJaCkHUhKvzEAB6q3sERQgqsDCMs4mU1jCZIXpRFK/7DQYWTC0Z3uAKNvjohZEE + KoATnsFW8NB56+hmgzPYoCcRrDmOfm4IBr8dA+2cVlKxfofBMAA3guEfbj/Il0oOf04Pb0VRXEb1 + VHVB9HWZTusRP5x317Z23m7DyJ/xShkVmvwYtN8zsHU4sIcI4G3ouLuqDZ23reOc7XcyveFiOXaM + 02kn9vZuJNmy0BO+TE/ElV9eEgulw8WVUArZUDlJp5OKrlT2gogutv49zZ+8j5srU/+P/URISY6p + zJ2n/iZXG09jnvqX/7exc8tDYAzvganNK2Vq8s6r47urXJ5sRVKm86xKMTpEvwAAAP//AwA6IXqX + hQMAAA== + headers: + CF-Cache-Status: + - DYNAMIC + CF-RAY: + - 8defb8f6280b3992-IAD + Connection: + - keep-alive + Content-Encoding: + - gzip + Content-Type: + - application/json + Date: + - Thu, 07 Nov 2024 19:27:46 GMT + Server: + 
- cloudflare + Transfer-Encoding: + - chunked + X-Content-Type-Options: + - nosniff + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + openai-organization: + - datadog-staging + openai-processing-ms: + - '821' + openai-version: + - '2020-10-01' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + x-ratelimit-limit-requests: + - '30000' + x-ratelimit-limit-tokens: + - '150000000' + x-ratelimit-remaining-requests: + - '29999' + x-ratelimit-remaining-tokens: + - '149999152' + x-ratelimit-reset-requests: + - 2ms + x-ratelimit-reset-tokens: + - 0s + x-request-id: + - req_397280c8d63c855801cfdc02b86052b2 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py index 7f7d685cf0a..7ee7d510276 100644 --- a/tests/llmobs/test_llmobs_evaluator_runner.py +++ b/tests/llmobs/test_llmobs_evaluator_runner.py @@ -5,12 +5,12 @@ import mock import pytest -import ddtrace from ddtrace._trace.span import Span from ddtrace.llmobs._evaluators.runner import EvaluatorRunner from ddtrace.llmobs._evaluators.sampler import EvaluatorRunnerSampler from ddtrace.llmobs._evaluators.sampler import EvaluatorRunnerSamplingRule -from ddtrace.llmobs._writer import LLMObsEvaluationMetricEvent +from tests.llmobs._utils import DummyEvaluator +from tests.llmobs._utils import _dummy_evaluator_eval_metric_event from tests.utils import override_env from tests.utils import override_global_config @@ -18,22 +18,9 @@ DUMMY_SPAN = Span("dummy_span") -def _dummy_ragas_eval_metric_event(span_id, trace_id): - return LLMObsEvaluationMetricEvent( - span_id=span_id, - trace_id=trace_id, - score_value=1.0, - ml_app="unnamed-ml-app", - timestamp_ms=mock.ANY, - metric_type="score", - label="ragas_faithfulness", - tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:unnamed-ml-app"], - ) - - -def test_evaluator_runner_start(mock_evaluator_logs, mock_ragas_evaluator): +def test_evaluator_runner_start(mock_evaluator_logs): evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock()) - evaluator_runner.evaluators.append(mock_ragas_evaluator) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=mock.MagicMock())) evaluator_runner.start() mock_evaluator_logs.debug.assert_has_calls([mock.call("started %r to %r", "EvaluatorRunner")]) @@ -47,20 +34,20 @@ def test_evaluator_runner_buffer_limit(mock_evaluator_logs): ) -def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer, mock_ragas_evaluator): +def test_evaluator_runner_periodic_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) - evaluator_runner.evaluators.append(mock_ragas_evaluator(llmobs_service=LLMObs)) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) evaluator_runner.periodic() mock_llmobs_eval_metric_writer.enqueue.assert_called_once_with( - _dummy_ragas_eval_metric_event(span_id="123", trace_id="1234") + _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234") ) @pytest.mark.vcr_logs -def test_evaluator_runner_timed_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer, mock_ragas_evaluator): +def test_evaluator_runner_timed_enqueues_eval_metric(LLMObs, mock_llmobs_eval_metric_writer): evaluator_runner = EvaluatorRunner(interval=0.01, llmobs_service=LLMObs) 
- evaluator_runner.evaluators.append(mock_ragas_evaluator(llmobs_service=LLMObs)) + evaluator_runner.evaluators.append(DummyEvaluator(llmobs_service=LLMObs)) evaluator_runner.start() evaluator_runner.enqueue({"span_id": "123", "trace_id": "1234"}, DUMMY_SPAN) @@ -68,7 +55,7 @@ def test_evaluator_runner_timed_enqueues_eval_metric(LLMObs, mock_llmobs_eval_me time.sleep(0.1) mock_llmobs_eval_metric_writer.enqueue.assert_called_once_with( - _dummy_ragas_eval_metric_event(span_id="123", trace_id="1234") + _dummy_evaluator_eval_metric_event(span_id="123", trace_id="1234") ) diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py index 51da6aed3cf..1f78b538f24 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py @@ -5,11 +5,10 @@ from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator from ddtrace.span import Span - -from ._utils import _expected_llmobs_llm_span_event -from ._utils import _expected_ragas_spans -from ._utils import _llm_span_with_expected_ragas_inputs_in_messages -from ._utils import _llm_span_with_expected_ragas_inputs_in_prompt +from tests.llmobs._utils import _expected_llmobs_llm_span_event +from tests.llmobs._utils import _expected_ragas_spans +from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_messages +from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt def _llm_span_without_io(): @@ -30,7 +29,8 @@ def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_raga def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.evaluate(_llm_span_without_io()) == "fail_extract_faithfulness_inputs" + failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) + assert failure_msg == "fail_extract_faithfulness_inputs" assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 @@ -89,6 +89,11 @@ def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit label=RagasFaithfulnessEvaluator.LABEL, metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, ) ] ) @@ -112,6 +117,50 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages label=RagasFaithfulnessEvaluator.LABEL, metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): + """Test that evaluation is submitted for a valid llm span with custom context and query variable keys""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _expected_llmobs_llm_span_event( + Span("dummy"), + prompt={ + "variables": { + "user_input": "Is france part of europe?", + "context_1": "hello, ", + "context_2": "france is ", + "context_3": "part of europe", + }, + "_dd_context_variable_keys": ["context_1", "context_2", "context_3"], + "_dd_query_variable_keys": ["user_input"], + }, +
output_messages=[{"content": "France is indeed part of europe"}], + ) + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, ) ] )