From 6b9c5c6f1d62da1a7b9d307891902c6fac8af0c1 Mon Sep 17 00:00:00 2001
From: Billy Hu
Date: Wed, 3 Apr 2024 08:39:36 -0700
Subject: [PATCH] Make ChatEvaluator output consistent with latest chat flow
 (#2598)

# Description

Make the `ChatEvaluator` output consistent with the latest chat flow. Instead
of emitting a flat `<metric>_per_turn` list alongside each aggregate score,
per-turn results are now grouped under a single `evaluation_per_turn` key that
holds a `score` list (and a `reason` list when an evaluator emits one) for
each metric. The aggregation logic moves into a new `_aggregate_results`
helper, the sample output in `samples/built_in_evaluators.py` is updated, and
a unit test covers the new aggregation.
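For illustration, a minimal sketch of how a caller reads the new shape. The
`result` literal below mirrors the updated sample output in
`samples/built_in_evaluators.py`; in real use it would come from
`chat_eval(conversation=conversation)`.

```python
# Stand-in for the dict returned by ChatEvaluator after this change;
# values copied from the updated sample in samples/built_in_evaluators.py.
result = {
    "gpt_fluency": 5.0,
    "gpt_groundedness": 5.0,
    "gpt_coherence": 5.0,
    "gpt_relevance": 5.0,
    "evaluation_per_turn": {
        "gpt_fluency": {"score": [5.0, 5.0]},
        "gpt_groundedness": {"score": [5.0, 5.0]},
        "gpt_coherence": {"score": [5.0, 5.0]},
        "gpt_relevance": {"score": [5.0, 5.0]},
    },
}

# Aggregate (mean) scores stay at the top level, keyed by metric name.
print(result["gpt_groundedness"])  # 5.0

# Per-turn details are grouped under "evaluation_per_turn"; each metric has a
# "score" list and, when the evaluator provides one, a "reason" list.
for metric, per_turn in result["evaluation_per_turn"].items():
    print(metric, per_turn["score"], per_turn.get("reason", []))
```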
+ # "gpt_groundedness": 2.0, # Mean of all groundedness scores + # "evaluation_per_turn": { + # "gpt_groundedness": { + # "score": [1.0, ...], + # "reason": ["reason1", ...], + # }, + # }, # } - aggregated = {} - for key in per_turn_results[0].keys(): - values = [d[key] for d in per_turn_results] - aggregated[key] = np.nanmean(values) - aggregated[key + "_per_turn"] = values + aggregated = self._aggregate_results(per_turn_results) return aggregated @@ -170,6 +169,37 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator): f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}") return {} + def _aggregate_results(self, per_turn_results: List[Dict]): + scores = {} + reasons = {} + + for turn in per_turn_results: + for metric, value in turn.items(): + if 'reason' in metric: + if metric not in reasons: + reasons[metric] = [] + reasons[metric].append(value) + else: + if metric not in scores: + scores[metric] = [] + scores[metric].append(value) + + aggregated = {} + evaluation_per_turn = {} + + for metric, values in scores.items(): + aggregated[metric] = np.nanmean(values) + + # Prepare per-turn evaluations + evaluation_per_turn[metric] = {"score": values} + reason_key = f"{metric}_reason" + if reason_key in reasons: + evaluation_per_turn[metric]["reason"] = reasons[reason_key] + + aggregated["evaluation_per_turn"] = evaluation_per_turn + + return aggregated + def _validate_conversation(self, conversation: List[Dict]): if conversation is None or not isinstance(conversation, list): raise ValueError("'conversation' must be a list of dictionaries.") diff --git a/src/promptflow-evals/samples/built_in_evaluators.py b/src/promptflow-evals/samples/built_in_evaluators.py index c04815529ac..d2561ba3e71 100644 --- a/src/promptflow-evals/samples/built_in_evaluators.py +++ b/src/promptflow-evals/samples/built_in_evaluators.py @@ -142,9 +142,9 @@ def run_chat_evaluator(): ] score = chat_eval(conversation=conversation) print(score) - # {'gpt_coherence': 5.0, 'gpt_coherence_per_turn': [5.0, 5.0], 'gpt_fluency': 5.0, 'gpt_fluency_per_turn': [5.0, - # 5.0], 'gpt_groundedness': 5.0, 'gpt_groundedness_per_turn': [5.0, 5.0], 'gpt_relevance': 5.0, - # 'gpt_relevance_per_turn': [5.0, 5.0]} + # {'gpt_fluency': 5.0, 'gpt_groundedness': 5.0, 'gpt_coherence': 5.0, 'gpt_relevance': 5.0, + # 'evaluation_per_turn': {'gpt_fluency': {'score': [5.0, 5.0]}, 'gpt_groundedness': {'score': [5.0, 5.0]}, + # 'gpt_coherence': {'score': [5.0, 5.0]}, 'gpt_relevance': {'score': [5.0, 5.0]}}} if __name__ == "__main__": diff --git a/src/promptflow-evals/tests/unittests/test_chat_evaluator.py b/src/promptflow-evals/tests/unittests/test_chat_evaluator.py index 94fb6cb5d2e..f1ccb9bead9 100644 --- a/src/promptflow-evals/tests/unittests/test_chat_evaluator.py +++ b/src/promptflow-evals/tests/unittests/test_chat_evaluator.py @@ -89,3 +89,34 @@ def test_conversation_validation_invalid_citations(self): with pytest.raises(ValueError) as e: chat_eval(conversation=conversation) assert str(e.value) == "'citations' in context must be a list. 
Turn number: 2" + + def test_per_turn_results_aggregation(self): + model_config = AzureOpenAIConnection( + api_base="mocked_endpoint", + api_key="mocked_key", + api_type="azure", + ) + chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4") + + per_turn_results = [ + { + "gpt_groundedness": 1.0, + "gpt_groundedness_reason": "reason1", + "gpt_fluency": 2.0, + + }, + { + "gpt_groundedness": 3.0, + "gpt_groundedness_reason": "reason2", + "gpt_fluency": 4.0, + }, + ] + aggregated = chat_eval._aggregate_results(per_turn_results) + assert aggregated == { + "gpt_groundedness": 2.0, + "gpt_fluency": 3.0, + "evaluation_per_turn": { + "gpt_groundedness": {"score": [1.0, 3.0], "reason": ["reason1", "reason2"]}, + "gpt_fluency": {"score": [2.0, 4.0]}, + } + }