Skip to content

Commit

Permalink
Make ChatEvaluator output consistent with latest chat flow (#2598)
Browse files Browse the repository at this point in the history
# Description

Please add an informative description that covers the changes made by
the pull request and link all relevant issues.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which has an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.

---------

Co-authored-by: Ankit Singhal <[email protected]>
Co-authored-by: Ankit Singhal <[email protected]>
Co-authored-by: Clement Wang <[email protected]>
  • Loading branch information
4 people authored Apr 3, 2024
1 parent 526bd08 commit 6b9c5c6
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 16 deletions.
56 changes: 43 additions & 13 deletions src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,28 +128,27 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
}

for future in as_completed(future_to_evaluator):
score = future.result()
current_turn_result.update(score)
result = future.result()
current_turn_result.update(result)
else:
# Sequential execution
for evaluator in selected_evaluators:
score = self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
current_turn_result.update(score)
result = self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
current_turn_result.update(result)

per_turn_results.append(current_turn_result)

# Aggregate results
# Final aggregated results for a conversation will look like:
# {
# "gpt_groundedness": 0.9,
# "gpt_groundedness_per_turn": [0.9, 0.8, 0.9, ...],
# ...
# "gpt_groundedness": 2.0, # Mean of all groundedness scores
# "evaluation_per_turn": {
# "gpt_groundedness": {
# "score": [1.0, ...],
# "reason": ["reason1", ...],
# },
# },
# }
aggregated = {}
for key in per_turn_results[0].keys():
values = [d[key] for d in per_turn_results]
aggregated[key] = np.nanmean(values)
aggregated[key + "_per_turn"] = values
aggregated = self._aggregate_results(per_turn_results)

return aggregated

Expand All @@ -170,6 +169,37 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
return {}

def _aggregate_results(self, per_turn_results: List[Dict]):
scores = {}
reasons = {}

for turn in per_turn_results:
for metric, value in turn.items():
if 'reason' in metric:
if metric not in reasons:
reasons[metric] = []
reasons[metric].append(value)
else:
if metric not in scores:
scores[metric] = []
scores[metric].append(value)

aggregated = {}
evaluation_per_turn = {}

for metric, values in scores.items():
aggregated[metric] = np.nanmean(values)

# Prepare per-turn evaluations
evaluation_per_turn[metric] = {"score": values}
reason_key = f"{metric}_reason"
if reason_key in reasons:
evaluation_per_turn[metric]["reason"] = reasons[reason_key]

aggregated["evaluation_per_turn"] = evaluation_per_turn

return aggregated

def _validate_conversation(self, conversation: List[Dict]):
if conversation is None or not isinstance(conversation, list):
raise ValueError("'conversation' must be a list of dictionaries.")
Expand Down
6 changes: 3 additions & 3 deletions src/promptflow-evals/samples/built_in_evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,9 @@ def run_chat_evaluator():
]
score = chat_eval(conversation=conversation)
print(score)
# {'gpt_coherence': 5.0, 'gpt_coherence_per_turn': [5.0, 5.0], 'gpt_fluency': 5.0, 'gpt_fluency_per_turn': [5.0,
# 5.0], 'gpt_groundedness': 5.0, 'gpt_groundedness_per_turn': [5.0, 5.0], 'gpt_relevance': 5.0,
# 'gpt_relevance_per_turn': [5.0, 5.0]}
# {'gpt_fluency': 5.0, 'gpt_groundedness': 5.0, 'gpt_coherence': 5.0, 'gpt_relevance': 5.0,
# 'evaluation_per_turn': {'gpt_fluency': {'score': [5.0, 5.0]}, 'gpt_groundedness': {'score': [5.0, 5.0]},
# 'gpt_coherence': {'score': [5.0, 5.0]}, 'gpt_relevance': {'score': [5.0, 5.0]}}}


if __name__ == "__main__":
Expand Down
31 changes: 31 additions & 0 deletions src/promptflow-evals/tests/unittests/test_chat_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,34 @@ def test_conversation_validation_invalid_citations(self):
with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "'citations' in context must be a list. Turn number: 2"

def test_per_turn_results_aggregation(self):
    """Check that ``_aggregate_results`` averages scores and groups per-turn scores with their reasons."""
    # Placeholder connection values; the test only calls _aggregate_results on the evaluator.
    connection = AzureOpenAIConnection(
        api_base="mocked_endpoint",
        api_key="mocked_key",
        api_type="azure",
    )
    evaluator = ChatEvaluator(model_config=connection, deployment_name="gpt-4")

    # Two turns: groundedness carries a reason, fluency does not.
    first_turn = {
        "gpt_groundedness": 1.0,
        "gpt_groundedness_reason": "reason1",
        "gpt_fluency": 2.0,
    }
    second_turn = {
        "gpt_groundedness": 3.0,
        "gpt_groundedness_reason": "reason2",
        "gpt_fluency": 4.0,
    }

    expected = {
        "gpt_groundedness": 2.0,
        "gpt_fluency": 3.0,
        "evaluation_per_turn": {
            "gpt_groundedness": {"score": [1.0, 3.0], "reason": ["reason1", "reason2"]},
            "gpt_fluency": {"score": [2.0, 4.0]},
        },
    }

    assert evaluator._aggregate_results([first_turn, second_turn]) == expected

0 comments on commit 6b9c5c6

Please sign in to comment.