huggingface · saidineshpola · Mar 16, 2025
diff --git a/src/open_r1/rewards.py b/src/open_r1/rewards.py
@@ -25,12 +25,10 @@ def accuracy_reward(completions, solution, **kwargs):
     """Reward function that checks if the completion is the same as the ground truth."""
     contents = [completion[0]["content"] for completion in completions]
     rewards = []
-    for content, sol in zip(contents, solution):
-        gold_parsed = parse(
-            sol,
-            extraction_mode="first_match",
-            extraction_config=[LatexExtractionConfig()],
-        )
+    answers = kwargs["answer"] if "answer" in kwargs else None
+    for content, answer in zip(contents, answers):
+        # No need to parse the solution, as the GRPO reward function returns answers only
+        gold_parsed = answer
         if len(gold_parsed) != 0:
             # We require the answer to be provided in correct latex (no malformed operators)
             answer_parsed = parse(
@@ -61,7 +59,7 @@ def accuracy_reward(completions, solution, **kwargs):
         else:
             # If the gold solution is not parseable, we reward 1 to skip this example
             reward = 1.0
-            print("Failed to parse gold solution: ", sol)
+            print("Failed to parse gold solution: ", answer)
         rewards.append(reward)
 
     return rewards