Skip to content

Commit bb75971

Browse files
committed
fix: Enhance MetaLadder adapter implementation
- Fix OpenAI API response handling and message formatting - Add comprehensive benchmark suite with detailed logging - Create comparison examples demonstrating improvements - Add detailed documentation comparing approaches - Implement proper error handling and validation - Clean up example structure and improve tests
1 parent 2ee68fd commit bb75971

File tree

7 files changed

+552
-176
lines changed

7 files changed

+552
-176
lines changed

benchmark.py

+172
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
"""Benchmark comparing ChainOfThought with MetaLadder."""
2+
import os
3+
import time
4+
from dataclasses import dataclass
5+
from typing import Dict, List, Tuple
6+
7+
import dspy
8+
from dspy.primitives import Module
9+
from dspy.adapters import MetaLadderAdapter
10+
from dspy.clients.lm import LM
11+
12+
# --- Module-level setup: runs at import time ---

# Set up the language model with API key
# Fail fast with an actionable message before any network call is attempted.
if "OPENAI_API_KEY" not in os.environ:
    raise ValueError("Please set the OPENAI_API_KEY environment variable")

# Configure language model
lm = LM(model="gpt-3.5-turbo")
dspy.settings.configure(lm=lm)

# Disable caching
# NOTE(review): this second configure() call presumably merges with the one
# above rather than replacing the lm setting — confirm against
# dspy.settings.configure semantics; otherwise combine into a single call.
dspy.settings.configure(cache_seed=None)
23+
class MathSolver(dspy.Signature):
    """Signature for solving math problems.

    Output fields are declared reasoning-first so the model lays out its
    step-by-step work *before* committing to a final answer — the standard
    chain-of-thought ordering. (The original declared ``answer`` before
    ``reasoning``, which asks the model to answer first and rationalize
    afterwards.) Callers read both fields by name, so reordering is
    backward-compatible.
    """

    # desc matches the sibling MathSolver in comparison_example.py.
    question = dspy.InputField(desc="A math word problem to solve")
    reasoning = dspy.OutputField(desc="step by step reasoning")
    answer = dspy.OutputField(desc="numerical answer with units")
28+
29+
30+
@dataclass
class BenchmarkResult:
    """Aggregate metrics collected from a single benchmark run."""

    # Percentage of correct solutions.
    accuracy: float
    # Average time per problem, in seconds.
    avg_time: float
    # Maps each problem type to the accuracy achieved on that type.
    problem_types: Dict[str, float]
    # Score on similar-but-slightly-modified problems (generalization probe).
    generalization_score: float
44+
45+
46+
def get_test_problems() -> Dict[str, List[Tuple[str, str]]]:
    """Get test problems with expected answers.

    Returns:
        Dictionary mapping problem types to lists of (problem, answer) tuples
    """
    multiplication_cases = [
        (
            "If a train travels at 60 miles per hour for 2.5 hours, how far does it travel?",
            "150 miles",
        ),
        (
            "A factory produces 120 widgets per hour. How many widgets does it produce in 8 hours?",
            "960 widgets",
        ),
    ]
    division_cases = [
        (
            "If 144 cookies are divided equally among 3 charity events, how many cookies does each event get?",
            "48 cookies",
        ),
        (
            "A company has $900 to divide among 6 employees. How much does each employee receive?",
            "$150",
        ),
    ]
    return {"multiplication": multiplication_cases, "division": division_cases}
74+
75+
76+
def get_variation_problems() -> Dict[str, List[Tuple[str, str]]]:
    """Get variation problems to test generalization.

    Same structure as get_test_problems(), but with different surface
    details and numbers, so a solver must generalize rather than memorize.

    Returns:
        Dictionary mapping problem types to lists of (problem, answer) tuples
    """
    variations: Dict[str, List[Tuple[str, str]]] = {}
    variations["multiplication"] = [
        (
            "A cyclist pedals at 15 kilometers per hour for 3.5 hours. What distance does the cyclist cover?",
            "52.5 kilometers",
        ),
    ]
    variations["division"] = [
        (
            "If 288 candies need to be distributed equally to 4 schools, how many candies does each school get?",
            "72 candies",
        ),
    ]
    return variations
96+
97+
98+
def run_benchmark(
    model: "Module",
    problems: List[Tuple[str, str]],
    model_name: str,
) -> Tuple[int, float]:
    """Run benchmark on a set of problems.

    The model is used as a callable taking ``question=`` and returning a
    prediction object exposing an ``answer`` attribute (and optionally
    ``reasoning``), which both dspy modules and adapters provide.
    The ``Module`` annotation is a lazy string so this function can be
    imported without dspy resolved at definition time.

    Args:
        model: The model to benchmark.
        problems: List of (problem, expected_answer) tuples.
        model_name: Name of the model for logging.

    Returns:
        Tuple of (correct_count, total_time) with total_time in seconds.
    """
    correct = 0
    total_time = 0.0  # seed as float: this accumulates fractional seconds

    for i, (problem, expected) in enumerate(problems, 1):
        print(f"\nProblem {i}:")
        print(f"Question: {problem}")
        print(f"Expected: {expected}")

        # perf_counter is monotonic and the correct clock for elapsed-time
        # measurement; time.time() can jump with wall-clock adjustments.
        start_time = time.perf_counter()
        result = model(question=problem)
        answer = result.answer
        time_taken = time.perf_counter() - start_time

        print(f"{model_name} answer: {answer}")
        if hasattr(result, "reasoning"):
            print(f"Reasoning: {result.reasoning}")

        # Case-insensitive substring match keeps grading tolerant of extra
        # prose around the expected value (e.g. "The answer is 150 miles.").
        if expected.lower() in answer.lower():
            correct += 1
            print("✓ Correct")
        else:
            print("✗ Incorrect")

        total_time += time_taken
        print(f"Time: {time_taken:.2f}s")

    return correct, total_time
140+
141+
142+
def benchmark_models() -> None:
    """Run benchmark comparing ChainOfThought and MetaLadder.

    Both approaches wrap the same ChainOfThought solver, so the comparison
    isolates the effect of the MetaLadder adapter itself.
    """
    # Create solvers
    cot_solver = dspy.ChainOfThought(MathSolver)
    meta_solver = MetaLadderAdapter(cot_solver)

    # Get test problems
    problems = get_test_problems()

    print("=== Model Comparison Benchmark ===\n")

    # One loop drives both models — the original duplicated this block
    # verbatim per model, which invites the two copies drifting apart.
    for model_name, model in (
        ("Chain of Thought", cot_solver),
        ("MetaLadder", meta_solver),
    ):
        print(f"\n{model_name}:")
        for prob_type, test_cases in problems.items():
            correct, time_taken = run_benchmark(model, test_cases, model_name)
            print(f"\n{prob_type.title()}:")
            print(f"Accuracy: {(correct / len(test_cases)) * 100:.1f}%")
            print(f"Average time: {time_taken / len(test_cases):.2f}s")
169+
170+
171+
# Run the full comparison only when executed as a script (not on import).
if __name__ == "__main__":
    benchmark_models()

comparison_example.py

+113
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""Example comparing Chain of Thought vs MetaLadder approaches."""
2+
import os
3+
from typing import Any, Dict, List, Optional
4+
5+
import dspy
6+
from dspy import ChainOfThought, InputField, OutputField, Module, Predict
7+
from dspy.signatures.signature import make_signature
8+
from dspy.utils.dummies import DummyLM
9+
from dspy.clients.lm import LM
10+
11+
from dspy.adapters.metaladder_adapter import MetaLadderAdapter
12+
13+
class MathSolver(dspy.Signature):
    """Signature for solving math word problems.

    Output fields are declared reasoning-first so the model produces its
    step-by-step work *before* committing to a final answer — the standard
    chain-of-thought ordering. (The original declared ``answer`` before
    ``reasoning``.) Callers read both fields by name, so the reorder is
    backward-compatible.
    """

    question = InputField(desc="A math word problem to solve")
    reasoning = OutputField(desc="Step by step reasoning process")
    answer = OutputField(desc="The numerical answer with units")
19+
20+
def solve_with_cot(lm: Any, question: str) -> Dict[str, str]:
    """Solve a problem using Chain of Thought reasoning.

    Args:
        lm: Language model to use
        question: Math problem to solve

    Returns:
        Dict containing answer and reasoning
    """
    # Make the supplied language model the active one.
    dspy.settings.configure(lm=lm)

    # Run a plain chain-of-thought predictor over the math signature.
    prediction = ChainOfThought(MathSolver)(question=question)

    return {"answer": prediction.answer, "reasoning": prediction.reasoning}
40+
41+
def solve_with_metaladder(lm: Any, question: str) -> Dict[str, Any]:
    """Solve a problem using MetaLadder approach.

    Args:
        lm: Language model to use
        question: Math problem to solve

    Returns:
        Dict containing answer and meta-problem details
    """
    # Make the supplied language model the active one.
    dspy.settings.configure(lm=lm)

    # Wrap the raw LM in the MetaLadder adapter and run the question.
    adapter = MetaLadderAdapter(model=lm)
    prediction = adapter(question=question)

    # NOTE(review): this reaches into the adapter's private cache
    # (_meta_problems); a public accessor on MetaLadderAdapter would be
    # a cleaner contract.
    return {
        "answer": prediction.answer,
        "meta_problem": adapter._meta_problems.get(question),
    }
61+
62+
def main() -> None:
    """Run comparison example.

    Solves a fixed set of math problems with both approaches and prints
    the results side by side; per-problem failures are reported and the
    loop continues.

    Raises:
        ValueError: If OPENAI_API_KEY is not set in the environment.
    """
    # Initialize language model
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY environment variable must be set")

    lm = LM(model="gpt-3.5-turbo", api_key=api_key)

    # Test problems of increasing complexity
    problems = [
        # Simple rate problem
        "If a car travels at 50 miles per hour for 3 hours, how far does it travel?",

        # Multi-step problem with unit conversion
        "A factory produces 120 widgets per hour and operates for 8 hours per day. If each widget requires 0.5 pounds of material, how many pounds of material are needed per week (5 days)?",

        # Problem requiring identifying relevant information
        "A store sells notebooks for $4 each and pens for $2 each. A student needs 3 notebooks and wants to spend exactly $20 in total. How many pens should they buy?",

        # Problem with distracting information
        "In a school library with 1000 books, 40% are fiction and 35% are non-fiction. If the remaining books are reference materials and 15 books are being repaired, how many reference books are available?"
    ]

    print("\n=== Comparing Problem-Solving Approaches ===\n")

    for i, problem in enumerate(problems, 1):
        print(f"Problem {i}:")
        print(f"Question: {problem}\n")

        try:
            # Solve with Chain of Thought
            print("Chain of Thought approach:")
            cot_result = solve_with_cot(lm, problem)
            print(f"Reasoning: {cot_result['reasoning']}")
            print(f"Answer: {cot_result['answer']}\n")

            # Solve with MetaLadder
            print("MetaLadder approach:")
            ml_result = solve_with_metaladder(lm, problem)
            meta = ml_result['meta_problem']
            # Fix: the meta-problem cache lookup can return None, in which
            # case the old code raised AttributeError here and misreported
            # it as a generic solve error.
            if meta is not None:
                print(f"Problem type: {meta.problem_type}")
                print(f"Meta-problem: {meta.meta_problem}")
                print(f"Restatement: {meta.restatement}")
            else:
                print("Meta-problem details unavailable")
            print(f"Answer: {ml_result['answer']}\n")
        except Exception as e:
            # Best-effort demo: report the failure and move on to the next
            # problem rather than aborting the whole run.
            print(f"Error processing problem: {str(e)}\n")

        print("-" * 80 + "\n")
111+
112+
# Run the comparison only when executed as a script (not on import).
if __name__ == "__main__":
    main()

docs/metaladder_vs_cot.md

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

dspy/adapters/__init__.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
from dspy.adapters.base import Adapter
44
from dspy.adapters.chat_adapter import ChatAdapter
55
from dspy.adapters.json_adapter import JSONAdapter
6-
from dspy.adapters.types import Image, History, AdapterResponse
76
from dspy.adapters.metaladder_adapter import MetaLadderAdapter
7+
from dspy.adapters.types import Image, History, AdapterResponse
88

99
__all__ = [
1010
"Adapter",
1111
"ChatAdapter",
1212
"JSONAdapter",
13+
"MetaLadderAdapter",
1314
"Image",
1415
"History",
15-
"AdapterResponse",
16-
"MetaLadderAdapter"
16+
"AdapterResponse"
1717
]

0 commit comments

Comments
 (0)