1- # surprisal_intervention_scorer.py
21import functools
32import random
43import copy
98import torch .nn .functional as F
109from transformers import AutoTokenizer
1110
12- # Assuming 'delphi' is your project structure.
13- # If not, you may need to adjust these relative imports.
1411from ..scorer import Scorer , ScorerResult
1512from ...latents import LatentRecord , ActivatingExample
1613
@@ -75,11 +72,9 @@ def __init__(self, subject_model: Any, explainer_model: Any = None, **kwargs):
7572 if len (self .hookpoints ):
7673 self .hookpoint_str = self .hookpoints [0 ]
7774
78- # Ensure tokenizer is available
7975 if hasattr (subject_model , "tokenizer" ):
8076 self .tokenizer = subject_model .tokenizer
8177 else :
82- # Fallback to a standard tokenizer if not attached to the model
8378 self .tokenizer = AutoTokenizer .from_pretrained ("gpt2" )
8479
8580 if self .tokenizer .pad_token is None :
@@ -113,7 +108,6 @@ def _resolve_hookpoint(self, model: Any, hookpoint_str: str) -> Any:
113108 """
114109 parts = hookpoint_str .split ('.' )
115110
116- # 1. Validate the string format.
117111 is_valid_format = (
118112 len (parts ) == 3 and
119113 parts [0 ] in ['layers' , 'h' ] and
@@ -122,137 +116,75 @@ def _resolve_hookpoint(self, model: Any, hookpoint_str: str) -> Any:
122116 )
123117
124118 if not is_valid_format :
125- # Fallback for simple block types at the top level, e.g. 'embed_in'
126119 if len (parts ) == 1 and hasattr (model , hookpoint_str ):
127120 return getattr (model , hookpoint_str )
128121 raise ValueError (f"Hookpoint string '{ hookpoint_str } ' is not in a recognized format like 'layers.6.mlp'." )
129- # --- End of changes ---
130122
131- # 2. Heuristically find the model prefix.
        # Heuristically find the model prefix.
132124 prefix = None
133125 for p in ["gpt_neox" , "transformer" , "model" ]:
134126 if hasattr (model , p ):
135127 candidate_body = getattr (model , p )
136- # Use parts[0] to get the layer block name ('layers' or 'h')
137128 if hasattr (candidate_body , parts [0 ]):
138129 prefix = p
139130 break
140131
141132 full_path = f"{ prefix } .{ hookpoint_str } " if prefix else hookpoint_str
142133
143- # 3. Use the simple path finder to get the module.
144134 try :
145135 return self ._find_layer (model , full_path )
146136 except AttributeError as e :
147137 raise AttributeError (f"Could not resolve path '{ full_path } '. Model structure might be unexpected. Original error: { e } " )
148138
149-
150-
151-
152- # def _sanitize_examples(self, examples: List[Any]) -> List[Dict[str, Any]]:
153- # """Ensures examples are in a consistent format: a list of dictionaries with 'str_tokens'."""
154- # sanitized = []
155- # for ex in examples:
156- # if isinstance(ex, dict) and "str_tokens" in ex:
157- # sanitized.append(ex)
158- # elif hasattr(ex, "str_tokens"):
159- # sanitized.append({"str_tokens": [str(t) for t in ex.str_tokens]})
160- # elif isinstance(ex, str):
161- # sanitized.append({"str_tokens": [ex]})
162- # elif isinstance(ex, (list, tuple)):
163- # sanitized.append({"str_tokens": [str(t) for t in ex]})
164- # else:
165- # sanitized.append({"str_tokens": [str(ex)]})
166- # return sanitized
167-
168139
169140 def _sanitize_examples (self , examples : List [Any ]) -> List [Dict [str , Any ]]:
141+ """
142+ Function used for formatting results to run smoothly in the delphi pipeline
143+ """
170144 sanitized = []
171145 for ex in examples :
172- # --- NEW, MORE ROBUST LOGIC ---
173- # 1. Prioritize handling objects that have the data we need (like ActivatingExample)
174146 if hasattr (ex , 'str_tokens' ) and ex .str_tokens is not None :
175- # This correctly handles ActivatingExample objects and similar structures.
176- # It extracts the string tokens instead of converting the whole object to a string.
177147 sanitized .append ({'str_tokens' : ex .str_tokens })
178148
179- # 2. Handle cases where the item is already a correct dictionary
180149 elif isinstance (ex , dict ) and "str_tokens" in ex :
181150 sanitized .append (ex )
182151
183- # 3. Handle plain strings
184152 elif isinstance (ex , str ):
185153 sanitized .append ({"str_tokens" : [ex ]})
186154
187- # 4. Handle lists/tuples of strings as a fallback
188155 elif isinstance (ex , (list , tuple )):
189156 sanitized .append ({"str_tokens" : [str (t ) for t in ex ]})
190157
191- # 5. Handle any other unexpected type as a last resort
192158 else :
193159 sanitized .append ({"str_tokens" : [str (ex )]})
194160
195161 return sanitized
196162
197163
198- # def _sanitize_examples(self, examples: List[Any]) -> List[Dict[str, Any]]:
199-
200- # sanitized = []
201- # for i, ex in enumerate(examples):
202-
203-
204- # if isinstance(ex, dict) and "str_tokens" in ex:
205- # sanitized.append(ex)
206-
207-
208- # elif isinstance(ex, str):
209- # # This is the key conversion
210- # converted_ex = {"str_tokens": [ex]}
211- # sanitized.append(converted_ex)
212-
213-
214- # elif isinstance(ex, (list, tuple)):
215- # converted_ex = {"str_tokens": [str(t) for t in ex]}
216- # sanitized.append(converted_ex)
217-
218- # else:
219- # converted_ex = {"str_tokens": [str(ex)]}
220- # sanitized.append(converted_ex)
221-
222- # print("fin this")
223- # return sanitized
224164
225165 async def __call__ (self , record : LatentRecord ) -> ScorerResult :
226- # --- MODIFICATION START ---
227- # 1. Create a deep copy to work on, ensuring we don't interfere
228- # with other parts of the pipeline that might use the original record.
166+
229167 record_copy = copy .deepcopy (record )
230168
231- # 2. Read the raw examples from our copy.
232169 raw_examples = getattr (record_copy , "test" , []) or []
233170
234171 if not raw_examples :
235172 result = SurprisalInterventionResult (score = 0.0 , avg_kl = 0.0 , explanation = record_copy .explanation )
236- # Return the result with the original record since no changes were made.
237173 return ScorerResult (record = record , score = result )
238174
239- # 3. Sanitize the examples.
240175 examples = self ._sanitize_examples (raw_examples )
241176
242- # 4. Overwrite the attributes on the copy with the clean data.
243177 record_copy .test = examples
244178 record_copy .examples = examples
245179 record_copy .train = examples
246180
247- # Now, use the sanitized 'examples' and the 'record_copy' for all subsequent operations.
248181 prompts = ["" .join (ex ["str_tokens" ]) for ex in examples [:self .num_prompts ]]
249182
250183 total_diff = 0.0
251184 total_kl = 0.0
252185 n = 0
253186
254187 for prompt in prompts :
255- # Pass the clean record_copy to the generation methods.
256188 clean_text , clean_logp_dist = await self ._generate_with_and_without_intervention (prompt , record_copy , intervene = False )
257189 int_text , int_logp_dist = await self ._generate_with_and_without_intervention (prompt , record_copy , intervene = True )
258190
@@ -274,7 +206,6 @@ async def __call__(self, record: LatentRecord) -> ScorerResult:
274206 for ex in examples [:self .num_prompts ]:
275207 final_output_list .append ({
276208 "str_tokens" : ex ["str_tokens" ],
277- # Add the final scores. These will be duplicated for each example.
278209 "final_score" : final_score ,
279210 "avg_kl_divergence" : avg_kl ,
280211 # Add placeholder keys that the parser expects, with default values.
@@ -312,14 +243,12 @@ async def _generate_with_and_without_intervention(
312243 if hookpoint_str is None :
313244 raise ValueError ("No hookpoint string specified for intervention." )
314245
315- # Resolve the string into the actual layer module.
316246 layer_to_hook = self ._resolve_hookpoint (self .subject_model , hookpoint_str )
317247
318248 direction = self ._get_intervention_direction (record ).to (device )
319249 direction = direction .unsqueeze (0 ).unsqueeze (0 ) # Shape for broadcasting: [1, 1, D]
320250
321251 def hook_fn (module , inp , out ):
322- # Gracefully handle both tuple and tensor outputs
323252 hidden_states = out [0 ] if isinstance (out , tuple ) else out
324253
325254 # Apply intervention to the last token's hidden state
@@ -423,7 +352,6 @@ def _estimate_direction_from_examples(self, record: LatentRecord) -> torch.Tenso
423352 def capture_hook (module , inp , out ):
424353 hidden_states = out [0 ] if isinstance (out , tuple ) else out
425354
426- # Now, hidden_states is guaranteed to be the 3D activation tensor
427355 captured_activations .append (hidden_states [:, - 1 , :].detach ().cpu ())
428356
429357 hookpoint_str = self .hookpoint_str or getattr (record , "hookpoint" , None )
0 commit comments