[Spelunker] Tweak eval tooling (#743)

Log successful spelunker requests in `~/.spelunker.log`. (A place where it isn't easily accidentally erased.) Add a simple tool to read back `~/.spelunker.log`. (For manual inspection, for now.) Add a simple script to dump the salient parts of an eval database to a text file `dbdump.sql`. (Tool to read it back is TBD.) Rename IdType to ChunkId in `chunker.py`. Preserve `Questions` and `Scores` tables when redoing the eval setup. Lint.
microsoft · Feb 24, 2025 · 63d610a · 63d610a
1 parent cb6ec74
commit 63d610a
Show file tree

Hide file tree

Showing 8 changed files with 632 additions and 44 deletions.
diff --git a/ts/packages/agents/spelunker/evals/design.md b/ts/packages/agents/spelunker/evals/design.md
@@ -1,6 +1,4 @@
-# Spelunker design notes
-
-# Evaluation design
+# Spelunker evaluation design
 
 ## Purpose of the eval
 
@@ -111,7 +109,8 @@ Should be possible to remove certain entries to be redone.
 Basically in a loop:
 
 - If the chunk has already been scored for all questions, skip it.
-- Display the chunk (mostly from its blobs), probably using less
+- Display the chunk (mostly from its blobs).
+  - We use _Pygments_ to colorize and _less_ to page through the text.
 - For each question that hasn't been scored yet:
   - Ask for yes/no, corresponding to "should it be included in the
     oracle context"
@@ -127,4 +126,11 @@ TBD
 
 # Random notes
 
-Scoring tool needs more UI. Maybe use colors? (Code colorization, even?)
+Do we need more versatility in the scoring tool? E.g.
+
+- Pass the question _ID_ on the command line instead of the text.
+- A way to set a fixed score for all chunks in a given file
+  (or a file pattern).
+- A way to review scores (possibly by date range).
+- A way to set a fixed score for a list of chunk IDs
+  (e.g. the References section of an actual answer).
diff --git a/ts/packages/agents/spelunker/evals/eval-1/dbdump.sql b/ts/packages/agents/spelunker/evals/eval-1/dbdump.sql
diff --git a/ts/packages/agents/spelunker/evals/src/dump.sh b/ts/packages/agents/spelunker/evals/src/dump.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# Dump the Questions, Hashes and Scores tables of <path>/eval.db
+# to <path>/dbdump.sql using the sqlite3 command-line shell.
+
+case $1 in
+    "" | "-h" | "--help")
+        echo "Usage: $0 <path>  # eval directory, e.g. evals/eval-1"
+        exit 1
+        ;;
+esac
+
+TABLES="Questions Hashes Scores"
+
+# Quote the directory so paths containing spaces or glob characters work.
+sqlite3 "$1/eval.db" ".dump $TABLES" >"$1/dbdump.sql" || exit 1
+echo "Dumped $TABLES from $1/eval.db to $1/dbdump.sql"
+
diff --git a/ts/packages/agents/spelunker/evals/src/evalscore.py b/ts/packages/agents/spelunker/evals/src/evalscore.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
@@ -7,6 +8,7 @@
 import datetime
 import os
 import sqlite3
+import sys
 
 
 EXT_TO_LANG = {
@@ -30,7 +32,11 @@ def main():
         "--question",
         type=str,
         required=True,
-        help="The question to score chunks against (e.g. 'Describe the toplevel classes and interfaces').",
+        help=(
+            "The question to score chunks against\n"
+            + "(e.g. 'Describe the toplevel classes and interfaces').\n"
+            + "May also be an integer question ID."
+        ),
     )
     args = parser.parse_args()
 
@@ -39,26 +45,14 @@ def main():
 
     filename_prefix = os.path.join(os.path.realpath(args.folder), "source", "")
 
-    question_row = cursor.execute(
-        "SELECT questionId FROM questions WHERE question = ?",
-        [args.question],
-    ).fetchone()
-    if question_row:
-        question_id = question_row[0]
-        print(f"Existing question ID: {question_id}")
-    else:
-        # Write the question to the database (unique key auto-generated)
-        cursor.execute(
-            "INSERT INTO questions (question) VALUES (?)",
-            [args.question],
-        )
+    question = args.question
+    try:
+        question_id = int(question)
+    except ValueError:
+        question_id = get_question_id_by_text(cursor, question)
         conn.commit()
-        [question_id] = cursor.execute(
-            "SELECT questionId FROM questions WHERE question = ?",
-            [args.question],
-        ).fetchone()
-        assert question_id, question_id
-        print(f"New question ID: {question_id}")
+    else:
+        question = get_question_by_id(cursor, question_id)
 
     # Score each chunk
     selection = cursor.execute(
@@ -106,7 +100,7 @@ def main():
         path.reverse()  # E.g. "module class method"
         chunk_text = f"{filename}\n{' '.join(path)}\n"
         chunk_text += get_chunk_text(cursor, chunk_id)
-        score = score_chunk(args.question, chunk_text, language)
+        score = score_chunk(question, chunk_text, language)
 
         timestamp = datetime.datetime.now().isoformat()
         cursor.execute(
@@ -118,6 +112,50 @@ def main():
     conn.close()
 
 
+def get_question_by_id(cursor: sqlite3.Cursor, question_id: int):
+    """Return the text of question `question_id`; exit with status 1 if absent."""
+    row = cursor.execute(
+        "SELECT question FROM Questions WHERE questionId = ?", [question_id]
+    ).fetchone()
+    if not row:
+        print(f"Question ID {question_id} not found", file=sys.stderr)
+        return sys.exit(1)  # sys.exit() raises SystemExit; nothing is returned.
+
+    question = row[0]
+    print(f"Existing question: {question}")
+    return question
+
+
+def get_question_id_by_text(cursor: sqlite3.Cursor, question):
+    """Return the ID of the question with this exact text, inserting it if new.
+
+    No commit is issued here for the INSERT; that is left to the caller.
+    Exits with status 1 if the freshly inserted row cannot be read back.
+    """
+    row = cursor.execute(
+        "SELECT questionId FROM Questions WHERE question = ?", [question]
+    ).fetchone()
+    if row:
+        question_id = row[0]
+        print(f"Existing question ID: {question_id}")
+        return question_id
+
+    # Write the question to the database (ID is auto-generated)
+    cursor.execute("INSERT INTO Questions (question) VALUES (?)", [question])
+
+    # Retrieve the question ID from the newly inserted row
+    row = cursor.execute(
+        "SELECT questionId FROM Questions WHERE question = ?", [question]
+    ).fetchone()
+    if not row:
+        print("Huh? Newly inserted question not found", file=sys.stderr)
+        return sys.exit(1)
+
+    question_id = row[0]
+    print(f"New question ID: {question_id}")
+    return question_id
+
+
 def get_chunk_text(cursor: sqlite3.Cursor, chunkid):
     text_lines = []
     for (lines,) in cursor.execute(

diff --git a/ts/packages/agents/spelunker/evals/src/evalsetup.py b/ts/packages/agents/spelunker/evals/src/evalsetup.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
@@ -8,7 +9,8 @@
 By default SOURCE is ~/.typeagent/agents/spelunker/codeSearchDatabase.db,
 and EVALDIR evals/eval-1.
 
-EVALDIR is always a new directory; if the given directory already exists,
+EVALDIR is always a new directory, unless --overwrite is given.
+If the given directory already exists,
 we create a new directory name by adding -2, -3, etc.
 
 This script does the following:
@@ -64,7 +66,7 @@ def main():
 
     if not os.path.exists(source):
         print(f"Source database {source} does not exist.", file=sys.stderr)
-        os._exit(2)
+        return sys.exit(2)
 
     if not args.overwrite:
         while os.path.exists(evaldir):
@@ -82,11 +84,6 @@ def main():
     dbname = os.path.join(evaldir, "eval.db")
     print(f"Database: {dbname}")
 
-    if args.overwrite:
-        # TODO: Unsafe, but okay for now
-        assert "'" not in dbname, dbname  # TODO: Still not safe?
-        os.system(f"rm '{dbname}'*")
-
     src_conn = sqlite3.connect(f"file:{source}?mode=ro", uri=True)
     src_cur = src_conn.cursor()
     dst_conn = sqlite3.connect(dbname)
@@ -147,9 +144,11 @@ def add_new_tables(dst_cur):
         if not sql or sql.startswith("--"):
             continue
         table_name = sql.split()[5]
-        print(f"Creating table {table_name} and clearing it")
+        print(f"Creating table {table_name}")
         dst_cur.execute(sql)
-        dst_cur.execute(f"DELETE FROM {table_name}")
+        if table_name == "Hashes":
+            print(f"Clearing contents of table {table_name}")
+            dst_cur.execute(f"DELETE FROM {table_name}")
 
 
 def fill_in_hashes(dst_cur, prefix):
@@ -163,7 +162,9 @@ def fill_in_hashes(dst_cur, prefix):
         if filename.startswith(prefix):
             filename = filename[len(prefix) :]  # E.g. 'dispatcher/src/index.ts'
         else:
-            print(f"Skipping chunk {chunkid} ({filename}) because it is not in {prefix}")
+            print(
+                f"Skipping chunk {chunkid} ({filename}) because it is not in {prefix}"
+            )
             continue
         input_lines = [filename]  # Start with the cleaned-up filename
 

diff --git a/ts/packages/agents/spelunker/evals/src/readlog.py b/ts/packages/agents/spelunker/evals/src/readlog.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Print the records logged in ~/.spelunker.log (one JSON object per line)."""
+
+import argparse
+import json
+import os
+import time
+
+
+def main():
+    """Parse the command line and print each record of the log file."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-q",
+        "--questions",
+        action="store_true",
+        help="Show the questions only.",
+        default=False,
+    )
+    parser.add_argument(
+        "-f",
+        "--follow",
+        action="store_true",
+        help="Follow the log file (like tail -f).",
+        default=False,
+    )
+    args = parser.parse_args()
+    # Same location that searchCode.ts appends to, including the fallback
+    # to a relative path when HOME is unset.
+    logfile = os.path.join(os.getenv("HOME") or "", ".spelunker.log")
+    try:
+        # The log is written by Node, which encodes strings as UTF-8.
+        f = open(logfile, "r", encoding="utf-8")
+    except FileNotFoundError:
+        raise SystemExit(f"Log file {logfile} does not exist") from None
+    with f:
+        try:
+            for line in _read_lines(f, args.follow):
+                _show_line(line, args.questions)
+        except KeyboardInterrupt:
+            pass  # The normal way to leave --follow mode.
+
+
+def _read_lines(f, follow):
+    """Yield the lines of f, waiting for more at end of file if follow is set.
+
+    In follow mode a line is held back until its newline has been written,
+    so a record that is still being appended is never parsed half-way.
+    """
+    pending = ""
+    while True:
+        chunk = f.readline()
+        if not chunk:
+            if not follow:
+                break
+            time.sleep(0.2)
+            continue
+        pending += chunk
+        if follow and not pending.endswith("\n"):
+            continue  # Partial line; wait for the rest.
+        yield pending
+        pending = ""
+
+
+def _show_line(line, questions_only):
+    """Decode one log line as JSON and print it; report lines that don't parse."""
+    try:
+        data = json.loads(line)
+    except json.JSONDecodeError:
+        print(f"Invalid JSON: {line:.100}")
+        return
+    if not isinstance(data, dict):
+        print(f"Not a JSON object: {line:.100}")
+        return
+    # Use .get() so a record lacking a field prints None instead of crashing.
+    if questions_only:
+        print(data.get("question"))
+    else:
+        print()
+        print("-" * 50)
+        print(f"Question: {data.get('question')}")
+        print(f"Answer: {data.get('answer')}")
+        print(f"References: {data.get('references')}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ts/packages/agents/spelunker/src/chunker.py b/ts/packages/agents/spelunker/src/chunker.py
@@ -31,7 +31,7 @@
 from typing import Any, Iterator
 
 
-IdType = str
+ChunkId = str
 
 
 @dataclass
@@ -40,7 +40,7 @@ class Blob:
 
     start: int  # 0-based!
     lines: list[str]
-    breadcrumb: IdType | None = None  # Chunk id if breadcrumb
+    breadcrumb: ChunkId | None = None  # Chunk id if breadcrumb
 
     def to_dict(self) -> dict[str, object]:
         result: dict[str, Any] = {
@@ -56,16 +56,16 @@ def to_dict(self) -> dict[str, object]:
 class Chunk:
     """A chunk at any level of nesting (root, inner, leaf)."""
 
-    chunkId: IdType
+    chunkId: ChunkId
     treeName: str  # AST node name
     codeName: str  # function/class/module name (TODO: dotted names)
     blobs: list[Blob]  # Blobs around the placeholders
 
     # For inner chunks:
-    parentId: IdType
+    parentId: ChunkId
 
     # For outer chunks:
-    children: list[IdType]  # len() is one less than len(blobs)
+    children: list[ChunkId]  # len() is one less than len(blobs)
 
     # Used by custom_json() below.
     def to_dict(self) -> dict[str, object]:
@@ -121,7 +121,7 @@ def custom_json(obj: object) -> dict[str, object]:
 last_ts: datetime.datetime = datetime.datetime.now()
 
 
-def generate_id() -> IdType:
+def generate_id() -> ChunkId:
     """Generate a new unique ID.
 
     IDs are really timestamps formatted as YYYY_MM_DD-HH_MM_SS.UUUUUU,

diff --git a/ts/packages/agents/spelunker/src/searchCode.ts b/ts/packages/agents/spelunker/src/searchCode.ts
@@ -25,6 +25,7 @@ import {
 import { createDatabase, purgeFile } from "./databaseUtils.js";
 import { loadEmbeddings, preSelectChunks } from "./embeddings.js";
 import { console_log, resetEpoch } from "./logging.js";
+import { OracleSpecs } from "./oracleSchema.js";
 import { chunkifyPythonFiles } from "./pythonChunker.js";
 import { createQueryContext } from "./queryContext.js";
 import { retryOn429 } from "./retryLogic.js";
@@ -76,10 +77,18 @@ export async function searchCode(
     }
 
     // 6. Extract answer from result.
-    const result = wrappedResult.data;
+    const result: OracleSpecs = wrappedResult.data;
     const answer =
         result.answer.trimEnd() + formatReferences(result.references);
 
+    // 6a. Log the answer to a permanent place.
+    // Wrong place in the hierarchy, but avoids accidental deletion
+    const logFile: string = path.join(process.env.HOME ?? "", ".spelunker.log");
+    const logRecord = JSON.stringify(result);
+    const fd = fs.openSync(logFile, "a");
+    fs.writeSync(fd, logRecord + "\n");
+    fs.closeSync(fd);
+
     // 7. Produce entities and an action result from the result.
     const outputEntities = produceEntitiesFromResult(result, allChunks, db);
     const resultEntity = createResultEntity(input, answer);
@@ -160,7 +169,7 @@ function constructPrompt(input: string, chunks: Chunk[]): string {
 async function queryOracle(
     context: SpelunkerContext,
     prompt: string,
-): Promise<Result<any>> {
+): Promise<Result<OracleSpecs>> {
     console_log(`[Step 5: Ask the oracle]`);
     return await context.queryContext!.oracle.translate(prompt);
 }