Skip to content

Commit

Permalink
[Spelunker] Tweak eval tooling (#743)
Browse files Browse the repository at this point in the history
Log successful spelunker requests in `~/.spelunker.log`. (A place where
it isn't easily accidentally erased.)
Add a simple tool to read back `~/.spelunker.log`. (For manual
inspection, for now.)
Add a simple script to dump the salient parts of an eval database to a
text file `dbdump.sql`. (Tool to read it back is TBD.)
Rename IdType to ChunkId in `chunker.py`.
Preserve `Questions` and `Scores` tables when redoing the eval setup.
Lint.
  • Loading branch information
gvanrossum-ms authored Feb 24, 2025
1 parent cb6ec74 commit 63d610a
Show file tree
Hide file tree
Showing 8 changed files with 632 additions and 44 deletions.
16 changes: 11 additions & 5 deletions ts/packages/agents/spelunker/evals/design.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
# Spelunker design notes

# Evaluation design
# Spelunker evaluation design

## Purpose of the eval

Expand Down Expand Up @@ -111,7 +109,8 @@ Should be possible to remove certain entries to be redone.
Basically in a loop:

- If the chunk has already been scored for all questions, skip it.
- Display the chunk (mostly from its blobs), probably using less
- Display the chunk (mostly from its blobs).
- We use _Pygments_ to colorize and _less_ to page through the text.
- For each question that hasn't been scored yet:
- Ask for yes/no, corresponding to "should it be included in the
oracle context"
Expand All @@ -127,4 +126,11 @@ TBD

# Random notes

Scoring tool needs more UI. Maybe use colors? (Code colorization, even?)
Do we need more versatility in the scoring tool? E.g.

- Pass the question _ID_ on the command line instead of the text.
- A way to set a fixed score for all chunks in a given file
(or a file pattern).
- A way to review scores (possibly by date range).
- A way to set a fixed score for a list of chunk IDs
(e.g. the References section of an actual answer).
463 changes: 463 additions & 0 deletions ts/packages/agents/spelunker/evals/eval-1/dbdump.sql

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions ts/packages/agents/spelunker/evals/src/dump.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

# Dump the salient tables of an eval database to a SQL text file.
#
# $1 is the eval directory (e.g. evals/eval-1); it must contain eval.db.
# The dump is written to dbdump.sql in that same directory.
case "$1" in
    "" | "-h" | "--help")
        echo "Usage: $0 <path> # eval directory, e.g. evals/eval-1"
        exit 1
        ;;
esac

# Intentionally left unquoted nowhere: this is passed to sqlite3 as part of a
# single ".dump" dot-command string, which takes space-separated table names.
TABLES="Questions Hashes Scores"

# Every expansion of $1 is quoted so that eval directories whose names contain
# spaces or glob characters work; an unquoted redirect target that splits into
# several words makes bash fail with "ambiguous redirect".
sqlite3 "$1/eval.db" ".dump $TABLES" >"$1/dbdump.sql" || exit 1
echo "Dumped $TABLES $1/eval.db to $1/dbdump.sql"

80 changes: 59 additions & 21 deletions ts/packages/agents/spelunker/evals/src/evalscore.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

Expand All @@ -7,6 +8,7 @@
import datetime
import os
import sqlite3
import sys


EXT_TO_LANG = {
Expand All @@ -30,7 +32,11 @@ def main():
"--question",
type=str,
required=True,
help="The question to score chunks against (e.g. 'Describe the toplevel classes and interfaces').",
help=(
"The question to score chunks against\n"
+ "(e.g. 'Describe the toplevel classes and interfaces').\n"
+ "May also be an integer question ID."
),
)
args = parser.parse_args()

Expand All @@ -39,26 +45,14 @@ def main():

filename_prefix = os.path.join(os.path.realpath(args.folder), "source", "")

question_row = cursor.execute(
"SELECT questionId FROM questions WHERE question = ?",
[args.question],
).fetchone()
if question_row:
question_id = question_row[0]
print(f"Existing question ID: {question_id}")
else:
# Write the question to the database (unique key auto-generated)
cursor.execute(
"INSERT INTO questions (question) VALUES (?)",
[args.question],
)
question = args.question
try:
question_id = int(question)
except ValueError:
question_id = get_question_id_by_text(cursor, question)
conn.commit()
[question_id] = cursor.execute(
"SELECT questionId FROM questions WHERE question = ?",
[args.question],
).fetchone()
assert question_id, question_id
print(f"New question ID: {question_id}")
else:
question = get_question_by_id(cursor, question_id)

# Score each chunk
selection = cursor.execute(
Expand Down Expand Up @@ -106,7 +100,7 @@ def main():
path.reverse() # E.g. "module class method"
chunk_text = f"{filename}\n{' '.join(path)}\n"
chunk_text += get_chunk_text(cursor, chunk_id)
score = score_chunk(args.question, chunk_text, language)
score = score_chunk(question, chunk_text, language)

timestamp = datetime.datetime.now().isoformat()
cursor.execute(
Expand All @@ -118,6 +112,50 @@ def main():
conn.close()


def get_question_by_id(cursor: sqlite3.Cursor, question_id):
    """Return the text of the question stored under *question_id*.

    Looks up the ``question`` column of the ``Questions`` table for the row
    whose ``questionId`` equals *question_id*.

    On success the question text is echoed to stdout and returned.
    If no such row exists, a diagnostic is written to stderr and the
    process exits with status 1 (``SystemExit``).
    """
    row = cursor.execute(
        "SELECT question FROM Questions WHERE questionId = ?",
        [question_id],
    ).fetchone()
    if not row:
        # Fatal diagnostics go to stderr so they are not mixed into
        # (or lost with) redirected normal output.
        print(f"Question ID {question_id} not found", file=sys.stderr)
        # sys.exit() never returns; the `return` just makes that explicit
        # to readers and static analyzers.
        return sys.exit(1)

    question = row[0]
    print(f"Existing question: {question}")
    return question


def get_question_id_by_text(cursor: sqlite3.Cursor, question):
    """Return the ``questionId`` for *question*, inserting it if necessary.

    If a row in the ``Questions`` table already has this exact question
    text, its ID is printed and returned.  Otherwise a new row is inserted
    (the ID is generated by the database), and the new ID is read back,
    printed and returned.

    This function does not commit; the insert becomes durable only when
    the owner of the connection commits.

    If the freshly inserted row cannot be read back, a diagnostic is
    written to stderr and the process exits with status 1 (``SystemExit``).
    """
    # The same lookup is used before and after the insert.
    select_sql = "SELECT questionId FROM Questions WHERE question = ?"

    row = cursor.execute(select_sql, [question]).fetchone()
    if row:
        question_id = row[0]
        print(f"Existing question ID: {question_id}")
        return question_id

    # Write the question to the database (ID is auto-generated)
    cursor.execute(
        "INSERT INTO Questions (question) VALUES (?)",
        [question],
    )

    # Retrieve the question ID from the newly inserted row
    row = cursor.execute(select_sql, [question]).fetchone()
    if not row:
        # Should be impossible; report on stderr like any fatal error.
        print("Huh? Newly inserted question not found", file=sys.stderr)
        # sys.exit() never returns; the `return` just makes that explicit.
        return sys.exit(1)

    question_id = row[0]
    print(f"New question ID: {question_id}")
    return question_id


def get_chunk_text(cursor: sqlite3.Cursor, chunkid):
text_lines = []
for (lines,) in cursor.execute(
Expand Down
21 changes: 11 additions & 10 deletions ts/packages/agents/spelunker/evals/src/evalsetup.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

Expand All @@ -8,7 +9,8 @@
By default SOURCE is ~/.typeagent/agents/spelunker/codeSearchDatabase.db,
and EVALDIR evals/eval-1.
EVALDIR is always a new directory; if the given directory already exists,
EVALDIR is always a new directory, unless --overwrite is given.
If the given directory already exists,
we create a new directory name by adding -2, -3, etc.
This script does the following:
Expand Down Expand Up @@ -64,7 +66,7 @@ def main():

if not os.path.exists(source):
print(f"Source database {source} does not exist.", file=sys.stderr)
os._exit(2)
return sys.exit(2)

if not args.overwrite:
while os.path.exists(evaldir):
Expand All @@ -82,11 +84,6 @@ def main():
dbname = os.path.join(evaldir, "eval.db")
print(f"Database: {dbname}")

if args.overwrite:
# TODO: Unsafe, but okay for now
assert "'" not in dbname, dbname # TODO: Still not safe?
os.system(f"rm '{dbname}'*")

src_conn = sqlite3.connect(f"file:{source}?mode=ro", uri=True)
src_cur = src_conn.cursor()
dst_conn = sqlite3.connect(dbname)
Expand Down Expand Up @@ -147,9 +144,11 @@ def add_new_tables(dst_cur):
if not sql or sql.startswith("--"):
continue
table_name = sql.split()[5]
print(f"Creating table {table_name} and clearing it")
print(f"Creating table {table_name}")
dst_cur.execute(sql)
dst_cur.execute(f"DELETE FROM {table_name}")
if table_name == "Hashes":
print(f"Clearing contents of table {table_name}")
dst_cur.execute(f"DELETE FROM {table_name}")


def fill_in_hashes(dst_cur, prefix):
Expand All @@ -163,7 +162,9 @@ def fill_in_hashes(dst_cur, prefix):
if filename.startswith(prefix):
filename = filename[len(prefix) :] # E.g. 'dispatcher/src/index.ts'
else:
print(f"Skipping chunk {chunkid} ({filename}) because it is not in {prefix}")
print(
f"Skipping chunk {chunkid} ({filename}) because it is not in {prefix}"
)
continue
input_lines = [filename] # Start with the cleaned-up filename

Expand Down
55 changes: 55 additions & 0 deletions ts/packages/agents/spelunker/evals/src/readlog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
import json
import os
import time


def main():
    """Print the records found in ``$HOME/.spelunker.log``.

    The log is read as one JSON object per line.  Command-line flags:

    ``-q`` / ``--questions``
        Print only the ``question`` field of each record.
    ``-f`` / ``--follow``
        After reaching the end of the file, keep polling for new lines
        (like ``tail -f``) instead of returning.

    Without ``-q``, each record is printed as a separator followed by its
    ``question``, ``answer`` and ``references`` fields.  Lines that are not
    valid JSON, or that are not objects carrying the fields needed for the
    selected output mode, are reported and skipped rather than aborting
    the run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-q",
        "--questions",
        action="store_true",
        help="Show the questions only.",
        default=False,
    )
    parser.add_argument(
        "-f",
        "--follow",
        action="store_true",
        help="Follow the log file (like tail -f).",
        default=False,
    )
    args = parser.parse_args()
    questions_only = args.questions
    follow = args.follow

    # Fields a record must carry for the chosen output mode.
    if questions_only:
        required = ("question",)
    else:
        required = ("question", "answer", "references")

    log_path = os.path.join(os.getenv("HOME") or "", ".spelunker.log")
    # Decode explicitly as UTF-8 instead of relying on the locale's
    # default encoding, which may differ from what the log contains.
    with open(log_path, "r", encoding="utf-8") as f:
        while True:
            line = f.readline()
            if not line:
                if not follow:
                    break
                # At EOF in follow mode: wait briefly, then poll again.
                time.sleep(0.2)
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                print(f"Invalid JSON: {line:.100}")
                continue
            # Valid JSON is not necessarily a usable record (e.g. `null`,
            # a list, or an object missing a field); skip those instead of
            # dying with KeyError/TypeError in the middle of the log.
            if not isinstance(data, dict) or any(
                key not in data for key in required
            ):
                print(f"Incomplete record: {line.rstrip():.100}")
                continue
            if questions_only:
                print(data["question"])
            else:
                print()
                print("-" * 50)
                print(f"Question: {data['question']}")
                print(f"Answer: {data['answer']}")
                print(f"References: {data['references']}")


if __name__ == "__main__":
main()
12 changes: 6 additions & 6 deletions ts/packages/agents/spelunker/src/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from typing import Any, Iterator


IdType = str
ChunkId = str


@dataclass
Expand All @@ -40,7 +40,7 @@ class Blob:

start: int # 0-based!
lines: list[str]
breadcrumb: IdType | None = None # Chunk id if breadcrumb
breadcrumb: ChunkId | None = None # Chunk id if breadcrumb

def to_dict(self) -> dict[str, object]:
result: dict[str, Any] = {
Expand All @@ -56,16 +56,16 @@ def to_dict(self) -> dict[str, object]:
class Chunk:
"""A chunk at any level of nesting (root, inner, leaf)."""

chunkId: IdType
chunkId: ChunkId
treeName: str # AST node name
codeName: str # function/class/module name (TODO: dotted names)
blobs: list[Blob] # Blobs around the placeholders

# For inner chunks:
parentId: IdType
parentId: ChunkId

# For outer chunks:
children: list[IdType] # len() is one less than len(blobs)
children: list[ChunkId] # len() is one less than len(blobs)

# Used by custom_json() below.
def to_dict(self) -> dict[str, object]:
Expand Down Expand Up @@ -121,7 +121,7 @@ def custom_json(obj: object) -> dict[str, object]:
last_ts: datetime.datetime = datetime.datetime.now()


def generate_id() -> IdType:
def generate_id() -> ChunkId:
"""Generate a new unique ID.
IDs are really timestamps formatted as YYYY_MM_DD-HH_MM_SS.UUUUUU,
Expand Down
13 changes: 11 additions & 2 deletions ts/packages/agents/spelunker/src/searchCode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
import { createDatabase, purgeFile } from "./databaseUtils.js";
import { loadEmbeddings, preSelectChunks } from "./embeddings.js";
import { console_log, resetEpoch } from "./logging.js";
import { OracleSpecs } from "./oracleSchema.js";
import { chunkifyPythonFiles } from "./pythonChunker.js";
import { createQueryContext } from "./queryContext.js";
import { retryOn429 } from "./retryLogic.js";
Expand Down Expand Up @@ -76,10 +77,18 @@ export async function searchCode(
}

// 6. Extract answer from result.
const result = wrappedResult.data;
const result: OracleSpecs = wrappedResult.data;
const answer =
result.answer.trimEnd() + formatReferences(result.references);

// 6a. Log the answer to a permanent place.
// Wrong place in the hierarchy, but avoids accidental deletion
const logFile: string = path.join(process.env.HOME ?? "", ".spelunker.log");
const logRecord = JSON.stringify(result);
const fd = fs.openSync(logFile, "a");
fs.writeSync(fd, logRecord + "\n");
fs.closeSync(fd);

// 7. Produce entities and an action result from the result.
const outputEntities = produceEntitiesFromResult(result, allChunks, db);
const resultEntity = createResultEntity(input, answer);
Expand Down Expand Up @@ -160,7 +169,7 @@ function constructPrompt(input: string, chunks: Chunk[]): string {
async function queryOracle(
context: SpelunkerContext,
prompt: string,
): Promise<Result<any>> {
): Promise<Result<OracleSpecs>> {
console_log(`[Step 5: Ask the oracle]`);
return await context.queryContext!.oracle.translate(prompt);
}
Expand Down

0 comments on commit 63d610a

Please sign in to comment.