diff --git a/.github/workflows/upskill-internal-pr.yml b/.github/workflows/upskill-internal-pr.yml new file mode 100644 index 00000000..a404865f --- /dev/null +++ b/.github/workflows/upskill-internal-pr.yml @@ -0,0 +1,90 @@ +name: Validate Skill Performance With Upskill + +on: + pull_request: + paths: + - ".github/workflows/upskill-internal-pr.yml" + - ".upskill/**" + - "evals/**" + - "upskill.config.yaml" + - "fastagent.config.yaml" + - "scripts/upskill_ci.py" + - "skills/**/SKILL.md" + - "skills/**/references/**" + - "skills/**/scripts/**" + push: + branches: + - main + paths: + - ".github/workflows/upskill-internal-pr.yml" + - ".upskill/**" + - "evals/**" + - "upskill.config.yaml" + - "fastagent.config.yaml" + - "scripts/upskill_ci.py" + - "skills/**/SKILL.md" + - "skills/**/references/**" + - "skills/**/scripts/**" + +jobs: + upskill: + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository + runs-on: ubuntu-latest + permissions: + contents: read + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + UPSKILL_JUDGE_MODEL: "claude-haiku-4-5-20251001" + UPSKILL_CI_RUNS: "1" + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Set up uv + uses: astral-sh/setup-uv@v7 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13.5" + + - name: Resolve comparison ref + run: | + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "UPSKILL_BASE_REF=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_ENV" + echo "UPSKILL_PERSIST=false" >> "$GITHUB_ENV" + else + echo "UPSKILL_BASE_REF=${{ github.event.before }}" >> "$GITHUB_ENV" + echo "UPSKILL_PERSIST=true" >> "$GITHUB_ENV" + fi + + - name: Run upskill comparison + run: | + set -euo pipefail + cmd=( + uv run + --python 3.13.5 + --with git+https://github.com/huggingface/upskill@main + --with huggingface_hub + python + scripts/upskill_ci.py + --base-ref "$UPSKILL_BASE_REF" + --runs "$UPSKILL_CI_RUNS" + ) + if [ "$UPSKILL_PERSIST" = "true" ]; then + cmd+=(--persist) + fi + "${cmd[@]}" + + - name: Upload upskill artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: upskill-artifacts-${{ github.run_id }} + path: | + .upskill/artifacts + .upskill/reports + if-no-files-found: ignore diff --git a/.gitignore b/.gitignore index 382b6ad0..d1127f58 100644 --- a/.gitignore +++ b/.gitignore @@ -207,6 +207,9 @@ marimo/_lsp/ __marimo__/ .claude .fast-agent/ +.upskill/artifacts/ +.upskill/reports/ +.upskill/runs/ # MacOS -.DS_Store \ No newline at end of file +.DS_Store diff --git a/.upskill/README.md b/.upskill/README.md new file mode 100644 index 00000000..f37a5b24 --- /dev/null +++ b/.upskill/README.md @@ -0,0 +1,20 @@ +# Upskill CI + +This directory contains the smoke-eval manifest and supporting files for the +skills performance workflow. + +## What runs + +- Internal pull requests only: the workflow skips fork PRs entirely. +- Pushes to `main`: the workflow reruns changed scenarios and persists a + normalized history record to a Hugging Face dataset. + +## Required secrets + +- `ANTHROPIC_API_KEY` for local `upskill` evaluation and LLM-as-a-judge. +- `HF_TOKEN` for history uploads on `main` pushes. + +## Files + +- `evals.json` maps skills to smoke scenarios. +- `../evals/**` contains the fixed test cases used by CI. diff --git a/.upskill/evals.json b/.upskill/evals.json new file mode 100644 index 00000000..ea87ced2 --- /dev/null +++ b/.upskill/evals.json @@ -0,0 +1,147 @@ +{ + "scenarios": [ + { + "id": "hf-cli-smoke", + "skills": ["skills/hf-cli"], + "tests": "evals/hf-cli/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Uses the modern hf CLI, not deprecated huggingface-cli commands.", + "Provides a concrete command that is directly executable.", + "Stays focused on the user's request." + ] + } + }, + { + "id": "community-evals-smoke", + "skills": ["skills/huggingface-community-evals"], + "tests": "evals/huggingface-community-evals/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Chooses the correct local evaluation script for the task.", + "Keeps the workflow local instead of drifting into HF Jobs.", + "Provides a runnable command shape." + ] + } + }, + { + "id": "datasets-smoke", + "skills": ["skills/huggingface-datasets"], + "tests": "evals/huggingface-datasets/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Uses the Dataset Viewer API endpoints that match the requested workflow.", + "Covers split discovery, preview, and pagination.", + "Keeps the answer concrete and operational." + ] + } + }, + { + "id": "gradio-smoke", + "skills": ["skills/huggingface-gradio"], + "tests": "evals/huggingface-gradio/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Shows a minimal valid Gradio solution.", + "Uses the API shape that matches the user request.", + "Keeps the code concise and directly usable." + ] + } + }, + { + "id": "jobs-smoke", + "skills": ["skills/huggingface-jobs"], + "tests": "evals/huggingface-jobs/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Uses the hf_jobs MCP tool as the preferred submission path.", + "Mentions HF_TOKEN handling for Hub access.", + "Gives a concrete submission pattern rather than generic advice." + ] + } + }, + { + "id": "llm-trainer-smoke", + "skills": ["skills/huggingface-llm-trainer"], + "tests": "evals/huggingface-llm-trainer/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Uses the training workflow described by the skill.", + "Keeps Trackio or equivalent monitoring enabled.", + "Points to the right script or job pattern for SFT." + ] + } + }, + { + "id": "paper-publisher-smoke", + "skills": ["skills/huggingface-paper-publisher"], + "tests": "evals/huggingface-paper-publisher/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Uses the paper_manager script correctly.", + "Provides the right subcommand and required flag.", + "Keeps the answer concrete and executable." + ] + } + }, + { + "id": "papers-smoke", + "skills": ["skills/huggingface-papers"], + "tests": "evals/huggingface-papers/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Uses the HF paper metadata and markdown endpoints correctly.", + "Handles the provided paper id directly.", + "Keeps the answer narrow and API-focused." + ] + } + }, + { + "id": "trackio-smoke", + "skills": ["skills/huggingface-trackio"], + "tests": "evals/huggingface-trackio/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Uses the Trackio CLI flow that matches the retrieval task.", + "Returns JSON-oriented automation guidance.", + "Keeps the answer concrete and specific." + ] + } + }, + { + "id": "vision-trainer-smoke", + "skills": ["skills/huggingface-vision-trainer"], + "tests": "evals/huggingface-vision-trainer/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Emphasizes dataset validation before expensive training.", + "Uses the helper script referenced by the skill.", + "Keeps the answer focused on the requested preflight step." + ] + } + }, + { + "id": "transformers-js-smoke", + "skills": ["skills/transformers-js"], + "tests": "evals/transformers-js/smoke.json", + "judge": { + "enabled": true, + "criteria": [ + "Shows the minimal correct Transformers.js setup for Node.js.", + "Includes proper pipeline usage and cleanup.", + "Keeps the answer directly usable." + ] + } + } + ] +} diff --git a/evals/hf-cli/smoke.json b/evals/hf-cli/smoke.json new file mode 100644 index 00000000..43d8475e --- /dev/null +++ b/evals/hf-cli/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "A user wants the modern Hugging Face CLI command to download the model bert-base-uncased into ./models and explicitly does not want deprecated CLI names. Give the exact command.", + "verifiers": [ + {"type": "contains", "values": ["hf download"]}, + {"type": "contains", "values": ["bert-base-uncased"]}, + {"type": "contains", "values": ["--local-dir ./models"]} + ] + } + ] +} diff --git a/evals/huggingface-community-evals/smoke.json b/evals/huggingface-community-evals/smoke.json new file mode 100644 index 00000000..4aa0699f --- /dev/null +++ b/evals/huggingface-community-evals/smoke.json @@ -0,0 +1,13 @@ +{ + "cases": [ + { + "input": "I want a quick local inspect-ai smoke test against a Hub model using inference providers, not Hugging Face Jobs. Give the command.", + "verifiers": [ + {"type": "contains", "values": ["uv run scripts/inspect_eval_uv.py"]}, + {"type": "contains", "values": ["--model"]}, + {"type": "contains", "values": ["--task"]}, + {"type": "contains", "values": ["--limit"]} + ] + } + ] +} diff --git a/evals/huggingface-datasets/smoke.json b/evals/huggingface-datasets/smoke.json new file mode 100644 index 00000000..e6025665 --- /dev/null +++ b/evals/huggingface-datasets/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "I need to inspect a dataset with the Dataset Viewer API: resolve splits, preview rows, then paginate more rows. Describe the exact endpoints I should call.", + "verifiers": [ + {"type": "contains", "values": ["/splits?dataset="]}, + {"type": "contains", "values": ["/first-rows?dataset="]}, + {"type": "contains", "values": ["/rows?dataset="]} + ] + } + ] +} diff --git a/evals/huggingface-gradio/smoke.json b/evals/huggingface-gradio/smoke.json new file mode 100644 index 00000000..1ec5c38c --- /dev/null +++ b/evals/huggingface-gradio/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "Show a minimal Gradio chatbot app in Python.", + "verifiers": [ + {"type": "contains", "values": ["import gradio as gr"]}, + {"type": "contains", "values": ["gr.ChatInterface"]}, + {"type": "contains", "values": [".launch()"]} + ] + } + ] +} diff --git a/evals/huggingface-jobs/smoke.json b/evals/huggingface-jobs/smoke.json new file mode 100644 index 00000000..2adf18ac --- /dev/null +++ b/evals/huggingface-jobs/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "Run a short Python workload on Hugging Face Jobs and make sure HF_TOKEN is available to the job. Show the preferred submission pattern.", + "verifiers": [ + {"type": "contains", "values": ["hf_jobs"]}, + {"type": "contains", "values": ["HF_TOKEN"]}, + {"type": "contains", "values": ["script"]} + ] + } + ] +} diff --git a/evals/huggingface-llm-trainer/smoke.json b/evals/huggingface-llm-trainer/smoke.json new file mode 100644 index 00000000..c27b2f0a --- /dev/null +++ b/evals/huggingface-llm-trainer/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "Start an SFT training job on Hugging Face Jobs and keep real-time monitoring enabled. What pattern should you use?", + "verifiers": [ + {"type": "contains", "values": ["hf_jobs"]}, + {"type": "contains", "values": ["Trackio"]}, + {"type": "contains", "values": ["scripts/train_sft_example.py"]} + ] + } + ] +} diff --git a/evals/huggingface-paper-publisher/smoke.json b/evals/huggingface-paper-publisher/smoke.json new file mode 100644 index 00000000..48be6bb1 --- /dev/null +++ b/evals/huggingface-paper-publisher/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "I want to index arXiv paper 2301.12345 on the Hub. Give the exact command.", + "verifiers": [ + {"type": "contains", "values": ["uv run scripts/paper_manager.py index"]}, + {"type": "contains", "values": ["--arxiv-id"]}, + {"type": "contains", "values": ["2301.12345"]} + ] + } + ] +} diff --git a/evals/huggingface-papers/smoke.json b/evals/huggingface-papers/smoke.json new file mode 100644 index 00000000..8f446751 --- /dev/null +++ b/evals/huggingface-papers/smoke.json @@ -0,0 +1,11 @@ +{ + "cases": [ + { + "input": "Given arXiv ID 2602.08025, show the Hugging Face API call for structured metadata and the markdown fetch path.", + "verifiers": [ + {"type": "contains", "values": ["/api/papers/2602.08025"]}, + {"type": "contains", "values": ["/papers/2602.08025.md"]} + ] + } + ] +} diff --git a/evals/huggingface-trackio/smoke.json b/evals/huggingface-trackio/smoke.json new file mode 100644 index 00000000..f97941cf --- /dev/null +++ b/evals/huggingface-trackio/smoke.json @@ -0,0 +1,11 @@ +{ + "cases": [ + { + "input": "How do I retrieve Trackio alerts as JSON for automation?", + "verifiers": [ + {"type": "contains", "values": ["trackio list alerts"]}, + {"type": "contains", "values": ["--json"]} + ] + } + ] +} diff --git a/evals/huggingface-vision-trainer/smoke.json b/evals/huggingface-vision-trainer/smoke.json new file mode 100644 index 00000000..c5ffce68 --- /dev/null +++ b/evals/huggingface-vision-trainer/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "Before launching GPU training on an unknown dataset, what local validation command should you run first?", + "verifiers": [ + {"type": "contains", "values": ["uv run scripts/dataset_inspector.py"]}, + {"type": "contains", "values": ["--dataset"]}, + {"type": "contains", "values": ["--split"]} + ] + } + ] +} diff --git a/evals/transformers-js/smoke.json b/evals/transformers-js/smoke.json new file mode 100644 index 00000000..2aeaed1f --- /dev/null +++ b/evals/transformers-js/smoke.json @@ -0,0 +1,12 @@ +{ + "cases": [ + { + "input": "Show the minimal Node.js sentiment-analysis setup with Transformers.js, including cleanup.", + "verifiers": [ + {"type": "contains", "values": ["npm install @huggingface/transformers"]}, + {"type": "contains", "values": ["pipeline("]}, + {"type": "contains", "values": [".dispose()"]} + ] + } + ] +} diff --git a/fastagent.config.yaml b/fastagent.config.yaml new file mode 100644 index 00000000..e8e5ccdf --- /dev/null +++ b/fastagent.config.yaml @@ -0,0 +1,6 @@ +default_model: kimi + +logger: + progress_display: false + show_chat: false + streaming: markdown diff --git a/scripts/upskill_ci.py b/scripts/upskill_ci.py new file mode 100644 index 00000000..42a4c249 --- /dev/null +++ b/scripts/upskill_ci.py @@ -0,0 +1,955 @@ +#!/usr/bin/env python3 +"""Run upskill-based smoke evaluations for changed skills and compare them to a base ref.""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import os +import subprocess +import sys +import tempfile +import urllib.error +import urllib.request +from dataclasses import dataclass +from datetime import datetime, timezone +from importlib import resources +from pathlib import Path +from typing import Any + +from upskill.ci import load_test_cases, plan_ci_suite +from upskill.config import Config +from upskill.evaluate import evaluate_skill +from upskill.executors.local_fast_agent import LocalFastAgentExecutor +from upskill.models import Skill, TestResult + + +JUDGE_SYSTEM_PROMPT = """You are grading whether an assistant response correctly applied a repository skill. + +Return strict JSON with this shape: +{"score": 0.0, "summary": "short explanation"} + +Rules: +- score must be a float between 0.0 and 1.0 +- summary must be short +- use the hard-verifier result as context, but do not simply restate it +- prioritize correctness, relevance, completeness, and staying within the skill boundary +""" + +SUMMARY_CASE_LIMIT = 2 +SUMMARY_REQUEST_LIMIT = 140 +SUMMARY_ISSUE_LIMIT = 220 +SUMMARY_OUTPUT_LIMIT = 180 + + +@dataclass(frozen=True) +class JudgeConfig: + provider: str + model: str + + +@dataclass +class AggregateMetrics: + case_success_rate: float + hard_score: float + assertions_passed: int + assertions_total: int + avg_tokens: float + avg_turns: float + judge_score: float | None + case_details: list[dict[str, Any]] + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run upskill smoke checks for changed skills.") + parser.add_argument( + "--manifest", + default=".upskill/evals.json", + help="Path to the upskill eval manifest.", + ) + parser.add_argument( + "--base-ref", + required=True, + help="Git ref or commit to compare against.", + ) + parser.add_argument( + "--eval-model", + default=None, + help="Model name to pass to upskill evaluation. Defaults to upskill.config.yaml.", + ) + parser.add_argument( + "--runs", + type=int, + default=1, + help="Number of repeated runs per scenario variant.", + ) + parser.add_argument( + "--artifacts-dir", + default=".upskill/artifacts", + help="Directory for raw evaluation artifacts.", + ) + parser.add_argument( + "--output", + default=".upskill/reports/upskill-ci-report.json", + help="Path for the machine-readable report.", + ) + parser.add_argument( + "--hard-regression-threshold", + type=float, + default=0.0, + help="Allowed hard-score drop before failing.", + ) + parser.add_argument( + "--judge-regression-threshold", + type=float, + default=0.1, + help="Allowed judge-score drop before failing when hard score is flat or worse.", + ) + parser.add_argument( + "--token-regression-threshold", + type=float, + default=0.2, + help="Allowed avg-token increase ratio before failing when hard score is flat or worse.", + ) + parser.add_argument( + "--persist", + action="store_true", + help="Append the current report to the configured Hugging Face dataset history.", + ) + parser.add_argument( + "--history-repo", + default=os.environ.get("UPSKILL_HISTORY_REPO", "hf-skills/skill-performance-history"), + help="HF dataset repo used for persisted history.", + ) + return parser.parse_args() + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _run_git(repo_root: Path, *args: str, text: bool = True) -> subprocess.CompletedProcess[Any]: + return subprocess.run( + ["git", *args], + cwd=repo_root, + check=False, + capture_output=True, + text=text, + ) + + +def _normalize_base_ref(base_ref: str) -> str: + if base_ref == "0000000000000000000000000000000000000000": + return "HEAD~1" + return base_ref + + +def _materialize_skill_from_git(repo_root: Path, ref: str, skill_path: str, destination: Path) -> Path: + listed = _run_git(repo_root, "ls-tree", "-r", "--name-only", ref, "--", skill_path) + if listed.returncode != 0: + message = listed.stderr.strip() or listed.stdout.strip() or "git ls-tree failed" + raise RuntimeError(f"Failed to inspect {skill_path} at {ref}: {message}") + + files = [line.strip() for line in listed.stdout.splitlines() if line.strip()] + if not files: + raise FileNotFoundError(f"No files found for {skill_path} at {ref}") + + skill_root = destination / skill_path + for relative_path in files: + blob = _run_git(repo_root, "show", f"{ref}:{relative_path}", text=False) + if blob.returncode != 0: + message = blob.stderr.decode("utf-8", errors="replace").strip() + raise RuntimeError(f"Failed to read {relative_path} at {ref}: {message}") + target = destination / relative_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_bytes(blob.stdout) + return skill_root + + +def _assertion_counts(test_result: TestResult) -> tuple[int, int]: + if test_result.validation_result is not None: + return ( + test_result.validation_result.assertions_passed, + test_result.validation_result.assertions_total, + ) + return (1 if test_result.success else 0, 1) + + +def _safe_average(total: float, count: int) -> float: + if count <= 0: + return 0.0 + return total / count + + +def _truncate_text(value: Any, limit: int) -> str | None: + if value is None: + return None + if isinstance(value, (list, tuple)): + text = "; ".join(str(item) for item in value if item) + else: + text = str(value) + text = " ".join(text.split()) + if not text: + return None + if len(text) <= limit: + return text + return f"{text[: limit - 3].rstrip()}..." + + +def _extract_json_object(text: str) -> dict[str, Any]: + stripped = text.strip() + if stripped.startswith("```"): + lines = stripped.splitlines() + if len(lines) >= 3: + stripped = "\n".join(lines[1:-1]).strip() + start = stripped.find("{") + end = stripped.rfind("}") + if start < 0 or end < start: + raise ValueError(f"Judge did not return JSON: {text}") + return json.loads(stripped[start : end + 1]) + + +def _http_json(url: str, *, headers: dict[str, str], payload: dict[str, Any]) -> dict[str, Any]: + request = urllib.request.Request( + url, + data=json.dumps(payload).encode("utf-8"), + headers={**headers, "Content-Type": "application/json"}, + method="POST", + ) + try: + with urllib.request.urlopen(request, timeout=90) as response: + return json.loads(response.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + body = _truncate_text(exc.read().decode("utf-8", errors="replace"), SUMMARY_ISSUE_LIMIT) + message = f"HTTP {exc.code} {exc.reason} for {url}" + if body: + message = f"{message}: {body}" + raise RuntimeError(message) from exc + except urllib.error.URLError as exc: + raise RuntimeError(f"request to {url} failed: {exc.reason}") from exc + + +def _judge_config() -> JudgeConfig: + provider = os.environ.get("UPSKILL_JUDGE_PROVIDER") + if provider is None: + if os.environ.get("ANTHROPIC_API_KEY"): + provider = "anthropic" + elif os.environ.get("OPENAI_API_KEY"): + provider = "openai" + + if provider == "anthropic": + return JudgeConfig( + provider="anthropic", + model=os.environ.get("UPSKILL_JUDGE_MODEL", "claude-haiku-4-5-20251001"), + ) + if provider == "openai": + return JudgeConfig( + provider="openai", + model=os.environ.get("UPSKILL_JUDGE_MODEL", "gpt-4.1-mini"), + ) + raise RuntimeError( + "No judge provider configured. Set UPSKILL_JUDGE_PROVIDER or provide ANTHROPIC_API_KEY/OPENAI_API_KEY." + ) + + +def _normalize_anthropic_base_url(base_url: str) -> str: + trimmed = base_url.rstrip("/") + if trimmed.endswith("/v1"): + return trimmed[: -len("/v1")] + return trimmed + + +def _call_anthropic(prompt: str, config: JudgeConfig) -> str: + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + raise RuntimeError("ANTHROPIC_API_KEY is required for anthropic judge runs.") + base = _normalize_anthropic_base_url( + os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") + ) + payload = { + "model": config.model, + "max_tokens": 300, + "temperature": 0, + "system": JUDGE_SYSTEM_PROMPT, + "messages": [{"role": "user", "content": prompt}], + } + response = _http_json( + f"{base}/v1/messages", + headers={ + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + }, + payload=payload, + ) + blocks = response.get("content", []) + return "\n".join(block.get("text", "") for block in blocks if block.get("type") == "text").strip() + + +def _normalize_openai_base_url(base_url: str) -> str: + trimmed = base_url.rstrip("/") + if trimmed.endswith("/v1"): + return trimmed + return f"{trimmed}/v1" + + +def _call_openai(prompt: str, config: JudgeConfig) -> str: + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + raise RuntimeError("OPENAI_API_KEY is required for openai judge runs.") + base = _normalize_openai_base_url(os.environ.get("OPENAI_API_BASE", "https://api.openai.com")) + payload = { + "model": config.model, + "temperature": 0, + "response_format": {"type": "json_object"}, + "messages": [ + {"role": "system", "content": JUDGE_SYSTEM_PROMPT}, + {"role": "user", "content": prompt}, + ], + } + response = _http_json( + f"{base}/chat/completions", + headers={"Authorization": f"Bearer {api_key}"}, + payload=payload, + ) + choices = response.get("choices", []) + if not choices: + raise RuntimeError("OpenAI judge returned no choices.") + message = choices[0].get("message", {}) + content = message.get("content", "") + if isinstance(content, list): + parts = [item.get("text", "") for item in content if isinstance(item, dict)] + return "\n".join(parts).strip() + return str(content).strip() + + +def _judge_case( + *, + judge: JudgeConfig, + scenario_id: str, + skill_path: str, + criteria: list[str], + test_result: TestResult, +) -> tuple[float | None, str | None]: + verifier_status = "passed" if test_result.success else "failed" + verifier_details = [] + if test_result.validation_result is not None: + if test_result.validation_result.error_message: + verifier_details.append(test_result.validation_result.error_message) + verifier_details.extend(test_result.validation_result.details) + + prompt = "\n".join( + [ + f"Scenario: {scenario_id}", + f"Skill path: {skill_path}", + "Criteria:", + *[f"- {criterion}" for criterion in criteria], + "", + "User request:", + test_result.test_case.input, + "", + "Assistant response:", + test_result.output or "(no response)", + "", + f"Hard verifier status: {verifier_status}", + f"Verifier details: {verifier_details or ['none']}", + ] + ) + + try: + if judge.provider == "anthropic": + raw = _call_anthropic(prompt, judge) + else: + raw = _call_openai(prompt, judge) + payload = _extract_json_object(raw) + score = float(payload["score"]) + score = max(0.0, min(1.0, score)) + summary = str(payload.get("summary", "")).strip() or None + return score, summary + except (KeyError, TypeError, ValueError, urllib.error.URLError, RuntimeError) as exc: + return None, f"judge error: {exc}" + + +def _aggregate_metrics( + *, + scenario_id: str, + skill_path: str, + criteria: list[str], + test_results: list[TestResult], + judge: JudgeConfig | None, +) -> AggregateMetrics: + total_cases = len(test_results) + passed_cases = 0 + assertions_passed = 0 + assertions_total = 0 + total_tokens = 0 + total_turns = 0 + judge_scores: list[float] = [] + details: list[dict[str, Any]] = [] + + for index, test_result in enumerate(test_results, start=1): + passed_cases += int(test_result.success) + case_assertions_passed, case_assertions_total = _assertion_counts(test_result) + assertions_passed += case_assertions_passed + assertions_total += case_assertions_total + total_tokens += test_result.stats.total_tokens or test_result.tokens_used + total_turns += test_result.stats.turns or test_result.turns + + judge_score = None + judge_summary = None + if judge is not None: + judge_score, judge_summary = _judge_case( + judge=judge, + scenario_id=scenario_id, + skill_path=skill_path, + criteria=criteria, + test_result=test_result, + ) + if judge_score is not None: + judge_scores.append(judge_score) + + details.append( + { + "test_index": index, + "success": test_result.success, + "assertions_passed": case_assertions_passed, + "assertions_total": case_assertions_total, + "tokens": test_result.stats.total_tokens or test_result.tokens_used, + "turns": test_result.stats.turns or test_result.turns, + "input": getattr(test_result.test_case, "input", None), + "output": test_result.output, + "error": test_result.error, + "validation_error": ( + test_result.validation_result.error_message + if test_result.validation_result is not None + else None + ), + "validation_details": ( + list(test_result.validation_result.details) + if test_result.validation_result is not None + else [] + ), + "judge_score": judge_score, + "judge_summary": judge_summary, + } + ) + + return AggregateMetrics( + case_success_rate=_safe_average(passed_cases, total_cases), + hard_score=_safe_average(assertions_passed, assertions_total), + assertions_passed=assertions_passed, + assertions_total=assertions_total, + avg_tokens=_safe_average(total_tokens, total_cases), + avg_turns=_safe_average(total_turns, total_cases), + judge_score=_safe_average(sum(judge_scores), len(judge_scores)) if judge_scores else None, + case_details=details, + ) + + +async def _evaluate_variant( + *, + scenario_id: str, + skill_path: str, + skill: Skill, + tests_path: Path, + cards_path: Path, + config: Config, + model: str, + runs: int, + artifact_root: Path, + judge: JudgeConfig | None, + criteria: list[str], +) -> AggregateMetrics: + test_cases = load_test_cases(tests_path) + executor = LocalFastAgentExecutor() + all_test_results: list[TestResult] = [] + + for run_index in range(1, runs + 1): + eval_results = await evaluate_skill( + skill, + test_cases=test_cases, + executor=executor, + model=model, + fastagent_config_path=config.effective_fastagent_config, + cards_source_dir=cards_path, + artifact_root=artifact_root / f"run_{run_index}", + run_baseline=False, + max_parallel=config.max_parallel, + operation="eval", + ) + all_test_results.extend(eval_results.with_skill_results) + + return _aggregate_metrics( + scenario_id=scenario_id, + skill_path=skill_path, + criteria=criteria, + test_results=all_test_results, + judge=judge, + ) + + +def _compare_variants( + *, + current: AggregateMetrics, + baseline: AggregateMetrics, + hard_regression_threshold: float, + judge_regression_threshold: float, + token_regression_threshold: float, +) -> tuple[bool, list[str], dict[str, float]]: + reasons: list[str] = [] + deltas = { + "hard_score": current.hard_score - baseline.hard_score, + "judge_score": ( + current.judge_score - baseline.judge_score + if current.judge_score is not None and baseline.judge_score is not None + else 0.0 + ), + "avg_tokens": current.avg_tokens - baseline.avg_tokens, + } + + if current.hard_score + hard_regression_threshold < baseline.hard_score: + reasons.append( + f"hard score dropped from {baseline.hard_score:.3f} to {current.hard_score:.3f}" + ) + + hard_is_flat_or_worse = current.hard_score <= baseline.hard_score + 1e-9 + if ( + hard_is_flat_or_worse + and current.judge_score is not None + and baseline.judge_score is not None + and current.judge_score + judge_regression_threshold < baseline.judge_score + ): + reasons.append( + f"judge score dropped from {baseline.judge_score:.3f} to {current.judge_score:.3f}" + ) + + if hard_is_flat_or_worse and baseline.avg_tokens > 0: + token_increase_ratio = (current.avg_tokens - baseline.avg_tokens) / baseline.avg_tokens + if token_increase_ratio > token_regression_threshold: + reasons.append( + "average tokens increased from " + f"{baseline.avg_tokens:.1f} to {current.avg_tokens:.1f}" + ) + + return (bool(reasons), reasons, deltas) + + +def _case_issues(case_detail: dict[str, Any]) -> list[str]: + issues: list[str] = [] + error = _truncate_text(case_detail.get("error"), SUMMARY_ISSUE_LIMIT) + if error: + issues.append(f"execution error: {error}") + + validation_error = _truncate_text(case_detail.get("validation_error"), SUMMARY_ISSUE_LIMIT) + if validation_error: + issues.append(f"verifier error: {validation_error}") + + validation_details = case_detail.get("validation_details") or [] + detail = _truncate_text(validation_details, SUMMARY_ISSUE_LIMIT) + if detail: + issues.append(f"verifier detail: {detail}") + + if not case_detail.get("success"): + assertions_passed = case_detail.get("assertions_passed", 0) + assertions_total = case_detail.get("assertions_total", 0) + issues.append(f"hard assertions: {assertions_passed}/{assertions_total}") + + judge_summary = _truncate_text(case_detail.get("judge_summary"), SUMMARY_ISSUE_LIMIT) + if judge_summary: + prefix = "judge issue" if case_detail.get("judge_score") is None else "judge note" + issues.append(f"{prefix}: {judge_summary}") + + return issues + + +def _issue_examples(case_details: list[dict[str, Any]]) -> list[str]: + examples: list[str] = [] + for case_detail in case_details: + issues = _case_issues(case_detail) + if not issues: + continue + + request = _truncate_text(case_detail.get("input"), SUMMARY_REQUEST_LIMIT) or "(no request)" + line = ( + f"test {case_detail['test_index']}: request `{request}` | " + f"issues: {'; '.join(issues)}" + ) + response = _truncate_text(case_detail.get("output"), SUMMARY_OUTPUT_LIMIT) + if response: + line = f"{line} | response `{response}`" + examples.append(line) + if len(examples) >= SUMMARY_CASE_LIMIT: + break + return examples + + +def _judge_cell(metrics: dict[str, Any]) -> str: + judge_score = metrics.get("judge_score") + if judge_score is not None: + return f"{judge_score:.3f}" + case_details = metrics.get("case_details") or [] + if any(case.get("judge_summary") for case in case_details): + return "error" + return "n/a" + + +def _render_markdown(report: dict[str, Any]) -> str: + lines = [ + "# Upskill Skill Performance", + "", + f"- Base ref: `{report['base_ref']}`", + f"- Eval model: `{report['eval_model']}`", + f"- Runs per variant: `{report['runs']}`", + ] + if report.get("judge") is not None: + lines.append( + f"- Judge: `{report['judge']['provider']}` / `{report['judge']['model']}`" + ) + if report["selected_scenarios"]: + lines.append(f"- Selected scenarios: `{', '.join(report['selected_scenarios'])}`") + else: + lines.extend(["", "No scenarios were selected."]) + return "\n".join(lines) + + lines.extend( + [ + "", + "| Scenario | Skill | Current Hard | Main Hard | Judge | Main Judge | Avg Tokens | Main Tokens | Status |", + "| --- | --- | --- | --- | --- | --- | --- | --- | --- |", + ] + ) + for scenario in report["scenarios"]: + current = scenario["current"] + baseline = scenario["baseline"] + current_judge = _judge_cell(current) + baseline_hard = "n/a" + baseline_tokens = "n/a" + baseline_judge = "n/a" + if baseline is not None: + baseline_hard = f"{baseline['hard_score']:.3f}" + baseline_tokens = f"{baseline['avg_tokens']:.1f}" + baseline_judge = _judge_cell(baseline) + status = "FAIL" if scenario["regression"] else ("PASS" if baseline is not None else "NEW") + lines.append( + "| " + f"{scenario['scenario_id']} | " + f"{scenario['skill_path']} | " + f"{current['hard_score']:.3f} | " + f"{baseline_hard} | " + f"{current_judge} | " + f"{baseline_judge} | " + f"{current['avg_tokens']:.1f} | " + f"{baseline_tokens} | " + f"{status} |" + ) + if scenario["notes"]: + lines.append("") + lines.append(f"Notes for `{scenario['scenario_id']}`:") + for note in scenario["notes"]: + lines.append(f"- {note}") + if scenario["reasons"]: + lines.append("") + lines.append(f"Reasons for `{scenario['scenario_id']}`:") + for reason in scenario["reasons"]: + lines.append(f"- {reason}") + current_examples = _issue_examples(current["case_details"]) + if current_examples: + lines.append("") + lines.append(f"Issue examples for `{scenario['scenario_id']}` current run:") + for example in current_examples: + lines.append(f"- {example}") + if baseline is not None: + baseline_examples = _issue_examples(baseline["case_details"]) + if baseline_examples: + lines.append("") + lines.append(f"Issue examples for `{scenario['scenario_id']}` main baseline:") + for example in baseline_examples: + lines.append(f"- {example}") + return "\n".join(lines) + + +def _write_step_summary(markdown: str) -> None: + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if not summary_path: + return + with open(summary_path, "a", encoding="utf-8") as handle: + handle.write(markdown) + handle.write("\n") + + +def _persist_history(report: dict[str, Any], repo_id: str) -> None: + from huggingface_hub import HfApi, hf_hub_download + from huggingface_hub.utils import HfHubHTTPError + + token = os.environ.get("HF_TOKEN") + if not token: + raise RuntimeError("HF_TOKEN is required to persist history.") + + api = HfApi(token=token) + api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True) + + history_path = "data/history.jsonl" + metadata_path = "data/metadata.json" + existing_content = "" + + try: + downloaded = hf_hub_download( + repo_id=repo_id, + repo_type="dataset", + filename=history_path, + token=token, + ) + except HfHubHTTPError as exc: + if "404" not in str(exc): + raise + else: + existing_content = Path(downloaded).read_text(encoding="utf-8") + + history_rows = report["history_rows"] + appended = "\n".join(json.dumps(row, sort_keys=True) for row in history_rows) + new_content = existing_content.strip() + if new_content and appended: + new_content = f"{new_content}\n{appended}" + elif appended: + new_content = appended + + metadata = { + "updated_at": _now_iso(), + "total_rows": len([line for line in new_content.splitlines() if line.strip()]), + "latest_sha": report["commit_sha"], + "latest_scenarios": report["selected_scenarios"], + } + + api.upload_file( + path_or_fileobj=new_content.encode("utf-8"), + path_in_repo=history_path, + repo_id=repo_id, + repo_type="dataset", + commit_message=f"Update skill performance history for {report['commit_sha'][:8]}", + ) + api.upload_file( + path_or_fileobj=json.dumps(metadata, indent=2).encode("utf-8"), + path_in_repo=metadata_path, + repo_id=repo_id, + repo_type="dataset", + commit_message=f"Update performance metadata for {report['commit_sha'][:8]}", + ) + api.upload_file( + path_or_fileobj=json.dumps(report, indent=2).encode("utf-8"), + path_in_repo=f"reports/{report['commit_sha']}.json", + repo_id=repo_id, + repo_type="dataset", + commit_message=f"Upload detailed performance report for {report['commit_sha'][:8]}", + ) + + +async def _run_suite(args: argparse.Namespace) -> dict[str, Any]: + repo_root = Path.cwd() + base_ref = _normalize_base_ref(args.base_ref) + report_plan, selected_scenarios = plan_ci_suite( + Path(args.manifest), + scope="changed", + base_ref=base_ref, + working_dir=repo_root, + ) + + config = Config.load() + eval_model = args.eval_model or config.effective_eval_model + report: dict[str, Any] = { + "generated_at": _now_iso(), + "repo_root": str(repo_root), + "base_ref": base_ref, + "eval_model": eval_model, + "runs": args.runs, + "changed_files": report_plan.changed_files, + "changed_skills": report_plan.changed_skills, + "selected_scenarios": report_plan.selected_scenarios, + "scenarios": [], + "history_rows": [], + "commit_sha": os.environ.get("GITHUB_SHA") + or _run_git(repo_root, "rev-parse", "HEAD").stdout.strip(), + } + + if not selected_scenarios: + return report + + artifact_root = Path(args.artifacts_dir) + artifact_root.mkdir(parents=True, exist_ok=True) + + judge_config = None + if any((scenario.judge and scenario.judge.enabled) for scenario in selected_scenarios): + judge_config = _judge_config() + report["judge"] = ( + {"provider": judge_config.provider, "model": judge_config.model} + if judge_config is not None + else None + ) + + cards_resource = resources.files("upskill").joinpath("agent_cards") + with resources.as_file(cards_resource) as cards_path, tempfile.TemporaryDirectory( + prefix="upskill-base-" + ) as temp_dir: + temp_root = Path(temp_dir) + for scenario in selected_scenarios: + if len(scenario.skills) != 1: + raise RuntimeError( + f"Scenario {scenario.id} must reference exactly one skill for this workflow." + ) + + skill_path = scenario.skills[0] + current_skill_path = repo_root / skill_path + tests_path = repo_root / scenario.tests + if not current_skill_path.exists(): + raise FileNotFoundError(f"Current skill path not found: {current_skill_path}") + if not tests_path.exists(): + raise FileNotFoundError(f"Tests path not found: {tests_path}") + + current_skill = Skill.load(current_skill_path) + criteria = list((scenario.judge.criteria if scenario.judge else None) or []) + active_judge = judge_config if scenario.judge and scenario.judge.enabled else None + notes: list[str] = [] + + current_metrics = await _evaluate_variant( + scenario_id=scenario.id, + skill_path=skill_path, + skill=current_skill, + tests_path=tests_path, + cards_path=cards_path, + config=config, + model=eval_model, + runs=args.runs, + artifact_root=artifact_root / scenario.id / "current", + judge=active_judge, + criteria=criteria, + ) + baseline_metrics = None + regression = False + reasons: list[str] = [] + deltas: dict[str, float] = {} + try: + baseline_skill_path = _materialize_skill_from_git( + repo_root, + base_ref, + skill_path, + temp_root, + ) + except FileNotFoundError: + notes.append(f"No baseline skill found at `{base_ref}`.") + else: + baseline_skill = Skill.load(baseline_skill_path) + baseline_metrics = await _evaluate_variant( + scenario_id=scenario.id, + skill_path=skill_path, + skill=baseline_skill, + tests_path=tests_path, + cards_path=cards_path, + config=config, + model=eval_model, + runs=args.runs, + artifact_root=artifact_root / scenario.id / "baseline", + judge=active_judge, + criteria=criteria, + ) + regression, reasons, deltas = _compare_variants( + current=current_metrics, + baseline=baseline_metrics, + hard_regression_threshold=args.hard_regression_threshold, + judge_regression_threshold=args.judge_regression_threshold, + token_regression_threshold=args.token_regression_threshold, + ) + + scenario_report = { + "scenario_id": scenario.id, + "skill_path": skill_path, + "tests_path": scenario.tests, + "criteria": criteria, + "notes": notes, + "regression": regression, + "reasons": reasons, + "deltas": deltas, + "current": { + "case_success_rate": current_metrics.case_success_rate, + "hard_score": current_metrics.hard_score, + "assertions_passed": current_metrics.assertions_passed, + "assertions_total": current_metrics.assertions_total, + "avg_tokens": current_metrics.avg_tokens, + "avg_turns": current_metrics.avg_turns, + "judge_score": current_metrics.judge_score, + "case_details": current_metrics.case_details, + }, + "baseline": ( + { + "case_success_rate": baseline_metrics.case_success_rate, + "hard_score": baseline_metrics.hard_score, + "assertions_passed": baseline_metrics.assertions_passed, + "assertions_total": baseline_metrics.assertions_total, + "avg_tokens": baseline_metrics.avg_tokens, + "avg_turns": baseline_metrics.avg_turns, + "judge_score": baseline_metrics.judge_score, + "case_details": baseline_metrics.case_details, + } + if baseline_metrics is not None + else None + ), + } + report["scenarios"].append(scenario_report) + report["history_rows"].append( + { + "generated_at": report["generated_at"], + "commit_sha": report["commit_sha"], + "base_ref": base_ref, + "scenario_id": scenario.id, + "skill_path": skill_path, + "eval_model": eval_model, + "runs": args.runs, + "hard_score": current_metrics.hard_score, + "judge_score": current_metrics.judge_score, + "avg_tokens": current_metrics.avg_tokens, + "avg_turns": current_metrics.avg_turns, + "baseline_hard_score": ( + baseline_metrics.hard_score if baseline_metrics is not None else None + ), + "baseline_judge_score": ( + baseline_metrics.judge_score if baseline_metrics is not None else None + ), + "baseline_avg_tokens": ( + baseline_metrics.avg_tokens if baseline_metrics is not None else None + ), + "regression": regression, + "notes": notes, + "reasons": reasons, + } + ) + + return report + + +def main() -> int: + args = _parse_args() + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + try: + report = asyncio.run(_run_suite(args)) + except Exception as exc: + print(f"upskill CI failed: {exc}", file=sys.stderr) + return 2 + + output_path.write_text(json.dumps(report, indent=2), encoding="utf-8") + markdown = _render_markdown(report) + _write_step_summary(markdown) + print(markdown) + + if args.persist: + try: + _persist_history(report, args.history_repo) + except Exception as exc: + print(f"failed to persist history: {exc}", file=sys.stderr) + return 3 + + regressions = [scenario for scenario in report["scenarios"] if scenario["regression"]] + return 1 if regressions else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/upskill.config.yaml b/upskill.config.yaml new file mode 100644 index 00000000..b5c82493 --- /dev/null +++ b/upskill.config.yaml @@ -0,0 +1,7 @@ +skill_generation_model: kimi +eval_model: opus +executor: local +num_runs: 1 +max_parallel: 1 +runs_dir: ./.upskill/runs +fastagent_config: ./fastagent.config.yaml