90 changes: 90 additions & 0 deletions .github/workflows/upskill-internal-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
name: Validate Skill Performance With Upskill

on:
  pull_request:
    paths:
      - ".github/workflows/upskill-internal-pr.yml"
      - ".upskill/**"
      - "evals/**"
      - "upskill.config.yaml"
      - "fastagent.config.yaml"
      - "scripts/upskill_ci.py"
      - "skills/**/SKILL.md"
      - "skills/**/references/**"
      - "skills/**/scripts/**"
  push:
    branches:
      - main
    paths:
      - ".github/workflows/upskill-internal-pr.yml"
      - ".upskill/**"
      - "evals/**"
      - "upskill.config.yaml"
      - "fastagent.config.yaml"
      - "scripts/upskill_ci.py"
      - "skills/**/SKILL.md"
      - "skills/**/references/**"
      - "skills/**/scripts/**"

jobs:
  upskill:
    if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
    runs-on: ubuntu-latest
    permissions:
      contents: read
    env:
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      UPSKILL_JUDGE_MODEL: "claude-haiku-4-5-20251001"
      UPSKILL_CI_RUNS: "1"
    steps:
      - name: Checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

      - name: Set up uv
        uses: astral-sh/setup-uv@v7

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.13.5"

      - name: Resolve comparison ref
        run: |
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            echo "UPSKILL_BASE_REF=${{ github.event.pull_request.base.sha }}" >> "$GITHUB_ENV"
            echo "UPSKILL_PERSIST=false" >> "$GITHUB_ENV"
          else
            echo "UPSKILL_BASE_REF=${{ github.event.before }}" >> "$GITHUB_ENV"
            echo "UPSKILL_PERSIST=true" >> "$GITHUB_ENV"
          fi

      - name: Run upskill comparison
        run: |
          set -euo pipefail
          cmd=(
            uv run
            --python 3.13.5
            --with git+https://github.com/huggingface/upskill@main
            --with huggingface_hub
            python
            scripts/upskill_ci.py
            --base-ref "$UPSKILL_BASE_REF"
            --runs "$UPSKILL_CI_RUNS"
          )
          if [ "$UPSKILL_PERSIST" = "true" ]; then
            cmd+=(--persist)
          fi
          "${cmd[@]}"

      - name: Upload upskill artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: upskill-artifacts-${{ github.run_id }}
          path: |
            .upskill/artifacts
            .upskill/reports
          if-no-files-found: ignore
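The "Resolve comparison ref" step above picks a different diff base and persistence mode per event. A minimal Python sketch of the same branch logic, runnable outside Actions (the function and argument names are my own, standing in for the `github.*` context expressions):

```python
def resolve_comparison_ref(event_name: str, pr_base_sha: str, push_before_sha: str) -> dict:
    """Mirror the workflow's 'Resolve comparison ref' step: pull requests diff
    against the PR base commit and never persist history; pushes to main diff
    against the pre-push tip and do persist."""
    if event_name == "pull_request":
        return {"UPSKILL_BASE_REF": pr_base_sha, "UPSKILL_PERSIST": "false"}
    return {"UPSKILL_BASE_REF": push_before_sha, "UPSKILL_PERSIST": "true"}

# PR event: compare against the PR base, do not persist.
print(resolve_comparison_ref("pull_request", "abc1234", "def5678"))
# Push event: compare against github.event.before, persist history.
print(resolve_comparison_ref("push", "abc1234", "def5678"))
```

Keeping this decision in one step means the later "Run upskill comparison" step only consumes two environment variables and stays event-agnostic.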
5 changes: 4 additions & 1 deletion .gitignore
@@ -207,6 +207,9 @@ marimo/_lsp/
__marimo__/
.claude
.fast-agent/
.upskill/artifacts/
.upskill/reports/
.upskill/runs/

# MacOS
.DS_Store
20 changes: 20 additions & 0 deletions .upskill/README.md
@@ -0,0 +1,20 @@
# Upskill CI

This directory contains the smoke-eval manifest and supporting files for the
skill-performance CI workflow.

## What runs

- Internal pull requests only: the workflow skips fork PRs entirely.
- Pushes to `main`: the workflow reruns changed scenarios and persists a
normalized history record to a Hugging Face dataset.

## Required secrets

- `ANTHROPIC_API_KEY` for local `upskill` evaluation and LLM-as-a-judge scoring.
- `HF_TOKEN` for history uploads on `main` pushes.

## Files

- `evals.json` maps skills to smoke scenarios.
- `../evals/**` contains the fixed test cases used by CI.
147 changes: 147 additions & 0 deletions .upskill/evals.json
@@ -0,0 +1,147 @@
{
  "scenarios": [
    {
      "id": "hf-cli-smoke",
      "skills": ["skills/hf-cli"],
      "tests": "evals/hf-cli/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Uses the modern hf CLI, not deprecated huggingface-cli commands.",
          "Provides a concrete command that is directly executable.",
          "Stays focused on the user's request."
        ]
      }
    },
    {
      "id": "community-evals-smoke",
      "skills": ["skills/huggingface-community-evals"],
      "tests": "evals/huggingface-community-evals/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Chooses the correct local evaluation script for the task.",
          "Keeps the workflow local instead of drifting into HF Jobs.",
          "Provides a runnable command shape."
        ]
      }
    },
    {
      "id": "datasets-smoke",
      "skills": ["skills/huggingface-datasets"],
      "tests": "evals/huggingface-datasets/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Uses the Dataset Viewer API endpoints that match the requested workflow.",
          "Covers split discovery, preview, and pagination.",
          "Keeps the answer concrete and operational."
        ]
      }
    },
    {
      "id": "gradio-smoke",
      "skills": ["skills/huggingface-gradio"],
      "tests": "evals/huggingface-gradio/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Shows a minimal valid Gradio solution.",
          "Uses the API shape that matches the user request.",
          "Keeps the code concise and directly usable."
        ]
      }
    },
    {
      "id": "jobs-smoke",
      "skills": ["skills/huggingface-jobs"],
      "tests": "evals/huggingface-jobs/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Uses the hf_jobs MCP tool as the preferred submission path.",
          "Mentions HF_TOKEN handling for Hub access.",
          "Gives a concrete submission pattern rather than generic advice."
        ]
      }
    },
    {
      "id": "llm-trainer-smoke",
      "skills": ["skills/huggingface-llm-trainer"],
      "tests": "evals/huggingface-llm-trainer/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Uses the training workflow described by the skill.",
          "Keeps Trackio or equivalent monitoring enabled.",
          "Points to the right script or job pattern for SFT."
        ]
      }
    },
    {
      "id": "paper-publisher-smoke",
      "skills": ["skills/huggingface-paper-publisher"],
      "tests": "evals/huggingface-paper-publisher/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Uses the paper_manager script correctly.",
          "Provides the right subcommand and required flag.",
          "Keeps the answer concrete and executable."
        ]
      }
    },
    {
      "id": "papers-smoke",
      "skills": ["skills/huggingface-papers"],
      "tests": "evals/huggingface-papers/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Uses the HF paper metadata and markdown endpoints correctly.",
          "Handles the provided paper id directly.",
          "Keeps the answer narrow and API-focused."
        ]
      }
    },
    {
      "id": "trackio-smoke",
      "skills": ["skills/huggingface-trackio"],
      "tests": "evals/huggingface-trackio/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Uses the Trackio CLI flow that matches the retrieval task.",
          "Returns JSON-oriented automation guidance.",
          "Keeps the answer concrete and specific."
        ]
      }
    },
    {
      "id": "vision-trainer-smoke",
      "skills": ["skills/huggingface-vision-trainer"],
      "tests": "evals/huggingface-vision-trainer/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Emphasizes dataset validation before expensive training.",
          "Uses the helper script referenced by the skill.",
          "Keeps the answer focused on the requested preflight step."
        ]
      }
    },
    {
      "id": "transformers-js-smoke",
      "skills": ["skills/transformers-js"],
      "tests": "evals/transformers-js/smoke.json",
      "judge": {
        "enabled": true,
        "criteria": [
          "Shows the minimal correct Transformers.js setup for Node.js.",
          "Includes proper pipeline usage and cleanup.",
          "Keeps the answer directly usable."
        ]
      }
    }
  ]
}
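A manifest of this shape is cheap to sanity-check before CI ever spends an API call. A minimal validation sketch (the schema below is inferred from this file, not from an official `upskill` spec):

```python
def validate_manifest(manifest: dict) -> list[str]:
    """Return a list of problems found in an evals.json-style manifest."""
    problems = []
    for i, scenario in enumerate(manifest.get("scenarios", [])):
        where = f"scenarios[{i}]"
        # Every scenario in this repo carries these four keys.
        for key in ("id", "skills", "tests", "judge"):
            if key not in scenario:
                problems.append(f"{where}: missing '{key}'")
        judge = scenario.get("judge", {})
        # An enabled judge without criteria would score nothing.
        if judge.get("enabled") and not judge.get("criteria"):
            problems.append(f"{where}: judge enabled but no criteria")
    return problems

manifest = {
    "scenarios": [
        {"id": "hf-cli-smoke", "skills": ["skills/hf-cli"],
         "tests": "evals/hf-cli/smoke.json",
         "judge": {"enabled": True, "criteria": ["Uses the modern hf CLI."]}},
        {"id": "broken", "skills": [], "judge": {"enabled": True}},
    ]
}
print(validate_manifest(manifest))
# → ["scenarios[1]: missing 'tests'", "scenarios[1]: judge enabled but no criteria"]
```

Running a check like this in a pre-commit hook would catch a malformed scenario before the workflow burns a judge-model run on it.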
12 changes: 12 additions & 0 deletions evals/hf-cli/smoke.json
@@ -0,0 +1,12 @@
{
"cases": [
{
"input": "A user wants the modern Hugging Face CLI command to download the model bert-base-uncased into ./models and explicitly does not want deprecated CLI names. Give the exact command.",
"verifiers": [
{"type": "contains", "values": ["hf download"]},
{"type": "contains", "values": ["bert-base-uncased"]},
{"type": "contains", "values": ["--local-dir ./models"]}
]
}
]
}
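The `contains` verifiers above reduce to substring checks over the model's answer. A hypothetical sketch of how such a verifier could score a response (this mirrors the JSON shape in these files, not upskill's actual implementation):

```python
def run_contains_verifiers(answer: str, verifiers: list[dict]) -> bool:
    """Pass only if every 'contains' verifier finds all of its values in the answer."""
    for v in verifiers:
        if v["type"] == "contains":
            if not all(value in answer for value in v["values"]):
                return False
    return True

# The three verifiers from the hf-cli smoke case above.
verifiers = [
    {"type": "contains", "values": ["hf download"]},
    {"type": "contains", "values": ["bert-base-uncased"]},
    {"type": "contains", "values": ["--local-dir ./models"]},
]

good = "hf download bert-base-uncased --local-dir ./models"
bad = "huggingface-cli download bert-base-uncased"  # deprecated CLI, no --local-dir
print(run_contains_verifiers(good, verifiers))  # True
print(run_contains_verifiers(bad, verifiers))   # False
```

Substring checks keep the smoke tests deterministic; the judge criteria in `evals.json` handle the fuzzier qualities like staying on topic.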
13 changes: 13 additions & 0 deletions evals/huggingface-community-evals/smoke.json
@@ -0,0 +1,13 @@
{
"cases": [
{
"input": "I want a quick local inspect-ai smoke test against a Hub model using inference providers, not Hugging Face Jobs. Give the command.",
"verifiers": [
{"type": "contains", "values": ["uv run scripts/inspect_eval_uv.py"]},
{"type": "contains", "values": ["--model"]},
{"type": "contains", "values": ["--task"]},
{"type": "contains", "values": ["--limit"]}
]
}
]
}
12 changes: 12 additions & 0 deletions evals/huggingface-datasets/smoke.json
@@ -0,0 +1,12 @@
{
"cases": [
{
"input": "I need to inspect a dataset with the Dataset Viewer API: resolve splits, preview rows, then paginate more rows. Describe the exact endpoints I should call.",
"verifiers": [
{"type": "contains", "values": ["/splits?dataset="]},
{"type": "contains", "values": ["/first-rows?dataset="]},
{"type": "contains", "values": ["/rows?dataset="]}
]
}
]
}
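The three endpoint fragments the verifiers above expect correspond to the public Dataset Viewer API served from `datasets-server.huggingface.co`. A small sketch that builds those request URLs without touching the network (the helper names are my own):

```python
from urllib.parse import urlencode

BASE = "https://datasets-server.huggingface.co"

def splits_url(dataset: str) -> str:
    # Step 1: discover configs and splits for a dataset.
    return f"{BASE}/splits?{urlencode({'dataset': dataset})}"

def first_rows_url(dataset: str, config: str, split: str) -> str:
    # Step 2: preview the first rows of a split.
    return f"{BASE}/first-rows?{urlencode({'dataset': dataset, 'config': config, 'split': split})}"

def rows_url(dataset: str, config: str, split: str, offset: int = 0, length: int = 100) -> str:
    # Step 3: paginate further rows with offset/length.
    return f"{BASE}/rows?{urlencode({'dataset': dataset, 'config': config, 'split': split, 'offset': offset, 'length': length})}"

print(splits_url("squad"))
# → https://datasets-server.huggingface.co/splits?dataset=squad
```

Any answer that walks through these three helpers in order satisfies the split-discovery, preview, and pagination verifiers.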
12 changes: 12 additions & 0 deletions evals/huggingface-gradio/smoke.json
@@ -0,0 +1,12 @@
{
"cases": [
{
"input": "Show a minimal Gradio chatbot app in Python.",
"verifiers": [
{"type": "contains", "values": ["import gradio as gr"]},
{"type": "contains", "values": ["gr.ChatInterface"]},
{"type": "contains", "values": [".launch()"]}
]
}
]
}
12 changes: 12 additions & 0 deletions evals/huggingface-jobs/smoke.json
@@ -0,0 +1,12 @@
{
"cases": [
{
"input": "Run a short Python workload on Hugging Face Jobs and make sure HF_TOKEN is available to the job. Show the preferred submission pattern.",
"verifiers": [
{"type": "contains", "values": ["hf_jobs"]},
{"type": "contains", "values": ["HF_TOKEN"]},
{"type": "contains", "values": ["script"]}
]
}
]
}
12 changes: 12 additions & 0 deletions evals/huggingface-llm-trainer/smoke.json
@@ -0,0 +1,12 @@
{
"cases": [
{
"input": "Start an SFT training job on Hugging Face Jobs and keep real-time monitoring enabled. What pattern should you use?",
"verifiers": [
{"type": "contains", "values": ["hf_jobs"]},
{"type": "contains", "values": ["Trackio"]},
{"type": "contains", "values": ["scripts/train_sft_example.py"]}
]
}
]
}
12 changes: 12 additions & 0 deletions evals/huggingface-paper-publisher/smoke.json
@@ -0,0 +1,12 @@
{
"cases": [
{
"input": "I want to index arXiv paper 2301.12345 on the Hub. Give the exact command.",
"verifiers": [
{"type": "contains", "values": ["uv run scripts/paper_manager.py index"]},
{"type": "contains", "values": ["--arxiv-id"]},
{"type": "contains", "values": ["2301.12345"]}
]
}
]
}
11 changes: 11 additions & 0 deletions evals/huggingface-papers/smoke.json
@@ -0,0 +1,11 @@
{
"cases": [
{
"input": "Given arXiv ID 2602.08025, show the Hugging Face API call for structured metadata and the markdown fetch path.",
"verifiers": [
{"type": "contains", "values": ["/api/papers/2602.08025"]},
{"type": "contains", "values": ["/papers/2602.08025.md"]}
]
}
]
}
11 changes: 11 additions & 0 deletions evals/huggingface-trackio/smoke.json
@@ -0,0 +1,11 @@
{
"cases": [
{
"input": "How do I retrieve Trackio alerts as JSON for automation?",
"verifiers": [
{"type": "contains", "values": ["trackio list alerts"]},
{"type": "contains", "values": ["--json"]}
]
}
]
}