name: Full Evals

on:
  workflow_dispatch:
    inputs:
      enforce_kpi_release_gate:
        description: "Fail workflow if contract KPI publication gate is not met"
        required: false
        type: boolean
        default: false
  schedule:
    - cron: "0 2 * * *"
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
    paths:
      - "SKILL.md"
      - "**/SKILL.md"
      - "**/references/**"
      - "evals/**"
      - "scripts/quality/**"
      - ".github/workflows/**"
jobs:
  full-evals:
    permissions:
      contents: read
      models: read
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false
      - name: Setup Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.12"
      - name: Setup Scarb
        uses: software-mansion/setup-scarb@95ba816a4383938e2338cb793773d4670011b65f # v1
        with:
          scarb-version: "2.14.0"
      - name: Setup Starknet Foundry
        uses: foundry-rs/setup-snfoundry@84390a1602c157576ceefa6f9cfcb8b0504c94c7 # v3
        with:
          starknet-foundry-version: "0.56.0"
      - name: Install Python deps
        run: python -m pip install -r requirements.txt detect-secrets==1.5.0
      - name: Run parity quality checks
        run: python scripts/quality/parity_check.py
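      # Schema validation fan-out: manifests, normalized audits/findings,
      # every benchmark/eval case file, triage labels, sign-offs, and the
      # manual-19 gold set are each checked against their JSON Schema, with
      # uniqueness, parity, and attack-vector coverage checks on top.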
      - name: Validate manifests + uniqueness
        run: |
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema datasets/manifests/audit-manifest.schema.json \
            --jsonl datasets/manifests/audits.jsonl
          python scripts/audit-pipeline/check_unique_ids.py \
            --jsonl datasets/manifests/audits.jsonl \
            --keys audit_id source_url raw_path extracted_path raw_sha256
          python scripts/audit-pipeline/validate_json.py \
            --schema datasets/normalized/audit.schema.json \
            --glob 'datasets/normalized/audits/*.json'
          for file in datasets/normalized/findings/*.jsonl; do
            python scripts/audit-pipeline/validate_jsonl.py \
              --schema datasets/normalized/finding.schema.json \
              --jsonl "$file"
          done
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/cases/cairo_auditor_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/cases/cairo_auditor_realworld_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/contract-benchmark-case.schema.json \
            --jsonl evals/cases/contract_skill_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/contract-generation-case.schema.json \
            --jsonl evals/cases/contract_skill_generation_eval.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/heldout/cairo_auditor_llm_eval_cases.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/external-triage-label.schema.json \
            --jsonl evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.labels.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/security-review-signoff.schema.json \
            --jsonl evals/scorecards/security-review-signoffs.contract-skill-benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/manual-19-gold.schema.json \
            --jsonl evals/reports/data/manual-19-gold.jsonl
          python scripts/quality/check_vulndb_parity.py \
            --cases evals/cases/cairo_auditor_benchmark.jsonl \
            --cases evals/cases/cairo_auditor_realworld_benchmark.jsonl
          python scripts/quality/check_attack_vector_coverage.py \
            --min-vectors 120
          python scripts/quality/check_semgrep_vector_coverage.py \
            --core-min 1 \
            --core-max 120
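      # Guards against held-out eval cases leaking into tracked benchmark or
      # dataset files (enforced by check_no_heldout_leak.py).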
      - name: Held-out leakage guard
        run: python scripts/audit-pipeline/check_no_heldout_leak.py
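      # detect-secrets writes a JSON report to /tmp; the inline Python below
      # counts findings outside the excluded data/report directories and
      # fails the job if any remain.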
      - name: Secret scan
        run: |
          detect-secrets scan --all-files \
            --exclude-files '.git/.*' \
            --exclude-files 'datasets/audits/raw/.*' \
            --exclude-files 'datasets/audits/extracted/.*' \
            --exclude-files 'datasets/manifests/audit_metadata.seed.json' \
            --exclude-files 'evals/reports/data/.*' \
            --exclude-files 'website/data/.*' > /tmp/detect-secrets.json
          python - <<'PY'
          import json
          import sys

          with open("/tmp/detect-secrets.json", "r", encoding="utf-8") as handle:
              report = json.load(handle)
          findings = report.get("results", {})
          finding_count = sum(len(v) for v in findings.values())
          print(f"detect-secrets findings={finding_count}")
          if finding_count > 0:
              for path, items in findings.items():
                  if items:
                      print(f"{path}: {len(items)} potential secrets")
              sys.exit(1)
          PY
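      # Core detector benchmarks: synthetic and real-world Cairo auditor
      # suites, both gated at >= 0.90 precision, recall, and per-class recall,
      # plus a recall check against the manually curated 19-case gold set.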
      - name: Cairo auditor benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_cairo_auditor.py \
            --cases evals/cases/cairo_auditor_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/cairo-auditor-benchmark.md" \
            --version v0.2.0 \
            --min-precision 0.90 \
            --min-recall 0.90 \
            --min-class-recall 0.90
      - name: Cairo auditor real-world benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_cairo_auditor.py \
            --cases evals/cases/cairo_auditor_realworld_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/cairo-auditor-realworld-benchmark.md" \
            --version v0.2.0 \
            --title "v0.2.0 Cairo Auditor Real-World Benchmark" \
            --min-precision 0.90 \
            --min-recall 0.90 \
            --min-class-recall 0.90
      - name: Manual-19 gold recall
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/check_manual_gold_recall.py \
            --gold evals/reports/data/manual-19-gold.jsonl \
            --findings evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-manual-19-gold-recall.md" \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-manual-19-gold-recall.json" \
            --min-recall 0.90 \
            --min-class-recall 0.75
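      # Contract skill suite: stricter 0.95 precision/recall floors, a
      # hard-enforced minimum of 60 evaluated cases, and required tooling.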
      - name: Contract skill benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_contract_skills.py \
            --cases evals/cases/contract_skill_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/contract-skill-benchmark.md" \
            --version v0.5.0 \
            --title "v0.5.0 Contract Skill Benchmark" \
            --min-precision 0.95 \
            --min-recall 0.95 \
            --min-evaluated 60 \
            --enforce-min-evaluated \
            --require-tools
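      # Mutation suite holds the contract benchmark to perfect (1.0)
      # precision/recall floors — presumably a check that the benchmark
      # itself still discriminates under mutated inputs.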
      - name: Contract benchmark mutation suite
        timeout-minutes: 30
        run: |
          python scripts/quality/mutation_test_contract_benchmark.py \
            --cases evals/cases/contract_skill_benchmark.jsonl \
            --min-precision 1.0 \
            --min-recall 1.0 \
            --min-evaluated 60
      - name: Contract benchmark trend
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/render_contract_benchmark_trend.py \
            --scorecards-glob 'evals/scorecards/v*-contract-skill-benchmark.md' \
            --output "${RUNNER_TEMP}/scorecards/contract-skill-benchmark-trend.md" \
            --min-cases 60 \
            --min-consecutive 2
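      # The KPI gate runs in report-only mode by default. ENFORCE_KPI_GATE
      # evaluates to the string "true" only on a workflow_dispatch run where
      # the enforce_kpi_release_gate input was set; only then is --enforce
      # passed, making an unmet gate fail the workflow.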
      - name: Contract KPI publication gate
        env:
          ENFORCE_KPI_GATE: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.enforce_kpi_release_gate == 'true' }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          extra_args=()
          if [ "${ENFORCE_KPI_GATE}" = "true" ]; then
            extra_args+=(--enforce)
          fi
          python scripts/quality/check_contract_kpi_release_gate.py \
            --trend "${RUNNER_TEMP}/scorecards/contract-skill-benchmark-trend.md" \
            --signoffs evals/scorecards/security-review-signoffs.contract-skill-benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/contract-kpi-publication-gate.md" \
            --min-consecutive 2 \
            "${extra_args[@]}"
      - name: Cairo auditor external triage scorecard
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/score_external_triage.py \
            --labels evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.labels.jsonl \
            --findings evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --release v0.2.0 \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-external-triage.md" \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-external-triage.json" \
            --output-unlabeled-jsonl "${RUNNER_TEMP}/scorecards/cairo-auditor-external-unlabeled.jsonl" \
            --trend-md "${RUNNER_TEMP}/scorecards/cairo-auditor-external-trend.md" \
            --min-precision 0.70 \
            --min-recall 0.90 \
            --min-labeled-coverage 0.90
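      # Auxiliary signals (Sierra parallel analysis, Caracal, Semgrep Cairo):
      # skipped on forked PRs (likely due to restricted token permissions)
      # and continue-on-error, so they inform without ever failing the run.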
      - name: Sierra parallel signal (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 25
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/sierra_parallel_signal.py \
            --scan-id "ci-sierra-parallel-low-profile-${{ github.run_id }}" \
            --repos-file evals/reports/data/external-repo-scan-low-profile-repos.txt \
            --detector-findings-jsonl evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --allow-build \
            --scarb-timeout-seconds 240 \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-sierra-parallel.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-sierra-parallel.md"
      - name: Caracal adapter (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 10
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_caracal_adapter.py \
            --repo-root . \
            --allow-build \
            --scarb-timeout-seconds 240 \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-caracal-adapter.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-caracal-adapter.md"
      - name: Semgrep Cairo adapter (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 10
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_semgrep_cairo.py \
            --repo-root . \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-semgrep-adapter.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-semgrep-adapter.md"
      - name: Probe GitHub Models availability
        id: models_probe
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          body='{"messages":[{"role":"user","content":"Return {\"ok\":true} in JSON."}],"model":"openai/gpt-4o","temperature":0}'
          code=$(curl -sS -o /tmp/models-probe.json -w "%{http_code}" \
            "https://models.github.ai/inference/chat/completions" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -d "$body")
          if [ "$code" = "200" ]; then
            echo "available=true" >> "$GITHUB_OUTPUT"
            echo "GitHub Models probe: available"
          else
            echo "available=false" >> "$GITHUB_OUTPUT"
            echo "GitHub Models probe unavailable (HTTP $code), skipping LLM held-out eval."
            sed -n '1,40p' /tmp/models-probe.json || true
          fi
      - name: LLM held-out eval
        if: ${{ (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) && steps.models_probe.outputs.available == 'true' }}
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_llm_eval.py \
            --cases evals/heldout/cairo_auditor_llm_eval_cases.jsonl \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-llm-heldout.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-llm-heldout.md" \
            --model openai/gpt-4o \
            --min-precision 0.75 \
            --min-recall 0.75
      - name: Contract generation eval (build side, informational)
        if: ${{ (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) && steps.models_probe.outputs.available == 'true' }}
        # Informational telemetry: non-zero exits are recorded in logs/artifacts but do not fail the workflow.
        continue-on-error: true
        timeout-minutes: 40
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_contract_generation_eval.py \
            --cases evals/cases/contract_skill_generation_eval.jsonl \
            --output-json "${RUNNER_TEMP}/scorecards/contract-generation-eval.json" \
            --output-md "${RUNNER_TEMP}/scorecards/contract-generation-eval.md" \
            --model openai/gpt-4o \
            --min-pass-rate 0.55 \
            --max-vuln-rate 0.35 \
            --min-evaluated 8 \
            --enforce-min-evaluated \
            --require-tools
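      # Explicit no-op steps, seemingly so a skipped LLM eval shows up in
      # the run summary rather than being silently absent.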
- name: "LLM held-out eval (skipped: GitHub Models unavailable)"
if: ${{ steps.models_probe.outputs.available != 'true' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository) }}
run: echo "Skipping LLM held-out eval because GitHub Models is unavailable for this run."
- name: "Contract generation eval (skipped: GitHub Models unavailable)"
if: ${{ steps.models_probe.outputs.available != 'true' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository) }}
run: echo "Skipping contract generation eval because GitHub Models is unavailable for this run."
      - name: Upload benchmark scorecards
        if: always()
        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
        with:
          name: cairo-auditor-scorecards
          if-no-files-found: warn
          path: |
            ${{ runner.temp }}/scorecards/cairo-auditor-benchmark.md
            ${{ runner.temp }}/scorecards/cairo-auditor-realworld-benchmark.md
            ${{ runner.temp }}/scorecards/contract-skill-benchmark.md
            ${{ runner.temp }}/scorecards/contract-skill-benchmark-trend.md
            ${{ runner.temp }}/scorecards/contract-kpi-publication-gate.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-triage.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-trend.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-triage.json
            ${{ runner.temp }}/scorecards/cairo-auditor-external-unlabeled.jsonl
            ${{ runner.temp }}/scorecards/cairo-auditor-manual-19-gold-recall.md
            ${{ runner.temp }}/scorecards/cairo-auditor-manual-19-gold-recall.json
            ${{ runner.temp }}/scorecards/cairo-auditor-sierra-parallel.md
            ${{ runner.temp }}/scorecards/cairo-auditor-sierra-parallel.json
            ${{ runner.temp }}/scorecards/cairo-auditor-caracal-adapter.md
            ${{ runner.temp }}/scorecards/cairo-auditor-caracal-adapter.json
            ${{ runner.temp }}/scorecards/cairo-auditor-semgrep-adapter.md
            ${{ runner.temp }}/scorecards/cairo-auditor-semgrep-adapter.json
            ${{ runner.temp }}/scorecards/cairo-auditor-llm-heldout.md
            ${{ runner.temp }}/scorecards/cairo-auditor-llm-heldout.json
            ${{ runner.temp }}/scorecards/contract-generation-eval.md
            ${{ runner.temp }}/scorecards/contract-generation-eval.json