Full Evals #343
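# Full evaluation suite: schema/manifest validation, secret scanning,
# precision/recall benchmark gates, trend + KPI publication gates,
# auxiliary static-analysis signals, and (when GitHub Models is reachable)
# LLM-backed held-out evals. Runs nightly, on manual dispatch, and on PRs
# that touch eval-relevant paths.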
name: Full Evals

on:
  workflow_dispatch:
    inputs:
      enforce_kpi_release_gate:
        description: "Fail workflow if contract KPI publication gate is not met"
        required: false
        type: boolean
        default: false
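  # Nightly full run at 02:00 UTC; PR runs are scoped to the eval-relevant paths below.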
  schedule:
    - cron: "0 2 * * *"
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
    paths:
      - "SKILL.md"
      - "**/SKILL.md"
      - "**/references/**"
      - "evals/**"
      - "scripts/quality/**"
      - ".github/workflows/**"

jobs:
  full-evals:
    permissions:
      contents: read
      models: read
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false
      - name: Setup Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.12"
      - name: Setup Scarb
        uses: software-mansion/setup-scarb@95ba816a4383938e2338cb793773d4670011b65f # v1
        with:
          scarb-version: "2.14.0"
      - name: Setup Starknet Foundry
        uses: foundry-rs/setup-snfoundry@84390a1602c157576ceefa6f9cfcb8b0504c94c7 # v3
        with:
          starknet-foundry-version: "0.56.0"
      - name: Install Python deps
        run: python -m pip install -r requirements.txt detect-secrets==1.5.0
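      # Data-quality gates: parity checks, JSON/JSONL schema validation for
      # every manifest, normalized dataset, and eval case file, plus coverage
      # floors for the vuln DB, attack vectors, and Semgrep rules.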
      - name: Run parity quality checks
        run: python scripts/quality/parity_check.py
      - name: Validate manifests + uniqueness
        run: |
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema datasets/manifests/audit-manifest.schema.json \
            --jsonl datasets/manifests/audits.jsonl
          python scripts/audit-pipeline/check_unique_ids.py \
            --jsonl datasets/manifests/audits.jsonl \
            --keys audit_id source_url raw_path extracted_path raw_sha256
          python scripts/audit-pipeline/validate_json.py \
            --schema datasets/normalized/audit.schema.json \
            --glob 'datasets/normalized/audits/*.json'
          for file in datasets/normalized/findings/*.jsonl; do
            python scripts/audit-pipeline/validate_jsonl.py \
              --schema datasets/normalized/finding.schema.json \
              --jsonl "$file"
          done
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/cases/cairo_auditor_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/cases/cairo_auditor_realworld_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/contract-benchmark-case.schema.json \
            --jsonl evals/cases/contract_skill_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/contract-generation-case.schema.json \
            --jsonl evals/cases/contract_skill_generation_eval.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/heldout/cairo_auditor_llm_eval_cases.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/external-triage-label.schema.json \
            --jsonl evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.labels.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/security-review-signoff.schema.json \
            --jsonl evals/scorecards/security-review-signoffs.contract-skill-benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/manual-19-gold.schema.json \
            --jsonl evals/reports/data/manual-19-gold.jsonl
          python scripts/quality/check_vulndb_parity.py \
            --cases evals/cases/cairo_auditor_benchmark.jsonl \
            --cases evals/cases/cairo_auditor_realworld_benchmark.jsonl
          python scripts/quality/check_attack_vector_coverage.py \
            --min-vectors 120
          python scripts/quality/check_semgrep_vector_coverage.py \
            --core-min 1 \
            --core-max 120
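      # Leakage guard: assumed (from the script name) to fail if held-out eval
      # cases appear in the public benchmark or dataset files.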
      - name: Held-out leakage guard
        run: python scripts/audit-pipeline/check_no_heldout_leak.py
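      # Two-phase secret scan: detect-secrets writes a JSON report, then an
      # inline Python gate fails the job if any findings survive the path
      # exclusions.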
      - name: Secret scan
        run: |
          detect-secrets scan --all-files \
            --exclude-files '.git/.*' \
            --exclude-files 'datasets/audits/raw/.*' \
            --exclude-files 'datasets/audits/extracted/.*' \
            --exclude-files 'datasets/manifests/audit_metadata.seed.json' \
            --exclude-files 'evals/reports/data/.*' \
            --exclude-files 'website/data/.*' > /tmp/detect-secrets.json
          python - <<'PY'
          import json
          import sys

          with open("/tmp/detect-secrets.json", "r", encoding="utf-8") as handle:
              report = json.load(handle)

          # "results" maps each file path to the list of potential secrets found in it.
          findings = report.get("results", {})
          finding_count = sum(len(v) for v in findings.values())
          print(f"detect-secrets findings={finding_count}")

          # Any finding at all fails the step, after listing the offending paths.
          if finding_count > 0:
              for path, items in findings.items():
                  if items:
                      print(f"{path}: {len(items)} potential secrets")
              sys.exit(1)
          PY
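      # Hard benchmark gates: each step enforces minimum precision/recall
      # thresholds and writes its scorecard to ${RUNNER_TEMP}/scorecards for
      # upload at the end of the job.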
      - name: Cairo auditor benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_cairo_auditor.py \
            --cases evals/cases/cairo_auditor_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/cairo-auditor-benchmark.md" \
            --version v0.2.0 \
            --min-precision 0.90 \
            --min-recall 0.90 \
            --min-class-recall 0.90
      - name: Cairo auditor real-world benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_cairo_auditor.py \
            --cases evals/cases/cairo_auditor_realworld_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/cairo-auditor-realworld-benchmark.md" \
            --version v0.2.0 \
            --title "v0.2.0 Cairo Auditor Real-World Benchmark" \
            --min-precision 0.90 \
            --min-recall 0.90 \
            --min-class-recall 0.90
      - name: Manual-19 gold recall
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/check_manual_gold_recall.py \
            --gold evals/reports/data/manual-19-gold.jsonl \
            --findings evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-manual-19-gold-recall.md" \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-manual-19-gold-recall.json" \
            --min-recall 0.90 \
            --min-class-recall 0.75
      - name: Contract skill benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_contract_skills.py \
            --cases evals/cases/contract_skill_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/contract-skill-benchmark.md" \
            --version v0.5.0 \
            --title "v0.5.0 Contract Skill Benchmark" \
            --min-precision 0.95 \
            --min-recall 0.95 \
            --min-evaluated 60 \
            --enforce-min-evaluated \
            --require-tools
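      # Mutation suite: assumed to seed faults into the benchmark cases and
      # require the scorer to hold perfect (1.0) precision/recall against them.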
      - name: Contract benchmark mutation suite
        timeout-minutes: 30
        run: |
          python scripts/quality/mutation_test_contract_benchmark.py \
            --cases evals/cases/contract_skill_benchmark.jsonl \
            --min-precision 1.0 \
            --min-recall 1.0 \
            --min-evaluated 60
      - name: Contract benchmark trend
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/render_contract_benchmark_trend.py \
            --scorecards-glob 'evals/scorecards/v*-contract-skill-benchmark.md' \
            --output "${RUNNER_TEMP}/scorecards/contract-skill-benchmark-trend.md" \
            --min-cases 60 \
            --min-consecutive 2
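      # KPI publication gate: report-only by default. The workflow fails on an
      # unmet gate only when a manual dispatch sets enforce_kpi_release_gate,
      # which appends --enforce below.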
      - name: Contract KPI publication gate
        env:
          ENFORCE_KPI_GATE: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.enforce_kpi_release_gate == 'true' }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          extra_args=()
          if [ "${ENFORCE_KPI_GATE}" = "true" ]; then
            extra_args+=(--enforce)
          fi
          python scripts/quality/check_contract_kpi_release_gate.py \
            --trend "${RUNNER_TEMP}/scorecards/contract-skill-benchmark-trend.md" \
            --signoffs evals/scorecards/security-review-signoffs.contract-skill-benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/contract-kpi-publication-gate.md" \
            --min-consecutive 2 \
            "${extra_args[@]}"
      - name: Cairo auditor external triage scorecard
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/score_external_triage.py \
            --labels evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.labels.jsonl \
            --findings evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --release v0.2.0 \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-external-triage.md" \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-external-triage.json" \
            --output-unlabeled-jsonl "${RUNNER_TEMP}/scorecards/cairo-auditor-external-unlabeled.jsonl" \
            --trend-md "${RUNNER_TEMP}/scorecards/cairo-auditor-external-trend.md" \
            --min-precision 0.70 \
            --min-recall 0.90 \
            --min-labeled-coverage 0.90
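      # Auxiliary signals: informational only. They run for non-PR events and
      # same-repo PRs (fork PRs are skipped), and continue-on-error keeps any
      # failure from gating the workflow.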
      - name: Sierra parallel signal (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 25
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/sierra_parallel_signal.py \
            --scan-id "ci-sierra-parallel-low-profile-${{ github.run_id }}" \
            --repos-file evals/reports/data/external-repo-scan-low-profile-repos.txt \
            --detector-findings-jsonl evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --allow-build \
            --scarb-timeout-seconds 240 \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-sierra-parallel.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-sierra-parallel.md"
      - name: Caracal adapter (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 10
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_caracal_adapter.py \
            --repo-root . \
            --allow-build \
            --scarb-timeout-seconds 240 \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-caracal-adapter.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-caracal-adapter.md"
      - name: Semgrep Cairo adapter (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 10
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_semgrep_cairo.py \
            --repo-root . \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-semgrep-adapter.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-semgrep-adapter.md"
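      # Probe GitHub Models before the LLM-backed steps: the API can be
      # unavailable (and fork PRs typically lack the models: read permission),
      # so a failed probe lets those steps skip cleanly instead of erroring.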
      - name: Probe GitHub Models availability
        id: models_probe
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          # Minimal chat completion; any HTTP 200 means the Models API is usable.
          body='{"messages":[{"role":"user","content":"Return {\"ok\":true} in JSON."}],"model":"openai/gpt-4o","temperature":0}'
          code=$(curl -sS -o /tmp/models-probe.json -w "%{http_code}" \
            "https://models.github.ai/inference/chat/completions" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -d "$body")
          if [ "$code" = "200" ]; then
            echo "available=true" >> "$GITHUB_OUTPUT"
            echo "GitHub Models probe: available"
          else
            echo "available=false" >> "$GITHUB_OUTPUT"
            echo "GitHub Models probe unavailable (HTTP $code), skipping LLM held-out eval."
            # Surface the first lines of the error body for debugging.
            sed -n '1,40p' /tmp/models-probe.json || true
          fi
      - name: LLM held-out eval
        if: ${{ (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) && steps.models_probe.outputs.available == 'true' }}
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_llm_eval.py \
            --cases evals/heldout/cairo_auditor_llm_eval_cases.jsonl \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-llm-heldout.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-llm-heldout.md" \
            --model openai/gpt-4o \
            --min-precision 0.75 \
            --min-recall 0.75
      - name: Contract generation eval (build side, informational)
        if: ${{ (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) && steps.models_probe.outputs.available == 'true' }}
        # Informational telemetry: non-zero exits are recorded in logs/artifacts but do not fail the workflow.
        continue-on-error: true
        timeout-minutes: 40
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_contract_generation_eval.py \
            --cases evals/cases/contract_skill_generation_eval.jsonl \
            --output-json "${RUNNER_TEMP}/scorecards/contract-generation-eval.json" \
            --output-md "${RUNNER_TEMP}/scorecards/contract-generation-eval.md" \
            --model openai/gpt-4o \
            --min-pass-rate 0.55 \
            --max-vuln-rate 0.35 \
            --min-evaluated 8 \
            --enforce-min-evaluated \
            --require-tools
      - name: "LLM held-out eval (skipped: GitHub Models unavailable)"
        if: ${{ steps.models_probe.outputs.available != 'true' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository) }}
        run: echo "Skipping LLM held-out eval because GitHub Models is unavailable for this run."
      - name: "Contract generation eval (skipped: GitHub Models unavailable)"
        if: ${{ steps.models_probe.outputs.available != 'true' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository) }}
        run: echo "Skipping contract generation eval because GitHub Models is unavailable for this run."
      - name: Upload benchmark scorecards
        if: always()
        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
        with:
          name: cairo-auditor-scorecards
          if-no-files-found: warn
          path: |
            ${{ runner.temp }}/scorecards/cairo-auditor-benchmark.md
            ${{ runner.temp }}/scorecards/cairo-auditor-realworld-benchmark.md
            ${{ runner.temp }}/scorecards/contract-skill-benchmark.md
            ${{ runner.temp }}/scorecards/contract-skill-benchmark-trend.md
            ${{ runner.temp }}/scorecards/contract-kpi-publication-gate.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-triage.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-trend.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-triage.json
            ${{ runner.temp }}/scorecards/cairo-auditor-external-unlabeled.jsonl
            ${{ runner.temp }}/scorecards/cairo-auditor-manual-19-gold-recall.md
            ${{ runner.temp }}/scorecards/cairo-auditor-manual-19-gold-recall.json
            ${{ runner.temp }}/scorecards/cairo-auditor-sierra-parallel.md
            ${{ runner.temp }}/scorecards/cairo-auditor-sierra-parallel.json
            ${{ runner.temp }}/scorecards/cairo-auditor-caracal-adapter.md
            ${{ runner.temp }}/scorecards/cairo-auditor-caracal-adapter.json
            ${{ runner.temp }}/scorecards/cairo-auditor-semgrep-adapter.md
            ${{ runner.temp }}/scorecards/cairo-auditor-semgrep-adapter.json
            ${{ runner.temp }}/scorecards/cairo-auditor-llm-heldout.md
            ${{ runner.temp }}/scorecards/cairo-auditor-llm-heldout.json
            ${{ runner.temp }}/scorecards/contract-generation-eval.md
            ${{ runner.temp }}/scorecards/contract-generation-eval.json