name: Full Evals

on:
  workflow_dispatch:
    inputs:
      enforce_kpi_release_gate:
        description: "Fail workflow if contract KPI publication gate is not met"
        required: false
        type: boolean
        default: false
  schedule:
    - cron: "0 2 * * *"
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
    paths:
      - "SKILL.md"
      - "**/SKILL.md"
      - "**/references/**"
      - "evals/**"
      - "scripts/quality/**"
      - ".github/workflows/**"
jobs:
  full-evals:
    permissions:
      contents: read
      models: read
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          persist-credentials: false
      - name: Setup Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
          python-version: "3.12"
      - name: Setup Scarb
        uses: software-mansion/setup-scarb@95ba816a4383938e2338cb793773d4670011b65f # v1
        with:
          scarb-version: "2.14.0"
      - name: Setup Starknet Foundry
        uses: foundry-rs/setup-snfoundry@84390a1602c157576ceefa6f9cfcb8b0504c94c7 # v3
        with:
          starknet-foundry-version: "0.56.0"
      - name: Install Python deps
        run: python -m pip install -r requirements.txt detect-secrets==1.5.0
      - name: Run parity quality checks
        run: python scripts/quality/parity_check.py
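      # Schema validation fan-out: manifests, normalized audits/findings,
      # every benchmark/eval case file, triage labels, sign-offs, and the
      # manual-19 gold set are each checked against their JSON Schema, with
      # uniqueness, parity, and attack-vector coverage checks on top.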
      - name: Validate manifests + uniqueness
        run: |
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema datasets/manifests/audit-manifest.schema.json \
            --jsonl datasets/manifests/audits.jsonl
          python scripts/audit-pipeline/check_unique_ids.py \
            --jsonl datasets/manifests/audits.jsonl \
            --keys audit_id source_url raw_path extracted_path raw_sha256
          python scripts/audit-pipeline/validate_json.py \
            --schema datasets/normalized/audit.schema.json \
            --glob 'datasets/normalized/audits/*.json'
          for file in datasets/normalized/findings/*.jsonl; do
            python scripts/audit-pipeline/validate_jsonl.py \
              --schema datasets/normalized/finding.schema.json \
              --jsonl "$file"
          done
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/cases/cairo_auditor_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/cases/cairo_auditor_realworld_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/contract-benchmark-case.schema.json \
            --jsonl evals/cases/contract_skill_benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/contract-generation-case.schema.json \
            --jsonl evals/cases/contract_skill_generation_eval.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/cases/benchmark-case.schema.json \
            --jsonl evals/heldout/cairo_auditor_llm_eval_cases.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/external-triage-label.schema.json \
            --jsonl evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.labels.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/security-review-signoff.schema.json \
            --jsonl evals/scorecards/security-review-signoffs.contract-skill-benchmark.jsonl
          python scripts/audit-pipeline/validate_jsonl.py \
            --schema evals/reports/data/manual-19-gold.schema.json \
            --jsonl evals/reports/data/manual-19-gold.jsonl
          python scripts/quality/check_vulndb_parity.py \
            --cases evals/cases/cairo_auditor_benchmark.jsonl \
            --cases evals/cases/cairo_auditor_realworld_benchmark.jsonl
          python scripts/quality/check_attack_vector_coverage.py \
            --min-vectors 120
          python scripts/quality/check_semgrep_vector_coverage.py \
            --core-min 1 \
            --core-max 120
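      # Guards against held-out eval cases leaking into tracked benchmark or
      # dataset files (enforced by check_no_heldout_leak.py).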
      - name: Held-out leakage guard
        run: python scripts/audit-pipeline/check_no_heldout_leak.py
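      # detect-secrets writes a JSON report to /tmp; the inline Python below
      # counts findings outside the excluded data/report directories and
      # fails the job if any remain.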
      - name: Secret scan
        run: |
          detect-secrets scan --all-files \
            --exclude-files '.git/.*' \
            --exclude-files 'datasets/audits/raw/.*' \
            --exclude-files 'datasets/audits/extracted/.*' \
            --exclude-files 'datasets/manifests/audit_metadata.seed.json' \
            --exclude-files 'evals/reports/data/.*' \
            --exclude-files 'website/data/.*' > /tmp/detect-secrets.json
          python - <<'PY'
          import json
          import sys

          with open("/tmp/detect-secrets.json", "r", encoding="utf-8") as handle:
              report = json.load(handle)
          findings = report.get("results", {})
          finding_count = sum(len(v) for v in findings.values())
          print(f"detect-secrets findings={finding_count}")
          if finding_count > 0:
              for path, items in findings.items():
                  if items:
                      print(f"{path}: {len(items)} potential secrets")
              sys.exit(1)
          PY
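      # Core detector benchmarks: synthetic and real-world Cairo auditor
      # suites, both gated at >= 0.90 precision, recall, and per-class recall,
      # plus a recall check against the manually curated 19-case gold set.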
      - name: Cairo auditor benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_cairo_auditor.py \
            --cases evals/cases/cairo_auditor_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/cairo-auditor-benchmark.md" \
            --version v0.2.0 \
            --min-precision 0.90 \
            --min-recall 0.90 \
            --min-class-recall 0.90
      - name: Cairo auditor real-world benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_cairo_auditor.py \
            --cases evals/cases/cairo_auditor_realworld_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/cairo-auditor-realworld-benchmark.md" \
            --version v0.2.0 \
            --title "v0.2.0 Cairo Auditor Real-World Benchmark" \
            --min-precision 0.90 \
            --min-recall 0.90 \
            --min-class-recall 0.90
      - name: Manual-19 gold recall
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/check_manual_gold_recall.py \
            --gold evals/reports/data/manual-19-gold.jsonl \
            --findings evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-manual-19-gold-recall.md" \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-manual-19-gold-recall.json" \
            --min-recall 0.90 \
            --min-class-recall 0.75
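      # Contract skill suite: stricter 0.95 precision/recall floors, a
      # hard-enforced minimum of 60 evaluated cases, and required tooling.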
      - name: Contract skill benchmark
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/benchmark_contract_skills.py \
            --cases evals/cases/contract_skill_benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/contract-skill-benchmark.md" \
            --version v0.5.0 \
            --title "v0.5.0 Contract Skill Benchmark" \
            --min-precision 0.95 \
            --min-recall 0.95 \
            --min-evaluated 60 \
            --enforce-min-evaluated \
            --require-tools
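      # Mutation suite holds the contract benchmark to perfect (1.0)
      # precision/recall floors — presumably a check that the benchmark
      # itself still discriminates under mutated inputs.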
      - name: Contract benchmark mutation suite
        timeout-minutes: 30
        run: |
          python scripts/quality/mutation_test_contract_benchmark.py \
            --cases evals/cases/contract_skill_benchmark.jsonl \
            --min-precision 1.0 \
            --min-recall 1.0 \
            --min-evaluated 60
      - name: Contract benchmark trend
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/render_contract_benchmark_trend.py \
            --scorecards-glob 'evals/scorecards/v*-contract-skill-benchmark.md' \
            --output "${RUNNER_TEMP}/scorecards/contract-skill-benchmark-trend.md" \
            --min-cases 60 \
            --min-consecutive 2
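      # The KPI gate runs in report-only mode by default. ENFORCE_KPI_GATE
      # evaluates to the string "true" only on a workflow_dispatch run where
      # the enforce_kpi_release_gate input was set; only then is --enforce
      # passed, making an unmet gate fail the workflow.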
      - name: Contract KPI publication gate
        env:
          ENFORCE_KPI_GATE: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.enforce_kpi_release_gate == 'true' }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          extra_args=()
          if [ "${ENFORCE_KPI_GATE}" = "true" ]; then
            extra_args+=(--enforce)
          fi
          python scripts/quality/check_contract_kpi_release_gate.py \
            --trend "${RUNNER_TEMP}/scorecards/contract-skill-benchmark-trend.md" \
            --signoffs evals/scorecards/security-review-signoffs.contract-skill-benchmark.jsonl \
            --output "${RUNNER_TEMP}/scorecards/contract-kpi-publication-gate.md" \
            --min-consecutive 2 \
            "${extra_args[@]}"
      - name: Cairo auditor external triage scorecard
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/score_external_triage.py \
            --labels evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.labels.jsonl \
            --findings evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --release v0.2.0 \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-external-triage.md" \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-external-triage.json" \
            --output-unlabeled-jsonl "${RUNNER_TEMP}/scorecards/cairo-auditor-external-unlabeled.jsonl" \
            --trend-md "${RUNNER_TEMP}/scorecards/cairo-auditor-external-trend.md" \
            --min-precision 0.70 \
            --min-recall 0.90 \
            --min-labeled-coverage 0.90
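      # Auxiliary signals (Sierra parallel analysis, Caracal, Semgrep Cairo):
      # skipped on forked PRs (likely due to restricted token permissions)
      # and continue-on-error, so they inform without ever failing the run.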
      - name: Sierra parallel signal (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 25
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/sierra_parallel_signal.py \
            --scan-id "ci-sierra-parallel-low-profile-${{ github.run_id }}" \
            --repos-file evals/reports/data/external-repo-scan-low-profile-repos.txt \
            --detector-findings-jsonl evals/reports/data/external-repo-scan-low-profile-rerun-2026-03-09-v5.findings.jsonl \
            --allow-build \
            --scarb-timeout-seconds 240 \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-sierra-parallel.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-sierra-parallel.md"
      - name: Caracal adapter (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 10
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_caracal_adapter.py \
            --repo-root . \
            --allow-build \
            --scarb-timeout-seconds 240 \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-caracal-adapter.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-caracal-adapter.md"
      - name: Semgrep Cairo adapter (auxiliary)
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        continue-on-error: true
        timeout-minutes: 10
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_semgrep_cairo.py \
            --repo-root . \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-semgrep-adapter.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-semgrep-adapter.md"
      - name: Probe GitHub Models availability
        id: models_probe
        if: ${{ github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository }}
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          body='{"messages":[{"role":"user","content":"Return {\"ok\":true} in JSON."}],"model":"openai/gpt-4o","temperature":0}'
          code=$(curl -sS -o /tmp/models-probe.json -w "%{http_code}" \
            "https://models.github.ai/inference/chat/completions" \
            -H "Content-Type: application/json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -d "$body")
          if [ "$code" = "200" ]; then
            echo "available=true" >> "$GITHUB_OUTPUT"
            echo "GitHub Models probe: available"
          else
            echo "available=false" >> "$GITHUB_OUTPUT"
            echo "GitHub Models probe unavailable (HTTP $code), skipping LLM held-out eval."
            sed -n '1,40p' /tmp/models-probe.json || true
          fi
      - name: LLM held-out eval
        if: ${{ (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) && steps.models_probe.outputs.available == 'true' }}
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_llm_eval.py \
            --cases evals/heldout/cairo_auditor_llm_eval_cases.jsonl \
            --output-json "${RUNNER_TEMP}/scorecards/cairo-auditor-llm-heldout.json" \
            --output-md "${RUNNER_TEMP}/scorecards/cairo-auditor-llm-heldout.md" \
            --model openai/gpt-4o \
            --min-precision 0.75 \
            --min-recall 0.75
      - name: Contract generation eval (build side, informational)
        if: ${{ (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) && steps.models_probe.outputs.available == 'true' }}
        # Informational telemetry: non-zero exits are recorded in logs/artifacts but do not fail the workflow.
        continue-on-error: true
        timeout-minutes: 40
        env:
          GITHUB_TOKEN: ${{ github.token }}
        run: |
          mkdir -p "${RUNNER_TEMP}/scorecards"
          python scripts/quality/run_contract_generation_eval.py \
            --cases evals/cases/contract_skill_generation_eval.jsonl \
            --output-json "${RUNNER_TEMP}/scorecards/contract-generation-eval.json" \
            --output-md "${RUNNER_TEMP}/scorecards/contract-generation-eval.md" \
            --model openai/gpt-4o \
            --min-pass-rate 0.55 \
            --max-vuln-rate 0.35 \
            --min-evaluated 8 \
            --enforce-min-evaluated \
            --require-tools
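      # Explicit no-op steps, seemingly so a skipped LLM eval shows up in
      # the run summary rather than being silently absent.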
- name: "LLM held-out eval (skipped: GitHub Models unavailable)"
if: ${{ steps.models_probe.outputs.available != 'true' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository) }}
run: echo "Skipping LLM held-out eval because GitHub Models is unavailable for this run."
- name: "Contract generation eval (skipped: GitHub Models unavailable)"
if: ${{ steps.models_probe.outputs.available != 'true' || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository) }}
run: echo "Skipping contract generation eval because GitHub Models is unavailable for this run."
      - name: Upload benchmark scorecards
        if: always()
        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
        with:
          name: cairo-auditor-scorecards
          if-no-files-found: warn
          path: |
            ${{ runner.temp }}/scorecards/cairo-auditor-benchmark.md
            ${{ runner.temp }}/scorecards/cairo-auditor-realworld-benchmark.md
            ${{ runner.temp }}/scorecards/contract-skill-benchmark.md
            ${{ runner.temp }}/scorecards/contract-skill-benchmark-trend.md
            ${{ runner.temp }}/scorecards/contract-kpi-publication-gate.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-triage.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-trend.md
            ${{ runner.temp }}/scorecards/cairo-auditor-external-triage.json
            ${{ runner.temp }}/scorecards/cairo-auditor-external-unlabeled.jsonl
            ${{ runner.temp }}/scorecards/cairo-auditor-manual-19-gold-recall.md
            ${{ runner.temp }}/scorecards/cairo-auditor-manual-19-gold-recall.json
            ${{ runner.temp }}/scorecards/cairo-auditor-sierra-parallel.md
            ${{ runner.temp }}/scorecards/cairo-auditor-sierra-parallel.json
            ${{ runner.temp }}/scorecards/cairo-auditor-caracal-adapter.md
            ${{ runner.temp }}/scorecards/cairo-auditor-caracal-adapter.json
            ${{ runner.temp }}/scorecards/cairo-auditor-semgrep-adapter.md
            ${{ runner.temp }}/scorecards/cairo-auditor-semgrep-adapter.json
            ${{ runner.temp }}/scorecards/cairo-auditor-llm-heldout.md
            ${{ runner.temp }}/scorecards/cairo-auditor-llm-heldout.json
            ${{ runner.temp }}/scorecards/contract-generation-eval.md
            ${{ runner.temp }}/scorecards/contract-generation-eval.json