diff --git a/.gitignore b/.gitignore index 249a910f5f..b0ced7ba23 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,12 @@ CLAUDE.local.md .env .idea/ .agents/ -.worktrees/ \ No newline at end of file +.worktrees/ + +# Additional ignores +.devcontainer/ +.claude/ +.agentic-qe/ +.claude-flow/ +.mcp.json +*.db \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index f4876c26ad..eb538b0e34 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1 +1,164 @@ -See [AGENTS.md](AGENTS.md) +# For Superplane project instructions See [AGENTS.md](AGENTS.md) + +## Agentic QE v3 + +This project uses **Agentic QE v3** - a Domain-Driven Quality Engineering platform with 13 bounded contexts, ReasoningBank learning, HNSW vector search, and Agent Teams coordination (ADR-064). + +--- + +### CRITICAL POLICIES + +#### Integrity Rule (ABSOLUTE) +- NO shortcuts, fake data, or false claims +- ALWAYS implement properly, verify before claiming success +- ALWAYS use real database queries for integration tests +- ALWAYS run actual tests, not assume they pass + +**We value the quality we deliver to our users.** + +#### Test Execution +- NEVER run `npm test` without `--run` flag (watch mode risk) +- Use: `npm test -- --run`, `npm run test:unit`, `npm run test:integration` when available + +#### Data Protection +- NEVER run `rm -f` on `.agentic-qe/` or `*.db` files without confirmation +- ALWAYS backup before database operations + +#### Git Operations +- NEVER auto-commit/push without explicit user request +- ALWAYS wait for user confirmation before git operations + +--- + +### Quick Reference + +```bash +# Run tests +npm test -- --run + +# Check quality +aqe quality assess + +# Generate tests +aqe test generate + +# Coverage analysis +aqe coverage +``` + +### Using AQE MCP Tools + +AQE exposes tools via MCP with the `mcp__agentic-qe__` prefix. You MUST call `fleet_init` before any other tool. + +#### 1. 
Initialize the Fleet (required first step) + +```typescript +mcp__agentic-qe__fleet_init({ + topology: "hierarchical", + maxAgents: 15, + memoryBackend: "hybrid" +}) +``` + +#### 2. Generate Tests + +```typescript +mcp__agentic-qe__test_generate_enhanced({ + targetPath: "src/services/auth.ts", + framework: "vitest", + strategy: "boundary-value" +}) +``` + +#### 3. Analyze Coverage + +```typescript +mcp__agentic-qe__coverage_analyze_sublinear({ + paths: ["src/"], + threshold: 80 +}) +``` + +#### 4. Assess Quality + +```typescript +mcp__agentic-qe__quality_assess({ + scope: "full", + includeMetrics: true +}) +``` + +#### 5. Store and Query Patterns (with learning persistence) + +```typescript +// Store a learned pattern +mcp__agentic-qe__memory_store({ + key: "patterns/coverage-gap/{timestamp}", + namespace: "learning", + value: { + pattern: "...", + confidence: 0.95, + type: "coverage-gap", + metadata: { /* domain-specific */ } + }, + persist: true +}) + +// Query stored patterns +mcp__agentic-qe__memory_query({ + pattern: "patterns/*", + namespace: "learning", + limit: 10 +}) +``` + +#### 6. 
Orchestrate Multi-Agent Tasks + +```typescript +mcp__agentic-qe__task_orchestrate({ + task: "Full quality assessment of auth module", + domains: ["test-generation", "coverage-analysis", "security-compliance"], + parallel: true +}) +``` + +### MCP Tool Reference + +| Tool | Description | +|------|-------------| +| `fleet_init` | Initialize QE fleet (MUST call first) | +| `fleet_status` | Get fleet health and agent status | +| `agent_spawn` | Spawn specialized QE agent | +| `test_generate_enhanced` | AI-powered test generation | +| `test_execute_parallel` | Parallel test execution with retry | +| `task_orchestrate` | Orchestrate multi-agent QE tasks | +| `coverage_analyze_sublinear` | O(log n) coverage analysis | +| `quality_assess` | Quality gate evaluation | +| `memory_store` | Store patterns with namespace + persist | +| `memory_query` | Query patterns by namespace/pattern | +| `security_scan_comprehensive` | SAST/DAST scanning | + +### Configuration + +- **Enabled Domains**: test-generation, test-execution, coverage-analysis, quality-assessment, defect-intelligence, requirements-validation (+6 more) +- **Learning**: Enabled (transformer embeddings) +- **Max Concurrent Agents**: 15 +- **Background Workers**: pattern-consolidator, routing-accuracy-monitor, coverage-gap-scanner, flaky-test-detector + +### V3 QE Agents + +QE agents are in `.claude/agents/v3/`. 
Use with Task tool: + +```javascript +Task({ prompt: "Generate tests", subagent_type: "qe-test-architect", run_in_background: true }) +Task({ prompt: "Find coverage gaps", subagent_type: "qe-coverage-specialist", run_in_background: true }) +Task({ prompt: "Security audit", subagent_type: "qe-security-scanner", run_in_background: true }) +``` + +### Data Storage + +- **Memory Backend**: `.agentic-qe/memory.db` (SQLite) +- **Configuration**: `.agentic-qe/config.yaml` + +--- +*Generated by AQE v3 init - 2026-03-28T08:27:45.923Z* diff --git a/docs/hackathon-ideas-qe.md b/docs/hackathon-ideas-qe.md new file mode 100644 index 0000000000..00a054cdd7 --- /dev/null +++ b/docs/hackathon-ideas-qe.md @@ -0,0 +1,330 @@ +# Superplane Hackathon Ideas — Quality Engineering Focus + +Six Thinking Hats analysis of QE-focused hackathon projects leveraging Agentic QE v3. +Scoped for a **3-hour hackathon timeframe**. + +--- + +## QE Arsenal Available + +| Category | Count | Highlights | +|----------|-------|------------| +| QE Agents | 60 | queen-coordinator, test-architect, coverage-specialist, quality-gate, flaky-hunter, chaos-engineer, etc. | +| QE Skills | 57 | qe-test-generation, qe-coverage-analysis, qe-quality-assessment, strict-tdd, coverage-guard, etc. | +| QCSD Phases | 5 | Ideation, Refinement, Development, CI/CD, Production swarms | +| MCP Tools | 67 | fleet_init, test_generate_enhanced, coverage_analyze_sublinear, quality_assess, security_scan_comprehensive | +| Domains | 12 | test-generation, coverage-analysis, quality-assessment, defect-intelligence, security-compliance, chaos-resilience, etc. 
| +| Learning | 150K+ patterns | ReasoningBank, HNSW vector search, pattern promotion, experience replay | + +--- + +## Six Thinking Hats Analysis + +### White Hat — Facts + +**Superplane's current testing state:** +- Go backend has tests (`make test`) and E2E tests (`make test.e2e`) using Playwright +- Frontend uses Vitest, has Storybook 9 for component stories +- No visible quality gates in CI/CD pipeline +- No coverage thresholds enforced +- No automated test generation integrated into the workflow +- Canvas workflows have no built-in validation or testing framework +- 200+ database migrations with no migration test suite +- 45+ integrations with no contract tests between them +- AI agent (PydanticAI) has an `evals/` directory — evaluation framework exists but is early +- Expression engine (`expr-lang/expr`) has no fuzz testing +- RBAC (Casbin) policies have no automated verification + +**AQE capabilities ready to use:** +- `qe-test-architect` can generate tests for Go and TypeScript +- `qe-coverage-specialist` provides O(log n) coverage gap detection +- `qe-quality-gate` can enforce pass/fail thresholds +- `qe-contract-validator` can validate API contracts +- `qe-chaos-engineer` can inject faults +- `qe-flaky-hunter` can detect unreliable tests +- `qe-security-scanner` does SAST/DAST scanning +- QCSD swarms provide phase-based quality workflows + +### Red Hat — Gut Feelings + +- **Excited:** Superplane has no quality gates — adding them would be transformative and immediately valuable +- **Feeling:** The most impactful QE project would make Superplane's own CI/CD pipeline significantly better +- **Intuition:** Contract testing between the 45+ integrations is a gold mine — each integration talks to external APIs with no verification +- **Anxious:** 3 hours is tight — must pick something that shows results fast, not infrastructure setup +- **Strong sense:** The Canvas workflow linting/validation idea from the previous analysis crosses into QE territory nicely +- 
**Gut:** Demo should show red/green — failing quality gate turning green after fixes + +### Black Hat — Risks + +| Risk | Impact | Mitigation | +|------|--------|------------| +| 3 hours is very tight | High | Pick ideas that produce visible results in <1 hour | +| Go test generation may need Go expertise | Medium | Focus on TypeScript/frontend tests where Vitest is already set up | +| AQE fleet_init may take time | Low | Initialize once at start, reuse across all work | +| Coverage tools need actual test execution | Medium | Use existing test suites, don't build from scratch | +| Quality gate enforcement needs CI/CD access | Medium | Demo locally with CLI, document CI integration | +| gRPC proto contract testing is complex | High | Focus on REST/HTTP API contracts instead | + +### Yellow Hat — Strengths & Opportunities + +**What makes QE projects ideal for this hackathon:** +- Superplane is a workflow automation platform — quality gates for workflows are a natural feature +- AQE agents can generate tests autonomously — show AI writing tests for Superplane's own code +- The 45+ integrations each follow a pattern — one contract test template scales to all +- Existing Vitest setup means frontend tests can run immediately +- QCSD framework provides a structured narrative for the demo +- Quality gates would make Superplane more enterprise-ready — directly valuable to the team +- AQE's learning system can show pattern evolution during the hackathon itself + +### Green Hat — Creative Ideas (3-Hour Scope) + +--- + +## Top 8 QE Hackathon Ideas + +### 1. 
Quality Gates for Canvas Workflows + +**Add quality gate validation that runs before a canvas is published, catching errors before they hit production.** + +What to build: +- Pre-publish validation hook that analyzes a canvas before it goes live +- Checks: all nodes configured, no orphan nodes, no cycles in non-loop paths, required integrations connected, approval gates on destructive actions +- Severity levels: Error (blocks publish), Warning (allows with acknowledgment), Info +- UI: Badge on canvas showing gate status (red/yellow/green) +- CLI: `aqe quality assess --scope canvas` produces SARIF report + +Leverage: +- `qe-quality-gate` agent for gate logic +- `qe-quality-assessment` skill for scoring +- Existing canvas model API to fetch workflow structure + +Why 3 hours works: Canvas structure is available via API. Validation is pure logic — no external dependencies needed. UI badge is a small React change. + +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Low | High | qe-quality-gate, qe-risk-assessor | + +--- + +### 2. AI Test Generation for Superplane's Own Codebase + +**Use AQE agents to generate a test suite for an undertested part of Superplane, demonstrating AI-powered QE in action.** + +What to build: +- Pick a module (e.g., `pkg/components/`, `pkg/exprruntime/`, or `web_src/src/ui/`) +- Run `qe-test-architect` to analyze code and generate tests +- Run `qe-coverage-specialist` to find gaps before and after +- Show coverage improvement: before (X%) -> after (Y%) +- Generate a coverage report with risk-weighted gap analysis + +Leverage: +- `test_generate_enhanced` MCP tool +- `coverage_analyze_sublinear` MCP tool +- `qe-gap-detector` for finding what to test +- Vitest (frontend) or Go test (backend) for execution + +Why 3 hours works: AQE generates tests automatically. Pick a small, self-contained module. The "before/after" demo tells a clear story. 
+ +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Low-Medium | High | qe-test-architect, qe-coverage-specialist, qe-gap-detector | + +--- + +### 3. Integration Contract Test Suite + +**Create contract tests for Superplane's top integrations ensuring API compatibility doesn't break silently.** + +What to build: +- Pick 3-5 integrations (GitHub, Slack, PagerDuty, Datadog, AWS) +- For each: capture the expected request/response schema from the integration code +- Generate consumer-driven contract tests using `qe-contract-validator` +- Validate that integration components send correct payloads and handle responses properly +- Show: "GitHub changed their API response? This contract test catches it before your workflow breaks" + +Leverage: +- `qe-contract-validator` agent +- `contract-testing` skill (Pact patterns) +- `api-testing-patterns` skill +- Integration source code in `pkg/integrations/` + +Why 3 hours works: Integration code follows a consistent pattern. Schema extraction is mechanical. 3-5 integrations is achievable. + +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Medium | High | qe-contract-validator, qe-integration-tester | + +--- + +### 4. 
QCSD Pipeline Demo — Full Quality Lifecycle + +**Demonstrate the complete QCSD (Quality-Completeness-Security-Deployment) lifecycle on a Superplane feature.** + +What to build: +- Pick a real feature area (e.g., the approval component or webhook trigger) +- **Ideation phase**: Run `qcsd-ideation-swarm` to generate quality criteria using HTSM v6.3 +- **Refinement phase**: Run `qcsd-refinement-swarm` to produce BDD scenarios and SFDIPOT analysis +- **Development phase**: Run `qcsd-development-swarm` to check TDD adherence, complexity, and coverage +- **CI/CD phase**: Run `qcsd-cicd-swarm` to enforce quality gates and assess deployment readiness +- **Production phase**: Run `qcsd-production-swarm` to define monitoring and feedback loops +- Output: Complete quality dossier for the feature + +Leverage: +- All 5 QCSD swarm skills +- Cross-phase feedback loops (strategic, tactical, operational, quality-criteria, learning) +- `qe-quality-criteria-recommender` for HTSM analysis +- `qe-product-factors-assessor` for SFDIPOT + +Why 3 hours works: Each QCSD phase takes ~30 minutes. The framework is already built — you're applying it, not building it. The demo tells a compelling narrative. + +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Medium | Very High | All QCSD swarms, qe-quality-criteria-recommender, qe-product-factors-assessor | + +--- + +### 5. 
Flaky Test Hunter & Auto-Stabilizer + +**Detect flaky tests in Superplane's test suite, analyze root causes, and auto-fix them.** + +What to build: +- Run Superplane's test suite multiple times to identify non-deterministic failures +- Use `qe-flaky-hunter` to classify flaky patterns (timing, ordering, shared state, resource contention) +- Use `qe-root-cause-analyzer` to diagnose each flaky test +- Use `qe-retry-handler` to implement intelligent retry with adaptive backoff +- Generate a report: X flaky tests found, Y root causes identified, Z auto-fixed +- PR with stabilization fixes + +Leverage: +- `qe-flaky-hunter` agent +- `qe-root-cause-analyzer` agent +- `qe-retry-handler` agent +- `test-failure-investigator` skill +- `qe-test-execution` skill for parallel runs + +Why 3 hours works: Test suite already exists. Running it multiple times is mechanical. Flaky test detection produces immediate, tangible results. + +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Medium | High | qe-flaky-hunter, qe-root-cause-analyzer, qe-retry-handler | + +--- + +### 6. 
Security Quality Gate for Integrations + +**Scan Superplane's 45+ integrations for security vulnerabilities: hardcoded secrets, injection risks, insecure HTTP, missing auth validation.** + +What to build: +- Run `qe-security-scanner` across `pkg/integrations/` directory +- Check for: credentials in code, HTTP instead of HTTPS, missing input validation, SQL injection in queries, unvalidated webhook payloads +- Run `security_scan_comprehensive` MCP tool for SAST analysis +- Generate SARIF report compatible with GitHub Code Scanning +- Create a security scorecard: each integration gets a grade (A-F) +- Fix the critical findings and show before/after + +Leverage: +- `qe-security-scanner` agent +- `qe-security-auditor` agent +- `security-testing` skill +- `security_scan_comprehensive` MCP tool +- `pentest-validation` skill for exploit verification + +Why 3 hours works: Scanning is automated. The 45 integrations follow a pattern, so findings scale. SARIF output is a standard format. Scorecard is a compelling visual. + +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Low-Medium | High | qe-security-scanner, qe-security-auditor | + +--- + +### 7. 
Expression Engine Fuzz Testing & Property Tests + +**Fuzz test Superplane's expression runtime (`expr-lang/expr`) to find edge cases and crashes in workflow expressions.** + +What to build: +- Analyze `pkg/exprruntime/` to understand the expression language +- Use `qe-property-tester` to generate property-based tests (e.g., "any valid expression should not panic", "nested access on nil should return error not crash") +- Use `qe-mutation-tester` to verify existing tests catch real bugs +- Fuzz with random/malformed expressions: deeply nested, Unicode, injection attempts +- Report: X edge cases found, Y potential crashes, Z security issues + +Leverage: +- `qe-property-tester` agent +- `qe-mutation-tester` agent +- `qe-test-architect` for test generation +- Go's built-in fuzzing (`go test -fuzz`) + +Why 3 hours works: Expression engines are perfect fuzz targets — small input surface, clear correctness criteria. Property tests are auto-generated. Go has native fuzz support. + +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Medium | High | qe-property-tester, qe-mutation-tester, qe-test-architect | + +--- + +### 8. 
Accessibility Audit of Canvas UI + +**Run a comprehensive accessibility audit on the Canvas UI and fix critical WCAG violations.** + +What to build: +- Use `qe-accessibility-auditor` to scan the Canvas UI pages +- Run axe-core analysis on key pages: canvas editor, run history, integration settings, organization management +- Check: keyboard navigation, screen reader compatibility, color contrast, focus management, ARIA labels on React Flow nodes +- Generate WCAG 2.2 compliance report with severity ratings +- Fix top 5-10 critical violations (missing alt text, focus traps, contrast issues) +- Before/after screenshots showing improvements + +Leverage: +- `qe-accessibility-auditor` agent +- `qe-visual-accessibility` skill +- `a11y-ally` skill +- `accessibility-testing` skill +- Existing Storybook for component-level testing + +Why 3 hours works: axe-core scanning is fast. Canvas UI is a single-page app — limited surface area. WCAG fixes are often small CSS/ARIA changes. Before/after demos well. + +| Effort | Demo Impact | AQE Agents Used | +|--------|-------------|-----------------| +| Low-Medium | Medium-High | qe-accessibility-auditor, qe-visual-tester | + +--- + +## Blue Hat — Action Plan + +### Top 3 Picks for a 3-Hour Hackathon + +| Rank | Project | Why | Time Estimate | +|------|---------|-----|---------------| +| **1** | **Quality Gates for Canvas Workflows** (#1) | Directly extends the product. Pure logic + small UI. No external deps. Most relevant to Superplane team. | ~2.5 hours | +| **2** | **AI Test Generation for Superplane** (#2) | Shows AQE in action on real code. "Before/after coverage" is a compelling metric. Minimal setup. | ~2 hours | +| **3** | **QCSD Pipeline Demo** (#4) | Tells the best story. Shows a complete quality lifecycle. Each phase builds on the last. | ~3 hours | + +### Best "Impress the Judges" Pick +**Quality Gates for Canvas Workflows** (#1) — It's a real product feature that the Superplane team would actually ship. 
Shows you understand the product AND quality engineering. + +### Best "Technical Depth" Pick +**Expression Engine Fuzz Testing** (#7) — Finding real bugs with property-based testing and fuzzing is technically impressive and produces concrete "we found X crashes" results. + +### Best "Breadth of QE" Pick +**QCSD Pipeline Demo** (#4) — Showcases 5 phases, 10+ agents, cross-phase learning. Demonstrates the full power of agentic quality engineering. + +### Suggested 3-Hour Plan + +| Time | Activity | +|------|----------| +| 0:00-0:15 | Initialize AQE fleet (`fleet_init`), pick your project | +| 0:15-0:30 | Read the relevant Superplane source code | +| 0:30-2:00 | Build (90 minutes of focused implementation) | +| 2:00-2:30 | Run demos, capture screenshots/metrics | +| 2:30-2:45 | Polish: write 3-slide pitch with before/after | +| 2:45-3:00 | Present | + +### Combining Ideas + +These ideas compose well. If your team has 2-3 people: + +- **Person A:** Quality Gates for Canvas (#1) — product feature +- **Person B:** AI Test Generation (#2) — coverage improvement +- **Person C:** Security Scan (#6) — security scorecard + +Together: "We added quality gates, improved test coverage by X%, and found Y security issues across 45 integrations — in 3 hours." diff --git a/docs/hackathon-ideas.md b/docs/hackathon-ideas.md new file mode 100644 index 0000000000..b48724dafd --- /dev/null +++ b/docs/hackathon-ideas.md @@ -0,0 +1,240 @@ +# Superplane Hackathon Ideas — Novi Sad 2026 + +Six Thinking Hats analysis of hackathon project ideas for the Superplane platform. 
+ +## Platform Summary + +Superplane is an open-source DevOps control plane for event-based workflows: +- **Backend:** Go 1.25, gRPC + REST, PostgreSQL, RabbitMQ +- **Frontend:** React 19, TypeScript, React Flow (Canvas UI), Tailwind, shadcn +- **AI Agent:** Python 3.13, PydanticAI, Claude API (alpha) +- **45+ integrations:** GitHub, AWS, Slack, PagerDuty, Datadog, and more +- **Key gaps:** No native K8s integration, no workflow testing, no auto-error recovery, limited observability + +--- + +## Top 10 Ideas (Ranked by Feasibility x Impact x Demo-ability) + +### 1. Incident Copilot — AI-Powered First-5-Minutes Triage Agent + +**Build an autonomous incident response workflow that triggers on PagerDuty/Datadog alerts and uses AI to gather context, correlate signals, and propose actions.** + +- **Trigger:** PagerDuty `onIncident` or Datadog `onAlert` +- **Canvas flow:** Parallel fan-out to fetch recent deploys (GitHub), check dashboards (Datadog/Grafana), pull logs, check pod status (HTTP to K8s API) +- **AI node:** Claude component receives all context, produces structured triage summary +- **Output:** Posts evidence pack to Slack with severity assessment and recommended actions +- **Approval gate** before any remediation action + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Low-Medium | Very High | Canvas design, AI prompting | + +**Why it wins:** Directly on-theme. Uses existing integrations. Mostly canvas template + AI prompt engineering. No backend changes needed. + +--- + +### 2. 
NL2Workflow — Natural Language to Complete Canvas Generation + +**Type a sentence describing your workflow and get a fully wired canvas.** + +Example input: *"When a PR is merged to main, run tests, if they pass deploy to staging with a 10-minute canary, then promote to production with approval"* + +- Enhance the existing AI agent to produce complete canvas operations +- Leverage the pattern library + component catalog as context +- Generate canvas YAML, import into UI +- Interactive refinement: "Add a Slack notification if canary fails" + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium | Very High | Python, PydanticAI, prompt engineering | + +**Why it wins:** AI sidebar already exists but only does Q&A. Full generation is the natural next step. Jaw-dropping demo. + +--- + +### 3. Canvas Replay — Workflow Execution Debugger & Time-Travel UI + +**Visual execution replay: step through a workflow run node-by-node, seeing inputs/outputs/timing at each step.** + +- New UI panel showing execution timeline +- Click any node to see input payload, output, duration, errors +- "Play" button animates execution flow through the canvas +- Highlight bottlenecks (slow nodes in red) +- Compare two runs side-by-side (success vs failure) + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium | High | React, TypeScript, React Flow | + +**Why it wins:** Pure frontend work. Execution data already exists in the backend. Visually stunning demo. + +--- + +### 4. 
Workflow Test Runner — Test Framework for Canvases + +**Testing mode: define expected inputs/outputs for a canvas and run assertions without hitting real integrations.** + +- Mock mode for components (return predefined responses) +- Test definition: trigger event -> expected node execution order -> expected outputs +- "Test" button in Canvas UI runs the workflow in simulation +- Green/red indicators on each node (passed/failed) +- Coverage report: which paths were exercised + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium-High | High | Go (backend), React (frontend) | + +**Why it wins:** Fills a critical gap. Makes Superplane enterprise-ready. Shows deep product understanding. + +--- + +### 5. Kubernetes Operator Integration — Native K8s Triggers & Components + +**Add Kubernetes as a first-class integration.** + +- New integration in `pkg/integrations/kubernetes/` +- **Triggers:** `onPodCrashLoop`, `onDeploymentRollout`, `onHPAScale`, `onNodeNotReady` +- **Components:** `applyManifest`, `scaleDeployment`, `rollbackDeployment`, `getPodLogs` +- Uses K8s API via `client-go` + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium-High | High | Go, Kubernetes | + +**Why it wins:** K8s is THE missing integration. Every platform engineer wants this. + +--- + +### 6. 
Self-Healing Workflows — AI Error Recovery Agent + +**When a workflow node fails, an AI agent analyzes the error, suggests a fix, and can auto-retry with corrected parameters.** + +- Intercept node execution failures in the worker +- Pass error context + node config to Claude component +- AI proposes: retry with different params, skip node, alert human, or rollback +- Configurable autonomy level per canvas: "suggest only" / "auto-fix with approval" / "full auto" +- Audit log of all AI decisions + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium-High | Very High | Go (workers), AI prompting | + +**Why it wins:** Makes workflows resilient without manual intervention. Novel feature no competitor has. + +--- + +### 7. Integration Marketplace — Community Component Store + +**UI where users can browse, install, and publish custom components/integrations.** + +- Browse page with categories, search, popularity +- One-click install (downloads integration config) +- Publish: package a custom HTTP-based integration as a template +- Rating/review system +- Featured workflows section + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium | Medium-High | React, API design | + +**Why it wins:** Creates ecosystem/community value. Pure frontend + API work. + +--- + +### 8. GitOps for Workflows — Canvas-as-Code with Git Sync + +**Store canvas definitions in Git, sync bidirectionally, enable PR-based workflow changes with diff visualization.** + +- Export canvas to YAML in a Git repo (GitHub integration exists) +- Watch for YAML changes, auto-update canvas +- PR workflow: propose canvas change as YAML diff, visual diff in Superplane UI +- Branch-based canvas environments (staging vs production) + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium | High | Go, Git APIs, React | + +**Why it wins:** GitOps is how infrastructure teams already work. 
Bridges visual editing and code review. + +--- + +### 9. Workflow Analytics Dashboard — Execution Intelligence + +**Real-time dashboard: success rates, execution times, failure patterns, cost estimates, anomaly detection.** + +- Aggregate execution data from existing tables +- Charts: success/fail ratio, p50/p95 duration, failure heatmap by node +- Anomaly detection: flag unusually slow or failing runs +- Cost estimation: track API calls, compute time per workflow +- Weekly digest email + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Medium | Medium-High | React, SQL, charting | + +**Why it wins:** Addresses observability gap. Data already exists. Visual and data-rich demo. + +--- + +### 10. Workflow Linter — Static Analysis for Canvases + +**Analyze workflows for common mistakes before execution.** + +- Graph analysis: detect cycles, orphan nodes, missing connections +- Config validation: required fields empty, invalid expressions, deprecated components +- Security checks: secrets in plaintext, missing approval gates +- Performance hints: suggest parallelization, flag long chains +- Inline warnings on Canvas nodes (yellow/red badges) + +| Effort | Demo Impact | Skills Needed | +|--------|-------------|---------------| +| Low-Medium | Medium-High | Go or TypeScript, graph algorithms | + +**Why it wins:** Low complexity, high value. Quick to build and demo. Makes Superplane feel enterprise-grade. 
+ +--- + +## Six Thinking Hats Summary + +### White Hat (Facts) +- 45+ integrations, gRPC+REST API, React Flow canvas, PydanticAI agent (alpha) +- Key gaps: no K8s, no workflow testing, no auto-recovery, no anomaly detection + +### Red Hat (Gut Feelings) +- AI + Canvas combo will have the biggest wow-factor +- Incident response is emotionally compelling (everyone hates 3am pages) +- Projects that demo well in 5 minutes will win + +### Black Hat (Risks) +- One-day scope: overambitious projects won't finish +- Go backend requires Go expertise for deep changes +- Local dev setup may eat hours (Docker, Postgres, RabbitMQ) +- Canvas UI is a 228KB monolith, risky to modify deeply + +### Yellow Hat (Strengths) +- Integration registry is pluggable (clear pattern to follow) +- AI agent framework exists, extending it is incremental +- Expression engine enables powerful data transformation +- Strong API with auto-generated TypeScript client + +### Green Hat (Creative Ideas) +- See the 10 ideas above + +### Blue Hat (Action Plan) +- **Best overall pick:** Incident Copilot (#1) — low risk, high demo impact +- **Best AI pick:** NL2Workflow (#2) — extends existing agent +- **Best frontend pick:** Canvas Replay (#3) — pure React, data exists +- **Impress the core team:** Workflow Test Runner (#4) or Workflow Linter (#10) +- **Strong Go skills:** Kubernetes Integration (#5) or Self-Healing Workflows (#6) + +--- + +## Suggested Day Plan + +| Time | Activity | +|------|----------| +| 0:00-0:30 | Environment setup (`make dev.setup && make dev.start`) | +| 0:30-1:00 | Familiarize with Canvas UI, create a test workflow | +| 1:00-5:00 | Build your project (pick ONE from above) | +| 5:00-5:30 | Polish demo, write 3-slide pitch | +| 5:30-6:00 | Present | diff --git a/docs/hackathon-plan.md b/docs/hackathon-plan.md new file mode 100644 index 0000000000..49e0831081 --- /dev/null +++ b/docs/hackathon-plan.md @@ -0,0 +1,165 @@ +# Professorianci — SuperPlane Hackathon Plan + +**Team:** Dragan, 
Braca, Fedja +**Date:** March 28, 2026 — Novi Sad +**Project:** Incident Copilot + Workflow Quality Gates + +--- + +## The Pitch (One Sentence) + +We build an AI-powered Incident Copilot that auto-triages production alerts, AND a workflow quality gate that ensures the copilot (and every canvas) is safe before it goes live. + +--- + +## What We Built + +### Track A — Incident Copilot (Dragan) +An autonomous incident response workflow in SuperPlane Canvas: +- **Trigger:** PagerDuty `onIncident` (filtered to P1/P2 only) +- **Fan-out:** 3 parallel nodes fetch recent deploys (GitHub), metrics (Datadog), incident timeline (PagerDuty) +- **Merge:** Waits for all 3 data sources (2-minute timeout) +- **AI Triage:** Claude component receives all context, produces structured severity assessment with root cause hypotheses +- **Output:** Slack message with evidence pack to `#hackathon-demo` channel +- **Approval gate** before any remediation action +- **4 annotation widgets** explaining each stage + +**File:** `templates/canvases/incident-copilot.yaml` (13 nodes, 10 edges) + +### Track B — Workflow Quality Gate (Braca) +Static analysis engine that validates ANY canvas — implemented as both a Go backend package and TypeScript frontend linter with full parity: + +**10 Lint Rules:** +| Rule | Severity | What it catches | +|------|----------|----------------| +| `duplicate-node-id` | error | Two nodes sharing the same ID | +| `duplicate-node-name` | warning | Ambiguous expression references | +| `invalid-edge` | error | Dangling refs, self-loops, duplicate edges, widget endpoints | +| `cycle-detected` | error | Circular dependencies in workflow graph (Kahn's algorithm) | +| `orphan-node` | warning | Nodes not reachable from any trigger (BFS) | +| `dead-end` | warning | Non-terminal nodes with no outgoing edges | +| `missing-approval-gate` | error | Destructive actions without upstream approval (reverse BFS) | +| `missing-required-config` | error/warn | Empty prompts, missing 
channels, single-input merges | +| `invalid-expression` | error/warn | Unbalanced `{{ }}`, references to non-existent nodes | +| `unreachable-branch` | info | Filter nodes with no default outgoing edge | + +**Quality Scoring:** +- Score 0-100 with letter grades A-F +- Per-category caps: errors max -60pts, warnings max -30pts, info max -10pts +- All 3 existing templates score Grade A + +**Integration Points:** +- **Pre-save quality gate** — logs quality issues on every canvas save (warn-only, never blocks) +- **REST API** — `POST /api/v1/canvases/{id}/lint` returns full lint result as JSON +- **Frontend badge** — green/red badge in canvas header with tooltip showing all issues and quality score +- **36 unit tests** including dogfood tests against all 3 existing templates + +**Files:** +- `pkg/linter/linter.go` — Go linter engine (9 rules, quality scoring) +- `pkg/linter/linter_test.go` — 36 tests, all passing +- `pkg/grpc/actions/canvases/lint_canvas.go` — REST API handler +- `pkg/grpc/actions/canvases/update_canvas_version.go` — pre-save quality gate +- `pkg/public/server.go` — route registration +- `web_src/src/utils/canvasLinter.ts` — TypeScript linter (full parity with Go) +- `web_src/src/ui/CanvasPage/Header.tsx` — quality gate badge UI + +### Track C — Demo & Glue (Fedja) +- 4 mock data files for realistic demo scenario +- Slack channel configured: `#hackathon-demo` (C0APV7H889F) +- Demo script with quality gates narrative + +**Files:** +- `docs/mock-incident.json` — PagerDuty incident payload (API Gateway 5xx spike) +- `docs/mock-github-release.json` — GitHub release v2.14.3 +- `docs/mock-datadog-metrics.json` — Error rate, latency, request count time series +- `docs/mock-pagerduty-logs.json` — Incident timeline log entries + +--- + +## Why This Wins + +1. **Incident Copilot** is directly on-theme (AI + automation + production systems) +2. **Quality Gate** makes SuperPlane enterprise-ready — real product value +3. 
Together they tell one story: "We built the feature AND the safety net" +4. Every DevOps person relates to 3am incident pages +5. Demo is visual and compelling — canvas workflow + Slack output + red/green quality badge +6. Quality scoring (A-F grades) gives an instant read on canvas health +7. Full Go + TypeScript parity means the badge is always accurate + +--- + +## Demo Script (5 minutes) + +### Slide 1 — The Problem (30 seconds) +"It's 3am. PagerDuty fires. Your engineer wakes up, spends 20 minutes across 5 dashboards gathering context before they even understand what's happening." + +### Live Demo — The Incident Copilot (90 seconds) +1. Show the canvas: "Here's our Incident Copilot — built entirely in SuperPlane's Canvas" +2. Walk through the flow: trigger, filter, parallel data collection, merge, AI triage, Slack output, approval gate +3. Point out the **green quality badge** in the header: "Score: 100/100, Grade A — this workflow is validated before it goes live" +4. Fire the webhook: `curl -X POST http://localhost:8000/api/v1/webhooks/{webhook-id} -H "Content-Type: application/json" -d @docs/mock-incident.json` +5. Watch nodes light up in real-time +6. Switch to Slack: show the evidence pack arriving with severity assessment +7. "47 seconds. From alert to actionable triage." + +### Live Demo — The Quality Gate (90 seconds) +1. "But how do you know this workflow is safe before it goes live?" +2. Show the green badge: "Quality Gate: A (100/100)" +3. **Break something:** Delete an edge — badge turns red immediately: "1 error — orphan node detected" +4. **Fix it:** Reconnect the edge — badge turns green again +5. **Break differently:** Remove the approval gate — badge shows: "Destructive action has no upstream approval gate" +6. **Show the API:** `curl -X POST http://localhost:8000/api/v1/canvases/{id}/lint` — show JSON output with quality score +7. "Every canvas gets a quality score. Errors are caught before they reach production." 
+ +### Live Demo — Deep Validation (30 seconds) +1. Add a node with an expression referencing a non-existent node — badge catches it +2. Create a cycle — badge catches it +3. "10 rules, from graph cycles to expression validation. The linter catches what humans miss." + +### Slide 2 — What We Built (30 seconds) +- Incident Copilot: AI triage in < 60 seconds vs 20 minutes manual +- Quality Gate: 10 lint rules, quality scoring (A-F), REST API, real-time badge +- Full Go + TypeScript parity — backend and frontend always agree +- 36 unit tests, all 3 existing templates pass with Grade A +- Zero backend changes needed for copilot, minimal changes for quality gate + +### Slide 3 — What's Next (30 seconds) +- Quality gate as a pre-publish hook (block publish when grade < C) +- Linter as a built-in SuperPlane feature +- Copilot templates for common incident types (database, network, deployment) +- Self-healing: AI suggests workflow fixes when linter finds issues +- Integration contract tests using the same quality gate framework + +--- + +## Pre-Hackathon Checklist + +- [x] All three: clone repo, run `make dev.setup && make dev.start` +- [x] Dragan: review Canvas API and available integration components +- [x] Braca: review canvas data model (how nodes/edges are stored) +- [x] Fedja: find a real PagerDuty/Datadog alert payload format for mock data +- [x] All: agree on a Slack channel for demo output (`#hackathon-demo` — C0APV7H889F) + +--- + +## Technical Stats + +| Metric | Value | +|--------|-------| +| Go lines written | ~1,200 (linter + API + tests) | +| TypeScript lines written | ~400 (frontend linter + badge) | +| YAML template lines | ~280 (incident copilot) | +| Mock data files | 4 JSON files | +| Lint rules | 10 (full Go/TS parity) | +| Unit tests | 36 (all passing) | +| Template quality scores | 100/A, 100/A, 95/A | +| Devil's advocate reviews | 2 rounds, 26 issues found and fixed | + +--- + +## Fallback Plan + +If anything goes wrong with the full Incident Copilot: 
+- Simplify to just 2 nodes: trigger -> AI triage -> Slack (skip the parallel fan-out) +- The quality gate stands on its own as a valuable feature regardless +- Worst case: quality gate demo + copilot design walkthrough still tells the story diff --git a/docs/hackathon-reference-braca.md b/docs/hackathon-reference-braca.md new file mode 100644 index 0000000000..f73ede0111 --- /dev/null +++ b/docs/hackathon-reference-braca.md @@ -0,0 +1,241 @@ +# Track B Reference — Workflow Linter / Quality Gate (Braca) + +## Existing Validation (What Superplane Already Does) + +There's already validation in `pkg/grpc/actions/canvases/serialization.go` (lines 197-326). The linter should go **beyond** this with deeper semantic checks. + +**Already validated (don't duplicate):** +- Unique node IDs +- Node names present +- Component/trigger references exist in registry +- Edge source/target IDs exist +- Widgets not used as edge source/target +- Cycle detection (`CheckForCycles`) +- Group widget validation (no nesting, no self-reference) +- Basic config validation against component schema + +## What the Linter Should Add + +### 1. Orphan Node Detection +Nodes not reachable from any trigger (no path from root). + +```go +// Find root nodes (triggers with no incoming edges) +func FindOrphanNodes(nodes []Node, edges []Edge) []Node { + // Build adjacency: reachable set from all triggers + triggers := findTriggerNodes(nodes) + reachable := bfs(triggers, edges) + + var orphans []Node + for _, n := range nodes { + if n.Type == "widget" { continue } // groups are OK + if !reachable[n.ID] { + orphans = append(orphans, n) + } + } + return orphans +} +``` + +### 2. Dead-End Detection +Nodes with no outgoing edges that aren't terminal (Slack, email, approval, etc.). + +### 3. 
Missing Approval Before Destructive Actions +Destructive components that should have an approval gate upstream: +- `pagerduty.resolveIncident` +- `pagerduty.escalateIncident` +- `github.deleteRelease` +- `github.createRelease` +- Any HTTP DELETE/PUT to production URLs +- SSH commands + +Check: walk the graph backwards from these nodes — is there an `approval` component in the path? + +### 4. Missing Required Configuration +Go beyond basic "field required" — check semantic requirements: +- Claude `textPrompt` with empty `prompt` field +- HTTP component with no `url` +- Slack `sendTextMessage` with no `channel` +- Merge component with only 1 incoming edge (pointless merge) + +### 5. Expression Syntax Validation +Validate expression strings without executing them: +- Balanced `{{ }}` delimiters +- Valid `$['Node Name']` references point to actual node names +- `root()`, `previous()` used correctly + +### 6. Unreachable Branches +After an `if` component, check that both true/false branches lead somewhere. + +## Canvas Data Model + +### Node Structure (`pkg/models/blueprint.go`) +```go +type Node struct { + ID string + Name string + Type string // "trigger", "component", "blueprint", "widget" + Ref NodeRef // exactly one of: Component, Blueprint, Trigger, Widget + Configuration map[string]any + Metadata map[string]any + Position Position // {X, Y} + IsCollapsed bool + IntegrationID *string + ErrorMessage *string + WarningMessage *string +} + +type NodeRef struct { + Component *ComponentRef // {Name: "http"} + Blueprint *BlueprintRef // {ID: "..."} + Trigger *TriggerRef // {Name: "pagerduty.onIncident"} + Widget *WidgetRef // {Name: "group"} +} +``` + +### Edge Structure +```go +type Edge struct { + SourceID string // upstream node ID + TargetID string // downstream node ID + Channel string // "default", "success", "fail", "approved", "rejected", etc. 
+} +``` + +### Canvas Version (where nodes/edges live) +```go +type CanvasVersion struct { + Nodes []Node + Edges []Edge + // ... metadata +} +``` + +### Accessing Canvas via API +``` +GET /api/v1/canvases/{id} -> Canvas with live version spec +GET /api/v1/canvases/{id}/spec -> Just the nodes and edges +``` + +**Proto:** `protos/canvases.proto` — `Canvas.Spec` contains `repeated Node nodes` and `repeated Edge edges` + +## Component Configuration Schema + +Each component defines its config via `Configuration() []configuration.Field`: + +```go +type Field struct { + Name string + Label string + Type string // "string", "number", "boolean", "select", "expression", "text", etc. + Required bool + Default any + Sensitive bool +} +``` + +**Existing validation:** `pkg/configuration/validation.go` → `ValidateConfiguration(fields, config)` + +The registry at `pkg/registry/registry.go` has all components: +```go +Registry.Components // map[string]core.Component +``` + +## Graph Traversal Helpers + +Already in `pkg/models/blueprint.go`: +```go +FindEdges(sourceID, channel string) []Edge // outgoing edges from node +FindRootNode() *Node // node with no incoming edges +``` + +## Linter Output Format + +Suggested structure: +```json +{ + "status": "fail", + "errors": [ + { + "severity": "error", + "rule": "orphan-node", + "nodeId": "abc123", + "nodeName": "Unused HTTP Call", + "message": "Node is not reachable from any trigger" + } + ], + "warnings": [ + { + "severity": "warning", + "rule": "missing-approval-gate", + "nodeId": "def456", + "nodeName": "Delete Release", + "message": "Destructive action 'github.deleteRelease' has no upstream approval gate" + } + ], + "info": [ + { + "severity": "info", + "rule": "single-input-merge", + "nodeId": "ghi789", + "nodeName": "Wait for all", + "message": "Merge node has only 1 incoming edge — consider removing" + } + ], + "summary": { + "total": 3, + "errors": 1, + "warnings": 1, + "info": 1 + } +} +``` + +## Implementation Options + +### 
Option A: Go Package (recommended) +Add `pkg/linter/linter.go` with: +```go +func LintCanvas(nodes []models.Node, edges []models.Edge, registry *registry.Registry) *LintResult +``` +- Can access component registry for config validation +- Can be called from gRPC action (new API endpoint) +- Can be wired into pre-publish hook + +### Option B: TypeScript (frontend-only) +Add `web_src/src/utils/canvasLinter.ts`: +- Operates on the React Flow node/edge data already in memory +- Shows results inline in Canvas UI immediately +- No backend changes needed +- BUT: no access to component config schema + +### Option C: Both +- Go backend for deep validation (config schema, expression parsing) +- TypeScript frontend for instant visual feedback (orphans, dead-ends) + +For 3 hours, **Option A or B alone is sufficient**. Pick based on comfort. + +## Key Source Files + +| File | What to look at | +|------|-----------------| +| `pkg/models/blueprint.go:121-167` | Node, Edge, NodeRef structs | +| `pkg/grpc/actions/canvases/serialization.go:197-326` | Existing validation to extend | +| `pkg/configuration/field.go` | Config field schema | +| `pkg/configuration/validation.go` | Config validation logic | +| `pkg/core/component.go:70` | Component interface (Configuration method) | +| `pkg/registry/registry.go` | Component registry | +| `pkg/components/approval/approval.go` | Approval component | +| `pkg/components/merge/merge.go` | Merge component | +| `protos/canvases.proto` | Canvas proto definition | +| `protos/components.proto` | Node/Edge proto definition | + +## "Eat Our Own Dogfood" Demo + +At 2:15, run the linter against Dragan's Incident Copilot canvas: +1. It should PASS (green) — copilot is well-formed +2. Remove an edge → run again → catches orphan node (red) +3. Remove the approval gate → run again → warns about missing approval before destructive action +4. Fix → green again + +This is the money shot for the demo. 
diff --git a/docs/hackathon-reference-dragan.md b/docs/hackathon-reference-dragan.md new file mode 100644 index 0000000000..fbe2035e4d --- /dev/null +++ b/docs/hackathon-reference-dragan.md @@ -0,0 +1,184 @@ +# Track A Reference — Incident Copilot (Dragan) + +## Canvas Flow Design + +``` +PagerDuty onIncident (trigger) + | + v +Filter (P1/P2 only) + | + +---> GitHub getRelease (latest deploy) + +---> HTTP GET (Datadog metrics API) + +---> PagerDuty listLogEntries (incident timeline) + | + v +Merge (wait for all 3) + | + v +Claude textPrompt (AI triage) + | + v +Slack sendTextMessage (evidence pack) + | + v +Approval (before remediation) +``` + +## Existing Templates to Reference + +Two incident templates already exist — use them as a starting point: + +- `templates/canvases/incident-data-collection.yaml` — PagerDuty trigger + parallel data fetch + AI summary + GitHub issue +- `templates/canvases/incident-router.yaml` — Slack mention + AI title generation + PagerDuty/GitHub parallel create + +## PagerDuty Trigger + +**Trigger name:** `pagerduty.onIncident` + +**Events:** `incident.triggered`, `incident.acknowledged`, `incident.resolved` + +**Config fields:** +- `service` (required): PagerDuty service to monitor +- `events` (required): which incident events to listen for +- `urgencies` (optional): filter by "high" or "low" + +**Example payload (what downstream nodes receive):** +```json +{ + "data": { + "incident": { + "id": "PGR0VU2", + "number": 2, + "title": "A little bump in the road", + "status": "triggered", + "urgency": "high", + "service": { "id": "PF9KMXH", "summary": "API Service" }, + "assignees": [{ "id": "PTUXL6G", "summary": "User 123" }], + "escalation_policy": { "id": "PUS0KTE", "summary": "Default" }, + "teams": [{ "id": "PFCVPS0", "summary": "Engineering" }], + "priority": { "id": "PSO75BM", "summary": "P1" } + }, + "agent": { "id": "PLH1HKV", "summary": "Tenex Engineer" } + } +} +``` + +## Filter Node (P1/P2 Only) + +**Expression:** +``` +$["Listen 
for incidents"].data.incident.priority.summary == "P1" || $["Listen for incidents"].data.incident.priority.summary == "P2" +``` + +## Claude AI Triage Node + +**Component:** `claude.textPrompt` + +**Config:** +``` +model: claude-3-5-sonnet-latest (or claude-opus-4-6) +maxTokens: 4096 +temperature: 0.3 +``` + +**System message:** +``` +You are an expert SRE incident triage assistant. Given incident details, recent deployments, metrics, and logs, produce a structured triage report with: + +1. SEVERITY ASSESSMENT (P1-P4 with justification) +2. LIKELY ROOT CAUSE (top 3 hypotheses ranked by probability) +3. AFFECTED SYSTEMS (services, endpoints, user segments) +4. RECOMMENDED ACTIONS (ordered by priority, with estimated impact) +5. ESCALATION RECOMMENDATION (who to page, what team) + +Be concise. Use bullet points. Include specific evidence for each claim. +``` + +**Prompt (using expressions):** +``` +INCIDENT: +Title: {{ $['Listen for incidents'].data.incident.title }} +Status: {{ $['Listen for incidents'].data.incident.status }} +Urgency: {{ $['Listen for incidents'].data.incident.urgency }} +Priority: {{ $['Listen for incidents'].data.incident.priority.summary }} +Service: {{ $['Listen for incidents'].data.incident.service.summary }} +Assigned to: {{ $['Listen for incidents'].data.incident.assignees }} + +RECENT DEPLOYMENT: +{{ $['Get latest release'].data }} + +METRICS: +{{ $['Fetch metrics'].data }} + +INCIDENT LOG: +{{ $['Get incident logs'].data }} +``` + +## Slack Output Node + +**Component:** `slack.sendTextMessage` + +**Config:** +- `channel`: your demo Slack channel +- `text` (using expressions): + +``` +:rotating_light: *INCIDENT TRIAGE — AUTO-GENERATED* + +*{{ $['Listen for incidents'].data.incident.title }}* +Priority: {{ $['Listen for incidents'].data.incident.priority.summary }} +Service: {{ $['Listen for incidents'].data.incident.service.summary }} + +--- + +{{ $['AI Triage'].data.text }} + +--- + +_Triage generated by SuperPlane Incident Copilot in < 
60 seconds_ +``` + +## Approval Gate + +**Component:** `approval` + +**Config:** +```json +{ + "items": [ + { "type": "anyone" } + ] +} +``` + +Output channels: `approved` / `rejected` + +## Expression Syntax Quick Reference + +| Pattern | Meaning | +|---------|---------| +| `$['Node Name'].data.field` | Access upstream node output | +| `root().data` | Access trigger event data | +| `previous().data` | Access immediate previous node | +| `{{ expression }}` | Template interpolation in text fields | + +## Key Source Files + +| File | What to look at | +|------|-----------------| +| `templates/canvases/incident-data-collection.yaml` | Working incident template | +| `templates/canvases/incident-router.yaml` | Working routing template | +| `pkg/integrations/pagerduty/example_data_on_incident.json` | Full payload example | +| `pkg/integrations/claude/claude.go` | Claude component config | +| `pkg/integrations/slack/send_text_message.go` | Slack message component | +| `pkg/components/merge/merge.go` | Fan-in merge component | +| `pkg/components/approval/approval.go` | Approval gate | + +## Fallback + +If parallel fan-out + merge is too complex: +``` +PagerDuty trigger -> Claude textPrompt (just pass raw incident data) -> Slack message +``` +This 3-node flow works and still demos the core value. diff --git a/docs/hackathon-reference-fedja.md b/docs/hackathon-reference-fedja.md new file mode 100644 index 0000000000..ddc7bfd2a8 --- /dev/null +++ b/docs/hackathon-reference-fedja.md @@ -0,0 +1,259 @@ +# Track C Reference — Demo & Glue (Fedja) + +## Mock PagerDuty Incident Payload + +Use this to trigger the Incident Copilot without a real PagerDuty account. +Send as POST to the canvas webhook URL. 
+ +```json +{ + "event": { + "id": "01DEN4HPBQAAAG05V5QQYBRZMF", + "event_type": "incident.triggered", + "resource_type": "incident", + "occurred_at": "2026-03-28T14:30:00.000Z", + "agent": { + "html_url": "https://acme.pagerduty.com/users/PLH1HKV", + "id": "PLH1HKV", + "self": "https://api.pagerduty.com/users/PLH1HKV", + "summary": "Monitoring Bot", + "type": "user_reference" + }, + "data": { + "id": "PGR0VU2", + "type": "incident", + "self": "https://api.pagerduty.com/incidents/PGR0VU2", + "html_url": "https://acme.pagerduty.com/incidents/PGR0VU2", + "number": 42, + "status": "triggered", + "incident_key": "hackathon-demo-incident-001", + "created_at": "2026-03-28T14:30:00Z", + "title": "API Gateway: 5xx error rate spike to 15% on /api/v1/orders", + "urgency": "high", + "service": { + "html_url": "https://acme.pagerduty.com/services/PF9KMXH", + "id": "PF9KMXH", + "self": "https://api.pagerduty.com/services/PF9KMXH", + "summary": "API Gateway (Production)", + "type": "service_reference" + }, + "assignees": [ + { + "html_url": "https://acme.pagerduty.com/users/PTUXL6G", + "id": "PTUXL6G", + "self": "https://api.pagerduty.com/users/PTUXL6G", + "summary": "Dragan Petrovic (On-Call SRE)", + "type": "user_reference" + } + ], + "escalation_policy": { + "html_url": "https://acme.pagerduty.com/escalation_policies/PUS0KTE", + "id": "PUS0KTE", + "self": "https://api.pagerduty.com/escalation_policies/PUS0KTE", + "summary": "Production - Critical", + "type": "escalation_policy_reference" + }, + "teams": [ + { + "html_url": "https://acme.pagerduty.com/teams/PFCVPS0", + "id": "PFCVPS0", + "self": "https://api.pagerduty.com/teams/PFCVPS0", + "summary": "Platform Engineering", + "type": "team_reference" + } + ], + "priority": { + "html_url": "https://acme.pagerduty.com/priorities/PSO75BM", + "id": "PSO75BM", + "self": "https://api.pagerduty.com/priorities/PSO75BM", + "summary": "P1", + "type": "priority_reference" + }, + "conference_bridge": { + "conference_number": "+1 
555-123-4567,,987654321#", + "conference_url": "https://meet.google.com/abc-defg-hij" + }, + "body": { + "type": "incident_body", + "details": "5xx error rate on API Gateway spiked from 0.1% to 15.3% at 14:28 UTC. Affects /api/v1/orders endpoint. 1,247 users impacted in last 2 minutes. Correlated with deployment deploy-api-v2.14.3 at 14:25 UTC." + } + } + } +} +``` + +## Mock GitHub Release (Latest Deploy) + +If using HTTP component to simulate GitHub data, return this: + +```json +{ + "id": 12345678, + "tag_name": "v2.14.3", + "name": "Release v2.14.3 - Order Service Refactor", + "body": "## Changes\n- Refactored order validation logic\n- Migrated to new payment gateway client\n- Updated database connection pooling\n\n## Authors\n- @braca (order validation)\n- @fedja (payment gateway)\n\n## Risk: Medium\nDatabase connection pool size changed from 20 to 50", + "draft": false, + "prerelease": false, + "created_at": "2026-03-28T14:25:00Z", + "published_at": "2026-03-28T14:25:30Z", + "author": { + "login": "braca", + "id": 87654321 + } +} +``` + +## Mock Datadog Metrics Response + +For the HTTP component fetching Datadog metrics: + +```json +{ + "series": [ + { + "metric": "api.gateway.error_rate_5xx", + "points": [ + [1711633200, 0.1], + [1711633260, 0.3], + [1711633320, 2.1], + [1711633380, 8.7], + [1711633440, 15.3], + [1711633500, 14.8] + ], + "tags": ["service:api-gateway", "env:production"] + }, + { + "metric": "api.gateway.latency_p99", + "points": [ + [1711633200, 120], + [1711633260, 145], + [1711633320, 890], + [1711633380, 2340], + [1711633440, 4500], + [1711633500, 4200] + ], + "tags": ["service:api-gateway", "env:production"] + } + ], + "status": "ok", + "query": "avg:api.gateway.error_rate_5xx{env:production} by {service}" +} +``` + +## Mock PagerDuty Log Entries + +```json +{ + "log_entries": [ + { + "type": "trigger_log_entry", + "created_at": "2026-03-28T14:30:00Z", + "summary": "Triggered by Datadog monitor: API 5xx Error Rate > 5%" + }, + { + 
"type": "notify_log_entry", + "created_at": "2026-03-28T14:30:05Z", + "summary": "Notified Dragan Petrovic via push notification" + }, + { + "type": "annotate_log_entry", + "created_at": "2026-03-28T14:30:10Z", + "summary": "Correlated with deploy-api-v2.14.3 (14:25 UTC)" + } + ] +} +``` + +## Slack Evidence Pack Template + +What the final Slack message should look like: + +``` +:rotating_light: *INCIDENT TRIAGE — AUTO-GENERATED* + +*API Gateway: 5xx error rate spike to 15% on /api/v1/orders* +Priority: P1 | Service: API Gateway (Production) +Assignee: Dragan Petrovic (On-Call SRE) + +--- + +*SEVERITY: P1 — Critical* +Customer-facing order flow is down for ~1,200 users. + +*LIKELY ROOT CAUSE:* +1. (85%) Deploy v2.14.3 changed DB connection pool 20->50, likely exhausting DB connections +2. (10%) Payment gateway client migration introduced timeout regression +3. (5%) Unrelated infrastructure issue + +*AFFECTED SYSTEMS:* +- API Gateway /api/v1/orders endpoint +- Order Service (downstream) +- ~1,247 active users in checkout flow + +*RECOMMENDED ACTIONS:* +1. :arrow_right: Rollback deploy v2.14.3 immediately (ETA: 3 min) +2. Check DB connection count: `SELECT count(*) FROM pg_stat_activity` +3. Monitor error rate after rollback for 5 min +4. If not resolved, escalate to Database Team + +*ESCALATION:* +- Current: Platform Engineering (Dragan) +- Next: Database Team (@db-oncall) if rollback doesn't resolve + +--- +:clock1: Triage generated by SuperPlane Incident Copilot in 47 seconds +:link: +``` + +## Demo Script — Detailed + +### Setup (before demo starts) +1. Have the canvas open in browser, zoomed to show full flow +2. Have Slack channel open in a second tab/window +3. Have a `curl` command ready to fire the mock webhook + +### Act 1: The Problem (30 seconds) +"It's 3am. PagerDuty fires. Your engineer opens 5 tabs: PagerDuty, Datadog, GitHub, the runbook, Slack. Spends 20 minutes gathering context before understanding the problem. We fixed that." 
+ +### Act 2: The Copilot (90 seconds) +1. Show the canvas: "Here's our Incident Copilot — built entirely in SuperPlane's Canvas" +2. Walk through the flow: trigger, parallel data collection, AI triage, Slack output +3. Fire the webhook: `curl -X POST -H "Content-Type: application/json" -d @mock-incident.json` +4. Watch nodes light up in real-time (canvas execution visualization) +5. Switch to Slack: show the evidence pack arriving +6. "47 seconds. From alert to actionable triage." + +### Act 3: The Safety Net (60 seconds) +1. "But how do you know this workflow is safe before it goes live?" +2. Run the linter: show green pass +3. Delete an edge in the canvas +4. Run linter again: red fail — "Orphan node detected" +5. Remove the approval gate +6. Run linter again: warning — "Destructive action without approval" +7. "The linter catches mistakes before they reach production." + +### Act 4: What's Next (30 seconds) +- Linter as a built-in pre-publish hook +- Template library for common incident types +- Self-healing: AI suggests fixes when linter finds issues + +## Screenshot Checklist + +Capture these during the build (2:15-2:30): +- [ ] Full canvas view with all nodes connected +- [ ] Canvas with nodes executing (green highlights) +- [ ] Slack evidence pack message +- [ ] Linter output: passing (green) +- [ ] Linter output: failing (red) +- [ ] Before/after side-by-side + +## Curl Command for Demo + +Save this as `mock-incident.json` and use: +```bash +curl -X POST http://localhost:8000/api/v1/webhooks/ \ + -H "Content-Type: application/json" \ + -d @docs/mock-incident.json +``` + +(Get the webhook-id from the canvas trigger configuration after setup) diff --git a/docs/mock-datadog-metrics.json b/docs/mock-datadog-metrics.json new file mode 100644 index 0000000000..55675c10b3 --- /dev/null +++ b/docs/mock-datadog-metrics.json @@ -0,0 +1,48 @@ +{ + "status": "ok", + "series": [ + { + "metric": "api.gateway.error_rate_5xx", + "display_name": "5xx Error Rate (%)", + 
"points": [ + [1774973400, 0.1], + [1774973460, 0.3], + [1774973520, 2.1], + [1774973580, 8.7], + [1774973640, 15.3], + [1774973700, 14.8] + ], + "tags": ["service:api-gateway", "env:production"], + "unit": "percent" + }, + { + "metric": "api.gateway.latency_p99", + "display_name": "P99 Latency (ms)", + "points": [ + [1774973400, 120], + [1774973460, 145], + [1774973520, 890], + [1774973580, 2340], + [1774973640, 4500], + [1774973700, 4200] + ], + "tags": ["service:api-gateway", "env:production"], + "unit": "millisecond" + }, + { + "metric": "api.gateway.request_count", + "display_name": "Request Count", + "points": [ + [1774973400, 15234], + [1774973460, 14890], + [1774973520, 12456], + [1774973580, 8934], + [1774973640, 6721], + [1774973700, 7102] + ], + "tags": ["service:api-gateway", "env:production"], + "unit": "request" + } + ], + "query": "avg:api.gateway.error_rate_5xx{service:api-gateway,env:production}" +} diff --git a/docs/mock-github-release.json b/docs/mock-github-release.json new file mode 100644 index 0000000000..5b5f4caee6 --- /dev/null +++ b/docs/mock-github-release.json @@ -0,0 +1,17 @@ +{ + "id": 12345678, + "tag_name": "v2.14.3", + "name": "Release v2.14.3 - Order Service Refactor", + "body": "## Changes\n- Refactored order validation logic\n- Migrated to new payment gateway client\n- Updated database connection pooling\n\n## Authors\n- @braca (order validation)\n- @fedja (payment gateway)\n\n## Risk: Medium\nDatabase connection pool size changed from 20 to 50", + "draft": false, + "prerelease": false, + "created_at": "2026-03-28T14:25:00Z", + "published_at": "2026-03-28T14:25:30Z", + "author": { + "login": "braca", + "id": 87654321 + }, + "html_url": "https://github.com/acme-corp/api-service/releases/tag/v2.14.3", + "tarball_url": "https://api.github.com/repos/acme-corp/api-service/tarball/v2.14.3", + "zipball_url": "https://api.github.com/repos/acme-corp/api-service/zipball/v2.14.3" +} diff --git a/docs/mock-incident.json 
b/docs/mock-incident.json new file mode 100644 index 0000000000..b9a76fb12f --- /dev/null +++ b/docs/mock-incident.json @@ -0,0 +1,74 @@ +{ + "event": { + "id": "01DEN4HPBQAAAG05V5QQYBRZMF", + "event_type": "incident.triggered", + "resource_type": "incident", + "occurred_at": "2026-03-28T14:30:00.000Z", + "agent": { + "html_url": "https://acme.pagerduty.com/users/PLH1HKV", + "id": "PLH1HKV", + "self": "https://api.pagerduty.com/users/PLH1HKV", + "summary": "Monitoring Bot", + "type": "user_reference" + }, + "data": { + "id": "PGR0VU2", + "type": "incident", + "self": "https://api.pagerduty.com/incidents/PGR0VU2", + "html_url": "https://acme.pagerduty.com/incidents/PGR0VU2", + "number": 42, + "status": "triggered", + "incident_key": "hackathon-demo-incident-001", + "created_at": "2026-03-28T14:30:00Z", + "title": "API Gateway: 5xx error rate spike to 15% on /api/v1/orders", + "urgency": "high", + "service": { + "html_url": "https://acme.pagerduty.com/services/PF9KMXH", + "id": "PF9KMXH", + "self": "https://api.pagerduty.com/services/PF9KMXH", + "summary": "API Gateway (Production)", + "type": "service_reference" + }, + "assignees": [ + { + "html_url": "https://acme.pagerduty.com/users/PTUXL6G", + "id": "PTUXL6G", + "self": "https://api.pagerduty.com/users/PTUXL6G", + "summary": "Dragan Petrovic (On-Call SRE)", + "type": "user_reference" + } + ], + "escalation_policy": { + "html_url": "https://acme.pagerduty.com/escalation_policies/PUS0KTE", + "id": "PUS0KTE", + "self": "https://api.pagerduty.com/escalation_policies/PUS0KTE", + "summary": "Production - Critical", + "type": "escalation_policy_reference" + }, + "teams": [ + { + "html_url": "https://acme.pagerduty.com/teams/PFCVPS0", + "id": "PFCVPS0", + "self": "https://api.pagerduty.com/teams/PFCVPS0", + "summary": "Platform Engineering", + "type": "team_reference" + } + ], + "priority": { + "html_url": "https://acme.pagerduty.com/priorities/PSO75BM", + "id": "PSO75BM", + "self": 
"https://api.pagerduty.com/priorities/PSO75BM", + "summary": "P1", + "type": "priority_reference" + }, + "conference_bridge": { + "conference_number": "+1 555-123-4567,,987654321#", + "conference_url": "https://meet.google.com/abc-defg-hij" + }, + "body": { + "type": "incident_body", + "details": "5xx error rate on API Gateway spiked from 0.1% to 15.3% at 14:28 UTC. Affects /api/v1/orders endpoint. 1,247 users impacted in last 2 minutes. Correlated with deployment deploy-api-v2.14.3 at 14:25 UTC." + } + } + } +} diff --git a/docs/mock-pagerduty-logs.json b/docs/mock-pagerduty-logs.json new file mode 100644 index 0000000000..7848a1190f --- /dev/null +++ b/docs/mock-pagerduty-logs.json @@ -0,0 +1,40 @@ +{ + "log_entries": [ + { + "id": "R1YCD0YVSA", + "type": "trigger_log_entry", + "created_at": "2026-03-28T14:30:00Z", + "summary": "Triggered by Datadog monitor: API 5xx Error Rate > 5%", + "agent": { + "type": "service_reference", + "summary": "Datadog Integration" + }, + "channel": { + "type": "auto", + "details": "Monitor: api-gateway-5xx-rate, Threshold: 5%, Current: 15.3%" + } + }, + { + "id": "R2BDE1ZWTA", + "type": "notify_log_entry", + "created_at": "2026-03-28T14:30:05Z", + "summary": "Notified Dragan Petrovic via push notification", + "agent": { + "type": "user_reference", + "summary": "PagerDuty" + } + }, + { + "id": "R3CEF2AXUB", + "type": "annotate_log_entry", + "created_at": "2026-03-28T14:30:10Z", + "summary": "Correlated with deploy-api-v2.14.3 (14:25 UTC)", + "agent": { + "type": "service_reference", + "summary": "Deploy Tracker" + } + } + ], + "total": 3, + "more": false +} diff --git a/pkg/grpc/actions/canvases/lint_canvas.go b/pkg/grpc/actions/canvases/lint_canvas.go new file mode 100644 index 0000000000..75004717cf --- /dev/null +++ b/pkg/grpc/actions/canvases/lint_canvas.go @@ -0,0 +1,94 @@ +package canvases + +import ( + "encoding/json" + "errors" + "net/http" + + "github.com/google/uuid" + "github.com/gorilla/mux" + log 
"github.com/sirupsen/logrus" + "github.com/superplanehq/superplane/pkg/database" + "github.com/superplanehq/superplane/pkg/linter" + "github.com/superplanehq/superplane/pkg/models" + "github.com/superplanehq/superplane/pkg/registry" + "gorm.io/gorm" +) + +// jsonError writes a JSON error response with the given status code. +func jsonError(w http.ResponseWriter, msg string, code int) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(code) + _ = json.NewEncoder(w).Encode(map[string]string{"error": msg}) +} + +// LintCanvasHandler returns an http.HandlerFunc that lints a canvas by ID. +// It reads the canvas spec from the live version and runs the linter. +// +// Route: POST /api/v1/canvases/{canvasId}/lint +func LintCanvasHandler(reg *registry.Registry) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + // Extract organization ID from header (set by auth middleware). + orgID := r.Header.Get("X-Organization-Id") + if orgID == "" { + jsonError(w, "missing organization id", http.StatusUnauthorized) + return + } + + // Extract canvas ID from gorilla/mux route variables. + canvasID := mux.Vars(r)["canvasId"] + if canvasID == "" { + jsonError(w, "missing canvas id", http.StatusBadRequest) + return + } + + orgUUID, err := uuid.Parse(orgID) + if err != nil { + jsonError(w, "invalid organization id", http.StatusBadRequest) + return + } + + canvasUUID, err := uuid.Parse(canvasID) + if err != nil { + jsonError(w, "invalid canvas id", http.StatusBadRequest) + return + } + + // Load the canvas. + canvas, err := models.FindCanvas(orgUUID, canvasUUID) + if err != nil { + if errors.Is(err, gorm.ErrRecordNotFound) { + jsonError(w, "canvas not found", http.StatusNotFound) + return + } + log.WithError(err).Error("failed to find canvas") + jsonError(w, "internal error", http.StatusInternalServerError) + return + } + + // Load the live version. 
+ version, err := models.FindLiveCanvasVersionByCanvasInTransaction(database.Conn(), canvas) + if err != nil { + log.WithError(err).Error("failed to find live canvas version") + jsonError(w, "failed to load canvas version", http.StatusInternalServerError) + return + } + + // Run the linter. + nodes := []models.Node(version.Nodes) + edges := []models.Edge(version.Edges) + result := linter.LintCanvas(nodes, edges, reg) + + // Return JSON response. + w.Header().Set("Content-Type", "application/json") + if result.Status == "fail" { + w.WriteHeader(http.StatusUnprocessableEntity) + } else { + w.WriteHeader(http.StatusOK) + } + + if err := json.NewEncoder(w).Encode(result); err != nil { + log.WithError(err).Error("failed to encode lint result") + } + } +} diff --git a/pkg/grpc/actions/canvases/update_canvas_version.go b/pkg/grpc/actions/canvases/update_canvas_version.go index ef9597a2d7..23f44f07c1 100644 --- a/pkg/grpc/actions/canvases/update_canvas_version.go +++ b/pkg/grpc/actions/canvases/update_canvas_version.go @@ -12,6 +12,7 @@ import ( "github.com/superplanehq/superplane/pkg/crypto" "github.com/superplanehq/superplane/pkg/database" "github.com/superplanehq/superplane/pkg/grpc/actions/messages" + "github.com/superplanehq/superplane/pkg/linter" "github.com/superplanehq/superplane/pkg/models" pb "github.com/superplanehq/superplane/pkg/protos/canvases" usagepb "github.com/superplanehq/superplane/pkg/protos/usage" @@ -90,6 +91,20 @@ func UpdateCanvasVersionWithUsage( return nil, err } + // Quality gate: lint the canvas after layout and log results. + // Runs in warn-only mode — does not block saves, so users can always + // save their work. Issues are surfaced via the lint badge and API. 
+ lintResult := linter.LintCanvas(nodes, edges, registry) + if lintResult.Summary.ErrorCount > 0 || lintResult.Summary.WarningCount > 0 { + log.WithFields(log.Fields{ + "canvas_id": canvasID, + "quality_score": lintResult.QualityScore, + "quality_grade": string(lintResult.QualityGrade), + "error_count": lintResult.Summary.ErrorCount, + "warning_count": lintResult.Summary.WarningCount, + }).Warn("Canvas has quality issues") + } + expandedNodes, err := expandNodes(organizationID, nodes) if err != nil { return nil, err diff --git a/pkg/linter/linter.go b/pkg/linter/linter.go new file mode 100644 index 0000000000..1fa195a2dc --- /dev/null +++ b/pkg/linter/linter.go @@ -0,0 +1,714 @@ +package linter + +import ( + "fmt" + "regexp" + "strings" + + "github.com/superplanehq/superplane/pkg/models" + "github.com/superplanehq/superplane/pkg/registry" +) + +// Severity indicates how critical a lint issue is. +type Severity string + +const ( + SeverityError Severity = "error" + SeverityWarning Severity = "warning" + SeverityInfo Severity = "info" +) + +// LintIssue represents a single problem detected during linting. +type LintIssue struct { + Severity Severity `json:"severity"` + Rule string `json:"rule"` + NodeID string `json:"nodeId"` + NodeName string `json:"nodeName"` + Message string `json:"message"` +} + +// QualityGrade represents an A-F quality rating. +type QualityGrade string + +const ( + GradeA QualityGrade = "A" + GradeB QualityGrade = "B" + GradeC QualityGrade = "C" + GradeD QualityGrade = "D" + GradeF QualityGrade = "F" +) + +// LintSummary provides aggregate counts of the lint results. +type LintSummary struct { + TotalNodes int `json:"totalNodes"` + TotalEdges int `json:"totalEdges"` + ErrorCount int `json:"errorCount"` + WarningCount int `json:"warningCount"` + InfoCount int `json:"infoCount"` +} + +// LintResult is the complete output of running the linter on a canvas. 
+type LintResult struct { + Status string `json:"status"` // "pass" or "fail" + Errors []LintIssue `json:"errors"` + Warnings []LintIssue `json:"warnings"` + Info []LintIssue `json:"info"` + Summary LintSummary `json:"summary"` + QualityScore int `json:"qualityScore"` // 0-100 + QualityGrade QualityGrade `json:"qualityGrade"` // A-F +} + +// computeQualityScore returns a score from 0-100 and a letter grade. +// Each error deducts 15 points (max 60 total), each warning deducts 5 (max 30), +// each info deducts 1 (max 10). This prevents scores from bottoming out too quickly. +func computeQualityScore(errors, warnings, info int) (int, QualityGrade) { + errorPenalty := errors * 15 + if errorPenalty > 60 { + errorPenalty = 60 + } + warningPenalty := warnings * 5 + if warningPenalty > 30 { + warningPenalty = 30 + } + infoPenalty := info * 1 + if infoPenalty > 10 { + infoPenalty = 10 + } + score := 100 - errorPenalty - warningPenalty - infoPenalty + if score < 0 { + score = 0 + } + + var grade QualityGrade + switch { + case score >= 90: + grade = GradeA + case score >= 75: + grade = GradeB + case score >= 60: + grade = GradeC + case score >= 40: + grade = GradeD + default: + grade = GradeF + } + + return score, grade +} + +// terminalComponents are components that naturally end a workflow and +// should not be flagged as dead-ends. 
+var terminalComponents = map[string]bool{ + "approval": true, + "slack.sendTextMessage": true, + "slack.waitForButtonClick": true, + "github.createIssue": true, + "github.createIssueComment": true, + "github.createRelease": true, + "github.updateIssue": true, + "github.publishCommitStatus": true, + "github.addReaction": true, + "pagerduty.createIncident": true, + "pagerduty.resolveIncident": true, + "pagerduty.escalateIncident": true, + "pagerduty.annotateIncident": true, + "pagerduty.acknowledgeIncident": true, +} + +// destructiveComponents are components that perform irreversible or +// high-impact actions and should require an upstream approval gate. +var destructiveComponents = map[string]bool{ + "pagerduty.resolveIncident": true, + "pagerduty.escalateIncident": true, + "github.deleteRelease": true, + "github.createRelease": true, +} + +// nodeRefDoubleQuotePattern matches $["Node Name"] references in expressions. +var nodeRefDoubleQuotePattern = regexp.MustCompile(`\$\["([^"]+)"\]`) + +// nodeRefSingleQuotePattern matches $['Node Name'] references in expressions. +var nodeRefSingleQuotePattern = regexp.MustCompile(`\$\['([^']+)'\]`) + +// LintCanvas performs static analysis on a canvas defined by nodes and edges. +// The registry parameter is accepted for future use and may be nil. +func LintCanvas(nodes []models.Node, edges []models.Edge, _ *registry.Registry) *LintResult { + result := &LintResult{ + Errors: []LintIssue{}, + Warnings: []LintIssue{}, + Info: []LintIssue{}, + } + + // Build lookup maps. 
+ nodeByID := make(map[string]models.Node, len(nodes)) + nodeByName := make(map[string]bool, len(nodes)) + outgoing := make(map[string][]models.Edge) + incoming := make(map[string][]models.Edge) + triggers := make([]models.Node, 0) + widgets := make(map[string]bool) + + for _, n := range nodes { + nodeByID[n.ID] = n + nodeByName[n.Name] = true + + if n.Type == "TYPE_TRIGGER" { + triggers = append(triggers, n) + } + if n.Type == "TYPE_WIDGET" { + widgets[n.ID] = true + } + } + + for _, e := range edges { + outgoing[e.SourceID] = append(outgoing[e.SourceID], e) + incoming[e.TargetID] = append(incoming[e.TargetID], e) + } + + // Run all rule checkers. + checkDuplicateNodes(nodes, result) + checkEdgeValidity(edges, nodeByID, widgets, result) + checkCycles(nodes, edges, widgets, result) + checkOrphanNodes(nodes, triggers, outgoing, widgets, result) + checkDeadEnds(nodes, outgoing, widgets, result) + checkMissingApprovalGate(nodes, incoming, nodeByID, widgets, result) + checkMissingRequiredConfig(nodes, incoming, result) + checkExpressionSyntax(nodes, nodeByName, widgets, result) + checkUnreachableBranches(nodes, outgoing, result) + + // Compute summary. + result.Summary = LintSummary{ + TotalNodes: len(nodes), + TotalEdges: len(edges), + ErrorCount: len(result.Errors), + WarningCount: len(result.Warnings), + InfoCount: len(result.Info), + } + + if len(result.Errors) > 0 { + result.Status = "fail" + } else { + result.Status = "pass" + } + + result.QualityScore, result.QualityGrade = computeQualityScore( + len(result.Errors), len(result.Warnings), len(result.Info), + ) + + return result +} + +// checkDuplicateNodes detects duplicate node IDs and duplicate node names. 
+func checkDuplicateNodes(nodes []models.Node, result *LintResult) { + seenIDs := make(map[string]bool, len(nodes)) + seenNames := make(map[string]bool, len(nodes)) + + for _, n := range nodes { + if seenIDs[n.ID] { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "duplicate-node-id", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Duplicate node ID %q", n.ID), + }) + } + seenIDs[n.ID] = true + + if n.Type == "TYPE_WIDGET" { + continue + } + if seenNames[n.Name] { + result.Warnings = append(result.Warnings, LintIssue{ + Severity: SeverityWarning, + Rule: "duplicate-node-name", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Duplicate node name %q — expression references may be ambiguous", n.Name), + }) + } + seenNames[n.Name] = true + } +} + +// checkEdgeValidity validates edges for dangling references, self-loops, +// duplicate edges, and edges involving widget nodes. +func checkEdgeValidity(edges []models.Edge, nodeByID map[string]models.Node, widgets map[string]bool, result *LintResult) { + type edgeKey struct{ src, tgt, ch string } + seen := make(map[edgeKey]bool, len(edges)) + + for i, e := range edges { + // Dangling source/target. + if _, ok := nodeByID[e.SourceID]; !ok { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "invalid-edge", + NodeID: e.SourceID, + Message: fmt.Sprintf("Edge %d references nonexistent source node %q", i, e.SourceID), + }) + } + if _, ok := nodeByID[e.TargetID]; !ok { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "invalid-edge", + NodeID: e.TargetID, + Message: fmt.Sprintf("Edge %d references nonexistent target node %q", i, e.TargetID), + }) + } + + // Self-loop. 
+ if e.SourceID == e.TargetID { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "invalid-edge", + NodeID: e.SourceID, + NodeName: nodeByID[e.SourceID].Name, + Message: fmt.Sprintf("Edge %d is a self-loop on node %q", i, e.SourceID), + }) + } + + // Duplicate edge. + key := edgeKey{e.SourceID, e.TargetID, e.Channel} + if seen[key] { + result.Warnings = append(result.Warnings, LintIssue{ + Severity: SeverityWarning, + Rule: "duplicate-edge", + NodeID: e.SourceID, + NodeName: nodeByID[e.SourceID].Name, + Message: fmt.Sprintf("Duplicate edge from %q to %q on channel %q", e.SourceID, e.TargetID, e.Channel), + }) + } + seen[key] = true + + // Widget as edge endpoint. + if widgets[e.SourceID] { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "invalid-edge", + NodeID: e.SourceID, + NodeName: nodeByID[e.SourceID].Name, + Message: fmt.Sprintf("Edge %d uses widget node %q as source", i, e.SourceID), + }) + } + if widgets[e.TargetID] { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "invalid-edge", + NodeID: e.TargetID, + NodeName: nodeByID[e.TargetID].Name, + Message: fmt.Sprintf("Edge %d uses widget node %q as target", i, e.TargetID), + }) + } + } +} + +// checkCycles detects cycles in the non-widget node graph using Kahn's algorithm. +func checkCycles(nodes []models.Node, edges []models.Edge, widgets map[string]bool, result *LintResult) { + // Build adjacency for non-widget nodes only. + inDegree := make(map[string]int) + adj := make(map[string][]string) + + for _, n := range nodes { + if widgets[n.ID] { + continue + } + inDegree[n.ID] = 0 + } + + for _, e := range edges { + if widgets[e.SourceID] || widgets[e.TargetID] { + continue + } + adj[e.SourceID] = append(adj[e.SourceID], e.TargetID) + inDegree[e.TargetID]++ + } + + // Kahn's: start from nodes with in-degree 0. 
+ queue := make([]string, 0) + for id, deg := range inDegree { + if deg == 0 { + queue = append(queue, id) + } + } + + visited := 0 + for len(queue) > 0 { + current := queue[0] + queue = queue[1:] + visited++ + + for _, next := range adj[current] { + inDegree[next]-- + if inDegree[next] == 0 { + queue = append(queue, next) + } + } + } + + totalNonWidget := 0 + for _, n := range nodes { + if !widgets[n.ID] { + totalNonWidget++ + } + } + + if visited < totalNonWidget { + // Find nodes that are part of cycles (those with remaining in-degree > 0). + var cycleNodes []string + for id, deg := range inDegree { + if deg > 0 { + cycleNodes = append(cycleNodes, id) + } + } + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "cycle-detected", + Message: fmt.Sprintf("Cycle detected involving %d node(s): %v", len(cycleNodes), cycleNodes), + }) + } +} + +// checkOrphanNodes finds non-widget nodes that are not reachable from any trigger via BFS. +func checkOrphanNodes( + nodes []models.Node, + triggers []models.Node, + outgoing map[string][]models.Edge, + widgets map[string]bool, + result *LintResult, +) { + reachable := make(map[string]bool) + + // BFS from all trigger nodes. 
+ queue := make([]string, 0, len(triggers)) + for _, t := range triggers { + queue = append(queue, t.ID) + reachable[t.ID] = true + } + + for len(queue) > 0 { + current := queue[0] + queue = queue[1:] + + for _, e := range outgoing[current] { + if !reachable[e.TargetID] { + reachable[e.TargetID] = true + queue = append(queue, e.TargetID) + } + } + } + + for _, n := range nodes { + if widgets[n.ID] { + continue + } + if !reachable[n.ID] { + result.Warnings = append(result.Warnings, LintIssue{ + Severity: SeverityWarning, + Rule: "orphan-node", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q is not reachable from any trigger", n.Name), + }) + } + } +} + +// checkDeadEnds finds non-widget, non-trigger nodes with no outgoing edges +// that are not known terminal components. +func checkDeadEnds( + nodes []models.Node, + outgoing map[string][]models.Edge, + widgets map[string]bool, + result *LintResult, +) { + for _, n := range nodes { + if widgets[n.ID] || n.Type == "TYPE_TRIGGER" { + continue + } + + if len(outgoing[n.ID]) > 0 { + continue + } + + compName := getComponentName(n) + if terminalComponents[compName] { + continue + } + + result.Warnings = append(result.Warnings, LintIssue{ + Severity: SeverityWarning, + Rule: "dead-end", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q has no outgoing edges and is not a terminal component", n.Name), + }) + } +} + +// checkMissingApprovalGate verifies that every destructive component has +// an upstream approval node reachable by walking backwards through edges. 
+func checkMissingApprovalGate( + nodes []models.Node, + incoming map[string][]models.Edge, + nodeByID map[string]models.Node, + widgets map[string]bool, + result *LintResult, +) { + for _, n := range nodes { + if widgets[n.ID] { + continue + } + + compName := getComponentName(n) + if !destructiveComponents[compName] { + continue + } + + if !hasUpstreamApproval(n.ID, incoming, nodeByID) { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "missing-approval-gate", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Destructive action %q in node %q has no upstream approval gate", compName, n.Name), + }) + } + } +} + +// hasUpstreamApproval does a reverse BFS from the given node looking for +// an approval component in its ancestors. +func hasUpstreamApproval( + startID string, + incoming map[string][]models.Edge, + nodeByID map[string]models.Node, +) bool { + visited := make(map[string]bool) + queue := []string{startID} + visited[startID] = true + + for len(queue) > 0 { + current := queue[0] + queue = queue[1:] + + for _, e := range incoming[current] { + if visited[e.SourceID] { + continue + } + visited[e.SourceID] = true + + source, ok := nodeByID[e.SourceID] + if !ok { + continue + } + + if getComponentName(source) == "approval" { + return true + } + + queue = append(queue, e.SourceID) + } + } + + return false +} + +// checkMissingRequiredConfig checks specific component types for +// required or recommended configuration fields. 
+func checkMissingRequiredConfig( + nodes []models.Node, + incoming map[string][]models.Edge, + result *LintResult, +) { + for _, n := range nodes { + config := n.Configuration + if config == nil { + config = map[string]any{} + } + + compName := getComponentName(n) + + switch compName { + case "claude.textPrompt": + prompt, _ := config["prompt"].(string) + if strings.TrimSpace(prompt) == "" { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "missing-required-config", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q (claude.textPrompt) is missing required \"prompt\" configuration", n.Name), + }) + } + + case "slack.sendTextMessage": + if _, ok := config["channel"]; !ok { + result.Warnings = append(result.Warnings, LintIssue{ + Severity: SeverityWarning, + Rule: "missing-required-config", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q (slack.sendTextMessage) is missing \"channel\" configuration", n.Name), + }) + } + + case "merge": + incomingCount := len(incoming[n.ID]) + if incomingCount < 2 { + result.Info = append(result.Info, LintIssue{ + Severity: SeverityInfo, + Rule: "missing-required-config", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q (merge) has %d incoming edge(s); merge typically expects 2 or more", n.Name, incomingCount), + }) + } + + case "filter": + expr, _ := config["expression"].(string) + if strings.TrimSpace(expr) == "" { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "missing-required-config", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q (filter) is missing required \"expression\" configuration", n.Name), + }) + } + + case "http": + if _, ok := config["url"]; !ok { + result.Warnings = append(result.Warnings, LintIssue{ + Severity: SeverityWarning, + Rule: "missing-required-config", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q (http) is missing \"url\" 
configuration", n.Name), + }) + } + } + } +} + +// checkExpressionSyntax scans all string values in every node's Configuration +// for unbalanced {{ }} delimiters and invalid $["Node Name"] references. +func checkExpressionSyntax( + nodes []models.Node, + nodeByName map[string]bool, + widgets map[string]bool, + result *LintResult, +) { + for _, n := range nodes { + if widgets[n.ID] { + continue + } + + config := n.Configuration + if config == nil { + continue + } + + for _, val := range collectStringValues(config) { + // Check balanced {{ }} delimiters. + openCount := strings.Count(val, "{{") + closeCount := strings.Count(val, "}}") + if openCount != closeCount { + result.Errors = append(result.Errors, LintIssue{ + Severity: SeverityError, + Rule: "invalid-expression", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q has unbalanced expression delimiters: %d opening '{{' vs %d closing '}}'", n.Name, openCount, closeCount), + }) + } + + // Check $["Node Name"] references point to real nodes. + // Use separate patterns for double-quoted and single-quoted + // to correctly handle node names containing the other quote type. + for _, pat := range []*regexp.Regexp{nodeRefDoubleQuotePattern, nodeRefSingleQuotePattern} { + matches := pat.FindAllStringSubmatch(val, -1) + for _, match := range matches { + refName := match[1] + if !nodeByName[refName] { + result.Warnings = append(result.Warnings, LintIssue{ + Severity: SeverityWarning, + Rule: "invalid-expression", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Node %q references unknown node %q", n.Name, refName), + }) + } + } + } + } + } +} + +// collectStringValues recursively extracts all string values from a map. 
+func collectStringValues(m map[string]any) []string { + if m == nil { + return nil + } + var result []string + for _, v := range m { + switch val := v.(type) { + case string: + result = append(result, val) + case map[string]any: + result = append(result, collectStringValues(val)...) + case []any: + for _, item := range val { + if s, ok := item.(string); ok { + result = append(result, s) + } + if sub, ok := item.(map[string]any); ok { + result = append(result, collectStringValues(sub)...) + } + } + } + } + return result +} + +// checkUnreachableBranches checks that filter components have at least one +// "default" channel outgoing edge, ensuring the matching path has somewhere to go. +func checkUnreachableBranches( + nodes []models.Node, + outgoing map[string][]models.Edge, + result *LintResult, +) { + for _, n := range nodes { + compName := getComponentName(n) + if compName != "filter" { + continue + } + + hasDefault := false + for _, e := range outgoing[n.ID] { + if e.Channel == "default" { + hasDefault = true + break + } + } + + if !hasDefault { + result.Info = append(result.Info, LintIssue{ + Severity: SeverityInfo, + Rule: "unreachable-branch", + NodeID: n.ID, + NodeName: n.Name, + Message: fmt.Sprintf("Filter node %q has no \"default\" channel outgoing edge; matched events have nowhere to go", n.Name), + }) + } + } +} + +// getComponentName returns the component or trigger name for a node. 
+func getComponentName(node models.Node) string { + if node.Ref.Component != nil { + return node.Ref.Component.Name + } + if node.Ref.Trigger != nil { + return node.Ref.Trigger.Name + } + return "" +} diff --git a/pkg/linter/linter_test.go b/pkg/linter/linter_test.go new file mode 100644 index 0000000000..84f05a4697 --- /dev/null +++ b/pkg/linter/linter_test.go @@ -0,0 +1,948 @@ +package linter + +import ( + "os" + "testing" + + "github.com/ghodss/yaml" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "github.com/superplanehq/superplane/pkg/models" +) + +// --------------------------------------------------------------------------- +// Test helpers +// --------------------------------------------------------------------------- + +func triggerNode(id, name string) models.Node { + return models.Node{ + ID: id, + Name: name, + Type: "TYPE_TRIGGER", + Ref: models.NodeRef{Trigger: &models.TriggerRef{Name: "pagerduty.onIncident"}}, + Configuration: map[string]any{}, + } +} + +func componentNode(id, name, componentName string, config map[string]any) models.Node { + return models.Node{ + ID: id, + Name: name, + Type: "TYPE_COMPONENT", + Ref: models.NodeRef{Component: &models.ComponentRef{Name: componentName}}, + Configuration: config, + } +} + +func widgetNode(id, name string) models.Node { + return models.Node{ + ID: id, + Name: name, + Type: "TYPE_WIDGET", + Ref: models.NodeRef{Widget: &models.WidgetRef{Name: "annotation"}}, + Configuration: map[string]any{}, + } +} + +func edge(src, tgt, channel string) models.Edge { + return models.Edge{SourceID: src, TargetID: tgt, Channel: channel} +} + +// countIssuesByRule returns the number of issues with the given rule name. 
+func countIssuesByRule(issues []LintIssue, rule string) int { + count := 0 + for _, i := range issues { + if i.Rule == rule { + count++ + } + } + return count +} + +// --------------------------------------------------------------------------- +// Original tests (existing rules) +// --------------------------------------------------------------------------- + +func TestLintCanvas_HealthyCanvas(t *testing.T) { + // Full valid flow: trigger -> filter -> 3 parallel -> merge -> claude -> slack -> approval + nodes := []models.Node{ + triggerNode("t1", "Listen for incidents"), + componentNode("f1", "Is it P1", "filter", map[string]any{ + "expression": `$["Listen for incidents"].data.priority == "P1"`, + }), + componentNode("c1", "Get deploy", "github.getRelease", nil), + componentNode("c2", "Get metrics", "http", map[string]any{"url": "https://api.example.com"}), + componentNode("c3", "Get logs", "pagerduty.listLogEntries", nil), + componentNode("m1", "Wait for all", "merge", nil), + componentNode("ai", "AI Assessment", "claude.textPrompt", map[string]any{ + "prompt": "Analyze the incident: {{ $[\"Listen for incidents\"].data.title }}", + }), + componentNode("sl", "Notify Slack", "slack.sendTextMessage", map[string]any{ + "channel": "#incidents", + }), + componentNode("ap", "Approve", "approval", nil), + } + edges := []models.Edge{ + edge("t1", "f1", "default"), + edge("f1", "c1", "default"), + edge("f1", "c2", "default"), + edge("f1", "c3", "default"), + edge("c1", "m1", "default"), + edge("c2", "m1", "default"), + edge("c3", "m1", "default"), + edge("m1", "ai", "success"), + edge("ai", "sl", "default"), + edge("sl", "ap", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "pass", result.Status) + assert.Empty(t, result.Errors) + assert.Equal(t, 9, result.Summary.TotalNodes) + assert.Equal(t, 10, result.Summary.TotalEdges) +} + +func TestLintCanvas_EmptyCanvas(t *testing.T) { + result := LintCanvas(nil, nil, nil) + + assert.Equal(t, "pass", 
result.Status) + assert.Empty(t, result.Errors) + assert.Empty(t, result.Warnings) + assert.Empty(t, result.Info) + assert.Equal(t, 0, result.Summary.TotalNodes) + assert.Equal(t, 0, result.Summary.TotalEdges) +} + +func TestLintCanvas_OrphanNode(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Connected", "http", map[string]any{"url": "https://example.com"}), + componentNode("orphan", "Orphaned Node", "http", map[string]any{"url": "https://example.com"}), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 1, countIssuesByRule(result.Warnings, "orphan-node")) + found := false + for _, w := range result.Warnings { + if w.Rule == "orphan-node" { + assert.Equal(t, "Orphaned Node", w.NodeName) + found = true + } + } + assert.True(t, found) +} + +func TestLintCanvas_DeadEnd(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Dead End Node", "http", map[string]any{"url": "https://example.com"}), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + deadEnds := countIssuesByRule(result.Warnings, "dead-end") + assert.Equal(t, 1, deadEnds) + + for _, w := range result.Warnings { + if w.Rule == "dead-end" { + assert.Equal(t, "Dead End Node", w.NodeName) + } + } +} + +func TestLintCanvas_DeadEnd_TerminalOK(t *testing.T) { + // All terminal components should not produce dead-end warnings. 
+ terminals := []struct { + name string + component string + }{ + {"Approve", "approval"}, + {"Slack", "slack.sendTextMessage"}, + {"Create Issue", "github.createIssue"}, + {"Create PD", "pagerduty.createIncident"}, + {"Resolve PD", "pagerduty.resolveIncident"}, + } + + for _, tc := range terminals { + t.Run(tc.component, func(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("term", tc.name, tc.component, nil), + } + edges := []models.Edge{ + edge("t1", "term", "default"), + } + + result := LintCanvas(nodes, edges, nil) + assert.Equal(t, 0, countIssuesByRule(result.Warnings, "dead-end")) + }) + } +} + +func TestLintCanvas_MissingApprovalGate(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("d1", "Resolve Incident", "pagerduty.resolveIncident", nil), + } + edges := []models.Edge{ + edge("t1", "d1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "fail", result.Status) + require.Equal(t, 1, countIssuesByRule(result.Errors, "missing-approval-gate")) + + for _, e := range result.Errors { + if e.Rule == "missing-approval-gate" { + assert.Equal(t, "Resolve Incident", e.NodeName) + assert.Contains(t, e.Message, "pagerduty.resolveIncident") + } + } +} + +func TestLintCanvas_ApprovalGatePresent(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("ap", "Approve First", "approval", nil), + componentNode("d1", "Resolve Incident", "pagerduty.resolveIncident", nil), + } + edges := []models.Edge{ + edge("t1", "ap", "default"), + edge("ap", "d1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 0, countIssuesByRule(result.Errors, "missing-approval-gate")) +} + +func TestLintCanvas_MissingConfig_EmptyPrompt(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("ai", "AI Node", "claude.textPrompt", map[string]any{ + "prompt": "", + }), + } + edges := []models.Edge{ + 
edge("t1", "ai", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + configErrors := 0 + for _, e := range result.Errors { + if e.Rule == "missing-required-config" && e.NodeName == "AI Node" { + configErrors++ + assert.Contains(t, e.Message, "prompt") + } + } + assert.Equal(t, 1, configErrors) +} + +func TestLintCanvas_MissingConfig_MergeSingleInput(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("m1", "Solo Merge", "merge", nil), + } + edges := []models.Edge{ + edge("t1", "m1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + mergeInfo := 0 + for _, i := range result.Info { + if i.Rule == "missing-required-config" && i.NodeName == "Solo Merge" { + mergeInfo++ + assert.Contains(t, i.Message, "1 incoming edge") + } + } + assert.Equal(t, 1, mergeInfo) +} + +func TestLintCanvas_MissingConfig_FilterNoExpression(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("f1", "Empty Filter", "filter", map[string]any{ + "expression": "", + }), + } + edges := []models.Edge{ + edge("t1", "f1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + filterErrors := 0 + for _, e := range result.Errors { + if e.Rule == "missing-required-config" && e.NodeName == "Empty Filter" { + filterErrors++ + assert.Contains(t, e.Message, "expression") + } + } + assert.Equal(t, 1, filterErrors) +} + +func TestLintCanvas_InvalidExpression_UnbalancedBraces(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Bad Expr", "http", map[string]any{ + "url": "{{ no closing", + }), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + exprErrors := 0 + for _, e := range result.Errors { + if e.Rule == "invalid-expression" { + exprErrors++ + assert.Contains(t, e.Message, "unbalanced") + } + } + assert.Equal(t, 1, exprErrors) +} + +func TestLintCanvas_InvalidExpression_BadNodeRef(t *testing.T) 
{ + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Bad Ref", "http", map[string]any{ + "url": `{{ $["Nonexistent Node"].data }}`, + }), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + refWarnings := 0 + for _, w := range result.Warnings { + if w.Rule == "invalid-expression" { + refWarnings++ + assert.Contains(t, w.Message, "Nonexistent Node") + } + } + assert.Equal(t, 1, refWarnings) +} + +func TestLintCanvas_ValidExpression(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Listen for incidents"), + componentNode("c1", "Use Data", "http", map[string]any{ + "url": `{{ $["Listen for incidents"].data.field }}`, + }), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 0, countIssuesByRule(result.Warnings, "invalid-expression")) + assert.Equal(t, 0, countIssuesByRule(result.Errors, "invalid-expression")) +} + +func TestLintCanvas_UnreachableBranch(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("f1", "Filter Without Default", "filter", map[string]any{ + "expression": "true", + }), + componentNode("c1", "On Match", "http", map[string]any{"url": "https://example.com"}), + } + edges := []models.Edge{ + edge("t1", "f1", "default"), + edge("f1", "c1", "match"), // not "default" + } + + result := LintCanvas(nodes, edges, nil) + + branchInfo := 0 + for _, i := range result.Info { + if i.Rule == "unreachable-branch" { + branchInfo++ + assert.Equal(t, "Filter Without Default", i.NodeName) + } + } + assert.Equal(t, 1, branchInfo) +} + +func TestLintCanvas_WidgetsIgnored(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Connected", "approval", nil), + widgetNode("w1", "My Annotation"), + widgetNode("w2", "Another Note"), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + // Widgets are not 
connected to anything — they should not produce warnings. + } + + result := LintCanvas(nodes, edges, nil) + + for _, w := range result.Warnings { + assert.NotEqual(t, "orphan-node", w.Rule, "widgets should not produce orphan-node warnings") + assert.NotEqual(t, "dead-end", w.Rule, "widgets should not produce dead-end warnings") + } +} + +// --------------------------------------------------------------------------- +// New tests for C1: Cycle detection +// --------------------------------------------------------------------------- + +func TestLintCanvas_CycleDetected(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("a", "Node A", "http", map[string]any{"url": "https://a.com"}), + componentNode("b", "Node B", "http", map[string]any{"url": "https://b.com"}), + componentNode("c", "Node C", "http", map[string]any{"url": "https://c.com"}), + } + edges := []models.Edge{ + edge("t1", "a", "default"), + edge("a", "b", "default"), + edge("b", "c", "default"), + edge("c", "a", "default"), // cycle: a -> b -> c -> a + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "fail", result.Status) + assert.Equal(t, 1, countIssuesByRule(result.Errors, "cycle-detected")) +} + +func TestLintCanvas_NoCycle(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("a", "Node A", "approval", nil), + } + edges := []models.Edge{ + edge("t1", "a", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 0, countIssuesByRule(result.Errors, "cycle-detected")) +} + +// --------------------------------------------------------------------------- +// New tests for C6: Duplicate node detection +// --------------------------------------------------------------------------- + +func TestLintCanvas_DuplicateNodeID(t *testing.T) { + nodes := []models.Node{ + triggerNode("dup", "Trigger One"), + componentNode("dup", "Trigger Two", "approval", nil), + } + edges := []models.Edge{ + edge("dup", 
"dup", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "fail", result.Status) + assert.GreaterOrEqual(t, countIssuesByRule(result.Errors, "duplicate-node-id"), 1) +} + +func TestLintCanvas_DuplicateNodeName(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Same Name"), + componentNode("c1", "Same Name", "approval", nil), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 1, countIssuesByRule(result.Warnings, "duplicate-node-name")) +} + +// --------------------------------------------------------------------------- +// New tests for C7: Edge validation +// --------------------------------------------------------------------------- + +func TestLintCanvas_DanglingEdge(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + } + edges := []models.Edge{ + edge("t1", "nonexistent", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "fail", result.Status) + assert.GreaterOrEqual(t, countIssuesByRule(result.Errors, "invalid-edge"), 1) +} + +func TestLintCanvas_SelfLoop(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Self Looper", "http", map[string]any{"url": "https://example.com"}), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + edge("c1", "c1", "default"), // self-loop + } + + result := LintCanvas(nodes, edges, nil) + + assert.GreaterOrEqual(t, countIssuesByRule(result.Errors, "invalid-edge"), 1) + found := false + for _, e := range result.Errors { + if e.Rule == "invalid-edge" && e.NodeID == "c1" { + assert.Contains(t, e.Message, "self-loop") + found = true + } + } + assert.True(t, found) +} + +func TestLintCanvas_DuplicateEdge(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Target", "approval", nil), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + edge("t1", "c1", 
"default"), // duplicate + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 1, countIssuesByRule(result.Warnings, "duplicate-edge")) +} + +func TestLintCanvas_WidgetAsEdgeEndpoint(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + widgetNode("w1", "Annotation"), + } + edges := []models.Edge{ + edge("t1", "w1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.GreaterOrEqual(t, countIssuesByRule(result.Errors, "invalid-edge"), 1) +} + +// --------------------------------------------------------------------------- +// New tests for C9: Multiple destructive components +// --------------------------------------------------------------------------- + +func TestLintCanvas_MultipleDestructiveComponents_SingleApproval(t *testing.T) { + // One approval should cover all downstream destructive actions. + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("ap", "Approve", "approval", nil), + componentNode("d1", "Resolve", "pagerduty.resolveIncident", nil), + componentNode("d2", "Escalate", "pagerduty.escalateIncident", nil), + } + edges := []models.Edge{ + edge("t1", "ap", "default"), + edge("ap", "d1", "default"), + edge("d1", "d2", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 0, countIssuesByRule(result.Errors, "missing-approval-gate"), + "single upstream approval should satisfy both destructive nodes") +} + +func TestLintCanvas_MultipleDestructiveComponents_OneWithout(t *testing.T) { + // One destructive action has approval, the other does not. 
+ nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("ap", "Approve", "approval", nil), + componentNode("d1", "Resolve", "pagerduty.resolveIncident", nil), + componentNode("d2", "Delete Release", "github.deleteRelease", nil), + } + edges := []models.Edge{ + edge("t1", "ap", "default"), + edge("ap", "d1", "default"), + edge("t1", "d2", "default"), // d2 bypasses approval + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 1, countIssuesByRule(result.Errors, "missing-approval-gate")) + for _, e := range result.Errors { + if e.Rule == "missing-approval-gate" { + assert.Equal(t, "Delete Release", e.NodeName) + } + } +} + +// --------------------------------------------------------------------------- +// New test for C10: Nil Configuration map +// --------------------------------------------------------------------------- + +func TestLintCanvas_NilConfiguration(t *testing.T) { + // Nodes with nil Configuration should not panic. + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + { + ID: "nil-config", + Name: "Nil Config Claude", + Type: "TYPE_COMPONENT", + Ref: models.NodeRef{Component: &models.ComponentRef{Name: "claude.textPrompt"}}, + Configuration: nil, // explicitly nil + }, + } + edges := []models.Edge{ + edge("t1", "nil-config", "default"), + } + + // Should not panic. + result := LintCanvas(nodes, edges, nil) + + // Should report missing prompt config error. 
+ assert.Equal(t, "fail", result.Status) + configErrors := 0 + for _, e := range result.Errors { + if e.Rule == "missing-required-config" && e.NodeName == "Nil Config Claude" { + configErrors++ + } + } + assert.Equal(t, 1, configErrors) +} + +// --------------------------------------------------------------------------- +// New test for C11: Deeply nested configuration values +// --------------------------------------------------------------------------- + +func TestLintCanvas_NestedConfigExpression(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Listen for incidents"), + componentNode("c1", "HTTP with headers", "http", map[string]any{ + "url": "https://api.example.com", + "headers": map[string]any{ + "Authorization": `Bearer {{ $["Listen for incidents"].data.token }}`, + }, + }), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + // Should find the valid reference in nested config — no warnings. + assert.Equal(t, 0, countIssuesByRule(result.Warnings, "invalid-expression")) +} + +func TestLintCanvas_NestedConfigBadRef(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "HTTP nested bad ref", "http", map[string]any{ + "url": "https://api.example.com", + "headers": map[string]any{ + "X-Custom": `{{ $["Ghost Node"].data.value }}`, + }, + }), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 1, countIssuesByRule(result.Warnings, "invalid-expression")) +} + +// --------------------------------------------------------------------------- +// New test for C3: Expression regex with quotes in node names +// --------------------------------------------------------------------------- + +func TestLintCanvas_ExpressionSingleQuoteRef(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "Node's Data", "http", map[string]any{"url": 
"https://example.com"}), + componentNode("c2", "Consumer", "http", map[string]any{ + // Double-quote reference to a node name containing a single quote + "url": `{{ $["Node's Data"].data.field }}`, + }), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + edge("c1", "c2", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + // Should correctly parse the double-quoted reference containing a single quote. + assert.Equal(t, 0, countIssuesByRule(result.Warnings, "invalid-expression")) +} + +// --------------------------------------------------------------------------- +// YAML parsing types for the dogfood test +// --------------------------------------------------------------------------- + +type canvasYAML struct { + Spec struct { + Nodes []nodeYAML `json:"nodes"` + Edges []edgeYAML `json:"edges"` + } `json:"spec"` +} + +type nodeYAML struct { + ID string `json:"id"` + Name string `json:"name"` + Type string `json:"type"` + Configuration map[string]any `json:"configuration"` + Component *struct { + Name string `json:"name"` + } `json:"component"` + Trigger *struct { + Name string `json:"name"` + } `json:"trigger"` + Widget *struct { + Name string `json:"name"` + } `json:"widget"` + Blueprint *struct { + ID string `json:"id"` + } `json:"blueprint"` +} + +type edgeYAML struct { + SourceID string `json:"sourceId"` + TargetID string `json:"targetId"` + Channel string `json:"channel"` +} + +// C8 fix: dogfood test now asserts specific expected warnings and rejects unexpected ones. +func TestLintCanvas_IncidentCopilotTemplate(t *testing.T) { + data, err := os.ReadFile("../../templates/canvases/incident-copilot.yaml") + require.NoError(t, err, "failed to read incident-copilot.yaml template") + + var canvas canvasYAML + err = yaml.Unmarshal(data, &canvas) + require.NoError(t, err, "failed to parse incident-copilot.yaml") + + // Convert YAML nodes to models.Node. 
+ nodes := make([]models.Node, 0, len(canvas.Spec.Nodes)) + for _, yn := range canvas.Spec.Nodes { + n := models.Node{ + ID: yn.ID, + Name: yn.Name, + Type: yn.Type, + Configuration: yn.Configuration, + } + if n.Configuration == nil { + n.Configuration = map[string]any{} + } + if yn.Component != nil { + n.Ref.Component = &models.ComponentRef{Name: yn.Component.Name} + } + if yn.Trigger != nil { + n.Ref.Trigger = &models.TriggerRef{Name: yn.Trigger.Name} + } + if yn.Widget != nil { + n.Ref.Widget = &models.WidgetRef{Name: yn.Widget.Name} + } + if yn.Blueprint != nil { + n.Ref.Blueprint = &models.BlueprintRef{ID: yn.Blueprint.ID} + } + nodes = append(nodes, n) + } + + // Convert YAML edges to models.Edge. + edges := make([]models.Edge, 0, len(canvas.Spec.Edges)) + for _, ye := range canvas.Spec.Edges { + edges = append(edges, models.Edge{ + SourceID: ye.SourceID, + TargetID: ye.TargetID, + Channel: ye.Channel, + }) + } + + result := LintCanvas(nodes, edges, nil) + + // The incident-copilot template should pass the linter with zero errors. + assert.Equal(t, "pass", result.Status, "incident-copilot template should pass lint") + assert.Empty(t, result.Errors, "incident-copilot template should have zero errors") + + // Verify we actually parsed a non-trivial canvas. + assert.Greater(t, result.Summary.TotalNodes, 5, "should have parsed multiple nodes") + assert.Greater(t, result.Summary.TotalEdges, 5, "should have parsed multiple edges") + + // With channel configured, there should be no warnings. + assert.Empty(t, result.Warnings, "copilot template should have zero warnings with channel configured") + + // Assert no orphan nodes, no dead ends, no cycles. 
+ assert.Equal(t, 0, countIssuesByRule(result.Warnings, "orphan-node"), "no orphan nodes expected") + assert.Equal(t, 0, countIssuesByRule(result.Warnings, "dead-end"), "no dead ends expected") + assert.Equal(t, 0, countIssuesByRule(result.Errors, "cycle-detected"), "no cycles expected") + + // Assert info section is reasonable. + for _, info := range result.Info { + t.Logf("INFO: [%s] %s: %s", info.Rule, info.NodeName, info.Message) + } + + // Quality score assertions. + assert.GreaterOrEqual(t, result.QualityScore, 90, "copilot template should score >= 90") + assert.Equal(t, GradeA, result.QualityGrade, "copilot template should be grade A") + t.Logf("Quality: score=%d grade=%s", result.QualityScore, result.QualityGrade) +} + +// --------------------------------------------------------------------------- +// Quality scoring tests +// --------------------------------------------------------------------------- + +func TestQualityScore_Perfect(t *testing.T) { + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("c1", "End", "approval", nil), + } + edges := []models.Edge{ + edge("t1", "c1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, 100, result.QualityScore) + assert.Equal(t, GradeA, result.QualityGrade) +} + +func TestQualityScore_WithErrors(t *testing.T) { + // Destructive component without approval = 1 error. + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("d1", "Resolve", "pagerduty.resolveIncident", nil), + } + edges := []models.Edge{ + edge("t1", "d1", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "fail", result.Status) + // 1 error = -15 points -> score 85, grade B + assert.Equal(t, 85, result.QualityScore) + assert.Equal(t, GradeB, result.QualityGrade) +} + +func TestQualityScore_ManyIssues(t *testing.T) { + // 4 errors (missing-approval-gate) + 1 warning (dead-end on github.deleteRelease, + // which is not in terminalComponents). 
+ // Error penalty: 4*15=60, capped at 60. Warning penalty: 1*5=5. Total: 65. + // Score: 100-65=35, grade F. + nodes := []models.Node{ + triggerNode("t1", "Trigger"), + componentNode("d1", "Resolve", "pagerduty.resolveIncident", nil), + componentNode("d2", "Escalate", "pagerduty.escalateIncident", nil), + componentNode("d3", "Delete", "github.deleteRelease", nil), + componentNode("d4", "Release", "github.createRelease", nil), + } + edges := []models.Edge{ + edge("t1", "d1", "default"), + edge("t1", "d2", "default"), + edge("t1", "d3", "default"), + edge("t1", "d4", "default"), + } + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "fail", result.Status) + assert.LessOrEqual(t, result.QualityScore, 40, "many issues should produce low score") + assert.GreaterOrEqual(t, result.Summary.ErrorCount, 4) +} + +// --------------------------------------------------------------------------- +// Dogfood tests for existing templates +// --------------------------------------------------------------------------- + +func loadTemplateForTest(t *testing.T, path string) ([]models.Node, []models.Edge) { + t.Helper() + data, err := os.ReadFile(path) + require.NoError(t, err, "failed to read template: %s", path) + + var canvas canvasYAML + err = yaml.Unmarshal(data, &canvas) + require.NoError(t, err, "failed to parse template: %s", path) + + nodes := make([]models.Node, 0, len(canvas.Spec.Nodes)) + for _, yn := range canvas.Spec.Nodes { + n := models.Node{ + ID: yn.ID, + Name: yn.Name, + Type: yn.Type, + Configuration: yn.Configuration, + } + if n.Configuration == nil { + n.Configuration = map[string]any{} + } + if yn.Component != nil { + n.Ref.Component = &models.ComponentRef{Name: yn.Component.Name} + } + if yn.Trigger != nil { + n.Ref.Trigger = &models.TriggerRef{Name: yn.Trigger.Name} + } + if yn.Widget != nil { + n.Ref.Widget = &models.WidgetRef{Name: yn.Widget.Name} + } + if yn.Blueprint != nil { + n.Ref.Blueprint = &models.BlueprintRef{ID: yn.Blueprint.ID} + } + 
nodes = append(nodes, n) + } + + edges := make([]models.Edge, 0, len(canvas.Spec.Edges)) + for _, ye := range canvas.Spec.Edges { + edges = append(edges, models.Edge{ + SourceID: ye.SourceID, + TargetID: ye.TargetID, + Channel: ye.Channel, + }) + } + + return nodes, edges +} + +func TestLintCanvas_IncidentDataCollectionTemplate(t *testing.T) { + nodes, edges := loadTemplateForTest(t, "../../templates/canvases/incident-data-collection.yaml") + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "pass", result.Status, "incident-data-collection template should pass lint") + assert.Empty(t, result.Errors, "incident-data-collection template should have zero errors") + assert.Greater(t, result.Summary.TotalNodes, 3, "should have parsed multiple nodes") + assert.Equal(t, 0, countIssuesByRule(result.Warnings, "orphan-node")) + assert.Equal(t, 0, countIssuesByRule(result.Errors, "cycle-detected")) + assert.Equal(t, GradeA, result.QualityGrade, "incident-data-collection should be grade A") + + t.Logf("Quality: score=%d grade=%s errors=%d warnings=%d", + result.QualityScore, result.QualityGrade, result.Summary.ErrorCount, result.Summary.WarningCount) +} + +func TestLintCanvas_IncidentRouterTemplate(t *testing.T) { + nodes, edges := loadTemplateForTest(t, "../../templates/canvases/incident-router.yaml") + + result := LintCanvas(nodes, edges, nil) + + assert.Equal(t, "pass", result.Status, "incident-router template should pass lint") + assert.Empty(t, result.Errors, "incident-router template should have zero errors") + assert.Greater(t, result.Summary.TotalNodes, 3, "should have parsed multiple nodes") + assert.Equal(t, 0, countIssuesByRule(result.Warnings, "orphan-node")) + assert.Equal(t, 0, countIssuesByRule(result.Errors, "cycle-detected")) + assert.Equal(t, GradeA, result.QualityGrade, "incident-router should be grade A") + + t.Logf("Quality: score=%d grade=%s errors=%d warnings=%d", + result.QualityScore, result.QualityGrade, result.Summary.ErrorCount, 
result.Summary.WarningCount) +} diff --git a/pkg/public/server.go b/pkg/public/server.go index 014362143e..f8a11de058 100644 --- a/pkg/public/server.go +++ b/pkg/public/server.go @@ -24,6 +24,7 @@ import ( "github.com/superplanehq/superplane/pkg/core" "github.com/superplanehq/superplane/pkg/database" "github.com/superplanehq/superplane/pkg/grpc" + "github.com/superplanehq/superplane/pkg/grpc/actions/canvases" "github.com/superplanehq/superplane/pkg/grpc/actions/messages" "github.com/superplanehq/superplane/pkg/jwt" "github.com/superplanehq/superplane/pkg/logging" @@ -259,6 +260,22 @@ func (s *Server) RegisterGRPCGateway(grpcServerAddr string) error { w.WriteHeader(http.StatusOK) }).Methods("GET") + // Canvas lint endpoint — quality gate validation. + // Registered before the gRPC gateway catch-all so it takes priority. + lintHandler := canvases.LintCanvasHandler(s.registry) + orgLintMiddleware := middleware.OrganizationAuthMiddleware(s.jwt) + s.Router.HandleFunc("/api/v1/canvases/{canvasId}/lint", func(w http.ResponseWriter, r *http.Request) { + orgLintMiddleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + user, ok := middleware.GetUserFromContext(r.Context()) + if !ok { + http.Error(w, "User not found in context", http.StatusUnauthorized) + return + } + r.Header.Set("X-Organization-Id", user.OrganizationID.String()) + lintHandler(w, r) + })).ServeHTTP(w, r) + }).Methods("POST") + // Protect the gRPC gateway routes with organization authentication orgAuthMiddleware := middleware.OrganizationAuthMiddleware(s.jwt) protectedGRPCHandler := orgAuthMiddleware(s.grpcGatewayHandler(grpcGatewayMux)) diff --git a/templates/canvases/incident-copilot-demo.yaml b/templates/canvases/incident-copilot-demo.yaml new file mode 100644 index 0000000000..9be3f0df27 --- /dev/null +++ b/templates/canvases/incident-copilot-demo.yaml @@ -0,0 +1,167 @@ +metadata: + name: "Incident Copilot Demo" + description: "Simplified demo: Manual trigger with mock incident data → 
Claude AI triage → Slack evidence pack. Click Run to fire." + isTemplate: false +spec: + nodes: + - id: "start-start-dm01" + name: "Run with mock incident" + type: "TYPE_TRIGGER" + configuration: + templates: + - name: "P1 API Gateway Incident" + payload: + incident: + id: "PGR0VU2" + title: "API Gateway: 5xx error rate spike to 15% on /api/v1/orders" + status: "triggered" + urgency: "high" + html_url: "https://acme.pagerduty.com/incidents/PGR0VU2" + created_at: "2026-03-28T14:30:00Z" + priority: + summary: "P1" + service: + summary: "API Gateway (Production)" + assignees: + - summary: "Dragan Petrovic (On-Call SRE)" + body: + details: "5xx error rate spiked from 0.1% to 15.3% at 14:28 UTC. Affects /api/v1/orders endpoint. 1,247 users impacted. Correlated with deployment deploy-api-v2.14.3 at 14:25 UTC." + recent_deploy: + tag_name: "v2.14.3" + name: "Release v2.14.3 - Order Service Refactor" + body: "Refactored order validation logic. Migrated to new payment gateway client. Updated database connection pooling from 20 to 50." + published_at: "2026-03-28T14:25:30Z" + metrics: + error_rate_5xx: "15.3%" + latency_p99: "4500ms" + request_count_drop: "from 15234 to 6721" + metadata: null + position: + x: 200 + "y": 400 + component: null + blueprint: null + trigger: + name: "start" + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-ai01" + name: "AI Triage Assessment" + type: "TYPE_COMPONENT" + configuration: + model: "claude-sonnet-4-20250514" + maxTokens: 4096 + temperature: 0.3 + systemMessage: "You are an expert SRE incident triage assistant. Produce a structured triage report with: 1) SEVERITY ASSESSMENT (P1-P4 with justification), 2) LIKELY ROOT CAUSE (top 3 hypotheses ranked by probability), 3) AFFECTED SYSTEMS, 4) RECOMMENDED ACTIONS (ordered by priority), 5) ESCALATION RECOMMENDATION. Be concise. Use bullet points." 
+ prompt: "Analyze the following production incident data and provide a structured triage report.\n\nFull incident context:\n{{ toJSON(root()) }}" + metadata: null + position: + x: 700 + "y": 400 + component: + name: "claude.textPrompt" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-sl01" + name: "Send to Slack" + type: "TYPE_COMPONENT" + configuration: + channel: "C0APV7H889F" + text: ":rotating_light: *INCIDENT TRIAGE — AUTO-GENERATED*\n\n{{ $[\"AI Triage Assessment\"].data.text }}\n\n---\n:clock1: _Triage generated by SuperPlane Incident Copilot_" + metadata: null + position: + x: 1200 + "y": 400 + component: + name: "slack.sendTextMessage" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "annotation-annotation-dm1a" + name: "annotation" + type: "TYPE_WIDGET" + configuration: + color: "yellow" + height: 200 + text: "### 1. Click Run\n\nThe Manual Run trigger has a pre-loaded P1 incident payload with mock PagerDuty, GitHub deploy, and Datadog metrics data.\n\nClick the **Run** button to fire the workflow." + width: 400 + metadata: null + position: + x: 150 + "y": 130 + component: null + blueprint: null + trigger: null + widget: + name: "annotation" + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "annotation-annotation-dm2b" + name: "annotation2" + type: "TYPE_WIDGET" + configuration: + color: "yellow" + height: 200 + text: "### 2. AI analyzes the incident\n\nClaude receives all incident context and produces a structured severity assessment with root cause hypotheses and recommended actions." 
+ width: 400 + metadata: null + position: + x: 650 + "y": 130 + component: null + blueprint: null + trigger: null + widget: + name: "annotation" + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "annotation-annotation-dm3c" + name: "annotation3" + type: "TYPE_WIDGET" + configuration: + color: "yellow" + height: 200 + text: "### 3. Evidence pack to Slack\n\nThe AI triage report is posted to #hackathon-demo as a structured evidence pack with severity, root cause, and recommended actions." + width: 400 + metadata: null + position: + x: 1150 + "y": 130 + component: null + blueprint: null + trigger: null + widget: + name: "annotation" + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + edges: + - sourceId: "start-start-dm01" + targetId: "component-node-ai01" + channel: "default" + - sourceId: "component-node-ai01" + targetId: "component-node-sl01" + channel: "default" diff --git a/templates/canvases/incident-copilot.yaml b/templates/canvases/incident-copilot.yaml new file mode 100644 index 0000000000..8e7ef68790 --- /dev/null +++ b/templates/canvases/incident-copilot.yaml @@ -0,0 +1,321 @@ +metadata: + name: "Incident Copilot" + description: "AI-powered incident triage: auto-collects context from PagerDuty, GitHub, and Datadog, then uses Claude to generate a structured severity assessment and evidence pack." 
+ isTemplate: false +spec: + nodes: + - id: "pagerduty-onincident-pagerduty-onincident-hk8x3p" + name: "Listen for incidents" + type: "TYPE_TRIGGER" + configuration: + events: + - "incident.triggered" + service: "PQEAM2I" + urgencies: + - "high" + metadata: + service: + html_url: "https://superplane-test.eu.pagerduty.com/service-directory/PQEAM2I" + id: "PQEAM2I" + name: "Default Service" + position: + x: 200 + "y": 500 + component: null + blueprint: null + trigger: + name: "pagerduty.onIncident" + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-f1lt3r" + name: "Is it P1 or P2" + type: "TYPE_COMPONENT" + configuration: + expression: "$[\"Listen for incidents\"].data.incident.priority.summary == \"P1\" || $[\"Listen for incidents\"].data.incident.priority.summary == \"P2\"" + metadata: null + position: + x: 700 + "y": 500 + component: + name: "filter" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-gh8r3l" + name: "Get latest deploy" + type: "TYPE_COMPONENT" + configuration: + repository: "api-service" + metadata: + repository: + id: 1046188046 + name: "api-service" + url: "https://github.com/acme-corp/api-service" + position: + x: 1200 + "y": 350 + component: + name: "github.getRelease" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-dd8m3t" + name: "Fetch Datadog metrics" + type: "TYPE_COMPONENT" + configuration: + method: "GET" + url: "https://api.datadoghq.com/api/v1/query?query=avg:system.cpu.user{service:api-gateway}&from=-3600" + headers: + DD-API-KEY: "{{ $secret.DATADOG_API_KEY }}" + metadata: null + position: + x: 1200 + "y": 500 + component: + name: "http" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + 
warningMessage: "" + + - id: "component-node-pd8l0g" + name: "Get incident timeline" + type: "TYPE_COMPONENT" + configuration: + incidentId: "{{ $[\"Listen for incidents\"].data.incident.id }}" + metadata: null + position: + x: 1200 + "y": 650 + component: + name: "pagerduty.listLogEntries" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "merge-merge-m3rg3x" + name: "Wait for all context" + type: "TYPE_COMPONENT" + configuration: + enableStopIf: false + enableTimeout: true + executionTimeout: + unit: "minutes" + value: 2 + metadata: null + position: + x: 1700 + "y": 500 + component: + name: "merge" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-cl8ai9" + name: "AI Triage Assessment" + type: "TYPE_COMPONENT" + configuration: + model: "claude-sonnet-4-20250514" + maxTokens: 4096 + temperature: 0.3 + systemMessage: "You are an expert SRE incident triage assistant. Given incident details, recent deployments, metrics, and logs, produce a structured triage report with:\n\n1. SEVERITY ASSESSMENT (P1-P4 with justification)\n2. LIKELY ROOT CAUSE (top 3 hypotheses ranked by probability)\n3. AFFECTED SYSTEMS (services, endpoints, user segments)\n4. RECOMMENDED ACTIONS (ordered by priority, with estimated impact)\n5. ESCALATION RECOMMENDATION (who to page, what team)\n\nBe concise. Use bullet points. Include specific evidence for each claim." 
+ prompt: "INCIDENT:\nTitle: {{ $[\"Listen for incidents\"].data.incident.title }}\nStatus: {{ $[\"Listen for incidents\"].data.incident.status }}\nUrgency: {{ $[\"Listen for incidents\"].data.incident.urgency }}\nPriority: {{ $[\"Listen for incidents\"].data.incident.priority.summary }}\nService: {{ $[\"Listen for incidents\"].data.incident.service.summary }}\n\nRECENT DEPLOYMENT:\n{{ $[\"Get latest deploy\"].data }}\n\nMETRICS:\n{{ $[\"Fetch Datadog metrics\"].data }}\n\nINCIDENT LOG:\n{{ $[\"Get incident timeline\"].data }}" + metadata: null + position: + x: 2200 + "y": 500 + component: + name: "claude.textPrompt" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-sl8ck1" + name: "Send evidence pack to Slack" + type: "TYPE_COMPONENT" + configuration: + channel: "C0APV7H889F" + text: ":rotating_light: *INCIDENT TRIAGE — AUTO-GENERATED*\n\n*{{ $[\"Listen for incidents\"].data.incident.title }}*\nPriority: {{ $[\"Listen for incidents\"].data.incident.priority.summary }}\nService: {{ $[\"Listen for incidents\"].data.incident.service.summary }}\n\n---\n\n{{ $[\"AI Triage Assessment\"].data.text }}\n\n---\n\n_Triage generated by SuperPlane Incident Copilot in < 60 seconds_" + metadata: null + position: + x: 2700 + "y": 500 + component: + name: "slack.sendTextMessage" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "component-node-apr0v1" + name: "Approve remediation" + type: "TYPE_COMPONENT" + configuration: + items: + - type: "anyone" + metadata: null + position: + x: 3200 + "y": 500 + component: + name: "approval" + blueprint: null + trigger: null + widget: null + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "annotation-annotation-hk1a1a" + name: "annotation" + type: "TYPE_WIDGET" + configuration: + color: "yellow" + height: 250 
+ text: "### 1. Listen for production incidents\n\nThe PagerDuty trigger listens for new incidents. A filter ensures only P1 and P2 incidents proceed.\n\n___\nTo use this template:\n- Connect your PagerDuty account\n- Configure the service to monitor\n- Set urgency filters as needed" + width: 460 + metadata: null + position: + x: 150 + "y": 180 + component: null + blueprint: null + trigger: null + widget: + name: "annotation" + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "annotation-annotation2-hk2b2b" + name: "annotation2" + type: "TYPE_WIDGET" + configuration: + color: "yellow" + height: 250 + text: "### 2. Collect context in parallel\n\nThree parallel branches fetch:\n- Latest deploy from GitHub\n- System metrics from Datadog\n- Incident timeline from PagerDuty\n\nA Merge component waits for all three to complete.\n\n___\nConfigure each data source with your credentials and endpoints." + width: 600 + metadata: null + position: + x: 1150 + "y": 100 + component: null + blueprint: null + trigger: null + widget: + name: "annotation" + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "annotation-annotation3-hk3c3c" + name: "annotation3" + type: "TYPE_WIDGET" + configuration: + color: "yellow" + height: 220 + text: "### 3. AI-powered triage\n\nClaude receives all collected context and produces a structured severity assessment with root cause hypotheses and recommended actions.\n\n___\n- Review the system prompt to customize triage format\n- Adjust temperature for more/less creative analysis" + width: 500 + metadata: null + position: + x: 2150 + "y": 200 + component: null + blueprint: null + trigger: null + widget: + name: "annotation" + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + - id: "annotation-annotation4-hk4d4d" + name: "annotation4" + type: "TYPE_WIDGET" + configuration: + color: "yellow" + height: 220 + text: "### 4. 
Notify and approve\n\nThe AI triage report is posted to Slack as an evidence pack. An approval gate prevents any automated remediation actions until a human reviews and approves.\n\n___\n- Set the Slack channel for notifications\n- Configure approval requirements" + width: 560 + metadata: null + position: + x: 2650 + "y": 200 + component: null + blueprint: null + trigger: null + widget: + name: "annotation" + isCollapsed: false + integration: null + errorMessage: "" + warningMessage: "" + + edges: + - sourceId: "pagerduty-onincident-pagerduty-onincident-hk8x3p" + targetId: "component-node-f1lt3r" + channel: "default" + - sourceId: "component-node-f1lt3r" + targetId: "component-node-gh8r3l" + channel: "default" + - sourceId: "component-node-f1lt3r" + targetId: "component-node-dd8m3t" + channel: "default" + - sourceId: "component-node-f1lt3r" + targetId: "component-node-pd8l0g" + channel: "default" + - sourceId: "component-node-gh8r3l" + targetId: "merge-merge-m3rg3x" + channel: "default" + - sourceId: "component-node-dd8m3t" + targetId: "merge-merge-m3rg3x" + channel: "default" + - sourceId: "component-node-pd8l0g" + targetId: "merge-merge-m3rg3x" + channel: "default" + - sourceId: "merge-merge-m3rg3x" + targetId: "component-node-cl8ai9" + channel: "success" + - sourceId: "component-node-cl8ai9" + targetId: "component-node-sl8ck1" + channel: "default" + - sourceId: "component-node-sl8ck1" + targetId: "component-node-apr0v1" + channel: "default" diff --git a/web_src/src/ui/CanvasPage/Header.tsx b/web_src/src/ui/CanvasPage/Header.tsx index a3fe909fe8..242c3f3f8a 100644 --- a/web_src/src/ui/CanvasPage/Header.tsx +++ b/web_src/src/ui/CanvasPage/Header.tsx @@ -2,6 +2,7 @@ import { OrganizationMenuButton } from "@/components/OrganizationMenuButton"; import { PermissionTooltip } from "@/components/PermissionGate"; import { usePermissions } from "@/contexts/PermissionsContext"; import { + CheckCircle2, CloudAlert, CloudCheck, Copy, @@ -11,14 +12,18 @@ import { Plus, 
RefreshCw, RotateCcw, + ShieldAlert, + TriangleAlert, Undo2, Pencil, } from "lucide-react"; +import type { ComponentsNode } from "@/api-client"; +import { lintCanvas, type LintResult, type LintEdge } from "@/utils/canvasLinter"; import { Button } from "../button"; import { Button as UIButton } from "@/components/ui/button"; import { useCanvases } from "@/hooks/useCanvasData"; import { Link, useParams } from "react-router-dom"; -import { useEffect, useRef, useState, type ReactNode } from "react"; +import { useEffect, useMemo, useRef, useState, type ReactNode } from "react"; import { cn } from "@/lib/utils"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { DropdownMenu, DropdownMenuContent, DropdownMenuTrigger } from "@/ui/dropdownMenu"; @@ -77,6 +82,10 @@ interface HeaderProps { lastSavedAt?: Date | string | null; /** Shown in tooltip when saveState is error (last failed save message). */ saveErrorMessage?: string | null; + /** Canvas spec nodes for linter badge. */ + workflowNodes?: ComponentsNode[]; + /** Canvas spec edges for linter badge. */ + workflowEdges?: LintEdge[]; } function formatLastSavedTooltip(at: Date | string | null | undefined): string { @@ -246,8 +255,17 @@ export function Header({ enterEditModeDisabled, enterEditModeDisabledTooltip, unpublishedDraftChangeCount = 0, + workflowNodes, + workflowEdges, }: HeaderProps) { const { workflowId } = useParams<{ workflowId?: string }>(); + + // Canvas linter — runs on every node/edge change. + // Shows the badge whenever the canvas view is active, even on empty canvases. 
+ const lintResult = useMemo(() => { + if (workflowNodes === undefined && workflowEdges === undefined) return null; + return lintCanvas(workflowNodes || [], workflowEdges || []); + }, [workflowNodes, workflowEdges]); const { data: workflows = [], isLoading: workflowsLoading } = useCanvases(organizationId || ""); const { canAct, isLoading: permissionsLoading } = usePermissions(); const canCreateCanvas = permissionsLoading || canAct("canvases", "create"); @@ -475,6 +493,50 @@ export function Header({
+ {lintResult && (topViewMode === "canvas" || topViewMode === undefined) ? (() => { + const hasErrors = lintResult.errorCount > 0; + const hasWarnings = lintResult.warningCount > 0; + const badgeColor = hasErrors + ? "bg-red-100 text-red-800" + : hasWarnings + ? "bg-yellow-100 text-yellow-800" + : "bg-green-100 text-green-800"; + const BadgeIcon = hasErrors ? ShieldAlert : hasWarnings ? TriangleAlert : CheckCircle2; + const badgeLabel = hasErrors + ? `${lintResult.errorCount} error${lintResult.errorCount !== 1 ? "s" : ""}` + : hasWarnings + ? `${lintResult.warningCount} warning${lintResult.warningCount !== 1 ? "s" : ""}` + : "Lint OK"; + + return ( + + + + + {badgeLabel} + + + +

+ Quality Gate: {lintResult.qualityGrade} ({lintResult.qualityScore}/100) +

+ {lintResult.errors.map((e, i) => ( +

+ {e.message} +

+ ))} + {lintResult.warnings.map((w, i) => ( +

+ {w.message} +

+ ))} + {!hasErrors && !hasWarnings ? ( +

No issues found

+ ) : null} +
+
+ ); + })() : null} {isDefaultMode ? ( <> {isVersioningDisabledMode && onExportYamlCopy && onExportYamlDownload ? ( diff --git a/web_src/src/ui/CanvasPage/index.tsx b/web_src/src/ui/CanvasPage/index.tsx index 14f3fd29d4..1fee993fb9 100644 --- a/web_src/src/ui/CanvasPage/index.tsx +++ b/web_src/src/ui/CanvasPage/index.tsx @@ -1099,6 +1099,7 @@ function CanvasPage(props: CanvasPageProps) { memoryItemCount={props.memoryItemCount} onExportYamlCopy={props.onExportYamlCopy} onExportYamlDownload={props.onExportYamlDownload} + workflowNodes={props.workflowNodes} /> {props.headerBanner ?
{props.headerBanner}
: null}
@@ -1733,6 +1734,7 @@ function CanvasContentHeader({ memoryItemCount, onExportYamlCopy, onExportYamlDownload, + workflowNodes, }: { state: CanvasPageState; onSave?: (nodes: CanvasNode[]) => void; @@ -1767,6 +1769,7 @@ function CanvasContentHeader({ memoryItemCount?: number; onExportYamlCopy?: (nodes: CanvasNode[]) => void; onExportYamlDownload?: (nodes: CanvasNode[]) => void; + workflowNodes?: ComponentsNode[]; }) { const stateRef = useRef(state); stateRef.current = state; @@ -1831,6 +1834,8 @@ function CanvasContentHeader({ memoryItemCount={memoryItemCount} onExportYamlCopy={onExportYamlCopy ? handleExportYamlCopy : undefined} onExportYamlDownload={onExportYamlDownload ? handleExportYamlDownload : undefined} + workflowNodes={workflowNodes} + workflowEdges={stateRef.current.edges} /> ); } @@ -2794,6 +2799,8 @@ function CanvasContent({ enterEditModeDisabled={enterEditModeDisabled} enterEditModeDisabledTooltip={enterEditModeDisabledTooltip} unpublishedDraftChangeCount={unpublishedDraftChangeCount} + workflowNodes={workflowNodes} + workflowEdges={styledEdges} /> )} diff --git a/web_src/src/utils/canvasLinter.ts b/web_src/src/utils/canvasLinter.ts new file mode 100644 index 0000000000..138d5ea8c0 --- /dev/null +++ b/web_src/src/utils/canvasLinter.ts @@ -0,0 +1,490 @@ +import type { ComponentsNode } from "@/api-client"; + +export type LintSeverity = "error" | "warning" | "info"; +export type QualityGrade = "A" | "B" | "C" | "D" | "F"; + +export interface LintIssue { + severity: LintSeverity; + rule: string; + nodeId: string; + nodeName: string; + message: string; +} + +export interface LintResult { + status: "pass" | "fail"; + errors: LintIssue[]; + warnings: LintIssue[]; + info: LintIssue[]; + errorCount: number; + warningCount: number; + infoCount: number; + qualityScore: number; + qualityGrade: QualityGrade; +} + +/** Accepts either ComponentsEdge (from API spec) or React Flow Edge shape. 
*/ +export interface LintEdge { + sourceId?: string; + targetId?: string; + source?: string; + target?: string; + channel?: string; +} + +function edgeSourceId(e: LintEdge): string | undefined { + return e.sourceId || e.source; +} +function edgeTargetId(e: LintEdge): string | undefined { + return e.targetId || e.target; +} + +const TERMINAL_COMPONENTS = new Set([ + "approval", + "slack.sendTextMessage", + "slack.waitForButtonClick", + "github.createIssue", + "github.createIssueComment", + "github.createRelease", + "github.updateIssue", + "github.publishCommitStatus", + "github.addReaction", + "pagerduty.createIncident", + "pagerduty.resolveIncident", + "pagerduty.escalateIncident", + "pagerduty.annotateIncident", + "pagerduty.acknowledgeIncident", +]); + +const DESTRUCTIVE_COMPONENTS = new Set([ + "pagerduty.resolveIncident", + "pagerduty.escalateIncident", + "github.deleteRelease", + "github.createRelease", +]); + +const NODE_REF_DOUBLE = /\$\["([^"]+)"\]/g; +const NODE_REF_SINGLE = /\$\['([^']+)'\]/g; + +function getComponentName(node: ComponentsNode): string { + return node.component?.name || node.trigger?.name || ""; +} + +function computeQualityScore( + errors: number, + warnings: number, + infos: number, +): { score: number; grade: QualityGrade } { + const ep = Math.min(errors * 15, 60); + const wp = Math.min(warnings * 5, 30); + const ip = Math.min(infos * 1, 10); + const score = Math.max(0, 100 - ep - wp - ip); + + let grade: QualityGrade; + if (score >= 90) grade = "A"; + else if (score >= 75) grade = "B"; + else if (score >= 60) grade = "C"; + else if (score >= 40) grade = "D"; + else grade = "F"; + + return { score, grade }; +} + +/** Recursively collect all string values from a config object. 
*/ +function collectStrings(obj: unknown): string[] { + if (typeof obj === "string") return [obj]; + if (Array.isArray(obj)) return obj.flatMap(collectStrings); + if (obj && typeof obj === "object") { + return Object.values(obj).flatMap(collectStrings); + } + return []; +} + +export function lintCanvas( + nodes: ComponentsNode[] | undefined, + edges: LintEdge[] | undefined, +): LintResult { + const result: LintResult = { + status: "pass", + errors: [], + warnings: [], + info: [], + errorCount: 0, + warningCount: 0, + infoCount: 0, + qualityScore: 100, + qualityGrade: "A", + }; + + if (!nodes?.length) return result; + + const safeEdges = edges || []; + const nodeById = new Map(nodes.map((n) => [n.id, n])); + const nodeNames = new Set(nodes.map((n) => n.name)); + const widgets = new Set(nodes.filter((n) => n.type === "TYPE_WIDGET").map((n) => n.id)); + const triggers = nodes.filter((n) => n.type === "TYPE_TRIGGER"); + + // Build adjacency. + const outgoing = new Map(); + const incoming = new Map(); + for (const e of safeEdges) { + const src = edgeSourceId(e); + const tgt = edgeTargetId(e); + if (src) { + const list = outgoing.get(src) || []; + list.push(e); + outgoing.set(src, list); + } + if (tgt) { + const list = incoming.get(tgt) || []; + list.push(e); + incoming.set(tgt, list); + } + } + + // ---- Rule: Duplicate node IDs ---- + const seenIds = new Set(); + for (const n of nodes) { + if (n.id && seenIds.has(n.id)) { + result.errors.push({ + severity: "error", + rule: "duplicate-node-id", + nodeId: n.id, + nodeName: n.name || "", + message: `Duplicate node ID "${n.id}"`, + }); + } + if (n.id) seenIds.add(n.id); + } + + // ---- Rule: Duplicate node names (non-widgets) ---- + const seenNames = new Set(); + for (const n of nodes) { + if (widgets.has(n.id!)) continue; + if (n.name && seenNames.has(n.name)) { + result.warnings.push({ + severity: "warning", + rule: "duplicate-node-name", + nodeId: n.id || "", + nodeName: n.name, + message: `Duplicate node name 
"${n.name}" — expression references may be ambiguous`, + }); + } + if (n.name) seenNames.add(n.name); + } + + // ---- Rule: Invalid edges ---- + const seenEdgeKeys = new Set(); + for (let i = 0; i < safeEdges.length; i++) { + const e = safeEdges[i]; + const src = edgeSourceId(e); + const tgt = edgeTargetId(e); + + if (src && !nodeById.has(src)) { + result.errors.push({ + severity: "error", + rule: "invalid-edge", + nodeId: src, + nodeName: "", + message: `Edge ${i} references nonexistent source node "${src}"`, + }); + } + if (tgt && !nodeById.has(tgt)) { + result.errors.push({ + severity: "error", + rule: "invalid-edge", + nodeId: tgt || "", + nodeName: "", + message: `Edge ${i} references nonexistent target node "${tgt}"`, + }); + } + if (src && tgt && src === tgt) { + result.errors.push({ + severity: "error", + rule: "invalid-edge", + nodeId: src, + nodeName: nodeById.get(src)?.name || "", + message: `Edge ${i} is a self-loop on node "${src}"`, + }); + } + if (src && tgt) { + const key = `${src}|${tgt}|${e.channel || "default"}`; + if (seenEdgeKeys.has(key)) { + result.warnings.push({ + severity: "warning", + rule: "duplicate-edge", + nodeId: src, + nodeName: nodeById.get(src)?.name || "", + message: `Duplicate edge from "${src}" to "${tgt}" on channel "${e.channel || "default"}"`, + }); + } + seenEdgeKeys.add(key); + } + if (src && widgets.has(src)) { + result.errors.push({ + severity: "error", + rule: "invalid-edge", + nodeId: src, + nodeName: nodeById.get(src)?.name || "", + message: `Edge ${i} uses widget node "${src}" as source`, + }); + } + if (tgt && widgets.has(tgt)) { + result.errors.push({ + severity: "error", + rule: "invalid-edge", + nodeId: tgt, + nodeName: nodeById.get(tgt)?.name || "", + message: `Edge ${i} uses widget node "${tgt}" as target`, + }); + } + } + + // ---- Rule: Cycle detection (Kahn's) ---- + const inDegree = new Map(); + const adj = new Map(); + for (const n of nodes) { + if (widgets.has(n.id!)) continue; + inDegree.set(n.id!, 0); + 
} + for (const e of safeEdges) { + const src = edgeSourceId(e); + const tgt = edgeTargetId(e); + if (!src || !tgt) continue; + if (widgets.has(src) || widgets.has(tgt)) continue; + adj.set(src, [...(adj.get(src) || []), tgt]); + inDegree.set(tgt, (inDegree.get(tgt) || 0) + 1); + } + const kahnQueue: string[] = []; + for (const [id, deg] of inDegree) { + if (deg === 0) kahnQueue.push(id); + } + let kahnVisited = 0; + while (kahnQueue.length > 0) { + const cur = kahnQueue.shift()!; + kahnVisited++; + for (const next of adj.get(cur) || []) { + const d = (inDegree.get(next) || 1) - 1; + inDegree.set(next, d); + if (d === 0) kahnQueue.push(next); + } + } + const totalNonWidget = nodes.filter((n) => !widgets.has(n.id!)).length; + if (kahnVisited < totalNonWidget) { + result.errors.push({ + severity: "error", + rule: "cycle-detected", + nodeId: "", + nodeName: "", + message: "Cycle detected in canvas graph", + }); + } + + // ---- Rule: Orphan nodes ---- + const reachable = new Set(); + const bfsQueue = triggers.map((t) => t.id!).filter(Boolean); + for (const id of bfsQueue) reachable.add(id); + while (bfsQueue.length > 0) { + const current = bfsQueue.shift()!; + for (const e of outgoing.get(current) || []) { + const tgt = edgeTargetId(e); + if (tgt && !reachable.has(tgt)) { + reachable.add(tgt); + bfsQueue.push(tgt); + } + } + } + for (const n of nodes) { + if (widgets.has(n.id!) || reachable.has(n.id!)) continue; + result.warnings.push({ + severity: "warning", + rule: "orphan-node", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" is not reachable from any trigger`, + }); + } + + // ---- Rule: Dead ends ---- + for (const n of nodes) { + if (widgets.has(n.id!) || n.type === "TYPE_TRIGGER") continue; + if ((outgoing.get(n.id!) 
|| []).length > 0) continue; + if (TERMINAL_COMPONENTS.has(getComponentName(n))) continue; + result.warnings.push({ + severity: "warning", + rule: "dead-end", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" has no outgoing edges and is not a terminal component`, + }); + } + + // ---- Rule: Missing approval gate ---- + for (const n of nodes) { + if (widgets.has(n.id!)) continue; + const comp = getComponentName(n); + if (!DESTRUCTIVE_COMPONENTS.has(comp)) continue; + + const visited = new Set([n.id!]); + const rQueue = [n.id!]; + let found = false; + while (rQueue.length > 0 && !found) { + const cur = rQueue.shift()!; + for (const e of incoming.get(cur) || []) { + const srcId = edgeSourceId(e); + if (!srcId || visited.has(srcId)) continue; + visited.add(srcId); + const src = nodeById.get(srcId); + if (src && getComponentName(src) === "approval") { + found = true; + break; + } + rQueue.push(srcId); + } + } + if (!found) { + result.errors.push({ + severity: "error", + rule: "missing-approval-gate", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Destructive action "${comp}" in "${n.name}" has no upstream approval gate`, + }); + } + } + + // ---- Rule: Missing required config ---- + for (const n of nodes) { + const comp = getComponentName(n); + const config = (n.configuration || {}) as Record; + + switch (comp) { + case "claude.textPrompt": { + const prompt = typeof config.prompt === "string" ? 
config.prompt.trim() : ""; + if (!prompt) { + result.errors.push({ + severity: "error", + rule: "missing-required-config", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" (claude.textPrompt) is missing required "prompt" configuration`, + }); + } + break; + } + case "slack.sendTextMessage": { + if (!config.channel) { + result.warnings.push({ + severity: "warning", + rule: "missing-required-config", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" (slack.sendTextMessage) is missing "channel" configuration`, + }); + } + break; + } + case "merge": { + const inCount = (incoming.get(n.id!) || []).length; + if (inCount < 2) { + result.info.push({ + severity: "info", + rule: "missing-required-config", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" (merge) has ${inCount} incoming edge(s); merge typically expects 2 or more`, + }); + } + break; + } + case "filter": { + const expr = typeof config.expression === "string" ? config.expression.trim() : ""; + if (!expr) { + result.errors.push({ + severity: "error", + rule: "missing-required-config", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" (filter) is missing required "expression" configuration`, + }); + } + break; + } + case "http": { + if (!config.url) { + result.warnings.push({ + severity: "warning", + rule: "missing-required-config", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" (http) is missing "url" configuration`, + }); + } + break; + } + } + } + + // ---- Rule: Expression syntax validation ---- + for (const n of nodes) { + if (widgets.has(n.id!) 
|| !n.configuration) continue; + const strings = collectStrings(n.configuration); + for (const val of strings) { + const openCount = (val.match(/\{\{/g) || []).length; + const closeCount = (val.match(/\}\}/g) || []).length; + if (openCount !== closeCount) { + result.errors.push({ + severity: "error", + rule: "invalid-expression", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" has unbalanced expression delimiters: ${openCount} '{{' vs ${closeCount} '}}'`, + }); + } + + for (const pat of [NODE_REF_DOUBLE, NODE_REF_SINGLE]) { + pat.lastIndex = 0; + let m; + while ((m = pat.exec(val)) !== null) { + if (!nodeNames.has(m[1])) { + result.warnings.push({ + severity: "warning", + rule: "invalid-expression", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Node "${n.name}" references unknown node "${m[1]}"`, + }); + } + } + } + } + } + + // ---- Rule: Unreachable branches ---- + for (const n of nodes) { + if (getComponentName(n) !== "filter") continue; + const edges = outgoing.get(n.id!) || []; + const hasDefault = edges.some((e) => e.channel === "default"); + if (!hasDefault) { + result.info.push({ + severity: "info", + rule: "unreachable-branch", + nodeId: n.id || "", + nodeName: n.name || "", + message: `Filter node "${n.name}" has no "default" channel outgoing edge; matched events have nowhere to go`, + }); + } + } + + // Compute counts and quality score. + result.errorCount = result.errors.length; + result.warningCount = result.warnings.length; + result.infoCount = result.info.length; + result.status = result.errorCount > 0 ? "fail" : "pass"; + + const qs = computeQualityScore(result.errorCount, result.warningCount, result.infoCount); + result.qualityScore = qs.score; + result.qualityGrade = qs.grade; + + return result; +}