From b10b649a8b8699fe5934227304e57ed1d1a531f4 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Tue, 24 Feb 2026 17:19:31 -0800 Subject: [PATCH 01/39] Create AKS folder and SKILL.md Create a top-level folder for organizing Azure Kubernetes Service skills. Create the service-level skill file targeting AKS cluster creation and best practices. --- plugin/skills/azure-kubernetes/SKILL.md | 252 ++++++++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 plugin/skills/azure-kubernetes/SKILL.md diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md new file mode 100644 index 000000000..c5fddd634 --- /dev/null +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -0,0 +1,252 @@ +--- +name: azure-kubernetes +description: >- + Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. + USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons + DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization) +--- + +# Azure Kubernetes Service + +> **AUTHORITATIVE GUIDANCE — MANDATORY COMPLIANCE** +> +> This document is the **official source** for setting up best practice Azure Kubernetes Service clusters. 
Follow these instructions to create and configure AKS clusters that are aligned with the user's requirements. + +## Triggers +Activate this skill when the user wants to: +- Create a new AKS cluster +- Plan AKS cluster configuration for production workloads +- Design AKS networking (API server access, pod IP model, egress) +- Set up AKS identity and secrets management +- Configure AKS governance (Azure Policy, Deployment Safeguards) +- Enable AKS observability (monitoring, Prometheus, Grafana) +- Define AKS upgrade and patching strategy +- Enable AKS cost visibility and analysis +- Understand AKS Automatic vs Standard SKU differences +- Get a Day-0 checklist for AKS cluster setup and configuration + +## Rules + +1. Start with the user's requirements for provisioning compute, networking, security, and other settings. +2. Use the AKS MCP server to invoke Azure API and kubectl commands, when applicable, during cluster setup and operations. +3. Determine if AKS Automatic or Standard SKU is more appropriate based on the user's need for control vs convenience. Default to AKS Automatic unless specific customizations are required. +4. Document decisions and rationale for cluster configuration choices, especially for Day-0 decisions that are hard to change later (networking, API server access). + +--- + +## Overview +This skill guides a user through planning and creating an Azure Kubernetes Service (AKS) cluster using public best practices for: +- cluster mode selection (Automatic vs Standard), +- networking (API server access, egress, pod IP model), +- identity (Microsoft Entra + Workload Identity), +- secrets management (Key Vault CSI), +- governance (Azure Policy + Deployment Safeguards), +- observability (Azure Monitor, Managed Prometheus, Managed Grafana), +- upgrades/patching (auto-upgrade channels, maintenance windows), +- cost visibility (AKS Cost Analysis). + +References are public and included at the end. 
+ +--- + +## When to Use +Use this skill when a user asks: +- “What do I need to decide before creating AKS?” +- “Create an AKS cluster plan/design for production” +- “AKS networking: overlay vs pod subnet vs node subnet” +- “How do I set up Workload Identity / Key Vault CSI / Azure Policy?” +- “How do I configure upgrades, patching, and observability on AKS?” + +--- + +## Goals / Outcomes +1. Produce a **recommended AKS cluster configuration** based on user requirements (security, scale, connectivity, compliance). +2. Provide a **Day-0 checklist** (decisions that are hard to change later, like networking and API server exposure). +3. Provide a **Day-1 checklist** (baseline add-ons and settings for production readiness). +4. Optionally output a **command/IaC skeleton** (placeholders only unless user provides values). + +--- + +## Required Inputs (Ask only what’s needed) +If the user is unsure, use safe defaults. + +### A) Environment & scale +- Environment: `dev/test` or `production` +- Region(s) + availability zones needed? +- Expected scale: node count / cluster count (single vs multi) + +### B) Networking requirements (Day-0 critical) +- API server access: + - Public API server or Private cluster? +- Pod IP model: + - Do pods need **direct routable IPs in the VNet**? +- Egress control: + - Default outbound, NAT Gateway, or UDR + firewall/NVA? + +### C) Identity & security posture +- Microsoft Entra RBAC required? +- Need pod-to-Azure access with **Workload Identity**? +- Regulated environment needs (private cluster, policy enforcement, restricted egress)? + +--- + +## Outputs (What the Skill Produces) +### Primary Output: “AKS Setup Plan” +1. Cluster type recommendation (Automatic vs Standard) +2. Networking plan (control plane access, egress choice, pod IP model) +3. Node pools + scaling plan +4. Security baseline (identity, secrets, policy) +5. Observability baseline (metrics/logs/dashboards/alerts) +6. Upgrade & patching plan +7. Cost controls baseline +8. 
Day-0 checklist + Day-1 checklist + +### Optional Outputs +- CLI skeleton (placeholders) +- IaC outline (Bicep/Terraform module list) + +--- + +## Decision Framework (Defaults when user is unsure) + +### 1) Cluster Type +- Prefer **AKS Automatic** when you want a production-oriented, opinionated setup with many best practices preconfigured. +- Prefer **AKS Standard** when you need maximum control and customizations. +Docs: AKS Automatic overview: https://learn.microsoft.com/azure/aks/intro-aks-automatic + +### 2) Pod Networking Model (Key Day-0 decision) +- Prefer **Azure CNI Overlay** for scalability and conserving VNet IP space. +Docs: https://learn.microsoft.com/azure/aks/azure-cni-overlay + +If pods must be directly addressable/routable in your VNet, use VNet-based Azure CNI options: +- Azure CNI with pod subnet or node subnet models (see Azure CNI overlay + related networking docs) + +### 3) Dataplane / Network Policy +- Consider **Azure CNI powered by Cilium** for eBPF-based performance and policy/observability features. +Docs: https://learn.microsoft.com/azure/aks/azure-cni-powered-by-cilium + +### 4) Workload Identity (Preferred for pod-to-Azure auth) +- Prefer **Microsoft Entra Workload ID** for workloads calling Azure services without secrets. +Docs: https://learn.microsoft.com/azure/aks/workload-identity-overview + +### 5) Secrets +- Prefer Azure Key Vault via **Secrets Store CSI Driver** provider. +Docs: https://learn.microsoft.com/azure/aks/csi-secrets-store-driver + +### 6) Governance +- Enable **Azure Policy** (prereq) and **Deployment Safeguards** for workload best-practice enforcement. +Docs: Deployment Safeguards: https://learn.microsoft.com/azure/aks/deployment-safeguards + +### 7) Observability +- Use Azure Monitor for AKS monitoring enablement (logs + Prometheus + Grafana). 
+Docs: https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable +Prometheus overview: https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview + +### 8) Upgrades & Patching +- Establish an upgrade strategy and ensure workloads are upgrade-safe (PDBs, probes, etc.). +Docs: AKS patch/upgrade guidance: https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices + +For node OS patching: +- Node OS auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-node-os-image +For cluster version auto-upgrades: +- Cluster auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-cluster + +--- + +## Step-by-Step Execution (Agent Behavior) + +### Step 1 — Classify scenario +Identify environment, compliance posture, region/AZ needs, scale, and workload types. + +### Step 2 — Recommend cluster type +Recommend AKS Automatic or Standard with short rationale. +- AKS Automatic intro: https://learn.microsoft.com/azure/aks/intro-aks-automatic + +### Step 3 — Lock networking (Day-0) +Ask: +- Public vs Private API server? +- Pod IP model: overlay vs VNet-routable requirement? +- Egress: LB vs NAT Gateway vs UDR+Firewall? 
+ +Reference: Azure CNI Overlay setup: https://learn.microsoft.com/azure/aks/azure-cni-overlay + +### Step 4 — Node pools and compute +Recommend: +- system node pool + user node pools +- separate pools for GPU/batch/stateful if applicable +- capacity planning considerations (max pods per node affects IP planning, upgrades) + +### Step 5 — Configure autoscaling +Recommend: +- HPA for pods +- Cluster Autoscaler / node scaling strategy +- If the user wants higher automation, discuss Node Auto Provisioning where available + +### Step 6 — Identity and secrets +- Enable Workload Identity: + https://learn.microsoft.com/azure/aks/workload-identity-overview +- Use Key Vault CSI Driver: + https://learn.microsoft.com/azure/aks/csi-secrets-store-driver + +### Step 7 — Policy & safeguards +- Turn on Azure Policy and Deployment Safeguards (warn/enforce). +Docs: https://learn.microsoft.com/azure/aks/deployment-safeguards + +### Step 8 — Observability baseline +- Enable monitoring using Azure Monitor guidance: + https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable +- Managed Prometheus overview: + https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview + +### Step 9 — Upgrades & patching +- Define upgrade approach: + https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices +- Configure node OS auto-upgrade channels: + https://learn.microsoft.com/azure/aks/auto-upgrade-node-os-image +- Configure cluster auto-upgrade channels: + https://learn.microsoft.com/azure/aks/auto-upgrade-cluster + +### Step 10 — Cost visibility +- Enable AKS cost analysis add-on (OpenCost-based): + https://learn.microsoft.com/azure/aks/cost-analysis + +Return a final output with: +- recommended config +- Day-0 checklist +- Day-1 checklist +- optional command/IaC skeleton + +--- + +## Guardrails / Safety +- Do not request or output secrets (tokens, keys, subscription IDs). 
+- If requirements are ambiguous, propose 2–3 safe options with tradeoffs and choose a conservative default. +- Do not promise zero downtime; advise workload safeguards (PDBs, probes, replicas) and staged upgrades. +- If user asks for actions that require privileged access, provide a plan and commands with placeholders. + +--- + +## Quality Bar +A high-quality answer: +- flags Day-0 irreversible choices (networking, API server access), +- includes identity/secrets/policy defaults (Workload ID + Key Vault CSI + safeguards), +- includes observability baseline, +- includes upgrade/patch plan, +- includes cost visibility. + +--- + +## References (Public) +- AKS Automatic overview: https://learn.microsoft.com/azure/aks/intro-aks-automatic +- Azure CNI Overlay (setup and parameters): https://learn.microsoft.com/azure/aks/azure-cni-overlay +- Azure CNI powered by Cilium: https://learn.microsoft.com/azure/aks/azure-cni-powered-by-cilium +- Microsoft Entra Workload ID on AKS: https://learn.microsoft.com/azure/aks/workload-identity-overview +- Key Vault provider for Secrets Store CSI Driver: https://learn.microsoft.com/azure/aks/csi-secrets-store-driver +- Deployment Safeguards: https://learn.microsoft.com/azure/aks/deployment-safeguards +- Enable AKS monitoring (Prometheus + Grafana + logs): https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable +- Azure Monitor managed Prometheus overview: https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview +- AKS patch & upgrade practices (Day-2 guidance): https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices +- Node OS auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-node-os-image +- Cluster auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-cluster +- AKS cost analysis (OpenCost-based): https://learn.microsoft.com/azure/aks/cost-analysis +`` \ No newline at end of file From 
8ff360f75019bcc7effa45c71510faf736df3cb3 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Tue, 24 Feb 2026 17:22:40 -0800 Subject: [PATCH 02/39] Add azure-kubernetes to skill.json --- tests/skills.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/skills.json b/tests/skills.json index dc5f8cd98..ef9205c7a 100644 --- a/tests/skills.json +++ b/tests/skills.json @@ -10,6 +10,7 @@ "azure-deploy", "azure-diagnostics", "azure-hosted-copilot-sdk", + "azure-kubernetes", "azure-kusto", "azure-messaging", "azure-observability", From cc208217ed95152de2b75f9c76a167740f667806 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Tue, 24 Feb 2026 17:24:30 -0800 Subject: [PATCH 03/39] Update skills.json --- tests/skills.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/skills.json b/tests/skills.json index ef9205c7a..c64da42ed 100644 --- a/tests/skills.json +++ b/tests/skills.json @@ -26,6 +26,6 @@ "integrationTestSchedule": { "0 5 * * 2-6": "microsoft-foundry", "0 8 * * 2-6": "azure-deploy", - "0 12 * * 2-6": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-cloud-migrate,azure-compliance,azure-compute,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" + "0 12 * * 2-6": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-cloud-migrate,azure-compliance,azure-compute,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" } } \ No newline at end of file From 769631d7be1866139ed692300d1ea3bd364119c2 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Tue, 24 Feb 2026 17:34:28 -0800 Subject: [PATCH 04/39] Fix issue of postgres skill missing from skills.json 
--- tests/skills.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/skills.json b/tests/skills.json index c64da42ed..f4244a64b 100644 --- a/tests/skills.json +++ b/tests/skills.json @@ -14,6 +14,7 @@ "azure-kusto", "azure-messaging", "azure-observability", + "azure-postgres", "azure-prepare", "azure-rbac", "azure-resource-lookup", From c37988c4c381013516175d929fdc158f36ca92ae Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Tue, 24 Feb 2026 17:36:37 -0800 Subject: [PATCH 05/39] Fix skills.json --- tests/skills.json | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/skills.json b/tests/skills.json index f4244a64b..db203d6e0 100644 --- a/tests/skills.json +++ b/tests/skills.json @@ -14,7 +14,6 @@ "azure-kusto", "azure-messaging", "azure-observability", - "azure-postgres", "azure-prepare", "azure-rbac", "azure-resource-lookup", @@ -25,8 +24,14 @@ "microsoft-foundry" ], "integrationTestSchedule": { +<<<<<<< HEAD "0 5 * * 2-6": "microsoft-foundry", "0 8 * * 2-6": "azure-deploy", "0 12 * * 2-6": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-cloud-migrate,azure-compliance,azure-compute,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" +======= + "0 5 * * *": "microsoft-foundry", + "0 8 * * *": "azure-deploy", + "0 12 * * *": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-compliance,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" +>>>>>>> ac9301a (Fix skills.json) } } \ No newline at end of file From 0a6efd5fb97df6ada6a406ab674a247c82ae6ecc Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 
2026 15:53:11 -0800 Subject: [PATCH 06/39] Add AKS to architecture.md and testing for AKS skill --- .../azure-prepare/references/architecture.md | 38 ++- .../__snapshots__/triggers.test.ts.snap | 159 ++++++++++ tests/azure-kubernetes/integration.test.ts | 271 ++++++++++++++++++ tests/azure-kubernetes/triggers.test.ts | 96 +++++++ tests/azure-kubernetes/unit.test.ts | 122 ++++++++ 5 files changed, 681 insertions(+), 5 deletions(-) create mode 100644 tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap create mode 100644 tests/azure-kubernetes/integration.test.ts create mode 100644 tests/azure-kubernetes/triggers.test.ts create mode 100644 tests/azure-kubernetes/unit.test.ts diff --git a/plugin/skills/azure-prepare/references/architecture.md b/plugin/skills/azure-prepare/references/architecture.md index dba46e7fb..c2f917748 100644 --- a/plugin/skills/azure-prepare/references/architecture.md +++ b/plugin/skills/azure-prepare/references/architecture.md @@ -22,6 +22,32 @@ Select hosting stack and map components to Azure services. 
| Workflow / orchestration | | ✓✓ (Durable Functions + DTS) | | | Minimal ops overhead | | ✓✓ | ✓ | +### Container Hosting: Container Apps vs AKS + +| Factor | Container Apps | AKS | +|--------|:--------------:|:---:| +| **Scale to zero** | ✓✓ | | +| **Kubernetes API access** | | ✓✓ | +| **Custom operators/CRDs** | | ✓✓ | +| **Service mesh** | Dapr (built-in) | Istio, Cilium | +| **GPU workloads** | | ✓✓ | +| **Best for** | Microservices, event-driven | Full K8s control, complex workloads | + +#### When to Use Container Apps +- Microservices without Kubernetes complexity +- Event-driven workloads (KEDA built-in) +- Need scale-to-zero for cost optimization +- Teams without Kubernetes expertise + +#### When to Use AKS +- Need Kubernetes API/kubectl access +- Require custom operators or CRDs +- Service mesh requirements (Istio, Cilium) +- GPU/ML workloads +- Complex networking or multi-tenant architectures + +> **AKS Planning:** For AKS SKU selection (Automatic vs Standard), networking, identity, scaling, and security configuration, invoke the **azure-kubernetes** skill. + ## Service Mapping ### Hosting @@ -29,11 +55,13 @@ Select hosting stack and map components to Azure services. 
| Component Type | Primary Service | Alternatives | |----------------|-----------------|--------------| | SPA Frontend | Static Web Apps | Blob + CDN | -| SSR Web App | Container Apps | App Service | -| REST/GraphQL API | Container Apps | App Service, Functions | -| Background Worker | Container Apps | Functions | -| Scheduled Task | Functions (Timer) | Container Apps Jobs | -| Event Processor | Functions | Container Apps | +| SSR Web App | Container Apps | App Service, AKS | +| REST/GraphQL API | Container Apps | App Service, Functions, AKS | +| Background Worker | Container Apps | Functions, AKS | +| Scheduled Task | Functions (Timer) | Container Apps Jobs, AKS CronJob | +| Event Processor | Functions | Container Apps, AKS + KEDA | +| Microservices (full K8s) | AKS | Container Apps | +| GPU/ML Workloads | AKS | Azure ML | ### Data diff --git a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap new file mode 100644 index 000000000..54a28abd4 --- /dev/null +++ b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap @@ -0,0 +1,159 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` +{ + "description": "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. 
USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization)", + "extractedKeywords": [ + "access", + "add-ons", + "aks", + "analysis", + "applications", + "automatic", + "autoscaling", + "azure", + "azure-cost-optimization", + "azure-deploy", + "azure-diagnostics", + "azure-observability", + "azure-prepare", + "best", + "bicep", + "cli", + "cluster", + "clusters", + "configuration", + "container", + "cost", + "covers", + "create", + "creating", + "day-0", + "day-1", + "debugging", + "decisions", + "deploy", + "deploying", + "deployment", + "design", + "entra", + "general", + "governance", + "grafana", + "identity", + "issues", + "key vault", + "kubernetes", + "mcp", + "model", + "monitor", + "monitoring", + "networking", + "observability", + "optimization", + "other", + "plan", + "planning", + "policy", + "practices", + "production", + "production-ready", + "prometheus", + "rbac", + "resources", + "safeguards", + "secrets", + "security", + "server", + "service", + "setting", + "settings", + "skus", + "standard", + "strategies", + "terraform", + "upgrade", + "vault", + "with", + "workload", + ], + "name": "azure-kubernetes", +} +`; + +exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill keywords match snapshot 1`] = ` +[ + "access", + "add-ons", + "aks", + "analysis", + "applications", + "automatic", + "autoscaling", + "azure", + "azure-cost-optimization", + "azure-deploy", + "azure-diagnostics", + "azure-observability", + "azure-prepare", + "best", + "bicep", + "cli", + "cluster", + "clusters", + 
"configuration", + "container", + "cost", + "covers", + "create", + "creating", + "day-0", + "day-1", + "debugging", + "decisions", + "deploy", + "deploying", + "deployment", + "design", + "entra", + "general", + "governance", + "grafana", + "identity", + "issues", + "key vault", + "kubernetes", + "mcp", + "model", + "monitor", + "monitoring", + "networking", + "observability", + "optimization", + "other", + "plan", + "planning", + "policy", + "practices", + "production", + "production-ready", + "prometheus", + "rbac", + "resources", + "safeguards", + "secrets", + "security", + "server", + "service", + "setting", + "settings", + "skus", + "standard", + "strategies", + "terraform", + "upgrade", + "vault", + "with", + "workload", +] +`; diff --git a/tests/azure-kubernetes/integration.test.ts b/tests/azure-kubernetes/integration.test.ts new file mode 100644 index 000000000..dbd606dfc --- /dev/null +++ b/tests/azure-kubernetes/integration.test.ts @@ -0,0 +1,271 @@ +/** + * Integration Tests for azure-kubernetes + * + * Tests skill behavior with a real Copilot agent session. + * Runs prompts multiple times to measure skill invocation rate. + * + * Prerequisites: + * 1. npm install -g @github/copilot-cli + * 2. Run `copilot` and authenticate + */ + +import { + useAgentRunner, + shouldSkipIntegrationTests, + getIntegrationSkipReason +} from "../utils/agent-runner"; +import { softCheckSkill } from "../utils/evaluate"; + +const SKILL_NAME = "azure-kubernetes"; +const RUNS_PER_PROMPT = 5; + +// Check if integration tests should be skipped at module level +const skipTests = shouldSkipIntegrationTests(); +const skipReason = getIntegrationSkipReason(); + +// Log skip reason if skipping +if (skipTests && skipReason) { + console.log(`⏭️ Skipping integration tests: ${skipReason}`); +} + +const describeIntegration = skipTests ? 
describe.skip : describe; + +describeIntegration(`${SKILL_NAME}_ - Integration Tests`, () => { + const agent = useAgentRunner(); + + describe("skill-invocation", () => { + test("invokes azure-kubernetes skill for AKS cluster creation prompt", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Help me create a production-ready AKS cluster with best practices" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS networking prompt", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "What networking options should I choose for my AKS cluster?" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS Automatic vs Standard", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Should I use AKS Automatic or AKS Standard for my production workloads?" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for golden path AKS setup", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "What is the recommended golden path for setting up an AKS cluster?" 
+ }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS security best practices", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "What are the security best practices for AKS clusters?" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS performance optimization", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "How do I optimize performance for my Azure Kubernetes workloads?" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS reliability patterns", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "What reliability best practices should I follow for AKS?" 
+ }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for workload identity setup", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "How do I configure workload identity for my AKS cluster?" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS monitoring setup", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Set up monitoring for my Azure Kubernetes cluster with Prometheus and Grafana" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS upgrade strategy", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "What is the best upgrade strategy for AKS clusters in production?" 
+ }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS autoscaling configuration", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "How do I configure cluster autoscaler and KEDA for my AKS cluster?" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS deployment safeguards", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Configure deployment safeguards and Azure Policy for my AKS cluster" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes azure-kubernetes skill for AKS node pool sizing", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "How should I size my AKS node pools for a microservices application?" 
+ }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + }); +}); diff --git a/tests/azure-kubernetes/triggers.test.ts b/tests/azure-kubernetes/triggers.test.ts new file mode 100644 index 000000000..9eb2129e6 --- /dev/null +++ b/tests/azure-kubernetes/triggers.test.ts @@ -0,0 +1,96 @@ +/** + * Trigger Tests for azure-kubernetes + * + * Tests that verify the skill triggers on appropriate prompts + * and does NOT trigger on unrelated prompts. + * + * Uses snapshot testing + parameterized tests for comprehensive coverage. + */ + +import { TriggerMatcher } from "../utils/trigger-matcher"; +import { loadSkill, LoadedSkill } from "../utils/skill-loader"; + +const SKILL_NAME = "azure-kubernetes"; + +describe(`${SKILL_NAME} - Trigger Tests`, () => { + let triggerMatcher: TriggerMatcher; + let skill: LoadedSkill; + + beforeAll(async () => { + skill = await loadSkill(SKILL_NAME); + triggerMatcher = new TriggerMatcher(skill); + }); + + describe("Should Trigger", () => { + // Prompts that SHOULD trigger this skill - include multiple keywords + const shouldTriggerPrompts: string[] = [ + "Create an Azure Kubernetes cluster for production", + "Set up AKS cluster with Azure networking", + "Configure Azure AKS cluster autoscaling", + "Plan Azure Kubernetes workload identity setup", + "Azure AKS Automatic vs Standard cluster", + "Set up Azure Kubernetes monitoring with Prometheus", + "Configure Azure AKS deployment safeguards", + "Azure Kubernetes cluster upgrade strategy", + ]; + + test.each(shouldTriggerPrompts)('triggers on: "%s"', (prompt) => { + const result = triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(true); + }); + }); + + describe("Should NOT Trigger", () => { + // Prompts that should NOT trigger this skill (avoid Azure/kubernetes/AKS keywords) + 
const shouldNotTriggerPrompts: string[] = [ + "What is the weather today?", + "Help me write a poem", + "Explain quantum computing", + "Help me with AWS EKS", + "How do I use Google GKE?", + "Write a Python script to parse JSON", + "What is the capital of France?", + "Configure my local Docker container", + "Set up PostgreSQL database locally", + ]; + + test.each(shouldNotTriggerPrompts)('does not trigger on: "%s"', (prompt) => { + const result = triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(false); + }); + }); + + describe("Trigger Keywords Snapshot", () => { + test("skill keywords match snapshot", () => { + expect(triggerMatcher.getKeywords()).toMatchSnapshot(); + }); + + test("skill description triggers match snapshot", () => { + expect({ + name: skill.metadata.name, + description: skill.metadata.description, + extractedKeywords: triggerMatcher.getKeywords(), + }).toMatchSnapshot(); + }); + }); + + describe("Edge Cases", () => { + test("handles empty prompt", () => { + const result = triggerMatcher.shouldTrigger(""); + expect(result.triggered).toBe(false); + }); + + test("handles very long prompt", () => { + const longPrompt = "AKS cluster Azure ".repeat(1000); + const result = triggerMatcher.shouldTrigger(longPrompt); + expect(typeof result.triggered).toBe("boolean"); + }); + + test("is case insensitive", () => { + const result1 = triggerMatcher.shouldTrigger("azure kubernetes cluster"); + const result2 = triggerMatcher.shouldTrigger("AZURE KUBERNETES CLUSTER"); + expect(result1.triggered).toBe(result2.triggered); + }); + }); +}); + diff --git a/tests/azure-kubernetes/unit.test.ts b/tests/azure-kubernetes/unit.test.ts new file mode 100644 index 000000000..b6cb9bdd6 --- /dev/null +++ b/tests/azure-kubernetes/unit.test.ts @@ -0,0 +1,122 @@ +/** + * Unit Tests for azure-kubernetes + * + * Test isolated skill logic and validation rules. 
+ */ + +import { readFileSync } from "node:fs"; +import { loadSkill, LoadedSkill } from "../utils/skill-loader"; + +const SKILL_NAME = "azure-kubernetes"; + +describe(`${SKILL_NAME} - Unit Tests`, () => { + let skill: LoadedSkill; + + beforeAll(async () => { + skill = await loadSkill(SKILL_NAME); + }); + + describe("Skill Metadata", () => { + test("has valid SKILL.md with required fields", () => { + expect(skill.metadata).toBeDefined(); + expect(skill.metadata.name).toBe(SKILL_NAME); + expect(skill.metadata.description).toBeDefined(); + expect(skill.metadata.description.length).toBeGreaterThan(10); + }); + + test("description meets Medium-High compliance length", () => { + // Descriptions should be 150-1024 chars for Medium-High compliance + expect(skill.metadata.description.length).toBeGreaterThan(150); + expect(skill.metadata.description.length).toBeLessThanOrEqual(1024); + }); + + test("description contains USE FOR trigger phrases", () => { + const description = skill.metadata.description; + expect(description).toContain("USE FOR:"); + }); + + test("description contains DO NOT USE FOR anti-triggers", () => { + const description = skill.metadata.description; + expect(description).toContain("DO NOT USE FOR:"); + }); + + test("has AKS-specific trigger keywords", () => { + const description = skill.metadata.description.toLowerCase(); + const hasAKSKeywords = + description.includes("aks") || + description.includes("kubernetes") || + description.includes("cluster"); + expect(hasAKSKeywords).toBe(true); + }); + }); + + describe("Skill Content", () => { + test("has substantive content", () => { + expect(skill.content).toBeDefined(); + expect(skill.content.length).toBeGreaterThan(500); + }); + + test("contains expected sections", () => { + expect(skill.content).toContain("## Triggers"); + expect(skill.content).toContain("## When to Use"); + expect(skill.content).toContain("## Decision Framework"); + expect(skill.content).toContain("## Step-by-Step Execution"); + }); + + 
test("covers AKS cluster configuration topics", () => { + const content = skill.content.toLowerCase(); + expect(content).toContain("networking"); + expect(content).toContain("identity"); + expect(content).toContain("observability"); + }); + + test("references AKS documentation", () => { + expect(skill.content).toContain("learn.microsoft.com"); + }); + + test("covers Day-0 and Day-1 planning", () => { + expect(skill.content).toContain("Day-0"); + expect(skill.content).toContain("Day-1"); + }); + }); + + describe("Frontmatter Formatting", () => { + test("frontmatter has no tabs", () => { + const raw = readFileSync(skill.filePath, "utf-8"); + const frontmatter = raw.split("---")[1]; + expect(frontmatter).not.toMatch(/\t/); + }); + + test("frontmatter keys are only supported attributes", () => { + const raw = readFileSync(skill.filePath, "utf-8"); + const frontmatter = raw.split("---")[1]; + const supported = [ + "name", + "description", + "compatibility", + "license", + "metadata", + "argument-hint", + "disable-model-invocation", + "user-invokable", + ]; + const keys = frontmatter + .split("\n") + .filter((l: string) => /^[a-z][\w-]*\s*:/.test(l)) + .map((l: string) => l.split(":")[0].trim()); + for (const key of keys) { + expect(supported).toContain(key); + } + }); + + test("USE FOR and DO NOT USE FOR are inside description value, not separate keys", () => { + const description = skill.metadata.description; + if (description.includes("USE FOR")) { + expect(description).toContain("USE FOR:"); + } + if (description.includes("DO NOT USE FOR")) { + expect(description).toContain("DO NOT USE FOR:"); + } + }); + }); +}); From eb5c9a29ac060c48f256321295bf086277e3ba70 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 2026 16:39:39 -0800 Subject: [PATCH 07/39] Update SKILL.md --- plugin/skills/azure-kubernetes/SKILL.md | 148 +++++++++++++++++++++--- 1 file changed, 132 insertions(+), 16 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md 
b/plugin/skills/azure-kubernetes/SKILL.md index c5fddd634..d7772a2c3 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -34,6 +34,68 @@ Activate this skill when user wants to: --- +## MCP Tools (Preferred) + +When Azure MCP and AKS MCP are enabled, use these tools for AKS operations: + +### Cluster Management +| Tool | Purpose | +|------|---------| +| `mcp_azure_mcp_aks` | Subscription-scoped AKS cluster queries and metadata | +| `mcp_aks_mcp_az_aks_operations` | Cluster operations: show, list, get-versions, nodepool management | +| `mcp_aks_mcp_get_aks_vmss_info` | VMSS configuration for node pools | + +### kubectl Operations +| Tool | Purpose | +|------|---------| +| `mcp_aks_mcp_kubectl_resources` | Get/describe pods, deployments, services | +| `mcp_aks_mcp_kubectl_diagnostics` | Logs, events, top, exec, cp operations | +| `mcp_aks_mcp_kubectl_cluster` | Cluster info, API resources, explain | +| `mcp_aks_mcp_kubectl_config` | Config contexts, auth checks | + +### Networking +| Tool | Purpose | +|------|---------| +| `mcp_aks_mcp_az_network_resources` | VNet, NSG, route tables, subnets, load balancers | + +--- + +## CLI Fallback + +```bash +# List AKS clusters +az aks list --output table + +# Show cluster details +az aks show --name CLUSTER --resource-group RG + +# Get available Kubernetes versions +az aks get-versions --location LOCATION --output table + +# Create AKS Automatic cluster +az aks create --name CLUSTER --resource-group RG --sku automatic \ + --network-plugin azure --network-plugin-mode overlay \ + --enable-oidc-issuer --enable-workload-identity + +# Create AKS Standard cluster +az aks create --name CLUSTER --resource-group RG \ + --node-count 3 --zones 1 2 3 \ + --network-plugin azure --network-plugin-mode overlay \ + --enable-cluster-autoscaler --min-count 1 --max-count 10 + +# Get credentials +az aks get-credentials --name CLUSTER --resource-group RG + +# List node pools +az aks nodepool list 
--cluster-name CLUSTER --resource-group RG --output table + +# Enable monitoring +az aks enable-addons --name CLUSTER --resource-group RG \ + --addons monitoring --workspace-resource-id WORKSPACE_ID +``` + +--- + ## Overview This skill guides a user through planning and creating an Azure Kubernetes Service (AKS) cluster using public best practices for: - cluster mode selection (Automatic vs Standard), @@ -51,7 +113,7 @@ References are public and included at the end. ## When to Use Use this skill when a user asks: -- “What do I need to decide before creating AKS?” +- “What do I need to decide before creating an AKS cluster?” - “Create an AKS cluster plan/design for production” - “AKS networking: overlay vs pod subnet vs node subnet” - “How do I set up Workload Identity / Key Vault CSI / Azure Policy?” @@ -70,12 +132,12 @@ Use this skill when a user asks: ## Required Inputs (Ask only what’s needed) If the user is unsure, use safe defaults. -### A) Environment & scale +### 1. Environment & scale - Environment: `dev/test` or `production` - Region(s) + availability zones needed? - Expected scale: node count / cluster count (single vs multi) -### B) Networking requirements (Day-0 critical) +### 2. Networking requirements (Day-0 critical) - API server access: - Public API server or Private cluster? - Pod IP model: @@ -83,7 +145,7 @@ If the user is unsure, use safe defaults. - Egress control: - Default outbound, NAT Gateway, or UDR + firewall/NVA? -### C) Identity & security posture +### 3. Identity & security posture - Microsoft Entra RBAC required? - Need pod-to-Azure access with **Workload Identity**? - Regulated environment needs (private cluster, policy enforcement, restricted egress)? @@ -109,40 +171,54 @@ If the user is unsure, use safe defaults. ## Decision Framework (Defaults when user is unsure) -### 1) Cluster Type -- Prefer **AKS Automatic** when you want a production-oriented, opinionated setup with many best practices preconfigured. 
-- Prefer **AKS Standard** when you need maximum control and customizations. -Docs: AKS Automatic overview: https://learn.microsoft.com/azure/aks/intro-aks-automatic - -### 2) Pod Networking Model (Key Day-0 decision) +### 1. Cluster Type + +| Cluster SKU | Automatic | Standard | +|---------|-----------|----------| +| Best for | Production defaults, faster setup | Maximum control, custom requirements | +| Node provisioning | Auto (NAP) | Manual node pools | +| Autoscaling | Pre-configured | Configure manually | +| Networking | Azure CNI Overlay (fixed) | Choose model | +| Azure Policy | Enabled by default | Configure separately | +| Monitoring | Azure Monitor enabled | Configure separately | +| Deployment Safeguards | Enabled (Warning) | Configure separately | +| Node OS upgrades | Auto-configured | Configure channel | + +**Recommendation**: Default to **AKS Automatic** unless you need: +- Custom networking (kubenet, Azure CNI with pod subnet) +- Windows node pools +- Specific node pool configurations not supported by NAP +- Full control over autoscaling behavior + +### 2. Pod Networking Model (Key Day-0 decision) - Prefer **Azure CNI Overlay** for scalability and conserving VNet IP space. Docs: https://learn.microsoft.com/azure/aks/azure-cni-overlay If pods must be directly addressable/routable in your VNet, use VNet-based Azure CNI options: - Azure CNI with pod subnet or node subnet models (see Azure CNI overlay + related networking docs) -### 3) Dataplane / Network Policy +### 3. Dataplane / Network Policy - Consider **Azure CNI powered by Cilium** for eBPF-based performance and policy/observability features. Docs: https://learn.microsoft.com/azure/aks/azure-cni-powered-by-cilium -### 4) Workload Identity (Preferred for pod-to-Azure auth) +### 4. Workload Identity (Preferred for pod-to-Azure auth) - Prefer **Microsoft Entra Workload ID** for workloads calling Azure services without secrets. 
Docs: https://learn.microsoft.com/azure/aks/workload-identity-overview -### 5) Secrets +### 5. Secrets - Prefer Azure Key Vault via **Secrets Store CSI Driver** provider. Docs: https://learn.microsoft.com/azure/aks/csi-secrets-store-driver -### 6) Governance +### 6. Governance - Enable **Azure Policy** (prereq) and **Deployment Safeguards** for workload best-practice enforcement. Docs: Deployment Safeguards: https://learn.microsoft.com/azure/aks/deployment-safeguards -### 7) Observability +### 7. Observability - Use Azure Monitor for AKS monitoring enablement (logs + Prometheus + Grafana). Docs: https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable Prometheus overview: https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview -### 8) Upgrades & Patching +### 8. Upgrades & Patching - Establish an upgrade strategy and ensure workloads are upgrade-safe (PDBs, probes, etc.). Docs: AKS patch/upgrade guidance: https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices @@ -236,6 +312,46 @@ A high-quality answer: --- +## Quick Reference + +### Common AKS Commands + +| Task | Command | +|------|---------| +| List clusters | `az aks list -o table` | +| Show cluster | `az aks show -n CLUSTER -g RG` | +| Get credentials | `az aks get-credentials -n CLUSTER -g RG` | +| List node pools | `az aks nodepool list --cluster-name CLUSTER -g RG` | +| Scale node pool | `az aks nodepool scale --cluster-name CLUSTER -g RG -n POOL --node-count 5` | +| Upgrade cluster | `az aks upgrade -n CLUSTER -g RG --kubernetes-version VERSION` | + +### kubectl Quick Commands + +| Task | Command | +|------|---------| +| Get pods | `kubectl get pods -A` | +| Get nodes | `kubectl get nodes -o wide` | +| Describe pod | `kubectl describe pod POD -n NAMESPACE` | +| View pod logs | `kubectl logs POD -n NAMESPACE --tail=100` | +| Check events | `kubectl get events --sort-by='.lastTimestamp'` | +| Top nodes | `kubectl top 
nodes` | + +--- + +## Error Handling + +### Common AKS Issues + +| Issue | Symptom | Resolution | +|-------|---------|------------| +| **Cluster creation fails** | Quota exceeded | Request quota increase or use different VM SKU | +| **Node not ready** | NotReady status | Check kubelet logs, node conditions | +| **Pod pending** | Insufficient resources | Scale node pool or check resource requests | +| **Image pull failed** | ImagePullBackOff | Check ACR access, image name, network | +| **API server unreachable** | Connection refused | Check authorized IPs, private cluster config | + +--- + ## References (Public) - AKS Automatic overview: https://learn.microsoft.com/azure/aks/intro-aks-automatic - Azure CNI Overlay (setup and parameters): https://learn.microsoft.com/azure/aks/azure-cni-overlay From 31523df3f5b91fa19b90b63a1eac1f4ebaf06741 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 2026 16:23:06 -0800 Subject: [PATCH 08/39] Update plugin/skills/azure-kubernetes/SKILL.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- plugin/skills/azure-kubernetes/SKILL.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index d7772a2c3..80b6488c7 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -364,5 +364,4 @@ A high-quality answer: - AKS patch & upgrade practices (Day-2 guidance): https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices - Node OS auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-node-os-image - Cluster auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-cluster -- AKS cost analysis (OpenCost-based): https://learn.microsoft.com/azure/aks/cost-analysis -`` \ No newline at end of file +- AKS cost analysis (OpenCost-based): https://learn.microsoft.com/azure/aks/cost-analysis \ No newline at end of 
file From be258cd04b2477885000634229a0485501f7b5f6 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 2026 16:44:39 -0800 Subject: [PATCH 09/39] Remove trailing empty lines --- tests/azure-kubernetes/integration.test.ts | 2 +- tests/azure-kubernetes/triggers.test.ts | 1 - tests/azure-kubernetes/unit.test.ts | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/azure-kubernetes/integration.test.ts b/tests/azure-kubernetes/integration.test.ts index dbd606dfc..2e66b24f8 100644 --- a/tests/azure-kubernetes/integration.test.ts +++ b/tests/azure-kubernetes/integration.test.ts @@ -268,4 +268,4 @@ describeIntegration(`${SKILL_NAME}_ - Integration Tests`, () => { } }); }); -}); +}); \ No newline at end of file diff --git a/tests/azure-kubernetes/triggers.test.ts b/tests/azure-kubernetes/triggers.test.ts index 9eb2129e6..d7e3ed39e 100644 --- a/tests/azure-kubernetes/triggers.test.ts +++ b/tests/azure-kubernetes/triggers.test.ts @@ -93,4 +93,3 @@ describe(`${SKILL_NAME} - Trigger Tests`, () => { }); }); }); - diff --git a/tests/azure-kubernetes/unit.test.ts b/tests/azure-kubernetes/unit.test.ts index b6cb9bdd6..a67c02423 100644 --- a/tests/azure-kubernetes/unit.test.ts +++ b/tests/azure-kubernetes/unit.test.ts @@ -119,4 +119,4 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { } }); }); -}); +}); \ No newline at end of file From 295a9ed1ff6993f50d59fdc5d54ddc4d725cba39 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 2026 17:02:16 -0800 Subject: [PATCH 10/39] Add AKS to integration test schedule --- tests/skills.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/skills.json b/tests/skills.json index db203d6e0..7f40120ed 100644 --- a/tests/skills.json +++ b/tests/skills.json @@ -31,7 +31,6 @@ ======= "0 5 * * *": "microsoft-foundry", "0 8 * * *": "azure-deploy", - "0 12 * * *": 
"appinsights-instrumentation,azure-ai,azure-aigateway,azure-compliance,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" ->>>>>>> ac9301a (Fix skills.json) + "0 12 * * *": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-compliance,azure-compute,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" } } \ No newline at end of file From 2c1d3a232629d4b8a1208d3a5801140a750565db Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 2026 17:06:02 -0800 Subject: [PATCH 11/39] Fix pr.yaml creating leading space --- .github/workflows/pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index e517bd1f8..973c2ea04 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -235,6 +235,7 @@ jobs: for file in ${{ steps.changed-skills.outputs.all_changed_files }}; do SKILL_FILES="$SKILL_FILES ../$file" done + SKILLS="${SKILLS# }" # Run frontmatter spec validation if OUTPUT=$(npm run frontmatter -- $SKILL_FILES 2>&1); then From 7d79459216c374b8353e6961f9a4bcc944c485e3 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 2026 17:08:41 -0800 Subject: [PATCH 12/39] Update SKILL.md --- plugin/skills/azure-kubernetes/SKILL.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index 80b6488c7..1f09939e8 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -1,9 +1,6 @@ --- name: azure-kubernetes -description: >- - Plan and create production-ready 
Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. - USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons - DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization) +description: "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons. DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization)." 
--- # Azure Kubernetes Service From afc8b05fec6036cac351945b4d0309d4073ca230 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Fri, 27 Feb 2026 17:13:35 -0800 Subject: [PATCH 13/39] Update triggers.test.ts.snap --- tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap index 54a28abd4..fc585f43d 100644 --- a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap +++ b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization)", + "description": "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. 
Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons. DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization).", "extractedKeywords": [ "access", "add-ons", @@ -36,6 +36,7 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill descri "deploying", "deployment", "design", + "diagnostic", "entra", "general", "governance", @@ -115,6 +116,7 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill keywor "deploying", "deployment", "design", + "diagnostic", "entra", "general", "governance", From 0d6e6ef656e6495e4b4c243cbbcb2fa493036d60 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 11:30:31 -0800 Subject: [PATCH 14/39] Add in missing best practices (ephemeral disk, auto upgrades, reliability best practices) --- plugin/skills/azure-kubernetes/SKILL.md | 33 +++++++++++++++++-------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index 1f09939e8..95f9363d2 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -216,13 +216,25 @@ Docs: https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-moni Prometheus overview: 
https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview ### 8. Upgrades & Patching -- Establish an upgrade strategy and ensure workloads are upgrade-safe (PDBs, probes, etc.). -Docs: AKS patch/upgrade guidance: https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices - -For node OS patching: -- Node OS auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-node-os-image -For cluster version auto-upgrades: -- Cluster auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-cluster +- Configure **Maintenance Windows** for controlled upgrade timing: https://learn.microsoft.com/en-us/azure/aks/planned-maintenance +- Enable **auto-upgrades** for cluster and node OS to stay up-to-date with security patches and Kubernetes versions +- Consider **LTS versions** for enterprise stability (2-year support) by upgrading your cluster to the AKS Premium tier: https://learn.microsoft.com/en-us/azure/aks/long-term-support + +### 9. Performance +- Use **Ephemeral OS disks** (`--node-osdisk-type Ephemeral`) for faster node startup +- Select **Azure Linux** as node OS (smaller footprint, faster boot) +- Enable **KEDA** for event-driven autoscaling beyond HPA + +### 10. Reliability +- Deploy across **3 Availability Zones** (`--zones 1 2 3`) +- Use **Standard tier** for zone-redundant control plane + 99.95% SLA for API server availability +- Enable **Microsoft Defender for Containers** for runtime protection +- Configure **PodDisruptionBudgets** for all production workloads + +### 11. 
Cost Controls +- Use **Spot node pools** for batch/interruptible workloads (up to 90% savings) +- **Stop/Start** dev/test clusters: `az aks stop/start` +- Consider **Reserved Instances** or **Savings Plans** for steady-state workloads --- @@ -303,9 +315,10 @@ Return a final output with: A high-quality answer: - flags Day-0 irreversible choices (networking, API server access), - includes identity/secrets/policy defaults (Workload ID + Key Vault CSI + safeguards), -- includes observability baseline, -- includes upgrade/patch plan, -- includes cost visibility. +- recommends availability zones + Standard tier for production, +- includes performance defaults (ephemeral disks, Azure Linux), +- includes observability + upgrade/patching plan, +- addresses cost (spot pools, stop/start for dev). --- From 2e15f00f76bce858db04e1d2db283fdb868d247b Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 11:58:40 -0800 Subject: [PATCH 15/39] Add security best practices --- plugin/skills/azure-kubernetes/SKILL.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index 95f9363d2..02848d6fe 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -236,6 +236,13 @@ Prometheus overview: https://learn.microsoft.com/azure/azure-monitor/metrics/pro - **Stop/Start** dev/test clusters: `az aks stop/start` - Consider **Reserved Instances** or **Savings Plans** for steady-state workloads +### 12. 
Security +- **Start secure by default**: Deploy supported AKS versions with Microsoft-recommended defaults; keep clusters up to date +- **Use identity everywhere**: Secure access with Microsoft Entra ID for control plane, workloads (Workload Identity), and node access—avoid static credentials +- **Encrypt by default**: Enable encryption at rest for etcd/API server data; use in-transit encryption for node-to-node traffic +- **Lock down supply chain**: Allow only signed, policy-approved container images (use Azure Policy + ImagePolicyWebhook or Ratify) +- **Isolate tenants**: Use namespaces, network policies, and scoped logging to reduce blast radius + --- ## Step-by-Step Execution (Agent Behavior) @@ -315,6 +322,7 @@ Return a final output with: A high-quality answer: - flags Day-0 irreversible choices (networking, API server access), - includes identity/secrets/policy defaults (Workload ID + Key Vault CSI + safeguards), +- includes security defaults (Entra ID everywhere, encryption, image signing, network policies), - recommends availability zones + Standard tier for production, - includes performance defaults (ephemeral disks, Azure Linux), - includes observability + upgrade/patching plan, From bd1e5e522f45ce012bfcc74e1a0d5225a4dd3de6 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 15:25:57 -0800 Subject: [PATCH 16/39] Streamline and reduce token count --- plugin/skills/azure-kubernetes/SKILL.md | 379 +++--------------- .../references/cli-reference.md | 33 ++ 2 files changed, 88 insertions(+), 324 deletions(-) create mode 100644 plugin/skills/azure-kubernetes/references/cli-reference.md diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index 02848d6fe..dd160892a 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -7,7 +7,7 @@ description: "Plan and create production-ready Azure Kubernetes Service (AKS) cl > **AUTHORITATIVE GUIDANCE — MANDATORY COMPLIANCE** 
> -> This document is the **official source** for setting up best practice Azure Kubernetes Service clusters. Follow these instructions to create and configure AKS clusters that are aligned with the user's requirements. +> This skill produces a **recommended AKS cluster configuration** based on user requirements, distinguishing **Day-0 decisions** (networking, API server — hard to change later) from **Day-1 features** (can enable post-creation). See [CLI reference](./references/cli-reference.md) for commands. ## Triggers Activate this skill when user wants to: @@ -23,363 +23,94 @@ Activate this skill when user wants to: - Get a Day-0 checklist for AKS cluster setup and configuration ## Rules - 1. Start with the user's requirements for provisioning compute, networking, security, and other settings. 2. Use the AKS MCP server for invoking Azure API and kubectl commands when applicable during the cluster setup and operations processes. 3. Determine if AKS Automatic or Standard SKU is more appropriate based on the user's need for control vs convenience. Default to AKS Automatic unless specific customizations are required. 4. Document decisions and rationale for cluster configuration choices, especially for Day-0 decisions that are hard to change later (networking, API server access). 
---- - -## MCP Tools (Preferred) - -When Azure MCP and AKS MCP are enabled, use these tools for AKS operations: - -### Cluster Management -| Tool | Purpose | -|------|---------| -| `mcp_azure_mcp_aks` | Subscription-scoped AKS cluster queries and metadata | -| `mcp_aks_mcp_az_aks_operations` | Cluster operations: show, list, get-versions, nodepool management | -| `mcp_aks_mcp_get_aks_vmss_info` | VMSS configuration for node pools | - -### kubectl Operations -| Tool | Purpose | -|------|---------| -| `mcp_aks_mcp_kubectl_resources` | Get/describe pods, deployments, services | -| `mcp_aks_mcp_kubectl_diagnostics` | Logs, events, top, exec, cp operations | -| `mcp_aks_mcp_kubectl_cluster` | Cluster info, API resources, explain | -| `mcp_aks_mcp_kubectl_config` | Config contexts, auth checks | - -### Networking -| Tool | Purpose | -|------|---------| -| `mcp_aks_mcp_az_network_resources` | VNet, NSG, route tables, subnets, load balancers | - ---- - -## CLI Fallback - -```bash -# List AKS clusters -az aks list --output table - -# Show cluster details -az aks show --name CLUSTER --resource-group RG - -# Get available Kubernetes versions -az aks get-versions --location LOCATION --output table - -# Create AKS Automatic cluster -az aks create --name CLUSTER --resource-group RG --sku automatic \ - --network-plugin azure --network-plugin-mode overlay \ - --enable-oidc-issuer --enable-workload-identity - -# Create AKS Standard cluster -az aks create --name CLUSTER --resource-group RG \ - --node-count 3 --zones 1 2 3 \ - --network-plugin azure --network-plugin-mode overlay \ - --enable-cluster-autoscaler --min-count 1 --max-count 10 - -# Get credentials -az aks get-credentials --name CLUSTER --resource-group RG - -# List node pools -az aks nodepool list --cluster-name CLUSTER --resource-group RG --output table - -# Enable monitoring -az aks enable-addons --name CLUSTER --resource-group RG \ - --addons monitoring --workspace-resource-id WORKSPACE_ID -``` - ---- - -## Overview 
-This skill guides a user through planning and creating an Azure Kubernetes Service (AKS) cluster using public best practices for: -- cluster mode selection (Automatic vs Standard), -- networking (API server access, egress, pod IP model), -- identity (Microsoft Entra + Workload Identity), -- secrets management (Key Vault CSI), -- governance (Azure Policy + Deployment Safeguards), -- observability (Azure Monitor, Managed Prometheus, Managed Grafana), -- upgrades/patching (auto-upgrade channels, maintenance windows), -- cost visibility (AKS Cost Analysis). - -References are public and included at the end. - ---- - -## When to Use -Use this skill when a user asks: -- “What do I need to decide before creating an AKS cluster?” -- “Create an AKS cluster plan/design for production” -- “AKS networking: overlay vs pod subnet vs node subnet” -- “How do I set up Workload Identity / Key Vault CSI / Azure Policy?” -- “How do I configure upgrades, patching, and observability on AKS?” - ---- - -## Goals / Outcomes -1. Produce a **recommended AKS cluster configuration** based on user requirements (security, scale, connectivity, compliance). -2. Provide a **Day-0 checklist** (decisions that are hard to change later, like networking and API server exposure). -3. Provide a **Day-1 checklist** (baseline add-ons and settings for production readiness). -4. Optionally output a **command/IaC skeleton** (placeholders only unless user provides values). - ---- ## Required Inputs (Ask only what’s needed) If the user is unsure, use safe defaults. +- Cluster environment: dev/test or production +- Region(s), availability zones, preferred node VM sizes +- Expected scale (node/cluster count, workload size) +- Networking requirements (API server access, pod IP model, ingress/egress control) +- Security and identity requirements, including image registry +- Upgrade and observability preferences +- Cost constraints -### 1. 
Environment & scale -- Environment: `dev/test` or `production` -- Region(s) + availability zones needed? -- Expected scale: node count / cluster count (single vs multi) - -### 2. Networking requirements (Day-0 critical) -- API server access: - - Public API server or Private cluster? -- Pod IP model: - - Do pods need **direct routable IPs in the VNet**? -- Egress control: - - Default outbound, NAT Gateway, or UDR + firewall/NVA? - -### 3. Identity & security posture -- Microsoft Entra RBAC required? -- Need pod-to-Azure access with **Workload Identity**? -- Regulated environment needs (private cluster, policy enforcement, restricted egress)? - ---- - -## Outputs (What the Skill Produces) -### Primary Output: “AKS Setup Plan” -1. Cluster type recommendation (Automatic vs Standard) -2. Networking plan (control plane access, egress choice, pod IP model) -3. Node pools + scaling plan -4. Security baseline (identity, secrets, policy) -5. Observability baseline (metrics/logs/dashboards/alerts) -6. Upgrade & patching plan -7. Cost controls baseline -8. Day-0 checklist + Day-1 checklist - -### Optional Outputs -- CLI skeleton (placeholders) -- IaC outline (Bicep/Terraform module list) - ---- - -## Decision Framework (Defaults when user is unsure) +## Key Decisions (Defaults when user is unsure) ### 1. Cluster Type +- **AKS Automatic** (default): Best for most production workloads, provides a curated experience with pre-configured best practices for security, reliability, and performance. Use unless you have specific custom requirements for networking, autoscaling, or node pool configurations not supported by NAP. +- **AKS Standard**: Use if you need full control over cluster configuration; it requires additional overhead to set up and manage. 
-| Cluster SKU | Automatic | Standard | -|---------|-----------|----------| -| Best for | Production defaults, faster setup | Maximum control, custom requirements | -| Node provisioning | Auto (NAP) | Manual node pools | -| Autoscaling | Pre-configured | Configure manually | -| Networking | Azure CNI Overlay (fixed) | Choose model | -| Azure Policy | Enabled by default | Configure separately | -| Monitoring | Azure Monitor enabled | Configure separately | -| Deployment Safeguards | Enabled (Warning) | Configure separately | -| Node OS upgrades | Auto-configured | Configure channel | +### 2. Networking (Pod IP, Egress, Ingress, Dataplane) -**Recommendation**: Default to **AKS Automatic** unless you need: -- Custom networking (kubenet, Azure CNI with pod subnet) -- Windows node pools -- Specific node pool configurations not supported by NAP -- Full control over autoscaling behavior +**Pod IP Model** (Key Day-0 decision): +- **Azure CNI Overlay** (recommended): pod IPs from private overlay range, not VNet-routable, scales to large clusters and is a good fit for most workloads +- **Azure CNI (VNet-routable)**: pod IPs directly from VNet (pod subnet or node subnet), use when pods must be directly addressable from VNet or on-prem + - Docs: https://learn.microsoft.com/azure/aks/azure-cni-overlay -### 2. Pod Networking Model (Key Day-0 decision) -- Prefer **Azure CNI Overlay** for scalability and conserving VNet IP space. -Docs: https://learn.microsoft.com/azure/aks/azure-cni-overlay +**Dataplane & Network Policy**: +- **Azure CNI powered by Cilium** (recommended): eBPF-based for high-performance packet processing, network policies, and observability -If pods must be directly addressable/routable in your VNet, use VNet-based Azure CNI options: -- Azure CNI with pod subnet or node subnet models (see Azure CNI overlay + related networking docs) +**Egress**: +- **Static Egress Gateway** for stable, predictable outbound IPs +- For restricted egress: UDR + Azure Firewall or NVA -### 3. 
Dataplane / Network Policy -- Consider **Azure CNI powered by Cilium** for eBPF-based performance and policy/observability features. -Docs: https://learn.microsoft.com/azure/aks/azure-cni-powered-by-cilium +**Ingress**: +- **App Routing addon with Gateway API** — recommended default for HTTP/HTTPS workloads +- **Istio service mesh with Gateway API** — for advanced traffic management, mTLS, canary deployments +- **Application Gateway for Containers** — for L7 load balancing with WAF integration -### 4. Workload Identity (Preferred for pod-to-Azure auth) -- Prefer **Microsoft Entra Workload ID** for workloads calling Azure services without secrets. -Docs: https://learn.microsoft.com/azure/aks/workload-identity-overview +**DNS**: +- Enable **LocalDNS** on all node pools for reliable, performant DNS resolution -### 5. Secrets -- Prefer Azure Key Vault via **Secrets Store CSI Driver** provider. -Docs: https://learn.microsoft.com/azure/aks/csi-secrets-store-driver +### 3. Security +- Use **Microsoft Entra ID** everywhere (control plane, Workload Identity for pods, node access). Avoid static credentials. +- Use Azure Key Vault via the **Secrets Store CSI Driver** for secrets +- Enable **Azure Policy** + **Deployment Safeguards** +- Enable **Encryption at rest** for etcd/API server and **in-transit** encryption for node-to-node traffic +- Allow only signed, policy-approved images (Azure Policy + Ratify), prefer **Azure Container Registry** +- **Isolation**: Use namespaces, network policies, scoped logging -### 6. Governance -- Enable **Azure Policy** (prereq) and **Deployment Safeguards** for workload best-practice enforcement. -Docs: Deployment Safeguards: https://learn.microsoft.com/azure/aks/deployment-safeguards +### 4. Observability +- Use Azure Monitor and Container Insights to enable AKS monitoring (logs + Prometheus + Grafana). -### 7. Observability -- Use Azure Monitor for AKS monitoring enablement (logs + Prometheus + Grafana). 
-Docs: https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable -Prometheus overview: https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview - -### 8. Upgrades & Patching -- Configure **Maintenance Windows** for controlled upgrade timing: https://learn.microsoft.com/en-us/azure/aks/planned-maintenance +### 5. Upgrades & Patching +- Configure **Maintenance Windows** for controlled upgrade timing - Enable **auto-upgrades** for cluster and node OS to stay up-to-date with security patches and Kubernetes versions -- Consider **LTS versions** for enterprise stability (2-year support) by upgrading your cluster to the AKS Premium tier: https://learn.microsoft.com/en-us/azure/aks/long-term-support +- Consider **LTS versions** for enterprise stability (2-year support) by upgrading your cluster to the AKS Premium tier +- **Multi-cluster upgrades**: Use **AKS Fleet Manager** for staged rollout across test → production clusters -### 9. Performance +### 6. Performance - Use **Ephemeral OS disks** (`--node-osdisk-type Ephemeral`) for faster node startup - Select **Azure Linux** as node OS (smaller footprint, faster boot) - Enable **KEDA** for event-driven autoscaling beyond HPA -### 10. Reliability +### 7. Node Pools & Compute +- **Dedicated system node pool**: At least 2 nodes, tainted for system workloads only (`CriticalAddonsOnly`) +- Enable **Node Auto Provisioning (NAP)** on all pools for cost savings and responsive scaling +- Use **latest generation SKUs (v5/v6)** for host-level optimizations +- **Avoid B-series VMs** — burstable SKUs cause performance/reliability issues +- Use SKUs with **at least 4 vCPUs** for production workloads +- Set **topology spread constraints** to distribute pods across hosts/zones per SLO + +### 8. 
Reliability - Deploy across **3 Availability Zones** (`--zones 1 2 3`) - Use **Standard tier** for zone-redundant control plane + 99.95% SLA for API server availability - Enable **Microsoft Defender for Containers** for runtime protection - Configure **PodDisruptionBudgets** for all production workloads +- Use **topology spread constraints** to ensure pod distribution across failure domains -### 11. Cost Controls +### 9. Cost Controls - Use **Spot node pools** for batch/interruptible workloads (up to 90% savings) - **Stop/Start** dev/test clusters: `az aks stop/start` - Consider **Reserved Instances** or **Savings Plans** for steady-state workloads -### 12. Security -- **Start secure by default**: Deploy supported AKS versions with Microsoft-recommended defaults; keep clusters up to date -- **Use identity everywhere**: Secure access with Microsoft Entra ID for control plane, workloads (Workload Identity), and node access—avoid static credentials -- **Encrypt by default**: Enable encryption at rest for etcd/API server data; use in-transit encryption for node-to-node traffic -- **Lock down supply chain**: Allow only signed, policy-approved container images (use Azure Policy + ImagePolicyWebhook or Ratify) -- **Isolate tenants**: Use namespaces, network policies, and scoped logging to reduce blast radius - ---- - -## Step-by-Step Execution (Agent Behavior) - -### Step 1 — Classify scenario -Identify environment, compliance posture, region/AZ needs, scale, and workload types. - -### Step 2 — Recommend cluster type -Recommend AKS Automatic or Standard with short rationale. -- AKS Automatic intro: https://learn.microsoft.com/azure/aks/intro-aks-automatic - -### Step 3 — Lock networking (Day-0) -Ask: -- Public vs Private API server? -- Pod IP model: overlay vs VNet-routable requirement? -- Egress: LB vs NAT Gateway vs UDR+Firewall? 
- -Reference: Azure CNI Overlay setup: https://learn.microsoft.com/azure/aks/azure-cni-overlay - -### Step 4 — Node pools and compute -Recommend: -- system node pool + user node pools -- separate pools for GPU/batch/stateful if applicable -- capacity planning considerations (max pods per node affects IP planning, upgrades) - -### Step 5 — Configure autoscaling -Recommend: -- HPA for pods -- Cluster Autoscaler / node scaling strategy -- If user wants higher automation, discuss Node Auto Provisioning where available (if asked) - -### Step 6 — Identity and secrets -- Enable Workload Identity: - https://learn.microsoft.com/azure/aks/workload-identity-overview -- Use Key Vault CSI Driver: - https://learn.microsoft.com/azure/aks/csi-secrets-store-driver - -### Step 7 — Policy & safeguards -- Turn on Azure Policy and Deployment Safeguards (warn/enforce). -Docs: https://learn.microsoft.com/azure/aks/deployment-safeguards - -### Step 8 — Observability baseline -- Enable monitoring using Azure Monitor guidance: - https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable -- Managed Prometheus overview: - https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview - -### Step 9 — Upgrades & patching -- Define upgrade approach: - https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices -- Configure node OS upgrade channels: - https://learn.microsoft.com/azure/aks/auto-upgrade-node-os-image -- Configure cluster autoupgrade channels: - https://learn.microsoft.com/azure/aks/auto-upgrade-cluster - -### Step 10 — Cost visibility -- Enable AKS cost analysis add-on (OpenCost-based): - https://learn.microsoft.com/azure/aks/cost-analysis - -Return a final output with: -- recommended config -- Day-0 checklist -- Day-1 checklist -- optional command/IaC skeleton - ---- - ## Guardrails / Safety - Do not request or output secrets (tokens, keys, subscription IDs). 
-- If requirements are ambiguous, propose 2–3 safe options with tradeoffs and choose a conservative default. -- Do not promise zero downtime; advise workload safeguards (PDBs, probes, replicas) and staged upgrades. -- If user asks for actions that require privileged access, provide a plan and commands with placeholders. - ---- - -## Quality Bar -A high-quality answer: -- flags Day-0 irreversible choices (networking, API server access), -- includes identity/secrets/policy defaults (Workload ID + Key Vault CSI + safeguards), -- includes security defaults (Entra ID everywhere, encryption, image signing, network policies), -- recommends availability zones + Standard tier for production, -- includes performance defaults (ephemeral disks, Azure Linux), -- includes observability + upgrade/patching plan, -- addresses cost (spot pools, stop/start for dev). - ---- - -## Quick Reference - -### Common AKS Commands - -| Task | Command | -|------|---------| -| List clusters | `az aks list -o table` | -| Show cluster | `az aks show -n CLUSTER -g RG` | -| Get credentials | `az aks get-credentials -n CLUSTER -g RG` | -| List node pools | `az aks nodepool list --cluster-name CLUSTER -g RG` | -| Scale node pool | `az aks nodepool scale --cluster-name CLUSTER -g RG -n POOL --node-count 5` | -| Upgrade cluster | `az aks upgrade -n CLUSTER -g RG --kubernetes-version VERSION` | - -### kubectl Quick Commands - -| Task | Command | -|------|---------| -| Get pods | `kubectl get pods -A` | -| Get nodes | `kubectl get nodes -o wide` | -| Describe pod | `kubectl describe pod POD -n NAMESPACE` | -| View pod logs | `kubectl logs POD -n NAMESPACE --tail=100` | -| Check events | `kubectl get events --sort-by='.lastTimestamp'` | -| Top nodes | `kubectl top nodes` | - ---- - -## Error Handling - -### Common AKS Issues - -| Issue | Symptom | Resolution | -|-------|---------|------------| -| **Cluster creation fails** | Quota exceeded | Request quota increase or use different VM SKU | -| **Node not 
ready** | NotReady status | Check kubelet logs, node conditions | -| **Pod pending** | Insufficient resources | Scale node pool or check resource requests | -| **Image pull failed** | ImagePullBackOff | Check ACR access, image name, network | -| **API server unreachable** | Connection refused | Check authorized IPs, private cluster config | - ---- - -## References (Public) -- AKS Automatic overview: https://learn.microsoft.com/azure/aks/intro-aks-automatic -- Azure CNI Overlay (setup and parameters): https://learn.microsoft.com/azure/aks/azure-cni-overlay -- Azure CNI powered by Cilium: https://learn.microsoft.com/azure/aks/azure-cni-powered-by-cilium -- Microsoft Entra Workload ID on AKS: https://learn.microsoft.com/azure/aks/workload-identity-overview -- Key Vault provider for Secrets Store CSI Driver: https://learn.microsoft.com/azure/aks/csi-secrets-store-driver -- Deployment Safeguards: https://learn.microsoft.com/azure/aks/deployment-safeguards -- Enable AKS monitoring (Prometheus + Grafana + logs): https://learn.microsoft.com/azure/azure-monitor/containers/kubernetes-monitoring-enable -- Azure Monitor managed Prometheus overview: https://learn.microsoft.com/azure/azure-monitor/metrics/prometheus-metrics-overview -- AKS patch & upgrade practices (Day-2 guidance): https://learn.microsoft.com/azure/architecture/operator-guides/aks/aks-upgrade-practices -- Node OS auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-node-os-image -- Cluster auto-upgrade channels: https://learn.microsoft.com/azure/aks/auto-upgrade-cluster -- AKS cost analysis (OpenCost-based): https://learn.microsoft.com/azure/aks/cost-analysis \ No newline at end of file +- If requirements are ambiguous for day-0 critical decisions, ask the user clarifying questions. For day-1 enabled features, propose 2–3 safe options with tradeoffs and choose a conservative default. 
+- Do not promise zero downtime; advise workload safeguards (PDBs, probes, replicas) and staged upgrades along with best practices for reliability and performance. +- If user asks for actions that require privileged access, provide a plan and commands with placeholders. \ No newline at end of file diff --git a/plugin/skills/azure-kubernetes/references/cli-reference.md b/plugin/skills/azure-kubernetes/references/cli-reference.md new file mode 100644 index 000000000..6f6af3404 --- /dev/null +++ b/plugin/skills/azure-kubernetes/references/cli-reference.md @@ -0,0 +1,33 @@ +# CLI Reference for AKS + +```bash +# List AKS clusters +az aks list --output table + +# Show cluster details +az aks show --name CLUSTER --resource-group RG + +# Get available Kubernetes versions +az aks get-versions --location LOCATION --output table + +# Create AKS Automatic cluster +az aks create --name CLUSTER --resource-group RG --sku automatic \ + --network-plugin azure --network-plugin-mode overlay \ + --enable-oidc-issuer --enable-workload-identity + +# Create AKS Standard cluster +az aks create --name CLUSTER --resource-group RG \ + --node-count 3 --zones 1 2 3 \ + --network-plugin azure --network-plugin-mode overlay \ + --enable-cluster-autoscaler --min-count 1 --max-count 10 + +# Get credentials +az aks get-credentials --name CLUSTER --resource-group RG + +# List node pools +az aks nodepool list --cluster-name CLUSTER --resource-group RG --output table + +# Enable monitoring +az aks enable-addons --name CLUSTER --resource-group RG \ + --addons monitoring --workspace-resource-id WORKSPACE_ID +``` \ No newline at end of file From 8d8b187c1bf7de5ae59a313ea33fc2b6e3f97952 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 15:27:21 -0800 Subject: [PATCH 17/39] Add azure-kubernetes to skills.json --- tests/skills.json | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/skills.json b/tests/skills.json index 7f40120ed..e2d9e908a 100644 --- a/tests/skills.json +++ 
b/tests/skills.json @@ -28,9 +28,5 @@ "0 5 * * 2-6": "microsoft-foundry", "0 8 * * 2-6": "azure-deploy", "0 12 * * 2-6": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-cloud-migrate,azure-compliance,azure-compute,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" -======= - "0 5 * * *": "microsoft-foundry", - "0 8 * * *": "azure-deploy", - "0 12 * * *": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-compliance,azure-compute,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" } } \ No newline at end of file From cfc9cc029ffd62ab6c147b39cbd5517db422703c Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 15:30:48 -0800 Subject: [PATCH 18/39] Fix naming issues --- plugin/skills/azure-kubernetes/SKILL.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index dd160892a..45c81a44d 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -9,7 +9,15 @@ description: "Plan and create production-ready Azure Kubernetes Service (AKS) cl > > This skill produces a **recommended AKS cluster configuration** based on user requirements, distinguishing **Day-0 decisions** (networking, API server — hard to change later) from **Day-1 features** (can enable post-creation). See [CLI reference](./references/cli-reference.md) for commands. 
-## Triggers +## Quick Reference +| Property | Value | +|----------|-------| +| Best for | AKS cluster planning and Day-0 decisions | +| MCP Tools | `mcp_azure_mcp_aks`, `mcp_aks_mcp_az_aks_operations` | +| CLI | `az aks create`, `az aks show` | +| Related skills | azure-diagnostics (troubleshooting), azure-deploy (app deployment) | + +## When to Use This Skill Activate this skill when user wants to: - Create a new AKS cluster - Plan AKS cluster configuration for production workloads @@ -39,7 +47,7 @@ If the user is unsure, use safe defaults. - Upgrade and observability preferences - Cost constraints -## Key Decisions (Defaults when user is unsure) +## Workflow ### 1. Cluster Type - **AKS Automatic** (default): Best for most production workloads, provides a curated experience with pre-configured best practices for security, reliability, and performance. Use unless you have specific custom requirements for networking, autoscaling, or node pool configurations not supported by NAP. From 9d12083156d2452a6e308c11f35b52e7e784b8b3 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 15:46:50 -0800 Subject: [PATCH 19/39] Update trigger and unit tests --- plugin/skills/azure-kubernetes/SKILL.md | 2 +- .../__snapshots__/triggers.test.ts.snap | 88 ++---------- tests/azure-kubernetes/unit.test.ts | 129 ++++++++---------- 3 files changed, 75 insertions(+), 144 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index 45c81a44d..d5ac219de 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -1,6 +1,6 @@ --- name: azure-kubernetes -description: "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. 
Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons. DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization)." +description: "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions and Day-1 configuration, cluster SKUs (Automatic vs Standard), security, monitoring, reliability/performance best practices, upgrades, and networking. WHEN: create AKS cluster, plan AKS configuration, design AKS networking, AKS Automatic vs Standard, AKS security, AKS upgrade strategy, AKS autoscaling, AKS monitoring setup, AKS cost analysis, Day-0 checklist." --- # Azure Kubernetes Service diff --git a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap index fc585f43d..87a08b438 100644 --- a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap +++ b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap @@ -2,23 +2,15 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. 
Covers Day-0 decisions (networking, API server access, pod IP model), Day-1 configuration (identity, secrets, governance, observability), cluster SKUs (Automatic vs Standard), workload identity, Key Vault CSI, Azure Policy, deployment safeguards, monitoring with Prometheus/Grafana, upgrade strategies, and cost analysis. USE FOR: create AKS cluster, AKS cluster planning, AKS networking design, security design, upgrade settings, autoscaling, AKS monitoring, AKS cost analysis, AKS production best practices, AKS Automatic vs Standard, AKS add-ons. DO NOT USE FOR: debugging AKS issues (use azure-diagnostics), deploying applications to AKS (use azure-deploy), creating other Azure resources (use azure-prepare), setting up general monitoring (use azure-observability), general cost optimization strategies (use azure-cost-optimization).", + "description": "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions and Day-1 configuration, cluster SKUs (Automatic vs Standard), security, monitoring, reliability/performance best practices, upgrades, and networking. 
WHEN: create AKS cluster, plan AKS configuration, design AKS networking, AKS Automatic vs Standard, AKS security, AKS upgrade strategy, AKS autoscaling, AKS monitoring setup, AKS cost analysis, Day-0 checklist.", "extractedKeywords": [ - "access", - "add-ons", "aks", "analysis", - "applications", "automatic", "autoscaling", "azure", - "azure-cost-optimization", - "azure-deploy", - "azure-diagnostics", - "azure-observability", - "azure-prepare", "best", - "bicep", + "checklist", "cli", "cluster", "clusters", @@ -27,56 +19,35 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill descri "cost", "covers", "create", - "creating", "day-0", "day-1", - "debugging", "decisions", "deploy", - "deploying", - "deployment", "design", "diagnostic", "entra", - "general", - "governance", - "grafana", "identity", - "issues", "key vault", "kubernetes", "mcp", - "model", "monitor", "monitoring", "networking", "observability", - "optimization", - "other", + "performance", "plan", - "planning", - "policy", "practices", - "production", "production-ready", - "prometheus", - "rbac", - "resources", - "safeguards", - "secrets", + "reliability", "security", - "server", "service", - "setting", - "settings", + "setup", "skus", "standard", - "strategies", - "terraform", + "strategy", "upgrade", - "vault", - "with", - "workload", + "upgrades", + "when", ], "name": "azure-kubernetes", } @@ -84,21 +55,13 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill descri exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill keywords match snapshot 1`] = ` [ - "access", - "add-ons", "aks", "analysis", - "applications", "automatic", "autoscaling", "azure", - "azure-cost-optimization", - "azure-deploy", - "azure-diagnostics", - "azure-observability", - "azure-prepare", "best", - "bicep", + "checklist", "cli", "cluster", "clusters", @@ -107,55 +70,34 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill keywor "cost", 
"covers", "create", - "creating", "day-0", "day-1", - "debugging", "decisions", "deploy", - "deploying", - "deployment", "design", "diagnostic", "entra", - "general", - "governance", - "grafana", "identity", - "issues", "key vault", "kubernetes", "mcp", - "model", "monitor", "monitoring", "networking", "observability", - "optimization", - "other", + "performance", "plan", - "planning", - "policy", "practices", - "production", "production-ready", - "prometheus", - "rbac", - "resources", - "safeguards", - "secrets", + "reliability", "security", - "server", "service", - "setting", - "settings", + "setup", "skus", "standard", - "strategies", - "terraform", + "strategy", "upgrade", - "vault", - "with", - "workload", + "upgrades", + "when", ] `; diff --git a/tests/azure-kubernetes/unit.test.ts b/tests/azure-kubernetes/unit.test.ts index a67c02423..a8593aa15 100644 --- a/tests/azure-kubernetes/unit.test.ts +++ b/tests/azure-kubernetes/unit.test.ts @@ -1,10 +1,10 @@ /** * Unit Tests for azure-kubernetes * - * Test isolated skill logic and validation rules. + * Tests domain invariants - concepts that should always be present + * in AKS cluster planning guidance. 
*/ -import { readFileSync } from "node:fs"; import { loadSkill, LoadedSkill } from "../utils/skill-loader"; const SKILL_NAME = "azure-kubernetes"; @@ -24,99 +24,88 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { expect(skill.metadata.description.length).toBeGreaterThan(10); }); - test("description meets Medium-High compliance length", () => { - // Descriptions should be 150-1024 chars for Medium-High compliance - expect(skill.metadata.description.length).toBeGreaterThan(150); - expect(skill.metadata.description.length).toBeLessThanOrEqual(1024); + test("description mentions AKS or Kubernetes", () => { + const description = skill.metadata.description.toLowerCase(); + expect(description).toMatch(/aks|kubernetes/); + }); + }); + + describe("Day-0 vs Day-1 Guidance", () => { + test("distinguishes Day-0 decisions from Day-1 features", () => { + expect(skill.content).toContain("Day-0"); + expect(skill.content).toContain("Day-1"); }); - test("description contains USE FOR trigger phrases", () => { - const description = skill.metadata.description; - expect(description).toContain("USE FOR:"); + test("identifies networking as hard-to-change decision", () => { + const content = skill.content.toLowerCase(); + // Networking is a Day-0 decision that's hard to change after cluster creation + expect(content).toMatch(/network|cni|pod ip/i); }); + }); - test("description contains DO NOT USE FOR anti-triggers", () => { - const description = skill.metadata.description; - expect(description).toContain("DO NOT USE FOR:"); + describe("Cluster SKU Guidance", () => { + test("covers AKS Automatic vs Standard choice", () => { + expect(skill.content).toContain("Automatic"); + expect(skill.content).toContain("Standard"); }); - test("has AKS-specific trigger keywords", () => { - const description = skill.metadata.description.toLowerCase(); - const hasAKSKeywords = - description.includes("aks") || - description.includes("kubernetes") || - description.includes("cluster"); - 
expect(hasAKSKeywords).toBe(true); + test("recommends AKS Automatic as default for most workloads", () => { + const content = skill.content.toLowerCase(); + // AKS Automatic should be the recommended default + expect(content).toMatch(/automatic.*default|default.*automatic/); }); }); - describe("Skill Content", () => { - test("has substantive content", () => { - expect(skill.content).toBeDefined(); - expect(skill.content.length).toBeGreaterThan(500); + describe("Networking Guidance", () => { + test("covers pod IP model options", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/overlay|vnet|cni/); + }); + + test("mentions egress configuration", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/egress|outbound/); }); - test("contains expected sections", () => { - expect(skill.content).toContain("## Triggers"); - expect(skill.content).toContain("## When to Use"); - expect(skill.content).toContain("## Decision Framework"); - expect(skill.content).toContain("## Step-by-Step Execution"); + test("mentions ingress options", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/ingress|gateway/); }); + }); - test("covers AKS cluster configuration topics", () => { + describe("Security Guidance", () => { + test("recommends Entra ID / managed identity", () => { const content = skill.content.toLowerCase(); - expect(content).toContain("networking"); - expect(content).toContain("identity"); - expect(content).toContain("observability"); + expect(content).toMatch(/entra|workload identity|managed identity/); }); - test("references AKS documentation", () => { - expect(skill.content).toContain("learn.microsoft.com"); + test("mentions secrets management", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/key vault|secret/); }); - test("covers Day-0 and Day-1 planning", () => { - expect(skill.content).toContain("Day-0"); - expect(skill.content).toContain("Day-1"); 
+ test("mentions policy or governance", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/policy|safeguard|governance/); }); }); - describe("Frontmatter Formatting", () => { - test("frontmatter has no tabs", () => { - const raw = readFileSync(skill.filePath, "utf-8"); - const frontmatter = raw.split("---")[1]; - expect(frontmatter).not.toMatch(/\t/); + describe("Observability Guidance", () => { + test("mentions monitoring or observability", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/monitor|observ|prometheus|grafana|insights/); }); + }); - test("frontmatter keys are only supported attributes", () => { - const raw = readFileSync(skill.filePath, "utf-8"); - const frontmatter = raw.split("---")[1]; - const supported = [ - "name", - "description", - "compatibility", - "license", - "metadata", - "argument-hint", - "disable-model-invocation", - "user-invokable", - ]; - const keys = frontmatter - .split("\n") - .filter((l: string) => /^[a-z][\w-]*\s*:/.test(l)) - .map((l: string) => l.split(":")[0].trim()); - for (const key of keys) { - expect(supported).toContain(key); - } + describe("Reliability & Upgrades", () => { + test("mentions availability zones", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/zone|az\b/); }); - test("USE FOR and DO NOT USE FOR are inside description value, not separate keys", () => { - const description = skill.metadata.description; - if (description.includes("USE FOR")) { - expect(description).toContain("USE FOR:"); - } - if (description.includes("DO NOT USE FOR")) { - expect(description).toContain("DO NOT USE FOR:"); - } + test("covers upgrade strategy", () => { + const content = skill.content.toLowerCase(); + expect(content).toMatch(/upgrade|patch|maintenance/); }); }); }); \ No newline at end of file From 14540ae8c59ec618c81f41140a72b732c6e5f78a Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 16:13:21 -0800 Subject: [PATCH 
20/39] Bump azure-prepare version to 1.0.1 --- plugin/skills/azure-prepare/SKILL.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 7253caacc..4896278a1 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -1,10 +1,7 @@ --- name: azure-prepare -description: "Prepare Azure apps for deployment (infra Bicep/Terraform, azure.yaml, Dockerfiles). Use for create/modernize or create+deploy; not cross-cloud migration (use azure-cloud-migrate). WHEN: \"create app\", \"build web app\", \"create API\", \"create serverless HTTP API\", \"create frontend\", \"create back end\", \"build a service\", \"modernize application\", \"update application\", \"add authentication\", \"add caching\", \"host on Azure\", \"create and deploy\", \"deploy to Azure\", \"deploy to Azure using Terraform\", \"deploy to Azure App Service\", \"deploy to Azure App Service using Terraform\", \"deploy to Azure Container Apps\", \"deploy to Azure Container Apps using Terraform\", \"generate Terraform\", \"generate Bicep\", \"function app\", \"timer trigger\", \"service bus trigger\", \"event-driven function\", \"containerized Node.js app\", \"social media app\", \"static portfolio website\", \"todo list with frontend and API\", \"prepare my Azure application to use Key Vault\", \"managed identity\"." -license: MIT -metadata: - author: Microsoft - version: "1.0.3" +version: 1.0.1 +description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead. Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. 
WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." --- # Azure Prepare From f6fcda1246a9a0dce84cadcd1b153650ef849e64 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 16:46:34 -0800 Subject: [PATCH 21/39] Fix metadata.version --- plugin/skills/azure-prepare/SKILL.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 4896278a1..3aef1a1fa 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -1,6 +1,8 @@ --- name: azure-prepare -version: 1.0.1 +metadata: + author: Microsoft + version: "1.0.1" description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead. Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. 
WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." --- From f6dc9963a70066ab057c8fd87da54eb43c7d45a0 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 16:50:48 -0800 Subject: [PATCH 22/39] Add metadata to azure-kubernetes skill --- plugin/skills/azure-kubernetes/SKILL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index d5ac219de..f03437893 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -1,5 +1,8 @@ --- name: azure-kubernetes +metadata: + author: Microsoft + version: "1.0.0" description: "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions and Day-1 configuration, cluster SKUs (Automatic vs Standard), security, monitoring, reliability/performance best practices, upgrades, and networking. WHEN: create AKS cluster, plan AKS configuration, design AKS networking, AKS Automatic vs Standard, AKS security, AKS upgrade strategy, AKS autoscaling, AKS monitoring setup, AKS cost analysis, Day-0 checklist." 
--- From 453c478f4cf44374cfdd6b588f4fbf92d186ffcb Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 17:19:22 -0800 Subject: [PATCH 23/39] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../azure-kubernetes/references/cli-reference.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/plugin/skills/azure-kubernetes/references/cli-reference.md b/plugin/skills/azure-kubernetes/references/cli-reference.md index 6f6af3404..5969dcd6c 100644 --- a/plugin/skills/azure-kubernetes/references/cli-reference.md +++ b/plugin/skills/azure-kubernetes/references/cli-reference.md @@ -5,29 +5,29 @@ az aks list --output table # Show cluster details -az aks show --name CLUSTER --resource-group RG +az aks show --name <CLUSTER> --resource-group <RG> # Get available Kubernetes versions -az aks get-versions --location LOCATION --output table +az aks get-versions --location <LOCATION> --output table # Create AKS Automatic cluster -az aks create --name CLUSTER --resource-group RG --sku automatic \ +az aks create --name <CLUSTER> --resource-group <RG> --sku automatic \ --network-plugin azure --network-plugin-mode overlay \ --enable-oidc-issuer --enable-workload-identity # Create AKS Standard cluster -az aks create --name CLUSTER --resource-group RG \ +az aks create --name <CLUSTER> --resource-group <RG> \ --node-count 3 --zones 1 2 3 \ --network-plugin azure --network-plugin-mode overlay \ --enable-cluster-autoscaler --min-count 1 --max-count 10 # Get credentials -az aks get-credentials --name CLUSTER --resource-group RG +az aks get-credentials --name <CLUSTER> --resource-group <RG> # List node pools -az aks nodepool list --cluster-name CLUSTER --resource-group RG --output table +az aks nodepool list --cluster-name <CLUSTER> --resource-group <RG> --output table # Enable monitoring -az aks enable-addons --name CLUSTER --resource-group RG \ - --addons monitoring --workspace-resource-id WORKSPACE_ID +az aks enable-addons --name <CLUSTER> --resource-group <RG> \ + --addons monitoring
--workspace-resource-id <WORKSPACE_ID> ``` \ No newline at end of file From 0436b19f8e98976446c5d8f359e8e4425212febc Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 2 Mar 2026 17:20:33 -0800 Subject: [PATCH 24/39] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- plugin/skills/azure-prepare/references/architecture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin/skills/azure-prepare/references/architecture.md b/plugin/skills/azure-prepare/references/architecture.md index c2f917748..44b57c0f1 100644 --- a/plugin/skills/azure-prepare/references/architecture.md +++ b/plugin/skills/azure-prepare/references/architecture.md @@ -58,7 +58,7 @@ Select hosting stack and map components to Azure services. | SSR Web App | Container Apps | App Service, AKS | | REST/GraphQL API | Container Apps | App Service, Functions, AKS | | Background Worker | Container Apps | Functions, AKS | -| Scheduled Task | Functions (Timer) | Container Apps Jobs, AKS CronJob | +| Scheduled Task | Functions (Timer) | Container Apps Jobs, Kubernetes CronJob (on AKS) | | Event Processor | Functions | Container Apps, AKS + KEDA | | Microservices (full K8s) | AKS | Container Apps | | GPU/ML Workloads | AKS | Azure ML | From c65c53f3e62c03a0cd6f01b20d9f204436b09181 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 4 Mar 2026 10:34:26 -0800 Subject: [PATCH 25/39] Bump azure-prepare skill version --- plugin/skills/azure-prepare/SKILL.md | 1 - 1 file changed, 1 deletion(-) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 3aef1a1fa..0bc0812f6 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -3,7 +3,6 @@ name: azure-prepare metadata: author: Microsoft version: "1.0.1" -description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead.
Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." --- # Azure Prepare From 1bf5c35f705b823e90165a418773790ca096d9dc Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 4 Mar 2026 16:03:14 -0800 Subject: [PATCH 26/39] Revert pr.yml --- .github/workflows/pr.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 973c2ea04..e517bd1f8 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -235,7 +235,6 @@ jobs: for file in ${{ steps.changed-skills.outputs.all_changed_files }}; do SKILL_FILES="$SKILL_FILES ../$file" done - SKILLS="${SKILLS# }" # Run frontmatter spec validation if OUTPUT=$(npm run frontmatter -- $SKILL_FILES 2>&1); then From 3e578e0e82b45f8d4384dbc4b95c1aea0333f74e Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 4 Mar 2026 16:14:35 -0800 Subject: [PATCH 27/39] Add license to AKS skill --- plugin/skills/azure-kubernetes/SKILL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index f03437893..9174cddb5 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -1,5 +1,6 @@ --- name: azure-kubernetes +license: MIT 
metadata: author: Microsoft version: "1.0.0" From 9144b03c90aa1a09e73613fcb74fc1df313b990d Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 4 Mar 2026 16:33:59 -0800 Subject: [PATCH 28/39] Add back azure-prepare description --- plugin/skills/azure-prepare/SKILL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 0bc0812f6..3aef1a1fa 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -3,6 +3,7 @@ name: azure-prepare metadata: author: Microsoft version: "1.0.1" +description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead. Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." 
--- # Azure Prepare From 3294051b2874ed3868fff4076fa8b0858728ae2b Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 4 Mar 2026 16:34:50 -0800 Subject: [PATCH 29/39] Bump azure-prepare version to 1.0.4 --- plugin/skills/azure-prepare/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 3aef1a1fa..5e7f8a246 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -2,7 +2,7 @@ name: azure-prepare metadata: author: Microsoft - version: "1.0.1" + version: "1.0.4" description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead. Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." 
--- From 4c6c56aeae5a57c37faaac8bfc741c563dd0ee79 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 4 Mar 2026 16:36:01 -0800 Subject: [PATCH 30/39] Add back license to azure-prepare --- plugin/skills/azure-prepare/SKILL.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 5e7f8a246..8b016535d 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -1,9 +1,10 @@ --- name: azure-prepare +description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead. Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." +license: MIT metadata: author: Microsoft version: "1.0.4" -description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead. Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. 
WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." --- # Azure Prepare From 02fa8315d18213d30a73320d9667b388f20711ef Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 4 Mar 2026 16:36:27 -0800 Subject: [PATCH 31/39] Fix description --- plugin/skills/azure-prepare/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 8b016535d..4d2fe0ff5 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -1,6 +1,6 @@ --- name: azure-prepare -description: "Default entry point for Azure application development EXCEPT cross-cloud migration — use azure-cloud-migrate instead. Analyzes your project and prepares it for Azure deployment by generating infrastructure code (Bicep/Terraform), azure.yaml, and Dockerfiles. 
WHEN: \"create an app\", \"build a web app\", \"create API\", \"create frontend\", \"create backend\", \"add a feature\", \"build a service\", \"develop a project\", \"modernize my code\", \"update my application\", \"add database\", \"add authentication\", \"add caching\", \"deploy to Azure\", \"host on Azure\", \"Azure with terraform\", \"Azure with azd\", \"generate azure.yaml\", \"generate Bicep\", \"generate Terraform\", \"create Azure Functions app\", \"create serverless HTTP API\", \"create function app\", \"create event-driven function\", \"create and deploy to Azure\", \"create Azure Functions and deploy\", \"create function app and deploy\"." +description: "Prepare Azure apps for deployment (infra Bicep/Terraform, azure.yaml, Dockerfiles). Use for create/modernize or create+deploy; not cross-cloud migration (use azure-cloud-migrate). WHEN: \"create app\", \"build web app\", \"create API\", \"create serverless HTTP API\", \"create frontend\", \"create back end\", \"build a service\", \"modernize application\", \"update application\", \"add authentication\", \"add caching\", \"host on Azure\", \"create and deploy\", \"deploy to Azure\", \"deploy to Azure using Terraform\", \"deploy to Azure App Service\", \"deploy to Azure App Service using Terraform\", \"deploy to Azure Container Apps\", \"deploy to Azure Container Apps using Terraform\", \"generate Terraform\", \"generate Bicep\", \"function app\", \"timer trigger\", \"service bus trigger\", \"event-driven function\", \"containerized Node.js app\", \"social media app\", \"static portfolio website\", \"todo list with frontend and API\", \"prepare my Azure application to use Key Vault\", \"managed identity\"." 
license: MIT metadata: author: Microsoft From 6dbda3620874cb3599edea1c7db3f0c22025c68c Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 9 Mar 2026 16:09:28 -0700 Subject: [PATCH 32/39] Fix Copilot feedback --- plugin/skills/azure-kubernetes/SKILL.md | 21 +- .../__snapshots__/triggers.test.ts.snap | 2 + tests/azure-kubernetes/integration.test.ts | 325 +++++------------- tests/azure-kubernetes/triggers.test.ts | 126 +++++-- tests/azure-kubernetes/unit.test.ts | 174 +++++++--- tests/skills.json | 1 - 6 files changed, 323 insertions(+), 326 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index 9174cddb5..f977ee446 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -36,7 +36,7 @@ Activate this skill when user wants to: ## Rules 1. Start with the user's requirements for provisioning compute, networking, security, and other settings. -2. Use the AKS MCP server for invoking Azure API and kubectl commands when applicable during the cluster setup and operations processes. +2. Use the `azure` MCP server and its AKS-related MCP tools (`mcp_azure_mcp_aks`, `mcp_aks_mcp_az_aks_operations`) to invoke Azure APIs and perform AKS and kubectl operations; fall back to Azure CLI (`az aks`) only when required functionality is not available via MCP tools. 3. Determine if AKS Automatic or Standard SKU is more appropriate based on the user's need for control vs convenience. Default to AKS Automatic unless specific customizations are required. 4. Document decisions and rationale for cluster configuration choices, especially for Day-0 decisions that are hard to change later (networking, API server access). @@ -125,4 +125,21 @@ If the user is unsure, use safe defaults. - Do not request or output secrets (tokens, keys, subscription IDs). - If requirements are ambiguous for day-0 critical decisions, ask the user clarifying questions. 
For day-1 enabled features, propose 2–3 safe options with tradeoffs and choose a conservative default. - Do not promise zero downtime; advise workload safeguards (PDBs, probes, replicas) and staged upgrades along with best practices for reliability and performance. -- If user asks for actions that require privileged access, provide a plan and commands with placeholders. \ No newline at end of file +- If user asks for actions that require privileged access, provide a plan and commands with placeholders. + +## MCP Tools +| Tool | Purpose | Key Parameters | +|------|---------|----------------| +| `mcp_azure_mcp_aks` | Query AKS clusters at subscription scope | `subscription_id`, `resource_group` | +| `mcp_aks_mcp_az_aks_operations` | Cluster operations: show, list, get-versions, nodepool management | `cluster_name`, `resource_group`, `operation` | +| `mcp_aks_mcp_kubectl_resources` | Get/describe pods, deployments, services | `resource_type`, `namespace`, `name` | +| `mcp_aks_mcp_kubectl_diagnostics` | Logs, events, top, exec | `pod_name`, `namespace`, `command` | + +## Error Handling +| Error / Symptom | Likely Cause | Remediation | +|-----------------|--------------|-------------| +| MCP tool call fails or times out | Invalid credentials, subscription, or cluster context | Verify `az login`, check subscription ID and resource group | +| Cluster creation blocked by policy | Azure Policy denying configuration | Review policy assignments, adjust cluster settings to comply | +| Quota exceeded | Regional vCPU or resource limits | Request quota increase or select different region/VM SKU | +| Networking conflict (IP exhaustion) | Pod subnet too small for overlay/CNI | Re-plan IP ranges; may require cluster recreation (Day-0) | +| Workload Identity not working | Missing OIDC issuer or federated credential | Enable `--enable-oidc-issuer --enable-workload-identity`, configure federated identity | diff --git a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap 
b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap index 87a08b438..6a0dbc2fb 100644 --- a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap +++ b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap @@ -26,6 +26,7 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill descri "design", "diagnostic", "entra", + "function", "identity", "key vault", "kubernetes", @@ -77,6 +78,7 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill keywor "design", "diagnostic", "entra", + "function", "identity", "key vault", "kubernetes", diff --git a/tests/azure-kubernetes/integration.test.ts b/tests/azure-kubernetes/integration.test.ts index 2e66b24f8..7f5381d3c 100644 --- a/tests/azure-kubernetes/integration.test.ts +++ b/tests/azure-kubernetes/integration.test.ts @@ -11,261 +11,120 @@ import { useAgentRunner, - shouldSkipIntegrationTests, - getIntegrationSkipReason + doesAssistantMessageIncludeKeyword, + shouldSkipIntegrationTests } from "../utils/agent-runner"; -import { softCheckSkill } from "../utils/evaluate"; +import { isSkillInvoked, softCheckSkill } from "../utils/evaluate"; const SKILL_NAME = "azure-kubernetes"; -const RUNS_PER_PROMPT = 5; +const RUNS_PER_PROMPT = 2; -// Check if integration tests should be skipped at module level const skipTests = shouldSkipIntegrationTests(); -const skipReason = getIntegrationSkipReason(); - -// Log skip reason if skipping -if (skipTests && skipReason) { - console.log(`⏭️ Skipping integration tests: ${skipReason}`); -} const describeIntegration = skipTests ? 
describe.skip : describe; -describeIntegration(`${SKILL_NAME}_ - Integration Tests`, () => { +describeIntegration(`${SKILL_NAME} - Integration Tests`, () => { const agent = useAgentRunner(); - describe("skill-invocation", () => { - test("invokes azure-kubernetes skill for AKS cluster creation prompt", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "Help me create a production-ready AKS cluster with best practices" - }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for AKS networking prompt", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "What networking options should I choose for my AKS cluster?" - }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for AKS Automatic vs Standard", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "Should I use AKS Automatic or AKS Standard for my production workloads?" 
- }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for golden path AKS setup", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "What is the recommended golden path for setting up an AKS cluster?" - }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for AKS security best practices", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "What are the security best practices for AKS clusters?" - }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for AKS performance optimization", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "How do I optimize performance for my Azure Kubernetes workloads?" 
- }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for AKS reliability patterns", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "What reliability best practices should I follow for AKS?" - }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; + test("invokes azure-kubernetes skill for AKS cluster creation prompt", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Help me create a production-ready AKS cluster with best practices" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; } + throw e; } - }); - - test("invokes azure-kubernetes skill for workload identity setup", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "How do I configure workload identity for my AKS cluster?" 
- }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for AKS monitoring setup", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "Set up monitoring for my Azure Kubernetes cluster with Prometheus and Grafana" - }); - - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; - } - } - }); - - test("invokes azure-kubernetes skill for AKS upgrade strategy", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "What is the best upgrade strategy for AKS clusters in production?" - }); + } + }); - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; + test("responds with Day-0 vs Day-1 guidance", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "What Day-0 decisions do I need to make for AKS?" 
+ }); + + const hasDay0Content = doesAssistantMessageIncludeKeyword(agentMetadata, "tier") || + doesAssistantMessageIncludeKeyword(agentMetadata, "networking") || + doesAssistantMessageIncludeKeyword(agentMetadata, "API server"); + expect(hasDay0Content).toBe(true); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; } + throw e; } - }); - - test("invokes azure-kubernetes skill for AKS autoscaling configuration", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "How do I configure cluster autoscaler and KEDA for my AKS cluster?" - }); + } + }); - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; + test("recommends AKS Automatic vs Standard appropriately", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Should I use AKS Automatic or Standard for my production workload?" 
+ }); + + const hasSkuGuidance = doesAssistantMessageIncludeKeyword(agentMetadata, "Automatic") || + doesAssistantMessageIncludeKeyword(agentMetadata, "Standard"); + expect(hasSkuGuidance).toBe(true); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; } + throw e; } - }); - - test("invokes azure-kubernetes skill for AKS deployment safeguards", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "Configure deployment safeguards and Azure Policy for my AKS cluster" - }); + } + }); - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; + test("provides networking recommendations", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "How should I configure AKS networking for pods that need VNet-routable IPs?" + }); + + const hasNetworkingContent = doesAssistantMessageIncludeKeyword(agentMetadata, "CNI") || + doesAssistantMessageIncludeKeyword(agentMetadata, "overlay") || + doesAssistantMessageIncludeKeyword(agentMetadata, "VNet"); + expect(hasNetworkingContent).toBe(true); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; } + throw e; } - }); - - test("invokes azure-kubernetes skill for AKS node pool sizing", async () => { - for (let i = 0; i < RUNS_PER_PROMPT; i++) { - try { - const agentMetadata = await agent.run({ - prompt: "How should I size my AKS node pools for a microservices application?" 
- }); + } + }); - softCheckSkill(agentMetadata, SKILL_NAME); - } catch (e: unknown) { - if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { - console.log("⏭️ SDK not loadable, skipping test"); - return; - } - throw e; + test("covers security best practices", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "What security best practices should I follow for AKS?" + }); + + const hasSecurityContent = doesAssistantMessageIncludeKeyword(agentMetadata, "identity") || + doesAssistantMessageIncludeKeyword(agentMetadata, "Entra") || + doesAssistantMessageIncludeKeyword(agentMetadata, "workload") || + doesAssistantMessageIncludeKeyword(agentMetadata, "Key Vault"); + expect(hasSecurityContent).toBe(true); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; } + throw e; } - }); + } }); -}); \ No newline at end of file +}); diff --git a/tests/azure-kubernetes/triggers.test.ts b/tests/azure-kubernetes/triggers.test.ts index d7e3ed39e..b2e8145c2 100644 --- a/tests/azure-kubernetes/triggers.test.ts +++ b/tests/azure-kubernetes/triggers.test.ts @@ -3,8 +3,6 @@ * * Tests that verify the skill triggers on appropriate prompts * and does NOT trigger on unrelated prompts. - * - * Uses snapshot testing + parameterized tests for comprehensive coverage. 
*/ import { TriggerMatcher } from "../utils/trigger-matcher"; @@ -22,42 +20,96 @@ describe(`${SKILL_NAME} - Trigger Tests`, () => { }); describe("Should Trigger", () => { - // Prompts that SHOULD trigger this skill - include multiple keywords + // Common customer prompts for AKS cluster planning and creation const shouldTriggerPrompts: string[] = [ - "Create an Azure Kubernetes cluster for production", - "Set up AKS cluster with Azure networking", - "Configure Azure AKS cluster autoscaling", - "Plan Azure Kubernetes workload identity setup", - "Azure AKS Automatic vs Standard cluster", - "Set up Azure Kubernetes monitoring with Prometheus", - "Configure Azure AKS deployment safeguards", - "Azure Kubernetes cluster upgrade strategy", + // Cluster creation + "Help me create an AKS cluster", + "I need to set up a new Kubernetes cluster on Azure", + "Create a production-ready AKS cluster with best practices", + "How do I provision an AKS cluster for my team?", + + // Day-0 decisions + "What networking options should I choose for AKS?", + "AKS Day-0 checklist", + "Plan AKS configuration for production", + "Design AKS networking with private API server", + + // SKU selection + "What's the difference between AKS Automatic and Standard?", + "Should I use AKS Automatic or Standard SKU?", + "Help me choose the right AKS cluster SKU", + + // Networking + "Configure AKS with Azure CNI Overlay", + "How do I set up private AKS cluster?", + "AKS egress configuration options", + + // Security + "Configure AKS with workload identity", + "Set up Azure Policy for AKS", + "Set up Key Vault CSI driver for AKS", + "Enable Deployment Safeguards for AKS", + "How do I secure my AKS cluster?", + + // Operations + "Enable monitoring for my AKS cluster", + "Configure AKS upgrade strategy", + "How do I set up AKS autoscaling?", + "AKS cost analysis", + "Configure AKS cluster autoscaling and node pools", ]; - test.each(shouldTriggerPrompts)('triggers on: "%s"', (prompt) => { - const result = 
triggerMatcher.shouldTrigger(prompt); - expect(result.triggered).toBe(true); - }); + test.each(shouldTriggerPrompts)( + 'triggers on: "%s"', + (prompt) => { + const result = triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(true); + } + ); }); describe("Should NOT Trigger", () => { - // Prompts that should NOT trigger this skill (avoid Azure/kubernetes/AKS keywords) - const shouldNotTriggerPrompts: string[] = [ + // Generic prompts unrelated to Azure/Kubernetes + const genericPrompts: string[] = [ "What is the weather today?", "Help me write a poem", "Explain quantum computing", - "Help me with AWS EKS", - "How do I use Google GKE?", "Write a Python script to parse JSON", - "What is the capital of France?", - "Configure my local Docker container", - "Set up PostgreSQL database locally", + "How do I bake a cake?", ]; - test.each(shouldNotTriggerPrompts)('does not trigger on: "%s"', (prompt) => { - const result = triggerMatcher.shouldTrigger(prompt); - expect(result.triggered).toBe(false); - }); + // Competing cloud providers (without "AKS" or "Azure" keywords) + const otherCloudPrompts: string[] = [ + "How do I use AWS EKS?", + "Help me with GCP GKE", + "Help me with AWS Lambda", + "How do I use Google Cloud Platform?", + "Set up EC2 instances", + "Configure S3 bucket policies", + ]; + + // Generic infrastructure prompts (without Azure keywords) + const genericInfraPrompts: string[] = [ + "Set up a PostgreSQL database", + "Configure nginx load balancer", + "How do I use Docker Compose?", + "Set up Redis caching", + "Configure SSL certificates", + ]; + + const shouldNotTriggerPrompts = [ + ...genericPrompts, + ...otherCloudPrompts, + ...genericInfraPrompts, + ]; + + test.each(shouldNotTriggerPrompts)( + 'does not trigger on: "%s"', + (prompt) => { + const result = triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(false); + } + ); }); describe("Trigger Keywords Snapshot", () => { @@ -69,27 +121,25 @@ describe(`${SKILL_NAME} - 
Trigger Tests`, () => { expect({ name: skill.metadata.name, description: skill.metadata.description, - extractedKeywords: triggerMatcher.getKeywords(), + extractedKeywords: triggerMatcher.getKeywords() }).toMatchSnapshot(); }); }); describe("Edge Cases", () => { - test("handles empty prompt", () => { - const result = triggerMatcher.shouldTrigger(""); - expect(result.triggered).toBe(false); + test("handles mixed case input", () => { + const result = triggerMatcher.shouldTrigger("CREATE AN AKS CLUSTER"); + expect(result.triggered).toBe(true); }); - test("handles very long prompt", () => { - const longPrompt = "AKS cluster Azure ".repeat(1000); - const result = triggerMatcher.shouldTrigger(longPrompt); - expect(typeof result.triggered).toBe("boolean"); + test("handles partial matches", () => { + const result = triggerMatcher.shouldTrigger("kubernetes on azure"); + expect(result.triggered).toBe(true); }); - test("is case insensitive", () => { - const result1 = triggerMatcher.shouldTrigger("azure kubernetes cluster"); - const result2 = triggerMatcher.shouldTrigger("AZURE KUBERNETES CLUSTER"); - expect(result1.triggered).toBe(result2.triggered); + test("handles empty prompt", () => { + const result = triggerMatcher.shouldTrigger(""); + expect(result.triggered).toBe(false); }); }); }); diff --git a/tests/azure-kubernetes/unit.test.ts b/tests/azure-kubernetes/unit.test.ts index a8593aa15..3345976a5 100644 --- a/tests/azure-kubernetes/unit.test.ts +++ b/tests/azure-kubernetes/unit.test.ts @@ -1,8 +1,8 @@ /** * Unit Tests for azure-kubernetes * - * Tests domain invariants - concepts that should always be present - * in AKS cluster planning guidance. + * Tests skill content and structure without requiring external services. + * Focuses on domain invariants rather than exact formatting. 
*/ import { loadSkill, LoadedSkill } from "../utils/skill-loader"; @@ -17,95 +17,165 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { }); describe("Skill Metadata", () => { - test("has valid SKILL.md with required fields", () => { - expect(skill.metadata).toBeDefined(); - expect(skill.metadata.name).toBe(SKILL_NAME); + test("has required frontmatter fields", () => { + expect(skill.metadata.name).toBe("azure-kubernetes"); expect(skill.metadata.description).toBeDefined(); - expect(skill.metadata.description.length).toBeGreaterThan(10); + expect(skill.metadata.description.length).toBeGreaterThan(50); }); - test("description mentions AKS or Kubernetes", () => { - const description = skill.metadata.description.toLowerCase(); - expect(description).toMatch(/aks|kubernetes/); + test("description contains WHEN triggers", () => { + expect(skill.metadata.description).toMatch(/WHEN:/i); + }); + + test("description mentions key AKS concepts", () => { + const desc = skill.metadata.description.toLowerCase(); + expect(desc).toMatch(/aks|kubernetes/); + expect(desc).toMatch(/cluster/); }); }); describe("Day-0 vs Day-1 Guidance", () => { - test("distinguishes Day-0 decisions from Day-1 features", () => { - expect(skill.content).toContain("Day-0"); - expect(skill.content).toContain("Day-1"); + test("distinguishes between Day-0 and Day-1 decisions", () => { + expect(skill.content).toMatch(/Day-0/i); + expect(skill.content).toMatch(/Day-1/i); + }); + + test("identifies networking as Day-0 decision", () => { + expect(skill.content).toMatch(/networking.*day-0|day-0.*networking/i); }); - test("identifies networking as hard-to-change decision", () => { - const content = skill.content.toLowerCase(); - // Networking is a Day-0 decision that's hard to change after cluster creation - expect(content).toMatch(/network|cni|pod ip/i); + test("identifies API server access as Day-0 consideration", () => { + expect(skill.content).toMatch(/api server/i); }); }); describe("Cluster SKU Guidance", () => 
{ - test("covers AKS Automatic vs Standard choice", () => { - expect(skill.content).toContain("Automatic"); - expect(skill.content).toContain("Standard"); + test("covers AKS Automatic SKU", () => { + expect(skill.content).toMatch(/AKS Automatic/i); + }); + + test("covers AKS Standard SKU", () => { + expect(skill.content).toMatch(/AKS Standard/i); }); - test("recommends AKS Automatic as default for most workloads", () => { - const content = skill.content.toLowerCase(); - // AKS Automatic should be the recommended default - expect(content).toMatch(/automatic.*default|default.*automatic/); + test("recommends Automatic as default", () => { + expect(skill.content).toMatch(/automatic.*default|default.*automatic/i); }); }); describe("Networking Guidance", () => { - test("covers pod IP model options", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/overlay|vnet|cni/); + test("covers Azure CNI options", () => { + expect(skill.content).toMatch(/Azure CNI/i); }); - test("mentions egress configuration", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/egress|outbound/); + test("covers overlay networking", () => { + expect(skill.content).toMatch(/overlay/i); }); - test("mentions ingress options", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/ingress|gateway/); + test("covers egress patterns", () => { + expect(skill.content).toMatch(/egress/i); + }); + + test("covers ingress options", () => { + expect(skill.content).toMatch(/ingress/i); }); }); - describe("Security Guidance", () => { - test("recommends Entra ID / managed identity", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/entra|workload identity|managed identity/); + describe("Security Best Practices", () => { + test("recommends Entra ID / Azure AD", () => { + expect(skill.content).toMatch(/entra|azure ad/i); }); - test("mentions secrets management", () => { - const content = 
skill.content.toLowerCase(); - expect(content).toMatch(/key vault|secret/); + test("recommends Workload Identity", () => { + expect(skill.content).toMatch(/workload identity/i); }); - test("mentions policy or governance", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/policy|safeguard|governance/); + test("recommends Key Vault integration", () => { + expect(skill.content).toMatch(/key vault/i); + }); + + test("warns against static credentials", () => { + expect(skill.content).toMatch(/avoid.*static|static.*credential/i); + }); + + test("mentions Azure Policy", () => { + expect(skill.content).toMatch(/azure policy/i); }); }); describe("Observability Guidance", () => { - test("mentions monitoring or observability", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/monitor|observ|prometheus|grafana|insights/); + test("mentions monitoring options", () => { + expect(skill.content).toMatch(/monitor|observability/i); + }); + + test("mentions Prometheus", () => { + expect(skill.content).toMatch(/prometheus/i); + }); + + test("mentions Grafana", () => { + expect(skill.content).toMatch(/grafana/i); }); }); - describe("Reliability & Upgrades", () => { - test("mentions availability zones", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/zone|az\b/); + describe("Reliability Patterns", () => { + test("recommends availability zones", () => { + expect(skill.content).toMatch(/availability zone|--zones/i); + }); + + test("mentions PodDisruptionBudgets", () => { + expect(skill.content).toMatch(/poddisruptionbudget|pdb/i); }); test("covers upgrade strategy", () => { - const content = skill.content.toLowerCase(); - expect(content).toMatch(/upgrade|patch|maintenance/); + expect(skill.content).toMatch(/upgrade/i); + }); + + test("mentions maintenance windows", () => { + expect(skill.content).toMatch(/maintenance window/i); + }); + }); + + describe("Performance Recommendations", () => { + 
test("recommends ephemeral OS disks", () => { + expect(skill.content).toMatch(/ephemeral.*disk|--node-osdisk-type ephemeral/i); + }); + + test("warns against B-series VMs", () => { + expect(skill.content).toMatch(/avoid.*b-series|b-series.*avoid/i); + }); + + test("mentions autoscaling", () => { + expect(skill.content).toMatch(/autoscal|cluster.?autoscaler/i); + }); + }); + + describe("MCP Tools Section", () => { + test("lists MCP tools", () => { + expect(skill.content).toMatch(/mcp_azure_mcp_aks|mcp_aks_mcp/i); + }); + + test("has MCP Tools section", () => { + expect(skill.content).toMatch(/## MCP Tools/i); + }); + }); + + describe("Error Handling Section", () => { + test("has Error Handling section", () => { + expect(skill.content).toMatch(/## Error Handling/i); + }); + + test("includes remediation guidance", () => { + expect(skill.content).toMatch(/remediation|quota|policy/i); + }); + }); + + describe("Guardrails", () => { + test("warns about secrets handling", () => { + expect(skill.content).toMatch(/secret|token|key/i); + }); + + test("does not promise zero downtime", () => { + expect(skill.content).toMatch(/do not promise zero downtime/i); }); }); -}); \ No newline at end of file +}); diff --git a/tests/skills.json b/tests/skills.json index e2d9e908a..c64da42ed 100644 --- a/tests/skills.json +++ b/tests/skills.json @@ -24,7 +24,6 @@ "microsoft-foundry" ], "integrationTestSchedule": { -<<<<<<< HEAD "0 5 * * 2-6": "microsoft-foundry", "0 8 * * 2-6": "azure-deploy", "0 12 * * 2-6": "appinsights-instrumentation,azure-ai,azure-aigateway,azure-cloud-migrate,azure-compliance,azure-compute,azure-cost-optimization,azure-diagnostics,azure-hosted-copilot-sdk,azure-kubernetes,azure-kusto,azure-messaging,azure-observability,azure-prepare,azure-rbac,azure-resource-lookup,azure-resource-visualizer,azure-storage,azure-validate,entra-app-registration" From 078bd9d9e5e14c7634283ab7fb9b4164f9ce7037 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Mon, 9 Mar 2026 23:58:09 -0700 
Subject: [PATCH 33/39] Update description and tests --- plugin/skills/azure-kubernetes/SKILL.md | 9 +-- .../__snapshots__/triggers.test.ts.snap | 58 +++++++++++++------ tests/azure-kubernetes/integration.test.ts | 4 +- tests/azure-kubernetes/unit.test.ts | 3 +- 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index f977ee446..02ed456bd 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -4,7 +4,7 @@ license: MIT metadata: author: Microsoft version: "1.0.0" -description: "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions and Day-1 configuration, cluster SKUs (Automatic vs Standard), security, monitoring, reliability/performance best practices, upgrades, and networking. WHEN: create AKS cluster, plan AKS configuration, design AKS networking, AKS Automatic vs Standard, AKS security, AKS upgrade strategy, AKS autoscaling, AKS monitoring setup, AKS cost analysis, Day-0 checklist." +description: "Plan, create, and configure production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 checklist, SKU selection (Automatic vs Standard), networking options (private API server, Azure CNI Overlay, egress configuration), security (workload identity, Azure Policy, Key Vault CSI driver, Deployment Safeguards), and operations (monitoring, upgrade strategy, autoscaling, cost analysis, node pools). WHEN: provision AKS cluster, design AKS networking, choose AKS SKU, secure AKS, set up AKS." --- # Azure Kubernetes Service @@ -125,21 +125,16 @@ If the user is unsure, use safe defaults. - Do not request or output secrets (tokens, keys, subscription IDs). - If requirements are ambiguous for day-0 critical decisions, ask the user clarifying questions. For day-1 enabled features, propose 2–3 safe options with tradeoffs and choose a conservative default. 
- Do not promise zero downtime; advise workload safeguards (PDBs, probes, replicas) and staged upgrades along with best practices for reliability and performance. -- If user asks for actions that require privileged access, provide a plan and commands with placeholders. ## MCP Tools | Tool | Purpose | Key Parameters | |------|---------|----------------| -| `mcp_azure_mcp_aks` | Query AKS clusters at subscription scope | `subscription_id`, `resource_group` | -| `mcp_aks_mcp_az_aks_operations` | Cluster operations: show, list, get-versions, nodepool management | `cluster_name`, `resource_group`, `operation` | -| `mcp_aks_mcp_kubectl_resources` | Get/describe pods, deployments, services | `resource_type`, `namespace`, `name` | -| `mcp_aks_mcp_kubectl_diagnostics` | Logs, events, top, exec | `pod_name`, `namespace`, `command` | +| `mcp_azure_mcp_aks` | Create and query AKS clusters at subscription scope | `subscription_id`, `resource_group` | ## Error Handling | Error / Symptom | Likely Cause | Remediation | |-----------------|--------------|-------------| | MCP tool call fails or times out | Invalid credentials, subscription, or cluster context | Verify `az login`, check subscription ID and resource group | -| Cluster creation blocked by policy | Azure Policy denying configuration | Review policy assignments, adjust cluster settings to comply | | Quota exceeded | Regional vCPU or resource limits | Request quota increase or select different region/VM SKU | | Networking conflict (IP exhaustion) | Pod subnet too small for overlay/CNI | Re-plan IP ranges; may require cluster recreation (Day-0) | | Workload Identity not working | Missing OIDC issuer or federated credential | Enable `--enable-oidc-issuer --enable-workload-identity`, configure federated identity | diff --git a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap index 6a0dbc2fb..ca0cc9a29 100644 --- 
a/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap +++ b/tests/azure-kubernetes/__snapshots__/triggers.test.ts.snap @@ -2,29 +2,31 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Plan and create production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 decisions and Day-1 configuration, cluster SKUs (Automatic vs Standard), security, monitoring, reliability/performance best practices, upgrades, and networking. WHEN: create AKS cluster, plan AKS configuration, design AKS networking, AKS Automatic vs Standard, AKS security, AKS upgrade strategy, AKS autoscaling, AKS monitoring setup, AKS cost analysis, Day-0 checklist.", + "description": "Plan, create, and configure production-ready Azure Kubernetes Service (AKS) clusters. Covers Day-0 checklist, SKU selection (Automatic vs Standard), networking options (private API server, Azure CNI Overlay, egress configuration), security (workload identity, Azure Policy, Key Vault CSI driver, Deployment Safeguards), and operations (monitoring, upgrade strategy, autoscaling, cost analysis, node pools). 
WHEN: provision AKS cluster, design AKS networking, choose AKS SKU, secure AKS, set up AKS.", "extractedKeywords": [ "aks", "analysis", "automatic", "autoscaling", "azure", - "best", "checklist", + "choose", "cli", "cluster", "clusters", "configuration", + "configure", "container", "cost", "covers", "create", "day-0", - "day-1", - "decisions", "deploy", + "deployment", "design", "diagnostic", + "driver", + "egress", "entra", "function", "identity", @@ -34,21 +36,29 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill descri "monitor", "monitoring", "networking", + "node", "observability", - "performance", + "operations", + "options", + "overlay", "plan", - "practices", + "policy", + "pools", + "private", "production-ready", - "reliability", + "provision", + "safeguards", + "secure", "security", + "selection", + "server", "service", - "setup", - "skus", "standard", "strategy", "upgrade", - "upgrades", + "vault", "when", + "workload", ], "name": "azure-kubernetes", } @@ -61,22 +71,24 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill keywor "automatic", "autoscaling", "azure", - "best", "checklist", + "choose", "cli", "cluster", "clusters", "configuration", + "configure", "container", "cost", "covers", "create", "day-0", - "day-1", - "decisions", "deploy", + "deployment", "design", "diagnostic", + "driver", + "egress", "entra", "function", "identity", @@ -86,20 +98,28 @@ exports[`azure-kubernetes - Trigger Tests Trigger Keywords Snapshot skill keywor "monitor", "monitoring", "networking", + "node", "observability", - "performance", + "operations", + "options", + "overlay", "plan", - "practices", + "policy", + "pools", + "private", "production-ready", - "reliability", + "provision", + "safeguards", + "secure", "security", + "selection", + "server", "service", - "setup", - "skus", "standard", "strategy", "upgrade", - "upgrades", + "vault", "when", + "workload", ] `; diff --git 
a/tests/azure-kubernetes/integration.test.ts b/tests/azure-kubernetes/integration.test.ts index 7f5381d3c..78dda8645 100644 --- a/tests/azure-kubernetes/integration.test.ts +++ b/tests/azure-kubernetes/integration.test.ts @@ -14,7 +14,7 @@ import { doesAssistantMessageIncludeKeyword, shouldSkipIntegrationTests } from "../utils/agent-runner"; -import { isSkillInvoked, softCheckSkill } from "../utils/evaluate"; +import { softCheckSkill } from "../utils/evaluate"; const SKILL_NAME = "azure-kubernetes"; const RUNS_PER_PROMPT = 2; @@ -23,7 +23,7 @@ const skipTests = shouldSkipIntegrationTests(); const describeIntegration = skipTests ? describe.skip : describe; -describeIntegration(`${SKILL_NAME} - Integration Tests`, () => { +describeIntegration(`${SKILL_NAME}_ - Integration Tests`, () => { const agent = useAgentRunner(); test("invokes azure-kubernetes skill for AKS cluster creation prompt", async () => { diff --git a/tests/azure-kubernetes/unit.test.ts b/tests/azure-kubernetes/unit.test.ts index 3345976a5..a02d23b88 100644 --- a/tests/azure-kubernetes/unit.test.ts +++ b/tests/azure-kubernetes/unit.test.ts @@ -30,7 +30,8 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { test("description mentions key AKS concepts", () => { const desc = skill.metadata.description.toLowerCase(); expect(desc).toMatch(/aks|kubernetes/); - expect(desc).toMatch(/cluster/); + // Description should mention core topics covered by the skill + expect(desc).toMatch(/cluster|networking|security|deploy/); }); }); From 244f5dd967117761df9ff38b2c25299a2a259ed8 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 11 Mar 2026 16:16:24 -0700 Subject: [PATCH 34/39] Remove AKS MCP and add in kubectl commands to AKS skill --- plugin/skills/azure-kubernetes/SKILL.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index 02ed456bd..d25ddba68 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ 
b/plugin/skills/azure-kubernetes/SKILL.md @@ -17,9 +17,9 @@ description: "Plan, create, and configure production-ready Azure Kubernetes Serv | Property | Value | |----------|-------| | Best for | AKS cluster planning and Day-0 decisions | -| MCP Tools | `mcp_azure_mcp_aks`, `mcp_aks_mcp_az_aks_operations` | -| CLI | `az aks create`, `az aks show` | -| Related skills | azure-diagnostics (troubleshooting), azure-deploy (app deployment) | +| MCP Tools | `mcp_azure_mcp_aks` | +| CLI | `az aks create`, `az aks show`, `kubectl get`, `kubectl describe` | +| Related skills | azure-diagnostics (troubleshooting AKS), azure-deploy (app deployment) | ## When to Use This Skill Activate this skill when user wants to: From c1b94550e23409fb063228c72aa52a57355a6331 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 11 Mar 2026 16:40:02 -0700 Subject: [PATCH 35/39] Add skip reason + fix typo in integration tests --- tests/azure-kubernetes/integration.test.ts | 14 ++++++-- tests/utils/agent-runner.ts | 40 +++++++++++++++++++++- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/tests/azure-kubernetes/integration.test.ts b/tests/azure-kubernetes/integration.test.ts index 78dda8645..ec9a1f83e 100644 --- a/tests/azure-kubernetes/integration.test.ts +++ b/tests/azure-kubernetes/integration.test.ts @@ -12,7 +12,8 @@ import { useAgentRunner, doesAssistantMessageIncludeKeyword, - shouldSkipIntegrationTests + shouldSkipIntegrationTests, + getIntegrationSkipReason } from "../utils/agent-runner"; import { softCheckSkill } from "../utils/evaluate"; @@ -20,10 +21,19 @@ const SKILL_NAME = "azure-kubernetes"; const RUNS_PER_PROMPT = 2; const skipTests = shouldSkipIntegrationTests(); +const skipReason = getIntegrationSkipReason(); + +if (skipTests && skipReason) { + console.log(`⏭️ Skipping integration tests: ${skipReason}`); +} + +if (isFastLocalMode) { + console.log(`⚡ Fast local mode enabled: RUNS_PER_PROMPT=${RUNS_PER_PROMPT}`); +} const describeIntegration = skipTests ? 
describe.skip : describe; -describeIntegration(`${SKILL_NAME}_ - Integration Tests`, () => { +describeIntegration(`${SKILL_NAME} - Integration Tests`, () => { const agent = useAgentRunner(); test("invokes azure-kubernetes skill for AKS cluster creation prompt", async () => { diff --git a/tests/utils/agent-runner.ts b/tests/utils/agent-runner.ts index 878b11040..6d6860c84 100644 --- a/tests/utils/agent-runner.ts +++ b/tests/utils/agent-runner.ts @@ -16,7 +16,7 @@ import * as fs from "fs"; import * as os from "os"; import * as path from "path"; import { fileURLToPath } from "url"; -import { type CopilotSession, CopilotClient, type SessionEvent, approveAll } from "@github/copilot-sdk"; +import { type CopilotSession, CopilotClient, type SessionEvent, type PermissionHandler } from "@github/copilot-sdk"; import { redactSecrets } from "./redact"; import { listSkills } from "./skill-loader"; @@ -26,6 +26,8 @@ export { getAllAssistantMessages } from "./evaluate"; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); +const approveAll: PermissionHandler = async () => ({ kind: "approved" }); + /** * Resolve the bundled Copilot CLI entry point. 
* @@ -472,6 +474,7 @@ export function useAgentRunner() { async function run(config: AgentRunConfig): Promise { const testWorkspace = fs.mkdtempSync(path.join(os.tmpdir(), "skill-test-")); const FOLLOW_UP_TIMEOUT = 1800000; // 30 minutes + const runStartTime = Date.now(); let isComplete = false; @@ -481,9 +484,16 @@ export function useAgentRunner() { entry.preserveWorkspace = config.preserveWorkspace; try { + if (process.env.DEBUG) { + console.log(`[agent-runner] starting run in ${testWorkspace}`); + } + // Run optional setup if (config.setup) { await config.setup(testWorkspace); + if (process.env.DEBUG) { + console.log(`[agent-runner] setup completed in ${Date.now() - runStartTime}ms`); + } } // Copilot client with yolo mode @@ -517,6 +527,11 @@ export function useAgentRunner() { disabledSkills = skills.filter((skillName) => !config.includeSkills?.includes(skillName)); } + const createSessionStartTime = Date.now(); + if (process.env.DEBUG) { + console.log("[agent-runner] creating Copilot session"); + } + const session = await client.createSession({ model: modelOverride || "claude-sonnet-4.5", onPermissionRequest: approveAll, @@ -534,19 +549,32 @@ export function useAgentRunner() { }); entry.session = session; + if (process.env.DEBUG) { + console.log(`[agent-runner] session created in ${Date.now() - createSessionStartTime}ms`); + } + const agentMetadata: AgentMetadata = { events: [], testComments: [] }; entry.agentMetadata = agentMetadata; + const sendStartTime = Date.now(); + let sawFirstSessionEvent = false; const done = new Promise((resolve) => { session.on(async (event: SessionEvent) => { if (isComplete) return; if (process.env.DEBUG) { + if (!sawFirstSessionEvent) { + sawFirstSessionEvent = true; + console.log(`[agent-runner] first session event after ${Date.now() - sendStartTime}ms: ${event.type}`); + } console.log(`=== session event ${event.type}`); } if (event.type === "session.idle") { isComplete = true; + if (process.env.DEBUG) { + 
console.log(`[agent-runner] session became idle after ${Date.now() - sendStartTime}ms`); + } resolve(); return; } @@ -562,9 +590,19 @@ export function useAgentRunner() { }); }); + if (process.env.DEBUG) { + console.log(`[agent-runner] sending prompt (${config.prompt.length} chars)`); + } await session.send({ prompt: config.prompt }); + if (process.env.DEBUG) { + console.log("[agent-runner] prompt sent; waiting for session.idle"); + } await done; + if (process.env.DEBUG) { + console.log(`[agent-runner] run completed in ${Date.now() - runStartTime}ms`); + } + // Extract token usage from assistant.usage events const tokenUsage: TokenUsage = { inputTokens: 0, From b8d7141d87f280717c34cf80b1ee82675c219328 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 11 Mar 2026 16:44:25 -0700 Subject: [PATCH 36/39] Fixed Copilot comments + bumped up version for azure prepare skill --- plugin/skills/azure-kubernetes/SKILL.md | 4 ++-- plugin/skills/azure-prepare/SKILL.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugin/skills/azure-kubernetes/SKILL.md b/plugin/skills/azure-kubernetes/SKILL.md index d25ddba68..cb8218bd8 100644 --- a/plugin/skills/azure-kubernetes/SKILL.md +++ b/plugin/skills/azure-kubernetes/SKILL.md @@ -54,8 +54,8 @@ If the user is unsure, use safe defaults. ## Workflow ### 1. Cluster Type -- **AKS Automatic** (default): Best for most production workloads, provides a curated experience with pre-configured best practices for security, reliability, and performance. Use unless you have specific custom requirements for networking, autoscaling, or node pool configurations not supported by NAP. -- **AKS Standard**: Use if you need full control over cluster configuration, will require additional overhead to setup and manage. +- **AKS Automatic** (default): Best for most production workloads, provides a curated experience with pre-configured best practices for security, reliability, and performance. 
Use unless you have specific custom requirements for networking, autoscaling, or node pool configurations not supported by Node Auto-Provisioning (NAP). +- **AKS Standard**: Use if you need full control over cluster configuration, will require additional overhead to set up and manage. ### 2. Networking (Pod IP, Egress, Ingress, Dataplane) diff --git a/plugin/skills/azure-prepare/SKILL.md b/plugin/skills/azure-prepare/SKILL.md index 454d56807..4261b1286 100644 --- a/plugin/skills/azure-prepare/SKILL.md +++ b/plugin/skills/azure-prepare/SKILL.md @@ -4,7 +4,7 @@ description: "Prepare Azure apps for deployment (infra Bicep/Terraform, azure.ya license: MIT metadata: author: Microsoft - version: "1.0.6" + version: "1.0.7" --- # Azure Prepare From b3e960b8af12c951b9e52dead9184b60738d13c9 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 11 Mar 2026 16:59:59 -0700 Subject: [PATCH 37/39] Update integration.test.ts --- tests/azure-kubernetes/integration.test.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/azure-kubernetes/integration.test.ts b/tests/azure-kubernetes/integration.test.ts index ec9a1f83e..a7fb8d042 100644 --- a/tests/azure-kubernetes/integration.test.ts +++ b/tests/azure-kubernetes/integration.test.ts @@ -27,10 +27,6 @@ if (skipTests && skipReason) { console.log(`⏭️ Skipping integration tests: ${skipReason}`); } -if (isFastLocalMode) { - console.log(`⚡ Fast local mode enabled: RUNS_PER_PROMPT=${RUNS_PER_PROMPT}`); -} - const describeIntegration = skipTests ? 
describe.skip : describe; describeIntegration(`${SKILL_NAME} - Integration Tests`, () => { From b7ce4d271a2471e554a69b643dbec0f2f63fc6ea Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 11 Mar 2026 17:02:47 -0700 Subject: [PATCH 38/39] Update integration.test.ts --- tests/azure-kubernetes/integration.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/azure-kubernetes/integration.test.ts b/tests/azure-kubernetes/integration.test.ts index a7fb8d042..c55e08cd8 100644 --- a/tests/azure-kubernetes/integration.test.ts +++ b/tests/azure-kubernetes/integration.test.ts @@ -29,7 +29,7 @@ if (skipTests && skipReason) { const describeIntegration = skipTests ? describe.skip : describe; -describeIntegration(`${SKILL_NAME} - Integration Tests`, () => { +describeIntegration(`${SKILL_NAME}_ - Integration Tests`, () => { const agent = useAgentRunner(); test("invokes azure-kubernetes skill for AKS cluster creation prompt", async () => { From fc839ba45f74c96a3a152dcb5a7a707e284ed9e3 Mon Sep 17 00:00:00 2001 From: Julia Yin Date: Wed, 11 Mar 2026 17:06:39 -0700 Subject: [PATCH 39/39] Revert agent-runner.ts changes from c1b9455 (debug timing logs + PermissionHandler swap) --- tests/utils/agent-runner.ts | 60 ++----------------------------------- 1 file changed, 3 insertions(+), 57 deletions(-) diff --git a/tests/utils/agent-runner.ts b/tests/utils/agent-runner.ts index ca1100aca..878b11040 100644 --- a/tests/utils/agent-runner.ts +++ b/tests/utils/agent-runner.ts @@ -16,7 +16,7 @@ import * as fs from "fs"; import * as os from "os"; import * as path from "path"; import { fileURLToPath } from "url"; -import { type CopilotSession, CopilotClient, type SessionEvent, type PermissionHandler } from "@github/copilot-sdk"; +import { type CopilotSession, CopilotClient, type SessionEvent, approveAll } from "@github/copilot-sdk"; import { redactSecrets } from "./redact"; import { listSkills } from "./skill-loader"; @@ -26,8 +26,6 @@ export { getAllAssistantMessages } 
from "./evaluate"; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); -const approveAll: PermissionHandler = async () => ({ kind: "approved" }); - /** * Resolve the bundled Copilot CLI entry point. * @@ -346,23 +344,7 @@ function writeMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMetadat } const markdown = redactSecrets(generateMarkdownReport(config, agentMetadata)); - // Use "wx" flag for atomic create-if-not-exists to prevent race conditions - let reportTargetPath = filePath; - let suffix = 0; - while (true) { - try { - fs.writeFileSync(reportTargetPath, markdown, { encoding: "utf-8", flag: "wx" }); - break; - } catch (err: unknown) { - console.log("File exists", reportTargetPath); - if ((err as { code: string }).code === "EEXIST") { - suffix++; - reportTargetPath = filePath.replace(".md", `-${suffix}.md`); - continue; - } - throw err; - } - } + fs.writeFileSync(filePath, markdown, "utf-8"); // Write structured agent-metadata.json for machine consumption const jsonPath = path.join(dir, "agent-metadata.json"); @@ -375,7 +357,7 @@ function writeMarkdownReport(config: AgentRunConfig, agentMetadata: AgentMetadat fs.writeFileSync(jsonPath, redactSecrets(JSON.stringify(jsonData, null, 2)), "utf-8"); if (process.env.DEBUG) { - console.log(`Markdown report written to: ${reportTargetPath}`); + console.log(`Markdown report written to: ${filePath}`); } // Write token usage JSON alongside the markdown report @@ -490,7 +472,6 @@ export function useAgentRunner() { async function run(config: AgentRunConfig): Promise { const testWorkspace = fs.mkdtempSync(path.join(os.tmpdir(), "skill-test-")); const FOLLOW_UP_TIMEOUT = 1800000; // 30 minutes - const runStartTime = Date.now(); let isComplete = false; @@ -500,16 +481,9 @@ export function useAgentRunner() { entry.preserveWorkspace = config.preserveWorkspace; try { - if (process.env.DEBUG) { - console.log(`[agent-runner] starting run in ${testWorkspace}`); - } - // Run 
optional setup if (config.setup) { await config.setup(testWorkspace); - if (process.env.DEBUG) { - console.log(`[agent-runner] setup completed in ${Date.now() - runStartTime}ms`); - } } // Copilot client with yolo mode @@ -543,11 +517,6 @@ export function useAgentRunner() { disabledSkills = skills.filter((skillName) => !config.includeSkills?.includes(skillName)); } - const createSessionStartTime = Date.now(); - if (process.env.DEBUG) { - console.log("[agent-runner] creating Copilot session"); - } - const session = await client.createSession({ model: modelOverride || "claude-sonnet-4.5", onPermissionRequest: approveAll, @@ -565,32 +534,19 @@ export function useAgentRunner() { }); entry.session = session; - if (process.env.DEBUG) { - console.log(`[agent-runner] session created in ${Date.now() - createSessionStartTime}ms`); - } - const agentMetadata: AgentMetadata = { events: [], testComments: [] }; entry.agentMetadata = agentMetadata; - const sendStartTime = Date.now(); - let sawFirstSessionEvent = false; const done = new Promise((resolve) => { session.on(async (event: SessionEvent) => { if (isComplete) return; if (process.env.DEBUG) { - if (!sawFirstSessionEvent) { - sawFirstSessionEvent = true; - console.log(`[agent-runner] first session event after ${Date.now() - sendStartTime}ms: ${event.type}`); - } console.log(`=== session event ${event.type}`); } if (event.type === "session.idle") { isComplete = true; - if (process.env.DEBUG) { - console.log(`[agent-runner] session became idle after ${Date.now() - sendStartTime}ms`); - } resolve(); return; } @@ -606,19 +562,9 @@ export function useAgentRunner() { }); }); - if (process.env.DEBUG) { - console.log(`[agent-runner] sending prompt (${config.prompt.length} chars)`); - } await session.send({ prompt: config.prompt }); - if (process.env.DEBUG) { - console.log("[agent-runner] prompt sent; waiting for session.idle"); - } await done; - if (process.env.DEBUG) { - console.log(`[agent-runner] run completed in ${Date.now() - 
runStartTime}ms`); - } - // Extract token usage from assistant.usage events const tokenUsage: TokenUsage = { inputTokens: 0,