diff --git a/plugin/skills/microsoft-foundry/SKILL.md b/plugin/skills/microsoft-foundry/SKILL.md index 7c2eec70f..427b6e65d 100644 --- a/plugin/skills/microsoft-foundry/SKILL.md +++ b/plugin/skills/microsoft-foundry/SKILL.md @@ -1,23 +1,27 @@ --- name: microsoft-foundry -description: "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare)." +description: "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. 
USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare)." license: MIT metadata: author: Microsoft - version: "1.0.4" + version: "1.0.5" --- # Microsoft Foundry Skill -> **MANDATORY:** Read this skill and the relevant sub-skill BEFORE calling any Foundry MCP tool. +This skill helps developers work with Microsoft Foundry resources, covering model discovery and deployment, complete dev lifecycle of AI agent, evaluation workflows, and troubleshooting. ## Sub-Skills +> **MANDATORY: Before executing ANY workflow, you MUST read the corresponding sub-skill document.** Do not call MCP tools for a workflow without reading its skill document. This applies even if you already know the MCP tool parameters — the skill document contains required workflow steps, pre-checks, and validation logic that must be followed. This rule applies on every new user message that triggers a different workflow, even if the skill is already loaded. + +This skill includes specialized sub-skills for specific workflows. 
**Use these instead of the main skill when they match your task:** + | Sub-Skill | When to Use | Reference | |-----------|-------------|-----------| | **deploy** | Containerize, build, push to ACR, create/update/start/stop/clone agent deployments | [deploy](foundry-agent/deploy/deploy.md) | | **invoke** | Send messages to an agent, single or multi-turn conversations | [invoke](foundry-agent/invoke/invoke.md) | -| **observe** | Eval-driven optimization loop: evaluate → analyze → optimize → compare → iterate | [observe](foundry-agent/observe/observe.md) | +| **observe** | Evaluate agent quality, run batch evals, analyze failures, optimize prompts, improve agent instructions, compare versions, and set up CI/CD monitoring | [observe](foundry-agent/observe/observe.md) | | **trace** | Query traces, analyze latency/failures, correlate eval results to specific responses via App Insights `customEvents` | [trace](foundry-agent/trace/trace.md) | | **troubleshoot** | View container logs, query telemetry, diagnose failures | [troubleshoot](foundry-agent/troubleshoot/troubleshoot.md) | | **create** | Create new hosted agent applications. Supports Microsoft Agent Framework, LangGraph, or custom frameworks in Python or C#. Downloads starter samples from foundry-samples repo. | [create](foundry-agent/create/create.md) | @@ -28,59 +32,112 @@ metadata: | **quota** | Managing quotas and capacity for Microsoft Foundry resources. Use when checking quota usage, troubleshooting deployment failures due to insufficient quota, requesting quota increases, or planning capacity. | [quota/quota.md](quota/quota.md) | | **rbac** | Managing RBAC permissions, role assignments, managed identities, and service principals for Microsoft Foundry resources. Use for access control, auditing permissions, and CI/CD setup. 
| [rbac/rbac.md](rbac/rbac.md) | -Onboarding flow: `project/create` → `deploy` → `invoke` +> 💡 **Tip:** For a complete onboarding flow: `project/create` → agent workflows (`deploy` → `invoke`). -## Agent Lifecycle +> 💡 **Model Deployment:** Use `models/deploy-model` for all deployment scenarios — it intelligently routes between quick preset deployment, customized deployment with full control, and capacity discovery across regions. -| Intent | Workflow | -|--------|----------| -| New agent from scratch | create → deploy → invoke | -| Deploy existing code | deploy → invoke | -| Test/chat with agent | invoke | -| Troubleshoot | invoke → troubleshoot | -| Fix + redeploy | troubleshoot → fix → deploy → invoke | +> 💡 **Prompt Optimization:** For requests like "optimize my prompt" or "improve my agent instructions," load [observe](foundry-agent/observe/observe.md) and use the `prompt_optimize` MCP tool through that eval-driven workflow. -## Project Context Resolution +## Agent Development Lifecycle -Resolve only missing values. Extract from user message first, then azd, then ask. +Match user intent to the correct workflow. Read each sub-skill in order before executing. -1. Check for `azure.yaml`; if found, run `azd env get-values` -2. 
Map azd variables: +| User Intent | Workflow (read in order) | +|-------------|------------------------| +| Create a new agent from scratch | [create](foundry-agent/create/create.md) → [deploy](foundry-agent/deploy/deploy.md) → [invoke](foundry-agent/invoke/invoke.md) | +| Deploy an agent (code already exists) | deploy → invoke | +| Update/redeploy an agent after code changes | deploy → invoke | +| Invoke/test/chat with an agent | invoke | +| Optimize / improve agent prompt or instructions | observe (Step 4: Optimize) | +| Evaluate and optimize agent (full loop) | observe | +| Troubleshoot an agent issue | invoke → troubleshoot | +| Fix a broken agent (troubleshoot + redeploy) | invoke → troubleshoot → apply fixes → deploy → invoke | +| Start/stop agent container | deploy | -| azd Variable | Resolves To | -|-------------|-------------| -| `AZURE_AI_PROJECT_ENDPOINT` / `AZURE_AIPROJECT_ENDPOINT` | Project endpoint | -| `AZURE_CONTAINER_REGISTRY_NAME` / `AZURE_CONTAINER_REGISTRY_ENDPOINT` | ACR registry | -| `AZURE_SUBSCRIPTION_ID` | Subscription | +## Agent: .foundry Workspace Standard -3. Ask user only for unresolved values (project endpoint, agent name) +Every agent source folder should keep Foundry-specific state under `.foundry/`: -## Validation +```text +/ + .foundry/ + agent-metadata.yaml + datasets/ + evaluators/ + results/ +``` -After each workflow step, validate before proceeding: -1. Run the operation -2. Check output for errors or unexpected results -3. If failed → diagnose using troubleshoot sub-skill → fix → retry -4. Only proceed to next step when validation passes +- `agent-metadata.yaml` is the required source of truth for environment-specific project settings, agent names, registry details, and evaluation test cases. +- `datasets/` and `evaluators/` are local cache folders. Reuse them when they are current, and ask before refreshing or overwriting them. 
+- See [Agent Metadata Contract](references/agent-metadata-contract.md) for the canonical schema and workflow rules. -## Agent Types +## Agent: Setup References -| Type | Kind | Description | -|------|------|-------------| -| **Prompt** | `"prompt"` | LLM-based, backed by model deployment | -| **Hosted** | `"hosted"` | Container-based, running custom code | +- [Standard Agent Setup](references/standard-agent-setup.md) - Standard capability-host setup with customer-managed data, search, and AI Services resources. +- [Private Network Standard Agent Setup](references/private-network-standard-agent-setup.md) - Standard setup with VNet isolation and private endpoints. + +## Agent: Project Context Resolution + +Agent skills should run this step **only when they need configuration values they don't already have**. If a value (for example, agent root, environment, project endpoint, or agent name) is already known from the user's message or a previous skill in the same session, skip resolution for that value. + +### Step 1: Discover Agent Roots + +Search the workspace for `.foundry/agent-metadata.yaml`. + +- **One match** → use that agent root. +- **Multiple matches** → require the user to choose the target agent folder. +- **No matches** → for create/deploy workflows, seed a new `.foundry/` folder during setup; for all other workflows, stop and ask the user which agent source folder to initialize. + +### Step 2: Resolve Environment + +Read `.foundry/agent-metadata.yaml` and resolve the environment in this order: +1. Environment explicitly named by the user +2. Environment already selected earlier in the session +3. `defaultEnvironment` from metadata + +If the metadata contains multiple environments and none of the rules above selects one, prompt the user to choose. Keep the selected agent root and environment visible in every workflow summary. 
-## Agent: Setup Types +### Step 3: Resolve Common Configuration -| Setup | Capability Host | Description | -|-------|----------------|-------------| -| **Basic** | None | Default. All resources Microsoft-managed. | -| **Standard** | Azure AI Services | Bring-your-own storage and search (public network). See [standard-agent-setup](references/standard-agent-setup.md). | -| **Standard + Private Network** | Azure AI Services | Standard setup with VNet isolation and private endpoints. See [private-network-standard-agent-setup](references/private-network-standard-agent-setup.md). | +Use the selected environment in `agent-metadata.yaml` as the primary source: + +| Metadata Field | Resolves To | Used By | +|----------------|-------------|---------| +| `environments.<env>.projectEndpoint` | Project endpoint | deploy, invoke, observe, trace, troubleshoot | +| `environments.<env>.agentName` | Agent name | invoke, observe, trace, troubleshoot | +| `environments.<env>.azureContainerRegistry` | ACR registry name / image URL prefix | deploy | +| `environments.<env>.testCases[]` | Dataset + evaluator + threshold bundles | observe, eval-datasets | + +### Step 4: Bootstrap Missing Metadata (Create/Deploy Only) + +If create/deploy is initializing a new `.foundry` workspace and metadata fields are still missing, check if `azure.yaml` exists in the project root. If found, run `azd env get-values` and use it to seed `agent-metadata.yaml` before continuing. + +| azd Variable | Seeds | +|-------------|-------| +| `AZURE_AI_PROJECT_ENDPOINT` or `AZURE_AIPROJECT_ENDPOINT` | `environments.<env>.projectEndpoint` | +| `AZURE_CONTAINER_REGISTRY_NAME` or `AZURE_CONTAINER_REGISTRY_ENDPOINT` | `environments.<env>.azureContainerRegistry` | +| `AZURE_SUBSCRIPTION_ID` | Azure subscription for trace/troubleshoot lookups | + +### Step 5: Collect Missing Values + +Use the `ask_user` or `askQuestions` tool **only for values not resolved** from the user's message, session context, metadata, or azd bootstrap. 
Common values skills may need: +- **Agent root** — Target folder containing `.foundry/agent-metadata.yaml` +- **Environment** — `dev`, `prod`, or another environment key from metadata +- **Project endpoint** — AI Foundry project endpoint URL +- **Agent name** — Name of the target agent + +> 💡 **Tip:** If the user already provides the agent path, environment, project endpoint, or agent name, extract it directly — do not ask again. + +## Agent: Agent Types + +All agent skills support two agent types: + +| Type | Kind | Description | +|------|------|-------------| +| **Prompt** | `"prompt"` | LLM-based agents backed by a model deployment | +| **Hosted** | `"hosted"` | Container-based agents running custom code | -> **MANDATORY:** For standard setup, read the appropriate reference before proceeding: -> - **Public network:** [references/standard-agent-setup.md](references/standard-agent-setup.md) -> - **Private network (VNet isolation):** [references/private-network-standard-agent-setup.md](references/private-network-standard-agent-setup.md) +Use `agent_get` MCP tool to determine an agent's type when needed. 
## Tool Usage Conventions @@ -89,13 +146,12 @@ After each workflow step, validate before proceeding: - Prefer Azure MCP tools over direct CLI commands when available - Reference official Microsoft documentation URLs instead of embedding CLI command syntax -## References +## Additional Resources -- [Hosted Agents](https://learn.microsoft.com/azure/ai-foundry/agents/concepts/hosted-agents?view=foundry) -- [Runtime Components](https://learn.microsoft.com/azure/ai-foundry/agents/concepts/runtime-components?view=foundry) +- [Foundry Hosted Agents](https://learn.microsoft.com/azure/ai-foundry/agents/concepts/hosted-agents?view=foundry) +- [Foundry Agent Runtime Components](https://learn.microsoft.com/azure/ai-foundry/agents/concepts/runtime-components?view=foundry) - [Foundry Samples](https://github.com/azure-ai-foundry/foundry-samples) -- [Python SDK](references/sdk/foundry-sdk-py.md) -## Dependencies +## SDK Quick Reference -Scripts in sub-skills require: Azure CLI (`az`) ≥2.0, `jq` (for shell scripts). Install via `pip install azure-ai-projects azure-identity` for Python SDK usage. 
\ No newline at end of file +- [Python](references/sdk/foundry-sdk-py.md) diff --git a/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-azure-ai-search.md b/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-azure-ai-search.md index 9859e81c5..213ec14ed 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-azure-ai-search.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-azure-ai-search.md @@ -41,7 +41,7 @@ A project connection between your Foundry project and the Azure AI Search resour | Parameter | Required | Description | |-----------|----------|-------------| -| `project_connection_id` | Yes | Connection ID (resolve via `foundry_connections_get`) | +| `project_connection_id` | Yes | Connection ID (resolve via `project_connection_get`, typically after discovering the connection with `project_connection_list`) | | `index_name` | Yes | Search index name | | `top_k` | No | Number of results (default: 5) | | `query_type` | No | Search type (default: `vector_semantic_hybrid`) | diff --git a/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-bing-grounding.md b/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-bing-grounding.md index 9d466cd20..3eae452dd 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-bing-grounding.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/create/references/tool-bing-grounding.md @@ -36,7 +36,7 @@ Access real-time web information via Bing Search. 
Unlike the [Web Search tool](t | Issue | Cause | Resolution | |-------|-------|------------| -| Connection not found | Name mismatch or wrong project | Use `foundry_connections_list` to find correct name | +| Connection not found | Name mismatch or wrong project | Use `project_connection_list` to find the correct `connectionName` | | Unauthorized creating connection | Missing Azure AI Project Manager role | Assign role on the Foundry project | | Bing resource creation fails | Provider not registered | Run `az provider register --namespace 'Microsoft.Bing'` | | No results returned | Connection misconfigured | Verify Bing resource key and connection setup | diff --git a/plugin/skills/microsoft-foundry/foundry-agent/deploy/deploy.md b/plugin/skills/microsoft-foundry/foundry-agent/deploy/deploy.md index 2a2a7891d..c216dca3d 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/deploy/deploy.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/deploy/deploy.md @@ -230,20 +230,20 @@ python -c "import base64,uuid;print(base64.urlsafe_b64encode(uuid.UUID('/.foundry/agent-metadata.yaml` under the selected environment so future conversations (evaluation, trace analysis, monitoring) can reuse it automatically. See [Agent Metadata Contract](../../references/agent-metadata-contract.md) for the canonical schema. 
-| Variable | Purpose | Example | -|----------|---------|---------| -| `AZURE_AI_PROJECT_ENDPOINT` | Foundry project endpoint | `https://.services.ai.azure.com/api/projects/` | -| `AZURE_AI_AGENT_NAME` | Deployed agent name | `my-support-agent` | -| `AZURE_AI_AGENT_VERSION` | Current agent version | `1` | -| `AZURE_CONTAINER_REGISTRY` | ACR resource (hosted agents) | `myregistry.azurecr.io` | +| Metadata Field | Purpose | Example | +|----------------|---------|---------| +| `environments..projectEndpoint` | Foundry project endpoint | `https://.services.ai.azure.com/api/projects/` | +| `environments..agentName` | Deployed agent name | `my-support-agent` | +| `environments..azureContainerRegistry` | ACR resource (hosted agents) | `myregistry.azurecr.io` | +| `environments..testCases[]` | Evaluation bundles for datasets, evaluators, and thresholds | `smoke-core`, `trace-regressions` | -If a `.env` file already exists, read it first and merge — do not overwrite existing values without confirmation. +If `agent-metadata.yaml` already exists, merge the selected environment instead of overwriting other environments or cached test cases without confirmation. ## After Deployment — Auto-Create Evaluators & Dataset -> ⚠️ **This step is automatic.** After a successful deployment, immediately prepare for evaluation without waiting for the user to request it. This matches the eval-driven optimization loop. +> ⚠️ **This step is automatic.** After a successful deployment, immediately prepare the selected `.foundry` environment for evaluation without waiting for the user to request it. This matches the eval-driven optimization loop. ### 1. Read Agent Instructions @@ -258,30 +258,43 @@ Use **`agent_get`** (or local `agent.yaml`) to understand the agent's purpose an ### 3. Identify LLM-Judge Deployment -Use **`model_deployment_get`** to find a suitable model (e.g., `gpt-4o`) for quality evaluators. 
+Use **`model_deployment_get`** to list the selected project's actual model deployments, then choose one that supports chat completions for quality evaluators. Do **not** assume `gpt-4o` exists in the project. If no deployment supports chat completions, stop the auto-setup flow and tell the user quality evaluators cannot run until a compatible judge deployment is available. -### 4. Generate Local Test Dataset +### 4. Reuse or Refresh Local Cache -Use the identified LLM deployment to generate realistic test queries based on the agent's instructions and tool capabilities. Save to `datasets/<agent>-test.jsonl` with each line containing at minimum a `query` field (optionally `context`, `ground_truth`). +Inspect the selected agent root before generating anything new: -> ⚠️ **Prefer local dataset generation.** Generate test queries locally and save to `datasets/*.jsonl` rather than using `generateSyntheticData=true` on the eval API. Local datasets provide reproducibility, version control, and can be reviewed before running evals. +- Reuse `.foundry/evaluators/` and `.foundry/datasets/` when they already contain the right assets for the selected environment. +- Ask before refreshing cached files or replacing thresholds. +- If cache is missing or stale, regenerate the dataset/evaluators and update metadata for the active environment only. -### 5. Persist Artifacts +### 5. Generate Local Test Dataset -Save evaluator definitions to `evaluators/<evaluator>.yaml` and any locally generated test datasets to `datasets/*.jsonl`: +Use the identified chat-capable deployment to generate realistic test queries based on the agent's instructions and tool capabilities. Save to `.foundry/datasets/<agent>-<env>-test-v1.jsonl` with each line containing at minimum a `query` field (optionally `context`, `ground_truth`). +> ⚠️ **Prefer local dataset generation.** Generate test queries locally and save to `.foundry/datasets/*.jsonl` rather than using `generateSyntheticData=true` on the eval API. 
Local datasets provide reproducibility, version control, and can be reviewed before running evals. + +### 6. Persist Artifacts and Test Cases + +Save evaluator definitions, local datasets, and evaluation outputs under `.foundry/`, then register or update test cases in `agent-metadata.yaml` for the selected environment: + +```text +.foundry/ + agent-metadata.yaml + evaluators/ + .yaml + datasets/ + --test-v1.jsonl + results/ ``` -evaluators/ # custom evaluator definitions - .yaml # prompt text, scoring type, thresholds -datasets/ # locally generated input datasets - *.jsonl # test queries -``` -### 6. Prompt User +Each test case should bundle one dataset with the evaluator list, thresholds, and a priority tag (`P0`, `P1`, or `P2`). For simplicity, seed exactly one `P0` smoke test case after deployment. + +### 7. Prompt User -*"Your agent is deployed and running. Evaluators and a test dataset have been auto-configured. Would you like to run an evaluation to identify optimization opportunities?"* +*"Your agent is deployed and running in the selected environment. The `.foundry` cache now contains evaluators, a local test dataset, and test-case metadata. Would you like to run an evaluation to identify optimization opportunities?"* -- **Yes** → follow the [observe skill](../observe/observe.md) starting at **Step 2 (Evaluate)** — evaluators and dataset are already prepared. +- **Yes** → follow the [observe skill](../observe/observe.md) starting at **Step 2 (Evaluate)** — cache and metadata are already prepared. - **No** → stop. The user can return later. - **Production trace analysis** → follow the [trace skill](../trace/trace.md) to search conversations, diagnose failures, and analyze latency using App Insights. 
diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md index ab62846be..d6a18917c 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md @@ -1,25 +1,25 @@ # Evaluation Datasets — Trace-to-Dataset Pipeline & Lifecycle Management -Manage the full lifecycle of evaluation datasets for Foundry agents — from harvesting production traces into test datasets, through versioning and organization, to evaluation trending and regression detection. This skill closes the gap between **production observability** and **evaluation quality** by turning real-world agent failures into reproducible test cases. +Manage the full lifecycle of evaluation datasets for Foundry agents: harvesting production traces into local `.foundry` cache, curating versioned test datasets, tracking evaluation quality over time, and syncing approved updates back to Foundry when needed. ## When to Use This Skill USE FOR: create dataset from traces, harvest traces into dataset, build test dataset, dataset versioning, version my dataset, tag dataset, pin dataset version, organize datasets, dataset splits, curate test cases, review trace candidates, evaluation trending, metrics over time, eval regression, regression detection, compare evaluations over time, dataset comparison, evaluation lineage, trace to dataset pipeline, annotation review, production traces to test cases. -> ⚠️ **DO NOT manually run** KQL queries to extract datasets or call `evaluation_dataset_create` **without reading this skill first.** This skill defines the correct trace extraction patterns, schema transformation, versioning conventions, and quality gates that raw tools do not enforce. 
+> ⚠️ **DO NOT manually run** KQL queries to extract datasets or call `evaluation_dataset_create` **without reading this skill first.** This skill defines the correct trace extraction patterns, schema transformation, cache rules, versioning conventions, and quality gates that raw tools do not enforce. -> 💡 **Tip:** This skill complements the [observe skill](../observe/observe.md) (eval-driven optimization loop) and the [trace skill](../trace/trace.md) (production trace analysis). Use this skill when you need to **bridge traces and evaluations** — turning production data into test cases and tracking evaluation quality over time. +> 💡 **Tip:** This skill complements the [observe skill](../observe/observe.md) (eval-driven optimization loop) and the [trace skill](../trace/trace.md) (production trace analysis). Use this skill when you need to bridge traces and evaluations: turning production data into test cases and tracking evaluation quality over time. ## Quick Reference | Property | Value | |----------|-------| | MCP server | `foundry-mcp` | -| Key MCP tools | `evaluation_dataset_get`, `evaluation_get`, `evaluation_comparison_create`, `evaluation_comparison_get` | -| Azure services | Application Insights (via `monitor_resource_log_query`) | -| ⚠️ Not available | `evaluation_dataset_create` (dataset upload MCP not ready — use local JSONL + `inputData`) | -| Prerequisites | Agent deployed, App Insights connected (see [trace skill](../trace/trace.md)) | -| Artifact paths | `datasets/`, `results/`, `evaluators/` | +| Key MCP tools | `evaluation_dataset_create`, `evaluation_dataset_get`, `evaluation_dataset_versions_get`, `evaluation_get`, `evaluation_comparison_create`, `evaluation_comparison_get` | +| Storage tools | `project_connection_list` (discover AzureBlob connection), `project_connection_create` (add storage connection) | +| Azure services | Application Insights (via `monitor_resource_log_query`), Azure Blob Storage (dataset sync) | +| Prerequisites | Agent 
deployed, `.foundry/agent-metadata.yaml` available, App Insights connected | +| Local cache | `.foundry/datasets/`, `.foundry/results/`, `.foundry/evaluators/` | ## Entry Points @@ -32,29 +32,29 @@ USE FOR: create dataset from traces, harvest traces into dataset, build test dat | "Show eval metrics over time" / "Evaluation trending" | [Eval Trending](references/eval-trending.md) | | "Did my agent regress?" / "Regression detection" | [Eval Regression](references/eval-regression.md) | | "Compare datasets" / "Experiment comparison" / "A/B test" | [Dataset Comparison](references/dataset-comparison.md) | +| "Sync dataset to Foundry" / "Refresh local dataset cache" | [Trace-to-Dataset Pipeline -> Step 5](references/trace-to-dataset.md#step-5--sync-local-cache-with-foundry-optional) | | "Trace my evaluation lineage" / "Audit eval history" | [Eval Lineage](references/eval-lineage.md) | ## Before Starting — Detect Current State -1. Check `.env` for `AZURE_AI_PROJECT_ENDPOINT`, `AZURE_AI_AGENT_NAME`, and `APPLICATIONINSIGHTS_CONNECTION_STRING` -2. If App Insights is missing, resolve via [trace skill](../trace/trace.md) (Before Starting section) -3. Check `datasets/` for existing datasets and `results/` for evaluation history -4. Check if `evaluation_dataset_get` returns any server-side datasets -5. Route to the appropriate entry point based on user intent +1. Resolve the target agent root and environment from `.foundry/agent-metadata.yaml`. +2. Confirm the selected environment's `projectEndpoint`, `agentName`, and observability settings. +3. Check `.foundry/datasets/` for existing datasets, `.foundry/results/` for evaluation history, and `.foundry/datasets/manifest.json` for lineage. +4. Check whether `evaluation_dataset_get` returns server-side datasets for the same environment. +5. Route to the appropriate entry point based on user intent. 
## The Foundry Flywheel -This skill enables a closed-loop improvement cycle where production failures become regression tests: - -``` -Production Agent → [1] Trace (App Insights + OTel) - → [2] Harvest (KQL extraction) - → [3] Curate (human review) - → [4] Dataset (versioned, tagged) - → [5] Evaluate (batch eval) - → [6] Analyze (trending + regression) - → [7] Compare (version diff) - → [8] Deploy → back to [1] +```text +Production Agent -> [1] Trace (App Insights + OTel) + -> [2] Harvest (KQL extraction) + -> [3] Curate (human review) + -> [4] Dataset Cache (.foundry/datasets, versioned) + -> [5] Sync to Foundry (optional refresh/push) + -> [6] Evaluate (batch eval) + -> [7] Analyze (trending + regression) + -> [8] Compare (agent versions OR dataset versions) + -> [9] Deploy -> back to [1] ``` Each cycle makes the test suite harder and more representative. Production failures from release N become regression tests for release N+1. @@ -62,13 +62,16 @@ Each cycle makes the test suite harder and more representative. Production failu ## Behavioral Rules 1. **Always show KQL queries.** Before executing any trace extraction query, display it in a code block. Never run queries silently. -2. **Scope to time ranges.** Always include a time range in KQL queries (default: last 7 days for trace harvesting). Ask user for the range if not specified. +2. **Scope to time ranges.** Always include a time range in KQL queries (default: last 7 days for trace harvesting). Ask the user for the range if not specified. 3. **Require human review.** Never auto-commit harvested traces to a dataset without showing candidates to the user first. The curation step is mandatory. -4. **Use versioning conventions.** Follow the naming pattern `--v` (e.g., `support-bot-traces-v3`). -5. **Persist artifacts.** Save datasets to `datasets/`, evaluation results to `results/`, and track lineage in `datasets/manifest.json`. -6. 
**Confirm before overwriting.** If a dataset version already exists, warn the user and ask for confirmation before replacing. -7. **Never upload datasets to cloud storage.** Do not use blob upload, SAS URLs, or `evaluation_dataset_create`. Always persist datasets locally and reference them via `inputData` when running evaluations. -8. **Never remove dataset rows or weaken evaluators to recover scores.** Score drops after a dataset update are expected — harder tests expose real gaps. Optimize the agent for new failure patterns; do not shrink the test suite. +4. **Use versioning conventions.** Follow the naming pattern `<agent>-<env>-<purpose>-v<N>` (for example, `support-bot-prod-traces-v3`). +5. **Treat local files as cache.** Reuse `.foundry/datasets/` and `.foundry/evaluators/` when they already match the selected environment. Offer refresh when the user asks or when remote state has changed. +6. **Persist artifacts.** Save datasets to `.foundry/datasets/`, evaluation results to `.foundry/results/`, and track lineage in `.foundry/datasets/manifest.json`. +7. **Keep test cases aligned.** Update the selected environment's `testCases[]` in `agent-metadata.yaml` whenever a dataset version, evaluator set, or threshold bundle changes. +8. **Confirm before overwriting.** If a dataset version or cache file already exists, warn the user and ask for confirmation before replacing or refreshing it. +9. **Sync to Foundry when requested or needed.** After saving datasets locally, refresh or register them in Foundry only when the user asks or the workflow needs shared/CI usage. +10. **Never remove dataset rows or weaken evaluators to recover scores.** Score drops after a dataset update are expected — harder tests expose real gaps. Optimize the agent for new failure patterns; do not shrink the test suite. +11. **Match eval parameter names exactly.** Use `evaluationId` when creating grouped runs, but use `evalId` for `evaluation_get` and comparison/trending lookups. 
## Related Skills @@ -76,6 +79,7 @@ Each cycle makes the test suite harder and more representative. Production failu |-------------|-------| | "Run an evaluation" / "Optimize my agent" | [observe skill](../observe/observe.md) | | "Search traces" / "Analyze failures" / "Latency analysis" | [trace skill](../trace/trace.md) | -| "Find eval scores for a response ID" / "Link eval results to traces" | [trace skill → Eval Correlation](../trace/references/eval-correlation.md) (in `foundry-agent/trace/references/`) | +| "Find eval scores for a response ID" / "Link eval results to traces" | [trace skill -> Eval Correlation](../trace/references/eval-correlation.md) | | "Deploy my agent" | [deploy skill](../deploy/deploy.md) | | "Debug container issues" | [troubleshoot skill](../troubleshoot/troubleshoot.md) | +| "Review metadata schema" | [Agent Metadata Contract](../../references/agent-metadata-contract.md) | diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-comparison.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-comparison.md index ed5feca61..4875ff4b0 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-comparison.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-comparison.md @@ -1,33 +1,37 @@ -# Dataset Comparison — Experiment Framework & A/B Testing +# Dataset Comparison — A/B Testing Across Dataset Versions -Run structured experiments that compare agent versions against the same dataset, and present results as leaderboards with per-evaluator breakdowns. +Run structured experiments that compare how an agent performs across different dataset versions, and present results as leaderboards with per-evaluator breakdowns. Use this to answer: "Did scores drop because of harder tests or agent regression?" ## Experiment Structure An experiment consists of: -1. **One pinned dataset version** — ensures fair comparison -2. 
**Multiple agent versions** — the variables being compared -3. **Same evaluators** — applied consistently across all versions -4. **Comparison results** — which version wins on each metric +1. **Pinned agent version** — the same agent evaluated on each dataset +2. **Varied dataset versions** — the versions being compared +3. **Same evaluators** — applied consistently across all runs +4. **Comparison results** — which dataset version the agent performs better on ## Step 1 — Define the Experiment | Parameter | Value | Example | |-----------|-------|---------| -| Dataset | Pinned version from `datasets/manifest.json` | `support-bot-traces-v3` (tag: `prod`) | -| Baseline | Agent version to compare against | `v2` | -| Treatment(s) | Agent version(s) to evaluate | `v3`, `v4` | +| Agent | Pinned agent version | `v3` | +| Baseline dataset | Previous dataset version | `support-bot-prod-traces-v2` | +| Treatment dataset(s) | New dataset version(s) | `support-bot-prod-traces-v3` | | Evaluators | Same set for all runs | coherence, fluency, relevance, intent_resolution, task_adherence | ## Step 2 — Run Evaluations -For each agent version, run **`evaluation_agent_batch_eval_create`** with: +For each dataset version, run **`evaluation_agent_batch_eval_create`** with: - Same `evaluationId` (groups all runs for comparison) -- Same `inputData` (from the pinned dataset) +- Same `agentVersion` - Same `evaluatorNames` -- Different `agentVersion` +- Different `inputData` (from each dataset version) -> **Important:** Use `evaluationId` (NOT `evalId`) to group runs. All versions must be in the same evaluation group for comparison to work. +> **Important:** Use `evaluationId` on `evaluation_agent_batch_eval_create` to group runs. After the runs exist, switch to `evalId` for `evaluation_get` and `evaluation_comparison_create`. + +> ⚠️ **Eval-group immutability:** Keep the evaluator set and thresholds fixed within one evaluation group. 
If you need to change evaluators or thresholds, create a new evaluation group instead of reusing the previous `evaluationId`. + +> ⚠️ **Score drops are expected.** When comparing v1→v2 datasets, lower scores on the new dataset likely mean the new test cases are harder (better coverage), not that the agent regressed. **Do NOT remove dataset rows or weaken evaluators to recover scores.** Instead, optimize the agent for the new failure patterns, then re-evaluate. ## Step 3 — Compare Results @@ -36,46 +40,47 @@ Use **`evaluation_comparison_create`** with the baseline and treatment runs: ```json { "insightRequest": { - "displayName": "Experiment: v2 vs v3 vs v4 on traces-v3", + "displayName": "Dataset comparison: traces-v2 vs traces-v3 on agent-v3", "state": "NotStarted", "request": { "type": "EvaluationComparison", "evalId": "<evaluation-group-id>", - "baselineRunId": "<v2-run-id>", - "treatmentRunIds": ["<v3-run-id>", "<v4-run-id>"] + "baselineRunId": "<traces-v2-run-id>", + "treatmentRunIds": ["<traces-v3-run-id>"] } } } ``` +> ⚠️ **Common mistake:** `evaluation_comparison_create` uses `insightRequest.request.evalId`, not `evaluationId`, even when the runs were originally grouped with `evaluationId`.
+ ## Step 4 — Leaderboard Present results as a leaderboard table: -| Evaluator | v2 (baseline) | v3 | v4 | Best | -|-----------|:---:|:---:|:---:|:---:| -| Coherence | 3.5 | 4.1 | 4.0 | ✅ v3 | -| Fluency | 4.2 | 4.4 | 4.5 | ✅ v4 | -| Relevance | 3.0 | 3.8 | 3.6 | ✅ v3 | -| Intent Resolution | 3.3 | 4.0 | 4.1 | ✅ v4 | -| Task Adherence | 2.8 | 3.5 | 3.9 | ✅ v4 | -| **Wins** | **0** | **2** | **3** | — | +| Evaluator | traces-v2 (baseline) | traces-v3 | Effect | +|-----------|:---:|:---:|:---:| +| Coherence | 4.0 | 3.6 | ⚠️ Lower | +| Fluency | 4.5 | 4.3 | ⚠️ Lower | +| Relevance | 3.6 | 3.2 | ⚠️ Lower | +| Intent Resolution | 4.1 | 3.7 | ⚠️ Lower | +| Task Adherence | 3.9 | 3.4 | ⚠️ Lower | ### Recommendation -Based on the comparison: +If scores drop uniformly across all evaluators, the new dataset is likely harder: -*"v4 wins on 3/5 evaluators (Fluency, Intent Resolution, Task Adherence). v3 wins on 2/5 (Coherence, Relevance). Recommend deploying v4 with additional prompt tuning to recover Relevance."* +*"Agent v3 scores dropped on traces-v3 across all evaluators. traces-v3 added 15 edge-case queries from production failures. 
This is expected — optimize the agent for the new failure patterns rather than reverting the dataset."* ## Pairwise A/B Comparison -For detailed pairwise analysis between exactly two versions: +For detailed pairwise analysis between exactly two dataset versions: -| Evaluator | Baseline (v2) | Treatment (v3) | Delta | p-value | Effect | +| Evaluator | Baseline (traces-v2) | Treatment (traces-v3) | Delta | p-value | Effect | |-----------|:---:|:---:|:---:|:---:|:---:| -| Coherence | 3.5 ± 0.8 | 4.1 ± 0.6 | +0.6 | 0.02 | Improved | -| Fluency | 4.2 ± 0.5 | 4.4 ± 0.4 | +0.2 | 0.15 | Inconclusive | -| Relevance | 3.0 ± 1.1 | 3.8 ± 0.9 | +0.8 | 0.01 | Improved | +| Coherence | 4.0 ± 0.6 | 3.6 ± 0.9 | −0.4 | 0.03 | Degraded | +| Fluency | 4.5 ± 0.4 | 4.3 ± 0.5 | −0.2 | 0.12 | Inconclusive | +| Relevance | 3.6 ± 0.9 | 3.2 ± 1.1 | −0.4 | 0.04 | Degraded | > 💡 **Tip:** The `evaluation_comparison_create` result includes `pValue` and `treatmentEffect` fields. Use `pValue < 0.05` as the threshold for statistical significance. @@ -89,7 +94,7 @@ Compare how the same agent version performs across different datasets: | synthetic-v2 | 4.3 | 4.6 | 4.1 | May overestimate quality | | manual-v1 (curated) | 3.8 | 4.4 | 3.2 | Hardest test cases | -> ⚠️ **Warning:** Be cautious comparing scores across different datasets. Differences may reflect dataset difficulty, not agent quality. Always compare agent versions on the same dataset. +> ⚠️ **Warning:** Be cautious comparing scores across datasets with different structures (e.g., production traces vs synthetic). Differences may reflect dataset difficulty, not agent quality. 
## Next Steps diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md index da1d76d2e..43bddb104 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md @@ -22,7 +22,7 @@ Raw Traces (from KQL harvest) After running a [trace harvest](trace-to-dataset.md), save candidates with a `status` field: ``` -datasets/-candidates-.jsonl +.foundry/datasets/--candidates-.jsonl ``` Each line includes a review status: @@ -65,11 +65,11 @@ For each candidate, the user can: After review, filter approved candidates and save to a versioned dataset: -1. Read `datasets/manifest.json` to find the latest version number +1. Read `.foundry/datasets/manifest.json` to find the latest version number 2. Filter candidates where `status == "approved"` 3. Remove the `status` field from the output -4. Save to `datasets/--v.jsonl` -5. Update `datasets/manifest.json` with metadata +4. Save to `.foundry/datasets/---v.jsonl` +5. 
Update `.foundry/datasets/manifest.json` with metadata ### Update Candidate Status diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-organization.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-organization.md index 1e6275213..59dfda5ff 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-organization.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-organization.md @@ -51,7 +51,7 @@ Run evaluations on specific subsets of a dataset by filtering JSONL before passi import json # Read full dataset -with open("datasets/support-bot-traces-v3.jsonl") as f: +with open(".foundry/datasets/support-bot-prod-traces-v3.jsonl") as f: examples = [json.loads(line) for line in f] # Filter to test split only diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md index f0fa5f4e9..9fac83bd7 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md @@ -4,22 +4,23 @@ Manage dataset versions with naming conventions, tagging, and version pinning fo ## Naming Convention -Use the pattern `<agent>-<source>-v<N>`: +Use the pattern `<agent>-<env>-<source>-v<N>`: | Component | Values | Example | |-----------|--------|---------| -| `<agent>` | Agent name from `.env` | `support-bot` | +| `<agent>` | Agent name from `agent-metadata.yaml` | `support-bot` | +| `<env>` | Selected environment key | `prod` | | `<source>` | `traces`, `synthetic`, `manual`, `combined` | `traces` | | `v<N>` | Incremental version number | `v3` | **Full examples:** -- `support-bot-traces-v1` — first dataset from trace harvesting -- `support-bot-synthetic-v2` — second synthetic dataset -- `support-bot-combined-v5` — fifth dataset combining traces + manual
examples +- `support-bot-prod-traces-v1` — first production dataset from trace harvesting +- `support-bot-dev-synthetic-v2` — second synthetic dataset +- `support-bot-prod-combined-v5` — fifth production dataset combining traces + manual examples ## Tagging Conventions -Tags are stored in `datasets/manifest.json` alongside dataset metadata: +Tags are stored in `.foundry/datasets/manifest.json` alongside dataset metadata: | Tag | Meaning | When to Apply | |-----|---------|---------------| @@ -38,25 +39,37 @@ Pin evaluations to a specific dataset version to ensure reproducible, comparable When using local JSONL files, reference the exact filename in evaluation runs: ``` -datasets/support-bot-traces-v3.jsonl ← pinned by filename +.foundry/datasets/support-bot-prod-traces-v3.jsonl ← pinned by filename ``` Pass the contents via `inputData` parameter in **`evaluation_agent_batch_eval_create`**. -### ~~Server-Side Pinning~~ (Not Available) +### Server-Side Version Discovery -> ⚠️ **Dataset upload MCP tools are not yet ready.** Skip `evaluation_dataset_create` (uploads) for now. You may use `evaluation_dataset_get` for read-only inspection of any existing server-side datasets, but do **not** rely on them for version pinning—use local JSONL files and pass data via `inputData` when running evaluations. +Use `evaluation_dataset_versions_get` to list all versions of a dataset registered in Foundry: + +``` +evaluation_dataset_versions_get(projectEndpoint, datasetName: "<agent>-<env>-<source>") +``` + +Use `evaluation_dataset_get` without a name to list all datasets in the project: + +``` +evaluation_dataset_get(projectEndpoint) +``` + +> 💡 **Tip:** Server-side versions are available after syncing via [Trace-to-Dataset → Step 5](trace-to-dataset.md#step-5--sync-local-cache-with-foundry-optional). Local `manifest.json` remains useful for lineage metadata (source, harvestRule, reviewedBy) not stored server-side.
## Manifest File -Track all dataset versions, tags, and lineage in `datasets/manifest.json`: +Track all dataset versions, tags, and lineage in `.foundry/datasets/manifest.json`: ```json { "datasets": [ { - "name": "support-bot-traces-v1", - "file": "support-bot-traces-v1.jsonl", + "name": "support-bot-prod-traces-v1", + "file": "support-bot-prod-traces-v1.jsonl", "version": "1", "tag": "deprecated", "source": "trace-harvest", @@ -67,8 +80,8 @@ Track all dataset versions, tags, and lineage in `datasets/manifest.json`: "evalRunIds": ["run-abc-123"] }, { - "name": "support-bot-traces-v2", - "file": "support-bot-traces-v2.jsonl", + "name": "support-bot-prod-traces-v2", + "file": "support-bot-prod-traces-v2.jsonl", "version": "2", "tag": "baseline", "source": "trace-harvest", @@ -79,8 +92,8 @@ Track all dataset versions, tags, and lineage in `datasets/manifest.json`: "evalRunIds": ["run-def-456", "run-ghi-789"] }, { - "name": "support-bot-traces-v3", - "file": "support-bot-traces-v3.jsonl", + "name": "support-bot-prod-traces-v3", + "file": "support-bot-prod-traces-v3.jsonl", "version": "3", "tag": "prod", "source": "trace-harvest", @@ -96,7 +109,7 @@ Track all dataset versions, tags, and lineage in `datasets/manifest.json`: ## Creating a New Version -1. **Check existing versions**: Read `datasets/manifest.json` to find the latest version number +1. **Check existing versions**: Read `.foundry/datasets/manifest.json` to find the latest version number 2. **Increment version**: Use `v<N+1>` as the new version 3. **Create dataset**: Via [Trace-to-Dataset](trace-to-dataset.md) or manual JSONL creation 4.
**Update manifest**: Add the new entry with metadata @@ -141,11 +154,11 @@ To understand how a dataset evolved between versions: ```bash # Count examples per version -wc -l datasets/support-bot-traces-v*.jsonl +wc -l .foundry/datasets/support-bot-prod-traces-v*.jsonl # Diff example queries between versions -jq -r '.query' datasets/support-bot-traces-v2.jsonl | sort > /tmp/v2-queries.txt -jq -r '.query' datasets/support-bot-traces-v3.jsonl | sort > /tmp/v3-queries.txt +jq -r '.query' .foundry/datasets/support-bot-prod-traces-v2.jsonl | sort > /tmp/v2-queries.txt +jq -r '.query' .foundry/datasets/support-bot-prod-traces-v3.jsonl | sort > /tmp/v3-queries.txt diff /tmp/v2-queries.txt /tmp/v3-queries.txt ``` diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md index 0c6b56bcd..02fa5d1fd 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md @@ -8,11 +8,11 @@ Track the complete chain from production traces through dataset creation, evalua Production Trace (App Insights) │ conversationId, responseId ▼ -Dataset Version (datasets/*.jsonl) +Dataset Version (.foundry/datasets/*.jsonl, environment-scoped) │ metadata.conversationId, metadata.harvestRule ▼ Evaluation Run (evaluation_agent_batch_eval_create) - │ evaluationId, evalRunId + │ evaluationId when creating, evalId when querying, evalRunId ▼ Comparison (evaluation_comparison_create) │ insightId, baselineRunId, treatmentRunIds @@ -25,14 +25,14 @@ Production Trace (cycle repeats) ## Lineage Manifest -Track lineage in `datasets/manifest.json`: +Track lineage in `.foundry/datasets/manifest.json`: ```json { "datasets": [ { - "name": "support-bot-traces-v3", - "file": "support-bot-traces-v3.jsonl", + "name": "support-bot-prod-traces-v3", + "file": 
"support-bot-prod-traces-v3.jsonl", "version": "3", "tag": "prod", "source": "trace-harvest", @@ -81,7 +81,7 @@ Track lineage in `datasets/manifest.json`: ### "Why was version X deployed?" -1. Read `datasets/manifest.json` +1. Read `.foundry/datasets/manifest.json` 2. Find entries where `deployments[].agentVersion == X` 3. Show the comparison that justified the deployment 4. Show the dataset and eval runs that informed the comparison @@ -108,7 +108,7 @@ Track lineage in `datasets/manifest.json`: ## Maintaining Lineage -Update `datasets/manifest.json` at each step: +Update `.foundry/datasets/manifest.json` at each step: | Event | Fields to Update | |-------|-----------------| @@ -118,6 +118,8 @@ Update `datasets/manifest.json` at each step: | Deployment | Append to `deployments[]` with `agentVersion`, `reason` | | Tag change | Update `tag` field | +> 💡 **Tip:** Store the evaluation group identifier as `evalId` in lineage/manifest records, even if the create call used the parameter name `evaluationId`. + ## Next Steps - **View metric trends** → [Eval Trending](eval-trending.md) diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-regression.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-regression.md index c9377de20..c23fca9be 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-regression.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-regression.md @@ -11,7 +11,7 @@ Automatically detect when evaluation metrics degrade between agent versions. Com ### Automatic Baseline Selection -1. Read `datasets/manifest.json` and find the dataset tagged `baseline`. +1. Read `.foundry/datasets/manifest.json` and find the dataset tagged `baseline`. 2. If the baseline dataset entry includes a stored `baselineRunId` (or mapping to one or more `evalRunIds`), use that `baselineRunId` as the baseline run. 3. 
If no explicit `baselineRunId` is recorded, select the first (oldest) run in the evaluation group as the baseline. diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-trending.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-trending.md index 6ea2d45c3..b4b3596d1 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-trending.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-trending.md @@ -4,8 +4,10 @@ Track evaluation metrics across multiple runs and versions to visualize improvem ## Prerequisites -- At least 2 evaluation runs in the same evaluation group (same `evaluationId`) -- Project endpoint available in `.env` +- At least 2 evaluation runs in the same evaluation group (same `evaluationId` when created) +- Project endpoint and selected environment available in `.foundry/agent-metadata.yaml` + +> ⚠️ **Eval-group immutability:** Trend a group only when its evaluator set and thresholds stayed fixed across runs. If either changed, start a new evaluation group and track that history separately. ## Step 1 — Retrieve Evaluation History @@ -24,6 +26,8 @@ Then retrieve all runs within the target evaluation group: | `evalId` | ✅ | Evaluation group ID | | `isRequestForRuns` | ✅ | `true` to list runs | +> ⚠️ **Parameter guardrail:** evaluation_get expects `evalId`, not `evaluationId`, even if the runs were grouped earlier with `evaluationId`. 
+ ## Step 2 — Build Metrics Timeline For each run, extract per-evaluator scores and build a timeline: diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/mcp-gap-analysis.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/mcp-gap-analysis.md deleted file mode 100644 index 8b425e81b..000000000 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/mcp-gap-analysis.md +++ /dev/null @@ -1,133 +0,0 @@ -# MCP Tool Gap Analysis — Foundry Platform Roadmap Recommendations - -This document identifies MCP tool capabilities that would significantly enhance the evaluation dataset experience but are **not currently available** in the `foundry-mcp` server. These are recommendations for the platform team to close competitive gaps with LangSmith. - -## Current MCP Tool Coverage - -| Tool | Status | Gap | -|------|--------|-----| -| `evaluation_dataset_create` | ⚠️ Not practical | Requires Blob Storage SAS URL upload — no file upload path from agent. Use local JSONL + `inputData` instead | -| `evaluation_dataset_get` | ✅ Available | Cannot list all versions of a dataset; only gets by name+version | -| `evaluation_agent_batch_eval_create` | ✅ Available | Full-featured | -| `evaluation_dataset_batch_eval_create` | ✅ Available | Full-featured | -| `evaluation_get` | ✅ Available | Cannot filter runs by dataset version | -| `evaluation_comparison_create` | ✅ Available | No trend analysis; only pairwise comparison | -| `evaluation_comparison_get` | ✅ Available | Full-featured | -| `evaluator_catalog_*` | ✅ Available | No version history or audit trail | - -## Requested New MCP Tools - -### Priority 1: Critical (Blocks competitive parity with LangSmith) - -#### `dataset_version_list` -**Purpose:** List all versions of a named dataset. 
- -| Parameter | Type | Description | -|-----------|------|-------------| -| `projectEndpoint` | string (required) | Azure AI Project endpoint | -| `datasetName` | string (required) | Dataset name | - -**Why needed:** Currently, `evaluation_dataset_get` requires both name and version. There is no way to discover what versions exist for a given dataset. Users must track versions externally (our manifest.json workaround). - -**LangSmith equivalent:** Automatic version history with read-only historical access. - -#### `dataset_from_traces` -**Purpose:** Server-side extraction of App Insights traces into a dataset, with filtering and schema transformation. - -| Parameter | Type | Description | -|-----------|------|-------------| -| `projectEndpoint` | string (required) | Azure AI Project endpoint | -| `appInsightsResourceId` | string (required) | App Insights ARM resource ID | -| `filterQuery` | string (required) | KQL filter expression | -| `timeRange` | string (required) | Time range (e.g., "7d", "30d") | -| `datasetName` | string (optional) | Target dataset name | -| `datasetVersion` | string (optional) | Target version | -| `sampleSize` | integer (optional) | Max number of traces to extract | - -**Why needed:** Currently, trace-to-dataset requires client-side KQL execution, result parsing, schema transformation, and upload. A server-side tool would dramatically simplify the workflow and enable automation. - -**LangSmith equivalent:** Run rules with automatic trace-to-dataset routing. - -### Priority 2: High (Differentiating features) - -#### `evaluation_trend_get` -**Purpose:** Retrieve time-series metrics across all runs in an evaluation group. 
- -| Parameter | Type | Description | -|-----------|------|-------------| -| `projectEndpoint` | string (required) | Azure AI Project endpoint | -| `evalId` | string (required) | Evaluation group ID | -| `evaluatorNames` | string[] (optional) | Filter to specific evaluators | - -**Returns:** Array of `{ runId, agentVersion, date, metrics: { evaluatorName: { average, stddev, passRate } } }`. - -**Why needed:** Currently requires multiple `evaluation_get` calls and client-side aggregation. A dedicated tool would enable trend dashboards and regression detection in a single call. - -**LangSmith equivalent:** Evaluation dashboard with historical metrics and trend analysis. - -#### `dataset_tag_manage` -**Purpose:** Add, remove, or list tags on dataset versions. - -| Parameter | Type | Description | -|-----------|------|-------------| -| `projectEndpoint` | string (required) | Azure AI Project endpoint | -| `datasetName` | string (required) | Dataset name | -| `datasetVersion` | string (required) | Dataset version | -| `action` | string (required) | `add`, `remove`, `list` | -| `tag` | string (optional) | Tag to add/remove (e.g., `prod`, `baseline`) | - -**Why needed:** Tags enable version pinning semantics (e.g., "evaluate against the `prod` dataset"). Currently requires external tracking via manifest.json. - -**LangSmith equivalent:** Built-in dataset tagging with programmatic SDK access. - -### Priority 3: Medium (Nice-to-have for competitive advantage) - -#### `dataset_split_manage` -**Purpose:** Create and manage train/validation/test splits within a dataset. - -**Why needed:** Enables targeted evaluation on specific dataset subsets without creating separate datasets. Currently requires client-side JSONL filtering. - -#### `annotation_queue_create` / `annotation_queue_get` -**Purpose:** Server-side human review queues for trace candidates before dataset inclusion. - -**Why needed:** Enables multi-user review workflows. 
Currently, curation is a single-user, local-file process. - -**LangSmith equivalent:** Annotation queues with multi-user review, approval workflows, and queue management. - -#### `evaluation_regression_check` -**Purpose:** Automated regression detection with configurable thresholds. - -| Parameter | Type | Description | -|-----------|------|-------------| -| `projectEndpoint` | string (required) | Azure AI Project endpoint | -| `evalId` | string (required) | Evaluation group ID | -| `baselineRunId` | string (required) | Baseline run ID | -| `treatmentRunId` | string (required) | Treatment run ID | -| `regressionThreshold` | number (optional) | Percent drop that triggers regression (default: 5%) | - -**Why needed:** Currently requires comparison + client-side threshold logic. A dedicated tool could integrate with CI/CD pipelines directly. - -## Impact Assessment - -| Requested Tool | Impact on CX Feedback | Effort Estimate | -|---------------|----------------------|-----------------| -| `dataset_version_list` | Directly addresses "organizing datasets" feedback | Low | -| `dataset_from_traces` | Directly addresses "creating datasets from traces" feedback | High | -| `evaluation_trend_get` | Directly addresses "comparing runs and metrics over time" feedback | Medium | -| `dataset_tag_manage` | Supports "hierarchical containers" feedback (via tags) | Low | -| `dataset_split_manage` | Supports "hierarchical containers" feedback (via splits) | Medium | -| `annotation_queue_*` | Enhances trace-to-dataset quality | High | -| `evaluation_regression_check` | Enables CI/CD regression gates | Medium | - -## Interim Workarounds - -Until these MCP tools are available, the [eval-datasets skill](../eval-datasets.md) provides client-side workarounds: - -| Gap | Workaround | -|-----|-----------| -| No version listing | `datasets/manifest.json` tracks all versions locally | -| No trace-to-dataset | KQL harvest templates + local schema transform | -| No trend analysis | Multiple 
`evaluation_get` calls + client-side aggregation | -| No tagging | Tags stored in manifest.json | -| No annotation queues | Local candidate files with status tracking | -| No regression check | Comparison results + threshold logic in skill | diff --git a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md index c48c7d7fc..b231fd0a0 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md @@ -4,8 +4,6 @@ Extract production traces from App Insights using KQL, transform them into evalu ## ⛔ Do NOT -- Do NOT upload datasets to blob storage or call `evaluation_dataset_create` — this MCP tool is not ready. -- Do NOT generate SAS URLs. Local JSONL + `inputData` is the only supported path. - Do NOT use `parse_json(customDimensions)` — `customDimensions` is already a `dynamic` column in App Insights KQL. Access properties directly: `customDimensions["gen_ai.response.id"]`. ## Related References @@ -16,7 +14,7 @@ Extract production traces from App Insights using KQL, transform them into evalu ## Prerequisites - App Insights resource resolved (see [trace skill](../../trace/trace.md) Before Starting) -- Agent name and project endpoint available in `.env` +- Agent root, environment, and project endpoint available in `.foundry/agent-metadata.yaml` - Time range confirmed with user (default: last 7 days) > 💡 **Run all KQL queries** using **`monitor_resource_log_query`** (Azure MCP tool) against the App Insights resource. This is preferred over delegating to the `azure-kusto` skill. 
@@ -39,6 +37,9 @@ App Insights traces │ ▼ [4] Persist Dataset (local JSONL files) + │ + ▼ +[5] Sync to Foundry (optional — upload to project-connected storage) ``` ## Key Concept: Linking Evaluation Results to Traces @@ -260,7 +261,7 @@ dependencies Extract the `query` from the last user-role entry in `gen_ai.input.messages` and the `response` from `gen_ai.output.messages`. Save extracted data to a local JSONL file: ``` -datasets/-traces-candidates-.jsonl +.foundry/datasets/--traces-candidates-.jsonl ``` ## Step 3 — Human Review (Curation) @@ -282,7 +283,7 @@ Ask the user: ## Step 4 — Persist Dataset (Local JSONL) -Save approved candidates to `datasets/--v.jsonl`: +Save approved candidates to `.foundry/datasets/---v.jsonl`: ```json {"query": "How do I reset my password?", "context": "User account management", "metadata": {"source": "trace", "conversationId": "conv-abc-123", "harvestRule": "error"}} @@ -291,14 +292,14 @@ Save approved candidates to `datasets/--v.jsonl`: ### Update Manifest -After persisting, update `datasets/manifest.json` with lineage information: +After persisting, update `.foundry/datasets/manifest.json` with lineage information: ```json { "datasets": [ { - "name": "support-bot-traces-v3", - "file": "support-bot-traces-v3.jsonl", + "name": "support-bot-prod-traces-v3", + "file": "support-bot-prod-traces-v3.jsonl", "version": "3", "source": "trace-harvest", "harvestRule": "error+latency", @@ -314,6 +315,77 @@ After persisting, update `datasets/manifest.json` with lineage information: ## Next Steps After creating a dataset: +- **Sync to Foundry** → Step 5 below (recommended for shared/CI use) - **Run evaluation** → [observe skill Step 2](../../observe/references/evaluate-step.md) - **Version and tag** → [Dataset Versioning](dataset-versioning.md) - **Organize into splits** → [Dataset Organization](dataset-organization.md) + +## Step 5 — Sync Local Cache with Foundry (Optional) + +Refresh or register the local cache in Foundry so it is available for 
server-side evaluations, shared access, and CI/CD pipelines. Reuse the local cache when it is current, and only refresh or push after user confirmation. + +### 5a. Discover Storage Connection + +Use `project_connection_list` to find an existing `AzureBlob` storage connection on the Foundry project: + +``` +project_connection_list(foundryProjectResourceId, category: "AzureBlob") +``` + +- **Found** → use its `connectionName` and `target` (storage account URL) +- **Not found** → proceed to 5b + +### 5b. Create Storage Connection (if needed) + +Ask the user for a storage account, then create a project connection: + +``` +project_connection_create( + foundryProjectResourceId, + connectionName: "datasets-storage", + category: "AzureBlob", + target: "https://.blob.core.windows.net", + authType: "AAD" +) +``` + +> 💡 **Tip:** The storage account must be in the same subscription or the user must have access. AAD auth is preferred — it uses the caller's identity. + +### 5c. Upload JSONL to Blob Storage + +Upload the local dataset file to a `datasets` container in the storage account: + +```bash +az storage blob upload \ + --account-name \ + --container-name datasets \ + --name ---v.jsonl \ + --file .foundry/datasets/---v.jsonl \ + --auth-mode login +``` + +> ⚠️ **Always pass `--auth-mode login`** to use AAD credentials. If the container doesn't exist, create it first with `az storage container create`. + +### 5d. Register Dataset in Foundry + +Use `evaluation_dataset_create` with the blob URI and the Azure Blob `connectionName` discovered in 5a or created in 5b. While `connectionName` can be optional in other MCP flows, include it in this workflow so the dataset is bound to the project-connected storage account: + +``` +evaluation_dataset_create( + projectEndpoint: "", + datasetContentUri: "https://.blob.core.windows.net/datasets/.jsonl", + connectionName: "datasets-storage", + datasetName: "--", + datasetVersion: "" +) +``` + +### 5e. 
Verify + +Confirm the dataset is registered: + +``` +evaluation_dataset_get(projectEndpoint, datasetName: "--", datasetVersion: "") +``` + +Display the registered dataset details to the user. Update `.foundry/datasets/manifest.json` with `"synced": true` and the server-side dataset name/version. diff --git a/plugin/skills/microsoft-foundry/foundry-agent/observe/observe.md b/plugin/skills/microsoft-foundry/foundry-agent/observe/observe.md index 597e0493a..c3fb17d7e 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/observe/observe.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/observe/observe.md @@ -1,12 +1,12 @@ # Agent Observability Loop -Orchestrate the full eval-driven optimization cycle for a Foundry agent. This skill manages the **multi-step workflow** — auto-creating evaluators, generating test datasets, running batch evals, clustering failures, optimizing prompts, redeploying, and comparing versions. Use this skill instead of calling individual `azure` MCP evaluation tools manually. +Orchestrate the full eval-driven optimization cycle for a Foundry agent. This skill manages the **multi-step workflow** for a selected agent root and environment: reusing or refreshing `.foundry` cache, auto-creating evaluators, generating test datasets, running batch evals, clustering failures, optimizing prompts, redeploying, and comparing versions. Use this skill instead of calling individual `azure` MCP evaluation tools manually. ## When to Use This Skill USE FOR: evaluate my agent, run an eval, test my agent, check agent quality, run batch evaluation, analyze eval results, why did my eval fail, cluster failures, improve agent quality, optimize agent prompt, compare agent versions, re-evaluate after changes, set up CI/CD evals, agent monitoring, eval-driven optimization. 
-> ⚠️ **DO NOT manually call** `evaluation_agent_batch_eval_create`, `evaluator_catalog_create`, `evaluation_comparison_create`, or `prompt_optimize` **without reading this skill first.** This skill defines required pre-checks, artifact persistence, and multi-step orchestration that the raw tools do not enforce. +> ⚠️ **DO NOT manually call** `evaluation_agent_batch_eval_create`, `evaluator_catalog_create`, `evaluation_comparison_create`, or `prompt_optimize` **without reading this skill first.** This skill defines required pre-checks, environment selection, cache reuse, artifact persistence, and multi-step orchestration that the raw tools do not enforce. ## Quick Reference @@ -15,6 +15,7 @@ USE FOR: evaluate my agent, run an eval, test my agent, check agent quality, run | MCP server | `azure` | | Key MCP tools | `evaluation_agent_batch_eval_create`, `evaluator_catalog_create`, `evaluation_comparison_create`, `prompt_optimize`, `agent_update` | | Prerequisite | Agent deployed and running (use [deploy skill](../deploy/deploy.md)) | +| Local cache | `.foundry/agent-metadata.yaml`, `.foundry/evaluators/`, `.foundry/datasets/`, `.foundry/results/` | ## Entry Points @@ -22,44 +23,49 @@ USE FOR: evaluate my agent, run an eval, test my agent, check agent quality, run |-------------|----------| | "Deploy and evaluate my agent" | [Step 1: Auto-Setup Evaluators](references/deploy-and-setup.md) (deploy first via [deploy skill](../deploy/deploy.md)) | | "Agent just deployed" / "Set up evaluation" | [Step 1: Auto-Setup Evaluators](references/deploy-and-setup.md) (skip deploy, run auto-create) | -| "Evaluate my agent" / "Run an eval" | [Step 1: Auto-Setup Evaluators](references/deploy-and-setup.md) first if `evaluators/` is empty, then [Step 2: Evaluate](references/evaluate-step.md) | +| "Evaluate my agent" / "Run an eval" | [Step 1: Auto-Setup Evaluators](references/deploy-and-setup.md) first if `.foundry/evaluators/` or `.foundry/datasets/` cache is missing, stale, or the user 
requests refresh, then [Step 2: Evaluate](references/evaluate-step.md) | | "Why did my eval fail?" / "Analyze results" | [Step 3: Analyze](references/analyze-results.md) | | "Improve my agent" / "Optimize prompt" | [Step 4: Optimize](references/optimize-deploy.md) | | "Compare agent versions" | [Step 5: Compare](references/compare-iterate.md) | | "Set up CI/CD evals" | [Step 6: CI/CD](references/cicd-monitoring.md) | -> ⚠️ **Important:** Before running any evaluation (Step 2), always check if evaluators and test datasets exist in `evaluators/` and `datasets/`. If they don't, route through [Step 1: Auto-Setup](references/deploy-and-setup.md) first — even if the user only asked to "evaluate." +> ⚠️ **Important:** Before running any evaluation (Step 2), always resolve the selected agent root and environment, then inspect `.foundry/agent-metadata.yaml` plus `.foundry/evaluators/` and `.foundry/datasets/`. If the cache is missing, stale, or the user wants to refresh it, route through [Step 1: Auto-Setup](references/deploy-and-setup.md) first — even if the user only asked to "evaluate." ## Before Starting — Detect Current State -1. Check `.env` for `AZURE_AI_PROJECT_ENDPOINT` and `AZURE_AI_AGENT_NAME` -2. Use `agent_get` and `agent_container_status_get` to verify the agent exists and is running -3. Use `evaluation_get` to check for existing eval runs -4. Jump to the appropriate entry point +1. Resolve the target agent root and environment from `.foundry/agent-metadata.yaml`. +2. Use `agent_get` and `agent_container_status_get` to verify the environment's agent exists and is running. +3. Inspect the selected environment's `testCases[]` plus cached files under `.foundry/evaluators/` and `.foundry/datasets/`. +4. Use `evaluation_get` to check for existing eval runs. +5. Jump to the appropriate entry point. ## Loop Overview -``` -1. Auto-setup evaluators & local test dataset - → ask: "Run an evaluation to identify optimization opportunities?" +```text +1. 
Auto-setup evaluators or refresh .foundry cache for the selected environment + -> ask: "Run an evaluation to identify optimization opportunities?" 2. Evaluate (batch eval run) -3. Download & cluster failures -4. Pick a category to optimize +3. Download and cluster failures +4. Pick a category or test case to optimize 5. Optimize prompt 6. Deploy new version (after user sign-off) -7. Re-evaluate (same eval group) -8. Compare versions → decide which to keep +7. Re-evaluate (same env + same test case) +8. Compare versions -> decide which to keep 9. Loop to next category or finish -10. Prompt: enable CI/CD evals & continuous production monitoring +10. Prompt: enable CI/CD evals and continuous production monitoring ``` ## Behavioral Rules -1. **Auto-poll in background.** After creating eval runs or starting containers, poll in a background terminal. Only surface the final result. -2. **Confirm before changes.** Show diff/summary before modifying agent code or deploying. Wait for sign-off. -3. **Prompt for next steps.** After each step, present options. Never assume the path forward. -4. **Write scripts to files.** Python scripts go in `scripts/` — no inline code blocks. -5. **Persist eval artifacts.** Save to `evaluators/`, `datasets/`, and `results/` for version tracking (see [deploy-and-setup](references/deploy-and-setup.md) for structure). +1. **Keep context visible.** Restate the selected agent root and environment in setup, evaluation, and result summaries. +2. **Reuse cache before regenerating.** Prefer existing `.foundry/evaluators/` and `.foundry/datasets/` when they match the active environment. Ask before refreshing or overwriting them. +3. **Start with P0 test cases.** Run the selected environment's `P0` test cases before broader `P1` or `P2` coverage unless the user explicitly chooses otherwise. +4. **Auto-poll in background.** After creating eval runs or starting containers, poll in a background terminal. Only surface the final result. +5. 
**Confirm before changes.** Show diff/summary before modifying agent code, refreshing cache, or deploying. Wait for sign-off. +6. **Prompt for next steps.** After each step, present options. Never assume the path forward. +7. **Write scripts to files.** Python scripts go in `scripts/` - no inline code blocks. +8. **Persist eval artifacts.** Save local artifacts to `.foundry/evaluators/`, `.foundry/datasets/`, and `.foundry/results/` for version tracking and comparison. +9. **Use exact eval parameter names.** Use `evaluationId` only on batch-eval create calls that group runs; use `evalId` on `evaluation_get` and `evaluation_comparison_create`; use `evalRunId` for a specific run lookup. ## Related Skills diff --git a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md index e5f61f06f..f32fd77cf 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md @@ -4,16 +4,14 @@ `evaluation_get` returns run metadata but **not** full per-row output. Write a Python script (save to `scripts/`) to download detailed results: -1. Initialize `AIProjectClient` with project endpoint and `DefaultAzureCredential` +1. Initialize `AIProjectClient` with the selected environment's project endpoint and `DefaultAzureCredential` 2. Get OpenAI client via `project_client.get_openai_client()` 3. Call `openai_client.evals.runs.output_items.list(eval_id=..., run_id=...)` -4. Serialize each item with `item.model_dump()` and save to `results//.json` (use `default=str` for non-serializable fields) +4. Serialize each item with `item.model_dump()` and save to `.foundry/results///.json` (use `default=str` for non-serializable fields) 5. 
Print summary: total items, passed, failed, errored counts > ⚠️ **Data structure gotcha:** Query/response data lives in `datasource_item.query` and `datasource_item['sample.output_text']`, **not** in `sample.input`/`sample.output` (which are empty arrays). Parse `datasource_item` fields when extracting queries and responses for analysis. -> SDK setup: `pip install azure-ai-projects azure-identity openai` - ## Step 4 — Cluster Failures by Root Cause Analyze every row in the results. Group failures into clusters: @@ -27,23 +25,22 @@ Analyze every row in the results. Group failures into clusters: | Runtime error | Agent crashed or returned an error | | Off-topic / refusal | Agent refused or went off-topic | -Produce a **prioritized action table**: +Produce a prioritized action table: | Priority | Cluster | Suggested Action | |----------|---------|------------------| -| P0 | Runtime errors | Check container logs | -| P1 | Incorrect answers | Optimize prompt ([Step 6](optimize-deploy.md)) | -| P2 | Incomplete answers | Optimize prompt ([Step 6](optimize-deploy.md)) | +| P0 | Runtime errors or failing `P0` test cases | Check container logs or fix blockers first | +| P1 | Incorrect answers on key flows | Optimize prompt or tool instructions | +| P2 | Incomplete answers or broader quality gaps | Optimize prompt or expand context | | P3 | Tool call failures | Fix tool definitions or instructions | | P4 | Safety violations | Add guardrails to instructions | -| P5 | Off-topic / refusal | Clarify scope in instructions | -**Rule:** Runtime errors first (P0), then by count × severity. +**Rule:** Prioritize runtime errors first, then sort by test-case priority (`P0` before `P1` before `P2`) and count × severity. ## Step 5 — Dive Into Category -When the user wants to inspect a specific cluster, display the individual rows: input query, the agent's original response, evaluator scores, and failure reason. Let the user confirm which category to optimize. 
+When the user wants to inspect a specific cluster, display the individual rows: test-case ID, input query, the agent's original response, evaluator scores, and failure reason. Let the user confirm which category or test case to optimize. ## Next Steps -After clustering → proceed to [Step 6: Optimize Prompt](optimize-deploy.md). +After clustering -> proceed to [Step 6: Optimize Prompt](optimize-deploy.md). diff --git a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/cicd-monitoring.md b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/cicd-monitoring.md index 0fc85689a..c1fcd15f3 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/cicd-monitoring.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/cicd-monitoring.md @@ -6,25 +6,26 @@ After confirming the final agent version, prompt with two options: *"Would you like to add automated evaluations to your CI/CD pipeline so every deployment is evaluated before going live?"* -If yes, generate a GitHub Actions workflow (e.g., `.github/workflows/agent-eval.yml`) that: +If yes, generate a GitHub Actions workflow (for example, `.github/workflows/agent-eval.yml`) that: 1. Triggers on push to `main` or on pull request -2. Reads evaluator definitions from `evaluators/` and test datasets from `datasets/` -3. Runs `evaluation_agent_batch_eval_create` against the newly deployed agent version -4. Fails the workflow if any evaluator score falls below configured thresholds -5. Posts a summary as a PR comment or workflow annotation +2. Reads test-case definitions from `.foundry/agent-metadata.yaml` +3. Reads evaluator definitions from `.foundry/evaluators/` and test datasets from `.foundry/datasets/` +4. Runs `evaluation_agent_batch_eval_create` against the newly deployed agent version +5. Fails the workflow if any evaluator score falls below the configured thresholds for the selected environment/test case +6. 
Posts a summary as a PR comment or workflow annotation -Use repository secrets for `AZURE_AI_PROJECT_ENDPOINT` and Azure credentials. Confirm the workflow file with the user before committing. +Use repository secrets for the selected environment's project endpoint and Azure credentials. Confirm the workflow file with the user before committing. ## Option 2 — Continuous Production Monitoring *"Would you like to set up continuous evaluations to monitor your agent's quality in production?"* -If yes, generate a scheduled GitHub Actions workflow (e.g., `.github/workflows/agent-eval-scheduled.yml`) that: +If yes, generate a scheduled GitHub Actions workflow (for example, `.github/workflows/agent-eval-scheduled.yml`) that: -1. Runs on a cron schedule (ask user preference: daily, weekly, etc.) -2. Evaluates the current production agent version using stored evaluators and datasets -3. Saves results to `results/` +1. Runs on a cron schedule (ask the user's preference: daily, weekly, and so on) +2. Evaluates the current production agent version using stored test cases, evaluators, and datasets +3. Saves results to `.foundry/results//` 4. Opens a GitHub issue or sends a notification if any score degrades below thresholds The user may choose one, both, or neither. diff --git a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/compare-iterate.md b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/compare-iterate.md index 428138301..a6114b77c 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/compare-iterate.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/compare-iterate.md @@ -2,7 +2,11 @@ ## Step 8 — Re-Evaluate -Use **`evaluation_agent_batch_eval_create`** with the **same `evaluationId`** as the baseline run. This places both runs in the same eval group for comparison. Use the same local test dataset (from `datasets/`) and evaluators. Update `agentVersion` to the new version.
+Use **`evaluation_agent_batch_eval_create`** with the **same `evaluationId`** as the baseline run. This places both runs in the same eval group for comparison. Use the same local test dataset (from `.foundry/datasets/`) and evaluator bundle from the selected environment/test case. Update `agentVersion` to the new version. + +> ⚠️ **Parameter switch reminder:** Re-evaluation creation uses `evaluationId`, but follow-up calls to `evaluation_get` and `evaluation_comparison_create` must use `evalId`. + +> ⚠️ **Eval-group immutability:** Reuse the same `evaluationId` only when `evaluatorNames` and thresholds are unchanged. If you add/remove evaluators or change thresholds, create a new evaluation group first, then compare runs within that new group. Auto-poll for completion in a background terminal (same as [Step 2](evaluate-step.md)). @@ -37,7 +41,7 @@ Use **`evaluation_comparison_create`** with a nested `insightRequest`: } ``` -> **Important:** Both runs must be in the **same eval group** (same `evaluationId` in Steps 2 and 8). +> **Important:** Both runs must be in the **same eval group** (same `evaluationId` in Steps 2 and 8), but comparison requests and lookups use `evalId` for that same group identifier. That shared group assumes the evaluator bundle is fixed for all runs in the group. Then use **`evaluation_comparison_get`** (with the returned `insightId`) to retrieve comparison results. Present a summary showing which version performed better per evaluator, and recommend which version to keep. 
diff --git a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md index 47b2cbac1..9a5490864 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md @@ -1,67 +1,80 @@ # Step 1 — Auto-Setup Evaluators & Dataset -> **This step runs automatically after deployment.** If the agent was deployed via the [deploy skill](../../deploy/deploy.md), evaluators and a test dataset may already be configured. Check `evaluators/` and `datasets/` for existing artifacts before re-creating. +> **This step runs automatically after deployment.** If the agent was deployed via the [deploy skill](../../deploy/deploy.md), `.foundry` cache and metadata may already be configured. Check `.foundry/evaluators/`, `.foundry/datasets/`, and `.foundry/agent-metadata.yaml` for existing artifacts before re-creating them. > -> If the agent is **not yet deployed**, follow the [deploy skill](../../deploy/deploy.md) first. It handles project detection, Dockerfile generation, ACR build, agent creation, container startup, **and** auto-creates evaluators & dataset after a successful deployment. +> If the agent is **not yet deployed**, follow the [deploy skill](../../deploy/deploy.md) first. It handles project detection, Dockerfile generation, ACR build, agent creation, container startup, and auto-creates `.foundry` cache after a successful deployment. ## Auto-Create Evaluators & Dataset -> **This step is fully automatic.** After deployment, immediately prepare evaluators and a local test dataset without waiting for the user to request it. +> **This step is fully automatic.** After deployment, immediately prepare evaluators and a local test dataset for the selected environment without waiting for the user to request it. ### 1. 
Read Agent Instructions Use **`agent_get`** (or local `agent.yaml`) to understand the agent's purpose and capabilities. -### 2. Select Evaluators +### 2. Reuse or Refresh Cache -Combine **built-in, custom, and safety evaluators**: +Inspect `.foundry/evaluators/`, `.foundry/datasets/`, and the selected environment's `testCases[]`. + +- **Cache is current** -> reuse it and summarize what is already available. +- **Cache is missing or stale** -> refresh it after confirming with the user. +- **User explicitly asks for refresh** -> rebuild and rewrite only the selected environment's cache. + +### 3. Select Evaluators + +Combine built-in, custom, and safety evaluators: | Category | Evaluators | |----------|-----------| | **Quality (built-in)** | intent_resolution, task_adherence, coherence, fluency, relevance | -| **Safety (include ≥2)** | violence, self_harm, hate_unfairness, sexual, indirect_attack | -| **Custom (create 1–2)** | Domain-specific via `evaluator_catalog_create` (see below) | +| **Safety (include >=2)** | violence, self_harm, hate_unfairness, sexual, indirect_attack | +| **Custom (create 1-2)** | Domain-specific via `evaluator_catalog_create` | -### 3. Create Custom Evaluators +### 4. Create Custom Evaluators -Use **`evaluator_catalog_create`** with: +Use **`evaluator_catalog_create`** with the selected environment's project endpoint. 
| Parameter | Required | Description | |-----------|----------|-------------| | `projectEndpoint` | ✅ | Azure AI Project endpoint | -| `name` | ✅ | e.g., `domain_accuracy`, `citation_quality` | +| `name` | ✅ | For example, `domain_accuracy`, `citation_quality` | | `category` | ✅ | `quality`, `safety`, or `agents` | | `scoringType` | ✅ | `ordinal`, `continuous`, or `boolean` | | `promptText` | ✅* | Template with `{{query}}`, `{{response}}` placeholders | | `minScore` / `maxScore` | | Default: 1 / 5 | -| `passThreshold` | | Scores ≥ this value pass | +| `passThreshold` | | Scores >= this value pass | -> **LLM-judge tip:** Include in the evaluator prompt: *"Do NOT penalize the response for mentioning dates or events beyond your training cutoff. The agent has real-time access."* +### 5. Identify LLM-Judge Deployment -### 4. Identify LLM-Judge Deployment +Use **`model_deployment_get`** to list the selected project's actual model deployments, then choose one that supports chat completions for quality evaluators. Do **not** assume `gpt-4o` exists in the project. If no deployment supports chat completions, stop the setup flow and explain that quality evaluators need a compatible judge deployment. -Use **`model_deployment_get`** to find a suitable model (e.g., `gpt-4o`) for quality evaluators. +### 6. Generate Local Test Dataset -### 5. Generate Local Test Dataset +Use the identified chat-capable deployment to generate realistic test queries based on the agent's instructions and tool capabilities. Save to `.foundry/datasets/--test-v1.jsonl` with each line containing at minimum a `query` field (optionally `context`, `ground_truth`). -Use the identified LLM deployment to generate realistic test queries based on the agent's instructions and tool capabilities. Save to `datasets/-test.jsonl` with each line containing at minimum a `query` field (optionally `context`, `ground_truth`). +### 7. Persist Artifacts and Test Cases -### 6. 
Persist Artifacts - -``` -evaluators/ # custom evaluator definitions - .yaml # prompt text, scoring type, thresholds -datasets/ # locally generated input datasets - *.jsonl # test queries -results/ # evaluation run outputs (populated later) - / - .json +```text +.foundry/ + agent-metadata.yaml + evaluators/ + .yaml + datasets/ + *.jsonl + results/ + / + / + .json ``` -Save evaluator definitions to `evaluators/.yaml` and test data to `datasets/*.jsonl`. +Save evaluator definitions to `.foundry/evaluators/.yaml`, test data to `.foundry/datasets/*.jsonl`, and create or update test cases in `agent-metadata.yaml` with: +- `id` +- `priority` (`P0`, `P1`, `P2`) +- dataset reference +- evaluator names and thresholds -### 7. Prompt User +### 8. Prompt User -*"Your agent is deployed and running. Evaluators and a local test dataset have been auto-configured. Would you like to run an evaluation to identify optimization opportunities?"* +*"Your agent is deployed and running in the selected environment. The `.foundry` cache now contains evaluators, a local test dataset, and test-case metadata. Would you like to run an evaluation to identify optimization opportunities?"* -If yes → proceed to [Step 2: Evaluate](evaluate-step.md). If no → stop. +If yes -> proceed to [Step 2: Evaluate](evaluate-step.md). If no -> stop. 
diff --git a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md index 23148083b..e204f2d2e 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md @@ -2,28 +2,34 @@ ## Prerequisites -- Agent deployed and running -- Evaluators configured (from [Step 1](deploy-and-setup.md) or `evaluators/` folder) -- Local test dataset available (from `datasets/`) +- Agent deployed and running in the selected environment +- `.foundry/agent-metadata.yaml` loaded for the active agent root +- Evaluators configured (from [Step 1](deploy-and-setup.md) or `.foundry/evaluators/`) +- Local test dataset available (from `.foundry/datasets/`) +- Test case selected from the environment's `testCases[]` ## Run Evaluation -Use **`evaluation_agent_batch_eval_create`** to run evaluators against the agent. +Use **`evaluation_agent_batch_eval_create`** to run the selected test case's evaluators against the selected environment's agent. ### Required Parameters | Parameter | Description | |-----------|-------------| -| `projectEndpoint` | Azure AI Project endpoint | -| `agentName` | Agent name | -| `agentVersion` | Agent version (string, e.g. `"1"`) | -| `evaluatorNames` | Array of evaluator names (NOT `evaluators`) | +| `projectEndpoint` | Azure AI Project endpoint from `agent-metadata.yaml` | +| `agentName` | Agent name for the selected environment | +| `agentVersion` | Agent version (string, for example `"1"`) | +| `evaluatorNames` | Array of evaluator names from the selected test case | ### Test Data Options -**Preferred — local dataset:** Read JSONL from `datasets/` and pass via `inputData` (array of objects with `query` and optionally `context`, `ground_truth`). Provides reproducibility, version control, and reviewability. 
Always use this when `datasets/` contains files. +**Preferred — local dataset:** Read JSONL from `.foundry/datasets/` and pass via `inputData` (array of objects with `query` and optionally `context`, `ground_truth`). Always use this when the referenced cache file exists. -**Fallback only — server-side synthetic data:** Set `generateSyntheticData=true` AND provide `generationModelDeploymentName`. Only use when no local dataset exists and the user explicitly requests it. Optionally set `samplesCount` (default 50) and `generationPrompt` with the agent's instructions. +**Fallback only — server-side synthetic data:** Set `generateSyntheticData=true` and provide `generationModelDeploymentName`. Only use this when the local cache is missing and the user explicitly requests a refresh-free synthetic run. + +## Resolve Judge Deployment + +Before setting `deploymentName`, use **`model_deployment_get`** to list the selected project's actual model deployments. Choose a deployment that supports chat completions and use that deployment name for quality evaluators. Do **not** assume `gpt-4o` exists. If the project has no chat-completions-capable deployment, stop and tell the user quality evaluators cannot run until one is available. ### Additional Parameters @@ -31,19 +37,33 @@ Use **`evaluation_agent_batch_eval_create`** to run evaluators against the agent |-----------|-------------| | `deploymentName` | Required for quality evaluators (the LLM-judge model) | | `evaluationId` | Pass existing eval group ID to group runs for comparison | -| `evaluationName` | Name for a new evaluation group | +| `evaluationName` | Name for a new evaluation group; include environment and test-case ID | + +> **Important:** Use `evaluationId` on `evaluation_agent_batch_eval_create` (not `evalId`) to group runs. Run `P0` test cases first unless the user chooses a broader priority band. 
+ +> ⚠️ **Eval-group immutability:** Reuse an existing `evaluationId` only when the dataset comparison setup is unchanged for that group: same evaluator list and same thresholds. If evaluator definitions or thresholds change, create a **new** evaluation group instead of adding another run to the old one. + +## Parameter Naming Guardrail + +These eval tools use similar names for the same evaluation-group identifier. Match the parameter name to the tool exactly: + +| Tool | Correct Group Parameter | Notes | +|------|-------------------------|-------| +| `evaluation_agent_batch_eval_create` | `evaluationId` | Reuse the existing group when creating a new run | +| `evaluation_get` | `evalId` | Use with `isRequestForRuns=true` to list runs in one group | +| `evaluation_comparison_create` | `insightRequest.request.evalId` | Comparison requests take `evalId`, not `evaluationId` | -> **Important:** Use `evaluationId` (NOT `evalId`) to group runs. +> ⚠️ **Common mistake:** `evaluation_get` does **not** accept `evaluationId`. Always switch from `evaluationId` to `evalId` after the run is created. ## Auto-Poll for Completion -Immediately after creating the run, poll **`evaluation_get`** in a **background terminal** until completion. Use `evalId` + `isRequestForRuns=true`. The run ID parameter is `evalRunId` (NOT `runId`). +Immediately after creating the run, poll **`evaluation_get`** in a background terminal until completion. Use `evalId + isRequestForRuns=true`. The run ID parameter is `evalRunId` (not `runId`). Only surface the final result when status reaches `completed`, `failed`, or `cancelled`. ## Next Steps -When evaluation completes → proceed to [Step 3: Analyze Results](analyze-results.md). +When evaluation completes -> proceed to [Step 3: Analyze Results](analyze-results.md). 
## Reference diff --git a/plugin/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md b/plugin/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md index a663035ef..58d4a8cc7 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md @@ -5,10 +5,13 @@ Search agent traces at the conversation level. Returns summaries grouped by conv ## Prerequisites - App Insights resource resolved (see [trace.md](../trace.md) Before Starting) +- Selected agent root and environment confirmed from `.foundry/agent-metadata.yaml` - Time range confirmed with user (default: last 24 hours) ## Search by Conversation ID +Keep the selected environment visible in the summary, and add the selected agent name or environment tag filters when the telemetry emits them. + ```kql dependencies | where timestamp > ago(24h) @@ -138,4 +141,4 @@ union dependencies, requests, exceptions, traces ## After Successful Query -> 📝 **Reminder:** If this is the first trace query in this session, ensure App Insights connection info was persisted to `.env` (see [trace.md — Before Starting](../trace.md#before-starting--resolve-app-insights-connection)). +> 📝 **Reminder:** If this is the first trace query in this session, ensure App Insights connection info was persisted to `.foundry/agent-metadata.yaml` for the selected environment (see [trace.md — Before Starting](../trace.md#before-starting--resolve-app-insights-connection)). 
diff --git a/plugin/skills/microsoft-foundry/foundry-agent/trace/trace.md b/plugin/skills/microsoft-foundry/foundry-agent/trace/trace.md index 271cb84ba..e0b4549d3 100644 --- a/plugin/skills/microsoft-foundry/foundry-agent/trace/trace.md +++ b/plugin/skills/microsoft-foundry/foundry-agent/trace/trace.md @@ -1,6 +1,6 @@ # Foundry Agent Trace Analysis -Analyze production traces for Foundry agents using Application Insights and GenAI OpenTelemetry semantic conventions. This skill provides **structured KQL-powered workflows** for searching conversations, diagnosing failures, and identifying latency bottlenecks. Use this skill instead of writing ad-hoc KQL queries against App Insights manually. +Analyze production traces for Foundry agents using Application Insights and GenAI OpenTelemetry semantic conventions. This skill provides structured KQL-powered workflows for a selected agent root and environment: searching conversations, diagnosing failures, and identifying latency bottlenecks. ## When to Use This Skill @@ -8,7 +8,7 @@ USE FOR: analyze agent traces, search agent conversations, find failing traces, > **USE THIS SKILL INSTEAD OF** `azure-monitor` or `azure-applicationinsights` when querying Foundry agent traces, evaluations, or GenAI telemetry. This skill has correct GenAI OTel attribute mappings and tested KQL templates that those general tools lack. -> ⚠️ **DO NOT manually write KQL queries** for GenAI trace analysis **without reading this skill first.** This skill provides tested query templates with correct GenAI OTel attribute mappings, proper span correlation logic, and conversation-level aggregation patterns. +> ⚠️ **DO NOT manually write KQL queries** for GenAI trace analysis **without reading this skill first.** This skill provides tested query templates with correct GenAI OTel attribute mappings, proper span correlation logic, environment-aware scoping, and conversation-level aggregation patterns. 
## Quick Reference @@ -16,16 +16,17 @@ USE FOR: analyze agent traces, search agent conversations, find failing traces, |----------|-------| | Data source | Application Insights (App Insights) | | Query language | KQL (Kusto Query Language) | -| Related skills | `troubleshoot` (container logs) | -| Preferred query tool | `monitor_resource_log_query` (Azure MCP) — use for App Insights KQL queries | +| Related skills | `troubleshoot` (container logs), `eval-datasets` (trace harvesting) | +| Preferred query tool | `monitor_resource_log_query` (Azure MCP) - use for App Insights KQL queries | | OTel conventions | [GenAI Spans](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/), [Agent Spans](https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/) | +| Local metadata | `.foundry/agent-metadata.yaml` | ## Entry Points | User Intent | Start At | |-------------|----------| | "Search agent conversations" / "Find traces" | [Search Traces](references/search-traces.md) | -| "Tell me about response ID X" / "Look up response ID" | [Search Traces — Search by Response ID](references/search-traces.md#search-by-response-id) | +| "Tell me about response ID X" / "Look up response ID" | [Search Traces - Search by Response ID](references/search-traces.md#search-by-response-id) | | "Why is my agent failing?" / "Find errors" | [Analyze Failures](references/analyze-failures.md) | | "My agent is slow" / "Latency analysis" | [Analyze Latency](references/analyze-latency.md) | | "Show me this conversation" / "Trace detail" | [Conversation Detail](references/conversation-detail.md) | @@ -34,27 +35,25 @@ USE FOR: analyze agent traces, search agent conversations, find failing traces, ## Before Starting — Resolve App Insights Connection -1. Check `.env` (or the same config file hosting other project variables) for `APPLICATIONINSIGHTS_CONNECTION_STRING` or `AZURE_APPINSIGHTS_RESOURCE_ID` -2. 
If not found, use `project_connection_list` (foundry-mcp tool) to discover App Insights linked to the Foundry project — this is the most reliable way to find the correct App Insights resource. Filter results for Application Insights connection type. -3. **IMMEDIATELY write back to `.env`** — as soon as `project_connection_list` returns App Insights info, write it to `.env` (or the same config file where `AZURE_AI_PROJECT_ENDPOINT` etc. live) BEFORE running any queries. Do not defer this step. This ensures future sessions skip discovery entirely. - -| Variable | Purpose | Example | -|----------|---------|---------| -| `APPLICATIONINSIGHTS_CONNECTION_STRING` | App Insights connection string | `InstrumentationKey=...;IngestionEndpoint=...` | -| `AZURE_APPINSIGHTS_RESOURCE_ID` | ARM resource ID | `/subscriptions/.../Microsoft.Insights/components/...` | - -If a `.env` file already exists, read it first and merge — do not overwrite existing values without confirmation. - -4. Confirm the App Insights resource with the user before querying +1. Resolve the target agent root and environment from `.foundry/agent-metadata.yaml`. +2. Check `environments.<env>.observability.applicationInsightsConnectionString` or `environments.<env>.observability.applicationInsightsResourceId` in the metadata. +3. If observability settings are missing, use `project_connection_list` to discover App Insights linked to the Foundry project, then persist the chosen resource back to `environments.<env>.observability` in `agent-metadata.yaml` before querying. +4. Confirm the selected App Insights resource and environment with the user before querying. 5. Use **`monitor_resource_log_query`** (Azure MCP tool) to execute KQL queries against the App Insights resource. This is preferred over delegating to the `azure-kusto` skill. Pass the App Insights resource ID and the KQL query directly.
-> ⚠️ **Always pass `subscription` explicitly** to Azure MCP tools like `monitor_resource_log_query` — they don't extract it from resource IDs. +| Metadata field | Purpose | Example | +|----------------|---------|---------| +| `environments.<env>.observability.applicationInsightsConnectionString` | App Insights connection string | `InstrumentationKey=...;IngestionEndpoint=...` | +| `environments.<env>.observability.applicationInsightsResourceId` | ARM resource ID | `/subscriptions/.../Microsoft.Insights/components/...` | + +> ⚠️ **Always pass `subscription` explicitly** to Azure MCP tools like `monitor_resource_log_query` - they do not extract it from resource IDs. ## Behavioral Rules -1. **ALWAYS display the KQL query.** Before executing ANY KQL query, display it in a code block. Never run a query silently. This is a hard requirement, not a suggestion. Showing queries builds trust and helps users learn KQL patterns. -2. **Start broad, then narrow.** Begin with conversation-level summaries, then drill into specific conversations or spans on user request. -3. **Use time ranges.** Always scope queries with a time range (default: last 24 hours). Ask user for the range if not specified. -4. **Explain GenAI attributes.** When displaying results, translate OTel attribute names to human-readable labels (e.g., `gen_ai.operation.name` → "Operation"). -5. **Link to conversation detail.** When showing search or failure results, offer to drill into any specific conversation. -6. **Scope to the target agent.** An App Insights resource may contain traces from multiple agents. For hosted agents, start from the `requests` table where `gen_ai.agent.name` holds the Foundry-level name, then join to `dependencies` via `operation_ParentId`. For prompt agents, filter `dependencies` directly by `gen_ai.agent.name`. When showing overview summaries, group by agent and warn the user if multiple agents are present. +1.
**Always display the KQL query.** Before executing any KQL query, display it in a code block. Never run a query silently. +2. **Keep environment visible.** Include the selected environment and agent name in each search summary and explain which metadata entry is being used. +3. **Start broad, then narrow.** Begin with conversation-level summaries, then drill into specific conversations or spans on user request. +4. **Use time ranges.** Always scope queries with a time range (default: last 24 hours). Ask the user for the range if not specified. +5. **Explain GenAI attributes.** When displaying results, translate OTel attribute names to human-readable labels (for example, `gen_ai.operation.name` -> "Operation"). +6. **Link to conversation detail.** When showing search or failure results, offer to drill into any specific conversation. +7. **Scope to the selected environment.** App Insights may contain traces from multiple agents or environments. Filter with the selected environment's agent name first, then add an environment tag filter if the telemetry emits one. diff --git a/plugin/skills/microsoft-foundry/project/connections.md b/plugin/skills/microsoft-foundry/project/connections.md index d4f78be6e..ede68e6d4 100644 --- a/plugin/skills/microsoft-foundry/project/connections.md +++ b/plugin/skills/microsoft-foundry/project/connections.md @@ -8,12 +8,14 @@ Use the Foundry MCP server for all connection operations. 
The MCP tools handle a | Operation | MCP Tool | Description | |-----------|----------|-------------| -| List all connections | `foundry_connections_list` | Lists all connections in the current project | -| Get connection details | `foundry_connections_get` | Retrieves a specific connection by name, including its ID | -| Create a connection | `foundry_connections_create` | Creates a new connection to an external resource | -| Delete a connection | `foundry_connections_delete` | Removes a connection from the project | +| List all connections | `project_connection_list` | Lists project connections and can filter by category or target | +| Get connection details | `project_connection_get` | Retrieves a specific connection by `connectionName` | +| Create a connection | `project_connection_create` | Creates or replaces a project connection to an external resource | +| Update a connection | `project_connection_update` | Updates auth, category, target, or expiry on an existing connection | +| Delete a connection | `project_connection_delete` | Removes a connection from the project by name | +| List supported categories/auth types | `project_connection_list_metadata` | Lists valid connection categories and auth types before create/update | -> 💡 **Tip:** The `connection_id` returned by `foundry_connections_get` is the value you pass as `project_connection_id` when configuring agent tools. +> 💡 **Tip:** Use `project_connection_get` or `project_connection_list` to resolve the connection name and full connection resource ID before configuring agent tools that require `project_connection_id`. ## Create Connection via Portal @@ -41,6 +43,7 @@ Python and C# SDKs resolve this automatically from the connection name. 
| `bing_custom_search` | Grounding with Bing Custom Search | Bing Custom Search tool | | `api_key` | Any API-key resource | MCP servers, custom tools | | `azure_openai` | Azure OpenAI | Model access | +| `AzureBlob` | Azure Blob Storage | Dataset upload via `evaluation_dataset_create` | ## RBAC for Connection Management @@ -53,6 +56,6 @@ Python and C# SDKs resolve this automatically from the connection name. | Error | Cause | Fix | |-------|-------|-----| -| `Connection not found` | Name mismatch or wrong project | Use `foundry_connections_list` to find correct name | +| `Connection not found` | Name mismatch or wrong project | Use `project_connection_list` to find the correct `connectionName` | | `Unauthorized` creating connection | Missing Azure AI Project Manager role | Assign role on the Foundry project | -| `Invalid connection ID format` | Using name instead of full resource ID | Use `foundry_connections_get` to resolve the full ID | +| `Invalid connection ID format` | Using name instead of full resource ID | Use `project_connection_get` to resolve the full ID | diff --git a/plugin/skills/microsoft-foundry/references/agent-metadata-contract.md b/plugin/skills/microsoft-foundry/references/agent-metadata-contract.md new file mode 100644 index 000000000..29a0c24bf --- /dev/null +++ b/plugin/skills/microsoft-foundry/references/agent-metadata-contract.md @@ -0,0 +1,104 @@ +# Agent Metadata Contract + +Use this contract for every agent source folder that participates in Microsoft Foundry workflows. + +## Required Local Layout + +```text +/ + .foundry/ + agent-metadata.yaml + datasets/ + evaluators/ + results/ +``` + +- `agent-metadata.yaml` is the required source of truth for environment-specific Foundry configuration. +- `datasets/` and `evaluators/` are local cache folders. Reuse existing files when they are current, and ask before refreshing or overwriting them. +- `results/` stores local evaluation outputs and comparison artifacts by environment. 
+ +## Environment Model + +| Field | Required | Purpose | +|-------|----------|---------| +| `defaultEnvironment` | ✅ | Environment used when the user does not choose one explicitly | +| `environments.<env>.projectEndpoint` | ✅ | Foundry project endpoint for that environment | +| `environments.<env>.agentName` | ✅ | Deployed Foundry agent name | +| `environments.<env>.azureContainerRegistry` | ✅ for hosted agents | ACR used for deployment and image refresh | +| `environments.<env>.observability.applicationInsightsResourceId` | Recommended | App Insights resource for trace workflows | +| `environments.<env>.observability.applicationInsightsConnectionString` | Optional | Connection string when needed for tooling | +| `environments.<env>.testCases[]` | ✅ | Dataset + evaluator + threshold bundles for evaluation workflows | + +## Example `agent-metadata.yaml` + +```yaml +defaultEnvironment: dev +environments: + dev: + projectEndpoint: https://contoso.services.ai.azure.com/api/projects/support-dev + agentName: support-agent-dev + azureContainerRegistry: contosoregistry.azurecr.io + observability: + applicationInsightsResourceId: /subscriptions/<subscription-id>/resourceGroups/<resource-group>/providers/Microsoft.Insights/components/support-dev-ai + testCases: + - id: smoke-core + priority: P0 + dataset: support-agent-dev-smoke-v1 + datasetFile: .foundry/datasets/support-agent-dev-smoke-v1.jsonl + evaluators: + - name: intent_resolution + threshold: 4 + - name: task_adherence + threshold: 4 + - name: citation_quality + threshold: 0.9 + definitionFile: .foundry/evaluators/citation-quality.yaml + - id: trace-regressions + priority: P1 + dataset: support-agent-dev-traces-v3 + datasetFile: .foundry/datasets/support-agent-dev-traces-v3.jsonl + evaluators: + - name: coherence + threshold: 4 + - name: groundedness + threshold: 4 + prod: + projectEndpoint: https://contoso.services.ai.azure.com/api/projects/support-prod + agentName: support-agent-prod + azureContainerRegistry: contosoregistry.azurecr.io + testCases: + - id:
production-guardrails + priority: P0 + dataset: support-agent-prod-guardrails-v2 + datasetFile: .foundry/datasets/support-agent-prod-guardrails-v2.jsonl + evaluators: + - name: violence + threshold: 1 + - name: self_harm + threshold: 1 +``` + +## Workflow Rules + +1. Auto-discover agent roots by searching for `.foundry/agent-metadata.yaml`. +2. If exactly one agent root is found, use it. If multiple roots are found, require the user to choose one. +3. Resolve environment in this order: explicit user choice, remembered session choice, `defaultEnvironment`. +4. Keep the selected agent root and environment visible in every deploy, eval, dataset, and trace summary. +5. Treat `datasets/` and `evaluators/` as cache folders. Reuse local files when present, but offer refresh when the user asks or when remote state is newer. +6. Never overwrite cache files or metadata silently. + +## Test-Case Guidance + +| Priority | Meaning | Typical Use | +|----------|---------|-------------| +| `P0` | Must-pass gate | Smoke checks, safety, deployment blockers | +| `P1` | High-value regression coverage | Production trace regressions, key business flows | +| `P2` | Broader quality coverage | Long-tail scenarios, exploratory quality checks | + +Each test case should point to one dataset and one or more evaluators with explicit thresholds. Use test-case IDs in evaluation names, result folders, and regression summaries so the flow remains traceable. + +## Sync Guidance + +- Pull/refresh when the user asks, when the workflow detects missing local cache, or when remote versions clearly differ from local metadata. +- Push/register updates after the user confirms local changes that should be shared in Foundry. +- Record remote dataset names, versions, and last sync timestamps in `.foundry/datasets/manifest.json` or the relevant metadata section. 
diff --git a/tests/microsoft-foundry/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/__snapshots__/triggers.test.ts.snap index c1c7dba00..21a5553ac 100644 --- a/tests/microsoft-foundry/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. 
USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "from", "functions", "general", - "host", 
"hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/foundry-agent/create/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/create/__snapshots__/triggers.test.ts.snap index 828e1abc7..58ac140ac 100644 --- a/tests/microsoft-foundry/foundry-agent/create/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/foundry-agent/create/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`create - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`create - Trigger Tests Trigger Keywords Snapshot skill description trig "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`create - Trigger Tests Trigger Keywords Snapshot skill description trig "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`create - Trigger Tests Trigger Keywords Snapshot skill description trig "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + 
"workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`create - Trigger Tests Trigger Keywords Snapshot skill keywords match s "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`create - Trigger Tests Trigger Keywords Snapshot skill keywords match s "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`create - Trigger Tests Trigger Keywords Snapshot skill keywords match s "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/foundry-agent/create/unit.test.ts b/tests/microsoft-foundry/foundry-agent/create/unit.test.ts index a86aa1cd3..805490207 100644 --- a/tests/microsoft-foundry/foundry-agent/create/unit.test.ts +++ b/tests/microsoft-foundry/foundry-agent/create/unit.test.ts @@ -77,5 +77,6 @@ describe("create - Unit Tests", () => { expect(createContent).toContain("agent.yaml"); expect(createContent).toContain("Dockerfile"); }); + }); }); diff --git a/tests/microsoft-foundry/foundry-agent/deploy/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/deploy/__snapshots__/triggers.test.ts.snap index 34ffd3642..58a9e4e52 100644 --- a/tests/microsoft-foundry/foundry-agent/deploy/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/foundry-agent/deploy/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`deploy - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent 
create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`deploy - Trigger Tests Trigger Keywords Snapshot skill description trig "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`deploy - Trigger Tests Trigger Keywords Snapshot skill description trig "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`deploy - Trigger Tests Trigger Keywords Snapshot skill description trig "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`deploy - Trigger Tests Trigger Keywords Snapshot skill keywords match s "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`deploy - Trigger Tests Trigger Keywords Snapshot skill keywords match s "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`deploy - Trigger Tests Trigger Keywords Snapshot skill keywords match s "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/foundry-agent/deploy/triggers.test.ts 
b/tests/microsoft-foundry/foundry-agent/deploy/triggers.test.ts index 53ceeea3a..7652c70a0 100644 --- a/tests/microsoft-foundry/foundry-agent/deploy/triggers.test.ts +++ b/tests/microsoft-foundry/foundry-agent/deploy/triggers.test.ts @@ -50,7 +50,7 @@ describe("deploy - Trigger Tests", () => { "Help me with AWS SageMaker", "How do I configure my PostgreSQL database?", "Explain how Kubernetes pods work", - "Set up monitoring for my web application", + "Set up logging for my web application", "Push my image to a registry", ]; diff --git a/tests/microsoft-foundry/foundry-agent/deploy/unit.test.ts b/tests/microsoft-foundry/foundry-agent/deploy/unit.test.ts index e42a4a95d..ace962575 100644 --- a/tests/microsoft-foundry/foundry-agent/deploy/unit.test.ts +++ b/tests/microsoft-foundry/foundry-agent/deploy/unit.test.ts @@ -107,13 +107,16 @@ describe("deploy - Unit Tests", () => { expect(deployContent).toContain("coherence"); }); - test("instructs identifying LLM-judge deployment", () => { + test("instructs identifying judge deployment from actual project deployments", () => { expect(deployContent).toContain("model_deployment_get"); + expect(deployContent).toMatch(/actual model deployments/i); + expect(deployContent).toMatch(/supports chat completions/i); + expect(deployContent).toMatch(/do\s+\*\*not\*\*\s+assume\s+`gpt-4o`\s+exists/i); }); - test("instructs persisting artifacts to evaluators/ and datasets/", () => { - expect(deployContent).toContain("evaluators/"); - expect(deployContent).toContain("datasets/"); + test("instructs persisting artifacts to .foundry/evaluators/ and .foundry/datasets/", () => { + expect(deployContent).toContain(".foundry/evaluators/"); + expect(deployContent).toContain(".foundry/datasets/"); }); test("asks to RUN evaluation (not just set up)", () => { @@ -129,11 +132,11 @@ describe("deploy - Unit Tests", () => { }); describe("Document Deployment Context", () => { - test("persists environment variables to .env", () => { - 
expect(deployContent).toContain("AZURE_AI_PROJECT_ENDPOINT"); - expect(deployContent).toContain("AZURE_AI_AGENT_NAME"); - expect(deployContent).toContain("AZURE_AI_AGENT_VERSION"); - expect(deployContent).toContain("AZURE_CONTAINER_REGISTRY"); + test("persists deployment context to agent-metadata.yaml", () => { + expect(deployContent).toContain("projectEndpoint"); + expect(deployContent).toContain("agentName"); + expect(deployContent).toContain("azureContainerRegistry"); + expect(deployContent).toContain("testCases[]"); }); }); }); diff --git a/tests/microsoft-foundry/foundry-agent/eval-datasets/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/eval-datasets/__snapshots__/triggers.test.ts.snap new file mode 100644 index 000000000..9a0e54d44 --- /dev/null +++ b/tests/microsoft-foundry/foundry-agent/eval-datasets/__snapshots__/triggers.test.ts.snap @@ -0,0 +1,149 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`eval-datasets - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` +{ + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "extractedKeywords": [ + "agent", + "agents", + "ai", + "assignment", + "availability", + "azure", + "azure-deploy", + "azure-prepare", + "batch", + "build", + "capacity", + "cli", + "cognitive", + "container", + "create", + "curation", + "customize", + "dataset", + "deploy", + "deployment", + "docker", + "end-to-end", + "eval", + "evaluate", + "failure", + "foundry", + "from", + "functions", + "general", + "hosted", + "improve", + "index", + "instructions", + "invoke", + "knowledge", + "manage", + "mcp", + "microsoft", + "model", + "monitor", + "monitoring", + "onboard", + "optimization", + "optimize", + "optimizer", + "permissions", + "prep", + "project", + "prompt", + "provision", + "push", + "quota", + "rbac", + "region", + "resource", + "role", + "service", + "services", + "start", + "system", + "traces", + "trending", + "troubleshoot", + "validation", + "versioning", + "workflows", + "yaml", + ], + "name": "microsoft-foundry", +} +`; + +exports[`eval-datasets - Trigger Tests Trigger Keywords Snapshot skill keywords match snapshot 1`] = ` +[ + "agent", + "agents", + "ai", + "assignment", + "availability", + "azure", + "azure-deploy", + "azure-prepare", + "batch", + "build", + "capacity", + "cli", + "cognitive", + "container", + "create", + "curation", + "customize", + "dataset", + "deploy", + "deployment", + "docker", + "end-to-end", + "eval", + "evaluate", + "failure", + "foundry", + "from", + "functions", + "general", + "hosted", + "improve", + "index", + "instructions", + "invoke", + "knowledge", + "manage", + "mcp", + "microsoft", + "model", + "monitor", + "monitoring", + "onboard", + "optimization", + "optimize", + "optimizer", + "permissions", + "prep", + "project", + "prompt", + "provision", + "push", + "quota", + "rbac", + "region", + "resource", + "role", + "service", + "services", + "start", + "system", + "traces", + 
"trending", + "troubleshoot", + "validation", + "versioning", + "workflows", + "yaml", +] +`; diff --git a/tests/microsoft-foundry/foundry-agent/eval-datasets/triggers.test.ts b/tests/microsoft-foundry/foundry-agent/eval-datasets/triggers.test.ts new file mode 100644 index 000000000..57d2e0e02 --- /dev/null +++ b/tests/microsoft-foundry/foundry-agent/eval-datasets/triggers.test.ts @@ -0,0 +1,62 @@ +/** + * Trigger Tests for eval-datasets + */ + +import { TriggerMatcher } from "../../../utils/trigger-matcher"; +import { loadSkill, LoadedSkill } from "../../../utils/skill-loader"; + +const SKILL_NAME = "microsoft-foundry"; + +describe("eval-datasets - Trigger Tests", () => { + let triggerMatcher: TriggerMatcher; + let skill: LoadedSkill; + + beforeAll(async () => { + skill = await loadSkill(SKILL_NAME); + triggerMatcher = new TriggerMatcher(skill); + }); + + describe("Should Trigger", () => { + const shouldTriggerPrompts: string[] = [ + "Create a dataset from my Foundry agent traces", + "Refresh my local Foundry dataset cache", + "Version my evaluation dataset for a Foundry agent", + "Detect regressions using my Foundry test datasets", + "Curate trace candidates into a dataset for Azure AI Foundry", + ]; + + test.each(shouldTriggerPrompts)('triggers on: "%s"', (prompt) => { + const result = triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(true); + expect(result.matchedKeywords.length).toBeGreaterThanOrEqual(2); + }); + }); + + describe("Should NOT Trigger", () => { + const shouldNotTriggerPrompts: string[] = [ + "What is the weather today?", + "Explain how Kubernetes pods work", + "Build me a React dashboard", + "Set up PostgreSQL backups", + ]; + + test.each(shouldNotTriggerPrompts)('does not trigger on: "%s"', (prompt) => { + const result = triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(false); + }); + }); + + describe("Trigger Keywords Snapshot", () => { + test("skill keywords match snapshot", () => { + 
expect(triggerMatcher.getKeywords()).toMatchSnapshot(); + }); + + test("skill description triggers match snapshot", () => { + expect({ + name: skill.metadata.name, + description: skill.metadata.description, + extractedKeywords: triggerMatcher.getKeywords() + }).toMatchSnapshot(); + }); + }); +}); diff --git a/tests/microsoft-foundry/foundry-agent/eval-datasets/unit.test.ts b/tests/microsoft-foundry/foundry-agent/eval-datasets/unit.test.ts new file mode 100644 index 000000000..b936a307e --- /dev/null +++ b/tests/microsoft-foundry/foundry-agent/eval-datasets/unit.test.ts @@ -0,0 +1,116 @@ +/** + * Unit Tests for eval-datasets + * + * Test isolated skill logic and validation rules. + */ + +import * as fs from "fs"; +import * as path from "path"; +import { fileURLToPath } from "url"; +import { loadSkill, LoadedSkill } from "../../../utils/skill-loader"; + +const SKILL_NAME = "microsoft-foundry"; +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); +const DATASETS_MD = path.resolve( + __dirname, + "../../../../plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md" +); +const REFERENCES_PATH = path.resolve( + __dirname, + "../../../../plugin/skills/microsoft-foundry/foundry-agent/eval-datasets/references" +); + +describe("eval-datasets - Unit Tests", () => { + let skill: LoadedSkill; + let datasetsContent: string; + + beforeAll(async () => { + skill = await loadSkill(SKILL_NAME); + datasetsContent = fs.readFileSync(DATASETS_MD, "utf-8"); + }); + + describe("Skill Metadata", () => { + test("has valid SKILL.md with required fields", () => { + expect(skill.metadata).toBeDefined(); + expect(skill.metadata.name).toBe("microsoft-foundry"); + expect(skill.metadata.description).toBeDefined(); + expect(skill.metadata.description.length).toBeGreaterThan(10); + }); + }); + + describe("Eval-Datasets Content Structure", () => { + test("has substantive content", () => { + expect(datasetsContent).toBeDefined(); + 
expect(datasetsContent.length).toBeGreaterThan(100); + }); + + test("contains expected sections", () => { + expect(datasetsContent).toContain("## Quick Reference"); + expect(datasetsContent).toContain("## Before Starting"); + expect(datasetsContent).toContain("## The Foundry Flywheel"); + expect(datasetsContent).toContain("## Behavioral Rules"); + }); + + test("documents .foundry cache and metadata", () => { + expect(datasetsContent).toContain(".foundry/agent-metadata.yaml"); + expect(datasetsContent).toContain(".foundry/datasets/"); + expect(datasetsContent).toContain(".foundry/results/"); + }); + + test("documents environment-aware versioning and cache reuse", () => { + expect(datasetsContent).toContain("---v"); + expect(datasetsContent).toMatch(/cache|refresh/i); + expect(datasetsContent).toContain("testCases[]"); + }); + + test("documents evalId versus evaluationId guidance", () => { + const comparisonContent = fs.readFileSync( + path.join(REFERENCES_PATH, "dataset-comparison.md"), + "utf-8" + ); + const trendingContent = fs.readFileSync( + path.join(REFERENCES_PATH, "eval-trending.md"), + "utf-8" + ); + + expect(datasetsContent).toContain("evaluationId"); + expect(datasetsContent).toContain("evalId"); + expect(comparisonContent).toMatch(/switch to `evalId`/i); + expect(trendingContent).toMatch(/evaluation_get expects `evalId`, not `evaluationId`/i); + }); + + test("documents eval group immutability for evaluator and threshold changes", () => { + const comparisonContent = fs.readFileSync( + path.join(REFERENCES_PATH, "dataset-comparison.md"), + "utf-8" + ); + const trendingContent = fs.readFileSync( + path.join(REFERENCES_PATH, "eval-trending.md"), + "utf-8" + ); + + expect(comparisonContent).toMatch(/create a new evaluation group/i); + expect(comparisonContent).toMatch(/thresholds/i); + expect(trendingContent).toMatch(/evaluator set and thresholds stayed fixed/i); + }); + }); + + describe("Reference Files Exist", () => { + const expectedFiles = [ + 
"trace-to-dataset.md", + "dataset-versioning.md", + "dataset-organization.md", + "dataset-curation.md", + "eval-trending.md", + "eval-regression.md", + "dataset-comparison.md", + "eval-lineage.md", + ]; + + test.each(expectedFiles)("has reference file: %s", (file) => { + const filePath = path.join(REFERENCES_PATH, file); + expect(fs.existsSync(filePath)).toBe(true); + }); + }); +}); diff --git a/tests/microsoft-foundry/foundry-agent/invoke/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/invoke/__snapshots__/triggers.test.ts.snap index 8d19d8b8b..9041dcbfc 100644 --- a/tests/microsoft-foundry/foundry-agent/invoke/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/foundry-agent/invoke/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`invoke - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`invoke - Trigger Tests Trigger Keywords Snapshot skill description trig "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`invoke - Trigger Tests Trigger Keywords Snapshot skill description trig "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`invoke - Trigger Tests Trigger Keywords Snapshot skill description trig "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + 
"workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`invoke - Trigger Tests Trigger Keywords Snapshot skill keywords match s "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`invoke - Trigger Tests Trigger Keywords Snapshot skill keywords match s "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`invoke - Trigger Tests Trigger Keywords Snapshot skill keywords match s "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/foundry-agent/observe/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/observe/__snapshots__/triggers.test.ts.snap index f13a047d1..7d8e04a16 100644 --- a/tests/microsoft-foundry/foundry-agent/observe/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/foundry-agent/observe/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`observe - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. 
USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`observe - Trigger Tests Trigger Keywords Snapshot skill description tri "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`observe - Trigger Tests Trigger Keywords Snapshot skill description tri "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`observe - Trigger Tests Trigger Keywords Snapshot skill description tri "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`observe - Trigger Tests Trigger Keywords Snapshot skill keywords match "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`observe - Trigger Tests Trigger Keywords Snapshot skill keywords match "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`observe - Trigger Tests Trigger Keywords Snapshot skill keywords match "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/foundry-agent/observe/unit.test.ts 
b/tests/microsoft-foundry/foundry-agent/observe/unit.test.ts index bc19d8a34..77fcfe58a 100644 --- a/tests/microsoft-foundry/foundry-agent/observe/unit.test.ts +++ b/tests/microsoft-foundry/foundry-agent/observe/unit.test.ts @@ -93,13 +93,14 @@ describe("observe - Unit Tests", () => { expect(observeContent).toMatch(/Agent just deployed|Set up evaluation/i); }); - test("routes evaluate intent through auto-setup when evaluators missing", () => { - expect(observeContent).toMatch(/evaluators\/.*empty|check.*evaluators/i); + test("routes evaluate intent through auto-setup when cache is missing or stale", () => { + expect(observeContent).toMatch(/cache is missing|stale|refresh|check.*evaluators/i); }); test("warns to check for existing evaluators before evaluation", () => { - expect(observeContent).toContain("evaluators/"); - expect(observeContent).toContain("datasets/"); + expect(observeContent).toContain(".foundry/agent-metadata.yaml"); + expect(observeContent).toContain(".foundry/evaluators/"); + expect(observeContent).toContain(".foundry/datasets/"); expect(observeContent).toMatch(/auto-setup|Auto-Setup/i); }); }); @@ -147,13 +148,17 @@ describe("observe - Unit Tests", () => { expect(setupContent).toContain("self_harm"); }); - test("includes LLM-judge deployment step", () => { + test("includes judge deployment step based on actual project deployments", () => { expect(setupContent).toContain("model_deployment_get"); + expect(setupContent).toMatch(/actual model deployments/i); + expect(setupContent).toMatch(/supports chat completions/i); + expect(setupContent).toMatch(/do\s+\*\*not\*\*\s+assume\s+`gpt-4o`\s+exists/i); }); test("includes artifact persistence structure", () => { - expect(setupContent).toContain("evaluators/"); - expect(setupContent).toContain("datasets/"); + expect(setupContent).toContain(".foundry/agent-metadata.yaml"); + expect(setupContent).toContain(".foundry/evaluators/"); + expect(setupContent).toContain(".foundry/datasets/"); 
expect(setupContent).toContain(".yaml"); expect(setupContent).toContain(".jsonl"); }); @@ -188,9 +193,52 @@ describe("observe - Unit Tests", () => { }); test("requires persisting eval artifacts", () => { - expect(observeContent).toContain("evaluators/"); - expect(observeContent).toContain("datasets/"); - expect(observeContent).toContain("results/"); + expect(observeContent).toContain(".foundry/evaluators/"); + expect(observeContent).toContain(".foundry/datasets/"); + expect(observeContent).toContain(".foundry/results/"); + expect(observeContent).toContain("P0"); + }); + + test("documents evalId versus evaluationId guardrail", () => { + const evaluateContent = fs.readFileSync( + path.join(REFERENCES_PATH, "evaluate-step.md"), + "utf-8" + ); + const compareContent = fs.readFileSync( + path.join(REFERENCES_PATH, "compare-iterate.md"), + "utf-8" + ); + + expect(evaluateContent).toContain("evaluationId"); + expect(evaluateContent).toContain("evalId"); + expect(evaluateContent).toMatch(/evaluation_get.*does\s+\*\*not\*\*\s+accept\s+`evaluationId`/i); + expect(compareContent).toMatch(/creation uses `evaluationId`.*`evaluation_get`.*`evalId`/i); + }); + + test("requires judge deployment lookup instead of assuming gpt-4o", () => { + const evaluateContent = fs.readFileSync( + path.join(REFERENCES_PATH, "evaluate-step.md"), + "utf-8" + ); + + expect(evaluateContent).toContain("model_deployment_get"); + expect(evaluateContent).toMatch(/supports chat completions/i); + expect(evaluateContent).toMatch(/do\s+\*\*not\*\*\s+assume\s+`gpt-4o`\s+exists/i); + }); + + test("documents eval group immutability for evaluators and thresholds", () => { + const evaluateContent = fs.readFileSync( + path.join(REFERENCES_PATH, "evaluate-step.md"), + "utf-8" + ); + const compareContent = fs.readFileSync( + path.join(REFERENCES_PATH, "compare-iterate.md"), + "utf-8" + ); + + expect(evaluateContent).toMatch(/new evaluation group/i); + expect(evaluateContent).toMatch(/thresholds/i); + 
expect(compareContent).toMatch(/reuse the same `evaluationId` only when `evaluatorNames` and thresholds are unchanged/i); }); }); }); diff --git a/tests/microsoft-foundry/foundry-agent/trace/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/trace/__snapshots__/triggers.test.ts.snap index 7d4b878ac..c223d4627 100644 --- a/tests/microsoft-foundry/foundry-agent/trace/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/foundry-agent/trace/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`trace - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. 
USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`trace - Trigger Tests Trigger Keywords Snapshot skill description trigg "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`trace - Trigger Tests Trigger Keywords Snapshot skill description trigg "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`trace - Trigger Tests Trigger Keywords Snapshot skill description trigg "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`trace - Trigger Tests Trigger Keywords Snapshot skill keywords match sn "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`trace - Trigger Tests Trigger Keywords Snapshot skill keywords match sn "from", "functions", "general", - "host", 
"hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`trace - Trigger Tests Trigger Keywords Snapshot skill keywords match sn "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/foundry-agent/troubleshoot/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/troubleshoot/__snapshots__/triggers.test.ts.snap index 684bc4c83..888b829d4 100644 --- a/tests/microsoft-foundry/foundry-agent/troubleshoot/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/foundry-agent/troubleshoot/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`troubleshoot - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`troubleshoot - Trigger Tests Trigger Keywords Snapshot skill descriptio "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`troubleshoot - Trigger Tests Trigger Keywords Snapshot skill descriptio "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`troubleshoot - Trigger Tests Trigger Keywords Snapshot skill descriptio "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + 
"workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`troubleshoot - Trigger Tests Trigger Keywords Snapshot skill keywords m "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`troubleshoot - Trigger Tests Trigger Keywords Snapshot skill keywords m "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`troubleshoot - Trigger Tests Trigger Keywords Snapshot skill keywords m "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/integration.test.ts b/tests/microsoft-foundry/integration.test.ts index 8eef8ff00..40c81c355 100644 --- a/tests/microsoft-foundry/integration.test.ts +++ b/tests/microsoft-foundry/integration.test.ts @@ -196,6 +196,42 @@ describeIntegration(`${SKILL_NAME}_ - Integration Tests`, () => { } } }); + + test("invokes microsoft-foundry skill for trace-to-dataset prompt", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Create an evaluation dataset from my Foundry agent traces" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); + + test("invokes microsoft-foundry skill for dataset versioning prompt", async () => { + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await agent.run({ + prompt: "Version my Foundry evaluation dataset and compare 
regressions" + }); + + softCheckSkill(agentMetadata, SKILL_NAME); + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + }); }); }); diff --git a/tests/microsoft-foundry/models/deploy/capacity/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/models/deploy/capacity/__snapshots__/triggers.test.ts.snap index 8d6ed1ba3..f797bf619 100644 --- a/tests/microsoft-foundry/models/deploy/capacity/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/models/deploy/capacity/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`capacity - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. 
USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`capacity - Trigger Tests Trigger Keywords Snapshot skill description tr "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`capacity - Trigger Tests Trigger Keywords Snapshot skill description tr "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`capacity - Trigger Tests Trigger Keywords Snapshot skill description tr "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`capacity - Trigger Tests Trigger Keywords Snapshot skill keywords match "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`capacity - Trigger Tests Trigger Keywords Snapshot skill keywords match "from", "functions", "general", - "host", 
"hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`capacity - Trigger Tests Trigger Keywords Snapshot skill keywords match "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/models/deploy/customize-deployment/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/models/deploy/customize-deployment/__snapshots__/triggers.test.ts.snap index c1c7dba00..21a5553ac 100644 --- a/tests/microsoft-foundry/models/deploy/customize-deployment/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/models/deploy/customize-deployment/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + 
"workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/models/deploy/deploy-model-optimal-region/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/models/deploy/deploy-model-optimal-region/__snapshots__/triggers.test.ts.snap index c1c7dba00..21a5553ac 100644 --- a/tests/microsoft-foundry/models/deploy/deploy-model-optimal-region/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/models/deploy/deploy-model-optimal-region/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. 
USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git 
a/tests/microsoft-foundry/models/deploy/deploy-model/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/models/deploy/deploy-model/__snapshots__/triggers.test.ts.snap index c1c7dba00..21a5553ac 100644 --- a/tests/microsoft-foundry/models/deploy/deploy-model/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/models/deploy/deploy-model/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. 
USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill descr "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ], "name": "microsoft-foundry", @@ -88,7 +88,6 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -108,20 +107,22 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "from", "functions", "general", - "host", 
"hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -135,15 +136,14 @@ exports[`microsoft-foundry - Trigger Tests Trigger Keywords Snapshot skill keywo "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/resource/create/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/resource/create/__snapshots__/triggers.test.ts.snap index 5decccce6..998abee1a 100644 --- a/tests/microsoft-foundry/resource/create/__snapshots__/triggers.test.ts.snap +++ b/tests/microsoft-foundry/resource/create/__snapshots__/triggers.test.ts.snap @@ -2,7 +2,7 @@ exports[`microsoft-foundry:resource/create - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` { - "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability, standard agent setup, capability host. 
DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", + "description": "Deploy, evaluate, and manage Foundry agents end-to-end: Docker build, ACR push, hosted/prompt agent create, container start, batch eval, prompt optimization, prompt optimizer workflows, agent.yaml, dataset curation from traces. USE FOR: deploy agent to Foundry, hosted agent, create agent, invoke agent, evaluate agent, run batch eval, optimize prompt, improve prompt, prompt optimization, prompt optimizer, improve agent instructions, optimize agent instructions, optimize system prompt, deploy model, Foundry project, RBAC, role assignment, permissions, quota, capacity, region, troubleshoot agent, deployment failure, create dataset from traces, dataset versioning, eval trending, create AI Services, Cognitive Services, create Foundry resource, provision resource, knowledge index, agent monitoring, customize deployment, onboard, availability. DO NOT USE FOR: Azure Functions, App Service, general Azure deploy (use azure-deploy), general Azure prep (use azure-prepare).", "extractedKeywords": [ "agent", "agents", @@ -14,7 +14,6 @@ exports[`microsoft-foundry:resource/create - Trigger Tests Trigger Keywords Snap "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -34,20 +33,22 @@ exports[`microsoft-foundry:resource/create - Trigger Tests Trigger Keywords Snap "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -61,15 +62,14 @@ exports[`microsoft-foundry:resource/create - Trigger Tests Trigger Keywords Snap "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + 
"workflows", "yaml", ], } @@ -87,7 +87,6 @@ exports[`microsoft-foundry:resource/create - Trigger Tests Trigger Keywords Snap "azure-prepare", "batch", "build", - "capability", "capacity", "cli", "cognitive", @@ -107,20 +106,22 @@ exports[`microsoft-foundry:resource/create - Trigger Tests Trigger Keywords Snap "from", "functions", "general", - "host", "hosted", - "identity", + "improve", "index", + "instructions", "invoke", "knowledge", "manage", "mcp", "microsoft", "model", + "monitor", "monitoring", "onboard", "optimization", "optimize", + "optimizer", "permissions", "prep", "project", @@ -134,15 +135,14 @@ exports[`microsoft-foundry:resource/create - Trigger Tests Trigger Keywords Snap "role", "service", "services", - "setup", - "standard", "start", - "storage", + "system", "traces", "trending", "troubleshoot", "validation", "versioning", + "workflows", "yaml", ] `; diff --git a/tests/microsoft-foundry/triggers.test.ts b/tests/microsoft-foundry/triggers.test.ts index 4303da68b..b5b45c050 100644 --- a/tests/microsoft-foundry/triggers.test.ts +++ b/tests/microsoft-foundry/triggers.test.ts @@ -26,6 +26,9 @@ describe(`${SKILL_NAME} - Trigger Tests`, () => { "Build a RAG application with Azure AI Foundry knowledge index", "Create an AI agent in Microsoft Foundry with web search", "Evaluate agent performance using Foundry evaluators", + "Optimize my prompt for a Microsoft Foundry agent", + "Improve my agent instructions in Azure AI Foundry", + "Use a prompt optimizer on my Foundry system prompt", "Set up agent monitoring and continuous evaluation in Foundry", "Help me with Microsoft Foundry model deployment", "How to use knowledge index for RAG in Azure AI Foundry?", diff --git a/tests/microsoft-foundry/unit.test.ts b/tests/microsoft-foundry/unit.test.ts index 0c712dcd0..fd3640688 100644 --- a/tests/microsoft-foundry/unit.test.ts +++ b/tests/microsoft-foundry/unit.test.ts @@ -47,18 +47,27 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { }); test("contains expected 
sections", () => { - expect(skill.content).toContain("## Agent Lifecycle"); + expect(skill.content).toContain("## Agent Development Lifecycle"); expect(skill.content).toContain("## Sub-Skills"); - expect(skill.content).toContain("## Project Context Resolution"); - expect(skill.content).toContain("## Agent Types"); + expect(skill.content).toContain("## Agent: Project Context Resolution"); + expect(skill.content).toContain("## Agent: Agent Types"); }); test("contains agent routing references", () => { expect(skill.content).toContain("deploy"); expect(skill.content).toContain("invoke"); + expect(skill.content).toContain("observe"); expect(skill.content).toContain("troubleshoot"); }); + test("description includes prompt optimization routing keywords", () => { + const description = skill.metadata.description; + expect(description).toContain("improve prompt"); + expect(description).toContain("prompt optimizer"); + expect(description).toContain("improve agent instructions"); + expect(description).toContain("optimize system prompt"); + }); + test("contains common project context resolution", () => { expect(skill.content).toContain("azure.yaml"); expect(skill.content).toContain("azd env get-values"); @@ -68,6 +77,13 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { expect(skill.content).toContain("AZURE_AI_PROJECT_ENDPOINT"); expect(skill.content).toContain("AZURE_CONTAINER_REGISTRY_NAME"); }); + + test("documents .foundry workspace standard", () => { + expect(skill.content).toContain(".foundry/agent-metadata.yaml"); + expect(skill.content).toContain("defaultEnvironment"); + expect(skill.content).toContain("Agent Metadata Contract"); + }); + }); describe("Sub-Skills Reference", () => { @@ -78,9 +94,16 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { test("references agent sub-skills in table", () => { expect(skill.content).toContain("foundry-agent/deploy/deploy.md"); expect(skill.content).toContain("foundry-agent/invoke/invoke.md"); + 
expect(skill.content).toContain("foundry-agent/observe/observe.md"); expect(skill.content).toContain("foundry-agent/troubleshoot/troubleshoot.md"); }); + test("observe sub-skill row routes prompt optimization scenarios", () => { + expect(skill.content).toMatch(/observe.*optimize prompts/i); + expect(skill.content).toMatch(/observe.*improve agent instructions/i); + expect(skill.content).toMatch(/observe.*CI\/CD monitoring/i); + }); + test("references quota sub-skill", () => { expect(skill.content).toContain("quota"); expect(skill.content).toContain("quota/quota.md"); @@ -143,6 +166,21 @@ describe(`${SKILL_NAME} - Unit Tests`, () => { }); }); + describe("Agent Development Lifecycle Routing", () => { + test("routes prompt optimization intents to observe", () => { + expect(skill.content).toContain( + "Optimize / improve agent prompt or instructions" + ); + expect(skill.content).toContain("observe (Step 4: Optimize)"); + expect(skill.content).toContain("Evaluate and optimize agent (full loop)"); + }); + + test("mentions prompt_optimize at the top level", () => { + expect(skill.content).toContain("prompt_optimize"); + expect(skill.content).toMatch(/Prompt Optimization:/i); + }); + }); + describe("RBAC Sub-Skill Content", () => { let rbacContent: string;