diff --git a/plugin/skills/microsoft-foundry/SKILL.md b/plugin/skills/microsoft-foundry/SKILL.md index cc7dd68ad..29024a3e8 100644 --- a/plugin/skills/microsoft-foundry/SKILL.md +++ b/plugin/skills/microsoft-foundry/SKILL.md @@ -19,6 +19,7 @@ This skill includes specialized sub-skills for specific workflows. **Use these i | **invoke** | Send messages to an agent, single or multi-turn conversations | [invoke](foundry-agent/invoke/invoke.md) | | **troubleshoot** | View container logs, query telemetry, diagnose failures | [troubleshoot](foundry-agent/troubleshoot/troubleshoot.md) | | **create** | Create new hosted agent applications. Supports Microsoft Agent Framework, LangGraph, or custom frameworks in Python or C#. Downloads starter samples from foundry-samples repo. | [create](foundry-agent/create/create.md) | +| **evaluate** | Evaluate AI agents built with Microsoft Agent Framework using pytest-agent-evals plugin. Supports built-in and custom evaluators with VS Code Test Explorer integration. | [evaluate](foundry-agent/evaluate/agent-framework/SKILL.md) | | **project/create** | Creating a new Azure AI Foundry project for hosting agents and models. Use when onboarding to Foundry or setting up new infrastructure. | [project/create/create-foundry-project.md](project/create/create-foundry-project.md) | | **resource/create** | Creating Azure AI Services multi-service resource (Foundry resource) using Azure CLI. Use when manually provisioning AI Services resources with granular control. | [resource/create/create-foundry-resource.md](resource/create/create-foundry-resource.md) | | **models/deploy-model** | Unified model deployment with intelligent routing. Handles quick preset deployments, fully customized deployments (version/SKU/capacity/RAI), and capacity discovery across regions. Routes to sub-skills: `preset` (quick deploy), `customize` (full control), `capacity` (find availability). 
| [models/deploy-model/SKILL.md](models/deploy-model/SKILL.md) | @@ -40,6 +41,7 @@ Match user intent to the correct workflow. Read each sub-skill in order before e | Update/redeploy an agent after code changes | deploy → invoke | | Invoke/test/chat with an agent | invoke | | Troubleshoot an agent issue | invoke → troubleshoot | +| Evaluate agent performance / add evaluators | evaluate | | Fix a broken agent (troubleshoot + redeploy) | invoke → troubleshoot → apply fixes → deploy → invoke | | Start/stop agent container | deploy | diff --git a/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/SKILL.md b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/SKILL.md new file mode 100644 index 000000000..c24ec7df7 --- /dev/null +++ b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/SKILL.md @@ -0,0 +1,237 @@ +--- +name: agent-framework +description: | + Evaluate AI agents and workflows built with Microsoft Agent Framework using pytest-agent-evals plugin. Supports built-in and custom evaluators with VS Code Test Explorer integration. + USE FOR: evaluate agent, test agent, assess agent, agent evaluation, pytest evaluation, measure agent performance, agent quality, add evaluator, evaluation dataset, judge model. + DO NOT USE FOR: creating agents (use agent/create), deploying agents (use agent/deploy), evaluating non-Agent-Framework agents. +--- + +# Evaluate Agent with pytest-agent-evals + +Assess agent performance using the `pytest-agent-evals` plugin — a pytest-based evaluation framework integrated with VS Code Test Explorer and AI Toolkit. + +> **Prerequisite**: The agent under test MUST be built with Microsoft Agent Framework (`agent_framework.ChatAgent` or `agent_framework.WorkflowAgent`). If not, inform user and skip evaluation. 
+ +## Quick Reference + +| Property | Value | +|----------|-------| +| **Plugin** | pytest-agent-evals (>=0.0.1b260210) | +| **Framework** | pytest | +| **Agent SDK** | Microsoft Agent Framework (ChatAgent, WorkflowAgent) | +| **Integration** | VS Code Test Explorer, AI Toolkit | +| **Best For** | Systematic agent evaluation with built-in and custom evaluators | + +## When to Use This Skill + +Use when the user wants to: + +- **Evaluate** or test an existing agent's performance +- **Set up** systematic evaluation with metrics +- **Add** built-in or custom evaluators +- **Generate** evaluation test dataset +- **Configure** judge model for prompt-based evaluation + +## Defaults + +- **Plugin**: pytest-agent-evals (pin version `>=0.0.1b260210`) +- **Environment**: Reuse agent's existing virtual environment + +## MCP Tools + +This skill delegates to `microsoft-foundry` MCP tools for model and project operations: + +| Tool | Purpose | +|------|---------| +| `foundry_models_list` | Browse model catalog for judge model selection | +| `foundry_models_deployments_list` | List deployed models for judge model | +| `foundry_resource_get` | Get project endpoint | + +## References + +| Topic | File | Description | +|-------|------|-------------| +| Code Example | [references/code-example.md](references/code-example.md) | Complete evaluation code example, key concepts, code generation guidelines | +| Built-in Evaluators | [references/built-in-evaluators.md](references/built-in-evaluators.md) | Agent, general purpose, RAG, and similarity evaluators catalog | +| Custom Evaluators | [references/custom-evaluators.md](references/custom-evaluators.md) | Custom prompt (LLM judge) and code (Python function) evaluators | + +## Evaluation Workflow + +```markdown +Evaluation Setup: +- [ ] Clarify evaluation metrics +- [ ] Obtain test dataset (file or generate) +- [ ] Resolve judge model (if needed) +- [ ] Generate evaluation code +- [ ] Set up project configuration +- [ ] Install 
dependencies +- [ ] Verify & Handoff +``` + +### Step 1: Clarify Metrics + +Analyze user's app and suggest 1-3 relevant metrics. Metrics can be **built-in** (see [references/built-in-evaluators.md](references/built-in-evaluators.md)) or **custom**. Always state the metric type. + +``` +Based on your [agent type], I recommend: +1. [Metric name] — [brief description] (built-in | custom, prompt-based | code-based) +2. [Metric name] — [brief description] (built-in | custom, prompt-based | code-based) + +Should I proceed with these metrics? +``` + +**Example** — math solver agent, general request "set up evaluation": +``` +1. correctness — validates if the agent answer matches the ground truth (custom, code-based) +2. tool_call_accuracy — assesses tool usage relevance and parameter correctness (built-in, prompt-based) +``` + +**Guidelines:** +- If user specifies objectives (e.g., "evaluate tool accuracy") → suggest only relevant metrics +- If general request ("evaluate my agent") → suggest max 3 most important +- Match metric count to explicitly mentioned objectives +- Prefer built-in evaluators when they fit; use custom when no built-in covers the need + +### Step 2: Obtain Test Dataset + +``` +How would you like to provide test queries? +1. Point to existing JSONL file +2. Let me generate sample queries +``` + +**Dataset format** — JSONL with required `query` field and optional `id`: +```jsonl +{"id": "weather_ny", "query": "What's the weather in New York?"} +{"id": "time_utc", "query": "What's the current UTC time?"} +``` + +If generating, create 5-10 realistic queries and save to `_dataset.jsonl`. + +### Step 3: Resolve Judge Model (if needed) + +A judge model is only required when the selected metrics include **prompt-based** evaluators (see Type column in [Built-in Evaluators](#built-in-evaluators)) or Custom Prompt Evaluators. If all metrics are code-based or Custom Code Evaluators only, skip this step. 
+ +**Default (silent)**: Use the agent's own Foundry model deployment (`FOUNDRY_MODEL_DEPLOYMENT_NAME`). Do NOT ask the user. + +**Unsupported models** — cannot produce structured evaluation output: +- **Reasoning models**: DeepSeek R-series, OpenAI o-series (e.g., `o1`, `o3`, `o4-mini`) +- **OpenAI gpt-5 series** (excluding `gpt-5-chat` variants which ARE supported) + +**If the agent's model is unsupported**: Use `microsoft-foundry` skill's model catalog to help the user select and deploy a suitable judge model. + +### Step 4: Confirm and Generate + +``` +Evaluation Plan: +- Metrics: [list] +- Dataset: [source] +- Agent: [agent fixture/file] +- Judge Model: [deployment name] (auto-selected / user-selected) + +Proceed to generate evaluation code? +``` + +### Step 5: Generate Evaluation Code + +Use the `pytest-agent-evals` plugin to write test-suite-style evaluation code. See [references/code-example.md](references/code-example.md) for the complete example, key concepts, and code generation guidelines. If the selected metrics include custom evaluators, see [references/custom-evaluators.md](references/custom-evaluators.md) for prompt-based and code-based patterns. + +### Step 6: Set Up Project + +#### Install Dependencies + +```bash +pip install pytest-agent-evals>=0.0.1b260210 --pre +``` + +#### .vscode/settings.json + +Create or update `.vscode/settings.json` to enable VS Code Test Explorer integration: + +```jsonc +{ + "python.testing.pytestArgs": [ + ".", + "--cache-mode=session" + ], + "python.testing.pytestEnabled": true +} +``` + +> `--cache-mode=session` clears response cache at startup for consistency. Use `--cache-mode=persistence` to preserve cache across sessions for rapid evaluator tuning. + +#### Verify + +If the required model environment variables are not yet configured (e.g., `.env` values are empty or placeholders), skip verification and inform the user to set the environment variables before running tests. 
+
+Run **all evaluators with a single test case** to verify the setup and all evaluators work well without waiting for the full dataset:
+
+```bash
+pytest test_<agent_name>.py -k "<test_case_id>" -v
+```
+
+For example, if the test class is `TestWeatherAgent` and the first dataset entry has id `weather_ny`:
+```bash
+pytest test_weather_agent.py -k "weather_ny" -v
+```
+
+After the test runs successfully, the plugin saves results to `test-results/evaluation_results_<timestamp>.json`. The result file schema:
+
+```json
+{
+  "rows": [
+    {
+      "inputs.query": "user query",
+      "outputs.response": "agent response",
+      "outputs.tool_calls": "tool calls made by agent",
+      "outputs.tool_definitions": "tool definitions available to agent",
+      "outputs.<evaluator_name>.score": 5,
+      "outputs.<evaluator_name>.reason": "explanation of score",
+      "outputs.<evaluator_name>.result": "pass"
+    }
+  ]
+}
+```
+
+Each row is a test case result containing the query, agent response, and all evaluators' scores/reasons/results. Read the latest result file, analyze it briefly, and report to the user:
+- Whether the test passed or failed
+- The evaluator score and reason (if available)
+- Any issues to address
+
+If the test fails due to code/setup errors, fix and rerun.
+
+> For the full evaluation, the user can run `pytest test_<agent_name>.py -v` or use VS Code Test Explorer (recommended) — tests appear in the sidebar and results integrate with AI Toolkit.
+
+### Step 7: Generate Evaluation Documentation
+
+Create an `evaluation.md` in the project root with the following sections:
+
+**Setup**: Environment variables needed in `.env` (with placeholder values), install command (`pip install pytest-agent-evals>=0.0.1b260210 --pre`), VS Code settings for Test Explorer.
+
+**Run Evaluations**: VS Code Test Explorer (recommended) — Open Testing panel (flask icon) → click ▶️ to run all or individual tests. Terminal — `pytest test_<agent_name>.py -v`.
+
+**View Results**: Results saved to `test-results/evaluation_results_<timestamp>.json` after each run.
Open in **AI Toolkit** panel → **Local Evaluation Results** to browse. + +**Update Test Dataset**: Open JSONL dataset file → click **"Generate Test Cases with Copilot"** CodeLens. + +**Update Custom Evaluators**: Click **"+ Add Custom Evaluator with Copilot"** CodeLens above test class, or **"Update Custom Evaluator with Copilot"** above a test method. + +After generating the documentation, inform the user they can follow `evaluation.md` to run full evaluations, update the test dataset, and add or update custom evaluators. + +## Built-in Evaluators + +See [references/built-in-evaluators.md](references/built-in-evaluators.md) for the full catalog of agent, general purpose, RAG, and similarity evaluators. + +## Custom Evaluators + +See [references/custom-evaluators.md](references/custom-evaluators.md) for custom prompt (LLM judge) and code (Python function) evaluator patterns. + +## Error Handling + +| Error | Cause | Resolution | +|-------|-------|------------| +| Agent not Agent Framework | Wrong SDK | Inform user; evaluation requires Agent Framework | +| Judge model unsupported | Reasoning/non-chat model | Use `microsoft-foundry` skill to select a supported model | +| Missing env vars | `.env` not configured | Set `FOUNDRY_PROJECT_ENDPOINT`, `FOUNDRY_MODEL_DEPLOYMENT_NAME`, `AZURE_OPENAI_ENDPOINT` | +| pytest not found | Plugin not installed | `pip install pytest-agent-evals>=0.0.1b260210 --pre` | +| Import error | Agent not importable | Refactor agent creation into importable function | diff --git a/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/built-in-evaluators.md b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/built-in-evaluators.md new file mode 100644 index 000000000..cdf0b398a --- /dev/null +++ b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/built-in-evaluators.md @@ -0,0 +1,42 @@ +# Built-in Evaluators + +Use `BuiltInEvaluatorConfig(name, 
threshold)` in `@evals.evaluator(...)`. + +## Agent Evaluators + +| Evaluator | Type | Description | +|-----------|------|-------------| +| `task_adherence` | Prompt-based | Assesses how well an AI-generated response follows the assigned task based on alignment with instructions and definitions, accuracy and clarity of the response, and proper use of provided tool definitions. | +| `intent_resolution` | Prompt-based | Assesses whether the user intent was correctly identified and resolved. | +| `tool_call_accuracy` | Prompt-based | Assesses how accurately an AI uses tools by examining relevance to the conversation, parameter correctness according to tool definitions, and parameter value extraction from the conversation. | +| `tool_selection` | Prompt-based | Evaluates whether an AI agent selected the most appropriate and efficient tools for a given task, avoiding redundancy or missing essentials. | +| `task_completion` | Prompt-based | Evaluates whether an AI agent successfully completed the requested task end to end by analyzing the conversation history and agent response to determine if all task requirements were met. | +| `tool_call_success` | Prompt-based | Evaluates whether all tool calls were successful or not. Checks all tool calls to determine if any resulted in technical failure like exception, error, or timeout. | +| `tool_input_accuracy` | Prompt-based | Checks whether all parameters in an agent's tool call are correct, validating grounding, type, format, completeness, and contextual appropriateness. | +| `tool_output_utilization` | Prompt-based | Checks if an agent correctly interprets and contextually uses the outputs returned by invoked tools without fabrication or omission. | + +## General Purpose + +| Evaluator | Type | Description | +|-----------|------|-------------| +| `coherence` | Prompt-based | Assesses the ability of the language model to generate text that reads naturally, flows smoothly, and resembles human-like language. 
| +| `fluency` | Prompt-based | Assesses the extent to which the generated text conforms to grammatical rules, syntactic structures, and appropriate vocabulary usage. | +| `relevance` | Prompt-based | Assesses the ability of answers to capture the key points of the context and produce coherent and contextually appropriate outputs. | + +## RAG Evaluators + +| Evaluator | Type | Description | Dataset Fields | +|-----------|------|-------------|----------------| +| `groundedness` | Prompt-based | Assesses the correspondence between claims in an AI-generated answer and the source context, making sure that these claims are substantiated by the context. | `context` | +| `retrieval` | Prompt-based | Assesses the AI system's performance in retrieving information for additional context (e.g. a RAG scenario). | `context` | +| `response_completeness` | Prompt-based | Assesses how thoroughly an AI model's generated response aligns with the key information, claims, and statements established in the ground truth. | `ground_truth` | + +## Similarity Evaluators + +| Evaluator | Type | Description | Dataset Fields | +|-----------|------|-------------|----------------| +| `similarity` | Code-based | Evaluates the likeness between a ground truth sentence and the AI model's generated prediction using sentence-level embeddings. | `ground_truth` | +| `f1_score` | Code-based | Calculates F1 score. | `ground_truth` | +| `bleu_score` | Code-based | Calculates BLEU score. | `ground_truth` | + +> **Dataset Fields**: Additional columns required in the JSONL dataset beyond `query`. The plugin auto-extracts `response`, `tool_calls`, and `tool_definitions` from the agent. 
diff --git a/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/code-example.md b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/code-example.md new file mode 100644 index 000000000..75a7f3608 --- /dev/null +++ b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/code-example.md @@ -0,0 +1,64 @@ +# Evaluation Code Example + +## Complete Example + +```python +import os +import pytest +from pytest_agent_evals import ( + evals, + EvaluatorResults, + AzureOpenAIModelConfig, + ChatAgentConfig, + BuiltInEvaluatorConfig, +) +from dotenv import load_dotenv +from my_agent import create_my_agent # Import the agent from user's source code + +load_dotenv() +project_endpoint = os.getenv("FOUNDRY_PROJECT_ENDPOINT") +model_deployment = os.getenv("FOUNDRY_MODEL_DEPLOYMENT_NAME") +# Azure OpenAI endpoint derived from the Foundry resource +# Format: https://.openai.azure.com/ +azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + +# --- Agent Fixture --- +# The plugin calls this fixture to get the agent instance for testing. +@pytest.fixture +def my_agent(): + return create_my_agent(project_endpoint, model_deployment) + +# --- Evaluation Suite --- +# Judge model uses Azure OpenAI endpoint format (not Foundry project endpoint). +@evals.dataset("my_agent_dataset.jsonl") +@evals.judge_model(AzureOpenAIModelConfig(deployment_name=model_deployment, endpoint=azure_openai_endpoint)) +@evals.agent(ChatAgentConfig(my_agent)) +class TestMyAgent: + + @evals.evaluator(BuiltInEvaluatorConfig("task_adherence")) + def test_task_adherence(self, evaluator_results: EvaluatorResults): + assert evaluator_results.task_adherence.result == "pass" + + @evals.evaluator(BuiltInEvaluatorConfig("relevance")) + def test_relevance(self, evaluator_results: EvaluatorResults): + assert evaluator_results.relevance.result == "pass" +``` + +## Key Concepts + +1. 
**Agent Fixture**: A `@pytest.fixture` that returns an initialized `ChatAgent` or `WorkflowAgent` instance. Referenced via `ChatAgentConfig(agent_fixture)`.
+2. **Dataset**: `@evals.dataset("file.jsonl")` — JSONL file with `query` (required) and `id` (optional) fields.
+3. **Judge Model**: `@evals.judge_model(AzureOpenAIModelConfig(...))` — LLM used for AI-assisted (prompt-based) evaluation. The endpoint must be in Azure OpenAI format (`https://<resource>.openai.azure.com/`), not Foundry project endpoint format.
+4. **Evaluators**: `@evals.evaluator(...)` on each test method — registers an evaluator that runs against the agent's response.
+5. **Results**: Access via `evaluator_results.<evaluator_name>.result` (returns `"pass"` or `"fail"`) and `evaluator_results.<evaluator_name>.score`.
+
+## Code Generation Guidelines
+
+- **File naming**: `test_<agent_name>.py`
+- **Class naming**: `Test<AgentName>` (must start with `Test`)
+- **Test naming**: `test_<metric_name>` (must start with `test_`)
+- **One evaluator per test method** — each `@evals.evaluator` maps to one test
+- **Import the agent** from the user's source code and wrap in a `@pytest.fixture`. Always reuse the existing agent definition to keep a single source of truth — if the agent is updated, the evaluation automatically tests the updated version. If the agent cannot be directly imported (e.g., it's created inline in a `main()` function), perform a simple refactor: extract the agent creation into a standalone function or module-level variable in the user's source code, then import it in the test file
+- **Judge model endpoint**: Must use Azure OpenAI endpoint format (`https://<resource>.openai.azure.com/`). The Foundry project endpoint (`https://<resource>.services.ai.azure.com/api/projects/<project>`) will NOT work for the judge model. Add `AZURE_OPENAI_ENDPOINT` to `.env`.
+- **Judge model selection**: By default, reuse the agent's model deployment silently.
Only if the model is unsupported (reasoning models, gpt-5 non-chat), use a separate env var (e.g., `JUDGE_MODEL_DEPLOYMENT_NAME`) and use `microsoft-foundry` skill's model catalog to select a supported model.
+- Ensure `.env` contains `FOUNDRY_PROJECT_ENDPOINT`, `FOUNDRY_MODEL_DEPLOYMENT_NAME` (reuse from agent creation), and `AZURE_OPENAI_ENDPOINT` (for judge model)
diff --git a/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/custom-evaluators.md b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/custom-evaluators.md
new file mode 100644
index 000000000..c11decaa9
--- /dev/null
+++ b/plugin/skills/microsoft-foundry/foundry-agent/evaluate/agent-framework/references/custom-evaluators.md
@@ -0,0 +1,64 @@
+# Custom Evaluators
+
+For metrics not covered by built-in evaluators, define custom evaluators.
+
+## Custom Prompt Evaluator (LLM Judge)
+
+Use `CustomPromptEvaluatorConfig` to define an LLM-based evaluator with a Jinja2 prompt template.
+
+```python
+from pytest_agent_evals import CustomPromptEvaluatorConfig
+
+friendliness_prompt = """
+You are an AI assistant that evaluates the tone of a response.
+Score the response on a scale of 1 to 5, where 1 is hostile/rude and 5 is very friendly.
+Provide a brief reason for your score.
+
+Input:
+Query: {{query}}
+Response: {{response}}
+
+You must output your result in the following JSON format:
+{
+  "result": <score>,
+  "reason": "<brief reason for the score>"
+}
+"""
+
+@evals.evaluator(CustomPromptEvaluatorConfig(name="friendliness", prompt=friendliness_prompt, threshold=4))
+def test_friendliness(self, evaluator_results: EvaluatorResults):
+    assert evaluator_results.friendliness.result == "pass"
+```
+
+**Template variables**: `{{query}}`, `{{response}}`, `{{tool_calls}}`, `{{tool_definitions}}`, `{{context}}`, `{{ground_truth}}`, and any other dataset columns.
+ +**Output format**: The prompt must instruct the LLM to output JSON with `"result"` (int, float, or bool) and `"reason"` (string). + +**Threshold types**: +- `int` — ordinal scale (e.g., 1-5). Pass if score >= threshold. +- `float` — continuous scale (e.g., 0.0-1.0). Pass if score >= threshold. +- `bool` — boolean. Pass if returned boolean matches threshold. + +## Custom Code Evaluator (Python Function) + +Use `CustomCodeEvaluatorConfig` for deterministic or rule-based grading. + +```python +from pytest_agent_evals import CustomCodeEvaluatorConfig + +def length_check(sample, item): + """Pass if response is shorter than 100 characters.""" + return 1.0 if len(sample["output_text"]) < 100 else 0.0 + +@evals.evaluator(CustomCodeEvaluatorConfig(name="length_check", grader=length_check, threshold=0.9)) +def test_length_check(self, evaluator_results: EvaluatorResults): + assert evaluator_results.length_check.result == "pass" +``` + +**Grader function signature**: +```python +def grade(sample: dict[str, Any], item: dict[str, Any]) -> float: + # sample: {"output_text": str, "tool_calls": list, "tool_definitions": list} + # item: dataset row (e.g., {"query": str, "context": str, ...}) + return score # float, 0.0 to 1.0 +``` diff --git a/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/__snapshots__/triggers.test.ts.snap b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/__snapshots__/triggers.test.ts.snap new file mode 100644 index 000000000..441ef576c --- /dev/null +++ b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/__snapshots__/triggers.test.ts.snap @@ -0,0 +1,96 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`agent-framework (evaluate) - Trigger Tests Trigger Keywords Snapshot skill description triggers match snapshot 1`] = ` +{ + "description": "Evaluate AI agents and workflows built with Microsoft Agent Framework using pytest-agent-evals plugin. 
Supports built-in and custom evaluators with VS Code Test Explorer integration. +USE FOR: evaluate agent, test agent, assess agent, agent evaluation, pytest evaluation, measure agent performance, agent quality, add evaluator, evaluation dataset, judge model. +DO NOT USE FOR: creating agents (use agent/create), deploying agents (use agent/deploy), evaluating non-Agent-Framework agents. +", + "extractedKeywords": [ + "agent", + "agents", + "assess", + "azure", + "built", + "built-in", + "cli", + "code", + "create", + "creating", + "custom", + "dataset", + "deploy", + "deploying", + "evaluate", + "evaluating", + "evaluation", + "evaluator", + "evaluators", + "explorer", + "framework", + "function", + "integration", + "judge", + "mcp", + "measure", + "microsoft", + "model", + "non-agent-framework", + "performance", + "plugin", + "pytest", + "pytest-agent-evals", + "quality", + "supports", + "test", + "using", + "with", + "workflows", + ], + "name": "agent-framework", +} +`; + +exports[`agent-framework (evaluate) - Trigger Tests Trigger Keywords Snapshot skill keywords match snapshot 1`] = ` +[ + "agent", + "agents", + "assess", + "azure", + "built", + "built-in", + "cli", + "code", + "create", + "creating", + "custom", + "dataset", + "deploy", + "deploying", + "evaluate", + "evaluating", + "evaluation", + "evaluator", + "evaluators", + "explorer", + "framework", + "function", + "integration", + "judge", + "mcp", + "measure", + "microsoft", + "model", + "non-agent-framework", + "performance", + "plugin", + "pytest", + "pytest-agent-evals", + "quality", + "supports", + "test", + "using", + "with", + "workflows", +] +`; diff --git a/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/integration.test.ts b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/integration.test.ts new file mode 100644 index 000000000..f284b2f76 --- /dev/null +++ b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/integration.test.ts @@ -0,0 +1,98 @@ +/** + * 
Integration Tests for agent-framework (evaluate) + * + * Tests skill behavior with a real Copilot agent session. + * Runs prompts multiple times to measure skill invocation rate. + * + * Prerequisites: + * 1. npm install -g @github/copilot-cli + * 2. Run `copilot` and authenticate + */ + +import * as fs from "fs"; +import { + run, + AgentMetadata, + isSkillInvoked, + getToolCalls, + shouldSkipIntegrationTests, + getIntegrationSkipReason, +} from "../../../../utils/agent-runner"; + +const SKILL_NAME = "microsoft-foundry"; +const RUNS_PER_PROMPT = 5; +const EXPECTED_INVOCATION_RATE = 0.6; + +/** Terminate on first `create` tool call to avoid unnecessary file writes. */ +function terminateOnCreate(metadata: AgentMetadata): boolean { + return getToolCalls(metadata, "create").length > 0; +} + +const skipTests = shouldSkipIntegrationTests(); +const skipReason = getIntegrationSkipReason(); + +if (skipTests && skipReason) { + console.log(`⏭️ Skipping integration tests: ${skipReason}`); +} + +const describeIntegration = skipTests ? 
describe.skip : describe; + +describeIntegration("agent-framework (evaluate) - Integration Tests", () => { + describe("skill-invocation", () => { + test("invokes skill for agent evaluation prompt", async () => { + let successCount = 0; + + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await run({ + prompt: "Evaluate my Foundry agent built with Microsoft Agent Framework using pytest evaluators.", + shouldEarlyTerminate: terminateOnCreate, + }); + + if (isSkillInvoked(agentMetadata, SKILL_NAME)) { + successCount++; + } + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + + const invocationRate = successCount / RUNS_PER_PROMPT; + console.log(`agent-framework (evaluate) invocation rate for evaluation: ${(invocationRate * 100).toFixed(1)}% (${successCount}/${RUNS_PER_PROMPT})`); + fs.appendFileSync("./result-agent-framework-evaluate.txt", `agent-framework (evaluate) invocation rate for evaluation: ${(invocationRate * 100).toFixed(1)}% (${successCount}/${RUNS_PER_PROMPT})\n`); + expect(invocationRate).toBeGreaterThanOrEqual(EXPECTED_INVOCATION_RATE); + }); + + test("invokes skill for add evaluator prompt", async () => { + let successCount = 0; + + for (let i = 0; i < RUNS_PER_PROMPT; i++) { + try { + const agentMetadata = await run({ + prompt: "Add a custom evaluator to assess my agent's task completion using pytest-agent-evals.", + shouldEarlyTerminate: terminateOnCreate, + }); + + if (isSkillInvoked(agentMetadata, SKILL_NAME)) { + successCount++; + } + } catch (e: unknown) { + if (e instanceof Error && e.message?.includes("Failed to load @github/copilot-sdk")) { + console.log("⏭️ SDK not loadable, skipping test"); + return; + } + throw e; + } + } + + const invocationRate = successCount / RUNS_PER_PROMPT; + console.log(`agent-framework (evaluate) invocation rate for add evaluator: 
${(invocationRate * 100).toFixed(1)}% (${successCount}/${RUNS_PER_PROMPT})`); + fs.appendFileSync("./result-agent-framework-evaluate.txt", `agent-framework (evaluate) invocation rate for add evaluator: ${(invocationRate * 100).toFixed(1)}% (${successCount}/${RUNS_PER_PROMPT})\n`); + expect(invocationRate).toBeGreaterThanOrEqual(EXPECTED_INVOCATION_RATE); + }); + }); +}); diff --git a/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/triggers.test.ts b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/triggers.test.ts new file mode 100644 index 000000000..bfbf85737 --- /dev/null +++ b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/triggers.test.ts @@ -0,0 +1,102 @@ +/** + * Trigger Tests for agent-framework (evaluate) + * + * Tests that verify the skill triggers on appropriate prompts + * and does NOT trigger on unrelated prompts. + */ + +import { TriggerMatcher } from "../../../../utils/trigger-matcher"; +import { loadSkill, LoadedSkill } from "../../../../utils/skill-loader"; + +const SKILL_NAME = "microsoft-foundry/foundry-agent/evaluate/agent-framework"; + +describe("agent-framework (evaluate) - Trigger Tests", () => { + let triggerMatcher: TriggerMatcher; + let skill: LoadedSkill; + + beforeAll(async () => { + skill = await loadSkill(SKILL_NAME); + triggerMatcher = new TriggerMatcher(skill); + }); + + describe("Should Trigger", () => { + const shouldTriggerPrompts: string[] = [ + "Evaluate my agent's performance", + "Test my agent with built-in evaluators", + "Assess agent quality with pytest", + "Add evaluation to my agent framework project", + "Set up agent evaluation with metrics", + "Measure agent performance using evaluators", + "Add a custom evaluator to my agent tests", + "Create evaluation dataset for my agent", + "Configure judge model for agent evaluation", + "Run pytest evaluation on my agent", + ]; + + test.each(shouldTriggerPrompts)( + 'triggers on: "%s"', + (prompt) => { + const result = 
triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(true); + expect(result.matchedKeywords.length).toBeGreaterThanOrEqual(2); + } + ); + }); + + describe("Should NOT Trigger", () => { + const shouldNotTriggerPrompts: string[] = [ + "What is the weather today?", + "Help me write a poem", + "Explain quantum computing", + "Help me fix my SageMaker pipeline", + "Configure my PostgreSQL database", + "Optimize my spending and reduce costs", + "Build a multi-agent workflow", + "Set up a virtual network", + "How do I write Python scripts?", + "Help me debug my Kubernetes pods", + "Explain how to use Docker containers", + ]; + + test.each(shouldNotTriggerPrompts)( + 'does not trigger on: "%s"', + (prompt) => { + const result = triggerMatcher.shouldTrigger(prompt); + expect(result.triggered).toBe(false); + } + ); + }); + + describe("Trigger Keywords Snapshot", () => { + test("skill keywords match snapshot", () => { + expect(triggerMatcher.getKeywords()).toMatchSnapshot(); + }); + + test("skill description triggers match snapshot", () => { + expect({ + name: skill.metadata.name, + description: skill.metadata.description, + extractedKeywords: triggerMatcher.getKeywords() + }).toMatchSnapshot(); + }); + }); + + describe("Edge Cases", () => { + test("handles empty prompt", () => { + const result = triggerMatcher.shouldTrigger(""); + expect(result.triggered).toBe(false); + }); + + test("handles very long prompt", () => { + const longPrompt = "evaluate agent ".repeat(100); + const result = triggerMatcher.shouldTrigger(longPrompt); + expect(typeof result.triggered).toBe("boolean"); + }); + + test("is case insensitive", () => { + const result1 = triggerMatcher.shouldTrigger("EVALUATE AGENT PERFORMANCE"); + const result2 = triggerMatcher.shouldTrigger("evaluate agent performance"); + expect(result1.triggered).toBe(result2.triggered); + }); + }); +}); diff --git a/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/unit.test.ts 
b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/unit.test.ts
new file mode 100644
index 000000000..f83175a21
--- /dev/null
+++ b/tests/microsoft-foundry/foundry-agent/evaluate/agent-framework/unit.test.ts
@@ -0,0 +1,147 @@
+/**
+ * Unit Tests for agent-framework (evaluate)
+ *
+ * Test isolated skill logic and validation rules.
+ */
+
+import * as fs from "fs";
+import * as path from "path";
+import { loadSkill, LoadedSkill } from "../../../../utils/skill-loader";
+
+// Skill path is relative to the plugin's skills root.
+const SKILL_NAME = "microsoft-foundry/foundry-agent/evaluate/agent-framework";
+
+describe("agent-framework (evaluate) - Unit Tests", () => {
+  let skill: LoadedSkill;
+
+  // Load SKILL.md (front matter + body) once for all assertions below.
+  beforeAll(async () => {
+    skill = await loadSkill(SKILL_NAME);
+  });
+
+  describe("Skill Metadata", () => {
+    test("has valid SKILL.md with required fields", () => {
+      expect(skill.metadata).toBeDefined();
+      expect(skill.metadata.name).toBe("agent-framework");
+      expect(skill.metadata.description).toBeDefined();
+      expect(skill.metadata.description.length).toBeGreaterThan(10);
+    });
+
+    // Long enough to carry USE FOR / DO NOT USE FOR lists, short enough
+    // for the skill-router's description budget.
+    test("description is appropriately sized", () => {
+      expect(skill.metadata.description.length).toBeGreaterThan(150);
+      expect(skill.metadata.description.length).toBeLessThan(1024);
+    });
+
+    test("description contains USE FOR triggers", () => {
+      expect(skill.metadata.description).toMatch(/USE FOR:/i);
+    });
+
+    test("description contains DO NOT USE FOR anti-triggers", () => {
+      expect(skill.metadata.description).toMatch(/DO NOT USE FOR:/i);
+    });
+  });
+
+  describe("Skill Content", () => {
+    test("has substantive content", () => {
+      expect(skill.content).toBeDefined();
+      expect(skill.content.length).toBeGreaterThan(100);
+    });
+
+    test("contains expected sections", () => {
+      expect(skill.content).toContain("## Quick Reference");
+      expect(skill.content).toContain("## When to Use This Skill");
+      expect(skill.content).toContain("## Evaluation Workflow");
+    });
+
+    test("contains references table pointing to reference files", () => {
+      expect(skill.content).toContain("## References");
+      expect(skill.content).toContain("references/code-example.md");
+      expect(skill.content).toContain("references/built-in-evaluators.md");
+      expect(skill.content).toContain("references/custom-evaluators.md");
+    });
+
+    test("contains built-in evaluators section referencing file", () => {
+      expect(skill.content).toContain("## Built-in Evaluators");
+      expect(skill.content).toContain("references/built-in-evaluators.md");
+    });
+
+    test("contains custom evaluators section referencing file", () => {
+      expect(skill.content).toContain("## Custom Evaluators");
+      expect(skill.content).toContain("references/custom-evaluators.md");
+    });
+
+    test("contains error handling section", () => {
+      expect(skill.content).toContain("## Error Handling");
+    });
+
+    test("documents MCP tools", () => {
+      expect(skill.content).toContain("foundry_models_list");
+      expect(skill.content).toContain("foundry_models_deployments_list");
+      expect(skill.content).toContain("foundry_resource_get");
+    });
+
+    // Pinned plugin version string; bump this assertion together with SKILL.md.
+    test("specifies plugin version", () => {
+      expect(skill.content).toContain("pytest-agent-evals");
+      expect(skill.content).toContain("0.0.1b260210");
+    });
+
+    test("documents evaluation result schema", () => {
+      expect(skill.content).toContain("inputs.query");
+      expect(skill.content).toContain("outputs.response");
+      expect(skill.content).toContain("test-results/evaluation_results_");
+    });
+
+    test("references microsoft-foundry skill for model selection", () => {
+      expect(skill.content).toContain("microsoft-foundry");
+    });
+
+    test("documents judge model env vars", () => {
+      expect(skill.content).toContain("AZURE_OPENAI_ENDPOINT");
+      expect(skill.content).toContain("FOUNDRY_MODEL_DEPLOYMENT_NAME");
+    });
+
+    test("step 1 references built-in evaluators for metric suggestions", () => {
+      // Non-greedy match isolates exactly the Step 1 section of the workflow;
+      // fails (toBeTruthy) if either heading is missing or reordered.
+      const step1Match = skill.content.match(/### Step 1:[\s\S]*?### Step 2:/);
+      expect(step1Match).toBeTruthy();
+      const step1Content = step1Match![0];
+      expect(step1Content).toContain("references/built-in-evaluators.md");
+      expect(step1Content).toContain("built-in");
+      expect(step1Content).toContain("custom");
+      expect(step1Content).toContain("prompt-based");
+      expect(step1Content).toContain("code-based");
+    });
+  });
+
+  describe("Reference Files", () => {
+    // These read the real files under <skill>/references on disk,
+    // so they double as existence checks for the shipped assets.
+    test("code-example.md exists and contains key content", () => {
+      const refPath = path.join(skill.path, "references", "code-example.md");
+      expect(fs.existsSync(refPath)).toBe(true);
+      const content = fs.readFileSync(refPath, "utf-8");
+      expect(content).toContain("AzureOpenAIModelConfig");
+      expect(content).toContain("ChatAgentConfig");
+      expect(content).toContain("BuiltInEvaluatorConfig");
+      expect(content).toContain("@pytest.fixture");
+      expect(content).toContain("@evals.dataset");
+      expect(content).toContain("@evals.judge_model");
+    });
+
+    test("built-in-evaluators.md exists and contains evaluator catalog", () => {
+      const refPath = path.join(skill.path, "references", "built-in-evaluators.md");
+      expect(fs.existsSync(refPath)).toBe(true);
+      const content = fs.readFileSync(refPath, "utf-8");
+      expect(content).toContain("task_adherence");
+      expect(content).toContain("intent_resolution");
+      expect(content).toContain("tool_call_accuracy");
+      expect(content).toContain("coherence");
+      expect(content).toContain("relevance");
+      expect(content).toContain("groundedness");
+      expect(content).toContain("similarity");
+    });
+
+    test("custom-evaluators.md exists and contains evaluator patterns", () => {
+      const refPath = path.join(skill.path, "references", "custom-evaluators.md");
+      expect(fs.existsSync(refPath)).toBe(true);
+      const content = fs.readFileSync(refPath, "utf-8");
+      expect(content).toContain("CustomPromptEvaluatorConfig");
+      expect(content).toContain("CustomCodeEvaluatorConfig");
+    });
+  });
+});