Merged
33 commits
- `7c8d305` condenser test type (Jan 8, 2026)
- `e411af6` thinking block test and readme updates (Jan 8, 2026)
- `33fca10` condenser workflow (Jan 8, 2026)
- `b3480ec` Merge branch 'main' into feat/condenser-integration-tests (csmith49, Jan 8, 2026)
- `5b750e1` minor linting fixes (Jan 8, 2026)
- `68cdb50` fixing hallucinated code (Jan 8, 2026)
- `d3dae88` moving token condenser test from integration (Jan 8, 2026)
- `07bbc3e` test hard/soft requirements (Jan 8, 2026)
- `0790de3` size condenser (Jan 8, 2026)
- `bc1dc72` renaming to have token and size tests next to each other (Jan 8, 2026)
- `82fc956` minor fixes (Jan 8, 2026)
- `fe36efb` fixing availability test (Jan 8, 2026)
- `79dfbdb` splitting tests and renumbering (Jan 8, 2026)
- `ac5e060` fixing tests (Jan 8, 2026)
- `2549882` gpt-4o tests (Jan 8, 2026)
- `a446214` linting (Jan 8, 2026)
- `19dc44c` Merge branch 'main' into feat/condenser-integration-tests (csmith49, Jan 8, 2026)
- `417edf7` Merge branch 'main' into feat/condenser-integration-tests (csmith49, Jan 9, 2026)
- `06304c4` temporary push trigger for workflow (Jan 9, 2026)
- `926d790` Merge branch 'main' into feat/condenser-integration-tests (csmith49, Jan 9, 2026)
- `ad4615c` workflow temp changes for testing (Jan 9, 2026)
- `85a9b63` model name change (Jan 9, 2026)
- `4877f7f` one more model name change (Jan 9, 2026)
- `4a4de2c` fix(ci): add repository parameter to consolidate-results checkout (openhands-agent, Jan 9, 2026)
- `155c87b` Revert "fix(ci): add repository parameter to consolidate-results chec… (Jan 9, 2026)
- `6807317` Remove temporary push triggers from condenser workflow and update rep… (openhands-agent, Jan 9, 2026)
- `74b1ebb` linting (Jan 9, 2026)
- `2e7aba2` Merge branch 'main' into feat/condenser-integration-tests (csmith49, Jan 12, 2026)
- `e5e4289` removing unused setup and duplicated vars (Jan 12, 2026)
- `f34d804` skip based on model patterns (Jan 12, 2026)
- `2f95cdb` llm copy util (Jan 12, 2026)
- `d22c118` linting (Jan 12, 2026)
- `78e56d7` Merge branch 'main' into feat/condenser-integration-tests (csmith49, Jan 12, 2026)
244 changes: 244 additions & 0 deletions .github/workflows/condenser-runner.yml
@@ -0,0 +1,244 @@

```yaml
---
name: Run Condenser Tests

on:
  # Use pull_request_target to access secrets even on fork PRs
  # This is safe because we only run when the 'condenser-test' label is added by a maintainer
  pull_request_target:
    types:
      - labeled
  workflow_dispatch:
    inputs:
      reason:
        description: Reason for manual trigger
        required: true
        default: ''

env:
  N_PROCESSES: 2  # Fewer parallel processes for condenser tests (only 2 LLMs)

jobs:
  post-initial-comment:
    if: >
      github.event_name == 'pull_request_target' &&
      github.event.label.name == 'condenser-test'
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
    steps:
      - name: Comment on PR
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the condenser tests on your PR. You will receive a comment with the results shortly.

            Note: These are non-blocking tests that validate condenser functionality across different LLMs.

  run-condenser-tests:
    # Security: Only run when condenser-test label is present or via workflow_dispatch
    # This prevents automatic execution on fork PRs without maintainer approval
    if: |
      always() && (
        (
          github.event_name == 'pull_request_target' &&
          github.event.label.name == 'condenser-test'
        ) ||
        github.event_name == 'workflow_dispatch'
      )
    runs-on: blacksmith-4vcpu-ubuntu-2204
    permissions:
      contents: read
      id-token: write
      pull-requests: write
    strategy:
      matrix:
        python-version: ['3.12']
        job-config:
          # Only run against 2 LLMs for condenser tests:
          # - Claude Opus 4.5 (primary - supports thinking blocks)
          # - GPT-5.1 Codex Max (secondary - cross-LLM validation)
          - name: Claude Opus 4.5
            run-suffix: opus_condenser_run
            llm-config:
              model: litellm_proxy/anthropic/claude-opus-4-5-20251101
              extended_thinking: true
          - name: GPT-5.1 Codex Max
            run-suffix: gpt51_condenser_run
            llm-config:
              model: litellm_proxy/gpt-5.1-codex-max
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5
        with:
          # For pull_request_target: checkout fork PR code (requires explicit repository)
          # For other events: fallback to current repository and ref
          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
          ref: ${{ github.event.pull_request.head.sha || github.ref }}
          # Security: Don't persist credentials to prevent untrusted PR code from using them
          persist-credentials: false

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: latest
          python-version: ${{ matrix.python-version }}

      - name: Install Python dependencies using uv
        run: |
          uv sync --dev
          uv pip install pytest

      - name: Run condenser test evaluation for ${{ matrix.job-config.name }}
        env:
          LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: https://llm-proxy.app.all-hands.dev
        run: |
          set -eo pipefail

          AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
          EVAL_NOTE="${AGENT_SDK_VERSION}_${{ matrix.job-config.run-suffix }}"

          echo "Running condenser tests only (c*.py pattern)"

          uv run python tests/integration/run_infer.py \
            --llm-config "$LLM_CONFIG" \
            --num-workers $N_PROCESSES \
            --eval-note "$EVAL_NOTE" \
            --test-type condenser

          # get condenser tests JSON results
          RESULTS_FILE=$(find tests/integration/outputs/*${{ matrix.job-config.run-suffix }}* -name "results.json" -type f | head -n 1)
          echo "RESULTS_FILE: $RESULTS_FILE"
          if [ -f "$RESULTS_FILE" ]; then
            echo "JSON_RESULTS_FILE=$RESULTS_FILE" >> $GITHUB_ENV
          else
            echo "JSON_RESULTS_FILE=" >> $GITHUB_ENV
          fi

      - name: Wait a little bit
        run: sleep 10

      - name: Create archive of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          cd tests/integration/outputs  # Change to the outputs directory
          tar -czvf ../../../condenser_tests_${{ matrix.job-config.run-suffix }}_${TIMESTAMP}.tar.gz *${{ matrix.job-config.run-suffix }}*  # Include result directories for this model

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v5
        id: upload_results_artifact
        with:
          name: condenser-test-outputs-${{ matrix.job-config.run-suffix }}-${{ github.run_id }}-${{ github.run_attempt }}
          path: condenser_tests_${{ matrix.job-config.run-suffix }}_*.tar.gz

      - name: Save test results for consolidation
        run: |
          # Copy the structured JSON results file for consolidation
          mkdir -p test_results_summary

          if [ -n "${{ env.JSON_RESULTS_FILE }}" ] && [ -f "${{ env.JSON_RESULTS_FILE }}" ]; then
            # Copy the JSON results file directly
            cp "${{ env.JSON_RESULTS_FILE }}" "test_results_summary/${{ matrix.job-config.run-suffix }}_results.json"
            echo "✓ Copied JSON results file for consolidation"
          else
            echo "✗ No JSON results file found"
            exit 1
          fi

      - name: Upload test results summary
        uses: actions/upload-artifact@v5
        with:
          name: test-results-${{ matrix.job-config.run-suffix }}
          path: test_results_summary/${{ matrix.job-config.run-suffix }}_results.json

  consolidate-results:
    needs: run-condenser-tests
    if: |
      always() && (
        (
          github.event_name == 'pull_request_target' &&
          github.event.label.name == 'condenser-test'
        ) ||
        github.event_name == 'workflow_dispatch'
      )
    runs-on: blacksmith-2vcpu-ubuntu-2404
    permissions:
      contents: read
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v5
        with:
          # When using pull_request_target, explicitly checkout the PR branch
          # This ensures we use the scripts from the actual PR code
          ref: ${{ github.event.pull_request.head.sha || github.ref }}

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          version: latest
          python-version: '3.12'

      - name: Install Python dependencies using uv
        run: |
          uv sync --dev

      - name: Download all test results
        uses: actions/download-artifact@v6
        with:
          pattern: test-results-*
          merge-multiple: true
          path: all_results

      - name: Download all condenser test artifacts
        uses: actions/download-artifact@v6
        with:
          pattern: condenser-test-outputs-*
          path: artifacts

      - name: Consolidate test results
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          MANUAL_REASON: ${{ github.event.inputs.reason }}
          COMMIT_SHA: ${{ github.sha }}
          PYTHONPATH: ${{ github.workspace }}
          GITHUB_SERVER_URL: ${{ github.server_url }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
        run: |
          uv run python tests/integration/utils/consolidate_json_results.py \
            --results-dir all_results \
            --artifacts-dir artifacts \
            --output-file consolidated_results.json

          echo "Consolidated results generated successfully"

          uv run python tests/integration/utils/generate_markdown_report.py \
            --input-file consolidated_results.json \
            --output-file consolidated_report.md

      - name: Upload consolidated report
        uses: actions/upload-artifact@v5
        with:
          name: consolidated-condenser-report
          path: consolidated_report.md

      - name: Create consolidated PR comment
        if: github.event_name == 'pull_request_target'
        run: |
          # Add header to clarify these are non-blocking tests
          echo "## Condenser Test Results (Non-Blocking)" > final_report.md
          echo "" >> final_report.md
          echo "> These tests validate condenser functionality and do not block PR merges." >> final_report.md
          echo "" >> final_report.md
          cat consolidated_report.md >> final_report.md

          # Sanitize @OpenHands mentions to prevent self-mention loops
          COMMENT_BODY=$(uv run python -c "from openhands.sdk.utils.github import sanitize_openhands_mentions; import sys; print(sanitize_openhands_mentions(sys.stdin.read()), end='')" < final_report.md)
          # Use GitHub CLI to create comment with explicit PR number
          echo "$COMMENT_BODY" | gh pr comment ${{ github.event.pull_request.number }} --body-file -
        env:
          GH_TOKEN: ${{ github.token }}
```
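For reference, each matrix entry's `llm-config` mapping is serialized with `toJson` and handed to `run_infer.py` through the `LLM_CONFIG` environment variable. A minimal sketch of how a script could parse such a value (the `--llm-config` flag matches the workflow above, but this parsing code is an illustration, not the actual `run_infer.py`):

```python
import argparse
import json

# Illustrative only: parse an --llm-config JSON string shaped like the one
# the workflow builds with toJson(matrix.job-config.llm-config).
parser = argparse.ArgumentParser()
parser.add_argument(
    "--llm-config",
    required=True,
    help='JSON object, e.g. {"model": "...", "extended_thinking": true}',
)
args = parser.parse_args()

config = json.loads(args.llm_config)  # dict with at least a "model" key
model = config["model"]
extended_thinking = config.get("extended_thinking", False)  # optional flag
print(f"Selected model: {model} (extended thinking: {extended_thinking})")
```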
75 changes: 65 additions & 10 deletions tests/integration/README.md
@@ -8,12 +8,13 @@ The integration tests are designed to verify that the agent-sdk works correctly

### Test Types

-Tests are classified into two types based on their filename prefix:
+Tests are classified into three types based on their filename prefix:

- **Integration tests** (`t*.py`) - **REQUIRED**: Verify that the agent successfully completes essential tasks. These tests must pass for releases and focus on task completion and outcomes.
- **Behavior tests** (`b*.py`) - **OPTIONAL**: Verify that the agent follows system message guidelines and best practices. These tests track quality improvements and focus on how the agent approaches problems. Failures don't block releases but should be addressed for optimal user experience.
+- **Condenser tests** (`c*.py`) - **OPTIONAL, NON-BLOCKING**: Stress-test the condensation system's interaction with LLM APIs to ensure compatibility. These tests run on a limited set of LLMs (currently Claude Opus 4.5 and GPT-5.1 Codex Max) and are triggered separately from integration tests. They validate that conversation condensation works correctly across different models and API patterns.

-Success rates are calculated separately for each test type to track both completion capability and behavior quality.
+Success rates are calculated separately for each test type to track completion capability, behavior quality, and condenser reliability.

See [BEHAVIOR_TESTS.md](BEHAVIOR_TESTS.md) for more details on behavior testing.

@@ -29,8 +30,9 @@ tests/integration/
├── run_infer.sh       # Shell script wrapper for running tests
├── outputs/           # Test results and reports (auto-generated)
├── tests/             # Individual test files
-│   ├── t*.py          # Task completion tests (critical)
-│   └── b*.py          # Agent behavior tests (ux)
+│   ├── t*.py          # Task completion tests (required)
+│   ├── b*.py          # Agent behavior tests (optional)
+│   └── c*.py          # Condenser stress tests (optional, non-blocking)
└── utils/             # Test utilities (e.g., llm_judge.py)
```

@@ -42,6 +44,8 @@ The easiest way to run the integration tests is from GitHub by tagging the label.
A pull request comment will notify you as soon as the tests have been executed.
The results of the tests (and all of the logs) will be downloadable using a link added in the comment.

+For condenser tests, use the `condenser-test` label instead.

### Locally

```bash
@@ -50,20 +54,38 @@ uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_pr

# Run a specific test
uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' --eval-ids t01_fix_simple_typo

+# Run only condenser tests
+uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-opus-4-5", "extended_thinking": true}' --test-type condenser
```

## Automated Testing with GitHub Actions

-The integration tests are automatically executed via GitHub Actions using the workflow defined in `.github/workflows/integration-runner.yml`.
+Tests are automatically executed via GitHub Actions using two separate workflows:

-### Workflow Triggers
+### Integration/Behavior Tests Workflow

-The GitHub workflow runs integration tests in the following scenarios:
+Defined in `.github/workflows/integration-runner.yml`, this workflow runs integration and behavior tests.

-1. **Pull Request Labels**: When a PR is labeled with `integration-test`
+**Triggers:**
+1. **Pull Request Labels**: When a PR is labeled with `integration-test` or `behavior-test`
2. **Manual Trigger**: Via workflow dispatch with a required reason
3. **Scheduled Runs**: Daily at 10:30 PM UTC (cron: `30 22 * * *`)

+**Test Coverage:** Runs across 6 LLM models (Claude Sonnet 4.5, GPT-5.1 Codex Max, Deepseek, Kimi K2, Gemini 3 Pro, Devstral 2512).
+
+### Condenser Tests Workflow
+
+Defined in `.github/workflows/condenser-runner.yml`, this workflow runs condenser stress tests separately.
+
+**Triggers:**
+1. **Pull Request Labels**: When a PR is labeled with `condenser-test`
+2. **Manual Trigger**: Via workflow dispatch with a required reason
+
+**Test Coverage:** Runs only against 2 LLMs (Claude Opus 4.5 with extended thinking, GPT-5.1 Codex Max) to save costs while validating cross-model compatibility.
+
+**Note:** Condenser tests are non-blocking and do not prevent PR merges.

## Available Tests

### Integration Tests (`t*.py`) - **Required**
@@ -78,7 +100,6 @@ These tests must pass for releases and verify that the agent can successfully co
- **t06_github_pr_browsing** - Tests GitHub PR browsing
- **t07_interactive_commands** - Tests interactive command handling
- **t08_image_file_viewing** - Tests image file viewing capabilities
-- **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded

### Behavior Tests (`b*.py`) - **Optional**

@@ -88,6 +109,40 @@ These tests track quality improvements and don't block releases. They verify tha

For more details on behavior testing and guidelines for adding new tests, see [BEHAVIOR_TESTS.md](BEHAVIOR_TESTS.md).

+### Condenser Tests (`c*.py`) - **Optional, Non-Blocking**
+
+These tests stress-test the condensation system's interaction with LLM APIs to ensure compatibility across different models. Unlike integration tests, condenser tests run on a limited set of LLMs (currently Claude Opus 4.5 and GPT-5.1 Codex Max) to save costs while validating cross-model compatibility. They are triggered separately using the `condenser-test` label and do not block PR merges.
+
+**Purpose:** Validate that conversation condensation works correctly across different models and API patterns, particularly focusing on:
+- Model-specific features (e.g., thinking blocks in Claude Opus)
+- Condensation triggers (token limits, event counts, explicit requests)
+- Conversation history management
+- API signature compatibility after condensation
+
+**Current Tests:**
+
+- **c01_thinking_block_condenser** - Tests that Claude Opus's thinking blocks are properly handled during condensation. Verifies that:
+  - Multiple thinking blocks are generated across a multi-step conversation
+  - Condensation is triggered correctly
+  - The first thinking block is forgotten during condensation
+  - Later thinking blocks are preserved after condensation
+  - No malformed signature errors occur when condensed history is sent to the API
+- **c02_hard_condensation_requirement** - Tests hard requirement behavior when condensation is unavailable. Verifies that:
+  - Explicit `condense()` calls raise `NoCondensationAvailableException` when no valid range exists
+  - The exception is properly raised with only 1 event in history
+- **c03_soft_condensation_requirement** - Tests soft requirement behavior. Verifies that:
+  - Soft requirements (resource limits) gracefully continue when condensation is unavailable
+  - Conversation continues without crashing when condensation can't be satisfied
+  - Condensation succeeds once multiple atomic units make it available
+- **c04_token_condenser** - Tests that token-based condensation works correctly. Verifies that:
+  - An agent can be configured with `LLMSummarizingCondenser` using `max_tokens`
+  - The condenser correctly uses `get_token_count` to measure conversation size
+  - Condensation is triggered when the token limit is exceeded
+- **c05_size_condenser** - Tests that size-based condensation works correctly. Verifies that:
+  - An agent can be configured with `LLMSummarizingCondenser` using `max_size`
+  - The condenser correctly counts events to measure conversation size
+  - Condensation is triggered when the event count limit is exceeded
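The c04/c05 entries above reduce to constructing a condenser with a limit and letting the conversation grow past it. A minimal sketch of that setup (the import path and constructor arguments are assumptions inferred from the names in this README, not code taken from the test suite):

```python
# Hypothetical sketch: the import path and constructor signatures below are
# assumptions based on names used in this README (LLMSummarizingCondenser,
# max_size, max_tokens), not the SDK's actual API.
from openhands.sdk import LLM
from openhands.sdk.context.condenser import LLMSummarizingCondenser

llm = LLM(model="litellm_proxy/anthropic/claude-opus-4-5")

# c05-style: condense once the history holds more than max_size events.
size_condenser = LLMSummarizingCondenser(llm=llm, max_size=10)

# c04-style: condense once get_token_count() reports more than max_tokens.
token_condenser = LLMSummarizingCondenser(llm=llm, max_tokens=20_000)
```

c02 and c03 then probe the failure modes of the same machinery: an explicit `condense()` call (hard requirement) should raise `NoCondensationAvailableException` when no valid range exists, while a resource-limit trigger (soft requirement) should let the conversation continue uncondensed.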

## Writing Integration Tests

All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base class provides a consistent framework with several customizable properties:
@@ -102,7 +157,7 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base

- **`condenser`** (property) - Optional condenser configuration for the agent (default: `None`)
  - Override to test condensation or manage long conversations
-  - Example: `t09_token_condenser` uses this to verify token counting
+  - Example: `c04_token_condenser` uses this to verify token counting
- **`max_iteration_per_run`** (property) - Maximum iterations per conversation (default: `100`)
  - Override to limit LLM calls for faster tests
  - Useful for tests that should complete quickly
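Putting the two properties together, a condenser-enabled test might look roughly like this (a sketch only: the base-class internals and the `self.llm` attribute are assumptions, and the condenser import mirrors the hypothetical sketch above):

```python
# Hypothetical sketch of a BaseIntegrationTest subclass; the property names
# come from this README, but the base-class internals and self.llm are assumed.
from tests.integration.base import BaseIntegrationTest

from openhands.sdk.context.condenser import LLMSummarizingCondenser  # assumed path


class MyCondenserTest(BaseIntegrationTest):
    @property
    def condenser(self):
        # Returning a non-None condenser config makes the run exercise condensation.
        return LLMSummarizingCondenser(llm=self.llm, max_tokens=20_000)

    @property
    def max_iteration_per_run(self) -> int:
        # Keep runs short: just enough turns to trigger condensation.
        return 20
```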