# fix(sortformer): consume BNNS-fixed v3 models + config-mismatch guard (#726) #1937
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs the offline (VBx batch) speaker-diarization benchmark for pull requests
# targeting main and reports the metrics in a single PR comment that is updated
# in place on later runs.
name: Offline Speaker diarization pipeline

on:
  pull_request:
    branches: [main]
    types: [opened, synchronize, reopened]

# A newer push to the same ref cancels the run that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  benchmark:
    runs-on: macos-15
    permissions:
      contents: read
      pull-requests: write # required to create/update the results comment

    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"

      - name: Build package
        run: swift build -c release

      - name: Run Offline Pipeline Benchmark
        id: benchmark
        timeout-minutes: 30
        run: |
          echo "Running offline VBx pipeline benchmark..."

          # Record start time
          BENCHMARK_START=$(date +%s)

          # Capture the exit status instead of letting the shell abort on the
          # first failure, so SUCCESS and EXECUTION_TIME are always recorded
          # for the PR comment. The step still fails further down.
          BENCHMARK_STATUS=0
          swift run -c release fluidaudiocli diarization-benchmark --mode offline --auto-download --single-file ES2004a --output offline_results.json || BENCHMARK_STATUS=$?

          # Check if results file was generated
          if [ "$BENCHMARK_STATUS" -ne 0 ]; then
            echo "Benchmark command exited with status ${BENCHMARK_STATUS}"
            echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
          elif [ -f offline_results.json ]; then
            echo "SUCCESS=true" >> "$GITHUB_OUTPUT"
          else
            echo "Benchmark failed - no results file generated"
            echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
          fi

          # Calculate execution time
          BENCHMARK_END=$(date +%s)
          EXECUTION_TIME=$((BENCHMARK_END - BENCHMARK_START))
          EXECUTION_MINS=$((EXECUTION_TIME / 60))
          EXECUTION_SECS=$((EXECUTION_TIME % 60))
          echo "EXECUTION_TIME=${EXECUTION_MINS}m ${EXECUTION_SECS}s" >> "$GITHUB_OUTPUT"

          # Propagate a benchmark failure now that the outputs are written.
          exit "$BENCHMARK_STATUS"

      - name: Show offline_results.json
        if: always()
        run: |
          echo "--- offline_results.json ---"
          cat offline_results.json || echo "offline_results.json not found"
          echo "-----------------------------"

      - name: Extract benchmark metrics with jq
        id: extract
        run: |
          if [ ! -f offline_results.json ]; then
            echo "::error::offline_results.json not found - nothing to extract"
            exit 1
          fi

          # The output is an array, so every metric is read from the first element
          DER=$(jq '.[0].der' offline_results.json)
          RTF=$(jq '.[0].rtfx' offline_results.json)
          DURATION="1049" # ES2004a duration in seconds
          SPEAKER_COUNT=$(jq '.[0].detectedSpeakers' offline_results.json)

          # Extract detailed timing information
          TOTAL_TIME=$(jq '.[0].timings.totalProcessingSeconds' offline_results.json)
          MODEL_DOWNLOAD_TIME=$(jq '.[0].timings.modelDownloadSeconds' offline_results.json)
          MODEL_COMPILE_TIME=$(jq '.[0].timings.modelCompilationSeconds' offline_results.json)
          AUDIO_LOAD_TIME=$(jq '.[0].timings.audioLoadingSeconds' offline_results.json)
          SEGMENTATION_TIME=$(jq '.[0].timings.segmentationSeconds' offline_results.json)
          EMBEDDING_TIME=$(jq '.[0].timings.embeddingExtractionSeconds' offline_results.json)
          CLUSTERING_TIME=$(jq '.[0].timings.speakerClusteringSeconds' offline_results.json)
          INFERENCE_TIME=$(jq '.[0].timings.totalInferenceSeconds' offline_results.json)

          # Outputs are written before validation so the PR comment can still
          # show whatever was measured when the RTFx check below fails.
          {
            echo "DER=${DER}"
            echo "RTF=${RTF}"
            echo "DURATION=${DURATION}"
            echo "SPEAKER_COUNT=${SPEAKER_COUNT}"
            echo "TOTAL_TIME=${TOTAL_TIME}"
            echo "MODEL_DOWNLOAD_TIME=${MODEL_DOWNLOAD_TIME}"
            echo "MODEL_COMPILE_TIME=${MODEL_COMPILE_TIME}"
            echo "AUDIO_LOAD_TIME=${AUDIO_LOAD_TIME}"
            echo "SEGMENTATION_TIME=${SEGMENTATION_TIME}"
            echo "EMBEDDING_TIME=${EMBEDDING_TIME}"
            echo "CLUSTERING_TIME=${CLUSTERING_TIME}"
            echo "INFERENCE_TIME=${INFERENCE_TIME}"
          } >> "$GITHUB_OUTPUT"

          # Validate RTFx numerically: a string comparison against "0" would
          # let "null" (missing key) and "0.0" slip through.
          if ! jq -e '.[0].rtfx | (type == "number") and (. > 0)' offline_results.json > /dev/null; then
            echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
            echo "RTFx value: $RTF"
            exit 1
          fi

      - name: Comment PR with Offline Pipeline Results
        if: always()
        uses: actions/github-script@v7
        # Step outputs are handed over as environment variables rather than
        # being interpolated into the script source, so their content can never
        # be interpreted as JavaScript.
        env:
          DER: ${{ steps.extract.outputs.DER }}
          RTF: ${{ steps.extract.outputs.RTF }}
          DURATION: ${{ steps.extract.outputs.DURATION }}
          TOTAL_TIME: ${{ steps.extract.outputs.TOTAL_TIME }}
          INFERENCE_TIME: ${{ steps.extract.outputs.INFERENCE_TIME }}
          MODEL_DOWNLOAD_TIME: ${{ steps.extract.outputs.MODEL_DOWNLOAD_TIME }}
          MODEL_COMPILE_TIME: ${{ steps.extract.outputs.MODEL_COMPILE_TIME }}
          AUDIO_LOAD_TIME: ${{ steps.extract.outputs.AUDIO_LOAD_TIME }}
          SEGMENTATION_TIME: ${{ steps.extract.outputs.SEGMENTATION_TIME }}
          EMBEDDING_TIME: ${{ steps.extract.outputs.EMBEDDING_TIME }}
          CLUSTERING_TIME: ${{ steps.extract.outputs.CLUSTERING_TIME }}
          EXECUTION_TIME: ${{ steps.benchmark.outputs.EXECUTION_TIME }}
        with:
          script: |
            // Hidden marker and visible header used to recognise this workflow's comment.
            const MARKER = '<!-- fluidaudio-offline-pipeline -->';
            const HEADER = '## Offline VBx Pipeline Results';

            // Reads a numeric step output; yields NaN when it is missing, empty or "null".
            const metric = (name) => Number.parseFloat(process.env[name] ?? '');

            // Formats a number with fixed decimals, or "N/A" when it is not finite.
            const fmt = (value, digits, unit = '') =>
              Number.isFinite(value) ? `${value.toFixed(digits)}${unit}` : 'N/A';

            // Percentage of the total, or "N/A" when either side is unusable.
            const share = (part, total) =>
              Number.isFinite(part) && Number.isFinite(total) && total > 0
                ? ((part / total) * 100).toFixed(1)
                : 'N/A';

            // Pass / warn / missing indicator for a metric against its target.
            const status = (value, meetsTarget) => {
              if (!Number.isFinite(value)) {
                return '❌';
              }
              return meetsTarget(value) ? '✅' : '⚠️';
            };

            const der = metric('DER');
            const rtf = metric('RTF');
            const duration = metric('DURATION');
            const totalTime = metric('TOTAL_TIME');
            const inferenceTime = metric('INFERENCE_TIME');
            // `||` on purpose: an unset step output arrives as an empty string.
            const executionTime = process.env.EXECUTION_TIME || 'N/A';

            const stages = [
              ['Model Download', metric('MODEL_DOWNLOAD_TIME'), 'Fetching diarization models'],
              ['Model Compile', metric('MODEL_COMPILE_TIME'), 'CoreML compilation'],
              ['Audio Load', metric('AUDIO_LOAD_TIME'), 'Loading audio file'],
              ['Segmentation', metric('SEGMENTATION_TIME'), 'VAD + speech detection'],
              ['Embedding', metric('EMBEDDING_TIME'), 'Speaker embedding extraction'],
              ['Clustering (VBx)', metric('CLUSTERING_TIME'), 'Hungarian algorithm + VBx clustering'],
            ];

            // The zone abbreviation is derived from the date so it reads EDT
            // during daylight-saving time instead of a hard-coded EST.
            const timestamp = new Date().toLocaleString('en-US', {
              timeZone: 'America/New_York',
              year: 'numeric',
              month: '2-digit',
              day: '2-digit',
              hour: '2-digit',
              minute: '2-digit',
              hour12: true,
              timeZoneName: 'short',
            });

            const lines = [HEADER, ''];

            if (!Number.isFinite(der) || !Number.isFinite(rtf)) {
              const runUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
              lines.push(
                `> ❌ Benchmark metrics are unavailable: the benchmark or the metric extraction failed. See the [workflow run](${runUrl}) for details.`,
                '',
              );
            }

            lines.push(
              '### Speaker Diarization Performance (VBx Batch Mode)',
              '_Optimal clustering with Hungarian algorithm for maximum accuracy_',
              '',
              '| Metric | Value | Target | Status | Description |',
              '|--------|-------|--------|---------|-------------|',
              `| **DER** | **${fmt(der, 1, '%')}** | <20% | ${status(der, (v) => v < 20)} | Diarization Error Rate (lower is better) |`,
              `| **RTFx** | **${fmt(rtf, 2, 'x')}** | >1.0x | ${status(rtf, (v) => v > 1.0)} | Real-Time Factor (higher is faster) |`,
              '',
              '### Offline VBx Pipeline Timing Breakdown',
              '_Time spent in each stage of batch diarization_',
              '',
              '| Stage | Time (s) | % | Description |',
              '|-------|----------|---|-------------|',
              ...stages.map(
                ([label, seconds, description]) =>
                  `| ${label} | ${fmt(seconds, 3)} | ${share(seconds, totalTime)} | ${description} |`,
              ),
              `| **Total** | **${fmt(totalTime, 3)}** | **100** | **Full VBx pipeline** |`,
              '',
              '### Speaker Diarization Research Comparison',
              '_Offline VBx achieves competitive accuracy with batch processing_',
              '',
              '| Method | DER | Mode | Description |',
              '|--------|-----|------|-------------|',
              `| **FluidAudio (Offline)** | **${fmt(der, 1, '%')}** | **VBx Batch** | **On-device CoreML with optimal clustering** |`,
              '| FluidAudio (Streaming) | 17.7% | Chunk-based | First-occurrence speaker mapping |',
              '| Research baseline | 18-30% | Various | Standard dataset performance |',
              '',
              '**Pipeline Details**:',
              '- **Mode**: Offline VBx with Hungarian algorithm for optimal speaker-to-cluster assignment',
              '- **Segmentation**: VAD-based voice activity detection',
              '- **Embeddings**: WeSpeaker-compatible speaker embeddings',
              '- **Clustering**: PowerSet with VBx refinement',
              '- **Accuracy**: Higher than streaming due to optimal post-hoc mapping',
              '',
              `<sub>🎯 **Offline VBx Test** • AMI Corpus ES2004a • ${fmt(duration, 1, 's')} meeting audio • ${fmt(inferenceTime, 1, 's')} processing • Test runtime: ${executionTime} • ${timestamp}</sub>`,
              '',
              // Hidden identifier for reliable comment detection
              MARKER,
            );

            const body = lines.join('\n');

            try {
              const { owner, repo } = context.repo;
              const issue_number = context.issue.number;

              // Walk every page: a single listComments call returns only the
              // first page, which would miss our comment on busy PRs and lead
              // to duplicates.
              const allComments = await github.paginate(github.rest.issues.listComments, {
                owner,
                repo,
                issue_number,
                per_page: 100,
              });

              // Only reuse comments authored by a bot that carry our marker
              // (or the legacy visible header).
              const existing = allComments.find((candidate) => {
                const author = candidate.user;
                const isBot = author?.type === 'Bot' || Boolean(author?.login?.includes('[bot]'));
                const text = candidate.body ?? '';
                return isBot && (text.includes(MARKER) || text.includes(HEADER));
              });

              if (existing) {
                await github.rest.issues.updateComment({
                  owner,
                  repo,
                  comment_id: existing.id,
                  body,
                });
                console.log('Successfully updated existing offline pipeline comment');
              } else {
                await github.rest.issues.createComment({
                  owner,
                  repo,
                  issue_number,
                  body,
                });
                console.log('Successfully posted new offline pipeline results comment');
              }
            } catch (error) {
              // Don't fail the workflow just because commenting failed
              console.error('Failed to update/post comment:', error.message);
            }