# Page-capture residue (not part of the workflow definition), kept for reference:
# fix(sortformer): consume BNNS-fixed v3 models + config-mismatch guard (#726) #1937

# Runs the offline (VBx batch) speaker-diarization benchmark on pull requests
# that target main, then publishes the metrics as one PR comment which is
# updated in place on later runs.
name: Offline Speaker diarization pipeline

on:
  pull_request:
    branches: [main]
    types: [opened, synchronize, reopened]

# A newer push to the same ref cancels the benchmark that is still running.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  benchmark:
    runs-on: macos-15
    permissions:
      contents: read
      # Needed by the final step to create/update the results comment.
      pull-requests: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - uses: swift-actions/setup-swift@v2
        with:
          swift-version: "6.1"

      - name: Build package
        run: swift build -c release

      # Outputs: SUCCESS (true/false, whether a results file exists) and
      # EXECUTION_TIME (wall-clock duration formatted as "<m>m <s>s").
      - name: Run Offline Pipeline Benchmark
        id: benchmark
        timeout-minutes: 30
        run: |
          echo "Running offline VBx pipeline benchmark..."
          # Record start time
          BENCHMARK_START=$(date +%s)
          swift run -c release fluidaudiocli diarization-benchmark \
            --mode offline --auto-download --single-file ES2004a \
            --output offline_results.json
          # Check if results file was generated
          if [ -f offline_results.json ]; then
            echo "SUCCESS=true" >> "$GITHUB_OUTPUT"
          else
            echo "Benchmark failed - no results file generated"
            echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
          fi
          # Calculate execution time
          BENCHMARK_END=$(date +%s)
          EXECUTION_TIME=$((BENCHMARK_END - BENCHMARK_START))
          EXECUTION_MINS=$((EXECUTION_TIME / 60))
          EXECUTION_SECS=$((EXECUTION_TIME % 60))
          echo "EXECUTION_TIME=${EXECUTION_MINS}m ${EXECUTION_SECS}s" >> "$GITHUB_OUTPUT"

      # Dump the raw results for debugging, also after a failed step. Skipped
      # when the run was cancelled (e.g. superseded via the concurrency group).
      - name: Show offline_results.json
        if: ${{ !cancelled() }}
        run: |
          echo "--- offline_results.json ---"
          cat offline_results.json || echo "offline_results.json not found"
          echo "-----------------------------"

      # Copies the metrics of the first result entry into step outputs and
      # fails the job when RTFx is not a positive number.
      - name: Extract benchmark metrics with jq
        id: extract
        run: |
          RESULTS_FILE=offline_results.json
          # Fail fast with a readable message instead of a raw jq error.
          if [ ! -f "$RESULTS_FILE" ]; then
            echo "❌ CRITICAL: $RESULTS_FILE not found - benchmark produced no results"
            exit 1
          fi

          # The output is an array, so every metric is read from its first element.
          metric() { jq ".[0].$1" "$RESULTS_FILE"; }

          DER=$(metric der)
          RTF=$(metric rtfx)
          DURATION="1049" # ES2004a duration in seconds
          SPEAKER_COUNT=$(metric detectedSpeakers)
          # Extract detailed timing information
          TOTAL_TIME=$(metric timings.totalProcessingSeconds)
          MODEL_DOWNLOAD_TIME=$(metric timings.modelDownloadSeconds)
          MODEL_COMPILE_TIME=$(metric timings.modelCompilationSeconds)
          AUDIO_LOAD_TIME=$(metric timings.audioLoadingSeconds)
          SEGMENTATION_TIME=$(metric timings.segmentationSeconds)
          EMBEDDING_TIME=$(metric timings.embeddingExtractionSeconds)
          CLUSTERING_TIME=$(metric timings.speakerClusteringSeconds)
          INFERENCE_TIME=$(metric timings.totalInferenceSeconds)

          {
            echo "DER=${DER}"
            echo "RTF=${RTF}"
            echo "DURATION=${DURATION}"
            echo "SPEAKER_COUNT=${SPEAKER_COUNT}"
            echo "TOTAL_TIME=${TOTAL_TIME}"
            echo "MODEL_DOWNLOAD_TIME=${MODEL_DOWNLOAD_TIME}"
            echo "MODEL_COMPILE_TIME=${MODEL_COMPILE_TIME}"
            echo "AUDIO_LOAD_TIME=${AUDIO_LOAD_TIME}"
            echo "SEGMENTATION_TIME=${SEGMENTATION_TIME}"
            echo "EMBEDDING_TIME=${EMBEDDING_TIME}"
            echo "CLUSTERING_TIME=${CLUSTERING_TIME}"
            echo "INFERENCE_TIME=${INFERENCE_TIME}"
          } >> "$GITHUB_OUTPUT"

          # Validate RTFx numerically: a missing key (jq prints "null"), a
          # non-number, or any spelling of zero such as 0.0 means the
          # benchmark failed. A plain string compare against "0" misses those.
          if ! jq -e '.[0].rtfx | (type == "number") and (. > 0)' "$RESULTS_FILE" > /dev/null; then
            echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
            echo "RTFx value: $RTF"
            exit 1
          fi

      # Runs after failures too, so the PR always reflects the latest run, but
      # not after cancellation. Step outputs are handed over as environment
      # variables so their text is never parsed as part of the script.
      - name: Comment PR with Offline Pipeline Results
        if: ${{ !cancelled() }}
        uses: actions/github-script@v7
        env:
          DER: ${{ steps.extract.outputs.DER }}
          RTF: ${{ steps.extract.outputs.RTF }}
          DURATION: ${{ steps.extract.outputs.DURATION }}
          TOTAL_TIME: ${{ steps.extract.outputs.TOTAL_TIME }}
          INFERENCE_TIME: ${{ steps.extract.outputs.INFERENCE_TIME }}
          MODEL_DOWNLOAD_TIME: ${{ steps.extract.outputs.MODEL_DOWNLOAD_TIME }}
          MODEL_COMPILE_TIME: ${{ steps.extract.outputs.MODEL_COMPILE_TIME }}
          AUDIO_LOAD_TIME: ${{ steps.extract.outputs.AUDIO_LOAD_TIME }}
          SEGMENTATION_TIME: ${{ steps.extract.outputs.SEGMENTATION_TIME }}
          EMBEDDING_TIME: ${{ steps.extract.outputs.EMBEDDING_TIME }}
          CLUSTERING_TIME: ${{ steps.extract.outputs.CLUSTERING_TIME }}
          EXECUTION_TIME: ${{ steps.benchmark.outputs.EXECUTION_TIME }}
        with:
          script: |
            // Hidden marker used to find this workflow's comment on later runs.
            const MARKER = '<!-- fluidaudio-offline-pipeline -->';
            const HEADER = '## Offline VBx Pipeline Results';

            // An unset or empty variable parses to NaN, which fmt/pct render as N/A.
            const num = (name) => parseFloat(process.env[name] ?? '');
            const fmt = (value, digits) =>
              Number.isFinite(value) ? value.toFixed(digits) : 'N/A';

            const der = num('DER');
            const rtf = num('RTF');
            const duration = fmt(num('DURATION'), 1);
            const totalTime = num('TOTAL_TIME');
            const inferenceTime = num('INFERENCE_TIME');
            const modelDownloadTime = num('MODEL_DOWNLOAD_TIME');
            const modelCompileTime = num('MODEL_COMPILE_TIME');
            const audioLoadTime = num('AUDIO_LOAD_TIME');
            const segmentationTime = num('SEGMENTATION_TIME');
            const embeddingTime = num('EMBEDDING_TIME');
            const clusteringTime = num('CLUSTERING_TIME');
            const executionTime = process.env.EXECUTION_TIME || 'N/A';

            // Share of the total processing time; guarded against a zero or
            // missing total so the table never shows NaN or Infinity.
            const pct = (seconds) =>
              Number.isFinite(seconds) && totalTime > 0
                ? (seconds / totalTime * 100).toFixed(1)
                : 'N/A';
            const stageRow = (label, seconds, description) =>
              `| ${label} | ${fmt(seconds, 3)} | ${pct(seconds)} | ${description} |\n`;

            // timeZoneName yields EST or EDT as appropriate instead of a fixed label.
            const timestamp = new Date().toLocaleString('en-US', {
              timeZone: 'America/New_York',
              year: 'numeric',
              month: '2-digit',
              day: '2-digit',
              hour: '2-digit',
              minute: '2-digit',
              hour12: true,
              timeZoneName: 'short',
            });

            let body = `${HEADER}\n\n`;
            if (Number.isFinite(der) && Number.isFinite(rtf)) {
              body += '### Speaker Diarization Performance (VBx Batch Mode)\n';
              body += '_Optimal clustering with Hungarian algorithm for maximum accuracy_\n\n';
              body += '| Metric | Value | Target | Status | Description |\n';
              body += '|--------|-------|--------|---------|-------------|\n';
              body += `| **DER** | **${der.toFixed(1)}%** | <20% | ${der < 20 ? '✅' : '⚠️'} | Diarization Error Rate (lower is better) |\n`;
              body += `| **RTFx** | **${rtf.toFixed(2)}x** | >1.0x | ${rtf > 1.0 ? '✅' : '⚠️'} | Real-Time Factor (higher is faster) |\n\n`;
              body += '### Offline VBx Pipeline Timing Breakdown\n';
              body += '_Time spent in each stage of batch diarization_\n\n';
              body += '| Stage | Time (s) | % | Description |\n';
              body += '|-------|----------|---|-------------|\n';
              body += stageRow('Model Download', modelDownloadTime, 'Fetching diarization models');
              body += stageRow('Model Compile', modelCompileTime, 'CoreML compilation');
              body += stageRow('Audio Load', audioLoadTime, 'Loading audio file');
              body += stageRow('Segmentation', segmentationTime, 'VAD + speech detection');
              body += stageRow('Embedding', embeddingTime, 'Speaker embedding extraction');
              body += stageRow('Clustering (VBx)', clusteringTime, 'Hungarian algorithm + VBx clustering');
              body += `| **Total** | **${fmt(totalTime, 3)}** | **100** | **Full VBx pipeline** |\n\n`;
              body += '### Speaker Diarization Research Comparison\n';
              body += '_Offline VBx achieves competitive accuracy with batch processing_\n\n';
              body += '| Method | DER | Mode | Description |\n';
              body += '|--------|-----|------|-------------|\n';
              body += '| **FluidAudio (Offline)** | **' + der.toFixed(1) + '%** | **VBx Batch** | **On-device CoreML with optimal clustering** |\n';
              body += '| FluidAudio (Streaming) | 17.7% | Chunk-based | First-occurrence speaker mapping |\n';
              body += '| Research baseline | 18-30% | Various | Standard dataset performance |\n\n';
              body += '**Pipeline Details**:\n';
              body += '- **Mode**: Offline VBx with Hungarian algorithm for optimal speaker-to-cluster assignment\n';
              body += '- **Segmentation**: VAD-based voice activity detection\n';
              body += '- **Embeddings**: WeSpeaker-compatible speaker embeddings\n';
              body += '- **Clustering**: PowerSet with VBx refinement\n';
              body += '- **Accuracy**: Higher than streaming due to optimal post-hoc mapping\n\n';
              body += `<sub>🎯 **Offline VBx Test** • AMI Corpus ES2004a • ${duration}s meeting audio • ${fmt(inferenceTime, 1)}s processing • Test runtime: ${executionTime} • ${timestamp}</sub>\n\n`;
            } else {
              // The extract step failed or was skipped: report that plainly
              // instead of publishing a table of NaN values.
              const runUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`;
              body += '❌ **Benchmark failed** - no DER/RTFx metrics were produced for this run.\n\n';
              body += `See the [workflow run logs](${runUrl}) for details.\n\n`;
              body += `<sub>Test runtime: ${executionTime} • ${timestamp}</sub>\n\n`;
            }
            // Add hidden identifier for reliable comment detection
            body += MARKER;

            try {
              // Paginate: a single page holds at most 100 comments (30 by
              // default), so on a busy PR the earlier bot comment would be
              // missed and a duplicate posted.
              const comments = await github.paginate(github.rest.issues.listComments, {
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                per_page: 100,
              });

              // Look for the existing offline pipeline comment: authored by a
              // bot and carrying either the hidden marker or the header.
              const existing = comments.find((candidate) => {
                // user can be null for deleted accounts; body may be absent.
                const login = candidate.user?.login ?? '';
                const text = candidate.body ?? '';
                const isBot = candidate.user?.type === 'Bot' || login.includes('[bot]');
                return isBot && (text.includes(MARKER) || text.includes(HEADER));
              });

              if (existing) {
                // Update existing comment
                await github.rest.issues.updateComment({
                  comment_id: existing.id,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body,
                });
                console.log('Successfully updated existing offline pipeline comment');
              } else {
                // Create new comment if none exists
                await github.rest.issues.createComment({
                  issue_number: context.issue.number,
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  body,
                });
                console.log('Successfully posted new offline pipeline results comment');
              }
            } catch (error) {
              // Don't fail the workflow just because commenting failed
              console.error('Failed to update/post comment:', error.message);
            }