fix(sortformer): consume BNNS-fixed v3 models + config-mismatch guard (#726) #1937

Workflow file for this run

.github/workflows/offline-pipeline.yml at 08e488d

	# Workflow header: trigger, concurrency policy, and the single benchmark job.
	name: Offline Speaker diarization pipeline

	# Runs for pull requests that target main, when they are opened, updated
	# with new commits, or reopened.
	on:
	  pull_request:
	    branches:
	      - main
	    types:
	      - opened
	      - synchronize
	      - reopened

	# One concurrency group per workflow + ref; a newer run in the same group
	# cancels the one still in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	jobs:
	  benchmark:
	    runs-on: macos-15
	    # Least-privilege token: read the repository, write PR comments
	    # (the last step of this job creates or updates a comment on the PR).
	    permissions:
	      contents: read
	      pull-requests: write

	    steps:
	# Fetch the sources of the commit under test.
	- name: Checkout code
	  uses: actions/checkout@v5

	# Install the Swift toolchain. The version is quoted so YAML keeps it a
	# string instead of parsing it as a float.
	- uses: swift-actions/setup-swift@v2
	  with:
	    swift-version: '6.1'

	# Build in the release configuration, the same configuration the
	# benchmark step passes to `swift run`.
	- name: Build package
	  run: swift build -c release

	# Runs the offline diarization benchmark on ES2004a and publishes two step
	# outputs: SUCCESS (true/false) and EXECUTION_TIME ("<m>m <s>s").
	# The step fails when the CLI exits non-zero or writes no results file.
	- name: Run Offline Pipeline Benchmark
	  id: benchmark
	  timeout-minutes: 30
	  run: |
	    echo "Running offline VBx pipeline benchmark..."

	    # Record start time
	    BENCHMARK_START=$(date +%s)

	    # The default shell runs with `-e`, so an unguarded failure here would
	    # abort the script before any output is written. Capture the status
	    # instead and decide at the end.
	    BENCHMARK_STATUS=0
	    swift run -c release fluidaudiocli diarization-benchmark --mode offline --auto-download --single-file ES2004a --output offline_results.json || BENCHMARK_STATUS=$?

	    # Calculate execution time (reported even when the benchmark failed)
	    BENCHMARK_END=$(date +%s)
	    EXECUTION_TIME=$((BENCHMARK_END - BENCHMARK_START))
	    EXECUTION_MINS=$((EXECUTION_TIME / 60))
	    EXECUTION_SECS=$((EXECUTION_TIME % 60))

	    echo "EXECUTION_TIME=${EXECUTION_MINS}m ${EXECUTION_SECS}s" >> "$GITHUB_OUTPUT"

	    # A non-zero exit from the CLI is a failure regardless of leftovers on disk
	    if [ "$BENCHMARK_STATUS" -ne 0 ]; then
	      echo "Benchmark failed - fluidaudiocli exited with status ${BENCHMARK_STATUS}"
	      echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
	      exit "$BENCHMARK_STATUS"
	    fi

	    # Check if results file was generated
	    if [ ! -f offline_results.json ]; then
	      echo "Benchmark failed - no results file generated"
	      echo "SUCCESS=false" >> "$GITHUB_OUTPUT"
	      exit 1
	    fi

	    echo "SUCCESS=true" >> "$GITHUB_OUTPUT"

	# Prints the raw results file into the job log. Runs even after a failed
	# step (`always()`), and falls back to a notice when `cat` fails.
	- name: Show offline_results.json
	  if: always()
	  run: |
	    printf '%s\n' "--- offline_results.json ---"
	    cat offline_results.json || printf '%s\n' "offline_results.json not found"
	    printf '%s\n' "-----------------------------"

	# Reads the benchmark metrics from offline_results.json and republishes
	# them as step outputs (DER, RTF, DURATION, SPEAKER_COUNT and the *_TIME
	# timings). Fails the step when the file is missing/empty or when RTFx is
	# not a positive number.
	- name: Extract benchmark metrics with jq
	  id: extract
	  run: |
	    RESULTS_FILE=offline_results.json
	    DURATION="1049" # ES2004a duration in seconds

	    # Give a readable error instead of a raw jq failure when there is nothing to parse
	    if [ ! -s "$RESULTS_FILE" ]; then
	      echo "❌ CRITICAL: ${RESULTS_FILE} is missing or empty - benchmark failed"
	      exit 1
	    fi

	    # The output is an array, so every field is read from its first element.
	    # `// empty` makes a missing or null field come back as an empty string
	    # rather than the literal text "null".
	    metric() {
	      jq -r ".[0].${1} // empty" "$RESULTS_FILE"
	    }

	    DER=$(metric der)
	    RTF=$(metric rtfx)
	    SPEAKER_COUNT=$(metric detectedSpeakers)

	    # Extract detailed timing information
	    TOTAL_TIME=$(metric timings.totalProcessingSeconds)
	    MODEL_DOWNLOAD_TIME=$(metric timings.modelDownloadSeconds)
	    MODEL_COMPILE_TIME=$(metric timings.modelCompilationSeconds)
	    AUDIO_LOAD_TIME=$(metric timings.audioLoadingSeconds)
	    SEGMENTATION_TIME=$(metric timings.segmentationSeconds)
	    EMBEDDING_TIME=$(metric timings.embeddingExtractionSeconds)
	    CLUSTERING_TIME=$(metric timings.speakerClusteringSeconds)
	    INFERENCE_TIME=$(metric timings.totalInferenceSeconds)

	    # Publish before validating so later steps can still report whatever was measured
	    {
	      echo "DER=${DER}"
	      echo "RTF=${RTF}"
	      echo "DURATION=${DURATION}"
	      echo "SPEAKER_COUNT=${SPEAKER_COUNT}"
	      echo "TOTAL_TIME=${TOTAL_TIME}"
	      echo "MODEL_DOWNLOAD_TIME=${MODEL_DOWNLOAD_TIME}"
	      echo "MODEL_COMPILE_TIME=${MODEL_COMPILE_TIME}"
	      echo "AUDIO_LOAD_TIME=${AUDIO_LOAD_TIME}"
	      echo "SEGMENTATION_TIME=${SEGMENTATION_TIME}"
	      echo "EMBEDDING_TIME=${EMBEDDING_TIME}"
	      echo "CLUSTERING_TIME=${CLUSTERING_TIME}"
	      echo "INFERENCE_TIME=${INFERENCE_TIME}"
	    } >> "$GITHUB_OUTPUT"

	    # Validate RTFx numerically: a missing, null, non-numeric, zero ("0" or
	    # "0.0") or negative value all indicate a failed benchmark.
	    if ! jq -e '.[0].rtfx | (type == "number" and . > 0)' "$RESULTS_FILE" > /dev/null; then
	      echo "❌ CRITICAL: RTFx is missing, zero, or not a positive number - benchmark failed"
	      echo "RTFx value: ${RTF:-<missing>}"
	      exit 1
	    fi

	# Posts (or updates in place) a PR comment summarising the benchmark.
	# Runs even when earlier steps failed; metrics that were not produced are
	# rendered as "N/A" instead of "NaN". Step outputs are handed to the script
	# through environment variables rather than being spliced into its source,
	# so their content can never be interpreted as JavaScript.
	- name: Comment PR with Offline Pipeline Results
	  if: always()
	  uses: actions/github-script@v7
	  env:
	    DER: ${{ steps.extract.outputs.DER }}
	    RTF: ${{ steps.extract.outputs.RTF }}
	    DURATION: ${{ steps.extract.outputs.DURATION }}
	    TOTAL_TIME: ${{ steps.extract.outputs.TOTAL_TIME }}
	    INFERENCE_TIME: ${{ steps.extract.outputs.INFERENCE_TIME }}
	    MODEL_DOWNLOAD_TIME: ${{ steps.extract.outputs.MODEL_DOWNLOAD_TIME }}
	    MODEL_COMPILE_TIME: ${{ steps.extract.outputs.MODEL_COMPILE_TIME }}
	    AUDIO_LOAD_TIME: ${{ steps.extract.outputs.AUDIO_LOAD_TIME }}
	    SEGMENTATION_TIME: ${{ steps.extract.outputs.SEGMENTATION_TIME }}
	    EMBEDDING_TIME: ${{ steps.extract.outputs.EMBEDDING_TIME }}
	    CLUSTERING_TIME: ${{ steps.extract.outputs.CLUSTERING_TIME }}
	    EXECUTION_TIME: ${{ steps.benchmark.outputs.EXECUTION_TIME }}
	  with:
	    script: |
	      const MARKER = '<!-- fluidaudio-offline-pipeline -->';
	      const HEADER = '## Offline VBx Pipeline Results';

	      // Reads a numeric metric from the environment; NaN when absent or unparsable.
	      const metric = (name) => parseFloat(process.env[name] ?? '');

	      // Formats a number with a fixed number of decimals, or 'N/A' when it is not finite.
	      const fixed = (value, digits, suffix = '') =>
	        Number.isFinite(value) ? value.toFixed(digits) + suffix : 'N/A';

	      // Percentage of `part` in `total`, or 'N/A' when either is unusable.
	      const share = (part, total) =>
	        Number.isFinite(part) && Number.isFinite(total) && total > 0
	          ? (part / total * 100).toFixed(1)
	          : 'N/A';

	      // Status icon: ❌ when the metric is missing, otherwise pass/warn.
	      const status = (value, passed) =>
	        !Number.isFinite(value) ? '❌' : passed ? '✅' : '⚠️';

	      const der = metric('DER');
	      const rtf = metric('RTF');
	      const duration = fixed(metric('DURATION'), 1, 's');
	      const totalTime = metric('TOTAL_TIME');
	      const inferenceTime = metric('INFERENCE_TIME');
	      const executionTime = process.env.EXECUTION_TIME || 'N/A';

	      const stages = [
	        ['Model Download', metric('MODEL_DOWNLOAD_TIME'), 'Fetching diarization models'],
	        ['Model Compile', metric('MODEL_COMPILE_TIME'), 'CoreML compilation'],
	        ['Audio Load', metric('AUDIO_LOAD_TIME'), 'Loading audio file'],
	        ['Segmentation', metric('SEGMENTATION_TIME'), 'VAD + speech detection'],
	        ['Embedding', metric('EMBEDDING_TIME'), 'Speaker embedding extraction'],
	        ['Clustering (VBx)', metric('CLUSTERING_TIME'), 'Hungarian algorithm + VBx clustering'],
	      ];

	      // Let the runtime pick EST/EDT instead of hard-coding "EST".
	      const timestamp = new Date().toLocaleString('en-US', {
	        timeZone: 'America/New_York',
	        year: 'numeric',
	        month: '2-digit',
	        day: '2-digit',
	        hour: '2-digit',
	        minute: '2-digit',
	        hour12: true,
	        timeZoneName: 'short',
	      });

	      let comment = `${HEADER}\n\n`;

	      if (!Number.isFinite(der) || !Number.isFinite(rtf)) {
	        comment += '> ❌ The benchmark did not produce usable metrics for this run; missing values are shown as N/A. See the workflow logs for details.\n\n';
	      }

	      comment += '### Speaker Diarization Performance (VBx Batch Mode)\n';
	      comment += '_Optimal clustering with Hungarian algorithm for maximum accuracy_\n\n';
	      comment += '| Metric | Value | Target | Status | Description |\n';
	      comment += '|--------|-------|--------|---------|-------------|\n';
	      comment += `| DER | ${fixed(der, 1, '%')} | <20% | ${status(der, der < 20)} | Diarization Error Rate (lower is better) |\n`;
	      comment += `| RTFx | ${fixed(rtf, 2, 'x')} | >1.0x | ${status(rtf, rtf > 1.0)} | Real-Time Factor (higher is faster) |\n\n`;

	      comment += '### Offline VBx Pipeline Timing Breakdown\n';
	      comment += '_Time spent in each stage of batch diarization_\n\n';
	      comment += '| Stage | Time (s) | % | Description |\n';
	      comment += '|-------|----------|---|-------------|\n';
	      for (const [label, seconds, description] of stages) {
	        comment += `| ${label} | ${fixed(seconds, 3)} | ${share(seconds, totalTime)} | ${description} |\n`;
	      }
	      comment += `| Total | ${fixed(totalTime, 3)} | 100 | Full VBx pipeline |\n\n`;

	      comment += '### Speaker Diarization Research Comparison\n';
	      comment += '_Offline VBx achieves competitive accuracy with batch processing_\n\n';
	      comment += '| Method | DER | Mode | Description |\n';
	      comment += '|--------|-----|------|-------------|\n';
	      comment += `| FluidAudio (Offline) | ${fixed(der, 1, '%')} | VBx Batch | On-device CoreML with optimal clustering |\n`;
	      comment += '| FluidAudio (Streaming) | 17.7% | Chunk-based | First-occurrence speaker mapping |\n';
	      comment += '| Research baseline | 18-30% | Various | Standard dataset performance |\n\n';

	      comment += 'Pipeline Details:\n';
	      comment += '- Mode: Offline VBx with Hungarian algorithm for optimal speaker-to-cluster assignment\n';
	      comment += '- Segmentation: VAD-based voice activity detection\n';
	      comment += '- Embeddings: WeSpeaker-compatible speaker embeddings\n';
	      comment += '- Clustering: PowerSet with VBx refinement\n';
	      comment += '- Accuracy: Higher than streaming due to optimal post-hoc mapping\n\n';

	      comment += `<sub>🎯 Offline VBx Test • AMI Corpus ES2004a • ${duration} meeting audio • ${fixed(inferenceTime, 1, 's')} processing • Test runtime: ${executionTime} • ${timestamp}</sub>\n\n`;

	      // Add hidden identifier for reliable comment detection
	      comment += MARKER;

	      try {
	        // Walk every page of comments: a single listComments call returns
	        // only the first page, so on a busy PR the previous benchmark
	        // comment would be missed and a duplicate posted.
	        const existingComments = await github.paginate(github.rest.issues.listComments, {
	          issue_number: context.issue.number,
	          owner: context.repo.owner,
	          repo: context.repo.repo,
	          per_page: 100,
	        });

	        // Look for an existing offline pipeline comment written by a bot
	        // (identified by the hidden tag, or by the header for older comments)
	        const previous = existingComments.find((candidate) => {
	          const login = candidate.user?.login ?? '';
	          const isBot = candidate.user?.type === 'Bot' || login.includes('[bot]');
	          const body = candidate.body ?? '';
	          return isBot && (body.includes(MARKER) || body.includes(HEADER));
	        });

	        if (previous) {
	          // Update existing comment
	          await github.rest.issues.updateComment({
	            comment_id: previous.id,
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            body: comment,
	          });
	          console.log('Successfully updated existing offline pipeline comment');
	        } else {
	          // Create new comment if none exists
	          await github.rest.issues.createComment({
	            issue_number: context.issue.number,
	            owner: context.repo.owner,
	            repo: context.repo.repo,
	            body: comment,
	          });
	          console.log('Successfully posted new offline pipeline results comment');
	        }
	      } catch (error) {
	        console.error('Failed to update/post comment:', error.message);
	        // Don't fail the workflow just because commenting failed
	      }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

fix(sortformer): consume BNNS-fixed v3 models + config-mismatch guard (#726) #1937

Workflow file

fix(sortformer): consume BNNS-fixed v3 models + config-mismatch guard (#726) #1937

Uh oh!

Workflow file for this run