Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion Sources/FluidAudio/Diarizer/DiarizerTimeline.swift
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ public class DiarizerTimeline {
private let lock = NSLock()

/// Post-processing configuration
public let config: DiarizerTimelineConfig
public private(set) var config: DiarizerTimelineConfig

/// Finalized frame-wise speaker predictions.
/// Flat array of shape [numFrames, numSpeakers].
Expand Down Expand Up @@ -933,6 +933,21 @@ public class DiarizerTimeline {
}
}

// MARK: - Configuration

/// Overrides whether committed segments are persisted on speakers and returns the
/// value that was in effect before the call.
///
/// Intended as a temporary override: enrollment forces storage on because it must
/// read the detected speaker back from the timeline to map it to a slot. The
/// timeline does not restore the setting on its own; callers are responsible for
/// passing the returned value back in when they are finished.
///
/// - Parameter enabled: The new value for `config.storeSegments`.
/// - Returns: The value of `config.storeSegments` immediately before this call.
@discardableResult
public func setStoreSegments(_ enabled: Bool) -> Bool {
    // Read the old value and write the new one inside a single critical section so
    // the returned "previous" value always matches what this call replaced.
    lock.withLock {
        let previous = config.storeSegments
        config.storeSegments = enabled
        return previous
    }
}

// MARK: - Rebuild Timeline

/// Rebuild the timeline from initial predictions. This is equivalent to reinitializing the timeline.
Expand Down
16 changes: 16 additions & 0 deletions Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,14 @@ public final class LSEENDDiarizer: Diarizer {

let sessionSnapshot = try session.takeSnapshot()
let timelineSnapshot = timeline.takeSnapshot()
let framesFedSnapshot = framesFedToModel

// Enrollment identifies the new speaker by reading it back from the timeline,
// which requires segments to be stored. Force storage on for the duration and
// restore the configured value when finished (the caller may have disabled it).
let storeSegmentsSnapshot = timeline.setStoreSegments(true)
defer { timeline.setStoreSegments(storeSegmentsSnapshot) }

let isNamed = name != nil

let requireNewSpeaker = isNamed && !overwriteAssignedSpeakerName
Expand Down Expand Up @@ -334,6 +342,7 @@ public final class LSEENDDiarizer: Diarizer {
else {
session.rollback(to: sessionSnapshot)
timeline.rollback(to: timelineSnapshot)
framesFedToModel = framesFedSnapshot
return nil
}

Expand All @@ -360,13 +369,20 @@ public final class LSEENDDiarizer: Diarizer {
else {
session.rollback(to: sessionSnapshot)
timeline.rollback(to: timelineSnapshot)
framesFedToModel = framesFedSnapshot
return nil
}

// Rename speaker and report success
enrolledSpeaker.name = name
timeline.reset(keepingSpeakers: true)

// Re-arm the right-context warmup strip for the live stream. The timeline
// origin is now frame 0, but the encoder's convDelay look-ahead still holds
// the enrollment drain silence; resetting the counter makes the first live
// chunk strip those frames so reported timestamps stay aligned to real time.
framesFedToModel = 0

return enrolledSpeaker
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,12 @@ public final class SortformerDiarizer: Diarizer {
throw SortformerError.notInitialized
}

// Enrollment identifies the new speaker by reading it back from the timeline,
// which requires segments to be stored. Force storage on for the duration and
// restore the configured value when finished (the caller may have disabled it).
let storeSegmentsSnapshot = _timeline.setStoreSegments(true)
defer { _timeline.setStoreSegments(storeSegmentsSnapshot) }

if _timeline.hasSegments {
logger.warning("Trying to enroll a speaker while timeline has segments; timeline will be reset")
}
Expand Down
100 changes: 100 additions & 0 deletions Tests/FluidAudioTests/Diarizer/SpeakerEnrollmentTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,38 @@ final class SpeakerEnrollmentTests: XCTestCase {
XCTAssertEqual(namedSpeakerNames(in: diarizer.timeline), ["Alice"])
}

/// Verifies that Sortformer enrollment succeeds on a timeline configured with
/// `storeSegments: false`, and that the flag reads `false` again afterwards.
func testSortformerEnrollmentWorksWhenTimelineDoesNotStoreSegments() async throws {
    XCTExpectFailure("Download might fail in CI environment", strict: false)

    let sortformerConfig = SortformerConfig.default
    let loadedModels = try await loadSortformerModelsForTest(config: sortformerConfig)
    let voiceSample = try DiarizationTestFixtures.fixtureAudio(
        sampleRate: sortformerConfig.sampleRate,
        startSeconds: 0.0,
        durationSeconds: 5.0
    )

    // Sanity pass with the default timeline configuration: if the fixture cannot be
    // enrolled here, this host cannot exercise the scenario and the test is skipped.
    let reference = SortformerDiarizer(config: sortformerConfig)
    reference.initialize(models: loadedModels)
    let referenceSpeaker = try reference.enrollSpeaker(withAudio: voiceSample, named: "Alice")
    try XCTSkipIf(
        referenceSpeaker == nil,
        "Fixture did not produce a confident Sortformer speaker segment on this host."
    )

    // Scenario under test: segment storage is disabled in the timeline config.
    let storageDisabled = DiarizerTimelineConfig(storeSegments: false)
    let subject = SortformerDiarizer(config: sortformerConfig, timelineConfig: storageDisabled)
    subject.initialize(models: loadedModels)
    XCTAssertFalse(subject.timeline.config.storeSegments)

    let enrolledSpeaker = try subject.enrollSpeaker(withAudio: voiceSample, named: "Alice")

    XCTAssertNotNil(
        enrolledSpeaker,
        "Enrollment must work even when the timeline is configured not to store segments"
    )
    XCTAssertEqual(enrolledSpeaker?.name, "Alice")
    XCTAssertFalse(
        subject.timeline.config.storeSegments,
        "store-segments must fall back to its configured value after enrollment"
    )
}

// MARK: - LS-EEND enrollSpeaker: Error Cases

func testLseendEnrollSpeakerThrowsWhenNotInitialized() {
Expand Down Expand Up @@ -407,6 +439,74 @@ final class SpeakerEnrollmentTests: XCTestCase {
}
}

/// Regression test for the right-context (convDelay) offset after enrollment.
///
/// Enrollment resets the visible timeline to frame 0 but must not shift the
/// streaming timeline relative to real audio: a stream that enrolls first and then
/// processes some live audio must finalize exactly as many frames as a stream that
/// processes the same live audio with no enrollment. Before the fix, the enrolled
/// stream skipped the convDelay output strip on its first live chunk and ran
/// convDelay frames long, pushing every reported timestamp later by the
/// right-context lag.
func testLseendEnrollmentDoesNotOffsetStreamingTimeline() async throws {
    let model = try await loadLseendModelForTest()
    let chunkSizes = [977, 1231, 1607]

    let plainStream = try LSEENDDiarizer(model: model)
    let sampleRate = plainStream.targetSampleRate ?? 16_000
    let liveAudio = try DiarizationTestFixtures.fixtureAudio(
        sampleRate: sampleRate,
        startSeconds: 3.0,
        durationSeconds: 3.0
    )

    /// Feeds the shared live audio through `diarizer` chunk by chunk, then finalizes.
    func streamLiveAudio(through diarizer: LSEENDDiarizer) throws {
        for piece in DiarizationTestFixtures.chunk(liveAudio, sizes: chunkSizes) {
            _ = try diarizer.process(samples: piece, sourceSampleRate: nil)
        }
        _ = try diarizer.finalizeSession()
    }

    // Baseline: live audio only, no enrollment.
    try streamLiveAudio(through: plainStream)
    let baselineFrames = plainStream.timeline.numFinalizedFrames
    XCTAssertGreaterThan(baselineFrames, 0)

    // Enrolled: enroll a speaker first, then stream the identical live audio.
    let enrolledStream = try LSEENDDiarizer(model: model)
    let enrollmentAudio = try DiarizationTestFixtures.fixtureAudio(
        sampleRate: sampleRate,
        startSeconds: 0.0,
        durationSeconds: 3.0
    )
    _ = try enrolledStream.enrollSpeaker(
        withAudio: enrollmentAudio,
        sourceSampleRate: nil,
        named: "Alice"
    )
    try streamLiveAudio(through: enrolledStream)

    XCTAssertEqual(
        enrolledStream.timeline.numFinalizedFrames,
        baselineFrames,
        "Enrollment must not offset the streaming timeline by the right-context lag"
    )
}

/// Verifies that LS-EEND enrollment succeeds on a timeline configured with
/// `storeSegments: false`, and that the flag reads `false` again afterwards.
func testLseendEnrollmentWorksWhenTimelineDoesNotStoreSegments() async throws {
    let model = try await loadLseendModelForTest()

    // Sanity pass with the default timeline configuration: if the fixture cannot be
    // enrolled here, this host cannot exercise the scenario and the test is skipped.
    let reference = try LSEENDDiarizer(model: model)
    let sampleRate = reference.targetSampleRate ?? 16_000
    let voiceSample = try DiarizationTestFixtures.fixtureAudio(
        sampleRate: sampleRate,
        startSeconds: 0.0,
        durationSeconds: 3.0
    )
    let referenceSpeaker = try reference.enrollSpeaker(withAudio: voiceSample, named: "Alice")
    try XCTSkipIf(
        referenceSpeaker == nil,
        "Fixture did not produce a confident LS-EEND speaker segment on this host."
    )

    // Scenario under test: segment storage is disabled in the timeline config.
    let subject = try LSEENDDiarizer(
        model: model,
        timelineConfig: DiarizerTimelineConfig(storeSegments: false)
    )
    XCTAssertFalse(subject.timeline.config.storeSegments)

    let enrolledSpeaker = try subject.enrollSpeaker(withAudio: voiceSample, named: "Alice")

    XCTAssertNotNil(
        enrolledSpeaker,
        "Enrollment must work even when the timeline is configured not to store segments"
    )
    XCTAssertEqual(enrolledSpeaker?.name, "Alice")
    XCTAssertFalse(
        subject.timeline.config.storeSegments,
        "store-segments must fall back to its configured value after enrollment"
    )
}

func testLseendMultipleEnrollmentsRetainNamedSpeakersAndSession() async throws {
let model = try await loadLseendModelForTest()
let diarizer = try LSEENDDiarizer(model: model)
Expand Down
Loading