@@ -1,5 +1,5 @@
import { z } from "zod";
-import { TrackSchema } from "./track";
+import { TrackSchema } from "../track";

export const CaptionsSchema = z.object({
	// The MoQ track information.
@@ -1,7 +1,8 @@
import { z } from "zod";
+import { u53Schema } from "../integers";
+import { TrackSchema } from "../track";
import { CaptionsSchema } from "./captions";
-import { u53Schema } from "./integers";
-import { TrackSchema } from "./track";
+import { SpeakingSchema } from "./speaking";

// Mirrors AudioDecoderConfig
// https://w3c.github.io/webcodecs/#audio-decoder-config
@@ -34,6 +35,9 @@ export const AudioSchema = z.object({

	// An optional captions track
	captions: CaptionsSchema.optional(),

+	// An optional speaking track
+	speaking: SpeakingSchema.optional(),
});

export type Audio = z.infer<typeof AudioSchema>;
9 changes: 9 additions & 0 deletions js/hang/src/catalog/audio/speaking.ts
@@ -0,0 +1,9 @@
import { z } from "zod";
import { TrackSchema } from "../track";

export const SpeakingSchema = z.object({
	// The MoQ track information.
	track: TrackSchema,
});

export type Speaking = z.infer<typeof SpeakingSchema>;
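The new `speaking` entry is optional on `AudioSchema`, so catalogs from older publishers still validate and consumers simply see `undefined`. A minimal sketch of reading it on the subscriber side — the import path, the JSON source, and everything around the check are assumptions, not part of this diff:

```ts
import { AudioSchema, type Audio } from "../catalog";

// Parse a received audio catalog entry and surface the optional speaking track.
function parseAudio(json: unknown): Audio | undefined {
	const parsed = AudioSchema.safeParse(json);
	if (!parsed.success) return undefined;

	if (parsed.data.speaking) {
		// The publisher advertises a speaking track; its MoQ track info lives here.
		console.log("speaking track:", parsed.data.speaking.track);
	}

	return parsed.data;
}
```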
3 changes: 2 additions & 1 deletion js/hang/src/catalog/index.ts
@@ -1,6 +1,7 @@
export * from "./audio";
export * from "./audio/captions";
export * from "./audio/speaking";
export * from "./capabilities";
export * from "./captions";
export * from "./chat";
export * from "./detection";
export * from "./integers";
70 changes: 70 additions & 0 deletions js/hang/src/container/bool.ts
@@ -0,0 +1,70 @@
import type { GroupConsumer, GroupProducer, TrackConsumer, TrackProducer } from "@kixelated/moq";

// Creates a track that writes a frame on true and closes the group on false.
export class BoolProducer {
	track: TrackProducer;
	#group?: GroupProducer;

	constructor(track: TrackProducer) {
		this.track = track;
	}

	write(value: boolean) {
		if (value) {
			if (this.#group) return; // noop
			this.#group = this.track.appendGroup();
			this.#group.writeFrame(new Uint8Array([1]));
		} else {
			if (!this.#group) return; // noop
			this.#group.close();
			this.#group = undefined;
		}
	}

	clone() {
		return new BoolProducer(this.track);
	}

	close() {
		this.track.close();
		this.#group?.close();
		this.#group = undefined;
	}
}

export class BoolConsumer {
	track: TrackConsumer;
	#group?: GroupConsumer;

	constructor(track: TrackConsumer) {
		this.track = track;
	}

	// Resolves true when a new group starts, false when the current group closes,
	// and undefined once the track itself ends.
	async next(): Promise<boolean | undefined> {
		for (;;) {
			if (!this.#group) {
				const group = await this.track.nextGroup();
				if (!group) return undefined;

				this.#group = group;
				return true;
			}

			const group = await Promise.race([this.track.nextGroup(), this.#group.closed()]);
			if (group) {
				this.#group = group;
				continue;
			}

			this.#group.close();
			this.#group = undefined;
			return false;
		}
	}

	close() {
		this.track.close();
		this.#group?.close();
		this.#group = undefined;
	}
}
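`BoolProducer`/`BoolConsumer` encode a boolean as group lifetime: writing `true` opens a group containing a single frame, writing `false` closes it, and repeated identical values are no-ops. A rough usage sketch — obtaining the underlying MoQ `TrackProducer`/`TrackConsumer` (and the import paths) is outside this diff, so those are assumed:

```ts
import type { TrackConsumer, TrackProducer } from "@kixelated/moq";
import { BoolConsumer, BoolProducer } from "./bool";

// Publisher side: wrap the speaking track once, then mirror the VAD state onto it.
function publishSpeaking(track: TrackProducer): (speaking: boolean) => void {
	const producer = new BoolProducer(track);
	return (speaking) => producer.write(speaking);
}

// Subscriber side: next() yields true when a group opens, false when it closes,
// and undefined once the track ends.
async function watchSpeaking(track: TrackConsumer, onChange: (speaking: boolean) => void) {
	const consumer = new BoolConsumer(track);
	for (;;) {
		const speaking = await consumer.next();
		if (speaking === undefined) return;
		onChange(speaking);
	}
}
```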
166 changes: 32 additions & 134 deletions js/hang/src/publish/audio/captions-worker.ts
@@ -1,28 +1,21 @@
-import {
-	AutoModel,
-	type AutomaticSpeechRecognitionPipeline,
-	type PreTrainedModel,
-	pipeline,
-	Tensor,
-} from "@huggingface/transformers";
+import { type AutomaticSpeechRecognitionPipeline, pipeline } from "@huggingface/transformers";
+import type { AudioFrame } from "./capture";

-export type Request = Init;
+export type Request = Init | Speaking;

export interface Init {
	type: "init";

-	// Receive "speaking" audio directly from the VAD worker.
-	// TODO strongly type this, receives Speaking and NotSpeaking.
+	// Captured audio from the worklet.
	worklet: MessagePort;
}

-export type Result = Speaking | Text | Error;

export interface Speaking {
	type: "speaking";
	speaking: boolean;
}

+export type Result = Text | Error;

export interface Text {
	type: "text";
	text: string;
@@ -35,121 +28,6 @@ export interface Error {

const SAMPLE_RATE = 16000;

-// This VAD model expects 512 samples at a time, or 31ms
-const VAD_CHUNK_SIZE = 512;
-
-// Require 8 silence chunks to be detected before we consider the user done speaking.
-const VAD_SILENCE_PADDING = 8;
-
-class Vad {
-	whisper: Whisper;
-
-	// Simple circular buffer, primarily so we keep the previous buffer around once speaking is detected.
-	#next = new Float32Array(new ArrayBuffer(Float32Array.BYTES_PER_ELEMENT * VAD_CHUNK_SIZE), 0, 0); // queued
-	#current = new Float32Array(new ArrayBuffer(Float32Array.BYTES_PER_ELEMENT * VAD_CHUNK_SIZE), 0, 0); // being processed
-	#prev = new Float32Array(new ArrayBuffer(Float32Array.BYTES_PER_ELEMENT * VAD_CHUNK_SIZE), 0, 0); // already processed
-
-	#processing = false;
-
-	// Initial state for VAD
-	#sr = new Tensor("int64", [SAMPLE_RATE], []);
-	#state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
-	#speaking = false;
-
-	// Count the number of silence results, if we get 3 in a row then we're done.
-	#silence = 0;
-
-	#model: Promise<PreTrainedModel>;
-
-	constructor(whisper: Whisper) {
-		this.whisper = whisper;
-
-		this.#model = AutoModel.from_pretrained("onnx-community/silero-vad", {
-			// @ts-expect-error Not sure why this is needed.
-			config: { model_type: "custom" },
-			dtype: "fp32", // Full-precision
-		});
-	}
-
-	write(samples: Float32Array) {
-		if (this.#next.byteLength >= this.#next.buffer.byteLength) {
-			if (!this.flush()) {
-				// Drop the sample if VAD is still processing.
-				return;
-			}
-		}
-
-		this.#next = new Float32Array(this.#next.buffer, 0, this.#next.length + samples.length);
-		this.#next.set(samples, this.#next.length - samples.length);
-
-		if (this.#next.byteLength === this.#next.buffer.byteLength) {
-			this.flush(); // don't care if it fails
-		}
-	}
-
-	flush(): boolean {
-		if (this.#processing) {
-			return false;
-		}
-
-		this.#processing = true;
-
-		this.#current = this.#next;
-		this.#next = new Float32Array(this.#prev.buffer, 0, 0);
-		this.#prev = this.#current;
-
-		this.#flush().finally(() => {
-			this.#processing = false;
-		});
-
-		return true;
-	}
-
-	async #flush() {
-		const model = await this.#model;
-
-		const input = new Tensor("float32", this.#current, [1, this.#current.length]);
-		const result = await model({ input, sr: this.#sr, state: this.#state });
-		this.#state = result.stateN;
-
-		const wasSpeaking = this.#speaking;
-
-		const isSpeech = result.output.data[0];
-		if (this.#speaking && isSpeech < 0.3) {
-			this.#silence++;
-
-			if (this.#silence > VAD_SILENCE_PADDING) {
-				this.#speaking = false;
-
-				postResult({
-					type: "speaking",
-					speaking: false,
-				});
-			}
-		} else if (!this.#speaking && isSpeech >= 0.1) {
-			this.#speaking = true;
-			this.#silence = 0;
-
-			postResult({
-				type: "speaking",
-				speaking: true,
-			});
-		}
-
-		if (!wasSpeaking && this.#speaking) {
-			this.whisper.write(this.#prev);
-		}
-
-		if (wasSpeaking || this.#speaking) {
-			this.whisper.write(this.#current);
-		}
-
-		if (wasSpeaking && !this.#speaking) {
-			this.whisper.flush();
-		}
-	}
-}
-
const MAX_WHISPER_BUFFER = 15 * SAMPLE_RATE; // 15 seconds

class Whisper {
@@ -158,6 +36,7 @@ class Whisper {

	#processing = false;

+	#speaking = false;
	#model: Promise<AutomaticSpeechRecognitionPipeline>;

	constructor() {
@@ -188,7 +67,12 @@
			}
		}

-		this.#queued = new Float32Array(this.#queued.buffer, 0, this.#queued.length + samples.length);
+		// Determine how many samples to keep.
+		// If we're not speaking, only keep the previous chunk.
+		// TODO add a constant to keep more.
+		const keep = this.#speaking ? this.#queued.length : 0;

+		this.#queued = new Float32Array(this.#queued.buffer, 0, keep + samples.length);
		this.#queued.set(samples, this.#queued.length - samples.length);
	}
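The new `keep` logic means the queue never grows while the speaker is silent: each incoming chunk replaces the previous one, so Whisper still gets a small pre-roll of audio once speech starts. The same append behaviour as a standalone sketch (plain typed-array math, independent of the worker):

```ts
// Append samples into the fixed backing buffer, keeping existing data only while speaking.
function append(queued: Float32Array, samples: Float32Array, speaking: boolean): Float32Array {
	const keep = speaking ? queued.length : 0;
	const next = new Float32Array(queued.buffer, 0, keep + samples.length);
	next.set(samples, next.length - samples.length);
	return next;
}
```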

@@ -227,16 +111,30 @@
			text,
		});
	}

+	set speaking(speaking: boolean) {
+		if (this.#speaking === speaking) return;
+
+		this.#speaking = speaking;
+
+		if (!speaking) {
+			this.flush();
+		}
+	}
}

+const whisper = new Whisper();
+
self.addEventListener("message", async (event: MessageEvent<Request>) => {
	const message = event.data;
-	const whisper = new Whisper();
-	const vad = new Vad(whisper);

-	message.worklet.onmessage = ({ data: samples }: MessageEvent<Float32Array>) => {
-		vad.write(samples);
-	};
+	if (message.type === "init") {
+		message.worklet.onmessage = ({ data: { channels } }: MessageEvent<AudioFrame>) => {
+			whisper.write(channels[0]);
+		};
+	} else if (message.type === "speaking") {
+		whisper.speaking = message.speaking;
+	}
});

function postResult(msg: Result) {
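With the Silero VAD gone from this worker, speaking state now arrives as a message and the worklet port delivers `AudioFrame`s straight to Whisper. A sketch of the main-thread side — the worker construction, the capture worklet, and whatever VAD produces the speaking flag all live elsewhere, so `worker` and `workletPort` are assumptions:

```ts
import type { Request, Result } from "./captions-worker";

// Assumed to exist: the captions worker instance and the capture worklet's port.
declare const worker: Worker;
declare const workletPort: MessagePort;

// Hand the worklet port to the worker; audio frames flow over it directly.
const init: Request = { type: "init", worklet: workletPort };
worker.postMessage(init, [workletPort]);

// Forward VAD decisions; the worker flushes Whisper when speaking stops.
export function setSpeaking(speaking: boolean) {
	const msg: Request = { type: "speaking", speaking };
	worker.postMessage(msg);
}

// Transcribed text (or errors) comes back as Result messages.
worker.addEventListener("message", (event: MessageEvent<Result>) => {
	if (event.data.type === "text") {
		console.log("caption:", event.data.text);
	}
});
```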
14 changes: 0 additions & 14 deletions js/hang/src/publish/audio/captions-worklet.ts

This file was deleted.
