@@ -1,5 +1,5 @@
import { z } from "zod";
-import { TrackSchema } from "./track";
+import { TrackSchema } from "../track";

export const CaptionsSchema = z.object({
	// The MoQ track information.
@@ -1,7 +1,8 @@
import { z } from "zod";
+import { u53Schema } from "../integers";
+import { TrackSchema } from "../track";
import { CaptionsSchema } from "./captions";
-import { u53Schema } from "./integers";
-import { TrackSchema } from "./track";
+import { SpeakingSchema } from "./speaking";

// Mirrors AudioDecoderConfig
// https://w3c.github.io/webcodecs/#audio-decoder-config
@@ -34,6 +35,9 @@ export const AudioSchema = z.object({

	// An optional captions track
	captions: CaptionsSchema.optional(),

+	// An optional speaking track
+	speaking: SpeakingSchema.optional(),
});

export type Audio = z.infer<typeof AudioSchema>;
9 changes: 9 additions & 0 deletions js/hang/src/catalog/audio/speaking.ts
@@ -0,0 +1,9 @@
import { z } from "zod";
import { TrackSchema } from "../track";

export const SpeakingSchema = z.object({
	// The MoQ track information.
	track: TrackSchema,
});

export type Speaking = z.infer<typeof SpeakingSchema>;
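The new `speaking` entry is optional on `AudioSchema`, so catalogs from older publishers still validate and consumers simply see `undefined`. A minimal sketch of reading it on the subscriber side — the import path, the JSON source, and everything around the check are assumptions, not part of this diff:

```ts
import { AudioSchema, type Audio } from "../catalog";

// Parse a received audio catalog entry and surface the optional speaking track.
function parseAudio(json: unknown): Audio | undefined {
	const parsed = AudioSchema.safeParse(json);
	if (!parsed.success) return undefined;

	if (parsed.data.speaking) {
		// The publisher advertises a speaking track; its MoQ track info lives here.
		console.log("speaking track:", parsed.data.speaking.track);
	}

	return parsed.data;
}
```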
3 changes: 2 additions & 1 deletion js/hang/src/catalog/index.ts
@@ -1,6 +1,7 @@
export * from "./audio";
export * from "./audio/captions";
export * from "./audio/speaking";
export * from "./capabilities";
export * from "./captions";
export * from "./chat";
export * from "./detection";
export * from "./integers";
70 changes: 70 additions & 0 deletions js/hang/src/container/bool.ts
@@ -0,0 +1,70 @@
import type { GroupConsumer, GroupProducer, TrackConsumer, TrackProducer } from "@kixelated/moq";

// Creates a track that writes a frame on true and closes the group on false.
export class BoolProducer {
	track: TrackProducer;
	#group?: GroupProducer;

	constructor(track: TrackProducer) {
		this.track = track;
	}

	write(value: boolean) {
		if (value) {
			if (this.#group) return; // noop
			this.#group = this.track.appendGroup();
			this.#group.writeFrame(new Uint8Array([1]));
		} else {
			if (!this.#group) return; // noop
			this.#group.close();
			this.#group = undefined;
		}
	}

	clone() {
		return new BoolProducer(this.track);
	}

	close() {
		this.track.close();
		this.#group?.close();
		this.#group = undefined;
	}
}

export class BoolConsumer {
	track: TrackConsumer;
	#group?: GroupConsumer;

	constructor(track: TrackConsumer) {
		this.track = track;
	}

	// Resolves true when a new group starts, false when the current group closes,
	// and undefined once the track itself ends.
	async next(): Promise<boolean | undefined> {
		for (;;) {
			if (!this.#group) {
				const group = await this.track.nextGroup();
				if (!group) return undefined;

				this.#group = group;
				return true;
			}

			const group = await Promise.race([this.track.nextGroup(), this.#group.closed()]);
			if (group) {
				this.#group = group;
				continue;
			}

			this.#group.close();
			this.#group = undefined;
			return false;
		}
	}

	close() {
		this.track.close();
		this.#group?.close();
		this.#group = undefined;
	}
}
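`BoolProducer`/`BoolConsumer` encode a boolean as group lifetime: writing `true` opens a group containing a single frame, writing `false` closes it, and repeated identical values are no-ops. A rough usage sketch — obtaining the underlying MoQ `TrackProducer`/`TrackConsumer` (and the import paths) is outside this diff, so those are assumed:

```ts
import type { TrackConsumer, TrackProducer } from "@kixelated/moq";
import { BoolConsumer, BoolProducer } from "./bool";

// Publisher side: wrap the speaking track once, then mirror the VAD state onto it.
function publishSpeaking(track: TrackProducer): (speaking: boolean) => void {
	const producer = new BoolProducer(track);
	return (speaking) => producer.write(speaking);
}

// Subscriber side: next() yields true when a group opens, false when it closes,
// and undefined once the track ends.
async function watchSpeaking(track: TrackConsumer, onChange: (speaking: boolean) => void) {
	const consumer = new BoolConsumer(track);
	for (;;) {
		const speaking = await consumer.next();
		if (speaking === undefined) return;
		onChange(speaking);
	}
}
```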
166 changes: 32 additions & 134 deletions js/hang/src/publish/audio/captions-worker.ts
@@ -1,28 +1,21 @@
-import {
-	AutoModel,
-	type AutomaticSpeechRecognitionPipeline,
-	type PreTrainedModel,
-	pipeline,
-	Tensor,
-} from "@huggingface/transformers";
+import { type AutomaticSpeechRecognitionPipeline, pipeline } from "@huggingface/transformers";
+import type { AudioFrame } from "./capture";

-export type Request = Init;
+export type Request = Init | Speaking;

export interface Init {
	type: "init";

-	// Receive "speaking" audio directly from the VAD worker.
-	// TODO strongly type this, receives Speaking and NotSpeaking.
+	// Captured audio from the worklet.
	worklet: MessagePort;
}

-export type Result = Speaking | Text | Error;

export interface Speaking {
	type: "speaking";
	speaking: boolean;
}

+export type Result = Text | Error;

export interface Text {
	type: "text";
	text: string;
@@ -35,121 +28,6 @@ export interface Error {

const SAMPLE_RATE = 16000;

-// This VAD model expects 512 samples at a time, or 31ms
-const VAD_CHUNK_SIZE = 512;
-
-// Require 8 silence chunks to be detected before we consider the user done speaking.
-const VAD_SILENCE_PADDING = 8;
-
-class Vad {
-	whisper: Whisper;
-
-	// Simple circular buffer, primarily so we keep the previous buffer around once speaking is detected.
-	#next = new Float32Array(new ArrayBuffer(Float32Array.BYTES_PER_ELEMENT * VAD_CHUNK_SIZE), 0, 0); // queued
-	#current = new Float32Array(new ArrayBuffer(Float32Array.BYTES_PER_ELEMENT * VAD_CHUNK_SIZE), 0, 0); // being processed
-	#prev = new Float32Array(new ArrayBuffer(Float32Array.BYTES_PER_ELEMENT * VAD_CHUNK_SIZE), 0, 0); // already processed
-
-	#processing = false;
-
-	// Initial state for VAD
-	#sr = new Tensor("int64", [SAMPLE_RATE], []);
-	#state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);
-	#speaking = false;
-
-	// Count the number of silence results, if we get 3 in a row then we're done.
-	#silence = 0;
-
-	#model: Promise<PreTrainedModel>;
-
-	constructor(whisper: Whisper) {
-		this.whisper = whisper;
-
-		this.#model = AutoModel.from_pretrained("onnx-community/silero-vad", {
-			// @ts-expect-error Not sure why this is needed.
-			config: { model_type: "custom" },
-			dtype: "fp32", // Full-precision
-		});
-	}
-
-	write(samples: Float32Array) {
-		if (this.#next.byteLength >= this.#next.buffer.byteLength) {
-			if (!this.flush()) {
-				// Drop the sample if VAD is still processing.
-				return;
-			}
-		}
-
-		this.#next = new Float32Array(this.#next.buffer, 0, this.#next.length + samples.length);
-		this.#next.set(samples, this.#next.length - samples.length);
-
-		if (this.#next.byteLength === this.#next.buffer.byteLength) {
-			this.flush(); // don't care if it fails
-		}
-	}
-
-	flush(): boolean {
-		if (this.#processing) {
-			return false;
-		}
-
-		this.#processing = true;
-
-		this.#current = this.#next;
-		this.#next = new Float32Array(this.#prev.buffer, 0, 0);
-		this.#prev = this.#current;
-
-		this.#flush().finally(() => {
-			this.#processing = false;
-		});
-
-		return true;
-	}
-
-	async #flush() {
-		const model = await this.#model;
-
-		const input = new Tensor("float32", this.#current, [1, this.#current.length]);
-		const result = await model({ input, sr: this.#sr, state: this.#state });
-		this.#state = result.stateN;
-
-		const wasSpeaking = this.#speaking;
-
-		const isSpeech = result.output.data[0];
-		if (this.#speaking && isSpeech < 0.3) {
-			this.#silence++;
-
-			if (this.#silence > VAD_SILENCE_PADDING) {
-				this.#speaking = false;
-
-				postResult({
-					type: "speaking",
-					speaking: false,
-				});
-			}
-		} else if (!this.#speaking && isSpeech >= 0.1) {
-			this.#speaking = true;
-			this.#silence = 0;
-
-			postResult({
-				type: "speaking",
-				speaking: true,
-			});
-		}
-
-		if (!wasSpeaking && this.#speaking) {
-			this.whisper.write(this.#prev);
-		}
-
-		if (wasSpeaking || this.#speaking) {
-			this.whisper.write(this.#current);
-		}
-
-		if (wasSpeaking && !this.#speaking) {
-			this.whisper.flush();
-		}
-	}
-}
-
const MAX_WHISPER_BUFFER = 15 * SAMPLE_RATE; // 15 seconds

class Whisper {
@@ -158,6 +36,7 @@ class Whisper {

	#processing = false;

+	#speaking = false;
	#model: Promise<AutomaticSpeechRecognitionPipeline>;

	constructor() {
@@ -188,7 +67,12 @@
			}
		}

-		this.#queued = new Float32Array(this.#queued.buffer, 0, this.#queued.length + samples.length);
+		// Determine how many samples to keep.
+		// If we're not speaking, only keep the previous chunk.
+		// TODO add a constant to keep more.
+		const keep = this.#speaking ? this.#queued.length : 0;

+		this.#queued = new Float32Array(this.#queued.buffer, 0, keep + samples.length);
		this.#queued.set(samples, this.#queued.length - samples.length);
	}
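The new `keep` logic means the queue never grows while the speaker is silent: each incoming chunk replaces the previous one, so Whisper still gets a small pre-roll of audio once speech starts. The same append behaviour as a standalone sketch (plain typed-array math, independent of the worker):

```ts
// Append samples into the fixed backing buffer, keeping existing data only while speaking.
function append(queued: Float32Array, samples: Float32Array, speaking: boolean): Float32Array {
	const keep = speaking ? queued.length : 0;
	const next = new Float32Array(queued.buffer, 0, keep + samples.length);
	next.set(samples, next.length - samples.length);
	return next;
}
```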

@@ -227,16 +111,30 @@
			text,
		});
	}

+	set speaking(speaking: boolean) {
+		if (this.#speaking === speaking) return;
+
+		this.#speaking = speaking;
+
+		if (!speaking) {
+			this.flush();
+		}
+	}
}

+const whisper = new Whisper();
+
self.addEventListener("message", async (event: MessageEvent<Request>) => {
	const message = event.data;
-	const whisper = new Whisper();
-	const vad = new Vad(whisper);

-	message.worklet.onmessage = ({ data: samples }: MessageEvent<Float32Array>) => {
-		vad.write(samples);
-	};
+	if (message.type === "init") {
+		message.worklet.onmessage = ({ data: { channels } }: MessageEvent<AudioFrame>) => {
+			whisper.write(channels[0]);
+		};
+	} else if (message.type === "speaking") {
+		whisper.speaking = message.speaking;
+	}
});

function postResult(msg: Result) {
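With the Silero VAD gone from this worker, speaking state now arrives as a message and the worklet port delivers `AudioFrame`s straight to Whisper. A sketch of the main-thread side — the worker construction, the capture worklet, and whatever VAD produces the speaking flag all live elsewhere, so `worker` and `workletPort` are assumptions:

```ts
import type { Request, Result } from "./captions-worker";

// Assumed to exist: the captions worker instance and the capture worklet's port.
declare const worker: Worker;
declare const workletPort: MessagePort;

// Hand the worklet port to the worker; audio frames flow over it directly.
const init: Request = { type: "init", worklet: workletPort };
worker.postMessage(init, [workletPort]);

// Forward VAD decisions; the worker flushes Whisper when speaking stops.
export function setSpeaking(speaking: boolean) {
	const msg: Request = { type: "speaking", speaking };
	worker.postMessage(msg);
}

// Transcribed text (or errors) comes back as Result messages.
worker.addEventListener("message", (event: MessageEvent<Result>) => {
	if (event.data.type === "text") {
		console.log("caption:", event.data.text);
	}
});
```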
14 changes: 0 additions & 14 deletions js/hang/src/publish/audio/captions-worklet.ts

This file was deleted.
