From 30029d7c40530236d33f2e01aeb56b87e0e98386 Mon Sep 17 00:00:00 2001 From: Umesh Madan Date: Thu, 30 Jan 2025 19:45:07 -0800 Subject: [PATCH] Timestamp index --- ts/packages/knowPro/src/dataFormat.ts | 10 ++++ ts/packages/knowPro/src/import.ts | 13 +++- ts/packages/knowPro/src/query.ts | 23 +------- ts/packages/knowPro/src/timestampIndex.ts | 72 +++++++++++++++++++++++ 4 files changed, 96 insertions(+), 22 deletions(-) create mode 100644 ts/packages/knowPro/src/timestampIndex.ts diff --git a/ts/packages/knowPro/src/dataFormat.ts b/ts/packages/knowPro/src/dataFormat.ts index b89cbd767..be856ae60 100644 --- a/ts/packages/knowPro/src/dataFormat.ts +++ b/ts/packages/knowPro/src/dataFormat.ts @@ -76,6 +76,7 @@ export interface IConversation { semanticRefIndex?: ITermToSemanticRefIndex | undefined; semanticRefs: SemanticRef[] | undefined; relatedTermsIndex?: ITermToRelatedTermsIndex | undefined; + timestampIndex?: ITimestampToMessageIndex | undefined; } export type MessageIndex = number; @@ -138,3 +139,12 @@ export interface ITextEmbeddingDataItem { text: string; embedding: number[]; } + +export type DateRange = { + start: Date; + end?: Date | undefined; +}; + +export interface ITimestampToMessageIndex { + getMessagesInDateRange(dateRange: DateRange): MessageIndex[]; +} diff --git a/ts/packages/knowPro/src/import.ts b/ts/packages/knowPro/src/import.ts index 0070be2f1..e8631341f 100644 --- a/ts/packages/knowPro/src/import.ts +++ b/ts/packages/knowPro/src/import.ts @@ -26,6 +26,7 @@ import { SemanticIndexSettings, TermSemanticIndex, } from "./termIndex.js"; +import { TimestampToMessageIndex } from "./timestampIndex.js"; // metadata for podcast messages export class PodcastMessageMeta implements IKnowledgeSource { @@ -122,6 +123,7 @@ export class Podcast implements IConversation { public semanticRefs: SemanticRef[] = [], public semanticRefIndex: ConversationIndex | undefined = undefined, public relatedTermsIndex: TermSemanticIndex | undefined = undefined, + public timestampIndex: TimestampToMessageIndex | undefined = undefined, ) { this.settings = createPodcastSettings(); } @@ -177,6 +179,7 @@ export class Podcast implements IConversation { ): Promise { const result = await buildConversationIndex(this, progressCallback); this.addMetadataToIndex(); + this.buildTimestampIndex(); return result; } @@ -198,6 +201,10 @@ export class Podcast implements IConversation { } } + public buildTimestampIndex(): void { + this.timestampIndex = new TimestampToMessageIndex(this.messages); + } + public serialize(): PodcastData { return { nameTag: this.nameTag, @@ -221,6 +228,7 @@ export class Podcast implements IConversation { data.relatedTermIndexData, ); } + this.buildTimestampIndex(); } } @@ -231,6 +239,8 @@ export interface PodcastData extends IConversationData { export async function importPodcast( transcriptFilePath: string, podcastName?: string, + startDate?: Date, + lengthMinutes: number = 60, ): Promise { const transcriptText = await readAllText(transcriptFilePath); podcastName ??= getFileName(transcriptFilePath); @@ -276,7 +286,8 @@ export async function importPodcast( } assignMessageListeners(msgs, participants); const pod = new Podcast(podcastName, msgs, [podcastName]); - // TODO: add timestamps and more tags + pod.generateTimestamps(startDate, lengthMinutes); + // TODO: add more tags // list all the books // what did K say about Children of Time? return pod; diff --git a/ts/packages/knowPro/src/query.ts b/ts/packages/knowPro/src/query.ts index 18abf0ccd..ed095d0a3 100644 --- a/ts/packages/knowPro/src/query.ts +++ b/ts/packages/knowPro/src/query.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. import { + DateRange, IConversation, IMessage, ITag, @@ -23,7 +24,7 @@ import { SemanticRefAccumulator, TextRangeAccumulator, } from "./accumulators.js"; -import { collections, dateTime } from "typeagent"; +import { collections } from "typeagent"; export function isConversationSearchable(conversation: IConversation): boolean { return ( @@ -62,21 +63,6 @@ export function timestampRangeForConversation( return undefined; } -/** - * Assumes messages are in timestamp order. - * @param conversation - */ -export function getMessagesInDateRange( - conversation: IConversation, - dateRange: DateRange, -): IMessage[] { - return collections.getInRange( - conversation.messages, - dateTime.timestampString(dateRange.start), - dateRange.end ? dateTime.timestampString(dateRange.end) : undefined, - (x, y) => x.localeCompare(y), - ); -} /** * Returns: * 0 if locations are equal @@ -118,11 +104,6 @@ export function isInTextRange( return cmpStart <= 0 && cmpEnd <= 0; } -export type DateRange = { - start: Date; - end?: Date | undefined; -}; - export function compareDates(x: Date, y: Date): number { return x.getTime() - y.getTime(); } diff --git a/ts/packages/knowPro/src/timestampIndex.ts b/ts/packages/knowPro/src/timestampIndex.ts new file mode 100644 index 000000000..b25596d7c --- /dev/null +++ b/ts/packages/knowPro/src/timestampIndex.ts @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { collections, dateTime } from "typeagent"; +import { + DateRange, + IMessage, + ITimestampToMessageIndex, + MessageIndex, +} from "./dataFormat.js"; + +export class TimestampToMessageIndex implements ITimestampToMessageIndex { + private messageIndex: Timestamped[]; + constructor(messages: IMessage[]) { + this.messageIndex = []; + for (let i = 0; i < messages.length; ++i) { + this.addMessage(messages[i], i); + } + this.messageIndex.sort(compareTimestamped); + } + + public getMessagesInDateRange(dateRange: DateRange): MessageIndex[] { + return collections.getInRange( + this.messageIndex, + dateTime.timestampString(dateRange.start), + dateRange.end ? dateTime.timestampString(dateRange.end) : undefined, + compareTimestamped, + ); + } + + private addMessage( + message: IMessage, + messageIndex: MessageIndex, + inOrder = false, + ): boolean { + if (!message.timestamp) { + return false; + } + const date = new Date(message.timestamp); + // This string is formatted to be searchable + const entry: Timestamped = makeTimestamped( + date, + messageIndex, + ); + if (inOrder) { + collections.insertIntoSorted( + this.messageIndex, + entry, + compareTimestamped, + ); + } else { + this.messageIndex.push(entry); + } + return true; + } +} + +type Timestamped = { + timestamp: string; + value: T; +}; + +function compareTimestamped(x: Timestamped, y: Timestamped) { + return x.timestamp.localeCompare(y.timestamp); +} + +function makeTimestamped(timestamp: Date, value: any): Timestamped { + return { + value, + timestamp: dateTime.timestampString(timestamp, false), + }; +}