diff --git a/package.json b/package.json
index 0b0be1f..700eb2b 100644
--- a/package.json
+++ b/package.json
@@ -9,8 +9,8 @@
     "examples/*"
   ],
   "dependencies": {
-    "@daily-co/daily-js": "^0.84.0",
-    "@pipecat-ai/client-js": "^1.4.0",
+    "@daily-co/daily-js": "^0.85.0",
+    "@pipecat-ai/client-js": "^1.5.0",
     "@pipecat-ai/client-react": "^1.1.0",
     "react": "^19.2.1",
     "react-dom": "^19.2.1",
diff --git a/package/package.json b/package/package.json
index d163903..8e1dc79 100644
--- a/package/package.json
+++ b/package/package.json
@@ -103,17 +103,18 @@
     "lucide-react": "^0.511.0",
     "react-chartjs-2": "^5.3.0",
     "react-resizable-panels": "^3.0.6",
+    "semver": "^7.6.3",
     "tailwind-merge": "^3.3.1",
     "zustand": "^5.0.8"
   },
   "devDependencies": {
-    "@daily-co/daily-js": "^0.80.0",
+    "@daily-co/daily-js": "^0.85.0",
     "@eslint/js": "^9.36.0",
     "@ladle/react": "^5.0.3",
-    "@pipecat-ai/client-js": "^1.4.0",
+    "@pipecat-ai/client-js": "^1.5.0",
     "@pipecat-ai/client-react": "^1.1.0",
-    "@pipecat-ai/daily-transport": "^1.4.0",
-    "@pipecat-ai/small-webrtc-transport": "^1.5.0",
+    "@pipecat-ai/daily-transport": "^1.5.0",
+    "@pipecat-ai/small-webrtc-transport": "^1.8.0",
     "@tailwindcss/vite": "^4.1.13",
     "@types/node": "^22.18.8",
     "@types/react": "^19.1.16",
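Note: the `semver` dependency added above backs the `isMinVersion` helper that later hunks import from `@/utils/version` but never show. Below is a minimal sketch of what that helper could look like; the tuple signature is inferred from the call sites (`isMinVersion(version, [1, 1, 0])`), and the use of `coerce`/`gte` is an assumption, not code from this PR:

```ts
// Hypothetical sketch of "@/utils/version" (not part of this diff).
// Signature inferred from call sites: isMinVersion(version, [1, 1, 0]).
import { coerce, gte } from "semver";

export function isMinVersion(
  version: string | undefined,
  [major, minor, patch]: [number, number, number],
): boolean {
  // coerce() tolerates loose version strings like "1.1" or "v1.1.0"
  const parsed = coerce(version ?? "");
  if (!parsed) return false;
  return gte(parsed, `${major}.${minor}.${patch}`);
}
```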
diff --git a/package/src/components/ConversationProvider.tsx b/package/src/components/ConversationProvider.tsx
index e340821..1290e07 100644
--- a/package/src/components/ConversationProvider.tsx
+++ b/package/src/components/ConversationProvider.tsx
@@ -3,9 +3,11 @@ import {
   type ConversationMessage,
   type ConversationMessagePart,
 } from "@/types/conversation";
-import { RTVIEvent } from "@pipecat-ai/client-js";
+import { useBotMessages } from "@/hooks/useBotMessages";
+import { BotOutputData, BotReadyData, RTVIEvent } from "@pipecat-ai/client-js";
 import { useRTVIClientEvent } from "@pipecat-ai/client-react";
-import { createContext, useContext, useRef } from "react";
+import { createContext, useContext, useRef, useState } from "react";
+import { isMinVersion } from "@/utils/version";
 
 interface ConversationContextValue {
   messages: ConversationMessage[];
@@ -13,6 +15,11 @@ interface ConversationContextValue {
     role: "user" | "assistant" | "system";
     parts: ConversationMessagePart[];
   }) => void;
+  /**
+   * Whether BotOutput events are supported (RTVI 1.1.0+)
+   * null = unknown (before BotReady), true = supported, false = not supported
+   */
+  botOutputSupported: boolean | null;
 }
 
 const ConversationContext = createContext<ConversationContextValue | undefined>(
@@ -29,32 +36,24 @@ export const ConversationProvider = ({ children }: React.PropsWithChildren) => {
     injectMessage,
     upsertUserTranscript,
     updateAssistantText,
-    startAssistantLlmStream,
+    updateAssistantBotOutput,
   } = useConversationStore();
 
+  // null = unknown (before BotReady), true = supported, false = not supported
+  const [botOutputSupported, setBotOutputSupported] = useState<boolean | null>(
+    null,
+  );
   const userStoppedTimeout = useRef<ReturnType<typeof setTimeout>>(undefined);
   const assistantStreamResetRef = useRef(0);
 
   useRTVIClientEvent(RTVIEvent.Connected, () => {
     clearMessages();
+    setBotOutputSupported(null);
+    botOutputLastChunkRef.current = { spoken: "", unspoken: "" };
   });
 
-  useRTVIClientEvent(RTVIEvent.BotLlmStarted, () => {
-    startAssistantLlmStream();
-    // Nudge a reset counter so any consumer logic can infer fresh turn if needed
-    assistantStreamResetRef.current += 1;
-  });
-
-  useRTVIClientEvent(RTVIEvent.BotLlmText, (data) => {
-    updateAssistantText(data.text, false, "llm");
-  });
-
-  useRTVIClientEvent(RTVIEvent.BotLlmStopped, () => {
-    finalizeLastMessage("assistant");
-  });
-
-  useRTVIClientEvent(RTVIEvent.BotTtsStarted, () => {
-    // Start a new assistant message for TTS if there isn't one already in progress
+  // Helper to ensure assistant message exists
+  const ensureAssistantMessage = () => {
     const store = useConversationStore.getState();
     const lastAssistantIndex = store.messages.findLastIndex(
       (msg: ConversationMessage) => msg.role === "assistant",
@@ -70,15 +69,77 @@ export const ConversationProvider = ({ children }: React.PropsWithChildren) => {
         final: false,
         parts: [],
       });
+      assistantStreamResetRef.current += 1;
+      return true;
     }
+    return false;
+  };
+
+  // Detect BotOutput support from BotReady event
+  useRTVIClientEvent(RTVIEvent.BotReady, (botData: BotReadyData) => {
+    const rtviVersion = botData.version;
+    const supportsBotOutput = isMinVersion(rtviVersion, [1, 1, 0]);
+    setBotOutputSupported(supportsBotOutput);
   });
 
-  useRTVIClientEvent(RTVIEvent.BotTtsText, (data) => {
-    updateAssistantText(data.text, false, "tts");
+  // Track last chunk text per type for spacing detection in BotOutput mode
+  const botOutputLastChunkRef = useRef<{ spoken: string; unspoken: string }>({
+    spoken: "",
+    unspoken: "",
   });
 
-  useRTVIClientEvent(RTVIEvent.BotTtsStopped, () => {
-    // Finalize the TTS text stream
+  useRTVIClientEvent(RTVIEvent.BotOutput, (data: BotOutputData) => {
+    ensureAssistantMessage();
+
+    // Handle spacing for BotOutput chunks
+    let textToAdd = data.text;
+    const lastChunk = data.spoken
+      ? botOutputLastChunkRef.current.spoken
+      : botOutputLastChunkRef.current.unspoken;
+
+    // Add space separator if needed between BotOutput chunks
+    if (lastChunk) {
+      textToAdd = " " + textToAdd;
+    }
+
+    // Update the appropriate last chunk tracker
+    if (data.spoken) {
+      botOutputLastChunkRef.current.spoken = textToAdd;
+    } else {
+      botOutputLastChunkRef.current.unspoken = textToAdd;
+    }
+
+    // Update both spoken and unspoken text streams
+    const isFinal = data.aggregated_by === "sentence";
+    updateAssistantBotOutput(textToAdd, isFinal, data.spoken);
+  });
+
+  // Handle legacy TTS/LLM events (when BotOutput not supported)
+  useBotMessages(
+    {
+      onBotMessageStarted: () => {
+        ensureAssistantMessage();
+      },
+      onBotMessageChunk: (type, text) => {
+        updateAssistantText(text, false, type);
+      },
+      onBotMessageEnded: () => {
+        const store = useConversationStore.getState();
+        const lastAssistant = store.messages.findLast(
+          (m: ConversationMessage) => m.role === "assistant",
+        );
+
+        if (lastAssistant && !lastAssistant.final) {
+          finalizeLastMessage("assistant");
+        }
+      },
+    },
+    botOutputSupported === true,
+  );
+
+  useRTVIClientEvent(RTVIEvent.BotStoppedSpeaking, () => {
+    // Finalize the assistant message when bot stops speaking
+    // This works for both BotOutput and fallback scenarios
     const store = useConversationStore.getState();
     const lastAssistant = store.messages.findLast(
       (m: ConversationMessage) => m.role === "assistant",
@@ -123,6 +184,7 @@ export const ConversationProvider = ({ children }: React.PropsWithChildren) => {
   const contextValue: ConversationContextValue = {
     messages,
     injectMessage,
+    botOutputSupported,
   };
 
   return (
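The `useBotMessages` hook imported above from `@/hooks/useBotMessages` is also not part of this diff. Judging from its two call sites, it wraps the legacy `BotLlm*`/`BotTts*` RTVI events behind a callback interface and stands down when the second argument reports that BotOutput is supported (otherwise every chunk would be handled twice). A speculative sketch, with only the callback names taken from the diff:

```ts
// Hypothetical sketch of "@/hooks/useBotMessages" (not part of this diff).
import {
  RTVIEvent,
  type BotLLMTextData,
  type BotTTSTextData,
} from "@pipecat-ai/client-js";
import { useRTVIClientEvent } from "@pipecat-ai/client-react";

type BotMessageType = "llm" | "tts";

interface BotMessageCallbacks {
  onBotMessageStarted?: (type: BotMessageType) => void;
  onBotMessageChunk?: (type: BotMessageType, text: string) => void;
  onBotMessageEnded?: (type: BotMessageType) => void;
}

export function useBotMessages(
  callbacks: BotMessageCallbacks,
  botOutputSupported: boolean,
) {
  // When the bot emits BotOutput events, these legacy handlers must no-op.
  useRTVIClientEvent(RTVIEvent.BotLlmStarted, () => {
    if (!botOutputSupported) callbacks.onBotMessageStarted?.("llm");
  });
  useRTVIClientEvent(RTVIEvent.BotLlmText, (data: BotLLMTextData) => {
    if (!botOutputSupported) callbacks.onBotMessageChunk?.("llm", data.text);
  });
  useRTVIClientEvent(RTVIEvent.BotLlmStopped, () => {
    if (!botOutputSupported) callbacks.onBotMessageEnded?.("llm");
  });
  useRTVIClientEvent(RTVIEvent.BotTtsStarted, () => {
    if (!botOutputSupported) callbacks.onBotMessageStarted?.("tts");
  });
  useRTVIClientEvent(RTVIEvent.BotTtsText, (data: BotTTSTextData) => {
    if (!botOutputSupported) callbacks.onBotMessageChunk?.("tts", data.text);
  });
  useRTVIClientEvent(RTVIEvent.BotTtsStopped, () => {
    if (!botOutputSupported) callbacks.onBotMessageEnded?.("tts");
  });
}
```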
diff --git a/package/src/components/elements/MessageContent.tsx b/package/src/components/elements/MessageContent.tsx
index 7c05b82..7dc6224 100644
--- a/package/src/components/elements/MessageContent.tsx
+++ b/package/src/components/elements/MessageContent.tsx
@@ -1,5 +1,6 @@
 import { cn } from "@/lib/utils";
 import {
+  BotOutputText,
   ConversationMessage,
   ConversationMessagePart,
 } from "@/types/conversation";
@@ -30,25 +31,77 @@ interface Props {
   message: ConversationMessage;
 }
 
+/**
+ * Renders BotOutput mode: shows unspoken text muted, spoken text replaces it
+ */
+const renderBotOutput = (spoken: string, unspoken: string): React.ReactNode => {
+  const spokenLength = spoken?.length || 0;
+  const remainingUnspoken = unspoken ? unspoken.slice(spokenLength) : "";
+
+  return (
+    <>
+      {spoken}
+      {remainingUnspoken && (
+        <span className="text-muted-foreground">{remainingUnspoken}</span>
+      )}
+    </>
+  );
+};
+
 export const MessageContent = ({ classNames = {}, message }: Props) => {
   const parts = Array.isArray(message.parts) ? message.parts : [];
+
   return (
     <div>
       {parts.map((part: ConversationMessagePart, idx: number) => {
         const nextPart = parts?.[idx + 1] ?? null;
         const isText = typeof part.text === "string";
-        const nextIsText = nextPart && typeof nextPart.text === "string";
+        const isBotOutputTextValue = Boolean(
+          part.text &&
+            typeof part.text === "object" &&
+            "spoken" in part.text &&
+            "unspoken" in part.text,
+        );
+        const nextIsText =
+          nextPart &&
+          Boolean(
+            typeof nextPart.text === "string" ||
+              (nextPart.text &&
+                typeof nextPart.text === "object" &&
+                "spoken" in nextPart.text),
+          );
+
+        let content: React.ReactNode;
+        if (isBotOutputTextValue) {
+          const botText = part.text as BotOutputText;
+          content = renderBotOutput(botText.spoken, botText.unspoken);
+        } else {
+          content = part.text as React.ReactNode;
+        }
 
         return (
           <Fragment key={idx}>
-            {isText ? part.text : part.text}
-            {isText && nextIsText ? " " : null}
+            {content}
+            {(isText || isBotOutputTextValue) && nextIsText ? " " : null}
           </Fragment>
         );
       })}
       {parts.length === 0 ||
-      parts.every(
-        (part) => typeof part.text === "string" && part.text.trim() === "",
-      ) ? (
+      parts.every((part) => {
+        if (typeof part.text === "string") {
+          return part.text.trim() === "";
+        }
+        if (
+          part.text &&
+          typeof part.text === "object" &&
+          "spoken" in part.text &&
+          "unspoken" in part.text
+        ) {
+          const botText = part.text as unknown as BotOutputText;
+          return botText.spoken.trim() === "" && botText.unspoken.trim() === "";
+        }
+        return false;
+      }) ? (
 
       ) : null}
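The rendering above relies on the `BotOutputText` part shape from `@/types/conversation`, which the diff references but never defines. Since `renderBotOutput` slices `spoken.length` characters off `unspoken`, the two fields presumably share a common prefix. Assumed shape plus a worked example:

```ts
// Assumed shape of BotOutputText from "@/types/conversation" (not in this diff).
interface BotOutputText {
  spoken: string; // text confirmed spoken so far (word-level BotOutput chunks)
  unspoken: string; // full aggregated text; its prefix mirrors `spoken`
}

// renderBotOutput("Hello there", "Hello there, how can I help?")
// => renders "Hello there" normally and ", how can I help?" in the muted
//    style, so spoken words visually replace the preview as TTS catches up.
```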
diff --git a/package/src/components/elements/TranscriptOverlay.tsx b/package/src/components/elements/TranscriptOverlay.tsx
--- a/package/src/components/elements/TranscriptOverlay.tsx
+++ b/package/src/components/elements/TranscriptOverlay.tsx
@@ ... @@ export const TranscriptOverlay = ({
   const [transcript, setTranscript] = useState<string[]>([]);
   const [turnEnd, setIsTurnEnd] = useState(false);
+  const [botOutputSupported, setBotOutputSupported] = useState(false);
 
   const transportState = usePipecatClientTransportState();
 
-  useRTVIClientEvent(
-    RTVIEvent.BotTtsText,
-    useCallback(
-      (event: BotTTSTextData) => {
+  // Detect BotOutput support from BotReady event
+  useRTVIClientEvent(RTVIEvent.BotReady, (botData: BotReadyData) => {
+    const rtviVersion = botData.version;
+    const supportsBotOutput = isMinVersion(rtviVersion, [1, 1, 0]);
+    setBotOutputSupported(supportsBotOutput);
+  });
+
+  // Handle BotOutput events (when supported) - only word-level spoken chunks
+  useRTVIClientEvent(RTVIEvent.BotOutput, (data: BotOutputData) => {
+    if (participant === "local" || !botOutputSupported) {
+      return;
+    }
+
+    // Only process word-level outputs that have been spoken
+    // These provide real-time word-by-word streaming for karaoke-like UI
+    if (data.aggregated_by === "word" && data.spoken === true && data.text) {
+      if (turnEnd) {
+        setTranscript([]);
+        setIsTurnEnd(false);
+      }
+
+      setTranscript((prev) => [...prev, data.text]);
+    }
+  });
+
+  // Handle legacy TTS events (when BotOutput not supported)
+  useBotMessages(
+    {
+      onBotMessageChunk: (type, text) => {
         if (participant === "local") {
           return;
         }
-        if (turnEnd) {
-          setTranscript([]);
-          setIsTurnEnd(false);
-        }
+        // Only process TTS chunks (spoken content)
+        if (type === "tts") {
+          if (turnEnd) {
+            setTranscript([]);
+            setIsTurnEnd(false);
+          }
 
-        setTranscript((prev) => [...prev, event.text]);
+          setTranscript((prev) => [...prev, text]);
+        }
       },
-      [turnEnd, participant],
-    ),
+      onBotMessageEnded: (type) => {
+        if (participant === "local") {
+          return;
+        }
+        // Only handle TTS ended events
+        if (type === "tts") {
+          setIsTurnEnd(true);
+        }
+      },
+    },
+    botOutputSupported,
   );
 
   useRTVIClientEvent(
@@ -212,16 +252,6 @@ export const TranscriptOverlay = ({
     }, [participant]),
   );
 
-  useRTVIClientEvent(
-    RTVIEvent.BotTtsStopped,
-    useCallback(() => {
-      if (participant === "local") {
-        return;
-      }
-      setIsTurnEnd(true);
-    }, [participant]),
-  );
-
   if (transcript.length === 0 || transportState !== "ready") {
     return null;
   }
diff --git a/package/src/components/panels/ConversationPanel.tsx b/package/src/components/panels/ConversationPanel.tsx
index 7aa435a..e48b306 100644
--- a/package/src/components/panels/ConversationPanel.tsx
+++ b/package/src/components/panels/ConversationPanel.tsx
@@ -5,6 +5,7 @@ import { Button } from "@/components/ui/button";
 import { Panel, PanelContent, PanelHeader } from "@/components/ui/panel";
 import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs";
 import { TextMode } from "@/types/conversation";
+import { useConversationContext } from "@/components/ConversationProvider";
 import { LineChartIcon, MessagesSquareIcon } from "lucide-react";
 import { memo, useState } from "react";
 
@@ -53,6 +54,13 @@ export const ConversationPanel: React.FC<Props> = memo(
   }) => {
     const defaultValue = noConversation ? "metrics" : "conversation";
     const [textMode, setTextMode] = useState(initialTextMode);
+    const { botOutputSupported } = useConversationContext();
+
+    // Show toggle only when BotOutput is confirmed unsupported (false) and not disabled
+    // Hide by default (when botOutputSupported is still unknown/null or true)
+    const shouldShowToggle =
+      !noTextModeToggle && botOutputSupported === false;
+
     return (
       <Panel>
@@ -71,7 +79,7 @@ export const ConversationPanel: React.FC<Props> = memo(
           )}
-          {!noTextModeToggle && (
+          {shouldShowToggle && (
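Any component mounted inside `ConversationProvider` can branch on the same capability flag that `ConversationPanel` uses above. A hypothetical consumer (the badge component here is illustrative, not part of the kit):

```tsx
// Hypothetical consumer -- not part of this diff.
import { useConversationContext } from "@/components/ConversationProvider";

export const BotOutputBadge = () => {
  const { botOutputSupported } = useConversationContext();

  // null means BotReady hasn't fired yet, so support is still unknown
  if (botOutputSupported === null) return null;

  return (
    <span>
      {botOutputSupported
        ? "Streaming via BotOutput (RTVI 1.1.0+)"
        : "Falling back to BotLlmText/BotTtsText"}
    </span>
  );
};
```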