Add Microphone Input to MultimodalTextbox (gradio-app#10186)

dawoodkhan82 · gradio-pr-bot · abidlabs · web-flow · commit 9b17032a6564 · 2024-12-17T17:15:16.000-05:00
* microphone

* add changeset

* undo css changes

* notebook

* css fix

* fixes

* add changeset

* fixes

* pr fixes

* guides

* format

* ally ignore

* type fix

---------

Co-authored-by: gradio-pr-bot &lt;gradio-pr-bot@users.noreply.github.com&gt;
Co-authored-by: Abubakar Abid &lt;abubakar@huggingface.co&gt;
diff --git a/.changeset/fluffy-pots-clap.md b/.changeset/fluffy-pots-clap.md
@@ -0,0 +1,7 @@
+---
+"@gradio/audio": minor
+"@gradio/multimodaltextbox": minor
+"gradio": minor
+---
+
+feat:Add Microphone Input to MultimodalTextbox
diff --git a/demo/chatbot_multimodal/run.ipynb b/demo/chatbot_multimodal/run.ipynb
@@ -1 +1 @@
-{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", "    print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", "    for x in message[\"files\"]:\n", "        history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", "    if message[\"text\"] is not None:\n", "        history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", "    return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", "    response = \"**That's cool!**\"\n", "    history.append({\"role\": \"assistant\", \"content\": \"\"})\n", "    for character in response:\n", "        history[-1][\"content\"] += character\n", "        time.sleep(0.05)\n", "        yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", "    chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", "    chat_input = gr.MultimodalTextbox(\n", "        interactive=True,\n", "        file_count=\"multiple\",\n", "        placeholder=\"Enter 
message or upload file...\",\n", "        show_label=False,\n", "    )\n", "\n", "    chat_msg = chat_input.submit(\n", "        add_message, [chatbot, chat_input], [chatbot, chat_input]\n", "    )\n", "    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", "    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", "    chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
+{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", "    print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", "    for x in message[\"files\"]:\n", "        history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", "    if message[\"text\"] is not None:\n", "        history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", "    return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", "    response = \"**That's cool!**\"\n", "    history.append({\"role\": \"assistant\", \"content\": \"\"})\n", "    for character in response:\n", "        history[-1][\"content\"] += character\n", "        time.sleep(0.05)\n", "        yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", "    chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", "    chat_input = gr.MultimodalTextbox(\n", "        interactive=True,\n", "        file_count=\"multiple\",\n", "        placeholder=\"Enter 
message or upload file...\",\n", "        show_label=False,\n", "        sources=[\"microphone\", \"upload\"],\n", "    )\n", "\n", "    chat_msg = chat_input.submit(\n", "        add_message, [chatbot, chat_input], [chatbot, chat_input]\n", "    )\n", "    bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", "    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", "    chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
diff --git a/demo/chatbot_multimodal/run.py b/demo/chatbot_multimodal/run.py
@@ -33,6 +33,7 @@ def bot(history: list):
         file_count="multiple",
         placeholder="Enter message or upload file...",
         show_label=False,
+        sources=["microphone", "upload"],
     )
 
     chat_msg = chat_input.submit(
diff --git a/gradio/components/multimodal_textbox.py b/gradio/components/multimodal_textbox.py
@@ -61,6 +61,9 @@ def __init__(
         self,
         value: str | dict[str, str | list] | Callable | None = None,
         *,
+        sources: list[Literal["upload", "microphone"]]
+        | Literal["upload", "microphone"]
+        | None = None,
         file_types: list[str] | None = None,
         file_count: Literal["single", "multiple", "directory"] = "single",
         lines: int = 1,
@@ -91,6 +94,7 @@ def __init__(
         """
         Parameters:
             value: Default value to show in MultimodalTextbox. A string value, or a dictionary of the form {"text": "sample text", "files": [{path: "files/file.jpg", orig_name: "file.jpg", url: "http://image_url.jpg", size: 100}]}. If callable, the function will be called whenever the app loads to set the initial value of the component.
+            sources: A list of sources permitted. "upload" creates a button where users can click to upload or drop files, "microphone" creates a microphone input. If None, defaults to ["upload"].
             file_count: if single, allows user to upload one file. If "multiple", user uploads multiple files. If "directory", user uploads all files in selected directory. Return type will be list for each file in case of "multiple" or "directory".
             file_types: List of file extensions or types of files to be uploaded (e.g. ['image', '.json', '.mp4']). "file" allows any file to be uploaded, "image" allows only image files to be uploaded, "audio" allows only audio files to be uploaded, "video" allows only video files to be uploaded, "text" allows only text files to be uploaded.
             lines: minimum number of line rows to provide in textarea.
@@ -118,6 +122,22 @@ def __init__(
             stop_btn: If True, will show a stop button (useful for streaming demos). If a string, will use that string as the stop button text.
             max_plain_text_length: Maximum length of plain text in the textbox. If the text exceeds this length, the text will be pasted as a file. Default is 1000.
         """
+        valid_sources: list[Literal["upload", "microphone"]] = ["upload", "microphone"]
+        if sources is None:
+            self.sources = ["upload"]
+        elif isinstance(sources, str) and sources in valid_sources:
+            self.sources = [sources]
+        elif isinstance(sources, list):
+            self.sources = sources
+        else:
+            raise ValueError(
+                f"`sources` must be a list consisting of elements in {valid_sources}"
+            )
+        for source in self.sources:
+            if source not in valid_sources:
+                raise ValueError(
+                    f"`sources` must a list consisting of elements in {valid_sources}"
+                )
         self.file_types = file_types
         self.file_count = file_count
         if file_types is not None and not isinstance(file_types, list):
diff --git a/guides/05_chatbots/01_creating-a-chatbot-fast.md b/guides/05_chatbots/01_creating-a-chatbot-fast.md
@@ -194,7 +194,7 @@ This second parameter of your chat function, `history`, will be in the same open
 
 The return type of your chat function does *not change* when setting `multimodal=True` (i.e. in the simplest case, you should still return a string value). We discuss more complex cases, e.g. returning files [below](#returning-complex-responses).
 
-If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. Here's an example that illustrates how to set up and customize and multimodal chat interface:
+If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. Here's an example that illustrates how to set up and customize a multimodal chat interface:
  
 
 ```python
@@ -215,7 +215,7 @@ demo = gr.ChatInterface(
         {"text": "No files", "files": []}
     ], 
     multimodal=True,
-    textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"])
+    textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"], sources=["upload", "microphone"])
 )
 
 demo.launch()
diff --git a/guides/05_chatbots/04_creating-a-custom-chatbot-with-blocks.md b/guides/05_chatbots/04_creating-a-custom-chatbot-with-blocks.md
@@ -70,15 +70,15 @@ def bot(history):
     return history
 ```
 
-In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. To pass in a media file, we must pass in the file a dictionary with a `path` key pointing to a local file and an `alt_text` key. The `alt_text` is optional, so you can also just pass in a tuple with a single element `{"path": "filepath"}`, like this:
+In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. To pass in a media file, we must pass in the file as a dictionary with a `path` key pointing to a local file and an `alt_text` key. The `alt_text` is optional, so you can also just pass in a dictionary with only the `path` key, `{"path": "filepath"}`, like this:
 
 ```python
 def add_message(history, message):
     for x in message["files"]:
         history.append({"role": "user", "content": {"path": x}})
     if message["text"] is not None:
         history.append({"role": "user", "content": message["text"]})
-    return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"])
+    return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"], sources=["upload", "microphone"])
 ```
 
 Putting this together, we can create a _multimodal_ chatbot with a multimodal textbox for a user to submit text and media files. The rest of the code looks pretty much the same as before:
diff --git a/js/audio/interactive/InteractiveAudio.svelte b/js/audio/interactive/InteractiveAudio.svelte
@@ -41,6 +41,7 @@
 	export let stream_every: number;
 	export let uploading = false;
 	export let recording = false;
+	export let class_name = "";
 
 	let time_limit: number | null = null;
 	let stream_state: "open" | "waiting" | "closed" = "closed";
@@ -246,7 +247,7 @@
 	float={active_source === "upload" && value === null}
 	label={label || i18n("audio.audio")}
 />
-<div class="audio-container">
+<div class="audio-container {class_name}">
 	<StreamingBar {time_limit} />
 	{#if value === null || streaming}
 		{#if active_source === "microphone"}
@@ -329,4 +330,30 @@
 		flex-direction: column;
 		justify-content: space-between;
 	}
+
+	.audio-container.compact-audio {
+		margin-top: calc(var(--size-8) * -1);
+		height: auto;
+		padding: 0px;
+		gap: var(--size-2);
+		min-height: var(--size-5);
+	}
+
+	.compact-audio :global(.audio-player) {
+		padding: 0px;
+	}
+
+	.compact-audio :global(.controls) {
+		gap: 0px;
+		padding: 0px;
+	}
+
+	.compact-audio :global(.waveform-container) {
+		height: var(--size-12) !important;
+	}
+
+	.compact-audio :global(.player-container) {
+		min-height: unset;
+		height: auto;
+	}
 </style>
diff --git a/js/multimodaltextbox/Index.svelte b/js/multimodaltextbox/Index.svelte
@@ -12,6 +12,8 @@
 	import { StatusTracker } from "@gradio/statustracker";
 	import type { LoadingStatus } from "@gradio/statustracker";
 	import type { FileData } from "@gradio/client";
+	import { onMount } from "svelte";
+	import type { WaveformOptions } from "../audio/shared/types";
 
 	export let gradio: Gradio<{
 		change: typeof value;
@@ -23,6 +25,11 @@
 		focus: never;
 		error: string;
 		clear_status: LoadingStatus;
+		start_recording: never;
+		pause_recording: never;
+		stop_recording: never;
+		upload: FileData[] | FileData;
+		clear: undefined;
 	}>;
 	export let elem_id = "";
 	export let elem_classes: string[] = [];
@@ -38,7 +45,6 @@
 	export let info: string | undefined = undefined;
 	export let show_label: boolean;
 	export let max_lines: number;
-	export let container = true;
 	export let scale: number | null = null;
 	export let min_width: number | undefined = undefined;
 	export let submit_btn: string | boolean | null = null;
@@ -53,8 +59,52 @@
 	export let root: string;
 	export let file_count: "single" | "multiple" | "directory";
 	export let max_plain_text_length: number;
+	export let sources: ["microphone" | "upload"] = ["upload"];
+	export let waveform_options: WaveformOptions = {};
 
 	let dragging: boolean;
+	let active_source: "microphone" | null = null;
+	let waveform_settings: Record<string, any>;
+	let color_accent = "darkorange";
+
+	onMount(() => {
+		color_accent = getComputedStyle(document?.documentElement).getPropertyValue(
+			"--color-accent"
+		);
+		set_trim_region_colour();
+		waveform_settings.waveColor = waveform_options.waveform_color || "#9ca3af";
+		waveform_settings.progressColor =
+			waveform_options.waveform_progress_color || color_accent;
+		waveform_settings.mediaControls = waveform_options.show_controls;
+		waveform_settings.sampleRate = waveform_options.sample_rate || 44100;
+	});
+
+	$: waveform_settings = {
+		height: 50,
+
+		barWidth: 2,
+		barGap: 3,
+		cursorWidth: 2,
+		cursorColor: "#ddd5e9",
+		autoplay: false,
+		barRadius: 10,
+		dragToSeek: true,
+		normalize: true,
+		minPxPerSec: 20
+	};
+
+	const trim_region_settings = {
+		color: waveform_options.trim_region_color,
+		drag: true,
+		resize: true
+	};
+
+	function set_trim_region_colour(): void {
+		document.documentElement.style.setProperty(
+			"--trim-region-color",
+			trim_region_settings.color || color_accent
+		);
+	}
 </script>
 
 <Block
@@ -80,6 +130,7 @@
 		bind:value
 		bind:value_is_output
 		bind:dragging
+		bind:active_source
 		{file_types}
 		{root}
 		{label}
@@ -88,14 +139,16 @@
 		{lines}
 		{rtl}
 		{text_align}
+		{waveform_settings}
+		i18n={gradio.i18n}
 		max_lines={!max_lines ? lines + 1 : max_lines}
 		{placeholder}
 		{submit_btn}
 		{stop_btn}
 		{autofocus}
-		{container}
 		{autoscroll}
 		{file_count}
+		{sources}
 		max_file_size={gradio.max_file_size}
 		on:change={() => gradio.dispatch("change", value)}
 		on:input={() => gradio.dispatch("input")}
@@ -107,6 +160,11 @@
 		on:error={({ detail }) => {
 			gradio.dispatch("error", detail);
 		}}
+		on:start_recording={() => gradio.dispatch("start_recording")}
+		on:pause_recording={() => gradio.dispatch("pause_recording")}
+		on:stop_recording={() => gradio.dispatch("stop_recording")}
+		on:upload={(e) => gradio.dispatch("upload", e.detail)}
+		on:clear={() => gradio.dispatch("clear")}
 		disabled={!interactive}
 		upload={(...args) => gradio.client.upload(...args)}
 		stream_handler={(...args) => gradio.client.stream(...args)}
diff --git a/js/multimodaltextbox/MultimodalTextbox.stories.svelte b/js/multimodaltextbox/MultimodalTextbox.stories.svelte
@@ -42,6 +42,12 @@
 			description: "Whether to render right-to-left",
 			control: { type: "boolean" },
 			defaultValue: false
+		},
+		sources: {
+			options: ["upload", "microphone"],
+			description: "The sources to enable",
+			control: { type: "select" },
+			defaultValue: ["upload", "microphone"]
 		}
 	}}
 />
@@ -87,3 +93,7 @@
 		}
 	}}
 />
+<Story
+	name="MultimodalTextbox with microphone input"
+	args={{ sources: ["microphone"] }}
+/>
diff --git a/js/multimodaltextbox/shared/MultimodalTextbox.svelte b/js/multimodaltextbox/shared/MultimodalTextbox.svelte

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"That's cool!\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " )\n", "\n", " chat_msg = 
chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
	`1`	+{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"That's cool!\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " sources=[\"microphone\", 
\"upload\"],\n", " )\n", "\n", " chat_msg = chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
Original file line number	Diff line number	Diff line change
`@@ -33,6 +33,7 @@ def bot(history: list):`
`33`	`33`	`file_count="multiple",`
`34`	`34`	`placeholder="Enter message or upload file...",`
`35`	`35`	`show_label=False,`
	`36`	`+ sources=["microphone", "upload"],`
`36`	`37`	`)`
`37`	`38`
`38`	`39`	`chat_msg = chat_input.submit(`