Skip to content

Commit 9b17032

Browse files
dawoodkhan82, gradio-pr-bot, and abidlabs
authored
Add Microphone Input to MultimodalTextbox (gradio-app#10186)
* microphone * add changeset * undo css changes * notebook * css fix * fixes * add changeset * fixes * pr fixes * guides * format * ally ignore * type fix --------- Co-authored-by: gradio-pr-bot <[email protected]> Co-authored-by: Abubakar Abid <[email protected]>
1 parent a95f8ef commit 9b17032

File tree

10 files changed

+322
-130
lines changed

10 files changed

+322
-130
lines changed

.changeset/fluffy-pots-clap.md

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
"@gradio/audio": minor
3+
"@gradio/multimodaltextbox": minor
4+
"gradio": minor
5+
---
6+
7+
feat:Add Microphone Input to MultimodalTextbox

demo/chatbot_multimodal/run.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"**That's cool!**\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " )\n", "\n", " chat_msg = 
chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
1+
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: chatbot_multimodal"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "!wget -q https://github.com/gradio-app/gradio/raw/main/demo/chatbot_multimodal/tuples_testcase.py"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import time\n", "\n", "# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.\n", "\n", "\n", "def print_like_dislike(x: gr.LikeData):\n", " print(x.index, x.value, x.liked)\n", "\n", "\n", "def add_message(history, message):\n", " for x in message[\"files\"]:\n", " history.append({\"role\": \"user\", \"content\": {\"path\": x}})\n", " if message[\"text\"] is not None:\n", " history.append({\"role\": \"user\", \"content\": message[\"text\"]})\n", " return history, gr.MultimodalTextbox(value=None, interactive=False)\n", "\n", "\n", "def bot(history: list):\n", " response = \"**That's cool!**\"\n", " history.append({\"role\": \"assistant\", \"content\": \"\"})\n", " for character in response:\n", " history[-1][\"content\"] += character\n", " time.sleep(0.05)\n", " yield history\n", "\n", "\n", "with gr.Blocks() as demo:\n", " chatbot = gr.Chatbot(elem_id=\"chatbot\", bubble_full_width=False, type=\"messages\")\n", "\n", " chat_input = gr.MultimodalTextbox(\n", " interactive=True,\n", " file_count=\"multiple\",\n", " placeholder=\"Enter message or upload file...\",\n", " show_label=False,\n", " sources=[\"microphone\", 
\"upload\"],\n", " )\n", "\n", " chat_msg = chat_input.submit(\n", " add_message, [chatbot, chat_input], [chatbot, chat_input]\n", " )\n", " bot_msg = chat_msg.then(bot, chatbot, chatbot, api_name=\"bot_response\")\n", " bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])\n", "\n", " chatbot.like(print_like_dislike, None, None, like_user_message=True)\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

demo/chatbot_multimodal/run.py

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def bot(history: list):
3333
file_count="multiple",
3434
placeholder="Enter message or upload file...",
3535
show_label=False,
36+
sources=["microphone", "upload"],
3637
)
3738

3839
chat_msg = chat_input.submit(

gradio/components/multimodal_textbox.py

+20
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ def __init__(
6161
self,
6262
value: str | dict[str, str | list] | Callable | None = None,
6363
*,
64+
sources: list[Literal["upload", "microphone"]]
65+
| Literal["upload", "microphone"]
66+
| None = None,
6467
file_types: list[str] | None = None,
6568
file_count: Literal["single", "multiple", "directory"] = "single",
6669
lines: int = 1,
@@ -91,6 +94,7 @@ def __init__(
9194
"""
9295
Parameters:
9396
value: Default value to show in MultimodalTextbox. A string value, or a dictionary of the form {"text": "sample text", "files": [{path: "files/file.jpg", orig_name: "file.jpg", url: "http://image_url.jpg", size: 100}]}. If callable, the function will be called whenever the app loads to set the initial value of the component.
97+
sources: A list of sources permitted. "upload" creates a button where users can click to upload or drop files, "microphone" creates a microphone input. If None, defaults to ["upload"].
9498
file_count: if single, allows user to upload one file. If "multiple", user uploads multiple files. If "directory", user uploads all files in selected directory. Return type will be list for each file in case of "multiple" or "directory".
9599
file_types: List of file extensions or types of files to be uploaded (e.g. ['image', '.json', '.mp4']). "file" allows any file to be uploaded, "image" allows only image files to be uploaded, "audio" allows only audio files to be uploaded, "video" allows only video files to be uploaded, "text" allows only text files to be uploaded.
96100
lines: minimum number of line rows to provide in textarea.
@@ -118,6 +122,22 @@ def __init__(
118122
stop_btn: If True, will show a stop button (useful for streaming demos). If a string, will use that string as the stop button text.
119123
max_plain_text_length: Maximum length of plain text in the textbox. If the text exceeds this length, the text will be pasted as a file. Default is 1000.
120124
"""
125+
valid_sources: list[Literal["upload", "microphone"]] = ["upload", "microphone"]
126+
if sources is None:
127+
self.sources = ["upload"]
128+
elif isinstance(sources, str) and sources in valid_sources:
129+
self.sources = [sources]
130+
elif isinstance(sources, list):
131+
self.sources = sources
132+
else:
133+
raise ValueError(
134+
f"`sources` must be a list consisting of elements in {valid_sources}"
135+
)
136+
for source in self.sources:
137+
if source not in valid_sources:
138+
raise ValueError(
139+
f"`sources` must a list consisting of elements in {valid_sources}"
140+
)
121141
self.file_types = file_types
122142
self.file_count = file_count
123143
if file_types is not None and not isinstance(file_types, list):

guides/05_chatbots/01_creating-a-chatbot-fast.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ This second parameter of your chat function, `history`, will be in the same open
194194

195195
The return type of your chat function does *not change* when setting `multimodal=True` (i.e. in the simplest case, you should still return a string value). We discuss more complex cases, e.g. returning files [below](#returning-complex-responses).
196196

197-
If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. Here's an example that illustrates how to set up and customize and multimodal chat interface:
197+
If you are customizing a multimodal chat interface, you should pass in an instance of `gr.MultimodalTextbox` to the `textbox` parameter. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. Here's an example that illustrates how to set up and customize a multimodal chat interface:
198198

199199

200200
```python
@@ -215,7 +215,7 @@ demo = gr.ChatInterface(
215215
{"text": "No files", "files": []}
216216
],
217217
multimodal=True,
218-
textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"])
218+
textbox=gr.MultimodalTextbox(file_count="multiple", file_types=["image"], sources=["upload", "microphone"])
219219
)
220220

221221
demo.launch()

guides/05_chatbots/04_creating-a-custom-chatbot-with-blocks.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -70,15 +70,15 @@ def bot(history):
7070
return history
7171
```
7272

73-
In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. To pass in a media file, we must pass in the file a dictionary with a `path` key pointing to a local file and an `alt_text` key. The `alt_text` is optional, so you can also just pass in a tuple with a single element `{"path": "filepath"}`, like this:
73+
In addition, it can handle media files, such as images, audio, and video. You can use the `MultimodalTextbox` component to easily upload all types of media files to your chatbot. You can customize the `MultimodalTextbox` further by passing in the `sources` parameter, which is a list of sources to enable. To pass in a media file, we must pass in the file as a dictionary with a `path` key pointing to a local file and an `alt_text` key. The `alt_text` is optional, so you can also just pass in a dictionary with a single key, `{"path": "filepath"}`, like this:
7474

7575
```python
7676
def add_message(history, message):
7777
for x in message["files"]:
7878
history.append({"role": "user", "content": {"path": x}})
7979
if message["text"] is not None:
8080
history.append({"role": "user", "content": message["text"]})
81-
return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"])
81+
return history, gr.MultimodalTextbox(value=None, interactive=False, file_types=["image"], sources=["upload", "microphone"])
8282
```
8383

8484
Putting this together, we can create a _multimodal_ chatbot with a multimodal textbox for a user to submit text and media files. The rest of the code looks pretty much the same as before:

js/audio/interactive/InteractiveAudio.svelte

+28-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
export let stream_every: number;
4242
export let uploading = false;
4343
export let recording = false;
44+
export let class_name = "";
4445
4546
let time_limit: number | null = null;
4647
let stream_state: "open" | "waiting" | "closed" = "closed";
@@ -246,7 +247,7 @@
246247
float={active_source === "upload" && value === null}
247248
label={label || i18n("audio.audio")}
248249
/>
249-
<div class="audio-container">
250+
<div class="audio-container {class_name}">
250251
<StreamingBar {time_limit} />
251252
{#if value === null || streaming}
252253
{#if active_source === "microphone"}
@@ -329,4 +330,30 @@
329330
flex-direction: column;
330331
justify-content: space-between;
331332
}
333+
334+
.audio-container.compact-audio {
335+
margin-top: calc(var(--size-8) * -1);
336+
height: auto;
337+
padding: 0px;
338+
gap: var(--size-2);
339+
min-height: var(--size-5);
340+
}
341+
342+
.compact-audio :global(.audio-player) {
343+
padding: 0px;
344+
}
345+
346+
.compact-audio :global(.controls) {
347+
gap: 0px;
348+
padding: 0px;
349+
}
350+
351+
.compact-audio :global(.waveform-container) {
352+
height: var(--size-12) !important;
353+
}
354+
355+
.compact-audio :global(.player-container) {
356+
min-height: unset;
357+
height: auto;
358+
}
332359
</style>

js/multimodaltextbox/Index.svelte

+60-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
import { StatusTracker } from "@gradio/statustracker";
1313
import type { LoadingStatus } from "@gradio/statustracker";
1414
import type { FileData } from "@gradio/client";
15+
import { onMount } from "svelte";
16+
import type { WaveformOptions } from "../audio/shared/types";
1517
1618
export let gradio: Gradio<{
1719
change: typeof value;
@@ -23,6 +25,11 @@
2325
focus: never;
2426
error: string;
2527
clear_status: LoadingStatus;
28+
start_recording: never;
29+
pause_recording: never;
30+
stop_recording: never;
31+
upload: FileData[] | FileData;
32+
clear: undefined;
2633
}>;
2734
export let elem_id = "";
2835
export let elem_classes: string[] = [];
@@ -38,7 +45,6 @@
3845
export let info: string | undefined = undefined;
3946
export let show_label: boolean;
4047
export let max_lines: number;
41-
export let container = true;
4248
export let scale: number | null = null;
4349
export let min_width: number | undefined = undefined;
4450
export let submit_btn: string | boolean | null = null;
@@ -53,8 +59,52 @@
5359
export let root: string;
5460
export let file_count: "single" | "multiple" | "directory";
5561
export let max_plain_text_length: number;
62+
export let sources: ["microphone" | "upload"] = ["upload"];
63+
export let waveform_options: WaveformOptions = {};
5664
5765
let dragging: boolean;
66+
let active_source: "microphone" | null = null;
67+
let waveform_settings: Record<string, any>;
68+
let color_accent = "darkorange";
69+
70+
onMount(() => {
71+
color_accent = getComputedStyle(document?.documentElement).getPropertyValue(
72+
"--color-accent"
73+
);
74+
set_trim_region_colour();
75+
waveform_settings.waveColor = waveform_options.waveform_color || "#9ca3af";
76+
waveform_settings.progressColor =
77+
waveform_options.waveform_progress_color || color_accent;
78+
waveform_settings.mediaControls = waveform_options.show_controls;
79+
waveform_settings.sampleRate = waveform_options.sample_rate || 44100;
80+
});
81+
82+
$: waveform_settings = {
83+
height: 50,
84+
85+
barWidth: 2,
86+
barGap: 3,
87+
cursorWidth: 2,
88+
cursorColor: "#ddd5e9",
89+
autoplay: false,
90+
barRadius: 10,
91+
dragToSeek: true,
92+
normalize: true,
93+
minPxPerSec: 20
94+
};
95+
96+
const trim_region_settings = {
97+
color: waveform_options.trim_region_color,
98+
drag: true,
99+
resize: true
100+
};
101+
102+
function set_trim_region_colour(): void {
103+
document.documentElement.style.setProperty(
104+
"--trim-region-color",
105+
trim_region_settings.color || color_accent
106+
);
107+
}
58108
</script>
59109

60110
<Block
@@ -80,6 +130,7 @@
80130
bind:value
81131
bind:value_is_output
82132
bind:dragging
133+
bind:active_source
83134
{file_types}
84135
{root}
85136
{label}
@@ -88,14 +139,16 @@
88139
{lines}
89140
{rtl}
90141
{text_align}
142+
{waveform_settings}
143+
i18n={gradio.i18n}
91144
max_lines={!max_lines ? lines + 1 : max_lines}
92145
{placeholder}
93146
{submit_btn}
94147
{stop_btn}
95148
{autofocus}
96-
{container}
97149
{autoscroll}
98150
{file_count}
151+
{sources}
99152
max_file_size={gradio.max_file_size}
100153
on:change={() => gradio.dispatch("change", value)}
101154
on:input={() => gradio.dispatch("input")}
@@ -107,6 +160,11 @@
107160
on:error={({ detail }) => {
108161
gradio.dispatch("error", detail);
109162
}}
163+
on:start_recording={() => gradio.dispatch("start_recording")}
164+
on:pause_recording={() => gradio.dispatch("pause_recording")}
165+
on:stop_recording={() => gradio.dispatch("stop_recording")}
166+
on:upload={(e) => gradio.dispatch("upload", e.detail)}
167+
on:clear={() => gradio.dispatch("clear")}
110168
disabled={!interactive}
111169
upload={(...args) => gradio.client.upload(...args)}
112170
stream_handler={(...args) => gradio.client.stream(...args)}

js/multimodaltextbox/MultimodalTextbox.stories.svelte

+10
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@
4242
description: "Whether to render right-to-left",
4343
control: { type: "boolean" },
4444
defaultValue: false
45+
},
46+
sources: {
47+
options: ["upload", "microphone"],
48+
description: "The sources to enable",
49+
control: { type: "select" },
50+
defaultValue: ["upload", "microphone"]
4551
}
4652
}}
4753
/>
@@ -87,3 +93,7 @@
8793
}
8894
}}
8995
/>
96+
<Story
97+
name="MultimodalTextbox with microphone input"
98+
args={{ sources: ["microphone"] }}
99+
/>

0 commit comments

Comments
 (0)