Skip to content

Commit 11e14b4

Browse files
Merge pull request #67 from FormulaMonks/feat/inline-audio-data
Feat/inline audio data
2 parents 93a974b + c2aadd9 commit 11e14b4

19 files changed

+381
-71
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
11
node_modules
22
dist
33
.env
4-
.turbo
4+
.turbo
5+
.idea
6+
*.iml

examples/basic/package.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,8 +27,8 @@
2727
"@formula-monks/kurt-cache": "workspace:*",
2828
"@formula-monks/kurt-open-ai": "workspace:*",
2929
"@formula-monks/kurt-vertex-ai": "workspace:*",
30-
"@google-cloud/vertexai": "1.1.0",
31-
"openai": "^4.76.0",
30+
"@google-cloud/vertexai": "1.9.3",
31+
"openai": "4.85.1",
3232
"zod": "^3.23.8"
3333
}
3434
}

packages/kurt-cache/src/KurtCache.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -319,6 +319,8 @@ function hashMessages(digest: Hash, messages: KurtMessage[]): Hash {
319319
mayHash(digest, "text", m.text)
320320
mayHash(digest, "imageDataMimeType", m.imageData?.mimeType)
321321
mayHash(digest, "imageDataBase64Data", m.imageData?.base64Data)
322+
mayHash(digest, "inlineDataMimeType", m.inlineData?.mimeType)
323+
mayHash(digest, "inlineDataBase64Data", m.inlineData?.base64Data)
322324
if (m.toolCall) {
323325
mayHash(digest, "toolName", m.toolCall.name)
324326
mayHash(digest, "toolArgs", JSON.stringify(m.toolCall.args))

packages/kurt-open-ai/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
},
2929
"dependencies": {
3030
"@formula-monks/kurt": "^1.4.0",
31-
"openai": "4.76.0",
31+
"openai": "4.85.1",
3232
"zod": "^3.23.8",
3333
"zod-to-json-schema": "^3.23.3"
3434
},

packages/kurt-open-ai/spec/generateNaturalLanguage.spec.ts

Lines changed: 45 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,7 @@ describe("KurtOpenAI generateNaturalLanguage", () => {
5050
)
5151
})
5252

53-
test("describes a base64-encoded image", async () => {
53+
test("describes a base64-encoded image (imageData)", async () => {
5454
const result = await snapshotAndMock("gpt-4o-2024-05-13", (kurt) =>
5555
kurt.generateNaturalLanguage({
5656
prompt: "Describe this emoji, in two words.",
@@ -68,4 +68,48 @@ describe("KurtOpenAI generateNaturalLanguage", () => {
6868
)
6969
expect(result.text).toEqual("Heart eyes")
7070
})
71+
72+
test("describes a base64-encoded image (inlineData)", async () => {
73+
const result = await snapshotAndMock("gpt-4o-2024-05-13", (kurt) =>
74+
kurt.generateNaturalLanguage({
75+
prompt: "Describe this emoji, in two words.",
76+
extraMessages: [
77+
{
78+
role: "user",
79+
inlineData: {
80+
mimeType: "image/png",
81+
base64Data:
82+
"iVBORw0KGgoAAAANSUhEUgAAABgAAAAYCAYAAADgdz34AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAApgAAAKYB3X3/OAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAANCSURBVEiJtZZPbBtFFMZ/M7ubXdtdb1xSFyeilBapySVU8h8OoFaooFSqiihIVIpQBKci6KEg9Q6H9kovIHoCIVQJJCKE1ENFjnAgcaSGC6rEnxBwA04Tx43t2FnvDAfjkNibxgHxnWb2e/u992bee7tCa00YFsffekFY+nUzFtjW0LrvjRXrCDIAaPLlW0nHL0SsZtVoaF98mLrx3pdhOqLtYPHChahZcYYO7KvPFxvRl5XPp1sN3adWiD1ZAqD6XYK1b/dvE5IWryTt2udLFedwc1+9kLp+vbbpoDh+6TklxBeAi9TL0taeWpdmZzQDry0AcO+jQ12RyohqqoYoo8RDwJrU+qXkjWtfi8Xxt58BdQuwQs9qC/afLwCw8tnQbqYAPsgxE1S6F3EAIXux2oQFKm0ihMsOF71dHYx+f3NND68ghCu1YIoePPQN1pGRABkJ6Bus96CutRZMydTl+TvuiRW1m3n0eDl0vRPcEysqdXn+jsQPsrHMquGeXEaY4Yk4wxWcY5V/9scqOMOVUFthatyTy8QyqwZ+kDURKoMWxNKr2EeqVKcTNOajqKoBgOE28U4tdQl5p5bwCw7BWquaZSzAPlwjlithJtp3pTImSqQRrb2Z8PHGigD4RZuNX6JYj6wj7O4TFLbCO/Mn/m8R+h6rYSUb3ekokRY6f/YukArN979jcW+V/S8g0eT/N3VN3kTqWbQ428m9/8k0P/1aIhF36PccEl6EhOcAUCrXKZXXWS3XKd2vc/TRBG9O5ELC17MmWubD2nKhUKZa26Ba2+D3P+4/MNCFwg59oWVeYhkzgN/JDR8deKBoD7Y+ljEjGZ0sosXVTvbc6RHirr2reNy1OXd6pJsQ+gqjk8VWFYmHrwBzW/n+uMPFiRwHB2I7ih8ciHFxIkd/3Omk5tCDV1t+2nNu5sxxpDFNx+huNhVT3/zMDz8usXC3ddaHBj1GHj/As08fwTS7Kt1HBTmyN29vdwAw+/wbwLVOJ3uAD1wi/dUH7Qei66PfyuRj4Ik9is+hglfbkbfR3cnZm7chlUWLdwmprtCohX4HUtlOcQjLYCu+fzGJH2QRKvP3UNz8bWk1qMxjGTOMThZ3kvgLI5AzFfo379UAAAAASUVORK5CYII=",
83+
},
84+
},
85+
],
86+
})
87+
)
88+
expect(result.text).toEqual("Heart eyes")
89+
})
90+
91+
test("throws an error when a message includes inline audio data", async () => {
92+
await snapshotAndMockWithError(
93+
"gpt-4o-2024-05-13",
94+
(kurt) =>
95+
kurt.generateNaturalLanguage({
96+
prompt: "Transcribe this audio file.",
97+
extraMessages: [
98+
{
99+
role: "user",
100+
inlineData: {
101+
mimeType: "audio/mpeg",
102+
base64Data: "DUMMYDATA",
103+
},
104+
},
105+
],
106+
}),
107+
(errorAny) => {
108+
expect(errorAny).toBeInstanceOf(Error)
109+
expect(errorAny.message).toEqual(
110+
"Unsupported image MIME type: audio/mpeg"
111+
)
112+
}
113+
)
114+
})
71115
})

packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image.yaml renamed to packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(imageData).yaml

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -23,45 +23,51 @@ step2RawChunks:
2323
refusal: null
2424
logprobs: null
2525
finish_reason: null
26-
system_fingerprint: fp_5796ac6771
26+
system_fingerprint: fp_279b0a9ade
2727
usage: null
2828
- choices:
2929
- index: 0
3030
delta:
3131
content: Heart
3232
logprobs: null
3333
finish_reason: null
34-
system_fingerprint: fp_5796ac6771
34+
system_fingerprint: fp_279b0a9ade
3535
usage: null
3636
- choices:
3737
- index: 0
3838
delta:
3939
content: " eyes"
4040
logprobs: null
4141
finish_reason: null
42-
system_fingerprint: fp_5796ac6771
42+
system_fingerprint: fp_279b0a9ade
4343
usage: null
4444
- choices:
4545
- index: 0
4646
delta: {}
4747
logprobs: null
4848
finish_reason: stop
49-
system_fingerprint: fp_5796ac6771
49+
system_fingerprint: fp_279b0a9ade
5050
usage: null
5151
- choices: []
52-
system_fingerprint: fp_5796ac6771
52+
system_fingerprint: fp_279b0a9ade
5353
usage:
54-
prompt_tokens: 270
55-
completion_tokens: 2
56-
total_tokens: 272
54+
prompt_tokens: 455
55+
completion_tokens: 3
56+
total_tokens: 458
57+
prompt_tokens_details:
58+
cached_tokens: 0
59+
audio_tokens: 0
5760
completion_tokens_details:
5861
reasoning_tokens: 0
62+
audio_tokens: 0
63+
accepted_prediction_tokens: 0
64+
rejected_prediction_tokens: 0
5965
step3KurtEvents:
6066
- chunk: Heart
6167
- chunk: " eyes"
6268
- finished: true
6369
text: Heart eyes
6470
metadata:
65-
totalInputTokens: 270
66-
totalOutputTokens: 2
67-
systemFingerprint: fp_5796ac6771
71+
totalInputTokens: 455
72+
totalOutputTokens: 3
73+
systemFingerprint: fp_279b0a9ade
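[New file added in this commit; its file header was not captured in this view. Presumably packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(inlineData).yaml — to be confirmed against the commit.]

Original file line number | Diff line number | Diff line change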
@@ -0,0 +1,73 @@
1+
step1Request:
2+
stream: true
3+
stream_options:
4+
include_usage: true
5+
model: gpt-4o-2024-05-13
6+
max_tokens: 4096
7+
temperature: 0.5
8+
top_p: 0.95
9+
messages:
10+
- role: user
11+
content:
12+
- type: text
13+
text: Describe this emoji, in two words.
14+
- type: image_url
15+
image_url:
16+
url: 
17+
step2RawChunks:
18+
- choices:
19+
- index: 0
20+
delta:
21+
role: assistant
22+
content: ""
23+
refusal: null
24+
logprobs: null
25+
finish_reason: null
26+
system_fingerprint: fp_279b0a9ade
27+
usage: null
28+
- choices:
29+
- index: 0
30+
delta:
31+
content: Heart
32+
logprobs: null
33+
finish_reason: null
34+
system_fingerprint: fp_279b0a9ade
35+
usage: null
36+
- choices:
37+
- index: 0
38+
delta:
39+
content: " eyes"
40+
logprobs: null
41+
finish_reason: null
42+
system_fingerprint: fp_279b0a9ade
43+
usage: null
44+
- choices:
45+
- index: 0
46+
delta: {}
47+
logprobs: null
48+
finish_reason: stop
49+
system_fingerprint: fp_279b0a9ade
50+
usage: null
51+
- choices: []
52+
system_fingerprint: fp_279b0a9ade
53+
usage:
54+
prompt_tokens: 455
55+
completion_tokens: 3
56+
total_tokens: 458
57+
prompt_tokens_details:
58+
cached_tokens: 0
59+
audio_tokens: 0
60+
completion_tokens_details:
61+
reasoning_tokens: 0
62+
audio_tokens: 0
63+
accepted_prediction_tokens: 0
64+
rejected_prediction_tokens: 0
65+
step3KurtEvents:
66+
- chunk: Heart
67+
- chunk: " eyes"
68+
- finished: true
69+
text: Heart eyes
70+
metadata:
71+
totalInputTokens: 455
72+
totalOutputTokens: 3
73+
systemFingerprint: fp_279b0a9ade

packages/kurt-open-ai/src/KurtOpenAI.ts

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -247,7 +247,7 @@ function toOpenAIMessages(messages: KurtMessage[]): OpenAIMessage[] {
247247
}
248248

249249
for (const [messageIndex, message] of messages.entries()) {
250-
const { text, toolCall, imageData } = message
250+
const { text, toolCall, imageData, inlineData } = message
251251
if (text) {
252252
const role = openAIRoleMapping[message.role]
253253

@@ -284,8 +284,8 @@ function toOpenAIMessages(messages: KurtMessage[]): OpenAIMessage[] {
284284
tool_call_id: id,
285285
content: JSON.stringify(result),
286286
})
287-
} else if (imageData && message.role === "user") {
288-
const { mimeType, base64Data } = imageData
287+
} else if ((imageData || inlineData) && message.role === "user") {
288+
const { mimeType, base64Data } = inlineData ?? imageData
289289

290290
// OpenAI only supports the following MIME types, according to these docs:
291291
// https://platform.openai.com/docs/guides/vision

packages/kurt-vertex-ai/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828
},
2929
"dependencies": {
3030
"@formula-monks/kurt": "^1.4.0",
31-
"@google-cloud/vertexai": "1.1.0",
31+
"@google-cloud/vertexai": "1.9.3",
3232
"zod": "^3.23.8",
3333
"zod-to-json-schema": "^3.23.3"
3434
},
12.9 KB
Binary file not shown.

0 commit comments

Comments
 (0)