diff --git a/.gitignore b/.gitignore index bdec31e..ac7d3d9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ node_modules dist .env -.turbo \ No newline at end of file +.turbo +.idea +*.iml diff --git a/examples/basic/package.json b/examples/basic/package.json index 65a48ea..05cab3c 100644 --- a/examples/basic/package.json +++ b/examples/basic/package.json @@ -27,8 +27,8 @@ "@formula-monks/kurt-cache": "workspace:*", "@formula-monks/kurt-open-ai": "workspace:*", "@formula-monks/kurt-vertex-ai": "workspace:*", - "@google-cloud/vertexai": "1.1.0", - "openai": "^4.76.0", + "@google-cloud/vertexai": "1.9.3", + "openai": "4.85.1", "zod": "^3.23.8" } } diff --git a/packages/kurt-cache/src/KurtCache.ts b/packages/kurt-cache/src/KurtCache.ts index 29ed8bc..92d7d79 100644 --- a/packages/kurt-cache/src/KurtCache.ts +++ b/packages/kurt-cache/src/KurtCache.ts @@ -319,6 +319,8 @@ function hashMessages(digest: Hash, messages: KurtMessage[]): Hash { mayHash(digest, "text", m.text) mayHash(digest, "imageDataMimeType", m.imageData?.mimeType) mayHash(digest, "imageDataBase64Data", m.imageData?.base64Data) + mayHash(digest, "inlineDataMimeType", m.inlineData?.mimeType) + mayHash(digest, "inlineDataBase64Data", m.inlineData?.base64Data) if (m.toolCall) { mayHash(digest, "toolName", m.toolCall.name) mayHash(digest, "toolArgs", JSON.stringify(m.toolCall.args)) diff --git a/packages/kurt-open-ai/package.json b/packages/kurt-open-ai/package.json index 77066f7..510f8d5 100644 --- a/packages/kurt-open-ai/package.json +++ b/packages/kurt-open-ai/package.json @@ -28,7 +28,7 @@ }, "dependencies": { "@formula-monks/kurt": "^1.4.0", - "openai": "4.76.0", + "openai": "4.85.1", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.3" }, diff --git a/packages/kurt-open-ai/spec/generateNaturalLanguage.spec.ts b/packages/kurt-open-ai/spec/generateNaturalLanguage.spec.ts index 3b76366..9c27ece 100644 --- a/packages/kurt-open-ai/spec/generateNaturalLanguage.spec.ts +++ 
b/packages/kurt-open-ai/spec/generateNaturalLanguage.spec.ts @@ -50,7 +50,7 @@ describe("KurtOpenAI generateNaturalLanguage", () => { ) }) - test("describes a base64-encoded image", async () => { + test("describes a base64-encoded image (imageData)", async () => { const result = await snapshotAndMock("gpt-4o-2024-05-13", (kurt) => kurt.generateNaturalLanguage({ prompt: "Describe this emoji, in two words.", @@ -68,4 +68,48 @@ describe("KurtOpenAI generateNaturalLanguage", () => { ) expect(result.text).toEqual("Heart eyes") }) + + test("describes a base64-encoded image (inlineData)", async () => { + const result = await snapshotAndMock("gpt-4o-2024-05-13", (kurt) => + kurt.generateNaturalLanguage({ + prompt: "Describe this emoji, in two words.", + extraMessages: [ + { + role: "user", + inlineData: { + mimeType: "image/png", + base64Data: + "iVBORw0KGgoAAAANSUhEUgAAABgAAAAYCAYAAADgdz34AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAApgAAAKYB3X3/OAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAANCSURBVEiJtZZPbBtFFMZ/M7ubXdtdb1xSFyeilBapySVU8h8OoFaooFSqiihIVIpQBKci6KEg9Q6H9kovIHoCIVQJJCKE1ENFjnAgcaSGC6rEnxBwA04Tx43t2FnvDAfjkNibxgHxnWb2e/u992bee7tCa00YFsffekFY+nUzFtjW0LrvjRXrCDIAaPLlW0nHL0SsZtVoaF98mLrx3pdhOqLtYPHChahZcYYO7KvPFxvRl5XPp1sN3adWiD1ZAqD6XYK1b/dvE5IWryTt2udLFedwc1+9kLp+vbbpoDh+6TklxBeAi9TL0taeWpdmZzQDry0AcO+jQ12RyohqqoYoo8RDwJrU+qXkjWtfi8Xxt58BdQuwQs9qC/afLwCw8tnQbqYAPsgxE1S6F3EAIXux2oQFKm0ihMsOF71dHYx+f3NND68ghCu1YIoePPQN1pGRABkJ6Bus96CutRZMydTl+TvuiRW1m3n0eDl0vRPcEysqdXn+jsQPsrHMquGeXEaY4Yk4wxWcY5V/9scqOMOVUFthatyTy8QyqwZ+kDURKoMWxNKr2EeqVKcTNOajqKoBgOE28U4tdQl5p5bwCw7BWquaZSzAPlwjlithJtp3pTImSqQRrb2Z8PHGigD4RZuNX6JYj6wj7O4TFLbCO/Mn/m8R+h6rYSUb3ekokRY6f/YukArN979jcW+V/S8g0eT/N3VN3kTqWbQ428m9/8k0P/1aIhF36PccEl6EhOcAUCrXKZXXWS3XKd2vc/TRBG9O5ELC17MmWubD2nKhUKZa26Ba2+D3P+4/MNCFwg59oWVeYhkzgN/JDR8deKBoD7Y+ljEjGZ0sosXVTvbc6RHirr2reNy1OXd6pJsQ+gqjk8VWFYmHrwBzW/n+uMPFiRwHB2I7ih8ciHFxIkd/3Omk5tCDV1t+2nNu5sxxpDFNx+huNhVT3/zMDz8usXC3ddaHBj1GHj/As08fwTS7Kt1HBTmyN29vdwAw+/wbwLVOJ
3uAD1wi/dUH7Qei66PfyuRj4Ik9is+hglfbkbfR3cnZm7chlUWLdwmprtCohX4HUtlOcQjLYCu+fzGJH2QRKvP3UNz8bWk1qMxjGTOMThZ3kvgLI5AzFfo379UAAAAASUVORK5CYII=", + }, + }, + ], + }) + ) + expect(result.text).toEqual("Heart eyes") + }) + + test("throws an error when a message includes inline audio data", async () => { + await snapshotAndMockWithError( + "gpt-4o-2024-05-13", + (kurt) => + kurt.generateNaturalLanguage({ + prompt: "Transcribe this audio file.", + extraMessages: [ + { + role: "user", + inlineData: { + mimeType: "audio/mpeg", + base64Data: "DUMMYDATA", + }, + }, + ], + }), + (errorAny) => { + expect(errorAny).toBeInstanceOf(Error) + expect(errorAny.message).toEqual( + "Unsupported image MIME type: audio/mpeg" + ) + } + ) + }) }) diff --git a/packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image.yaml b/packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(imageData).yaml similarity index 81% rename from packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image.yaml rename to packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(imageData).yaml index 93c043a..69c97ef 100644 --- a/packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image.yaml +++ b/packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(imageData).yaml @@ -23,7 +23,7 @@ step2RawChunks: refusal: null logprobs: null finish_reason: null - system_fingerprint: fp_5796ac6771 + system_fingerprint: fp_279b0a9ade usage: null - choices: - index: 0 @@ -31,7 +31,7 @@ step2RawChunks: content: Heart logprobs: null finish_reason: null - system_fingerprint: fp_5796ac6771 + system_fingerprint: fp_279b0a9ade usage: null - choices: - index: 0 @@ -39,29 +39,35 @@ step2RawChunks: content: " eyes" logprobs: null finish_reason: null - 
system_fingerprint: fp_5796ac6771 + system_fingerprint: fp_279b0a9ade usage: null - choices: - index: 0 delta: {} logprobs: null finish_reason: stop - system_fingerprint: fp_5796ac6771 + system_fingerprint: fp_279b0a9ade usage: null - choices: [] - system_fingerprint: fp_5796ac6771 + system_fingerprint: fp_279b0a9ade usage: - prompt_tokens: 270 - completion_tokens: 2 - total_tokens: 272 + prompt_tokens: 455 + completion_tokens: 3 + total_tokens: 458 + prompt_tokens_details: + cached_tokens: 0 + audio_tokens: 0 completion_tokens_details: reasoning_tokens: 0 + audio_tokens: 0 + accepted_prediction_tokens: 0 + rejected_prediction_tokens: 0 step3KurtEvents: - chunk: Heart - chunk: " eyes" - finished: true text: Heart eyes metadata: - totalInputTokens: 270 - totalOutputTokens: 2 - systemFingerprint: fp_5796ac6771 + totalInputTokens: 455 + totalOutputTokens: 3 + systemFingerprint: fp_279b0a9ade diff --git a/packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(inlineData).yaml b/packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(inlineData).yaml new file mode 100644 index 0000000..69c97ef --- /dev/null +++ b/packages/kurt-open-ai/spec/snapshots/KurtOpenAI_generateNaturalLanguage_describes_a_base64-encoded_image_(inlineData).yaml @@ -0,0 +1,73 @@ +step1Request: + stream: true + stream_options: + include_usage: true + model: gpt-4o-2024-05-13 + max_tokens: 4096 + temperature: 0.5 + top_p: 0.95 + messages: + - role: user + content: + - type: text + text: Describe this emoji, in two words. 
+ - type: image_url + image_url: + url:  +step2RawChunks: + - choices: + - index: 0 + delta: + role: assistant + content: "" + refusal: null + logprobs: null + finish_reason: null + system_fingerprint: fp_279b0a9ade + usage: null + - choices: + - index: 0 + delta: + content: Heart + logprobs: null + finish_reason: null + system_fingerprint: fp_279b0a9ade + usage: null + - choices: + - index: 0 + delta: + content: " eyes" + logprobs: null + finish_reason: null + system_fingerprint: fp_279b0a9ade + usage: null + - choices: + - index: 0 + delta: {} + logprobs: null + finish_reason: stop + system_fingerprint: fp_279b0a9ade + usage: null + - choices: [] + system_fingerprint: fp_279b0a9ade + usage: + prompt_tokens: 455 + completion_tokens: 3 + total_tokens: 458 + prompt_tokens_details: + cached_tokens: 0 + audio_tokens: 0 + completion_tokens_details: + reasoning_tokens: 0 + audio_tokens: 0 + accepted_prediction_tokens: 0 + rejected_prediction_tokens: 0 +step3KurtEvents: + - chunk: Heart + - chunk: " eyes" + - finished: true + text: Heart eyes + metadata: + totalInputTokens: 455 + totalOutputTokens: 3 + systemFingerprint: fp_279b0a9ade diff --git a/packages/kurt-open-ai/src/KurtOpenAI.ts b/packages/kurt-open-ai/src/KurtOpenAI.ts index ba43d8b..ad256e3 100644 --- a/packages/kurt-open-ai/src/KurtOpenAI.ts +++ b/packages/kurt-open-ai/src/KurtOpenAI.ts @@ -247,7 +247,7 @@ function toOpenAIMessages(messages: KurtMessage[]): OpenAIMessage[] { } for (const [messageIndex, message] of messages.entries()) { - const { text, toolCall, imageData } = message + const { text, toolCall, imageData, inlineData } = message if (text) { const role = openAIRoleMapping[message.role] @@ -284,8 +284,8 @@ function toOpenAIMessages(messages: KurtMessage[]): OpenAIMessage[] { tool_call_id: id, content: JSON.stringify(result), }) - } else if (imageData && message.role === "user") { - const { mimeType, base64Data } = imageData + } else if ((imageData || inlineData) && message.role === "user") { + const 
{ mimeType, base64Data } = inlineData ?? imageData // OpenAI only supports the following MIME types, according to these docs: // https://platform.openai.com/docs/guides/vision diff --git a/packages/kurt-vertex-ai/package.json b/packages/kurt-vertex-ai/package.json index 26d9b76..3fb3e10 100644 --- a/packages/kurt-vertex-ai/package.json +++ b/packages/kurt-vertex-ai/package.json @@ -28,7 +28,7 @@ }, "dependencies": { "@formula-monks/kurt": "^1.4.0", - "@google-cloud/vertexai": "1.1.0", + "@google-cloud/vertexai": "1.9.3", "zod": "^3.23.8", "zod-to-json-schema": "^3.23.3" }, diff --git a/packages/kurt-vertex-ai/spec/data/HelloWorld.mp3 b/packages/kurt-vertex-ai/spec/data/HelloWorld.mp3 new file mode 100644 index 0000000..c4f149a Binary files /dev/null and b/packages/kurt-vertex-ai/spec/data/HelloWorld.mp3 differ diff --git a/packages/kurt-vertex-ai/spec/generateNaturalLanguage.spec.ts b/packages/kurt-vertex-ai/spec/generateNaturalLanguage.spec.ts index 1864734..a9cdb81 100644 --- a/packages/kurt-vertex-ai/spec/generateNaturalLanguage.spec.ts +++ b/packages/kurt-vertex-ai/spec/generateNaturalLanguage.spec.ts @@ -60,7 +60,7 @@ describe("KurtVertexAI generateNaturalLanguage", () => { ) }) - test("describes a base64-encoded image", async () => { + test("describes a base64-encoded image (imageData)", async () => { const result = await snapshotAndMock((kurt) => kurt.generateNaturalLanguage({ prompt: "Describe this emoji, in two words.", @@ -78,4 +78,23 @@ describe("KurtVertexAI generateNaturalLanguage", () => { ) expect(result.text).toEqual("Lovestruck smile \n") }) + + test("describes a base64-encoded image (inlineData)", async () => { + const result = await snapshotAndMock((kurt) => + kurt.generateNaturalLanguage({ + prompt: "Describe this emoji, in two words.", + extraMessages: [ + { + role: "user", + inlineData: { + mimeType: "image/png", + base64Data: + 
"iVBORw0KGgoAAAANSUhEUgAAABgAAAAYCAYAAADgdz34AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAApgAAAKYB3X3/OAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAANCSURBVEiJtZZPbBtFFMZ/M7ubXdtdb1xSFyeilBapySVU8h8OoFaooFSqiihIVIpQBKci6KEg9Q6H9kovIHoCIVQJJCKE1ENFjnAgcaSGC6rEnxBwA04Tx43t2FnvDAfjkNibxgHxnWb2e/u992bee7tCa00YFsffekFY+nUzFtjW0LrvjRXrCDIAaPLlW0nHL0SsZtVoaF98mLrx3pdhOqLtYPHChahZcYYO7KvPFxvRl5XPp1sN3adWiD1ZAqD6XYK1b/dvE5IWryTt2udLFedwc1+9kLp+vbbpoDh+6TklxBeAi9TL0taeWpdmZzQDry0AcO+jQ12RyohqqoYoo8RDwJrU+qXkjWtfi8Xxt58BdQuwQs9qC/afLwCw8tnQbqYAPsgxE1S6F3EAIXux2oQFKm0ihMsOF71dHYx+f3NND68ghCu1YIoePPQN1pGRABkJ6Bus96CutRZMydTl+TvuiRW1m3n0eDl0vRPcEysqdXn+jsQPsrHMquGeXEaY4Yk4wxWcY5V/9scqOMOVUFthatyTy8QyqwZ+kDURKoMWxNKr2EeqVKcTNOajqKoBgOE28U4tdQl5p5bwCw7BWquaZSzAPlwjlithJtp3pTImSqQRrb2Z8PHGigD4RZuNX6JYj6wj7O4TFLbCO/Mn/m8R+h6rYSUb3ekokRY6f/YukArN979jcW+V/S8g0eT/N3VN3kTqWbQ428m9/8k0P/1aIhF36PccEl6EhOcAUCrXKZXXWS3XKd2vc/TRBG9O5ELC17MmWubD2nKhUKZa26Ba2+D3P+4/MNCFwg59oWVeYhkzgN/JDR8deKBoD7Y+ljEjGZ0sosXVTvbc6RHirr2reNy1OXd6pJsQ+gqjk8VWFYmHrwBzW/n+uMPFiRwHB2I7ih8ciHFxIkd/3Omk5tCDV1t+2nNu5sxxpDFNx+huNhVT3/zMDz8usXC3ddaHBj1GHj/As08fwTS7Kt1HBTmyN29vdwAw+/wbwLVOJ3uAD1wi/dUH7Qei66PfyuRj4Ik9is+hglfbkbfR3cnZm7chlUWLdwmprtCohX4HUtlOcQjLYCu+fzGJH2QRKvP3UNz8bWk1qMxjGTOMThZ3kvgLI5AzFfo379UAAAAASUVORK5CYII=", + }, + }, + ], + }) + ) + expect(result.text).toEqual("Lovestruck smile \n") + }) }) diff --git a/packages/kurt-vertex-ai/spec/generateStructuredData.spec.ts b/packages/kurt-vertex-ai/spec/generateStructuredData.spec.ts index 82d9c5c..b24a7a0 100644 --- a/packages/kurt-vertex-ai/spec/generateStructuredData.spec.ts +++ b/packages/kurt-vertex-ai/spec/generateStructuredData.spec.ts @@ -1,10 +1,11 @@ -import { describe, test, expect } from "@jest/globals" +import { describe, expect, test } from "@jest/globals" import { z } from "zod" import { snapshotAndMock, snapshotAndMockWithError } from "./snapshots" import { KurtCapabilityError, KurtResultValidateError, } from "@formula-monks/kurt" +import { 
promises as fs } from "node:fs" describe("KurtVertexAI generateStructuredData", () => { test("says hello (response format 1)", async () => { @@ -108,4 +109,32 @@ describe("KurtVertexAI generateStructuredData", () => { } ) }) + + test("transcribes a base64-encoded audio", async () => { + const base64Data = await fs.readFile("spec/data/HelloWorld.mp3", { + encoding: "base64", + }) + const result = await snapshotAndMock((kurt) => + kurt.generateStructuredData({ + prompt: "Transcribe this audio file.", + extraMessages: [ + { + role: "user", + inlineData: { + mimeType: "audio/mpeg", + base64Data, + }, + }, + ], + schema: z + .object({ + transcription: z + .string() + .describe("The transcription of the audio"), + }) + .describe("Result of transcribing an audio file"), + }) + ) + expect(result.data).toEqual({ transcription: "Hello world" }) + }) }) diff --git a/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateNaturalLanguage_describes_a_base64-encoded_image.yaml b/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateNaturalLanguage_describes_a_base64-encoded_image_(imageData).yaml similarity index 100% rename from packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateNaturalLanguage_describes_a_base64-encoded_image.yaml rename to packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateNaturalLanguage_describes_a_base64-encoded_image_(imageData).yaml diff --git a/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateNaturalLanguage_describes_a_base64-encoded_image_(inlineData).yaml b/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateNaturalLanguage_describes_a_base64-encoded_image_(inlineData).yaml new file mode 100644 index 0000000..6f31543 --- /dev/null +++ b/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateNaturalLanguage_describes_a_base64-encoded_image_(inlineData).yaml @@ -0,0 +1,63 @@ +step1Request: + generationConfig: + maxOutputTokens: 4096 + temperature: 0.5 + topP: 0.95 + contents: + - role: user + parts: + 
- text: Describe this emoji, in two words. + - role: user + parts: + - inlineData: + mimeType: image/png + data: iVBORw0KGgoAAAANSUhEUgAAABgAAAAYCAYAAADgdz34AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAApgAAAKYB3X3/OAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAANCSURBVEiJtZZPbBtFFMZ/M7ubXdtdb1xSFyeilBapySVU8h8OoFaooFSqiihIVIpQBKci6KEg9Q6H9kovIHoCIVQJJCKE1ENFjnAgcaSGC6rEnxBwA04Tx43t2FnvDAfjkNibxgHxnWb2e/u992bee7tCa00YFsffekFY+nUzFtjW0LrvjRXrCDIAaPLlW0nHL0SsZtVoaF98mLrx3pdhOqLtYPHChahZcYYO7KvPFxvRl5XPp1sN3adWiD1ZAqD6XYK1b/dvE5IWryTt2udLFedwc1+9kLp+vbbpoDh+6TklxBeAi9TL0taeWpdmZzQDry0AcO+jQ12RyohqqoYoo8RDwJrU+qXkjWtfi8Xxt58BdQuwQs9qC/afLwCw8tnQbqYAPsgxE1S6F3EAIXux2oQFKm0ihMsOF71dHYx+f3NND68ghCu1YIoePPQN1pGRABkJ6Bus96CutRZMydTl+TvuiRW1m3n0eDl0vRPcEysqdXn+jsQPsrHMquGeXEaY4Yk4wxWcY5V/9scqOMOVUFthatyTy8QyqwZ+kDURKoMWxNKr2EeqVKcTNOajqKoBgOE28U4tdQl5p5bwCw7BWquaZSzAPlwjlithJtp3pTImSqQRrb2Z8PHGigD4RZuNX6JYj6wj7O4TFLbCO/Mn/m8R+h6rYSUb3ekokRY6f/YukArN979jcW+V/S8g0eT/N3VN3kTqWbQ428m9/8k0P/1aIhF36PccEl6EhOcAUCrXKZXXWS3XKd2vc/TRBG9O5ELC17MmWubD2nKhUKZa26Ba2+D3P+4/MNCFwg59oWVeYhkzgN/JDR8deKBoD7Y+ljEjGZ0sosXVTvbc6RHirr2reNy1OXd6pJsQ+gqjk8VWFYmHrwBzW/n+uMPFiRwHB2I7ih8ciHFxIkd/3Omk5tCDV1t+2nNu5sxxpDFNx+huNhVT3/zMDz8usXC3ddaHBj1GHj/As08fwTS7Kt1HBTmyN29vdwAw+/wbwLVOJ3uAD1wi/dUH7Qei66PfyuRj4Ik9is+hglfbkbfR3cnZm7chlUWLdwmprtCohX4HUtlOcQjLYCu+fzGJH2QRKvP3UNz8bWk1qMxjGTOMThZ3kvgLI5AzFfo379UAAAAASUVORK5CYII= +step2RawChunks: + - candidates: + - content: + role: model + parts: + - text: Loves + index: 0 + usageMetadata: + promptTokenCount: 266 + candidatesTokenCount: 1 + totalTokenCount: 267 + promptTokensDetails: + - modality: TEXT + tokenCount: 8 + - modality: IMAGE + tokenCount: 258 + candidatesTokensDetails: + - modality: TEXT + tokenCount: 1 + - candidates: + - content: + role: model + parts: + - text: | + truck smile + finishReason: STOP + index: 0 + usageMetadata: + promptTokenCount: 266 + candidatesTokenCount: 5 + totalTokenCount: 271 + promptTokensDetails: + - modality: TEXT + 
tokenCount: 8 + - modality: IMAGE + tokenCount: 258 + candidatesTokensDetails: + - modality: TEXT + tokenCount: 5 +step3KurtEvents: + - chunk: Loves + - chunk: | + truck smile + - finished: true + text: | + Lovestruck smile + metadata: + totalInputTokens: 266 + totalOutputTokens: 5 diff --git a/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateStructuredData_transcribes_a_base64-encoded_audio.yaml b/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateStructuredData_transcribes_a_base64-encoded_audio.yaml new file mode 100644 index 0000000..a3e478e --- /dev/null +++ b/packages/kurt-vertex-ai/spec/snapshots/KurtVertexAI_generateStructuredData_transcribes_a_base64-encoded_audio.yaml @@ -0,0 +1,63 @@ +step1Request: + generationConfig: + maxOutputTokens: 4096 + temperature: 0.5 + topP: 0.95 + contents: + - role: user + parts: + - text: Transcribe this audio file. + - role: user + parts: + - inlineData: + mimeType: audio/mpeg + data:  + tools: + - functionDeclarations: + - name: structured_data + description: Result of transcribing an audio file + parameters: + type: object + properties: + transcription: + type: string + description: The transcription of the audio + required: + - transcription + tool_config: + function_calling_config: + mode: ANY + allowed_function_names: + - structured_data +step2RawChunks: + - candidates: + - content: + role: model + parts: + - functionCall: + name: structured_data + args: + transcription: Hello world + finishReason: STOP + index: 0 + usageMetadata: + promptTokenCount: 48 + candidatesTokenCount: 7 + totalTokenCount: 55 + promptTokensDetails: + - modality: TEXT + tokenCount: 23 + - modality: AUDIO + tokenCount: 25 + candidatesTokensDetails: + - modality: TEXT + tokenCount: 7 +step3KurtEvents: + - chunk: '{"transcription":"Hello world"}' + - finished: true + text: '{"transcription":"Hello world"}' + data: + transcription: Hello world + metadata: + totalInputTokens: 48 + totalOutputTokens: 7 diff --git 
a/packages/kurt-vertex-ai/src/KurtVertexAI.ts b/packages/kurt-vertex-ai/src/KurtVertexAI.ts index 0d5b2bc..b905f2b 100644 --- a/packages/kurt-vertex-ai/src/KurtVertexAI.ts +++ b/packages/kurt-vertex-ai/src/KurtVertexAI.ts @@ -1,9 +1,14 @@ import "./VertexAI.patch.generateContentStream" // monkey-patches VertexAI GenerativeModel.prototype.generateContentStream - import zodToJsonSchema from "zod-to-json-schema" import { type KurtAdapterV1, - type KurtStreamEvent, + KurtCapabilityError, + type KurtMessage, + type KurtResult, + KurtResultBlockedError, + KurtResultLimitError, + KurtResultValidateError, + type KurtSamplingOptions, type KurtSchema, type KurtSchemaInner, type KurtSchemaInnerMap, @@ -13,13 +18,7 @@ import { type KurtSchemaMaybe, type KurtSchemaResult, type KurtSchemaResultMaybe, - type KurtMessage, - type KurtSamplingOptions, - type KurtResult, - KurtResultValidateError, - KurtResultLimitError, - KurtResultBlockedError, - KurtCapabilityError, + type KurtStreamEvent, } from "@formula-monks/kurt" import type { VertexAI, @@ -27,7 +26,6 @@ import type { VertexAIMessage, VertexAIRequest, VertexAIResponseChunk, - VertexAIResponseChunkCandidate, VertexAIResponseFunctionCall, VertexAISchema, VertexAITool, @@ -171,7 +169,7 @@ function toVertexAIMessages(messages: KurtMessage[]): VertexAIMessage[] { const vertexAIMessages: VertexAIMessage[] = [] for (const message of messages) { - const { role, text, toolCall, imageData } = message + const { role, text, toolCall, imageData, inlineData } = message if (text) { vertexAIMessages.push({ role, parts: [{ text }] }) } else if (toolCall) { @@ -180,10 +178,10 @@ function toVertexAIMessages(messages: KurtMessage[]): VertexAIMessage[] { const functionResponse = { name, response: result } vertexAIMessages.push({ role, parts: [{ functionCall }] }) vertexAIMessages.push({ role, parts: [{ functionResponse }] }) - } else if (imageData) { - const { mimeType, base64Data } = imageData - const inlineData = { mimeType, data: base64Data } 
- vertexAIMessages.push({ role, parts: [{ inlineData }] }) + } else if (imageData || inlineData) { + const { mimeType, base64Data } = inlineData ?? imageData + const dataPart = { mimeType, data: base64Data } + vertexAIMessages.push({ role, parts: [{ inlineData: dataPart }] }) } else { throw new Error(`Invalid KurtMessage: ${JSON.stringify(message)}`) } diff --git a/packages/kurt-vertex-ai/src/VertexAI.patch.generateContentStream.ts b/packages/kurt-vertex-ai/src/VertexAI.patch.generateContentStream.ts index 1edb637..d2d115b 100644 --- a/packages/kurt-vertex-ai/src/VertexAI.patch.generateContentStream.ts +++ b/packages/kurt-vertex-ai/src/VertexAI.patch.generateContentStream.ts @@ -66,7 +66,6 @@ import { export async function generateContent( location: string, - project: string, publisherModelEndpoint: string, token: Promise, request: GenerateContentRequest | string, @@ -96,7 +95,6 @@ export async function generateContent( } const response: Response | undefined = await postRequest({ region: location, - project: project, resourcePath: publisherModelEndpoint, resourceMethod: constants.GENERATE_CONTENT_METHOD, token: await token, @@ -122,7 +120,6 @@ export async function generateContent( */ export async function generateContentStream( location: string, - project: string, publisherModelEndpoint: string, token: Promise, request: GenerateContentRequest | string, @@ -151,7 +148,6 @@ export async function generateContentStream( } const response = await postRequest({ region: location, - project: project, resourcePath: publisherModelEndpoint, resourceMethod: constants.STREAMING_GENERATE_CONTENT_METHOD, token: await token, @@ -183,7 +179,6 @@ async function generateContentStreamPATCHED( const _this = this as any return generateContentStream( _this.location, - _this.project, _this.publisherModelEndpoint, _this.fetchToken(), request, diff --git a/packages/kurt/src/Kurt.ts b/packages/kurt/src/Kurt.ts index 54570e9..f92445b 100644 --- a/packages/kurt/src/Kurt.ts +++ 
b/packages/kurt/src/Kurt.ts @@ -98,7 +98,7 @@ export class Kurt { * - autonomous, open-ended decision-making or action-taking * * The `data` field of the result will be `undefined` if Kurt decides to - * generate natural language. Otherwise it will contain a tool call. + * generate natural language. Otherwise, it will contain a tool call. * * Your application can decide if and how it should fulfill the tool call. * @@ -198,25 +198,23 @@ export type KurtMessage = { text: string /** * When present, this is an image data message, with a base64-encoded image. * This is often used with "multi-modal" LLMs that support image mode input. * * Not all LLM providers or underlying models support this kind of message. - * Check your LLM provider's documentation for confirmaton. + * Check your LLM provider's documentation for confirmation. + * @deprecated Use `inlineData` instead. */ - imageData: { - /** - * The IANA standard MIME type of the inline image data. - * - * Not all MIME types are supported by all LLM providers. - * "image/png" and "image/jpeg" are the most commonly supported. - * Check your LLM provider's documentation for the right list. - */ - mimeType: string + imageData: KurtInlineData - /** Base64-encoded image data, as a string. */ - base64Data: string - } + /** + * When present, this is a base64-encoded data message (e.g. image, audio). + * This is often used with "multi-modal" LLMs that support image/audio mode input. + * + * Not all LLM providers or underlying models support this kind of message. + * Check your LLM provider's documentation for confirmation. + */ + inlineData: KurtInlineData /** * When present, this is a tool call message, with structured data input @@ -251,6 +249,21 @@ export type KurtMessage = { } }> +export interface KurtInlineData { + /** + * The IANA standard MIME type of the inline data. + * + * Not all MIME types are supported by all LLM providers. + * OpenAI's GPT, for example, supports only images.
+ * Gemini supports both images and audio. + * Check your LLM provider's documentation for the right list. + */ + mimeType: string + + /** Base64-encoded data, as a string. */ + base64Data: string +} + export interface KurtCreateOptions { /** * The default system prompt to use, for any generation method call which @@ -294,7 +307,7 @@ export const KurtSamplingOptionsDefault = { /** * Maximum number of output tokens to sample from the model. * - * This is mean to be a cost control measure, to protect against scenarios + * This is meant to be a cost control measure, to protect against scenarios * where the model might get "stuck" and generate excessive output. * * When the model hits the output limit, whatever it has generated will diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 02bc197..aa95136 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -48,11 +48,11 @@ importers: specifier: workspace:* version: link:../../packages/kurt-vertex-ai '@google-cloud/vertexai': - specifier: 1.1.0 - version: 1.1.0 + specifier: 1.9.3 + version: 1.9.3 openai: - specifier: ^4.76.0 - version: 4.76.0(zod@3.23.8) + specifier: 4.85.1 + version: 4.85.1(zod@3.23.8) zod: specifier: ^3.23.8 version: 3.23.8 @@ -156,8 +156,8 @@ importers: specifier: workspace:^ version: link:../kurt openai: - specifier: 4.76.0 - version: 4.76.0(zod@3.23.8) + specifier: 4.85.1 + version: 4.85.1(zod@3.23.8) zod: specifier: ^3.23.8 version: 3.23.8 @@ -199,8 +199,8 @@ importers: specifier: workspace:^ version: link:../kurt '@google-cloud/vertexai': - specifier: 1.1.0 - version: 1.1.0 + specifier: 1.9.3 + version: 1.9.3 zod: specifier: ^3.23.8 version: 3.23.8 @@ -673,8 +673,8 @@ packages: cpu: [x64] os: [win32] - '@google-cloud/vertexai@1.1.0': - resolution: {integrity: sha512-hfwfdlVpJ+kM6o2b5UFfPnweBcz8tgHAFRswnqUKYqLJsvKU0DDD0Z2/YKoHyAUoPJAv20qg6KlC3msNeUKUiw==} + '@google-cloud/vertexai@1.9.3': + resolution: {integrity: 
sha512-35o5tIEMLW3JeFJOaaMNR2e5sq+6rpnhrF97PuAxeOm0GlqVTESKhkGj7a5B5mmJSSSU3hUfIhcQCRRsw4Ipzg==} engines: {node: '>=18.0.0'} '@istanbuljs/load-nyc-config@1.1.0': @@ -2519,12 +2519,15 @@ packages: resolution: {integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==} engines: {node: '>=12'} - openai@4.76.0: - resolution: {integrity: sha512-QBGIetjX1C9xDp5XGa/3mPnfKI9BgAe2xHQX6PmO98wuW9qQaurBaumcYptQWc9LHZZq7cH/Y1Rjnsr6uUDdVw==} + openai@4.85.1: + resolution: {integrity: sha512-jkX2fntHljUvSH3MkWh4jShl10oNkb+SsCj4auKlbu2oF4KWAnmHLNR5EpnUHK1ZNW05Rp0fjbJzYwQzMsH8ZA==} hasBin: true peerDependencies: + ws: ^8.18.0 zod: ^3.23.8 peerDependenciesMeta: + ws: + optional: true zod: optional: true @@ -3848,7 +3851,7 @@ snapshots: '@esbuild/win32-x64@0.21.5': optional: true - '@google-cloud/vertexai@1.1.0': + '@google-cloud/vertexai@1.9.3': dependencies: google-auth-library: 9.11.0 transitivePeerDependencies: @@ -6063,7 +6066,7 @@ snapshots: dependencies: mimic-fn: 4.0.0 - openai@4.76.0(zod@3.23.8): + openai@4.85.1(zod@3.23.8): dependencies: '@types/node': 18.19.32 '@types/node-fetch': 2.6.11