diff --git a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatModel.java b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatModel.java index 2d36014a719..0ada0668e2f 100644 --- a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatModel.java +++ b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatModel.java @@ -64,11 +64,11 @@ import org.springframework.ai.content.Media; import org.springframework.ai.model.ModelOptionsUtils; import org.springframework.ai.model.tool.DefaultToolExecutionEligibilityPredicate; -import org.springframework.ai.model.tool.internal.ToolCallReactiveContextHolder; import org.springframework.ai.model.tool.ToolCallingChatOptions; import org.springframework.ai.model.tool.ToolCallingManager; import org.springframework.ai.model.tool.ToolExecutionEligibilityPredicate; import org.springframework.ai.model.tool.ToolExecutionResult; +import org.springframework.ai.model.tool.internal.ToolCallReactiveContextHolder; import org.springframework.ai.retry.RetryUtils; import org.springframework.ai.support.UsageCalculator; import org.springframework.ai.tool.definition.ToolDefinition; @@ -482,12 +482,25 @@ private Map mergeHttpHeaders(Map runtimeHttpHead ChatCompletionRequest createRequest(Prompt prompt, boolean stream) { + // Get cache control from options + AnthropicChatOptions requestOptions = (AnthropicChatOptions) prompt.getOptions(); + AnthropicApi.ChatCompletionRequest.CacheControl cacheControl = (requestOptions != null) + ? 
requestOptions.getCacheControl() : null; + List userMessages = prompt.getInstructions() .stream() .filter(message -> message.getMessageType() != MessageType.SYSTEM) .map(message -> { if (message.getMessageType() == MessageType.USER) { - List contents = new ArrayList<>(List.of(new ContentBlock(message.getText()))); + List contents = new ArrayList<>(); + + // Apply cache control if enabled for user messages + if (cacheControl != null) { + contents.add(new ContentBlock(message.getText(), cacheControl)); + } + else { + contents.add(new ContentBlock(message.getText())); + } if (message instanceof UserMessage userMessage) { if (!CollectionUtils.isEmpty(userMessage.getMedia())) { List mediaContent = userMessage.getMedia().stream().map(media -> { @@ -537,7 +550,6 @@ else if (message.getMessageType() == MessageType.TOOL) { ChatCompletionRequest request = new ChatCompletionRequest(this.defaultOptions.getModel(), userMessages, systemPrompt, this.defaultOptions.getMaxTokens(), this.defaultOptions.getTemperature(), stream); - AnthropicChatOptions requestOptions = (AnthropicChatOptions) prompt.getOptions(); request = ModelOptionsUtils.merge(requestOptions, request, ChatCompletionRequest.class); // Add the tool definitions to the request's tools parameter. 
diff --git a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatOptions.java b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatOptions.java index dbfbee561c8..536063ed546 100644 --- a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatOptions.java +++ b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatOptions.java @@ -44,6 +44,7 @@ * @author Thomas Vitale * @author Alexandros Pappas * @author Ilayaperumal Gopinathan + * @author Soby Chacko * @since 1.0.0 */ @JsonInclude(Include.NON_NULL) @@ -59,6 +60,20 @@ public class AnthropicChatOptions implements ToolCallingChatOptions { private @JsonProperty("top_k") Integer topK; private @JsonProperty("thinking") ChatCompletionRequest.ThinkingConfig thinking; + /** + * Cache control for user messages. When set, enables caching for user messages. + * Uses the existing CacheControl record from AnthropicApi.ChatCompletionRequest. + */ + private @JsonProperty("cache_control") ChatCompletionRequest.CacheControl cacheControl; + + public ChatCompletionRequest.CacheControl getCacheControl() { + return cacheControl; + } + + public void setCacheControl(ChatCompletionRequest.CacheControl cacheControl) { + this.cacheControl = cacheControl; + } + /** * Collection of {@link ToolCallback}s to be used for tool calling in the chat * completion requests. @@ -111,6 +126,7 @@ public static AnthropicChatOptions fromOptions(AnthropicChatOptions fromOptions) .internalToolExecutionEnabled(fromOptions.getInternalToolExecutionEnabled()) .toolContext(fromOptions.getToolContext() != null ? new HashMap<>(fromOptions.getToolContext()) : null) .httpHeaders(fromOptions.getHttpHeaders() != null ? 
new HashMap<>(fromOptions.getHttpHeaders()) : null) + .cacheControl(fromOptions.getCacheControl()) .build(); } @@ -267,12 +283,10 @@ public AnthropicChatOptions copy() { @Override public boolean equals(Object o) { - if (this == o) { + if (this == o) return true; - } - if (!(o instanceof AnthropicChatOptions that)) { + if (!(o instanceof AnthropicChatOptions that)) return false; - } return Objects.equals(this.model, that.model) && Objects.equals(this.maxTokens, that.maxTokens) && Objects.equals(this.metadata, that.metadata) && Objects.equals(this.stopSequences, that.stopSequences) @@ -282,14 +296,15 @@ public boolean equals(Object o) { && Objects.equals(this.toolNames, that.toolNames) && Objects.equals(this.internalToolExecutionEnabled, that.internalToolExecutionEnabled) && Objects.equals(this.toolContext, that.toolContext) - && Objects.equals(this.httpHeaders, that.httpHeaders); + && Objects.equals(this.httpHeaders, that.httpHeaders) + && Objects.equals(this.cacheControl, that.cacheControl); } @Override public int hashCode() { return Objects.hash(this.model, this.maxTokens, this.metadata, this.stopSequences, this.temperature, this.topP, this.topK, this.thinking, this.toolCallbacks, this.toolNames, this.internalToolExecutionEnabled, - this.toolContext, this.httpHeaders); + this.toolContext, this.httpHeaders, this.cacheControl); } public static class Builder { @@ -389,6 +404,14 @@ public Builder httpHeaders(Map httpHeaders) { return this; } + /** + * Set cache control for user messages + */ + public Builder cacheControl(ChatCompletionRequest.CacheControl cacheControl) { + this.options.cacheControl = cacheControl; + return this; + } + public AnthropicChatOptions build() { return this.options; } diff --git a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/AnthropicApi.java b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/AnthropicApi.java index cf410690216..c2bf307ea4f 100644 --- 
a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/AnthropicApi.java +++ b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/AnthropicApi.java @@ -26,6 +26,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.ai.anthropic.api.AnthropicApi.ChatCompletionRequest.CacheControl; import org.springframework.ai.anthropic.api.StreamHelper.ChatCompletionResponseBuilder; import org.springframework.ai.model.ApiKey; import org.springframework.ai.model.ChatModelDescription; @@ -66,6 +67,7 @@ * @author Jonghoon Park * @author Claudio Silva Junior * @author Filip Hrisafov + * @author Soby Chacko * @since 1.0.0 */ public final class AnthropicApi { @@ -559,6 +561,14 @@ public record Metadata(@JsonProperty("user_id") String userId) { } + /** + * @param type is the cache type supported by anthropic. Doc + */ + @JsonInclude(Include.NON_NULL) + public record CacheControl(String type) { + } + /** * Configuration for the model's thinking mode. * @@ -765,8 +775,11 @@ public record ContentBlock( @JsonProperty("thinking") String thinking, // Redacted Thinking only - @JsonProperty("data") String data - ) { + @JsonProperty("data") String data, + + // cache object + @JsonProperty("cache_control") CacheControl cacheControl + ) { // @formatter:on /** @@ -784,7 +797,7 @@ public ContentBlock(String mediaType, String data) { * @param source The source of the content. */ public ContentBlock(Type type, Source source) { - this(type, source, null, null, null, null, null, null, null, null, null, null); + this(type, source, null, null, null, null, null, null, null, null, null, null, null); } /** @@ -792,7 +805,7 @@ public ContentBlock(Type type, Source source) { * @param source The source of the content. 
*/ public ContentBlock(Source source) { - this(Type.IMAGE, source, null, null, null, null, null, null, null, null, null, null); + this(Type.IMAGE, source, null, null, null, null, null, null, null, null, null, null, null); } /** @@ -800,7 +813,11 @@ public ContentBlock(Source source) { * @param text The text of the content. */ public ContentBlock(String text) { - this(Type.TEXT, null, text, null, null, null, null, null, null, null, null, null); + this(Type.TEXT, null, text, null, null, null, null, null, null, null, null, null, null); + } + + public ContentBlock(String text, CacheControl cache) { + this(Type.TEXT, null, text, null, null, null, null, null, null, null, null, null, cache); } // Tool result @@ -811,7 +828,7 @@ public ContentBlock(String text) { * @param content The content of the tool result. */ public ContentBlock(Type type, String toolUseId, String content) { - this(type, null, null, null, null, null, null, toolUseId, content, null, null, null); + this(type, null, null, null, null, null, null, toolUseId, content, null, null, null, null); } /** @@ -822,7 +839,7 @@ public ContentBlock(Type type, String toolUseId, String content) { * @param index The index of the content block. */ public ContentBlock(Type type, Source source, String text, Integer index) { - this(type, source, text, index, null, null, null, null, null, null, null, null); + this(type, source, text, index, null, null, null, null, null, null, null, null, null); } // Tool use input JSON delta streaming @@ -834,7 +851,7 @@ public ContentBlock(Type type, Source source, String text, Integer index) { * @param input The input of the tool use. 
*/ public ContentBlock(Type type, String id, String name, Map input) { - this(type, null, null, null, id, name, input, null, null, null, null, null); + this(type, null, null, null, id, name, input, null, null, null, null, null, null); } /** @@ -1028,7 +1045,9 @@ public record ChatCompletionResponse( public record Usage( // @formatter:off @JsonProperty("input_tokens") Integer inputTokens, - @JsonProperty("output_tokens") Integer outputTokens) { + @JsonProperty("output_tokens") Integer outputTokens, + @JsonProperty("cache_creation_input_tokens") Integer cacheCreationInputTokens, + @JsonProperty("cache_read_input_tokens") Integer cacheReadInputTokens) { // @formatter:off } diff --git a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/AnthropicCacheType.java b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/AnthropicCacheType.java new file mode 100644 index 00000000000..a120e3f0f89 --- /dev/null +++ b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/AnthropicCacheType.java @@ -0,0 +1,57 @@ +/* + * Copyright 2025-2025 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.springframework.ai.anthropic.api; + +import org.springframework.ai.anthropic.api.AnthropicApi.ChatCompletionRequest.CacheControl; + +import java.util.function.Supplier; + +/** + * Cache types supported by Anthropic's prompt caching feature. + * + *

+ * Prompt caching allows reusing frequently used prompts to reduce costs and improve + * response times for repeated interactions. + * + * @see Anthropic Prompt + * Caching + * @author Claudio Silva Junior + * @author Soby Chacko + */ +public enum AnthropicCacheType { + + /** + * Ephemeral cache with 5-minute lifetime, refreshed on each use. + */ + EPHEMERAL(() -> new CacheControl("ephemeral")); + + private final Supplier value; + + AnthropicCacheType(Supplier value) { + this.value = value; + } + + /** + * Returns a new CacheControl instance for this cache type. + * @return a CacheControl instance configured for this cache type + */ + public CacheControl cacheControl() { + return value.get(); + } + +} diff --git a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/StreamHelper.java b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/StreamHelper.java index f636f29a158..e56e469cac1 100644 --- a/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/StreamHelper.java +++ b/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/api/StreamHelper.java @@ -55,6 +55,8 @@ * @author Christian Tzolov * @author Jihoon Kim * @author Alexandros Pappas + * @author Claudio Silva Junior + * @author Soby Chacko * @since 1.0.0 */ public class StreamHelper { @@ -159,7 +161,7 @@ else if (event.type().equals(EventType.CONTENT_BLOCK_START)) { } else if (contentBlockStartEvent.contentBlock() instanceof ContentBlockThinking thinkingBlock) { ContentBlock cb = new ContentBlock(Type.THINKING, null, null, contentBlockStartEvent.index(), null, - null, null, null, null, null, thinkingBlock.thinking(), null); + null, null, null, null, null, thinkingBlock.thinking(), null, null); contentBlockReference.get().withType(event.type().name()).withContent(List.of(cb)); } else { @@ -176,12 +178,12 @@ else if (event.type().equals(EventType.CONTENT_BLOCK_DELTA)) { } else if (contentBlockDeltaEvent.delta() 
instanceof ContentBlockDeltaThinking thinking) { ContentBlock cb = new ContentBlock(Type.THINKING_DELTA, null, null, contentBlockDeltaEvent.index(), - null, null, null, null, null, null, thinking.thinking(), null); + null, null, null, null, null, null, thinking.thinking(), null, null); contentBlockReference.get().withType(event.type().name()).withContent(List.of(cb)); } else if (contentBlockDeltaEvent.delta() instanceof ContentBlockDeltaSignature sig) { ContentBlock cb = new ContentBlock(Type.SIGNATURE_DELTA, null, null, contentBlockDeltaEvent.index(), - null, null, null, null, null, sig.signature(), null, null); + null, null, null, null, null, sig.signature(), null, null, null); contentBlockReference.get().withType(event.type().name()).withContent(List.of(cb)); } else { @@ -205,7 +207,9 @@ else if (event.type().equals(EventType.MESSAGE_DELTA)) { if (messageDeltaEvent.usage() != null) { Usage totalUsage = new Usage(contentBlockReference.get().usage.inputTokens(), - messageDeltaEvent.usage().outputTokens()); + messageDeltaEvent.usage().outputTokens(), + contentBlockReference.get().usage.cacheCreationInputTokens(), + contentBlockReference.get().usage.cacheReadInputTokens()); contentBlockReference.get().withUsage(totalUsage); } } diff --git a/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatModelIT.java b/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatModelIT.java index 6570d5ee6a6..5243fcccab0 100644 --- a/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatModelIT.java +++ b/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatModelIT.java @@ -32,6 +32,7 @@ import reactor.core.publisher.Flux; import org.springframework.ai.anthropic.api.AnthropicApi; +import org.springframework.ai.anthropic.api.AnthropicCacheType; import org.springframework.ai.anthropic.api.tool.MockWeatherService; import 
org.springframework.ai.chat.client.ChatClient; import org.springframework.ai.chat.messages.AssistantMessage; @@ -491,6 +492,59 @@ void testToolUseContentBlock() { } } + @Test + void chatWithPromptCacheViaOptions() { + String userMessageText = "foobar It could be eitherr a contraction of the full title Quenta Silmarillion (\"Tale of the Silmarils\") or also a plain Genitive which " + + "(as in Ancient Greek) signifies reference. This genitive is translated in English with \"about\" or \"of\" " + + "constructions; the titles of the chapters in The Silmarillion are examples of this genitive in poetic English " + + "(Of the Sindar, Of Men, Of the Darkening of Valinor etc), where \"of\" means \"about\" or \"concerning\". " + + "In the same way, Silmarillion can be taken to mean \"Of/About the Silmarils\""; + + // Repeat content to meet minimum token requirements for caching (1024+ tokens) + String largeContent = userMessageText.repeat(20); + + // First request - should create cache + ChatResponse firstResponse = this.chatModel.call(new Prompt(List.of(new UserMessage(largeContent)), + AnthropicChatOptions.builder() + .model(AnthropicApi.ChatModel.CLAUDE_3_HAIKU.getValue()) + .cacheControl(AnthropicCacheType.EPHEMERAL.cacheControl()) + .maxTokens(100) + .temperature(0.8) + .build())); + + // Access native Anthropic usage data + AnthropicApi.Usage firstUsage = (AnthropicApi.Usage) firstResponse.getMetadata().getUsage().getNativeUsage(); + + // Verify first request created cache + assertThat(firstUsage.cacheCreationInputTokens()).isGreaterThan(0); + assertThat(firstUsage.cacheReadInputTokens()).isEqualTo(0); + + // Second request with identical content - should read from cache + ChatResponse secondResponse = this.chatModel.call(new Prompt(List.of(new UserMessage(largeContent)), + AnthropicChatOptions.builder() + .model(AnthropicApi.ChatModel.CLAUDE_3_HAIKU.getValue()) + .cacheControl(AnthropicCacheType.EPHEMERAL.cacheControl()) + .maxTokens(100) + .temperature(0.8) + 
.build())); + + // Access native Anthropic usage data + AnthropicApi.Usage secondUsage = (AnthropicApi.Usage) secondResponse.getMetadata().getUsage().getNativeUsage(); + + // Verify second request used cache + assertThat(secondUsage.cacheCreationInputTokens()).isEqualTo(0); + assertThat(secondUsage.cacheReadInputTokens()).isGreaterThan(0); + + // Both responses should be valid + assertThat(firstResponse.getResult().getOutput().getText()).isNotBlank(); + assertThat(secondResponse.getResult().getOutput().getText()).isNotBlank(); + + logger.info("First request - Cache creation: {}, Cache read: {}", firstUsage.cacheCreationInputTokens(), + firstUsage.cacheReadInputTokens()); + logger.info("Second request - Cache creation: {}, Cache read: {}", secondUsage.cacheCreationInputTokens(), + secondUsage.cacheReadInputTokens()); + } + record ActorsFilmsRecord(String actor, List movies) { } diff --git a/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatOptionsTests.java b/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatOptionsTests.java index d9470070e95..72c2cbc01f2 100644 --- a/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatOptionsTests.java +++ b/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/AnthropicChatOptionsTests.java @@ -23,6 +23,8 @@ import org.junit.jupiter.api.Test; import org.springframework.ai.anthropic.api.AnthropicApi.ChatCompletionRequest.Metadata; +import org.springframework.ai.anthropic.api.AnthropicApi.ChatCompletionRequest.CacheControl; +import org.springframework.ai.anthropic.api.AnthropicCacheType; import static org.assertj.core.api.Assertions.assertThat; @@ -30,6 +32,7 @@ * Tests for {@link AnthropicChatOptions}. 
* * @author Alexandros Pappas + * @author Soby Chacko */ class AnthropicChatOptionsTests { @@ -471,4 +474,109 @@ void testSetterOverwriteBehavior() { assertThat(options.getMaxTokens()).isEqualTo(10); } + @Test + void testCacheControlBuilder() { + CacheControl cacheControl = AnthropicCacheType.EPHEMERAL.cacheControl(); + + AnthropicChatOptions options = AnthropicChatOptions.builder() + .model("test-model") + .cacheControl(cacheControl) + .build(); + + assertThat(options.getCacheControl()).isEqualTo(cacheControl); + assertThat(options.getCacheControl().type()).isEqualTo("ephemeral"); + } + + @Test + void testCacheControlDefaultValue() { + AnthropicChatOptions options = new AnthropicChatOptions(); + assertThat(options.getCacheControl()).isNull(); + } + + @Test + void testCacheControlEqualsAndHashCode() { + CacheControl cacheControl = AnthropicCacheType.EPHEMERAL.cacheControl(); + + AnthropicChatOptions options1 = AnthropicChatOptions.builder() + .model("test-model") + .cacheControl(cacheControl) + .build(); + + AnthropicChatOptions options2 = AnthropicChatOptions.builder() + .model("test-model") + .cacheControl(AnthropicCacheType.EPHEMERAL.cacheControl()) + .build(); + + AnthropicChatOptions options3 = AnthropicChatOptions.builder().model("test-model").build(); + + assertThat(options1).isEqualTo(options2); + assertThat(options1.hashCode()).isEqualTo(options2.hashCode()); + + assertThat(options1).isNotEqualTo(options3); + assertThat(options1.hashCode()).isNotEqualTo(options3.hashCode()); + } + + @Test + void testCacheControlCopy() { + CacheControl originalCacheControl = AnthropicCacheType.EPHEMERAL.cacheControl(); + + AnthropicChatOptions original = AnthropicChatOptions.builder() + .model("test-model") + .cacheControl(originalCacheControl) + .build(); + + AnthropicChatOptions copied = original.copy(); + + assertThat(copied).isNotSameAs(original).isEqualTo(original); + assertThat(copied.getCacheControl()).isEqualTo(original.getCacheControl()); + 
assertThat(copied.getCacheControl()).isEqualTo(originalCacheControl); + } + + @Test + void testCacheControlWithNullValue() { + AnthropicChatOptions options = AnthropicChatOptions.builder().model("test-model").cacheControl(null).build(); + + assertThat(options.getCacheControl()).isNull(); + } + + @Test + void testBuilderWithAllFieldsIncludingCacheControl() { + CacheControl cacheControl = AnthropicCacheType.EPHEMERAL.cacheControl(); + + AnthropicChatOptions options = AnthropicChatOptions.builder() + .model("test-model") + .maxTokens(100) + .stopSequences(List.of("stop1", "stop2")) + .temperature(0.7) + .topP(0.8) + .topK(50) + .metadata(new Metadata("userId_123")) + .cacheControl(cacheControl) + .build(); + + assertThat(options) + .extracting("model", "maxTokens", "stopSequences", "temperature", "topP", "topK", "metadata", + "cacheControl") + .containsExactly("test-model", 100, List.of("stop1", "stop2"), 0.7, 0.8, 50, new Metadata("userId_123"), + cacheControl); + } + + @Test + void testCacheControlMutationDoesNotAffectOriginal() { + CacheControl originalCacheControl = AnthropicCacheType.EPHEMERAL.cacheControl(); + + AnthropicChatOptions original = AnthropicChatOptions.builder() + .model("original-model") + .cacheControl(originalCacheControl) + .build(); + + AnthropicChatOptions copy = original.copy(); + copy.setCacheControl(null); + + // Original should remain unchanged + assertThat(original.getCacheControl()).isEqualTo(originalCacheControl); + // Copy should have null cache control + assertThat(copy.getCacheControl()).isNull(); + } + } diff --git a/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/api/AnthropicApiIT.java b/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/api/AnthropicApiIT.java index c78386fb7ce..6e387996b23 100644 --- a/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/api/AnthropicApiIT.java +++ 
b/models/spring-ai-anthropic/src/test/java/org/springframework/ai/anthropic/api/AnthropicApiIT.java @@ -44,6 +44,8 @@ * @author Christian Tzolov * @author Jihoon Kim * @author Alexandros Pappas + * @author Claudio Silva Junior + * @author Soby Chacko */ @EnabledIfEnvironmentVariable(named = "ANTHROPIC_API_KEY", matches = ".+") public class AnthropicApiIT { @@ -70,6 +72,37 @@ public class AnthropicApiIT { } """))); + @Test + void chatWithPromptCache() { + String userMessageText = "It could be either a contraction of the full title Quenta Silmarillion (\"Tale of the Silmarils\") or also a plain Genitive which " + + "(as in Ancient Greek) signifies reference. This genitive is translated in English with \"about\" or \"of\" " + + "constructions; the titles of the chapters in The Silmarillion are examples of this genitive in poetic English " + + "(Of the Sindar, Of Men, Of the Darkening of Valinor etc), where \"of\" means \"about\" or \"concerning\". " + + "In the same way, Silmarillion can be taken to mean \"Of/About the Silmarils\""; + + AnthropicMessage chatCompletionMessage = new AnthropicMessage( + List.of(new ContentBlock(userMessageText.repeat(20), AnthropicCacheType.EPHEMERAL.cacheControl())), + Role.USER); + + ChatCompletionRequest chatCompletionRequest = new ChatCompletionRequest( + AnthropicApi.ChatModel.CLAUDE_3_HAIKU.getValue(), List.of(chatCompletionMessage), null, 100, 0.8, + false); + + // First request - creates cache + AnthropicApi.Usage createdCacheToken = anthropicApi.chatCompletionEntity(chatCompletionRequest) + .getBody() + .usage(); + + assertThat(createdCacheToken.cacheCreationInputTokens()).isGreaterThan(0); + assertThat(createdCacheToken.cacheReadInputTokens()).isEqualTo(0); + + // Second request - reads from cache (same request) + AnthropicApi.Usage readCacheToken = anthropicApi.chatCompletionEntity(chatCompletionRequest).getBody().usage(); + + assertThat(readCacheToken.cacheCreationInputTokens()).isEqualTo(0); + 
assertThat(readCacheToken.cacheReadInputTokens()).isGreaterThan(0); + } + @Test void chatCompletionEntity() { diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/anthropic-chat.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/anthropic-chat.adoc index 2094ab4ee17..f8d08b31e8a 100644 --- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/anthropic-chat.adoc +++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/anthropic-chat.adoc @@ -191,6 +191,181 @@ ChatResponse response = chatModel.call( TIP: In addition to the model specific https://github.com/spring-projects/spring-ai/blob/main/models/spring-ai-anthropic/src/main/java/org/springframework/ai/anthropic/AnthropicChatOptions.java[AnthropicChatOptions] you can use a portable link:https://github.com/spring-projects/spring-ai/blob/main/spring-ai-model/src/main/java/org/springframework/ai/chat/prompt/ChatOptions.java[ChatOptions] instance, created with the link:https://github.com/spring-projects/spring-ai/blob/main/spring-ai-model/src/main/java/org/springframework/ai/chat/prompt/DefaultChatOptionsBuilder.java[ChatOptions#builder()]. +== Prompt Caching + +Anthropic's prompt caching feature allows you to cache frequently used prompts to reduce costs and improve response times for repeated interactions. +When you cache a prompt, subsequent identical requests can reuse the cached content, significantly reducing the number of input tokens processed. + +[NOTE] +==== +*Supported Models* + +Prompt caching is currently supported on Claude Opus 4, Claude Sonnet 4, Claude Sonnet 3.7, Claude Sonnet 3.5, Claude Haiku 3.5, Claude Haiku 3, and Claude Opus 3. 
+==== + +=== Cache Types + +Spring AI supports Anthropic's cache types through the `AnthropicCacheType` enum: + +* `EPHEMERAL`: Short-lived cache entry with a 5-minute lifetime that is refreshed each time the cached content is used + +=== Enabling Prompt Caching + +To enable prompt caching, use the `cacheControl()` method in `AnthropicChatOptions`: + +==== Basic Usage + +[source,java] +---- +// Enable caching with ephemeral type +ChatResponse response = chatModel.call( + new Prompt( + List.of(new UserMessage("Large content to be cached...")), + AnthropicChatOptions.builder() + .model("claude-3-5-sonnet-latest") + .cacheControl(AnthropicCacheType.EPHEMERAL.cacheControl()) + .build() + ) +); +---- + +==== Using ChatClient Fluent API + +[source,java] +---- +String response = ChatClient.create(chatModel) + .prompt() + .user("Analyze this large document: " + document) + .options(AnthropicChatOptions.builder() + .model("claude-3-5-sonnet-latest") + .cacheControl(AnthropicCacheType.EPHEMERAL.cacheControl()) + .build()) + .call() + .content(); +---- + +=== Usage Example + +Here's a complete example demonstrating prompt caching with cost tracking: + +[source,java] +---- +// Create content that will be reused multiple times +String largeContent = "Large document content that meets minimum token requirements..."; + +// First request - creates cache +ChatResponse firstResponse = chatModel.call( + new Prompt( + List.of(new UserMessage(largeContent)), + AnthropicChatOptions.builder() + .model("claude-3-haiku-20240307") + .cacheControl(AnthropicCacheType.EPHEMERAL.cacheControl()) + .maxTokens(100) + .build() + ) +); + +// Access cache-related token usage +AnthropicApi.Usage firstUsage = (AnthropicApi.Usage) firstResponse.getMetadata() + .getUsage().getNativeUsage(); + +System.out.println("Cache creation tokens: " + firstUsage.cacheCreationInputTokens()); +System.out.println("Cache read tokens: " + firstUsage.cacheReadInputTokens()); + +// Second request with identical content - reads from cache +ChatResponse
secondResponse = chatModel.call( + new Prompt( + List.of(new UserMessage(largeContent)), + AnthropicChatOptions.builder() + .model("claude-3-haiku-20240307") + .cacheControl(AnthropicCacheType.EPHEMERAL.cacheControl()) + .maxTokens(100) + .build() + ) +); + +AnthropicApi.Usage secondUsage = (AnthropicApi.Usage) secondResponse.getMetadata() + .getUsage().getNativeUsage(); + +System.out.println("Cache creation tokens: " + secondUsage.cacheCreationInputTokens()); +System.out.println("Cache read tokens: " + secondUsage.cacheReadInputTokens()); +---- + +=== Token Usage Tracking + +The `Usage` record provides detailed information about cache-related token consumption. +To access Anthropic-specific cache metrics, use the `getNativeUsage()` method: + +[source,java] +---- +AnthropicApi.Usage usage = (AnthropicApi.Usage) response.getMetadata() + .getUsage().getNativeUsage(); +---- + +Cache-specific metrics include: + +* `cacheCreationInputTokens()`: Returns the number of tokens used when creating a cache entry +* `cacheReadInputTokens()`: Returns the number of tokens read from an existing cache entry + +When you first send a cached prompt: +- `cacheCreationInputTokens()` will be greater than 0 +- `cacheReadInputTokens()` will be 0 + +When you send the same cached prompt again: +- `cacheCreationInputTokens()` will be 0 +- `cacheReadInputTokens()` will be greater than 0 + +=== Best Practices + +1. **Cache Long Prompts**: Focus on caching prompts that meet the minimum token requirements (1024+ tokens for most models, 2048+ for Haiku models). + +2. **Reuse Identical Content**: Caching works best with exact matches of prompt content. +Even small changes will require a new cache entry. + +3. **Monitor Token Usage**: Use the enhanced usage statistics to track cache effectiveness and optimize your caching strategy. + +4. **Place Static Content First**: Position cached content (system instructions, context, examples) at the beginning of your prompt for optimal performance. + +5. 
**5-Minute Cache Lifetime**: Ephemeral caches expire after 5 minutes of inactivity. +Each time cached content is accessed, the 5-minute timer resets. + +=== Low-level API Usage + +When using the low-level `AnthropicApi` directly, you can specify cache control through the `ContentBlock` constructor: + +[source,java] +---- +// Create content block with cache control +ContentBlock cachedContent = new ContentBlock( + "Large content to be cached...", + AnthropicCacheType.EPHEMERAL.cacheControl() +); + +AnthropicMessage message = new AnthropicMessage( + List.of(cachedContent), + Role.USER +); + +ChatCompletionRequest request = new ChatCompletionRequest( + AnthropicApi.ChatModel.CLAUDE_3_HAIKU.getValue(), + List.of(message), + null, 100, 0.8, false +); + +ResponseEntity<ChatCompletionResponse> response = anthropicApi.chatCompletionEntity(request); + +// Access cache-related token usage +Usage usage = response.getBody().usage(); +System.out.println("Cache creation tokens: " + usage.cacheCreationInputTokens()); +System.out.println("Cache read tokens: " + usage.cacheReadInputTokens()); +---- + +=== Implementation Details + +Cache control is configured through `AnthropicChatOptions` rather than individual messages. +This preserves compatibility when switching between different AI providers. +The cache control gets applied during request creation in `AnthropicChatModel`. + == Thinking Anthropic Claude models support a "thinking" feature that allows the model to show its reasoning process before providing a final answer. This feature enables more transparent and detailed problem-solving, particularly for complex questions that require step-by-step reasoning.