diff --git a/sidecar/src/agentic/tool/session/service.rs b/sidecar/src/agentic/tool/session/service.rs index a4e28bff4..2838c1ac8 100644 --- a/sidecar/src/agentic/tool/session/service.rs +++ b/sidecar/src/agentic/tool/session/service.rs @@ -390,11 +390,10 @@ impl SessionService { tool_box: Arc, llm_broker: Arc, user_context: UserContext, - aide_rules: Option, reasoning: bool, running_in_editor: bool, mcts_log_directory: Option, - repo_name: Option, + tool_use_agent_properties: ToolUseAgentProperties, message_properties: SymbolEventMessageProperties, context_crunching_llm: Option, ) -> Result<(), SymbolError> { @@ -431,8 +430,7 @@ impl SessionService { // close to the vscode server // we should ideally get this information from the vscode-server side setting std::env::consts::OS.to_owned(), - shell.to_owned(), - ToolUseAgentProperties::new(running_in_editor, repo_name, aide_rules), + tool_use_agent_properties, ) .set_context_crunching_llm(context_crunching_llm.clone()); diff --git a/sidecar/src/agentic/tool/session/tool_use_agent.rs b/sidecar/src/agentic/tool/session/tool_use_agent.rs index 0518be326..2463f1ae5 100644 --- a/sidecar/src/agentic/tool/session/tool_use_agent.rs +++ b/sidecar/src/agentic/tool/session/tool_use_agent.rs @@ -331,20 +331,45 @@ pub enum ToolUseAgentOutputType { Failure(String), } +/// if the agent should use an explicit tool to think or if we should do +/// a mini-cot before using a tool in the agent +#[derive(Clone)] +pub enum AgentThinkingMode { + ToolBased, + MiniCOTBeforeTool, +} + /// The various properties which the tool use agent can use /// We can configure if we are in-editor and additional metadata /// which might be present #[derive(Clone)] pub struct ToolUseAgentProperties { in_editor: bool, + shell: String, + // keeping this disabled for now while we write out the prompts and run a few + // evals on this to measure how the performance is + _thinking: AgentThinkingMode, + // if the current agent is running under an eval harness, this 
helps tune the system + // prompt for the agent appropriately + is_eval_run: bool, repo_name: Option, aide_rules: Option, } impl ToolUseAgentProperties { - pub fn new(in_editor: bool, repo_name: Option, aide_rules: Option) -> Self { + pub fn new( + in_editor: bool, + shell: String, + thinking: AgentThinkingMode, + is_eval_run: bool, + repo_name: Option, + aide_rules: Option, + ) -> Self { Self { in_editor, + shell, + is_eval_run, + _thinking: thinking, repo_name, aide_rules, } @@ -356,7 +381,6 @@ pub struct ToolUseAgent { llm_client: Arc, working_directory: String, operating_system: String, - shell: String, properties: ToolUseAgentProperties, temperature: f32, context_crunching_llm: Option, @@ -367,14 +391,12 @@ impl ToolUseAgent { llm_client: Arc, working_directory: String, operating_system: String, - shell: String, properties: ToolUseAgentProperties, ) -> Self { Self { llm_client, working_directory, operating_system, - shell, properties, // we always default to 0.2 temp to start with temperature: 0.2, @@ -597,6 +619,139 @@ This ensures you can refine your plan in and keep track of exactly which } } + /// The system message for the midwit tool use agent, which takes xml formatted + /// tools as input and has a similar objective to any swe agent: + /// - create a repro script + /// - find the bug + /// - fix it + /// - rerun repro script to prove things are okay + fn system_message_midwit_tool_mode( + &self, + repo_name: &str, + context: &ToolUseAgentInput, + ) -> String { + let tool_descriptions = context.tool_descriptions.to_vec().join("\n\n"); + let working_directory = self.working_directory.to_owned(); + let operating_system = self.operating_system.to_owned(); + format!( + r#"You are an expert software engineer tasked with solving Github issues which the user will provide. You are an expert at {repo_name} and you will be given a list of tools which you can use one after the other to debug and fix the issue. 
+I have already taken care of all changes to any test files described in {working_directory}. This means you DON'T have to modify the testing logic or any of the tests in any way! +Your task is to make the minimal changes to non-tests files in the {working_directory} directory to ensure the Github Issue is satisfied. +==== + +TOOL USE + +You have access to a set of tools. You can use one tool per message (and only one), and you will receive the result of the tool use from the user. You should use the tools step-by-step to accomplish the user task. +You use the previous information which you get from using the tools to inform your next tool usage. + +# Tool Use Formatting + +Tool use is formatted using XML-style tags. The tool name is enclosed in opening and closing tags, and each parameter is similarly enclosed within its own set of tags. Each tag is on a new line. Here's the structure: + + + +value1 + + +value2 + +{{rest of the parameters}} + + +As an example: + + + +bin/main.rs + + +1 + + +250 + + + +Another example: + + +. + + +true + + + +Always adhere to this format for the tool use to ensure proper parsing and execution from the tool use. And NOTICE HOW ALL XML TAGS ARE ON A NEW LINE. This is important to not break parsing. + +# Tools provided + +{tool_descriptions} + +# Tool Use Guidelines + +1. In tags, assess what information you already have and what information you need to proceed with the task. Your thinking should be thorough and so it's fine if it's very long. +2. Choose the most appropriate tool based on the task and the tool descriptions provided. Assess if you need additional information to proceed, and which of the available tools would be most effective for gathering this information. For example using the list_files tool is more effective than running a command like \`ls\` in the terminal. It's critical that you think about each available tool and use the one that best fits the current step in the task. +3. 
If multiple actions are needed, use one tool at a time per message to accomplish the task iteratively, with each tool use being informed by the result of the previous tool use. Do not assume the outcome of any tool use. Each step must be informed by the previous step's result. + +It is crucial to proceed step-by-step, waiting for the tool output after each tool use before moving forward with the task. + +By waiting for and carefully considering the tool output after each tool use, you can react accordingly and make informed decisions about how to proceed with the task. This iterative process helps ensure the overall success and accuracy of your work. + +==== + +CAPABILITIES + +- You have access to tools that let you execute CLI commands on the local checkout, list files, view source code definitions, regex search, read and write files. These tools help you effectively accomplish a wide range of tasks, such as writing code, making edits or improvements to existing files, understanding the current state of a project, and much more. +- The code_edit tool also allows you to implicilty create a new file and write content to it. You can use it to edit the code or create a new file and write content to it. +- You can use search_files to perform regex searches across files in a specified directory, outputting context-rich results that include surrounding lines. This is particularly useful for understanding code patterns, finding specific implementations, or identifying areas that need refactoring. + +==== + +RULES + +- Your current working directory is: {working_directory} +- When using the search_files tool, craft your regex patterns carefully to balance specificity and flexibility. Based on the Github Issue you may use it to find code patterns, function definitions, or any text-based information across the project. The results include context, so analyze the surrounding code to better understand the matches. 
Leverage the search_files tool in combination with other tools for more comprehensive analysis. For example, use it to find specific code patterns, then use read_file to examine the full context of interesting matches before using code_edit_input to make informed changes. +- When making changes to code, always consider the context in which the code is being used. Ensure that your changes are compatible with the existing codebase and that they follow the project's coding standards and best practices. +- Use the tools provided to accomplish the Github Issue efficiently and effectively. When you've completed solving the issue, you must use the attempt_completion tool to present the result to the user. +- Your goal is to solve the Github Issue be laser focussed on that. +- NEVER end attempt_completion result with a question or request to engage in further conversation! Formulate the end of your result in a way that is final and does not require further input from the user. +- ALWAYS start your tool use with the section. +- ONLY USE A SINGLE tool at a time, never use multiple tools in the same response. + +==== + +SYSTEM INFORMATION + +Operating System: {operating_system} +Default Shell: bash +Current Working Directory: {working_directory} +Current Repo Name: {repo_name} + +==== + +OBJECTIVE + +You are an expert software engineer taked with solving Github issues which the user will provide, breaking it down into clear steps and working through them methodically. +Your first goal should be to reproduce the issue which you can then run using `python reproduce_error.py` using the execute_command to confirm the error, you can put prints to deeply understand the issue. +You are an expert in {repo_name} and know in detail everything about this repository and all the different code structures which are present in it source code for it. + + +You are NOT ALLOWED to create or edit any of the test-files. The test-files are NOT RUNNABLE. 
+You are NOT ALLOWED to install any new packages. The dev environment has already been setup for you before you run any command or the reproduce_error.py script. + +1. As a first step, it might be a good idea to explore the repo to familiarize yourself with its structure. +2. Create a script to reproduce the error and execute it with `python reproduce_error.py` using the execute_command (which uses bash internally), to confirm the error +3. Edit the sourcecode of the repo to resolve the issue +4. Rerun your reproduce script and confirm that the error is fixed! +5. Think about edgecases and make sure your fix handles them as well. +6. You can ONLY USE 1 TOOL in each step and not multiple tools, using multiple tools is not allowed. +7. ONLY ATTEMPT COMPLETION if you have finished with your round of edits. +8. Run test files at the very end so you can catch any regressions in your solution. Some test output might be wrong or conflict the Github Issue so carefully understand the test file and the outcome before commiting to making more changes based on the test output. +9. NEVER forget to include the section before using a tool. We will not be able to invoke the tool properly if you forget it."# + ) + } + fn system_message_midwit_json_mode(&self, repo_name: &str, problem_statement: &str) -> String { let working_directory = self.working_directory.to_owned(); format!( @@ -732,7 +887,7 @@ You are NOT ALLOWED to install any new packages. 
The dev environment has already fn system_message_for_context_crunching(&self) -> String { let working_directory = self.working_directory.to_owned(); let operating_system = self.operating_system.to_owned(); - let default_shell = self.shell.to_owned(); + let default_shell = self.properties.shell.to_owned(); let repo_name = self .properties .repo_name @@ -886,7 +1041,7 @@ Additional guildelines and rules the user has provided which must be followed: } None => "".to_owned(), }; - let default_shell = self.shell.to_owned(); + let default_shell = self.properties.shell.to_owned(); format!( r#"You are SOTA-agent, a highly skilled AI software engineer with extensive knowledge in all programming languages, frameworks, design patterns, and best practices. Your primary goal is to accomplish tasks related to software development, file manipulation, and system operations within the specified project directory. @@ -1314,7 +1469,17 @@ You accomplish a given task iteratively, breaking it down into clear steps and w &self, input: ToolUseAgentInput, ) -> Result { - let system_message = LLMClientMessage::system(self.system_message(&input)).cache_point(); + let system_message = if self.properties.is_eval_run { + match self.properties.repo_name.as_ref() { + Some(repo_name) => LLMClientMessage::system( + self.system_message_midwit_tool_mode(&repo_name, &input), + ) + .cache_point(), + None => LLMClientMessage::system(self.system_message(&input)).cache_point(), + } + } else { + LLMClientMessage::system(self.system_message(&input)).cache_point() + }; let llm_properties = input .symbol_event_message_properties .llm_properties() diff --git a/sidecar/src/agentic/tool/type.rs b/sidecar/src/agentic/tool/type.rs index b6db8f21d..8943d1cd8 100644 --- a/sidecar/src/agentic/tool/type.rs +++ b/sidecar/src/agentic/tool/type.rs @@ -158,6 +158,8 @@ pub enum ToolType { RequestScreenshot, // Context crunching ContextCrunching, + // Think tool + ThinkTool, // dynamically configured MCP servers 
McpTool(String), } @@ -264,6 +266,7 @@ impl std::fmt::Display for ToolType { ToolType::FindFiles => write!(f, "find_file"), ToolType::RequestScreenshot => write!(f, "request_screenshot"), ToolType::ContextCrunching => write!(f, "context_crunching"), + ToolType::ThinkTool => write!(f, "think_tool"), ToolType::McpTool(name) => write!(f, "{}", name), } } diff --git a/sidecar/src/bin/agent_bin.rs b/sidecar/src/bin/agent_bin.rs index 71652ccf5..c3cdb7a53 100644 --- a/sidecar/src/bin/agent_bin.rs +++ b/sidecar/src/bin/agent_bin.rs @@ -14,7 +14,10 @@ use sidecar::{ events::{input::SymbolEventRequestId, message_event::SymbolEventMessageProperties}, identifier::LLMProperties, }, - tool::r#type::ToolType, + tool::{ + r#type::ToolType, + session::tool_use_agent::{AgentThinkingMode, ToolUseAgentProperties}, + }, }, application::{application::Application, config::configuration::Configuration}, repo::types::RepoRef, @@ -201,6 +204,15 @@ Your thinking should be thorough and so it's fine if it's very long."#, ToolType::FindFiles, ]; + let tool_use_agent_properties = ToolUseAgentProperties::new( + false, + "bash".to_owned(), + AgentThinkingMode::MiniCOTBeforeTool, + false, + Some(args.repo_name.clone()), + aide_rules, + ); + // wait for the agent to finish over here while busy looping println!("agent::tool_use::start"); let _ = session_service @@ -219,11 +231,10 @@ Your thinking should be thorough and so it's fine if it's very long."#, tool_box, llm_broker, UserContext::default(), - aide_rules, false, false, Some(args.log_directory.clone()), - Some(args.repo_name.clone()), + tool_use_agent_properties, message_properties, None, // No context crunching LLM for agent_bin ) diff --git a/sidecar/src/bin/agent_bin_reasoning.rs b/sidecar/src/bin/agent_bin_reasoning.rs index 78cf3c775..02d87704b 100644 --- a/sidecar/src/bin/agent_bin_reasoning.rs +++ b/sidecar/src/bin/agent_bin_reasoning.rs @@ -14,7 +14,10 @@ use sidecar::{ events::{input::SymbolEventRequestId, 
message_event::SymbolEventMessageProperties}, identifier::LLMProperties, }, - tool::r#type::ToolType, + tool::{ + r#type::ToolType, + session::tool_use_agent::{AgentThinkingMode, ToolUseAgentProperties}, + }, }, application::{application::Application, config::configuration::Configuration}, repo::types::RepoRef, @@ -194,6 +197,15 @@ async fn main() -> Result<(), Box> { ToolType::FindFiles, ]; + let tool_use_agent_properties = ToolUseAgentProperties::new( + false, + "bash".to_owned(), + AgentThinkingMode::MiniCOTBeforeTool, + false, // is running under eval + Some(args.repo_name.to_owned()), + None, + ); + // wait for the agent to finish over here while busy looping println!("agent::tool_use::start"); let _ = session_service @@ -212,11 +224,10 @@ async fn main() -> Result<(), Box> { tool_box, llm_broker, UserContext::default(), - None, true, // turn on reasoning false, Some(args.log_directory.clone()), - Some(args.repo_name.clone()), + tool_use_agent_properties, message_properties, None, // No context crunching LLM for agent_bin_reasoning ) diff --git a/sidecar/src/bin/swe_bench_agent_bin.rs b/sidecar/src/bin/swe_bench_agent_bin.rs index 840d196ca..c79670c9d 100644 --- a/sidecar/src/bin/swe_bench_agent_bin.rs +++ b/sidecar/src/bin/swe_bench_agent_bin.rs @@ -14,7 +14,10 @@ use sidecar::{ events::{input::SymbolEventRequestId, message_event::SymbolEventMessageProperties}, identifier::LLMProperties, }, - tool::r#type::ToolType, + tool::{ + r#type::ToolType, + session::tool_use_agent::{AgentThinkingMode, ToolUseAgentProperties}, + }, }, application::{application::Application, config::configuration::Configuration}, repo::types::RepoRef, @@ -203,6 +206,15 @@ Your thinking should be thorough and so it's fine if it's very long."#, ToolType::FindFiles, ]; + let tool_use_agent_properties = ToolUseAgentProperties::new( + false, + "bash".to_owned(), + AgentThinkingMode::MiniCOTBeforeTool, + true, // is running under eval harness + Some(args.repo_name.to_owned()), + 
aide_rules.clone(), + ); + // wait for the agent to finish over here while busy looping println!("agent::tool_use::start"); let _ = session_service @@ -221,11 +233,10 @@ Your thinking should be thorough and so it's fine if it's very long."#, tool_box, llm_broker, UserContext::default(), - aide_rules, false, false, Some(args.log_directory.clone()), - Some(args.repo_name.clone()), + tool_use_agent_properties, message_properties, None, // No context crunching LLM for agent_bin ) diff --git a/sidecar/src/mcts/execution/inference.rs b/sidecar/src/mcts/execution/inference.rs index 4067cf4f5..b30d24f12 100644 --- a/sidecar/src/mcts/execution/inference.rs +++ b/sidecar/src/mcts/execution/inference.rs @@ -23,7 +23,7 @@ use crate::{ session::{ chat::SessionChatMessage, tool_use_agent::{ - ToolUseAgent, ToolUseAgentInput, ToolUseAgentInputOnlyTools, + AgentThinkingMode, ToolUseAgent, ToolUseAgentInput, ToolUseAgentInputOnlyTools, ToolUseAgentOutputType, ToolUseAgentOutputWithTools, ToolUseAgentProperties, }, }, @@ -257,8 +257,14 @@ impl InferenceEngine { search_tree.llm_client(), search_tree.root_directory(), "linux".to_owned(), - "bash".to_owned(), - ToolUseAgentProperties::new(true, Some(search_tree.repo_name()), None), + ToolUseAgentProperties::new( + true, + "bash".to_owned(), + AgentThinkingMode::MiniCOTBeforeTool, + true, + Some(search_tree.repo_name()), + None, + ), ); let session_messages = messages @@ -441,8 +447,14 @@ impl InferenceEngine { search_tree.llm_client(), search_tree.root_directory(), "linux".to_owned(), - "bash".to_owned(), - ToolUseAgentProperties::new(true, Some(search_tree.repo_name()), None), + ToolUseAgentProperties::new( + true, + "bash".to_owned(), + AgentThinkingMode::MiniCOTBeforeTool, + true, // is running under eval harness + Some(search_tree.repo_name()), + None, + ), ); let mut session_messages = messages diff --git a/sidecar/src/webserver/agentic.rs b/sidecar/src/webserver/agentic.rs index e9c0542a5..05c26d222 100644 --- 
a/sidecar/src/webserver/agentic.rs +++ b/sidecar/src/webserver/agentic.rs @@ -35,6 +35,7 @@ use crate::agentic::tool::errors::ToolError; use crate::agentic::tool::lsp::open_file::OpenFileResponse; use crate::agentic::tool::plan::service::PlanService; use crate::agentic::tool::session::session::AideAgentMode; +use crate::agentic::tool::session::tool_use_agent::{AgentThinkingMode, ToolUseAgentProperties}; use crate::chunking::text_document::Range; use crate::repo::types::RepoRef; use crate::webserver::plan::{ @@ -1783,6 +1784,15 @@ pub async fn agent_tool_use( vec![] }) .collect(); + + let tool_use_agent_properties = ToolUseAgentProperties::new( + true, + shell.to_owned(), + AgentThinkingMode::MiniCOTBeforeTool, + false, // running under eval harness + Some(repo_name), + aide_rules, + ); let _ = tokio::spawn({ let sender = sender.clone(); let session_id = session_id.clone(); @@ -1804,11 +1814,10 @@ pub async fn agent_tool_use( tool_box, llm_broker, user_context, - aide_rules, reasoning, true, // we are running inside the editor over here mcts_log_directory, - Some(repo_name), + tool_use_agent_properties, message_properties, None, // No context crunching LLM for web requests )