diff --git a/experimental/aitools/DESIGN.md b/experimental/aitools/DESIGN.md
index 7cc141858a..92e46317b3 100644
--- a/experimental/aitools/DESIGN.md
+++ b/experimental/aitools/DESIGN.md
@@ -192,6 +192,24 @@ the system as a whole a bit (btw each tool should be defined in a separate .go f
 - further implementation guidance: i want acceptance tests for each of these project types (app, dashboard, job, pipeline); this means they should be exposed as a hidden command like 'databricks experimental aitools tool add_project_resource --json <json>'. having these tests will be instrumental for iterating on them; the init should not fail! note that the tool subcommand should just assume that the cwd is the current project dir.
+- the "workspace_info" tool:
+  - description: Get information about Databricks workspaces. Call without parameters to list all available workspaces and get current workspace details. Call with a profile parameter to get detailed information about a specific workspace (warehouse, user, etc.).
+  - parameter: profile - optional workspace profile name. If provided, returns detailed information about that specific workspace. If omitted, lists all available workspaces and shows details for the current workspace.
+  - implementation:
+    - When called without parameters:
+      1. Shows current workspace details (profile, host, cloud, user, warehouse, catalog)
+      2. Lists all available workspace profiles with their URLs and cloud providers (if multiple exist)
+      3. Provides guidance on how to get details about other workspaces and how to use the --profile flag
+    - When called with a profile parameter:
+      1. Validates the profile exists in ~/.databrickscfg
+      2. Shows workspace URL and cloud provider
+      3. Gets current user via SCIM API
+      4. Gets default SQL warehouse using GetDefaultWarehouse()
+      5. Gets default Unity Catalog if available
+  - output: Formatted text with workspace information (profile, host, cloud, user, warehouse, catalog)
+  - file layout: Single workspace_info.go file with getWorkspaceDetails, listWorkspacesWithCurrent, getCurrentUser, and getDefaultCatalog helpers
+  - key use case: When user wants to know what workspaces they have access to, or get connection details for a specific workspace
+
 - the "explore" tool:
   - description: CALL THIS FIRST when user mentions a workspace by name or asks about workspace resources. Shows available workspaces/profiles, default warehouse, and provides guidance on exploring jobs, clusters, catalogs, and other Databricks resources. Use this to discover what's available before running CLI commands.
   - no parameters needed
diff --git a/experimental/aitools/server.go b/experimental/aitools/server.go
index 7d9268b0a3..0ecdb610fa 100644
--- a/experimental/aitools/server.go
+++ b/experimental/aitools/server.go
@@ -66,6 +66,7 @@ type mcpServer struct {
 // getAllTools returns all tools (definitions + handlers) for the MCP server.
 func getAllTools() []tools.Tool {
     return []tools.Tool{
+        tools.WorkspaceInfoTool,
         tools.InvokeDatabricksCLITool,
         tools.InitProjectTool,
         tools.AnalyzeProjectTool,
diff --git a/experimental/aitools/tools/analyze_project.go b/experimental/aitools/tools/analyze_project.go
index 14a6dc4149..cef05fb591 100644
--- a/experimental/aitools/tools/analyze_project.go
+++ b/experimental/aitools/tools/analyze_project.go
@@ -2,6 +2,7 @@ package tools
 
 import (
     "context"
+    "fmt"
     "os"
     "os/exec"
     "path/filepath"
@@ -10,6 +11,7 @@ import (
     "github.com/databricks/cli/experimental/aitools/auth"
     "github.com/databricks/cli/experimental/aitools/tools/prompts"
     "github.com/databricks/cli/experimental/aitools/tools/resources"
+    "github.com/databricks/cli/libs/log"
 )
 
 // AnalyzeProjectTool analyzes a Databricks project and returns guidance.
@@ -70,14 +72,23 @@ func AnalyzeProject(ctx context.Context, args analyzeProjectArgs) (string, error
             string(content)
     }
 
+    // Get current workspace context
+    currentProfile := getCurrentProfile(ctx)
+    workspaceContext, err := getWorkspaceDetails(ctx, currentProfile)
+    if err != nil {
+        log.Debugf(ctx, "Failed to get workspace details (non-fatal): %v", err)
+        workspaceContext = fmt.Sprintf("Current Profile: %s\n(Unable to load detailed workspace information)", currentProfile)
+    }
+
     // Get default warehouse for apps and other resources that need it
-    warehouse, err := GetDefaultWarehouse(ctx)
+    warehouse, err := GetDefaultWarehouse(ctx, currentProfile)
     resourceGuidance := getResourceGuidance(args.ProjectPath, warehouse)
 
     data := map[string]string{
         "Summary":          summary,
         "ReadmeContent":    readmeContent,
         "ResourceGuidance": resourceGuidance,
+        "WorkspaceContext": workspaceContext,
     }
 
     if err == nil && warehouse != nil {
diff --git a/experimental/aitools/tools/explore.go b/experimental/aitools/tools/explore.go
index 9223ccb4a5..6644d2bc84 100644
--- a/experimental/aitools/tools/explore.go
+++ b/experimental/aitools/tools/explore.go
@@ -8,7 +8,6 @@ import (
     "strings"
 
     "github.com/databricks/cli/experimental/aitools/tools/prompts"
-    "github.com/databricks/cli/libs/databrickscfg/profile"
     "github.com/databricks/cli/libs/env"
     "github.com/databricks/cli/libs/exec"
     "github.com/databricks/cli/libs/log"
@@ -18,23 +17,30 @@ import (
 var ExploreTool = Tool{
     Definition: ToolDefinition{
         Name:        "explore",
-        Description: "**REQUIRED DURING PLAN MODE** - Call this FIRST when planning ANY Databricks work. Use this to discover available workspaces, warehouses, and get workflow recommendations for your specific task. Even if you're just reading an assignment document, call this first. Especially important when task involves: creating Databricks projects/apps/pipelines/jobs, SQL pipelines or data transformation workflows, deploying code to multiple environments (dev/prod), or working with databricks.yml files. You DON'T need a workspace name - call this when starting ANY Databricks planning to understand workspace capabilities and recommended tooling before you create your plan.",
+        Description: "**REQUIRED DURING PLAN MODE** - Call this FIRST when planning ANY Databricks work. Discovers available workspaces, shows current workspace details (URL, warehouse, user), and provides comprehensive workflow recommendations. Even if you're just reading an assignment document, call this first. Especially important when task involves: creating Databricks projects/apps/pipelines/jobs, SQL pipelines or data transformation workflows, deploying code to multiple environments (dev/prod), or working with databricks.yml files.",
         InputSchema: map[string]any{
             "type":       "object",
             "properties": map[string]any{},
         },
     },
     Handler: func(ctx context.Context, params map[string]any) (string, error) {
-        warehouse, err := GetDefaultWarehouse(ctx)
+        // Get workspace context via listWorkspacesWithCurrent
+        workspaceContext, err := listWorkspacesWithCurrent(ctx)
         if err != nil {
-            log.Debugf(ctx, "Failed to get default warehouse (non-fatal): %v", err)
-            warehouse = nil
+            log.Debugf(ctx, "Failed to get workspace context (non-fatal): %v", err)
+            workspaceContext = "Unable to load workspace information. You may need to authenticate first."
         }
+        // Get warehouse ID for SQL query examples in guidance
         currentProfile := getCurrentProfile(ctx)
-        profiles := getAvailableProfiles(ctx)
+        warehouse, err := GetDefaultWarehouse(ctx, currentProfile)
+        warehouseID := ""
+        if err == nil && warehouse != nil {
+            warehouseID = warehouse.ID
+        }
 
-        return generateExploreGuidance(warehouse, currentProfile, profiles), nil
+        // Generate guidance with warehouse context
+        return generateExploreGuidance(workspaceContext, warehouseID), nil
     },
 }
 
@@ -47,13 +53,21 @@ type warehouse struct {
 // GetDefaultWarehouse finds a suitable SQL warehouse for queries.
 // It filters out warehouses the user cannot access and prefers RUNNING warehouses,
 // then falls back to STOPPED ones (which auto-start).
-func GetDefaultWarehouse(ctx context.Context) (*warehouse, error) {
+// The profile parameter specifies which workspace profile to use (defaults to DEFAULT if empty).
+func GetDefaultWarehouse(ctx context.Context, profile string) (*warehouse, error) {
     executor, err := exec.NewCommandExecutor("")
     if err != nil {
         return nil, fmt.Errorf("failed to create command executor: %w", err)
     }
 
-    output, err := executor.Exec(ctx, fmt.Sprintf(`"%s" api get "/api/2.0/sql/warehouses?skip_cannot_use=true" --output json`, GetCLIPath()))
+    // Build the CLI command with optional --profile flag
+    cmd := fmt.Sprintf(`"%s"`, GetCLIPath())
+    if profile != "" && profile != "DEFAULT" {
+        cmd += fmt.Sprintf(` --profile "%s"`, profile)
+    }
+    cmd += ` api get "/api/2.0/sql/warehouses?skip_cannot_use=true" --output json`
+
+    output, err := executor.Exec(ctx, cmd)
     if err != nil {
         return nil, fmt.Errorf("failed to list warehouses: %w\nOutput: %s", err, output)
     }
@@ -98,69 +112,10 @@ func getCurrentProfile(ctx context.Context) string {
     return profileName
 }
 
-// getAvailableProfiles returns all available profiles from ~/.databrickscfg.
-func getAvailableProfiles(ctx context.Context) profile.Profiles {
-    profiles, err := profile.DefaultProfiler.LoadProfiles(ctx, profile.MatchAllProfiles)
-    if err != nil {
-        // If we can't load profiles, return empty list (config file might not exist)
-        return profile.Profiles{}
-    }
-    return profiles
-}
-
 // generateExploreGuidance creates comprehensive guidance for data exploration.
-func generateExploreGuidance(warehouse *warehouse, currentProfile string, profiles profile.Profiles) string {
-    // Build workspace/profile information
-    workspaceInfo := "Current Workspace Profile: " + currentProfile
-    if len(profiles) > 0 {
-        // Find current profile details
-        var currentHost string
-        for _, p := range profiles {
-            if p.Name == currentProfile {
-                currentHost = p.Host
-                if cloud := p.Cloud(); cloud != "" {
-                    currentHost = fmt.Sprintf("%s (%s)", currentHost, cloud)
-                }
-                break
-            }
-        }
-        if currentHost != "" {
-            workspaceInfo = fmt.Sprintf("Current Workspace Profile: %s - %s", currentProfile, currentHost)
-        }
-    }
-
-    // Build available profiles list
-    profilesInfo := ""
-    if len(profiles) > 1 {
-        profilesInfo = "\n\nAvailable Workspace Profiles:\n"
-        for _, p := range profiles {
-            marker := ""
-            if p.Name == currentProfile {
-                marker = " (current)"
-            }
-            cloud := p.Cloud()
-            if cloud != "" {
-                profilesInfo += fmt.Sprintf("  - %s: %s (%s)%s\n", p.Name, p.Host, cloud, marker)
-            } else {
-                profilesInfo += fmt.Sprintf("  - %s: %s%s\n", p.Name, p.Host, marker)
-            }
-        }
-        profilesInfo += "\n  To use a different workspace, add --profile to any command:\n"
-        profilesInfo += "  invoke_databricks_cli '--profile prod catalogs list'\n"
-    }
-
-    // Handle warehouse information (may be nil if lookup failed)
-    warehouseName := ""
-    warehouseID := ""
-    if warehouse != nil {
-        warehouseName = warehouse.Name
-        warehouseID = warehouse.ID
-    }
-
+func generateExploreGuidance(workspaceContext, warehouseID string) string {
     return prompts.MustExecuteTemplate("explore.tmpl", map[string]string{
-        "WorkspaceInfo": workspaceInfo,
-        "WarehouseName": warehouseName,
-        "WarehouseID":   warehouseID,
-        "ProfilesInfo":  profilesInfo,
+        "WorkspaceContext": workspaceContext,
+        "WarehouseID":      warehouseID,
     })
 }
diff --git a/experimental/aitools/tools/prompts/analyze_project.tmpl b/experimental/aitools/tools/prompts/analyze_project.tmpl
index ea0265ccec..bd48736e80 100644
--- a/experimental/aitools/tools/prompts/analyze_project.tmpl
+++ b/experimental/aitools/tools/prompts/analyze_project.tmpl
@@ -14,6 +14,12 @@ Project Analysis
 
 {{.Summary}}
 
+Current Workspace
+-----------------
+{{.WorkspaceContext}}
+
+Use workspace_info(profile='<profile-name>') to get details about other workspaces.
+
 Guidance for Working with this Project
 --------------------------------------
diff --git a/experimental/aitools/tools/prompts/explore.tmpl b/experimental/aitools/tools/prompts/explore.tmpl
index 2075c37e19..7721559aa8 100644
--- a/experimental/aitools/tools/prompts/explore.tmpl
+++ b/experimental/aitools/tools/prompts/explore.tmpl
@@ -10,11 +10,10 @@
 Databricks Data Exploration Guide
 =====================================
 
-{{.WorkspaceInfo}}{{if .WarehouseName}}
-Default SQL Warehouse: {{.WarehouseName}} ({{.WarehouseID}}){{else}}
-Note: No SQL warehouse detected. SQL queries will require warehouse_id to be specified manually.{{end}}{{.ProfilesInfo}}
+{{.WorkspaceContext}}
 
 IMPORTANT: Use the invoke_databricks_cli tool to run all commands below!
+Use workspace_info(profile='<profile-name>') to get details about other workspaces.
 
 1. EXECUTING SQL QUERIES
@@ -76,7 +75,99 @@ IMPORTANT: Use the invoke_databricks_cli tool to run all commands below!
 Getting Started:
 - Use the commands above to explore what resources exist in the workspace
 - All commands support --output json for programmatic access
-- Remember to add --profile when working with non-default workspaces
+- To use a different workspace: workspace_info(profile='<profile-name>') then invoke_databricks_cli('--profile <profile-name> <command>')
+
+WORKFLOW PATTERNS FOR NOTEBOOKS
+===============================
+
+Create notebooks locally (.ipynb), fetch data from Databricks, generate visualizations.
+
+## Setup (one-time)
+
+```bash
+# Install uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
+
+**pyproject.toml** dev dependencies:
+```toml
+[dependency-groups]
+dev = [
+    "databricks-connect>=15.4.0",
+    "papermill",
+    "nbformat",
+    "matplotlib",
+]
+```
+
+```bash
+# Install deps
+uv sync
+
+# Auto-init spark (no boilerplate in notebooks)
+mkdir -p ~/.ipython/profile_default/startup
+cat > ~/.ipython/profile_default/startup/00-databricks-spark.py << 'EOF'
+try:
+    from databricks.connect import DatabricksSession
+    spark = DatabricksSession.builder.getOrCreate()
+except Exception: pass
+EOF
+```
+
+## Workflow
+
+**Create or update notebook:**
+
+Add exploratory cells to an .ipynb notebook, for example:
+
+```python
+import pandas as pd, matplotlib.pyplot as plt
+
+# Aggregate in Spark, limit for viz
+df = spark.sql("""
+    SELECT category, COUNT(*) as count, AVG(value) as avg_value
+    FROM catalog.schema.table
+    GROUP BY category
+    ORDER BY count DESC
+""").limit(10000).toPandas()
+
+df.plot(x='category', y='avg_value', kind='bar', figsize=(10, 6))
+plt.title('Average Value by Category')
+plt.show()
+```
+
+**Execute:**
+
+Execute the notebook and produce results inline in the .ipynb:
+
+```bash
+DATABRICKS_CONFIG_PROFILE=<profile> DATABRICKS_SERVERLESS_COMPUTE_ID=auto \
+  uv run papermill notebook.ipynb notebook_executed.ipynb -k python3
+```
+
+**Iterate:**
+
+Iterating on the notebook is MANDATORY. Do not assume the first execution succeeded.
+
+Papermill embeds all outputs (stdout, stderr, plots as base64, errors) into the executed .ipynb.
+
+To iterate:
+1. Read `notebook_executed.ipynb` to see results (outputs are in cell JSON)
+2. Check for errors, review data/plots
+3. Optionally, do any quick exploratory queries directly via the CLI
+4. Modify `notebook.ipynb` based on results
+5. Re-execute and repeat
+
+**View in IDE:**
+- `cursor notebook_executed.ipynb` or `code notebook_executed.ipynb`
+- Or deploy: `databricks bundle deploy` and open in workspace browser
+
+## Key Pattern
+
+**Aggregate → Limit → Pandas → Visualize → Read outputs → Iterate**
+
+Always aggregate in Spark (GROUP BY, AVG, COUNT), then `.limit(10000)` before `.toPandas()`. Execute with papermill, read the executed .ipynb to see results, iterate.
+
 WORKFLOW PATTERNS FOR DATABRICKS PROJECTS
diff --git a/experimental/aitools/tools/workspace_info.go b/experimental/aitools/tools/workspace_info.go
new file mode 100644
index 0000000000..689bddeeff
--- /dev/null
+++ b/experimental/aitools/tools/workspace_info.go
@@ -0,0 +1,223 @@
+package tools
+
+import (
+    "context"
+    "encoding/json"
+    "fmt"
+    "strings"
+
+    "github.com/databricks/cli/libs/databrickscfg/profile"
+    "github.com/databricks/cli/libs/exec"
+    "github.com/databricks/cli/libs/log"
+)
+
+// WorkspaceInfoTool provides information about available workspaces and their details.
+var WorkspaceInfoTool = Tool{
+    Definition: ToolDefinition{
+        Name:        "workspace_info",
+        Description: "Get information about Databricks workspaces. Call without parameters to list all available workspaces and get current workspace details. Call with a profile parameter to get detailed information about a specific workspace (warehouse, user, etc.).",
+        InputSchema: map[string]any{
+            "type": "object",
+            "properties": map[string]any{
+                "profile": map[string]any{
+                    "type":        "string",
+                    "description": "Optional workspace profile name. If provided, returns detailed information about that specific workspace. If omitted, lists all available workspaces and shows details for the current workspace.",
+                },
+            },
+        },
+    },
+    Handler: func(ctx context.Context, params map[string]any) (string, error) {
+        profileParam, hasProfile := params["profile"].(string)
+
+        if hasProfile && profileParam != "" {
+            // Get detailed info about specific workspace
+            return getWorkspaceDetails(ctx, profileParam)
+        }
+
+        // List all workspaces + current workspace details
+        return listWorkspacesWithCurrent(ctx)
+    },
+}
+
+// getWorkspaceDetails returns detailed information about a specific workspace.
+func getWorkspaceDetails(ctx context.Context, profileName string) (string, error) {
+    // Validate profile exists
+    profiles, err := profile.DefaultProfiler.LoadProfiles(ctx, profile.MatchAllProfiles)
+    if err != nil {
+        return "", fmt.Errorf("failed to load profiles: %w", err)
+    }
+
+    var targetProfile *profile.Profile
+    for i := range profiles {
+        if profiles[i].Name == profileName {
+            targetProfile = &profiles[i]
+            break
+        }
+    }
+
+    if targetProfile == nil {
+        return "", fmt.Errorf("profile '%s' not found", profileName)
+    }
+
+    var result strings.Builder
+    result.WriteString(fmt.Sprintf("Workspace Details for Profile: %s\n", profileName))
+    result.WriteString(strings.Repeat("=", 50) + "\n\n")
+
+    result.WriteString(fmt.Sprintf("Workspace URL: %s\n", targetProfile.Host))
+    if cloud := targetProfile.Cloud(); cloud != "" {
+        result.WriteString(fmt.Sprintf("Cloud Provider: %s\n", cloud))
+    }
+
+    // Get current user
+    if user, err := getCurrentUser(ctx, profileName); err == nil && user != "" {
+        result.WriteString(fmt.Sprintf("Current User: %s\n", user))
+    }
+
+    // Get default warehouse
+    warehouse, err := GetDefaultWarehouse(ctx, profileName)
+    if err != nil {
+        log.Debugf(ctx, "Failed to get default warehouse: %v", err)
+        result.WriteString("\nDefault Warehouse: Not available\n")
+        result.WriteString("Note: You may need to authenticate or no SQL warehouses are accessible.\n")
+    } else if warehouse != nil {
+        result.WriteString("\nDefault SQL Warehouse:\n")
+        result.WriteString(fmt.Sprintf("  Name: %s\n", warehouse.Name))
+        result.WriteString(fmt.Sprintf("  ID: %s\n", warehouse.ID))
+        result.WriteString(fmt.Sprintf("  State: %s\n", warehouse.State))
+    }
+
+    // Get Unity Catalog info
+    if catalog, err := getDefaultCatalog(ctx, profileName); err == nil && catalog != "" {
+        result.WriteString("\nUnity Catalog:\n")
+        result.WriteString(fmt.Sprintf("  Default Catalog: %s\n", catalog))
+    }
+
+    return result.String(), nil
+}
+
+// listWorkspacesWithCurrent lists all available workspaces and shows details for the current one.
+func listWorkspacesWithCurrent(ctx context.Context) (string, error) {
+    currentProfile := getCurrentProfile(ctx)
+    profiles, err := profile.DefaultProfiler.LoadProfiles(ctx, profile.MatchAllProfiles)
+    if err != nil {
+        return "", fmt.Errorf("failed to load profiles: %w", err)
+    }
+
+    if len(profiles) == 0 {
+        return "No Databricks workspace profiles configured.\n\nTo configure a workspace, run:\n  databricks auth login --host <workspace-url>", nil
+    }
+
+    var result strings.Builder
+
+    // Show current workspace details first
+    result.WriteString("Current Workspace\n")
+    result.WriteString(strings.Repeat("=", 50) + "\n\n")
+
+    details, err := getWorkspaceDetails(ctx, currentProfile)
+    if err != nil {
+        result.WriteString(fmt.Sprintf("Profile: %s\n", currentProfile))
+        result.WriteString(fmt.Sprintf("Error getting details: %v\n", err))
+    } else {
+        // Remove the header from details since we have our own
+        detailsLines := strings.Split(details, "\n")
+        if len(detailsLines) > 2 {
+            result.WriteString(strings.Join(detailsLines[3:], "\n"))
+        }
+    }
+
+    // List all available workspaces
+    if len(profiles) > 1 {
+        result.WriteString("\n\nAvailable Workspaces\n")
+        result.WriteString(strings.Repeat("=", 50) + "\n\n")
+
+        for _, p := range profiles {
+            marker := ""
+            if p.Name == currentProfile {
+                marker = " (current)"
+            }
+
+            if cloud := p.Cloud(); cloud != "" {
+                result.WriteString(fmt.Sprintf("  %s: %s (%s)%s\n", p.Name, p.Host, cloud, marker))
+            } else {
+                result.WriteString(fmt.Sprintf("  %s: %s%s\n", p.Name, p.Host, marker))
+            }
+        }
+
+        result.WriteString("\nTo get details about a different workspace:\n")
+        result.WriteString("  workspace_info(profile='<profile-name>')\n")
+        result.WriteString("\nTo use a different workspace for commands:\n")
+        result.WriteString("  invoke_databricks_cli('--profile <profile-name> <command>')\n")
+    }
+
+    return result.String(), nil
+}
+
+// getCurrentUser returns the current user's username or email.
+func getCurrentUser(ctx context.Context, profileName string) (string, error) {
+    executor, err := exec.NewCommandExecutor("")
+    if err != nil {
+        return "", err
+    }
+
+    cmd := fmt.Sprintf(`"%s"`, GetCLIPath())
+    if profileName != "" && profileName != "DEFAULT" {
+        cmd += fmt.Sprintf(` --profile "%s"`, profileName)
+    }
+    cmd += ` api get "/api/2.0/preview/scim/v2/Me" --output json`
+
+    output, err := executor.Exec(ctx, cmd)
+    if err != nil {
+        return "", err
+    }
+
+    var response struct {
+        UserName string `json:"userName"`
+        Emails   []struct {
+            Value string `json:"value"`
+        } `json:"emails"`
+    }
+
+    if err := json.Unmarshal(output, &response); err != nil {
+        return "", err
+    }
+
+    if response.UserName != "" {
+        return response.UserName, nil
+    }
+
+    if len(response.Emails) > 0 {
+        return response.Emails[0].Value, nil
+    }
+
+    return "", nil
+}
+
+// getDefaultCatalog returns the default Unity Catalog catalog name.
+func getDefaultCatalog(ctx context.Context, profileName string) (string, error) {
+    executor, err := exec.NewCommandExecutor("")
+    if err != nil {
+        return "", err
+    }
+
+    cmd := fmt.Sprintf(`"%s"`, GetCLIPath())
+    if profileName != "" && profileName != "DEFAULT" {
+        cmd += fmt.Sprintf(` --profile "%s"`, profileName)
+    }
+    cmd += ` api get "/api/2.1/unity-catalog/current-metastore-assignment" --output json`
+
+    output, err := executor.Exec(ctx, cmd)
+    if err != nil {
+        // Unity Catalog might not be enabled
+        return "", nil
+    }
+
+    var response struct {
+        DefaultCatalogName string `json:"default_catalog_name"`
+    }
+
+    // Catalog info is best-effort: treat unparseable output as "no default catalog" rather than failing.
+    if err := json.Unmarshal(output, &response); err != nil {
+        return "", nil
+    }
+
+    return response.DefaultCatalogName, nil
+}
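
DESIGN.md above asks for acceptance tests so the new tools can be iterated on. A minimal sketch of what such a test could look like for workspace_info, using only the `Tool.Handler` signature that appears in this diff; the test names are hypothetical, and the first test assumes the test environment has at least one configured profile with working credentials:

```go
package tools

import (
	"context"
	"strings"
	"testing"
)

// Calling without parameters should list workspaces and never hard-fail just
// because some detail (warehouse, catalog) is unavailable.
func TestWorkspaceInfoToolListsWorkspaces(t *testing.T) {
	ctx := context.Background()

	out, err := WorkspaceInfoTool.Handler(ctx, map[string]any{})
	if err != nil {
		t.Fatalf("workspace_info without params: %v", err)
	}
	if !strings.Contains(out, "Workspace") {
		t.Errorf("expected workspace details in output, got: %q", out)
	}
}

// A profile that is not in ~/.databrickscfg should produce a clear error.
func TestWorkspaceInfoToolRejectsUnknownProfile(t *testing.T) {
	ctx := context.Background()

	if _, err := WorkspaceInfoTool.Handler(ctx, map[string]any{"profile": "no-such-profile"}); err == nil {
		t.Fatal("expected an error for a nonexistent profile")
	}
}
```

The second test needs no credentials at all, which makes it a cheap guard for the "profile not found" path in getWorkspaceDetails.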
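One design observation: the quoting and optional --profile injection is now written out three times, in GetDefaultWarehouse, getCurrentUser, and getDefaultCatalog. A sketch of one way the three copies could be consolidated; the cliCommand helper is hypothetical and not part of this diff, and it reuses only GetCLIPath, which the diff already calls:

```go
package tools

import "fmt"

// cliCommand builds a Databricks CLI invocation string: it quotes the CLI
// path, injects --profile for non-default profiles, and appends the
// subcommand. This mirrors the pattern repeated at the three call sites above.
func cliCommand(profileName, rest string) string {
	cmd := fmt.Sprintf(`"%s"`, GetCLIPath())
	if profileName != "" && profileName != "DEFAULT" {
		cmd += fmt.Sprintf(` --profile "%s"`, profileName)
	}
	return cmd + " " + rest
}
```

Each call site would then reduce to a single line, for example: executor.Exec(ctx, cliCommand(profileName, `api get "/api/2.0/preview/scim/v2/Me" --output json`)).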