Skip to content

Commit 2f06d47

Browse files
WebMemories: Update user action detection (#1331)
- Add an adapter for knowledge extraction that allows us to re-use the discovery agent's action detection code.
- Update display of discovered actions in the page knowledge sidepanel.
1 parent a72a572 commit 2f06d47

File tree

10 files changed

+847
-31
lines changed

10 files changed

+847
-31
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT License.
3+
4+
// One entry of the unified, de-duplicated list of user actions, split into structured components
5+
export type UnifiedAction = {
6+
// The action verb (e.g., "buy", "book", "track", "search")
7+
verb: string;
8+
// The direct object of the action (e.g., "groceries", "flight", "package")
9+
directObject: string;
10+
// Human-readable short description (e.g., "user can buy groceries for delivery")
11+
shortDescription: string;
12+
// Confidence score for this action detection, from 0 to 1
13+
confidence: number;
14+
// Source of detection: "page_summary", "candidate_actions", or "unified"
15+
source: "page_summary" | "candidate_actions" | "unified";
16+
};
17+
18+
// The de-duplicated actions together with the counts before and after deduplication
export type UnifiedActionsList = {
19+
// Array of unified, de-duplicated user actions
20+
actions: UnifiedAction[];
21+
// Total number of actions found before deduplication (total from both sources)
22+
originalCount: number;
23+
// Number of actions after deduplication
24+
finalCount: number;
25+
};

ts/packages/agents/browser/src/agent/discovery/translator.mts

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ import fs from "fs";
1414
import { openai as ai } from "aiclient";
1515
import { fileURLToPath } from "node:url";
1616
import { SchemaDiscoveryActions } from "./schema/discoveryActions.mjs";
17+
import { PageDescription } from "./schema/pageSummary.mjs";
18+
import { UserActionsList } from "./schema/userActionsPool.mjs";
1719

1820
export type HtmlFragments = {
1921
frameId: string;
@@ -341,6 +343,72 @@ export class SchemaDiscoveryAgent<T extends object> {
341343
return response;
342344
}
343345

346+
/**
 * Asks the model for a single, de-duplicated list of user actions built
 * from the page-summary actions and the schema-based candidate actions.
 *
 * @param candidateActions Candidate actions; their `actions` array is embedded in the prompt as JSON.
 * @param pageDescription Optional page summary; its `possibleUserAction` list is embedded in the prompt as JSON.
 * @param fragments Optional HTML fragments used for the HTML and screenshot prompt sections.
 * @param screenshots Optional screenshots used for the screenshot prompt section.
 * @returns The result of translating the prompt with the "UnifiedActionsList" bootstrap translator.
 */
async unifyUserActions(
    candidateActions: UserActionsList,
    pageDescription?: PageDescription,
    fragments?: HtmlFragments[],
    screenshots?: string[],
) {
    const unifiedActionsSchema =
        await getSchemaFileContents("unifiedActions.mts");
    const bootstrapTranslator = this.getBootstrapTranslator(
        "UnifiedActionsList",
        unifiedActionsSchema,
    );

    // JSON.stringify(undefined) returns undefined, which a template literal
    // renders as the literal text "undefined". Fall back to an empty list so
    // the prompt always contains valid JSON when a source is missing.
    const pageSummaryActionsJson = JSON.stringify(
        pageDescription?.possibleUserAction ?? [],
        null,
        2,
    );
    const candidateActionsJson = JSON.stringify(
        candidateActions.actions ?? [],
        null,
        2,
    );

    const screenshotSection = getScreenshotPromptSection(
        screenshots,
        fragments,
    );
    const htmlSection = getHtmlPromptSection(fragments);
    const prefixSection = getPrefixPromptSection();
    const suffixSection = getSuffixPromptSection();

    const promptSections = [
        ...prefixSection,
        ...screenshotSection,
        ...htmlSection,
        {
            type: "text",
            text: `
You need to create a unified, de-duplicated list of user actions from two sources:

1. Page Summary Actions (high-level user capabilities):
'''
${pageSummaryActionsJson}
'''

2. Candidate Actions (detailed schema-based actions):
'''
${candidateActionsJson}
'''

Create a de-duplicated list combining these inputs. Rules for deduplication:
- Combine similar actions (e.g., "purchase item" and "buy product" → "buy product")
- Prefer more specific descriptions from candidate actions
- If page summary has high-level action like "order food" and candidate has "add item to cart",
create unified action "add food to cart" that captures both intents
- Include originalCount (total from both sources) and finalCount (after deduplication)

Generate a SINGLE "${bootstrapTranslator.validator.getTypeName()}" response using the typescript schema below.

'''
${bootstrapTranslator.validator.getSchemaText()}
'''
`,
        },
        ...suffixSection,
    ];

    const response = await bootstrapTranslator.translate("", [
        {
            role: "user",
            content: promptSections as MultimodalPromptContent[],
        },
    ]);
    return response;
}
411+
344412
async getPageSummary(
345413
userRequest?: string,
346414
fragments?: HtmlFragments[],
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT License.
3+
4+
import {
5+
createDiscoveryPageTranslator,
6+
HtmlFragments,
7+
} from "../discovery/translator.mjs";
8+
import { PageDescription } from "../discovery/schema/pageSummary.mjs";
9+
import { UserActionsList } from "../discovery/schema/userActionsPool.mjs";
10+
import { UnifiedActionsList } from "../discovery/schema/unifiedActions.mjs";
11+
import { DetectedAction } from "./schema/knowledgeExtraction.mjs";
12+
import { ExtractionMode } from "website-memory";
13+
14+
/**
 * ActionDetectionAdapter bridges knowledge extraction and discovery agent systems.
 * Orchestrates three-phase action detection process:
 * 1. Page Summary - get high-level actions from page analysis
 * 2. Candidate Actions - get detailed actions from known schemas
 * 3. Unified Actions - deduplicate and structure actions consistently
 *
 * Failures in any phase are logged and reported as "no actions" so that the
 * surrounding knowledge extraction is never aborted by action detection.
 */
export class ActionDetectionAdapter {
    /** Extraction modes for which action detection runs; any other mode yields no actions. */
    private static readonly supportedModes: readonly string[] = [
        "actions",
        "full",
    ];

    // Left untyped on purpose: only the three methods called in
    // executeThreePhaseDetection are used, and their responses are narrowed there.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    private discoveryAgent: any = null;
    // True once initialization has been attempted, whether or not it succeeded.
    private isInitialized = false;

    constructor() {}

    /**
     * Main entry point: detect actions for the given HTML fragments.
     *
     * @param htmlFragments Fragments to analyze; entries without `content` or `text` are ignored.
     * @param mode Extraction mode; detection only runs for "actions" and "full".
     * @param screenshots Optional screenshots forwarded to the discovery agent.
     * @returns The detected actions, or an empty array when the mode is
     *          unsupported, the agent is unavailable, no usable fragment
     *          exists, or any step fails. This method does not throw.
     */
    async detectActions(
        htmlFragments: any[],
        mode: ExtractionMode,
        screenshots?: string[],
    ): Promise<DetectedAction[]> {
        try {
            // Only perform action detection for modes that support it
            if (!ActionDetectionAdapter.supportedModes.includes(mode)) {
                return [];
            }

            // Initialize discovery agent if needed
            await this.ensureDiscoveryAgent();

            if (!this.discoveryAgent) {
                console.warn(
                    "Discovery agent not available, skipping action detection",
                );
                return [];
            }

            // Convert HTML fragments to discovery agent format
            const discoveryFragments =
                this.convertToDiscoveryFormat(htmlFragments);

            if (discoveryFragments.length === 0) {
                console.warn("No valid HTML fragments for action detection");
                return [];
            }

            // Execute three-phase detection process
            const unifiedActions = await this.executeThreePhaseDetection(
                this.discoveryAgent,
                discoveryFragments,
                screenshots,
            );

            // Convert unified actions to knowledge extraction format
            return this.convertToKnowledgeFormat(unifiedActions);
        } catch (error) {
            console.error("Error in action detection:", error);
            // Graceful degradation - don't fail knowledge extraction
            return [];
        }
    }

    /**
     * Initialize the discovery agent on first use.
     *
     * Initialization is attempted at most once: on failure the agent stays
     * null and later calls return immediately without retrying.
     */
    private async ensureDiscoveryAgent(): Promise<void> {
        if (this.isInitialized) {
            return;
        }

        try {
            // Use GPT_4_O for consistency with discovery agent
            this.discoveryAgent =
                await createDiscoveryPageTranslator("GPT_4_O");
            this.isInitialized = true;
            console.log(
                "Discovery agent initialized successfully for action detection",
            );
        } catch (error) {
            console.warn("Failed to initialize discovery agent:", error);
            this.discoveryAgent = null;
            this.isInitialized = true; // Mark as attempted to avoid retries
        }
    }

    /**
     * Convert knowledge extraction HTML fragments to discovery agent format.
     *
     * Fragments that are falsy or carry neither `content` nor `text` are
     * dropped. A missing or empty frame id falls back to the fragment's
     * position among the kept fragments.
     */
    private convertToDiscoveryFormat(htmlFragments: any[]): HtmlFragments[] {
        return htmlFragments
            .filter(
                (fragment) => fragment && (fragment.content || fragment.text),
            )
            .map((fragment, index) => ({
                frameId: fragment.frameId?.toString() || index.toString(),
                content: fragment.content || "",
                text: fragment.text || "",
                cssSelector: fragment.cssSelector,
            }));
    }

    /**
     * Run `run` between console.time(label) and console.timeEnd(label).
     *
     * The timer is closed in a `finally` so a rejected phase does not leave
     * the label open, which would make the next console.time call with the
     * same label emit an "already exists" warning.
     */
    private async timed<T>(
        label: string,
        run: () => Promise<T>,
    ): Promise<T> {
        console.time(label);
        try {
            return await run();
        } finally {
            console.timeEnd(label);
        }
    }

    /**
     * Execute the three-phase action detection process.
     *
     * @returns The unified actions, or null when any phase reports failure
     *          or throws. Every console timer started here is closed on
     *          every exit path.
     */
    private async executeThreePhaseDetection(
        agent: any,
        htmlFragments: HtmlFragments[],
        screenshots?: string[],
    ): Promise<UnifiedActionsList | null> {
        const overallLabel = "Three-phase action detection";
        console.time(overallLabel);
        try {
            // Phase 1: Get page summary with possible actions
            const pageSummaryResponse = await this.timed<DiscoveryResponse>(
                "Phase 1: Page Summary",
                () =>
                    agent.getPageSummary(
                        undefined, // userRequest
                        htmlFragments,
                        screenshots,
                    ),
            );

            if (!pageSummaryResponse.success) {
                console.warn(
                    "Page summary failed:",
                    pageSummaryResponse.message,
                );
                return null;
            }

            const pageDescription = pageSummaryResponse.data as PageDescription;
            console.log(
                `Phase 1 complete: Found ${pageDescription.possibleUserAction?.length || 0} possible actions`,
            );

            // Phase 2: Get candidate actions from schemas
            const candidateActionsResponse =
                await this.timed<DiscoveryResponse>(
                    "Phase 2: Candidate Actions",
                    () =>
                        agent.getCandidateUserActions(
                            undefined, // userRequest
                            htmlFragments,
                            screenshots,
                            JSON.stringify(pageDescription), // Pass page summary as context
                        ),
                );

            if (!candidateActionsResponse.success) {
                console.warn(
                    "Candidate actions failed:",
                    candidateActionsResponse.message,
                );
                return null;
            }

            const candidateActions =
                candidateActionsResponse.data as UserActionsList;
            console.log(
                `Phase 2 complete: Found ${candidateActions.actions?.length || 0} candidate actions`,
            );

            // Phase 3: Unify and deduplicate actions

            // TODO: For now, only pass in the known actions list in the de-dupe step. The general possible
            // actions list is currently too general. We'll need to narrow it down before it is useful
            const unifiedActionsResponse = await this.timed<DiscoveryResponse>(
                "Phase 3: Unified Actions",
                () =>
                    agent.unifyUserActions(
                        candidateActions,
                        undefined,
                        htmlFragments,
                        screenshots,
                    ),
            );

            if (!unifiedActionsResponse.success) {
                console.warn(
                    "Action unification failed:",
                    unifiedActionsResponse.message,
                );
                return null;
            }

            const unifiedActions =
                unifiedActionsResponse.data as UnifiedActionsList;
            console.log(
                `Phase 3 complete: Unified ${unifiedActions.finalCount} actions from ${unifiedActions.originalCount} total`,
            );

            return unifiedActions;
        } catch (error) {
            console.error("Error in three-phase action detection:", error);
            return null;
        } finally {
            console.timeEnd(overallLabel);
        }
    }

    /**
     * Convert unified actions to knowledge extraction DetectedAction format.
     *
     * Empty verb / direct object fall back to "action" / "element", and the
     * fallback text is built from those resolved values. A confidence of 0
     * is preserved; only a missing confidence defaults to 0.8.
     */
    private convertToKnowledgeFormat(
        unifiedActions: UnifiedActionsList | null,
    ): DetectedAction[] {
        if (!unifiedActions || !unifiedActions.actions) {
            return [];
        }

        return unifiedActions.actions.map((action) => {
            const type = action.verb || "action";
            const element = action.directObject || "element";
            return {
                type,
                element,
                text: action.shortDescription || `${type} ${element}`,
                // `??` rather than `||`: 0 is a valid score and must not become 0.8
                confidence: action.confidence ?? 0.8,
            };
        });
    }

    /**
     * Check if action detection is available, i.e. initialization has been
     * attempted and produced an agent. This is false until the first
     * detectActions call in a supported mode has triggered initialization.
     */
    isActionDetectionAvailable(): boolean {
        return this.isInitialized && this.discoveryAgent !== null;
    }

    /**
     * Get summary of action detection capabilities.
     *
     * @returns A fresh object on each call, so callers may modify it freely.
     */
    getCapabilities() {
        return {
            available: this.isActionDetectionAvailable(),
            supportedModes: [...ActionDetectionAdapter.supportedModes],
            phases: [
                "Page Summary Analysis",
                "Candidate Action Detection",
                "Unified Action Deduplication",
            ],
            aiModelRequired: true,
        };
    }
}

/** Minimal shape of the discovery agent responses that the adapter reads. */
type DiscoveryResponse = {
    success: boolean;
    message?: string;
    data?: unknown;
};

0 commit comments

Comments
 (0)