Skip to content

Commit acd376c

Browse files
WebMemories: filter html page to the core readable view before running entity and topic extraction (#1339)
1 parent: 330c2dd · commit: acd376c

File tree

11 files changed

+586
-215
lines changed

11 files changed

+586
-215
lines changed

ts/packages/agents/browser/src/agent/knowledge/knowledgeHandler.mts

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,25 @@ interface AnalyticsDataResponse {
5252
totalActions: number;
5353
totalRelationships: number;
5454
recentItems?: any[];
55+
recentEntities?: Array<{
56+
name: string;
57+
type: string;
58+
fromPage: string;
59+
extractedAt: string;
60+
}>;
61+
recentTopics?: Array<{
62+
name: string;
63+
fromPage: string;
64+
extractedAt: string;
65+
}>;
66+
recentActions?: Array<{
67+
type: string;
68+
element: string;
69+
text?: string;
70+
confidence: number;
71+
fromPage: string;
72+
extractedAt: string;
73+
}>;
5574
};
5675
domains: {
5776
topDomains: Array<{
@@ -487,11 +506,33 @@ export async function indexWebPageContent(
487506
: [entity.type], // Ensure type is array
488507
})),
489508
topics: aggregatedResults.keyTopics,
490-
actions: [], // Actions would need to be extracted separately if needed
509+
actions: aggregatedResults.contentActions || [], // Use actual content actions
491510
inverseActions: [], // Required property
492511
};
493512
}
494513

514+
// Store detectedActions and actionSummary in metadata for retrieval
515+
if (
516+
aggregatedResults &&
517+
(aggregatedResults.detectedActions ||
518+
aggregatedResults.actionSummary)
519+
) {
520+
websiteObj.metadata = websiteObj.metadata || {};
521+
522+
if (
523+
aggregatedResults.detectedActions &&
524+
aggregatedResults.detectedActions.length > 0
525+
) {
526+
websiteObj.metadata.detectedActions =
527+
aggregatedResults.detectedActions;
528+
}
529+
530+
if (aggregatedResults.actionSummary) {
531+
websiteObj.metadata.actionSummary =
532+
aggregatedResults.actionSummary;
533+
}
534+
}
535+
495536
if (context.agentContext.websiteCollection) {
496537
if (parameters.extractKnowledge) {
497538
try {
@@ -1695,11 +1736,23 @@ export async function getPageIndexedKnowledge(
16951736
}
16961737

16971738
let detectedActions: any[] = [];
1739+
1740+
// Check websiteObj metadata for detectedActions first (with safe property access)
1741+
if (
1742+
foundWebsite.metadata &&
1743+
(foundWebsite.metadata as any).detectedActions &&
1744+
Array.isArray((foundWebsite.metadata as any).detectedActions)
1745+
) {
1746+
detectedActions = (foundWebsite.metadata as any)
1747+
.detectedActions;
1748+
}
1749+
1750+
// Also check knowledge object for detectedActions (fallback)
16981751
if (
16991752
(knowledge as any).detectedActions &&
17001753
Array.isArray((knowledge as any).detectedActions)
17011754
) {
1702-
detectedActions.push(...(knowledge as any).detectedAction);
1755+
detectedActions.push(...(knowledge as any).detectedActions);
17031756
}
17041757

17051758
// Convert the stored knowledge to the expected format
@@ -1762,6 +1815,10 @@ export async function getPageIndexedKnowledge(
17621815
relationships,
17631816
keyTopics,
17641817
detectedActions,
1818+
contentActions: knowledge.actions || [],
1819+
actionSummary: foundWebsite.metadata
1820+
? (foundWebsite.metadata as any).actionSummary
1821+
: undefined,
17651822
suggestedQuestions,
17661823
summary,
17671824
contentMetrics,
@@ -2503,6 +2560,7 @@ export async function getAnalyticsData(
25032560
topDomains,
25042561
activityTrends,
25052562
extractionAnalytics,
2563+
recentKnowledgeItems,
25062564
] = await Promise.all([
25072565
getDetailedKnowledgeStats(
25082566
{
@@ -2531,6 +2589,7 @@ export async function getAnalyticsData(
25312589
},
25322590
context,
25332591
),
2592+
getRecentKnowledgeItems({ limit: 10, type: "all" }, context),
25342593
]);
25352594

25362595
// Get basic website statistics from websiteCollection
@@ -2578,6 +2637,9 @@ export async function getAnalyticsData(
25782637
totalActions: 0, // Actions not tracked in current schema
25792638
totalRelationships: knowledgeStats.totalRelationships || 0,
25802639
recentItems: knowledgeStats.recentActivity || [],
2640+
recentEntities: recentKnowledgeItems.entities || [],
2641+
recentTopics: recentKnowledgeItems.topics || [],
2642+
recentActions: recentKnowledgeItems.actions || [],
25812643
},
25822644
domains: {
25832645
topDomains: topDomains.domains || [],

ts/packages/agents/browser/src/extension/contentScript/eventHandlers.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ export async function handleMessage(
212212
message.inputHtml,
213213
message.frameId,
214214
message.useTimestampIds,
215+
message.filterToReadingView,
216+
message.keepMetaTags,
215217
);
216218
sendResponse(html);
217219
break;

ts/packages/agents/browser/src/extension/contentScript/htmlProcessing.ts

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,25 @@ import { HTMLReducer } from "./htmlReducer";
55
import DOMPurify from "dompurify";
66
import { setIdsOnAllElements, markInvisibleNodesForCleanup } from "./domUtils";
77
import { HtmlFragment } from "./types";
8+
import { Readability, isProbablyReaderable } from "@mozilla/readability";
89

910
/**
1011
* Gets the HTML of the page
1112
* @param fullSize Whether to get the full HTML
1213
* @param documentHtml The HTML to process
1314
* @param frameId The frame ID
1415
* @param useTimestampIds Whether to use timestamp IDs
16+
* @param filterToReadingView Whether to apply readability filter
17+
* @param keepMetaTags Whether to preserve meta tags when using readability
1518
* @returns The processed HTML
1619
*/
1720
export function getPageHTML(
1821
fullSize?: boolean,
1922
documentHtml?: string,
2023
frameId?: number,
2124
useTimestampIds?: boolean,
25+
filterToReadingView?: boolean,
26+
keepMetaTags?: boolean,
2227
): string {
2328
if (!documentHtml) {
2429
if (frameId !== undefined) {
@@ -28,12 +33,23 @@ export function getPageHTML(
2833
documentHtml = document.children[0].outerHTML;
2934
}
3035

36+
// Apply Readability filter if requested
37+
if (filterToReadingView) {
38+
documentHtml = applyReadabilityFilter(documentHtml, keepMetaTags);
39+
}
40+
3141
if (fullSize) {
3242
return documentHtml;
3343
}
3444

3545
const reducer = new HTMLReducer();
3646
reducer.removeDivs = false;
47+
48+
// Preserve meta tags if requested and readability was used
49+
if (filterToReadingView && keepMetaTags) {
50+
reducer.removeMetaTags = false;
51+
}
52+
3753
const reducedHtml = reducer.reduce(documentHtml);
3854
return reducedHtml;
3955
}
@@ -137,3 +153,59 @@ export function getPageHTMLFragments(
137153

138154
return htmlFragments;
139155
}
156+
157+
/**
158+
* Apply Readability filter to extract main content
159+
* @param html The HTML to process
160+
* @param keepMetaTags Whether to preserve meta tags
161+
* @returns The processed HTML with main content extracted
162+
*/
163+
function applyReadabilityFilter(html: string, keepMetaTags?: boolean): string {
164+
try {
165+
// Parse the HTML
166+
const domParser = new DOMParser();
167+
const doc = domParser.parseFromString(html, "text/html");
168+
169+
// Check if readability can process this document
170+
if (!isProbablyReaderable(doc)) {
171+
console.warn(
172+
"Document is not probably readerable, skipping Readability filter",
173+
);
174+
return html;
175+
}
176+
177+
// Clone document to avoid modifying original
178+
const documentClone = doc.cloneNode(true) as Document;
179+
180+
// Extract meta tags before applying Readability if we want to preserve them
181+
let metaTags = "";
182+
if (keepMetaTags) {
183+
const headClone = documentClone.head?.cloneNode(
184+
true,
185+
) as HTMLHeadElement;
186+
if (headClone) {
187+
const metaElements = headClone.querySelectorAll("meta, title");
188+
metaTags = Array.from(metaElements)
189+
.map((el) => el.outerHTML)
190+
.join("\n");
191+
}
192+
}
193+
194+
// Apply Readability
195+
const article = new Readability(documentClone).parse();
196+
197+
if (article?.content) {
198+
// Construct new HTML with main content
199+
let resultHtml = `<html><head>${metaTags}</head><body>${article.content}</body></html>`;
200+
return resultHtml;
201+
} else {
202+
console.warn(
203+
"Readability failed to extract content, falling back to original HTML",
204+
);
205+
return html;
206+
}
207+
} catch (error) {
208+
console.error("Error applying Readability filter:", error);
209+
return html; // Fallback to original HTML
210+
}
211+
}

ts/packages/agents/browser/src/extension/serviceWorker/browserActions.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ export async function runBrowserAction(action: AppAction): Promise<any> {
181181
action.parameters?.downloadAsFile,
182182
action.parameters?.extractText,
183183
action.parameters?.useTimestampIds,
184+
action.parameters?.filterToReadingView,
185+
action.parameters?.keepMetaTags,
184186
);
185187
break;
186188
}

ts/packages/agents/browser/src/extension/serviceWorker/capture.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import {
1515
* @param downloadAsFile Whether to download the HTML as a file
1616
* @param extractText Whether to extract text from the HTML
1717
* @param useTimestampIds Whether to use timestamp IDs
18+
* @param filterToReadingView Whether to apply readability filter
19+
* @param keepMetaTags Whether to preserve meta tags when using readability
1820
* @returns Promise resolving to an array of HTML fragments
1921
*/
2022
export async function getTabHTMLFragments(
@@ -23,6 +25,8 @@ export async function getTabHTMLFragments(
2325
downloadAsFile?: boolean,
2426
extractText?: boolean,
2527
useTimestampIds?: boolean,
28+
filterToReadingView?: boolean,
29+
keepMetaTags?: boolean,
2630
): Promise<HTMLFragment[]> {
2731
const frames = await chrome.webNavigation.getAllFrames({
2832
tabId: targetTab.id!,
@@ -41,6 +45,8 @@ export async function getTabHTMLFragments(
4145
fullSize: fullSize,
4246
frameId: frames[i].frameId,
4347
useTimestampIds: useTimestampIds,
48+
filterToReadingView: filterToReadingView,
49+
keepMetaTags: keepMetaTags,
4450
},
4551
{ frameId: frames[i].frameId },
4652
);

0 commit comments

Comments
 (0)