Skip to content

Commit 2e3c217

Browse files
authored
feat: web search (#57)
1 parent f171e54 commit 2e3c217

11 files changed

+748
-15
lines changed

backend/.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -75,4 +75,5 @@ build/
7575
.DS_Store
7676

7777
*.local
78-
.secret.*
78+
.secret.*
79+
licensed/

backend/functions/integrity-check.cjs

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/env node
2+
3+
// Pre-build guard: abort with a non-zero exit code when the GeoLite2 city
// database is not present under ./licensed next to this script.
const { existsSync } = require('fs');
const { resolve } = require('path');

// Absolute location of the database, resolved relative to this script's directory.
const geoDatabasePath = resolve(__dirname, 'licensed/GeoLite2-City.mmdb');
const databasePresent = existsSync(geoDatabasePath);

if (databasePresent === false) {
    console.error(`Integrity check failed: ${geoDatabasePath} does not exist.`);
    // Exit status 1 makes any `&&`-chained command after this script not run.
    process.exit(1);
}

backend/functions/package-lock.json

+31
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/functions/package.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"name": "reader",
33
"scripts": {
44
"lint": "eslint --ext .js,.ts .",
5-
"build": "tsc -p .",
5+
"build": "node ./integrity-check.cjs && tsc -p .",
66
"build:watch": "tsc --watch",
77
"build:clean": "rm -rf ./build",
88
"shell": "npm run build && firebase functions:shell",
@@ -44,6 +44,7 @@
4444
"htmlparser2": "^9.0.0",
4545
"jose": "^5.1.0",
4646
"langdetect": "^0.2.1",
47+
"maxmind": "^4.3.18",
4748
"minio": "^7.1.3",
4849
"openai": "^4.20.0",
4950
"puppeteer": "^22.7.1",

backend/functions/src/cloud-functions/crawler.ts

+54-8
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import {
22
assignTransferProtocolMeta, marshalErrorLike,
33
RPCHost, RPCReflection,
44
HashManager,
5-
AssertionFailureError, ParamValidationError,
5+
AssertionFailureError, ParamValidationError, Defer,
66
} from 'civkit';
77
import { singleton } from 'tsyringe';
88
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
@@ -34,6 +34,12 @@ export class CrawlerHost extends RPCHost {
3434
cacheValidMs = 1000 * 300;
3535
urlValidMs = 1000 * 3600 * 4;
3636

37+
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
38+
[Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY
39+
[Homepage] https://jina.ai/reader
40+
[Source code] https://github.com/jina-ai/reader
41+
`;
42+
3743
constructor(
3844
protected globalLogger: Logger,
3945
protected puppeteerControl: PuppeteerControl,
@@ -357,10 +363,7 @@ ${this.content}
357363
[Balance left] ${latestUser.wallet.total_balance}
358364
` : '';
359365

360-
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
361-
[Homepage] https://jina.ai/reader
362-
[Source code] https://github.com/jina-ai/reader
363-
${authMixin}`,
366+
return assignTransferProtocolMeta(`${this.indexText}${authMixin}`,
364367
{ contentType: 'text/plain', envelope: null }
365368
);
366369
}
@@ -638,13 +641,13 @@ ${authMixin}`,
638641
return r;
639642
}
640643

641-
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
644+
async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) {
642645
let cache;
643-
if (!noCache && !crawlOpts.cookies?.length) {
646+
if (!noCache && !crawlOpts?.cookies?.length) {
644647
cache = await this.queryCache(urlToCrawl);
645648
}
646649

647-
if (cache?.isFresh && (!crawlOpts.favorScreenshot || (crawlOpts.favorScreenshot && cache?.screenshotAvailable))) {
650+
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
648651
yield cache.snapshot;
649652

650653
return;
@@ -683,4 +686,47 @@ ${authMixin}`,
683686
return undefined;
684687
}
685688

689+
690+
/**
 * Scrapes several URLs concurrently and streams their combined progress.
 *
 * One `cachedScrap` iterator is started per URL. The generator yields a single
 * array, index-aligned with `urls`, whose slots hold the latest value emitted
 * by the matching iterator (`undefined` until something arrives). The SAME
 * array instance is yielded every time and is mutated in place between yields,
 * so consumers that need a stable view must copy it.
 *
 * The array is yielded once immediately, then again whenever a truthy snapshot
 * has been stored since the previous yield, and once more when every iterator
 * has finished while the generator was waiting.
 *
 * A failure while scraping one URL is logged and contained: its slot keeps the
 * last value it received and the remaining URLs continue to be scraped.
 *
 * @param urls - Pages to scrape; result slots follow this order.
 * @param options - Forwarded unchanged to every `cachedScrap` call.
 * @param noCache - Forwarded unchanged to every `cachedScrap` call.
 * @returns An async generator yielding the shared, in-place-updated results array.
 */
async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
    const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));

    const results: (PageSnapshot | undefined)[] = iterators.map((_x) => undefined);

    // Wake-up signal for the yielding loop; replaced with a fresh one after every use.
    let nextDeferred = Defer();
    // Set once every per-URL handler has finished.
    let concluded = false;
    // True when `results` gained a snapshot that has not been yielded yet. Without it,
    // changes arriving while the consumer holds the generator suspended would be missed.
    let dirty = false;

    const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
        try {
            for await (const x of it) {
                results[idx] = x;

                // Only truthy snapshots are worth waking the consumer for.
                if (x) {
                    dirty = true;
                    nextDeferred.resolve();
                    nextDeferred = Defer();
                }
            }
        } catch (err: unknown) {
            // Contain the failure here: letting it escape would reject the un-awaited
            // Promise.all below (an unhandled rejection) and end the whole batch early.
            const reason = err instanceof Error ? (err.stack ?? err.message) : String(err);
            // NOTE(review): assumes Logger exposes warn(message) — confirm against ../shared.
            this.globalLogger.warn(`Failed to scrap ${urls[idx]}: ${reason}`);
        }
    };

    // Deliberately not awaited: completion is observed through `concluded`.
    void Promise.all(
        iterators.map((it, idx) => handler(it, idx))
    ).finally(() => {
        concluded = true;
        nextDeferred.resolve();
    });

    yield results;

    try {
        while (true) {
            if (!dirty) {
                if (concluded) {
                    break;
                }

                await nextDeferred.promise;
            }

            // Clear before yielding so updates made while suspended are detected on resume.
            dirty = false;

            yield results;
        }
    } finally {
        // Runs on normal completion and when the consumer stops early: ask every
        // underlying iterator to finish. Cleanup is best-effort, so a rejection
        // from return() is dropped instead of becoming an unhandled rejection.
        for (const x of iterators) {
            x.return().catch(() => undefined);
        }
    }
}
731+
686732
}

0 commit comments

Comments
 (0)