|
2 | 2 | assignTransferProtocolMeta, marshalErrorLike,
|
3 | 3 | RPCHost, RPCReflection,
|
4 | 4 | HashManager,
|
5 |
| - AssertionFailureError, ParamValidationError, |
| 5 | + AssertionFailureError, ParamValidationError, Defer, |
6 | 6 | } from 'civkit';
|
7 | 7 | import { singleton } from 'tsyringe';
|
8 | 8 | import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
@@ -34,6 +34,12 @@ export class CrawlerHost extends RPCHost {
|
34 | 34 | cacheValidMs = 1000 * 300;
|
35 | 35 | urlValidMs = 1000 * 3600 * 4;
|
36 | 36 |
|
| 37 | + indexText = `[Usage1] https://r.jina.ai/YOUR_URL |
| 38 | +[Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY |
| 39 | +[Homepage] https://jina.ai/reader |
| 40 | +[Source code] https://github.com/jina-ai/reader |
| 41 | +`; |
| 42 | + |
37 | 43 | constructor(
|
38 | 44 | protected globalLogger: Logger,
|
39 | 45 | protected puppeteerControl: PuppeteerControl,
|
@@ -357,10 +363,7 @@ ${this.content}
|
357 | 363 | [Balance left] ${latestUser.wallet.total_balance}
|
358 | 364 | ` : '';
|
359 | 365 |
|
360 |
| - return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL |
361 |
| -[Homepage] https://jina.ai/reader |
362 |
| -[Source code] https://github.com/jina-ai/reader |
363 |
| -${authMixin}`, |
| 366 | + return assignTransferProtocolMeta(`${this.indexText}${authMixin}`, |
364 | 367 | { contentType: 'text/plain', envelope: null }
|
365 | 368 | );
|
366 | 369 | }
|
@@ -638,13 +641,13 @@ ${authMixin}`,
|
638 | 641 | return r;
|
639 | 642 | }
|
640 | 643 |
|
641 |
| - async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) { |
| 644 | + async *cachedScrap(urlToCrawl: URL, crawlOpts?: ScrappingOptions, noCache: boolean = false) { |
642 | 645 | let cache;
|
643 |
| - if (!noCache && !crawlOpts.cookies?.length) { |
| 646 | + if (!noCache && !crawlOpts?.cookies?.length) { |
644 | 647 | cache = await this.queryCache(urlToCrawl);
|
645 | 648 | }
|
646 | 649 |
|
647 |
| - if (cache?.isFresh && (!crawlOpts.favorScreenshot || (crawlOpts.favorScreenshot && cache?.screenshotAvailable))) { |
| 650 | + if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) { |
648 | 651 | yield cache.snapshot;
|
649 | 652 |
|
650 | 653 | return;
|
@@ -683,4 +686,47 @@ ${authMixin}`,
|
683 | 686 | return undefined;
|
684 | 687 | }
|
685 | 688 |
|
| 689 | + |
/**
 * Scrapes several URLs concurrently and streams their combined progress.
 *
 * One `cachedScrap` iterator is started per URL. The generator repeatedly
 * yields the SAME `results` array (mutated in place), where `results[i]` is
 * the latest value produced for `urls[i]`, or `undefined` if nothing has
 * been produced yet. It yields once immediately, then once after every
 * batch of truthy snapshots, and finishes when every iterator is done.
 *
 * A failure of one URL does not abort the others: the failing slot simply
 * keeps its last value. When the consumer stops early (`break`/`return`),
 * all underlying iterators are asked to finish.
 *
 * @param urls - Pages to scrape; the result array follows this order.
 * @param options - Passed through unchanged to `cachedScrap`.
 * @param noCache - Passed through unchanged to `cachedScrap`.
 * @returns An async generator yielding the shared, live `results` array.
 */
async *scrapMany(urls: URL[], options?: ScrappingOptions, noCache = false) {
    const iterators = urls.map((url) => this.cachedScrap(url, options, noCache));

    const results: (PageSnapshot | undefined)[] = iterators.map(() => undefined);

    let nextDeferred = Defer();
    let concluded = false;
    // True when `results` changed since the last yield. Without this flag an
    // update that lands while the generator is suspended at `yield` (nobody
    // is awaiting the deferred at that moment) would be delayed, or dropped
    // entirely if it was the final one.
    let dirty = false;

    const handler = async (it: AsyncGenerator<PageSnapshot | undefined>, idx: number) => {
        try {
            for await (const x of it) {
                results[idx] = x;

                if (x) {
                    dirty = true;
                    // Wake the consumer, then arm a fresh deferred for the next update.
                    nextDeferred.resolve();
                    nextDeferred = Defer();
                }
            }
        } catch {
            // Isolate per-URL failures: rethrowing would reject the Promise.all
            // below, which has no rejection handler (unhandled rejection) and
            // would cut the stream short for the healthy URLs.
            // NOTE(review): the error is dropped here; consider reporting it
            // through the class logger.
        }
    };

    // Handlers never reject, so this chain cannot produce an unhandled rejection.
    void Promise.all(
        iterators.map((it, idx) => handler(it, idx))
    ).finally(() => {
        concluded = true;
        nextDeferred.resolve();
    });

    yield results;

    try {
        for (;;) {
            if (dirty) {
                // Clear before yielding so updates arriving during the yield re-arm it.
                dirty = false;
                yield results;
                continue;
            }

            if (concluded) {
                break;
            }

            // No await between the checks above and this line, so a wake-up
            // cannot slip through unobserved.
            await nextDeferred.promise;
        }
    } finally {
        for (const x of iterators) {
            // Best-effort cleanup; not awaited so the consumer is not blocked
            // on in-flight scrapes, but guarded so it cannot surface as an
            // unhandled rejection.
            x.return().catch(() => undefined);
        }
    }
}
| 731 | + |
686 | 732 | }
|
0 commit comments