-
Notifications
You must be signed in to change notification settings - Fork 58
/
Copy pathscrape-infinite-scroll.js
58 lines (51 loc) · 1.92 KB
/
scrape-infinite-scroll.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
const fs = require('fs');
const puppeteer = require('puppeteer');
/**
 * Collects the visible text of every box element currently in the document.
 *
 * This function is meant to be injected into the page, so it relies only on
 * DOM globals available in the browser context.
 *
 * @returns {string[]} The `innerText` of each `#boxes > div.box` element, in
 *   the order returned by `querySelectorAll`.
 */
function extractItems() {
  const boxes = document.querySelectorAll('#boxes > div.box');
  return Array.from(boxes, (box) => box.innerText);
}
/**
 * Scrolls an infinite-scroll page and extracts content from it until enough
 * items have been collected or the page stops growing.
 *
 * Scraping is best-effort: if any step fails (for example the page height
 * never increases and `waitForFunction` times out), a warning is logged and
 * the items gathered so far are returned instead of throwing.
 *
 * @param {object} page - A loaded Puppeteer Page instance.
 * @param {function} extractItems - Item extraction function that is injected into the page.
 * @param {number} itemTargetCount - The target number of items to extract before stopping.
 * @param {number} scrollDelay - The time (in milliseconds) to wait between scrolls.
 * @returns {Promise<Array>} The items returned by the most recent successful extraction.
 */
async function scrapeInfiniteScrollItems(page, extractItems, itemTargetCount, scrollDelay = 1000) {
  let items = [];
  try {
    while (items.length < itemTargetCount) {
      items = await page.evaluate(extractItems);
      // Stop as soon as the target is met; scrolling again would only wait
      // for content that is not needed (and may never arrive).
      if (items.length >= itemTargetCount) {
        break;
      }
      const previousHeight = await page.evaluate('document.body.scrollHeight');
      await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
      await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
      // Plain timer instead of page.waitFor(), which has been removed from Puppeteer.
      await new Promise((resolve) => {
        setTimeout(resolve, scrollDelay);
      });
    }
  } catch (error) {
    // Deliberately non-fatal: report the reason and fall through with partial results.
    const reason = error instanceof Error ? error.message : String(error);
    console.warn(`Stopped scrolling after ${items.length} item(s): ${reason}`);
  }
  return items;
}
// Script entry point: opens the demo page, scrapes items from it, and writes
// them to ./items.txt (one item per line).
(async () => {
  // Set up browser and page.
  const browser = await puppeteer.launch({
    headless: false,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });
  try {
    const page = await browser.newPage();
    // setViewport returns a promise; await it so the size is applied before navigating.
    await page.setViewport({ width: 1280, height: 926 });
    // Navigate to the demo page.
    await page.goto('https://intoli.com/blog/scrape-infinite-scroll/demo.html');
    // Scroll and extract items from the page.
    const items = await scrapeInfiniteScrollItems(page, extractItems, 100);
    // Save extracted items to a file.
    fs.writeFileSync('./items.txt', `${items.join('\n')}\n`);
  } finally {
    // Close the browser even when navigation, scraping, or the write fails.
    await browser.close();
  }
})().catch((error) => {
  // Report the failure and signal it through the exit code instead of
  // leaving an unhandled promise rejection.
  console.error(error);
  process.exitCode = 1;
});