-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspider.js
83 lines (70 loc) · 2.55 KB
/
spider.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
const cheerio = require('cheerio');
const https = require('https');
const iconv = require('iconv-lite');
const Post = require('./database/post')
/**
 * Crawl a V2EX member's reply listing and collect the topics that the member
 * created themselves.
 *
 * Pages `https://www.v2ex.com/member/<name>/replies?p=N` are fetched strictly
 * one after another. The total page count is read from the header of the first
 * page; when that header carries no page count the crawl stops after page 1.
 * Every listing row whose first link text equals `name` contributes one entry,
 * de-duplicated by topic URL.
 *
 * On success the collected entries are handed to `cb` and then stored through
 * the `Post` model. If a request fails (network error or non-200 status) the
 * crawl stops, `cb` still receives whatever was collected so far, and nothing
 * is stored, so incomplete data never reaches the database.
 *
 * @param {string} name - Member name. When falsy the function returns
 *   immediately and `cb` is NOT invoked.
 * @param {function(Array<{node: string, nodeUrl: string, post: string, postUrl: string}>): void} cb
 *   Receives the collected entries exactly once per crawl.
 * @returns {void}
 */
const init = (name, cb) => {
if (!name) {
return;
}

const user = name;
// Tolerate a missing callback instead of throwing at the end of the crawl.
const callback = typeof cb === 'function' ? cb : () => {};
const posts = []; // topics created by the user
const seenPostUrls = new Set(); // topic URLs already stored in `posts`
let pages = 1; // total number of reply pages, refined after page 1 is parsed
let current = 1; // page about to be requested
let settled = false; // guards against delivering the result twice

// The name ends up in the URL path, so it must be percent-encoded.
const pageUrl = (page) =>
`https://www.v2ex.com/member/${encodeURIComponent(user)}/replies?p=${page}`;

// Hand the result to the caller; persist it only when the crawl completed.
const finish = (err) => {
if (settled) {
return;
}
settled = true;
callback(posts);
if (err) {
console.log('抓取数据出错!', err);
return;
}
new Post({
user,
posts,
}).save((saveErr) => {
saveErr ? console.log('保存数据出错!', saveErr) : console.log('保存数据成功!');
});
};

// Read the total page count from the listing header; default to a single page.
const readPageCount = ($) => {
const found = $('#Main > div.box > div.header').text().match(/页 \/ 共 (\d*?) 页/);
const count = found ? Number.parseInt(found[1], 10) : Number.NaN;
return Number.isNaN(count) || count < 1 ? 1 : count;
};

// Append every not-yet-seen topic of `user` found on the parsed page.
const collectPosts = ($) => {
$('#Main > div.box .dock_area span.gray').each((index, item) => {
const row = $(item);
// Only rows whose first link is the user describe topics they created.
if (row.find('a:nth-child(1)').text() !== user) {
return;
}
const nodeLink = row.find('a:nth-child(3)');
const postLink = row.find('a:nth-child(5)');
const postUrl = postLink.attr('href');
if (seenPostUrls.has(postUrl)) {
return;
}
seenPostUrls.add(postUrl);
posts.push({
node: nodeLink.text(),
nodeUrl: nodeLink.attr('href'),
post: postLink.text(),
postUrl,
});
});
};

// Fetch the current page, then schedule the next one until all pages are done.
const start = () => {
if (current > pages) {
finish();
return;
}
https.get(pageUrl(current), (sres) => {
if (sres.statusCode !== 200) {
// Drain the body so the socket is released, then give up.
sres.resume();
finish(new Error(`unexpected HTTP status ${sres.statusCode}`));
return;
}
const chunks = [];
sres.on('data', (chunk) => {
chunks.push(chunk);
});
sres.on('error', finish);
sres.on('end', () => {
if (settled) {
return;
}
const html = iconv.decode(Buffer.concat(chunks), 'utf-8');
const $ = cheerio.load(html, {
decodeEntities: false,
});
if (current === 1) {
pages = readPageCount($);
}
current++;
collectPosts($);
setTimeout(start, 0);
});
}).on('error', finish);
};

start();
}
module.exports = init;