-
Notifications
You must be signed in to change notification settings - Fork 1
/
pediy.py
29 lines (23 loc) · 1.1 KB
/
pediy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from lxml import etree
import constant
# Kanxue (pediy) forum, new-thread listing: https://bbs.pediy.com/new-tid.htm
class Pediy(object):
    """Scraper for the new-thread listing pages of https://bbs.pediy.com/."""

    _SITE_ROOT = 'https://bbs.pediy.com/'
    _LISTING_URL = 'https://bbs.pediy.com/new-tid-%s.htm'

    @staticmethod
    def _fetch_tree(http, url):
        """GET *url* via *http* and return the UTF-8 decoded body parsed by ``etree.HTML``."""
        body = http.request('GET', url, headers=constant.headers).data.decode('utf-8')
        return etree.HTML(body)

    @staticmethod
    def _first_text(node, path):
        """Return ``.text`` of the first element matching *path* from *node*, or None."""
        matches = node.xpath(path)
        return matches[0].text if matches else None

    def parse(self, page, http):
        """Collect the threads listed on one page, including each thread's body text.

        Args:
            page: Value substituted into the listing URL
                (``new-tid-<page>.htm``).
            http: Client exposing ``request(method, url, headers=...)`` whose
                result has a bytes ``data`` attribute (presumably a
                ``urllib3.PoolManager`` -- confirm against the caller).

        Returns:
            A list of dicts with the keys ``title``, ``href``, ``author``,
            ``create_time`` and ``content``, one per listed thread, in page
            order. ``author`` / ``create_time`` are None and ``content`` is
            ``''`` when the corresponding element is not found.
        """
        listing = self._fetch_tree(http, self._LISTING_URL % page)
        if listing is None:
            # Nothing parsable came back, so there are no rows to report.
            return []

        threads = []
        for link in listing.xpath('//td/div[1]/a[2]'):
            href = link.get('href')
            if href is None:
                # Without a link there is no thread page to fetch.
                continue

            # The title link sits at td/div[1]/a[2]; climb back to its own
            # <td> so the metadata comes from this row. A path starting with
            # '//' would search the whole document and always return the
            # first row's author and time.
            author = self._first_text(link, '../../div[2]/div[1]/a')
            create_time = self._first_text(link, '../../div[2]/div[1]/span')

            thread = self._fetch_tree(http, self._SITE_ROOT + href)
            # The trailing space in the class value is part of the match.
            bodies = thread.xpath('//*[@class="message "]') if thread is not None else []
            content = bodies[0].xpath('string()').strip() if bodies else ''

            threads.append({'title': link.text, 'href': href, 'author': author,
                            'create_time': create_time, 'content': content})
        return threads