-
Notifications
You must be signed in to change notification settings - Fork 1
/
pojie52.py
30 lines (24 loc) · 1.33 KB
/
pojie52.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from lxml import etree
import constant
# 52pojie forum, "newest threads" guide page: https://www.52pojie.cn/forum.php?mod=guide&view=newthread
class PoJie52(object):
    """Scraper for the "newest threads" guide listing on www.52pojie.cn.

    The only public entry point is :meth:`parse`, which downloads one page
    of the listing, follows every thread link found on it and returns the
    collected metadata plus the text of the first post.
    """

    # One match per listing row.  Title, author and time are then looked up
    # *relative to the row* so that they can never get out of step with
    # each other when a row is missing one of the fields.
    _ROW_XPATH = '//*[@id="threadlist"]/div[2]/table/tbody/tr'

    @staticmethod
    def _fetch_html(http, url):
        """Download *url* and return it parsed as an lxml HTML tree.

        The response body is decoded as GBK; undecodable bytes are replaced
        rather than raising, so that one stray byte in a post does not
        abort the whole crawl.  The return value is whatever
        ``etree.HTML`` yields -- callers check for ``None`` before use.
        """
        raw = http.request('GET', url, headers=constant.headers).data
        return etree.HTML(raw.decode('gbk', 'replace'))

    @staticmethod
    def _first(nodes, default=''):
        """Return the first item of an XPath result list, or *default*."""
        return nodes[0] if nodes else default

    def parse(self, page, http):
        """Scrape one page of the newest-thread listing.

        Args:
            page: Page number substituted into the listing URL's ``page``
                query parameter.
            http: HTTP client exposing ``request('GET', url, headers=...)``
                whose result has a ``data`` attribute holding the raw
                response bytes (looks like a ``urllib3.PoolManager`` --
                TODO confirm against the caller).

        Returns:
            A list with one dict per thread link found, in page order, with
            the keys ``'title'``, ``'href'``, ``'author'``,
            ``'create_time'`` and ``'content'``.  ``'author'`` and
            ``'create_time'`` are ``''`` when the row does not carry them;
            ``'content'`` is ``''`` when the thread page has no element
            with ``class="pct"``.
        """
        site_page = 'https://www.52pojie.cn/'
        site = 'https://www.52pojie.cn/forum.php?mod=guide&view=newthread&page=%s' % page

        listing = self._fetch_html(http, site)
        if listing is None:
            # Nothing parseable came back; there are no threads to report.
            return []

        arrays = []
        for row in listing.xpath(self._ROW_XPATH):
            # Row-relative lookups: a row without an author link or without
            # a direct-text timestamp yields '' for that field instead of
            # shifting every later row's values (or raising IndexError).
            author = self._first(row.xpath('td[3]/cite/a/text()'))
            create_time = self._first(row.xpath('td[3]/em/span/text()'))

            for link in row.xpath('th/a[1]'):
                href = link.get('href')
                if href is None:
                    # An anchor without a target cannot be followed.
                    continue

                thread = self._fetch_html(http, site_page + href)
                content = thread.xpath('//*[@class="pct"]') if thread is not None else []
                text = ''
                if content:
                    # string() flattens the whole post body to plain text.
                    text = content[0].xpath('string()').strip()

                arrays.append({
                    'title': link.text,
                    'href': href,
                    'author': author,
                    'create_time': create_time,
                    'content': text,
                })
        return arrays