main.py
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
from urllib.parse import quote
import argparse
import requests
import warnings
# Ignore the UserWarning raised by bs4
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
# Default search list (placeholders: put the strings you want to search for here)
search_list = ['put', 'the strings', 'you want to search for', 'here']
# Headers used for the requests
# NOTE: fill in your own cookie and related fields before use!
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': '_octo=GH1.1.19******38.159******64; _ga=GA1.2.93******46.15******36; _device_id=37******ed; user_session=Qp******Kq; __Host-user_session_same_site=Qp******Rj-0T******Kq; logged_in=yes; dotcom_user=D******n; _gid=GA1.2.7******5.16******72; has_recent_activity=1; tz=A******%2FS******; _gh_sess=ep******3D',
    'DNT': '1',
    'Host': 'github.com',
    'Pragma': 'no-cache',
    'Referer': 'https://github.com/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
}
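# A minimal sketch (an assumption, not part of the original script): rather than
# hardcoding the session cookie above, it could be loaded from an environment
# variable so the script can be shared without leaking credentials, e.g.:
#   import os
#   headers['Cookie'] = os.environ.get('GH_COOKIE', headers['Cookie'])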
# ArgumentParser
parser = argparse.ArgumentParser(description='Enter the keywords, separated by commas, that you want to search for on GitHub. If \'-s\' is not specified, the default list is used.')
parser.add_argument('-s', metavar='Search_List', type=str, dest='search_string', help='comma-separated list of search keywords')
args = parser.parse_args()
# If -s is given, replace the default list
if args.search_string is not None:
    search_list = args.search_string.split(',')
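# For example, `python main.py -s "token,passwd"` (hypothetical keywords) would
# make search_list == ['token', 'passwd'] for the crawl below.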
# Create the output file
f = open('Result.txt', 'w', encoding='utf-8')
print('\nSearching from the list: ' + str(search_list), flush=True)
f.write('Searching from the list: ' + str(search_list))
for keyword in search_list:
    # Skip empty entries
    if keyword == '':
        continue
    print('Writing the result of:', keyword, flush=True)
    f.write('\n\nShowing the result of: ' + keyword + '\n')
    # Crawl up to the first 200 pages of search results
    for i in range(1, 201):
        print('Writing page', i, '...', flush=True)
        # Percent-encode the keyword so spaces, '&' and non-ASCII characters survive the query string
        url = 'https://github.com/search?p=' + str(i) + '&q=' + quote(keyword, safe='') + '&type=code'
        try:
            _request = requests.get(url=url, headers=headers)
            text_raw = _request.text
        except Exception as e:
            # Skip this page on a failed request instead of parsing stale data
            print(e)
            continue
        # from_encoding keeps Chinese and other special characters intact in the output
        soup = BeautifulSoup(text_raw, 'lxml', from_encoding='gb18030')
        # A 404 means the URL is wrong or we have run past the last page of results
        if '404 “This is not the web page you are looking for”' in text_raw:
            print('Meet 404, current keyword has reached the end.', flush=True)
            break
        # Otherwise, extract the useful information
        else:
            for box in soup.find_all('div', {'class': 'width-full'}):
                valid = False
                for href in box.find_all('a', title=True, href=True):
                    if href['href'] == 'https://github.com' or not box.find('a', title=True, href=True):
                        break
                    else:
                        valid = True
                        # Divider
                        f.write('-----------------------------------\n')
                        f.write('URL: \n\thttps://github.com' + href['href'] + '\n')
                        f.write('\nCode:')
                        f.flush()
                if valid:
                    # print() can choke on these strings under Windows, so write to the file instead
                    for divs in box.find_all('div', {'class': 'file-box blob-wrapper my-1'}):
                        # ncount tracks how many newlines were seen since the last text chunk
                        ncount = 0
                        for code in divs.strings:
                            # Purely numeric strings are the line-number gutter; skip them
                            if code.isnumeric():
                                continue
                            if code == '\n':
                                ncount += 1
                            elif code == '\t':
                                f.write('\t')
                                f.flush()
                                ncount = 0
                            elif ncount == 0:
                                # Still on the same source line: append directly
                                f.write(code)
                                f.flush()
                                ncount = 0
                            else:
                                # A newline was seen: start a fresh indented line
                                f.write('\n\t' + code)
                                f.flush()
                                ncount = 0
                    # Divider
                    f.write('\n-----------------------------------\n')
                    f.flush()
f.close()
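# Example invocation (the keywords here are hypothetical; any GitHub
# code-search strings work):
#   python main.py -s "internal_api_key,company_secret"
# Every match GitHub returns for each keyword is appended to Result.txt.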