Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@

- [x] 支持配置排除某些课程的拉取(比如已经有的课程不再下载);

- [ ] 抓取指定名称的课程;
- [x] 抓取指定名称的课程;

- [ ] 将每篇文章的评论与正文一起保存到本地;

Expand Down
136 changes: 86 additions & 50 deletions geek_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import logging
import os
import pathlib

from tenacity import retry, stop_after_attempt

# 定义日志相关内容
logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
Expand Down Expand Up @@ -64,6 +64,14 @@ def _save_finish_article_id_to_file():
f.write(str(i) + '\n')


def _save_finish_article_id_to_file_now(article_id: str):
    """Append one finished article ID to the progress file immediately.

    The ID is written as its own line to ``finish_crawler_article.txt`` in
    the current working directory, so progress is persisted as soon as an
    article has been saved.

    Args:
        article_id: ID of the article that has just been saved. Any value is
            accepted and written in its ``str()`` form, so an integer ID
            does not raise ``TypeError``.
    """
    file_path = os.path.abspath(pathlib.PurePosixPath() / 'finish_crawler_article.txt')
    # Plain append mode is enough: the file is never read back here, and
    # 'a' still creates it when it does not exist yet.
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(f"{article_id}\n")


def check_filename(file_name):
    r"""
    Sanitise a name so it can be used as a file or directory name.

    Windows does not allow ``\ / : * ? " < > |`` in file names, so each of
    these is either removed or replaced by a harmless look-alike. Control
    characters that can appear in scraped titles (newline, backspace, form
    feed, tab, carriage return) are removed as well.

    Args:
        file_name: the raw name, e.g. a course or article title.
    Returns:
        The sanitised name. Characters not listed above are left untouched.
    """
    replacements = {
        # Removed outright.
        '\\': '',
        '/': '',
        '?': '',
        '"': '',
        '\n': '',
        '\b': '',
        '\f': '',
        '\t': '',
        '\r': '',
        # Replaced by a visually similar character that is legal everywhere.
        '*': 'x',
        '<': '《',
        '>': '》',
        '|': '_',
        ':': ':',
    }
    # A single translate() pass is equivalent to chaining replace() calls
    # here, because no replacement output is itself a key of the table.
    return file_name.translate(str.maketrans(replacements))


class Cookie:
Expand Down Expand Up @@ -155,6 +163,7 @@ def __repr__(self):

class GeekCrawler:
""" 极客时间相关操作的类 """

def __init__(self, cellphone=None, passwd=None, exclude=None):
self.cellphone = cellphone
self.password = passwd
Expand Down Expand Up @@ -215,7 +224,7 @@ def _login(self):
log.error(f"登录接口请求出错,返回内容为:{res.content.decode()}")
raise RequestError(f"登录接口请求出错,返回内容为:{res.content.decode()}")
self.cookie.load_set_cookie(res.headers['Set-Cookie'])
log.info('-'*40)
log.info('-' * 40)

def _user_auth(self):
""" 用户认证接口方法 """
Expand All @@ -238,7 +247,6 @@ def _user_auth(self):
self.cookie.load_set_cookie(res.headers['Set-Cookie'])
log.info('-' * 40)


def _product(self, _type='c1'):
""" 商品列表(就是课程)的接口)方法 """
log.info("请求获取课程列表接口:")
Expand All @@ -264,7 +272,7 @@ def _product(self, _type='c1'):
res = requests.request(method, url, headers=headers, json=params)

if res.status_code != 200:
log.info(f"此时 products 的数据为:{self.products}")
# log.info(f"此时 products 的数据为:{self.products}")
log.error(f"课程列表接口请求出错,返回内容为:{res.content.decode()}")
raise RequestError(f"课程列表接口请求出错,返回内容为:{res.content.decode()}")
data = res.json().get('data', {})
Expand All @@ -274,9 +282,10 @@ def _product(self, _type='c1'):
self.products += self._parser_products(data, _type)
else:
_save_finish_article_id_to_file()
log.info(f"此时 products 的数据为:{self.products}")
# log.info(f"此时 products 的数据为:{self.products}")
log.error(f"课程列表接口没有获取到内容,请检查请求。返回结果为:{res.content.decode()}")
raise NotValueError(f"课程列表接口没有获取到内容,请检查请求。返回结果为:{res.content.decode()}")
log.info(f"此时 products 的数据为:{self.products}")
log.info('-' * 40)

def _parser_products(self, data, _type='c1'):
Expand Down Expand Up @@ -307,10 +316,11 @@ def _parser_products(self, data, _type='c1'):
result.append(new_product)
return result

@retry(stop=stop_after_attempt(3))
def _article(self, aid, pro, file_type=None, get_comments=False):
""" 通过课程 ID 获取文章信息接口方法 """
global FINISH_ARTICLES
log.info("请求获取文章信息接口:")
# log.info("请求获取文章信息接口:")
url = "https://time.geekbang.org/serv/v1/article"
method = "POST"
headers = deepcopy(self.common_headers)
Expand All @@ -323,13 +333,13 @@ def _article(self, aid, pro, file_type=None, get_comments=False):
"is_freelyread": "true"
}

log.info(f"接口请求参数:{params}")
log.debug(f"接口请求参数:{params}")
res = requests.request(method, url, headers=headers, json=params)

if res.status_code != 200:
_save_finish_article_id_to_file()
log.info(f"此时 products 的数据为:{self.products}")
log.error(f"获取文章信息接口请求出错,返回内容为:{res.content.decode()}")
log.error(f"状态{res.status_code}获取文章信息接口请求出错,返回内容为:{res.content.decode()}")
raise RequestError(f"获取文章信息接口请求出错,返回内容为:{res.content.decode()}")
data = res.json().get('data', {})
self.cookie.load_set_cookie(res.headers['Set-Cookie'])
Expand All @@ -342,6 +352,7 @@ def _article(self, aid, pro, file_type=None, get_comments=False):
pro['title'],
article['article_title'],
article['article_content'],
article['id'],
audio=article['audio_download_url'],
file_type=file_type,
comments=comments
Expand Down Expand Up @@ -431,7 +442,7 @@ def _articles(self, cid, pro):
log.info('-' * 40)

@staticmethod
def save_to_file(dir_name, filename, content, audio=None, file_type=None, comments=None):
def save_to_file(dir_name, filename, content, id, audio=None, file_type=None, comments=None):
"""
将结果保存成文件的方法,保存在当前目录下
Args:
Expand All @@ -443,6 +454,7 @@ def save_to_file(dir_name, filename, content, audio=None, file_type=None, commen
comments: 评论相关数据
Returns:
"""
log.info(f" {filename} ")
if not file_type: file_type = '.md'
dir_path = pathlib.PurePosixPath() / dir_name
if not os.path.isdir(dir_path):
Expand Down Expand Up @@ -485,57 +497,80 @@ def save_to_file(dir_name, filename, content, audio=None, file_type=None, commen
audio_text = f'<audio title="{filename}" src="{audio}" controls="controls"></audio> \n'
f.write(audio_text)
f.write(content + temp)
_save_finish_article_id_to_file_now(str(id))


def run(cellphone=None, passwd=None, exclude=None, special=None, file_type=None, get_comments=False):
    """Run the whole crawl: log in, list the courses, download the selected ones.

    Args:
        cellphone: account (phone number) handed to ``GeekCrawler``.
        passwd: password handed to ``GeekCrawler``.
        exclude: handed to ``GeekCrawler`` as its ``exclude`` argument.
        special: optional collection of course IDs. When it is non-empty,
            only courses whose ``pro['id']`` is in it are downloaded; when it
            is ``None`` or empty, every course in ``geek.products`` is.
        file_type: forwarded to ``GeekCrawler._article`` for every article.
            It is taken from this argument only, never from a module-level
            global, so callers must pass it explicitly.
        get_comments: forwarded to ``GeekCrawler._article`` for every article.
    """
    geek = GeekCrawler(cellphone, passwd, exclude=exclude)
    geek._login()  # call the login endpoint
    geek._product()  # fetch the course list into geek.products

    found = False
    for pro in geek.products:
        # Honour the optional whitelist of course IDs.
        if special and pro['id'] not in special:
            continue
        get_article(geek, pro, file_type=file_type, get_comments=get_comments)
        found = True
        # Pause and re-authenticate between courses.
        time.sleep(10)
        geek._user_auth()

    _save_finish_article_id_to_file()
    if not found:
        # NOTE: the wording is imprecise - what was not found is a course
        # matching `special`, not an article.
        log.warning(f"没有找到文章{special}")
    else:
        log.info("正常抓取完成。")


def get_article(geek: GeekCrawler, pro=None, file_type=None, get_comments=False):
    """Download every not-yet-finished article of one course.

    Args:
        geek: a logged-in ``GeekCrawler``.
        pro: the course dict; ``pro['id']`` and ``pro['title']`` are read
            here, and ``pro['article_ids']`` is read after
            ``geek._articles`` has been called (presumably that call fills
            it - confirm against ``_articles``).
        file_type: forwarded to ``geek._article``.
        get_comments: forwarded to ``geek._article``.
    """
    import sys  # local import: only needed for the early-exit path below

    course_id = pro['id']
    geek._articles(course_id, pro)  # fetch the article list of this course
    course_title = pro['title']
    log.info(f"正在下载的文章 《{course_title}》 id={course_id}")
    log.info('-' * 40)

    fetched = 0
    for aid in pro['article_ids']:
        if set(ALL_ARTICLES) == set(FINISH_ARTICLES):
            log.info("正常抓取完成啦,不用再继续跑脚本了。")
            # Everything is already downloaded: this is a successful
            # termination, so exit with status 0 rather than an error code.
            sys.exit(0)

        if str(aid) in FINISH_ARTICLES:
            continue
        geek._article(aid, pro, file_type=file_type, get_comments=get_comments)  # fetch one article
        time.sleep(5)  # throttle requests so the API does not rate-limit us
        fetched += 1
        # After 37 consecutive fetches, pause for 10s and re-authenticate.
        if fetched == 37:
            log.info("抓取达到37次了,先暂停 10s 再继续。")
            time.sleep(10)
            fetched = 0  # restart the count
            geek._user_auth()


if __name__ == "__main__":
# 采用在脚本中写死账号密码的方式
# cellphone = ""
# pwd = ""

cellphone = ""
pwd = ""
# 采用每次跑脚本手动输入账号密码的方式
cellphone = str(input("请输入你的极客时间账号(手机号): "))
pwd = str(input("请输入你的极客时间密码: "))
if cellphone is None or len(cellphone) <= 5:
cellphone = str(input("请输入你的极客时间账号(手机号): "))
if pwd is None or len(pwd) <= 5:
pwd = str(input("请输入你的极客时间密码: "))

# 需要排除的课程列表,根据自己的情况定义(比如已经有的资源就不用再继续下载了)
# exclude = ['左耳听风', '趣谈网络协议']
exclude = []

# 指定好要下载的课程id
special = [100022301, 100026001]

# 需要保存文件的后缀名,尽量选 .md 或者 .html
file_type = '.md'

Expand All @@ -544,9 +579,10 @@ def run(cellphone=None, passwd=None, exclude=None, file_type=None, get_comments=

try:
FINISH_ARTICLES = _load_finish_article()
run(cellphone, pwd, exclude=exclude, get_comments=get_comments)
run(cellphone, pwd, exclude=exclude, special=special, get_comments=get_comments)
except Exception:
import traceback

log.error(f"请求过程中出错了,出错信息为:{traceback.format_exc()}")
finally:
_save_finish_article_id_to_file()