From 8e253ac29fff654605019d0a79ae51714277f743 Mon Sep 17 00:00:00 2001 From: duolong Date: Wed, 4 May 2022 15:20:21 +0800 Subject: [PATCH] =?UTF-8?q?:sparkles:=20=E5=95=86=E5=93=81=E5=88=97?= =?UTF-8?q?=E8=A1=A8=EF=BC=88=E5=B0=B1=E6=98=AF=E8=AF=BE=E7=A8=8B=EF=BC=89?= =?UTF-8?q?=E7=9A=84=E6=8E=A5=E5=8F=A3=EF=BC=89=E6=96=B9=E6=B3=95=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E7=BF=BB=E9=A1=B5=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- geek_crawler.py | 70 ++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/geek_crawler.py b/geek_crawler.py index 8941dc1..86c8892 100644 --- a/geek_crawler.py +++ b/geek_crawler.py @@ -28,6 +28,7 @@ # 定义全局变量 FINISH_ARTICLES = [] ALL_ARTICLES = [] +PRODUCT_PREV = 0 class RequestError(Exception): @@ -241,6 +242,7 @@ def _user_auth(self): def _product(self, _type='c1'): """ 商品列表(就是课程)的接口)方法 """ + global PRODUCT_PREV log.info("请求获取课程列表接口:") url = "https://time.geekbang.org/serv/v3/learn/product" method = "POST" @@ -253,7 +255,7 @@ def _product(self, _type='c1'): "expire": 1, "last_learn": 0, "learn_status": 0, - "prev": 0, + "prev": PRODUCT_PREV, "size": 20, "sort": 1, "type": "", @@ -496,31 +498,45 @@ def run(cellphone=None, passwd=None, exclude=None, file_type=None, get_comments= geek._login() # 请求登录接口进行登录 geek._product() # 请求获取课程接口 - number = 0 - - for pro in geek.products: - geek._articles(pro['id'], pro) # 获取文章列表 - - article_ids = pro['article_ids'] - for aid in article_ids: - if set(ALL_ARTICLES) == set(FINISH_ARTICLES): - import sys - log.info("正常抓取完成啦,不用再继续跑脚本了。") - sys.exit(1) - - if str(aid) in FINISH_ARTICLES: - continue - geek._article(aid, pro, file_type=file_type, get_comments=get_comments) # 获取单个文章的信息 - time.sleep(5) # 做一个延时请求,避免过快请求接口被限制访问 - number += 1 - # 判断是否连续抓取过 37次,如果是则暂停 10s - if number == 37: - log.info("抓取达到37次了,先暂停 10s 再继续。") - time.sleep(10) - number = 0 # 重新计数 - geek._user_auth() - _save_finish_article_id_to_file() - log.info("正常抓取完成。") + def paint_product(): + global PRODUCT_PREV + if len(geek.products): + number = 0 + + for pro in geek.products: + geek._articles(pro['id'], pro) # 获取文章列表 + + article_ids = pro['article_ids'] + for aid in article_ids: + if set(ALL_ARTICLES) == set(FINISH_ARTICLES): + import sys + log.info("正常抓取完成啦,不用再继续跑脚本了。") + sys.exit(1) + + if str(aid) in FINISH_ARTICLES: + continue + geek._article(aid, pro, file_type=file_type, get_comments=get_comments) # 获取单个文章的信息 + time.sleep(5) # 做一个延时请求,避免过快请求接口被限制访问 + number += 1 + # 判断是否连续抓取过 37次,如果是则暂停 10s + if number == 37: + log.info("抓取达到37次了,先暂停 10s 再继续。") + time.sleep(10) + number = 0 # 重新计数 + geek._user_auth() + _save_finish_article_id_to_file() + time.sleep(2) + log.info("翻页开始了...") + if PRODUCT_PREV == 0: + PRODUCT_PREV = 2 + else: + PRODUCT_PREV += 1 + geek.products = [] + geek._product() + paint_product() + else: + log.info("正常抓取完成。") + paint_product() if __name__ == "__main__": @@ -544,7 +560,7 @@ def run(cellphone=None, passwd=None, exclude=None, file_type=None, get_comments= try: FINISH_ARTICLES = _load_finish_article() - run(cellphone, pwd, exclude=exclude, get_comments=get_comments) + run(cellphone, pwd, exclude=exclude, file_type=file_type, get_comments=get_comments) except Exception: import traceback log.error(f"请求过程中出错了,出错信息为:{traceback.format_exc()}")