From 8e253ac29fff654605019d0a79ae51714277f743 Mon Sep 17 00:00:00 2001
From: duolong <tomato.stao@gmail.com>
Date: Wed, 4 May 2022 15:20:21 +0800
Subject: [PATCH] =?UTF-8?q?:sparkles:=20=E5=95=86=E5=93=81=E5=88=97?=
 =?UTF-8?q?=E8=A1=A8=EF=BC=88=E5=B0=B1=E6=98=AF=E8=AF=BE=E7=A8=8B=EF=BC=89?=
 =?UTF-8?q?=E7=9A=84=E6=8E=A5=E5=8F=A3=E6=96=B9=E6=B3=95=E5=A2=9E?=
 =?UTF-8?q?=E5=8A=A0=E7=BF=BB=E9=A1=B5=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 geek_crawler.py | 70 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 27 deletions(-)

diff --git a/geek_crawler.py b/geek_crawler.py
index 8941dc1..86c8892 100644
--- a/geek_crawler.py
+++ b/geek_crawler.py
@@ -28,6 +28,7 @@
 # 定义全局变量
 FINISH_ARTICLES = []
 ALL_ARTICLES = []
+PRODUCT_PREV = 0  # "prev" value read by _product(); run() advances it before requesting each further page
 
 
 class RequestError(Exception):
@@ -241,6 +242,7 @@ def _user_auth(self):
 
     def _product(self, _type='c1'):
         """ 商品列表（就是课程）的接口）方法 """
+        global PRODUCT_PREV
         log.info("请求获取课程列表接口：")
         url = "https://time.geekbang.org/serv/v3/learn/product"
         method = "POST"
@@ -253,7 +255,7 @@ def _product(self, _type='c1'):
             "expire": 1,
             "last_learn": 0,
             "learn_status": 0,
-            "prev": 0,
+            "prev": PRODUCT_PREV,
             "size": 20,
             "sort": 1,
             "type": "",
@@ -496,31 +498,45 @@ def run(cellphone=None, passwd=None, exclude=None, file_type=None, get_comments=
     geek._login()  # 请求登录接口进行登录
     geek._product()  # 请求获取课程接口
 
-    number = 0
-
-    for pro in geek.products:
-        geek._articles(pro['id'], pro)  # 获取文章列表
-
-        article_ids = pro['article_ids']
-        for aid in article_ids:
-            if set(ALL_ARTICLES) == set(FINISH_ARTICLES):
-                import sys
-                log.info("正常抓取完成啦，不用再继续跑脚本了。")
-                sys.exit(1)
-
-            if str(aid) in FINISH_ARTICLES:
-                continue
-            geek._article(aid, pro, file_type=file_type, get_comments=get_comments)  # 获取单个文章的信息
-            time.sleep(5)  # 做一个延时请求，避免过快请求接口被限制访问
-            number += 1
-            # 判断是否连续抓取过 37次，如果是则暂停 10s
-            if number == 37:
-                log.info("抓取达到37次了，先暂停 10s 再继续。")
-                time.sleep(10)
-                number = 0  # 重新计数
-                geek._user_auth()
-    _save_finish_article_id_to_file()
-    log.info("正常抓取完成。")
+    def paint_product():  # Crawl every product in geek.products, then load the next page and recurse; stops when a page is empty.
+        global PRODUCT_PREV  # module-level value that _product() uses as its "prev" entry
+        if len(geek.products):
+            number = 0  # articles fetched since the last pause; starts from 0 again on every page
+
+            for pro in geek.products:
+                geek._articles(pro['id'], pro)  # fetch the article list
+
+                article_ids = pro['article_ids']  # presumably filled in by geek._articles above - confirm
+                for aid in article_ids:
+                    if set(ALL_ARTICLES) == set(FINISH_ARTICLES):  # both lists hold exactly the same ids
+                        import sys
+                        log.info("正常抓取完成啦，不用再继续跑脚本了。")
+                        sys.exit(1)  # NOTE(review): status 1 on the normal-completion path; the save call below is not reached
+
+                    if str(aid) in FINISH_ARTICLES:  # already fetched: skip
+                        continue
+                    geek._article(aid, pro, file_type=file_type, get_comments=get_comments)  # fetch a single article
+                    time.sleep(5)  # delay between requests so the API does not restrict us for calling too fast
+                    number += 1
+                    # After 37 fetches in a row, pause for 10s
+                    if number == 37:
+                        log.info("抓取达到37次了，先暂停 10s 再继续。")
+                        time.sleep(10)
+                        number = 0  # restart the count
+                        geek._user_auth()  # presumably re-validates the login session - confirm
+            _save_finish_article_id_to_file()  # called once per page, after all of its products were processed
+            time.sleep(2)
+            log.info("翻页开始了...")
+            if PRODUCT_PREV == 0:  # NOTE(review): first advance jumps 0 -> 2, later ones add 1; confirm "prev" semantics against the API
+                PRODUCT_PREV = 2
+            else:
+                PRODUCT_PREV += 1
+            geek.products = []  # reset before the next page is requested
+            geek._product()  # request the next page using the updated PRODUCT_PREV
+            paint_product()  # recurse: one extra stack frame per page
+        else:
+            log.info("正常抓取完成。")  # empty page: nothing left to crawl
+    paint_product()  # start with the page already loaded by the geek._product() call above
 
 
 if __name__ == "__main__":
@@ -544,7 +560,7 @@ def run(cellphone=None, passwd=None, exclude=None, file_type=None, get_comments=
 
     try:
         FINISH_ARTICLES = _load_finish_article()
-        run(cellphone, pwd, exclude=exclude, get_comments=get_comments)
+        run(cellphone, pwd, exclude=exclude, file_type=file_type, get_comments=get_comments)
     except Exception:
         import traceback
         log.error(f"请求过程中出错了，出错信息为：{traceback.format_exc()}")