|
1 | 1 | import logging
|
2 | 2 | import random
|
3 | 3 | import re
|
4 |
| -from requests.sessions import Session |
| 4 | +import subprocess |
5 | 5 | from copy import deepcopy
|
6 | 6 | from time import sleep
|
7 | 7 |
|
8 |
| -import execjs |
| 8 | +from requests.sessions import Session |
9 | 9 |
|
10 | 10 | try:
|
11 | 11 | from urlparse import urlparse
|
12 | 12 | except ImportError:
|
13 | 13 | from urllib.parse import urlparse
|
14 | 14 |
|
15 |
| -__version__ = "1.9.3" |
| 15 | +__version__ = "1.9.4" |
16 | 16 |
|
17 | 17 | DEFAULT_USER_AGENTS = [
|
18 | 18 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
|
|
24 | 24 |
|
25 | 25 | DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
|
26 | 26 |
|
27 |
| -BUG_REPORT = ("Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a " |
28 |
| -"bug report at https://github.com/Anorov/cloudflare-scrape/issues.") |
| 27 | +BUG_REPORT = """\ |
| 28 | +Cloudflare may have changed their technique, or there may be a bug in the script. |
| 29 | +
|
| 30 | +Please read https://github.com/Anorov/cloudflare-scrape#updates, then file a \ |
| 31 | +bug report at https://github.com/Anorov/cloudflare-scrape/issues.\ |
| 32 | +""" |
29 | 33 |
|
| 34 | +ANSWER_ACCEPT_ERROR = """\ |
| 35 | +The challenge answer was not properly accepted by Cloudflare. This can occur if \ |
| 36 | +the target website is under heavy load, or if Cloudflare is experiencing issues. You can \ |
| 37 | +potentially resolve this by increasing the challenge answer delay (default: 5 seconds). \ |
| 38 | +For example: cfscrape.create_scraper(delay=10) |
| 39 | +
|
| 40 | +If increasing the delay does not help, please open a GitHub issue at \ |
| 41 | +https://github.com/Anorov/cloudflare-scrape/issues\ |
| 42 | +""" |
30 | 43 |
|
31 | 44 | class CloudflareScraper(Session):
|
32 | 45 | def __init__(self, *args, **kwargs):
|
33 |
| - self.delay = kwargs.pop('delay', 5) |
34 |
| - |
| 46 | + self.delay = kwargs.pop("delay", 5) |
35 | 47 | super(CloudflareScraper, self).__init__(*args, **kwargs)
|
36 | 48 |
|
37 | 49 | if "requests" in self.headers["User-Agent"]:
|
38 |
| - # Spoof Firefox on Linux if no custom User-Agent has been set |
| 50 | + # Set a random User-Agent if no custom User-Agent has been set |
39 | 51 | self.headers["User-Agent"] = DEFAULT_USER_AGENT
|
40 | 52 |
|
| 53 | + def is_cloudflare_challenge(self, resp): |
| 54 | + return ( |
| 55 | + resp.status_code == 503 |
| 56 | + and resp.headers.get("Server", "").startswith("cloudflare") |
| 57 | + and b"jschl_vc" in resp.content |
| 58 | + and b"jschl_answer" in resp.content |
| 59 | + ) |
| 60 | + |
41 | 61 | def request(self, method, url, *args, **kwargs):
|
42 | 62 | resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
|
43 | 63 |
|
44 | 64 | # Check if Cloudflare anti-bot is on
|
45 |
| - if ( resp.status_code == 503 |
46 |
| - and resp.headers.get("Server", "").startswith("cloudflare") |
47 |
| - and b"jschl_vc" in resp.content |
48 |
| - and b"jschl_answer" in resp.content |
49 |
| - ): |
50 |
| - return self.solve_cf_challenge(resp, **kwargs) |
51 |
| - |
52 |
| - # Otherwise, no Cloudflare anti-bot detected |
| 65 | + if self.is_cloudflare_challenge(resp): |
| 66 | + resp = self.solve_cf_challenge(resp, **kwargs) |
| 67 | + if self.is_cloudflare_challenge(resp): |
| 68 | + raise ValueError(ANSWER_ACCEPT_ERROR) |
| 69 | + |
53 | 70 | return resp
|
54 | 71 |
|
55 | 72 | def solve_cf_challenge(self, resp, **original_kwargs):
|
@@ -111,33 +128,32 @@ def solve_challenge(self, body):
|
111 | 128 |
|
112 | 129 | # Use vm.runInNewContext to safely evaluate code
|
113 | 130 | # The sandboxed code cannot use the Node.js standard library
|
114 |
| - js = "return require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000});" % js |
| 131 | + js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js |
115 | 132 |
|
116 | 133 | try:
|
117 |
| - node = execjs.get("Node") |
118 |
| - except Exception: |
119 |
| - raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape" |
120 |
| - " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.") |
121 |
| - |
122 |
| - try: |
123 |
| - result = node.exec_(js) |
| 134 | + result = subprocess.check_output(["node", "-e", js]).strip() |
| 135 | + except OSError as e: |
| 136 | + if e.errno == 2: |
| 137 | + raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape" |
| 138 | + " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.") |
| 139 | + raise |
124 | 140 | except Exception:
|
125 | 141 | logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
|
126 | 142 | raise
|
127 | 143 |
|
128 | 144 | try:
|
129 | 145 | result = int(result)
|
130 | 146 | except Exception:
|
131 |
| - raise ValueError("Cloudflare IUAM challenge returned unexpected value. %s" % BUG_REPORT) |
| 147 | + raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT) |
132 | 148 |
|
133 | 149 | return result
|
134 | 150 |
|
135 | 151 | @classmethod
|
136 | 152 | def create_scraper(cls, sess=None, **kwargs):
|
137 | 153 | """
|
138 |
| - Convenience function for creating a ready-to-go requests.Session (subclass) object. |
| 154 | + Convenience function for creating a ready-to-go CloudflareScraper object. |
139 | 155 | """
|
140 |
| - scraper = cls() |
| 156 | + scraper = cls(**kwargs) |
141 | 157 |
|
142 | 158 | if sess:
|
143 | 159 | attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
|
|
0 commit comments