Skip to content

Commit d78325c

Browse files
committed
Handle failed challenge submissions, update README, remove PyExecJS dependency
1 parent 8bf4daf commit d78325c

File tree

3 files changed

+62
-33
lines changed

3 files changed

+62
-33
lines changed

README.md

+10-5
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,6 @@ For reference, this is the default message Cloudflare uses for these sorts of pa
1919

2020
Any script using cloudflare-scrape will sleep for 5 seconds for the first visit to any site with Cloudflare anti-bots enabled, though no delay will occur after the first request.
2121

22-
Warning
23-
======
24-
25-
**Due to a critical security vulnerability, if you are running any version below 1.9 please upgrade to version 1.9 or higher immediately.** Versions before 1.9.0 used unsafe Javascript execution mechanisms which could result in arbitrary code execution. If you are running a vulnerable version, a malicious website owner could craft a page which executes arbitrary code on the machine that runs this script. This can only occur if the website that the user attempts to scrape has specifically prepared a page to exploit vulnerable versions of cfscrape.
26-
2722
Installation
2823
============
2924

@@ -88,6 +83,16 @@ scraper = cfscrape.create_scraper(sess=session)
8883

8984
Unfortunately, not all of Requests' session attributes are easily transferable, so if you run into problems with this, you should replace your initial `sess = requests.session()` call with `sess = cfscrape.create_scraper()`.
9085

86+
### Delays
87+
88+
Normally, when a browser is faced with a Cloudflare IUAM challenge page, Cloudflare requires the browser to wait 5 seconds before submitting the challenge answer. If a website is under heavy load, the challenge submission may sometimes fail even after this wait. One solution is to increase the delay (perhaps to 10 or 15 seconds, depending on the website). If you would like to override this delay, pass the `delay` keyword argument to `create_scraper()` or `CloudflareScraper()`.
89+
90+
There is no need to override this delay unless cloudflare-scrape generates an error recommending you increase the delay.
91+
92+
```python
93+
scraper = cfscrape.create_scraper(delay=10)
94+
```
95+
9196
## Integration
9297

9398
It's easy to integrate cloudflare-scrape with other applications and tools. Cloudflare uses two cookies as tokens: one to verify you made it past their challenge page and one to track your session. To bypass the challenge page, simply include both of these cookies (with the appropriate user-agent) in all HTTP requests you make.

cfscrape/__init__.py

+43-27
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
import logging
22
import random
33
import re
4-
from requests.sessions import Session
4+
import subprocess
55
from copy import deepcopy
66
from time import sleep
77

8-
import execjs
8+
from requests.sessions import Session
99

1010
try:
1111
from urlparse import urlparse
1212
except ImportError:
1313
from urllib.parse import urlparse
1414

15-
__version__ = "1.9.3"
15+
__version__ = "1.9.4"
1616

1717
DEFAULT_USER_AGENTS = [
1818
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
@@ -24,32 +24,49 @@
2424

2525
DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
2626

27-
BUG_REPORT = ("Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a "
28-
"bug report at https://github.com/Anorov/cloudflare-scrape/issues.")
27+
BUG_REPORT = """\
28+
Cloudflare may have changed their technique, or there may be a bug in the script.
29+
30+
Please read https://github.com/Anorov/cloudflare-scrape#updates, then file a \
31+
bug report at https://github.com/Anorov/cloudflare-scrape/issues."\
32+
"""
2933

34+
ANSWER_ACCEPT_ERROR = """\
35+
The challenge answer was not properly accepted by Cloudflare. This can occur if \
36+
the target website is under heavy load, or if Cloudflare is experiencing issues. You can
37+
potentially resolve this by increasing the challenge answer delay (default: 5 seconds). \
38+
For example: cfscrape.create_scraper(delay=10)
39+
40+
If increasing the delay does not help, please open a GitHub issue at \
41+
https://github.com/Anorov/cloudflare-scrape/issues\
42+
"""
3043

3144
class CloudflareScraper(Session):
3245
def __init__(self, *args, **kwargs):
33-
self.delay = kwargs.pop('delay', 5)
34-
46+
self.delay = kwargs.pop("delay", 5)
3547
super(CloudflareScraper, self).__init__(*args, **kwargs)
3648

3749
if "requests" in self.headers["User-Agent"]:
38-
# Spoof Firefox on Linux if no custom User-Agent has been set
50+
# Set a random User-Agent if no custom User-Agent has been set
3951
self.headers["User-Agent"] = DEFAULT_USER_AGENT
4052

53+
def is_cloudflare_challenge(self, resp):
54+
return (
55+
resp.status_code == 503
56+
and resp.headers.get("Server", "").startswith("cloudflare")
57+
and b"jschl_vc" in resp.content
58+
and b"jschl_answer" in resp.content
59+
)
60+
4161
def request(self, method, url, *args, **kwargs):
4262
resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
4363

4464
# Check if Cloudflare anti-bot is on
45-
if ( resp.status_code == 503
46-
and resp.headers.get("Server", "").startswith("cloudflare")
47-
and b"jschl_vc" in resp.content
48-
and b"jschl_answer" in resp.content
49-
):
50-
return self.solve_cf_challenge(resp, **kwargs)
51-
52-
# Otherwise, no Cloudflare anti-bot detected
65+
if self.is_cloudflare_challenge(resp):
66+
resp = self.solve_cf_challenge(resp, **kwargs)
67+
if self.is_cloudflare_challenge(resp):
68+
raise ValueError(ANSWER_ACCEPT_ERROR)
69+
5370
return resp
5471

5572
def solve_cf_challenge(self, resp, **original_kwargs):
@@ -111,33 +128,32 @@ def solve_challenge(self, body):
111128

112129
# Use vm.runInNewContext to safely evaluate code
113130
# The sandboxed code cannot use the Node.js standard library
114-
js = "return require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000});" % js
131+
js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js
115132

116133
try:
117-
node = execjs.get("Node")
118-
except Exception:
119-
raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape"
120-
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
121-
122-
try:
123-
result = node.exec_(js)
134+
result = subprocess.check_output(["node", "-e", js]).strip()
135+
except OSError as e:
136+
if e.errno == 2:
137+
raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape"
138+
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
139+
raise
124140
except Exception:
125141
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
126142
raise
127143

128144
try:
129145
result = int(result)
130146
except Exception:
131-
raise ValueError("Cloudflare IUAM challenge returned unexpected value. %s" % BUG_REPORT)
147+
raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT)
132148

133149
return result
134150

135151
@classmethod
136152
def create_scraper(cls, sess=None, **kwargs):
137153
"""
138-
Convenience function for creating a ready-to-go requests.Session (subclass) object.
154+
Convenience function for creating a ready-to-go CloudflareScraper object.
139155
"""
140-
scraper = cls()
156+
scraper = cls(**kwargs)
141157

142158
if sess:
143159
attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]

setup.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
1+
import os
2+
import re
13
from setuptools import setup
24

5+
base_path = os.path.dirname(__file__)
6+
7+
with open(os.path.join(base_path, 'cfscrape', '__init__.py')) as fp:
8+
VERSION = re.compile(r'.*__version__ = "(.*?)"',
9+
re.S).match(fp.read()).group(1)
10+
311
setup(
412
name = 'cfscrape',
513
packages = ['cfscrape'],
6-
version = '1.9.3',
14+
version = VERSION,
715
description = 'A simple Python module to bypass Cloudflare\'s anti-bot page. See https://github.com/Anorov/cloudflare-scrape for more information.',
816
author = 'Anorov',
917
author_email = '[email protected]',

0 commit comments

Comments
 (0)