|
1 | 1 | import logging
|
2 | 2 | import random
|
3 | 3 | import re
|
4 |
| -from requests.sessions import Session |
| 4 | +import subprocess |
5 | 5 | from copy import deepcopy
|
6 | 6 | from time import sleep
|
7 | 7 |
|
8 |
| -import execjs |
| 8 | +from requests.sessions import Session |
9 | 9 |
|
10 | 10 | try:
|
11 | 11 | from urlparse import urlparse
|
12 | 12 | except ImportError:
|
13 | 13 | from urllib.parse import urlparse
|
14 | 14 |
|
15 |
| -__version__ = "1.9.3" |
| 15 | +__version__ = "1.9.4" |
16 | 16 |
|
17 | 17 | DEFAULT_USER_AGENTS = [
|
18 | 18 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
|
|
24 | 24 |
|
25 | 25 | DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
|
26 | 26 |
|
27 |
| -BUG_REPORT = ("Cloudflare may have changed their technique, or there may be a bug in the script.\n\nPlease read " "https://github.com/Anorov/cloudflare-scrape#updates, then file a " |
28 |
| -"bug report at https://github.com/Anorov/cloudflare-scrape/issues.") |
| 27 | +BUG_REPORT = """\ |
| 28 | +Cloudflare may have changed their technique, or there may be a bug in the script. |
| 29 | +
|
| 30 | +Please read https://github.com/Anorov/cloudflare-scrape#updates, then file a \ |
| 31 | +bug report at https://github.com/Anorov/cloudflare-scrape/issues.\ |
| 32 | +""" |
29 | 33 |
|
| 34 | +ANSWER_ACCEPT_ERROR = """\ |
| 35 | +The challenge answer was not properly accepted by Cloudflare. This can occur if \ |
| 36 | +the target website is under heavy load, or if Cloudflare is experiencing issues. You can \ |
| 37 | +potentially resolve this by increasing the challenge answer delay (default: 5 seconds). \ |
| 38 | +For example: cfscrape.create_scraper(delay=10) |
| 39 | +
|
| 40 | +If increasing the delay does not help, please open a GitHub issue at \ |
| 41 | +https://github.com/Anorov/cloudflare-scrape/issues\ |
| 42 | +""" |
30 | 43 |
|
31 | 44 | class CloudflareScraper(Session):
|
32 | 45 | def __init__(self, *args, **kwargs):
|
33 |
| - self.delay = kwargs.pop('delay', 5) |
34 |
| - |
| 46 | + self.delay = kwargs.pop("delay", 5) |
35 | 47 | super(CloudflareScraper, self).__init__(*args, **kwargs)
|
36 | 48 |
|
37 | 49 | if "requests" in self.headers["User-Agent"]:
|
38 |
| - # Spoof Firefox on Linux if no custom User-Agent has been set |
| 50 | + # Set a random User-Agent if no custom User-Agent has been set |
39 | 51 | self.headers["User-Agent"] = DEFAULT_USER_AGENT
|
40 | 52 |
|
| 53 | + def is_cloudflare_challenge(self, resp): |
| 54 | + return ( |
| 55 | + resp.status_code == 503 |
| 56 | + and resp.headers.get("Server", "").startswith("cloudflare") |
| 57 | + and b"jschl_vc" in resp.content |
| 58 | + and b"jschl_answer" in resp.content |
| 59 | + ) |
| 60 | + |
41 | 61 | def request(self, method, url, *args, **kwargs):
|
42 | 62 | resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
|
43 | 63 |
|
44 | 64 | # Check if Cloudflare anti-bot is on
|
45 |
| - if ( resp.status_code == 503 |
46 |
| - and resp.headers.get("Server", "").startswith("cloudflare") |
47 |
| - and b"jschl_vc" in resp.content |
48 |
| - and b"jschl_answer" in resp.content |
49 |
| - ): |
50 |
| - return self.solve_cf_challenge(resp, **kwargs) |
51 |
| - |
52 |
| - # Otherwise, no Cloudflare anti-bot detected |
| 65 | + if self.is_cloudflare_challenge(resp): |
| 66 | + resp = self.solve_cf_challenge(resp, **kwargs) |
| 67 | + if self.is_cloudflare_challenge(resp): |
| 68 | + raise ValueError(ANSWER_ACCEPT_ERROR) |
| 69 | + |
53 | 70 | return resp
|
54 | 71 |
|
55 | 72 | def solve_cf_challenge(self, resp, **original_kwargs):
|
@@ -111,33 +128,32 @@ def solve_challenge(self, body):
|
111 | 128 |
|
112 | 129 | # Use vm.runInNewContext to safely evaluate code
|
113 | 130 | # The sandboxed code cannot use the Node.js standard library
|
114 |
| - js = "return require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000});" % js |
| 131 | + js = "console.log(require('vm').runInNewContext('%s', Object.create(null), {timeout: 5000}));" % js |
115 | 132 |
|
116 | 133 | try:
|
117 |
| - node = execjs.get("Node") |
118 |
| - except Exception: |
119 |
| - raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape" |
120 |
| - " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.") |
121 |
| - |
122 |
| - try: |
123 |
| - result = node.exec_(js) |
| 134 | + result = subprocess.check_output(["node", "-e", js]).strip() |
| 135 | + except OSError as e: |
| 136 | + if e.errno == 2: |
| 137 | + raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape" |
| 138 | + " README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.") |
| 139 | + raise |
124 | 140 | except Exception:
|
125 | 141 | logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
|
126 | 142 | raise
|
127 | 143 |
|
128 | 144 | try:
|
129 | 145 | result = int(result)
|
130 | 146 | except Exception:
|
131 |
| - raise ValueError("Cloudflare IUAM challenge returned unexpected value. %s" % BUG_REPORT) |
| 147 | + raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT) |
132 | 148 |
|
133 | 149 | return result
|
134 | 150 |
|
135 | 151 | @classmethod
|
136 | 152 | def create_scraper(cls, sess=None, **kwargs):
|
137 | 153 | """
|
138 |
| - Convenience function for creating a ready-to-go requests.Session (subclass) object. |
| 154 | + Convenience function for creating a ready-to-go CloudflareScraper object. |
139 | 155 | """
|
140 |
| - scraper = cls() |
| 156 | + scraper = cls(**kwargs) |
141 | 157 |
|
142 | 158 | if sess:
|
143 | 159 | attrs = ["auth", "cert", "cookies", "headers", "hooks", "params", "proxies", "data"]
|
|
0 commit comments