From d5df7183af359ea1ffab91dcbf956f7e07eb1616 Mon Sep 17 00:00:00 2001 From: Stephane Bortzmeyer Date: Sat, 11 Nov 2017 20:18:38 +0800 Subject: [PATCH] Minimum documentation --- INSTALL.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ README.md | 4 ++++ 2 files changed, 48 insertions(+) create mode 100644 INSTALL.md diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..83b73d2 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,44 @@ +# Installing and running block-crawler + +Pre-requisites +-------------- + +You need Node.js (version >= 8) and the npm package manager. + +Installing dependencies +----------------------- + +npm install + +Running +------- + +Simplest run: + +node index.js http://starting.point.example/ + +(The Node.js executable may be named "nodejs" on your system) + +Running with a collector: + +node index.js --collector https://collector.example/ http://starting.point.example/ + +The collector has to be able to receive POST results and do something +with them. A very limited collector in Python+WSGI is: + +def store(start_response, environ): + fileo = open("/var/storage/store.log", 'a') + status = '200 OK' + data = environ['wsgi.input'].read() + fileo.write(data) + fileo.close() + output = "Stored %i bytes\n" % len(data) + response_headers = [('Content-Type', 'text/plain'), + ('Content-Length', str(len(output)))] + start_response(status, response_headers) + return [output] + +The results (only the HTTP errors) will appear in JSON format in +/var/storage/store.log, for instance: + +{"date":"2017-11-11T12:10:07.314Z","creator":"block-crawler","version":"0.1","url":"http://httpstat.us/451","status":451,"statusText":"Unavailable For Legal Reasons"} diff --git a/README.md b/README.md index 6ff972e..cfbfe7b 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,10 @@ Because HTTP 451 is typically used to 'geoblock' content, it is expected that va Results are produced in a simple streaming JSON annotation format which identifies the affected URL, observed status code and status text and optional blocking entity. A single report entity identifies a one HTTP request at a specific point in time observed from a single IP address. +## Installing and running it + +See INSTALL.md + ## Status and contributor guidelines This tool is under development and not yet recommended for use in production.