diff --git a/.github/workflows/docgen.yaml b/.github/workflows/docgen.yaml deleted file mode 100644 index 8b2396a4..00000000 --- a/.github/workflows/docgen.yaml +++ /dev/null @@ -1,30 +0,0 @@ -name: Generate documentation -on: [push, pull_request] - -permissions: - contents: write - -jobs: - build: - name: Docgen - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - os: - - debian-bullseye - - steps: - - uses: actions/checkout@v1 - - name: Install deps - run: sudo apt-get -y --no-install-recommends install python3-markdown python3-blockdiag - - name: Run docgen - run: ./build_docs.py - - name: Upload to github pages - if: success() - uses: crazy-max/ghaction-github-pages@v1 - with: - target_branch: gh-pages - build_dir: build_docs_output - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/LICENSE.md b/LICENSE.md index 9249ef41..a5f7f552 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,5 @@ -Copyright 2019 Open Observatory of Network Interference (OONI), The Tor Project +Copyright 2020-2024 Open Observatory of Network Interference (OONI) +Copyright 2012-2019 Open Observatory of Network Interference (OONI), The Tor Project Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index 27fef5d9..94543e06 100644 --- a/README.md +++ b/README.md @@ -1,242 +1,6 @@ # OONI backend -Welcome. This document describes the architecture of the main components of the -OONI infrastructure. +Welcome to the OONI backend! -The documentation is meant for core contributors, external contributors and researcher -that want to extract data or reuse software components in their own projects. 
- -This file is [rendered here](https://ooni.github.io/pipeline/README.html) - -You can also explore the [documentation tree](https://ooni.github.io/pipeline/) - -## Table of contents - -[TOC] - -## Architecture - -The backend infrastructure provides multiple functions: - -* Provide APIs for data consumers -* Instruct probes on what measurements to perform -* Receive measurements from probes, process them and store them in the database and on S3 - -## Data flow - -This diagram represent the main flow of measurement data - - -blockdiag { - Probes [color = "#ffeeee"]; - Explorer [color = "#eeeeff"]; - "S3 jsonl" [shape = ellipse]; - "S3 postcan" [shape = ellipse]; - "DB jsonl tbl" [shape = ellipse]; - "DB fastpath tbl" [shape = ellipse]; - "disk queue" [shape = ellipse]; - - Probes -> "API: Probe services" -> "Fastpath" -> "DB fastpath tbl" -> "API: Measurements" -> "Explorer"; - "API: Probe services" -> "disk queue" -> "API: uploader" -> "S3 jsonl" -> "API: Measurements"; - "API: uploader" -> "S3 postcan"; - "API: uploader" -> "DB jsonl tbl"; - "DB jsonl tbl" -> "API: Measurements" -} - - -Each measurement is processed individually in real time. - - -## Components: API - -The API entry points are documented at [apidocs](https://api.ooni.io/apidocs/) - -### Measurements - -Provide access to measurements to end users directly and through Explorer. - -Mounted under /api/v1/measurement/ - -The API is versioned. Access is rate limited based on source IP address and access tokens -due to the computational cost of running heavy queries on the database. - -[Sources](https://github.com/ooni/api/blob/master/newapi/ooniapi/probe_services.py) - -### Probe services - -Serves lists of collectors and test helpers to the probes and receive measurements from them. - -Mounted under /api/v1/ - -[Sources](https://github.com/ooni/api/blob/master/newapi/ooniapi/probe_services.py) - -### Private entry points - -Not for public consumption. 
Mounted under `/api/_` and used exclusively by Explorer - -[Sources](https://github.com/ooni/api/blob/master/newapi/ooniapi/private.py) - -## Fastpath - -[Documentation](af/fastpath/fastpath/core.html) - -## Database - -## Operations - -### Build, deploy, rollback - -Host deployments are done with the [sysadmin repo](https://github.com/ooni/sysadmin) - -For component updates a deployment pipeline is used: - -Look at the [Status dashboard](https://github.com/ooni/backend/wiki/Backend) - be aware of badge image caching - -Use the deploy tool: - -```bash -# Update all badges: -dep refresh_badges - -# Show status -dep - -# Deploy/rollback a given version on the "test" stage -deploy ooni-api test 0.6~pr194-147 - -# Deploy latest build on the first stage -deploy ooni-api - -# Deploy latest build on a given stage -deploy ooni-api prod - -``` - -### Adding new tests - -Update [database_upgrade_schema](https://github.com/ooni/pipeline/blob/master/af/fastpath/database_upgrade_schema.py) - -``` -ALTER TYPE ootest ADD VALUE ''; -``` - -Update [fastpath](https://github.com/ooni/pipeline/blob/master/af/fastpath/fastpath/core.py) -by adding a new test to the `score_measurement` function and adding relevant -integration tests. - -Create a [Pull Request](https://github.com/ooni/pipeline/compare) - -Run fastpath manually from S3 on the testing stage see: [rerun fastpath manually](#rerun-fastpath-manually) - -Update the [api](https://github.com/ooni/api/blob/master/newapi/ooniapi/measurements.py#L491) - -### Adding new fingerprints - -TODO - -### API runbook - -Monitor the [API](https://mon.ooni.nu/grafana/d/CkdDBscGz/ams-pg-api?orgId=1) and -[fastpath](https://mon.ooni.nu/grafana/d/75nnWVpMz/fastpath-ams-pg?orgId=1) dashboards. 
- -Follow Nginx or API logs with: -```bash -sudo journalctl -f -u nginx --no-hostname -# The API logs contain SQL queries, exceptions etc -sudo journalctl -f --identifier gunicorn3 --no-hostname -``` - -### Fastpath runbook - -#### Manual deployment - -```bash -ssh -sudo apt-get update -apt-cache show fastpath | grep Ver | head -n5 -sudo apt-get install fastpath -``` - -#### Restart -`sudo systemctl restart fastpath` - -#### Rerun fastpath manually - -Run as fastpath user: - -```bash -ssh -sudo sudo -u fastpath /bin/bash -cd -``` - -```bash -fastpath --help -# rerun without overwriting files on disk nor writing to database: -fastpath --start-day 2016-05-13 --end-day 2016-05-14 --stdout --no-write-msmt --no-write-to-db -# rerun without overwriting files on disk: -fastpath --start-day 2016-05-13 --end-day 2016-05-14 --stdout --no-write-msmt -# rerun and overwrite: -fastpath --start-day 2016-05-13 --end-day 2016-05-14 --stdout --update -``` - -The fastpath will pull cans from S3. -The daemon (doing real-time processing) can keep running in the meantime. - -[Progress chart](https://mon.ooni.nu/prometheus/new/graph?g0.expr=netdata_statsd_gauge_fastpath_s3feeder_s3_download_percentage_value_average%7Bdimension%3D%22gauge%22%7D&g0.tab=0&g0.stacked=1&g0.range_input=2h&g1.expr=netdata_statsd_gauge_fastpath_load_s3_reports_remaining_files_value_average%7Bdimension%3D%22gauge%22%7D&g1.tab=0&g1.stacked=1&g1.range_input=1h) -#### Log monitoring - -```bash -sudo journalctl -f -u fastpath -``` - -#### Monitoring dashboard - -[https://mon.ooni.nu/grafana/d/75nnWVpMz/fastpath-ams-pg?orgId=1&refresh=5m&from=now-7d&to=now](https://mon.ooni.nu/grafana/d/75nnWVpMz/fastpath-ams-pg?orgId=1&refresh=5m&from=now-7d&to=now) - -### Analysis runbook - -The Analysis tool runs a number of systemd timers to monitor the slow query summary and more. 
-See https://github.com/ooni/pipeline/blob/master/af/analysis/analysis/analysis.py - -#### Manual deployment - -``` -ssh -sudo apt-get update -apt-cache show analysis | grep Ver | head -n5 -sudo apt-get install analysis= -``` - -#### Run manually -``` -sudo systemctl restart ooni-update-counters.service -``` - -#### Log monitoring - -``` -sudo journalctl -f --identifier analysis -``` - -#### Monitoring dashboard - -[https://mon.ooni.nu/grafana/d/75nnWVpMz/fastpath-ams-pg?orgId=1&refresh=5m&from=now-7d&to=now](https://mon.ooni.nu/grafana/d/75nnWVpMz/fastpath-ams-pg?orgId=1&refresh=5m&from=now-7d&to=now) - -### Deploy new host - -Deploy host from https://cloud.digitalocean.com/projects/ - -Create DNS "A" record `.ooni.org` at https://ap.www.namecheap.com/ - -On the sysadmin repo, ansible directory, add the host to the inventory - -Run the deploy with the root SSH user -``` -./play deploy-.yml -l .ooni.org --diff -u root -``` - -Update prometheus -``` -./play deploy-prometheus.yml -t prometheus-conf --diff -``` +For up-to-date documentation about the backend, see: +https://docs.ooni.org/backend diff --git a/af/.gitignore b/af/.gitignore deleted file mode 100644 index 8f2b6cc8..00000000 --- a/af/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -# TMP -airflow/dumb-init_1.2.0_amd64.deb -shovel/*.flame -shovel/*.svg -shovel/bridge_db.json -useless-test-dags/ diff --git a/af/oometa/000-A-versioning.install.sql b/af/oometa/000-A-versioning.install.sql deleted file mode 120000 index 3e42170f..00000000 --- a/af/oometa/000-A-versioning.install.sql +++ /dev/null @@ -1 +0,0 @@ -../../contrib/Versioning/install.versioning.sql \ No newline at end of file diff --git a/af/oometa/000-A-versioning.rollback.sql b/af/oometa/000-A-versioning.rollback.sql deleted file mode 120000 index ff238f57..00000000 --- a/af/oometa/000-A-versioning.rollback.sql +++ /dev/null @@ -1 +0,0 @@ -../../contrib/Versioning/uninstall.versioning.sql \ No newline at end of file diff --git 
a/af/oometa/000-init.install.sql b/af/oometa/000-init.install.sql deleted file mode 100644 index 10088f07..00000000 --- a/af/oometa/000-init.install.sql +++ /dev/null @@ -1,238 +0,0 @@ -BEGIN; - -select _v.register_patch( '000-init', NULL, NULL ); - --- Everything goes to `public` schema. - -create domain sha1 as bytea check (octet_length(value) = 20); - -create domain size4 as int4 check (value >= 0); - --- Metadata of `canned` is not recorded as it's useful only for some --- statistics that can be computed without postgres and, moreover, --- `canned` is not `public` data. - -create table autoclaved ( - autoclaved_no serial primary key, - filename text unique not null, - bucket_date date not null, - code_ver int4 not null, - file_size size4 not null, - file_crc32 int4 not null, - file_sha1 sha1 not null, - CHECK(substring(filename from 1 for 11) = (bucket_date || '/')) -); - -comment on column autoclaved.filename is 'Name of compressed blob relative to …/autoclaved like `2017-01-01/facebook_messenger.0.tar.lz4`'; -comment on column autoclaved.code_ver is 'Version of code processed this autoclaved file (used for partial updates)'; - -create type ootest as enum ( - 'web_connectivity', - 'http_requests', - 'dns_consistency', - 'http_invalid_request_line', - 'bridge_reachability', - 'tcp_connect', - 'http_header_field_manipulation', - 'http_host', - 'multi_protocol_traceroute', - 'meek_fronted_requests_test', - 'whatsapp', - 'vanilla_tor', - 'facebook_messenger', - 'ndt' -); - -create table software ( - software_no serial primary key, - test_name text not null, - test_version text not null, - software_name text not null, - software_version text not null, - UNIQUE (test_name, test_version, software_name, software_version) -); - --- Some report fields are ignored while filling `report` table. Some of them --- are not filled in modern reports (or have same value): probe_city, --- backend_version, data_format_version. 
Other have questionable value for --- indexing and aggregation (may be added later): input_hashes (list of --- hashes), options (CLI), test_helpers. `test_keys` are expanded to features --- in different tables. - -create sequence report_no_seq; - -create table report ( - report_no int4 not null primary key, - autoclaved_no int4 references autoclaved, - test_start_time timestamp without time zone not null, -- XXX: may differ from `textname` datetime due to TZ - probe_cc char(2) not null, -- 'ZZ' is replacement for NULL - probe_asn int4 not null, -- 0 is replacement for NULL, was string 'ASxxxx' before - probe_ip inet null, -- "127.0.0.1" becomes NULL - test_name ootest null, -- NULL for tests without known parsed metadata - badtail size4 null, - textname text unique not null,-- AKA `report_filename` - orig_sha1 sha1 not null, - report_id text unique not null,-- "20170129T000004Z_AS0_JjsPsc…AJZA" OR urlsafe_base64(orig_sha1), we don't know if it's UNIQUE, but we hope that it is :) - software_no int4 references software - -- CHECK(substring(textname from 1 for 11) = (bucket_date || '/')) -); - -comment on column report.textname is 'Name of original reports-raw file like `2017-01-01/20170101T000056Z-ZZ-AS0-facebook_messenger-20170101T000013Z_AS0_a6dK…JVMO-0.1.0-probe.yaml`'; - -create table report_meta ( - report_no int4 not null default nextval('report_no_seq') primary key, - autoclaved_no int4 not null, -- `references` is skipped to speedup import - badtail size4 null, - textname text unique not null,-- AKA `report_filename` - orig_sha1 sha1 not null -); - -comment on table report_meta is 'For `CREATE TABLE LIKE …` while loading'; - -create table report_blob ( - report_no int4 not null primary key, - test_start_time timestamp without time zone not null, - probe_cc char(2) not null, - probe_asn int4 not null, - probe_ip inet null, - test_name ootest null, - report_id text null, - software_no int4 not null -); - -comment on table report_blob is 'For `CREATE TABLE LIKE 
…` while loading'; - -create sequence msm_no_seq; - --- badblob `src_off` and `src_size` -- offsets within reports-raw file from --- canned blob are not recorded here as canned files are not recorded here. --- The reason to record badblob is preservation of `msm_no` on re-generation --- of measurements. - -create table badblob ( - msm_no int4 not null primary key, - report_no int4 references report, - orig_sha1 sha1 not null -); - --- `frame_off` and `frame_size` do not deserve separate relation as --- de-duplication will save almost nothing: 8 bytes per measurement become --- 4 bytes per measurement + 12 bytes per frame (without indexes). That reduces --- 770 kb of metadata down to 600 kb of frame-related(!) metadata for 2017-01-01 - -create table input ( - input_no serial not null primary key, - input text -); - -create table measurement ( - msm_no int4 not null primary key, - report_no int4 references report, - frame_off size4 not null, - frame_size size4 not null, - intra_off size4 not null, - intra_size size4 not null, - measurement_start_time timestamp without time zone null, - test_runtime real null, - orig_sha1 sha1 not null, - id uuid not null, - input_no int4 null references input -); - -comment on column measurement.frame_off is 'Offset within autoclaved.filename'; - -create table measurement_meta ( - msm_no int4 not null default nextval('msm_no_seq') primary key, - report_no int4 not null, -- `references` is skipped to speedup import - frame_off size4 null, - frame_size size4 null, - intra_off size4 null, - intra_size size4 null, - orig_sha1 sha1 not null -); - -comment on table measurement_meta is 'For `CREATE TABLE LIKE …` while loading, NULL frame goes to `badblob`'; - -create table measurement_blob ( - msm_no int4 not null primary key, - measurement_start_time timestamp without time zone null, - test_runtime real null, - id uuid not null, - input_no int4 null -); - -comment on table measurement_blob is 'For `CREATE TABLE LIKE …` while loading'; - 
-create table domain ( - domain_no serial primary key, - domain text unique not null -); - -comment on column domain.domain is 'With trailing dot stripped'; - --- + facebook_messenger --- + web_connectivity --- + whatsapp --- - http_requests --- - tcp_connect --- ? bridge_reachability --- ? dns_consistency --- ? http_header_field_manipulation --- ? http_host --- ? http_invalid_request_line --- ? meek_fronted_requests_test --- ? multi_protocol_traceroute --- ? ndt --- ? vanilla_tor -create table tcp ( - msm_no int4 references measurement, - ip inet not null, - port int4 not null, -- int4 because of lack of uint2 - control_failure text null, - test_failure text null -); - -create table dns_a ( - msm_no int4 references measurement, - domain_no int4 references domain, - control_ip inet[] null, -- facebook_messenger and whatsapp have no control - test_ip inet[] null -); - -create table http_control ( - msm_no int4 references measurement, - is_tor boolean not null, -- true for http_requests, false for web_connectivity - failure text null, - status_code int2 null, - body_length size4 null, - title text null, - headers jsonb null -); - -create table http_request ( - msm_no int4 references measurement, - url text not null, - failure text null, - status_code int2 null, - body_length size4 null, - title text null, - headers jsonb null -); - --- TODO: re-align to save some (?) space -create table http_verdict ( - msm_no int4 references measurement, - accessible boolean null, - control_failure text null, -- e.g. 
socks_ttl_expired - http_experiment_failure text null, - title_match boolean null, - dns_consistency text null, - dns_experiment_failure text null, - body_proportion real null, - blocking text null, - body_length_match boolean null, - headers_match boolean null, - status_code_match boolean null -); - -COMMIT; diff --git a/af/oometa/000-init.rollback.sql b/af/oometa/000-init.rollback.sql deleted file mode 100644 index 29e3a8aa..00000000 --- a/af/oometa/000-init.rollback.sql +++ /dev/null @@ -1,27 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '000-init' ); - -drop table http_verdict; -drop table http_request; -drop table http_control; -drop table dns_a; -drop table tcp; -drop table domain; -drop table measurement_blob; -drop table measurement_meta; -drop table measurement; -drop table input; -drop table badblob; -drop sequence msm_no_seq; -drop table report_blob; -drop table report_meta; -drop table report; -drop sequence report_no_seq; -drop table software; -drop type ootest; -drop table autoclaved; -drop domain size4; -drop domain sha1; - -COMMIT; diff --git a/af/oometa/000-originas.install.sql b/af/oometa/000-originas.install.sql deleted file mode 100644 index ea6c2d9f..00000000 --- a/af/oometa/000-originas.install.sql +++ /dev/null @@ -1,16 +0,0 @@ -BEGIN; - -select _v.register_patch( '000-originas', NULL, NULL ); - --- Everything goes to `public` schema. 
- -create table originas ( - origin cidr not null, - asn int4 not null -); - -create index on originas using gist (origin inet_ops); - -comment on table originas is 'source is http://archive.routeviews.org/dnszones/originas.bz2'; - -COMMIT; diff --git a/af/oometa/000-originas.rollback.sql b/af/oometa/000-originas.rollback.sql deleted file mode 100644 index a2bf1be9..00000000 --- a/af/oometa/000-originas.rollback.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '000-originas' ); - -drop table originas; - -COMMIT; diff --git a/af/oometa/001-fix-input-uniq.install.sql b/af/oometa/001-fix-input-uniq.install.sql deleted file mode 100644 index 48dbc9e5..00000000 --- a/af/oometa/001-fix-input-uniq.install.sql +++ /dev/null @@ -1,23 +0,0 @@ -BEGIN; - -select _v.register_patch( '001-fix-input-uniq', ARRAY[ '000-init' ], NULL ); - --- `delete...` takes ages if online constraint check is performed -alter table measurement drop constraint measurement_input_no_fkey; - -create temporary table inputdup as select * from (select min(input_no) as target, unnest(array_agg(input_no)) as dup from input group by input having count(*) > 1) as T1 where target != dup; - -update measurement set input_no = target from inputdup where input_no = dup; - -delete from input where input_no in (select dup from inputdup); - --- bring dropped constraint back -alter table only measurement add constraint measurement_input_no_fkey foreign key (input_no) references input(input_no); - -alter table only input add constraint input_input_key unique (input); - -COMMIT; - --- vacuum has to be done after commit -vacuum full input; -vacuum full measurement; diff --git a/af/oometa/001-fix-input-uniq.rollback.sql b/af/oometa/001-fix-input-uniq.rollback.sql deleted file mode 100644 index c0bcccb6..00000000 --- a/af/oometa/001-fix-input-uniq.rollback.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '001-fix-input-uniq' ); - -alter table input drop constraint 
input_input_key; - -COMMIT; diff --git a/af/oometa/001-table-comments.install.sql b/af/oometa/001-table-comments.install.sql deleted file mode 100644 index da20c1bf..00000000 --- a/af/oometa/001-table-comments.install.sql +++ /dev/null @@ -1,23 +0,0 @@ -BEGIN; - -select _v.register_patch( '001-table-comments', ARRAY[ '000-init' ], NULL ); - --- Everything goes to `public` schema. - --- Comments on tables visually separate "persistent" table in `\dt+` output --- from "temporary" tables created while doing exploratory data analysis. - -comment on table autoclaved is 'Metadata table for public/autoclaved files'; -comment on table software is 'External dictionary of known ooniprobe tests implementations and versions to shrink `report` table'; -comment on table report is 'Metadata for reports loosely corresponding to private/reports-raw files'; -comment on table badblob is 'Metadata of malformed measurements blobs including truncated reports'; -comment on table input is 'External dictionary of known test inputs to shrink `measurement` table'; -comment on table measurement is 'Metadata for measurements stored within reports'; -comment on table domain is 'External dictionary of known DNS domains to shrink `dns_a` table'; -comment on table tcp is 'Features: state of TCP connect() for specific IP and port'; -comment on table dns_a is 'Features: DNS resolution of `A` query'; -comment on table http_control is 'Features: HTTP response got through "control" vantage point'; -comment on table http_request is 'Features: HTTP response got through "test" vantage point'; -comment on table http_verdict is 'Features: decision of ooniprobe regarding HTTP response'; - -COMMIT; diff --git a/af/oometa/001-table-comments.rollback.sql b/af/oometa/001-table-comments.rollback.sql deleted file mode 100644 index d705eb65..00000000 --- a/af/oometa/001-table-comments.rollback.sql +++ /dev/null @@ -1,18 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '001-table-comments' ); - -comment on table 
autoclaved is NULL; -comment on table software is NULL; -comment on table report is NULL; -comment on table badblob is NULL; -comment on table input is NULL; -comment on table measurement is NULL; -comment on table domain is NULL; -comment on table tcp is NULL; -comment on table dns_a is NULL; -comment on table http_control is NULL; -comment on table http_request is NULL; -comment on table http_verdict is NULL; - -COMMIT; diff --git a/af/oometa/002-features.install.sql b/af/oometa/002-features.install.sql deleted file mode 100644 index 729b4851..00000000 --- a/af/oometa/002-features.install.sql +++ /dev/null @@ -1,102 +0,0 @@ -BEGIN; - -select _v.register_patch( '002-features', ARRAY[ '001-fix-input-uniq' ], NULL ); - --- Everything goes to `public` schema. - --- NB: postgresql can't place new columns into arbitrary place. -alter table dns_a - alter column msm_no set not null, - alter column domain_no set not null, - add column control_cname int4[], - add column test_cname int4[], - add column ttl int4, - add column resolver_hostname inet, - add column client_resolver inet, - add column control_failure text, - add column test_failure text -; -comment on column dns_a.control_cname is 'Reference to `domain` table. Lack of CNAME is NULL, not zero-length array'; -comment on column dns_a.test_cname is 'Reference to `domain` table. 
Lack of CNAME is NULL, not zero-length array'; -comment on column dns_a.ttl is 'The first TTL of all DNS Resource Records'; -comment on column dns_a.resolver_hostname is 'Internal resolver IP according to ~ /etc/resolv.conf'; -comment on column dns_a.client_resolver is 'External resolver IP according to ~ whoami.akamai.net'; - -create table dns_a_tpl ( - msm_no int4 not null, - domain text not null, - control_ip inet[], - test_ip inet[], - control_cname text[], - test_cname text[], - ttl int4, - resolver_hostname inet, - client_resolver inet, - control_failure text, - test_failure text -); -comment on table dns_a_tpl is 'For `CREATE TABLE LIKE …` while loading'; - -alter table tcp add column control_api_failure text; -comment on column tcp.control_failure is 'per-address "failure" field for the experiment'; -comment on column tcp.control_api_failure is '"control_failure" field of the measurement'; - -alter table measurement - add column exc int4[], -- NULL == no errors, empty array takes space :) - add column residual_no int4 not null; -- most of residual schemas should be quite common - -alter table measurement_blob - drop column input_no, - add column input text, -- `input->input_no` mapping is too large to fit in RAM, so it's done at PG side - add column exc int4[], -- 32bit MurMurhash3 of traces (to fix the most popluar ones first) - add column residual jsonb not null; - -create domain sha256 as bytea check (octet_length(value) = 32); - -alter table http_control add column body_sha256 sha256; - -alter table http_request add column body_sha256 sha256; - -create table http_body_simhash ( - body_sha256 sha256 primary key, - simhash_shi4mm3hi int8 not null, - simhash_shi4mm3lo int8 not null -); -comment on table http_body_simhash is 'Features: simhash values for HTTP bodies'; -comment on column http_body_simhash.simhash_shi4mm3hi is 'body | 4_word_shingles | MurMur3_128_high | simhash64'; -comment on column http_body_simhash.simhash_shi4mm3lo is 'body | 
4_word_shingles | MurMur3_128_low | simhash64'; - -create extension pgcrypto; -create table residual ( - residual_no serial primary key, - residual jsonb not null -); --- I have not managed to write it as CONSTRAINT, so it's UNIQUE INDEX. --- sha256 is used because of long residual values: --- ERROR: index row size 3008 exceeds maximum 2712 for index "residual_residual_key" --- HINT: Values larger than 1/3 of a buffer page cannot be indexed. --- Consider a function index of an MD5 hash of the value, or use full text indexing. -create unique index residual_residual_sha256_key on residual (digest(residual::text, 'sha256')); -comment on table residual is 'Debug: pseudo-schema of json leftovers remaining after centrifugation'; - -create table badmeta ( - autoclaved_no int4 not null, - report_no int4, -- MAYBE references `report` - textname text, -- used to lookup `report_filename` if report_no is NULL - exc_report int4[] not null, - exc_measurement int4[] not null, - CHECK(report_no IS NOT NULL OR textname IS NOT NULL) -); -comment on table badmeta is 'Debug: accounting exceptions while handling metadata during centrifugation'; - -create table badmeta_tpl ( - autoclaved_no int4 not null, - report_no int4 not null, - textname text not null, - exc_report int4 null, - exc_measurement int4 null - CHECK(exc_report IS NOT NULL OR exc_measurement IS NOT NULL) -); -comment on table badmeta_tpl is 'For `CREATE TABLE LIKE …` while loading'; - -COMMIT; diff --git a/af/oometa/002-features.rollback.sql b/af/oometa/002-features.rollback.sql deleted file mode 100644 index 4df32f58..00000000 --- a/af/oometa/002-features.rollback.sql +++ /dev/null @@ -1,20 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '002-features' ); - -drop table badmeta_tpl; -drop table badmeta; -drop extension pgcrypto; -drop table residual; -drop table http_body_simhash; -alter table http_request drop column body_sha256; -alter table http_control drop column body_sha256; -drop domain sha256; -alter table 
measurement_blob drop column input, drop column exc, drop column residual, add column input_no integer; -alter table measurement drop column exc, drop column residual_no; -comment on column tcp.control_failure is NULL; -alter table tcp drop column control_api_failure; -drop table dns_a_blob; -alter table dns_a drop column control_cname, drop column test_cname; - -COMMIT; diff --git a/af/oometa/002-fix-foreign-key.install.sql b/af/oometa/002-fix-foreign-key.install.sql deleted file mode 100644 index fa74f477..00000000 --- a/af/oometa/002-fix-foreign-key.install.sql +++ /dev/null @@ -1,21 +0,0 @@ -BEGIN; - -select _v.register_patch( '002-fix-foreign-key', ARRAY[ '001-fix-input-uniq' ], NULL ); - --- FK CONSTRAINTs are implemented as triggers that are painfully slow during --- batch import taking huge amount of CPU. It's not trivial to disable/enable --- these constraints in code, so I'm dropping them altogether. - -ALTER TABLE ONLY badblob DROP CONSTRAINT badblob_report_no_fkey; -ALTER TABLE ONLY dns_a DROP CONSTRAINT dns_a_domain_no_fkey; -ALTER TABLE ONLY dns_a DROP CONSTRAINT dns_a_msm_no_fkey; -ALTER TABLE ONLY http_control DROP CONSTRAINT http_control_msm_no_fkey; -ALTER TABLE ONLY http_request DROP CONSTRAINT http_request_msm_no_fkey; -ALTER TABLE ONLY http_verdict DROP CONSTRAINT http_verdict_msm_no_fkey; -ALTER TABLE ONLY measurement DROP CONSTRAINT measurement_input_no_fkey; -ALTER TABLE ONLY measurement DROP CONSTRAINT measurement_report_no_fkey; -ALTER TABLE ONLY report DROP CONSTRAINT report_autoclaved_no_fkey; -ALTER TABLE ONLY report DROP CONSTRAINT report_software_no_fkey; -ALTER TABLE ONLY tcp DROP CONSTRAINT tcp_msm_no_fkey; - -COMMIT; diff --git a/af/oometa/002-fix-foreign-key.rollback.sql b/af/oometa/002-fix-foreign-key.rollback.sql deleted file mode 100644 index d9f5f473..00000000 --- a/af/oometa/002-fix-foreign-key.rollback.sql +++ /dev/null @@ -1,28 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '001-fix-foreign-key' ); - -ALTER TABLE ONLY 
badblob - ADD CONSTRAINT badblob_report_no_fkey FOREIGN KEY (report_no) REFERENCES report(report_no); -ALTER TABLE ONLY dns_a - ADD CONSTRAINT dns_a_domain_no_fkey FOREIGN KEY (domain_no) REFERENCES domain(domain_no); -ALTER TABLE ONLY dns_a - ADD CONSTRAINT dns_a_msm_no_fkey FOREIGN KEY (msm_no) REFERENCES measurement(msm_no); -ALTER TABLE ONLY http_control - ADD CONSTRAINT http_control_msm_no_fkey FOREIGN KEY (msm_no) REFERENCES measurement(msm_no); -ALTER TABLE ONLY http_request - ADD CONSTRAINT http_request_msm_no_fkey FOREIGN KEY (msm_no) REFERENCES measurement(msm_no); -ALTER TABLE ONLY http_verdict - ADD CONSTRAINT http_verdict_msm_no_fkey FOREIGN KEY (msm_no) REFERENCES measurement(msm_no); -ALTER TABLE ONLY measurement - ADD CONSTRAINT measurement_input_no_fkey FOREIGN KEY (input_no) REFERENCES input(input_no); -ALTER TABLE ONLY measurement - ADD CONSTRAINT measurement_report_no_fkey FOREIGN KEY (report_no) REFERENCES report(report_no); -ALTER TABLE ONLY report - ADD CONSTRAINT report_autoclaved_no_fkey FOREIGN KEY (autoclaved_no) REFERENCES autoclaved(autoclaved_no); -ALTER TABLE ONLY report - ADD CONSTRAINT report_software_no_fkey FOREIGN KEY (software_no) REFERENCES software(software_no); -ALTER TABLE ONLY tcp - ADD CONSTRAINT tcp_msm_no_fkey FOREIGN KEY (msm_no) REFERENCES measurement(msm_no); - -COMMIT; diff --git a/af/oometa/003-fingerprints.install.sql b/af/oometa/003-fingerprints.install.sql deleted file mode 100644 index c2b18dc4..00000000 --- a/af/oometa/003-fingerprints.install.sql +++ /dev/null @@ -1,64 +0,0 @@ -BEGIN; - -select _v.register_patch( '003-fingerprints', ARRAY[ '002-features' ], NULL ); - -create table fingerprint ( - fingerprint_no serial primary key, - origin_cc character(2) NOT NULL, - body_substr text null, - header text null, - header_prefix text null, - header_value text null, - CHECK((body_substr is null) != (header is null)), - CHECK(header is null OR ((header_prefix is null) != (header_value is null))) -); -comment on 
table fingerprint is 'Static set of known fingerprints of blockpages'; - -CREATE TABLE measurement_exc ( - msm_no integer NOT NULL, - exc integer[] NOT NULL -); -comment on table measurement_exc is 'For `CREATE TABLE LIKE …` while updating'; - -create table http_request_fp ( - msm_no integer NOT NULL, - fingerprint_no integer NOT NULL -); -comment on table http_request_fp is 'Features: decision of pipeline using `fingerprint` table'; - -CREATE INDEX http_request_fp_msm_no_idx ON http_request_fp (msm_no); - -COPY fingerprint (origin_cc, body_substr) FROM STDIN; -IR iframe src="http://10.10 -TR Telekomünikasyon İletişim Başkanlığı -GR www.gamingcommission.gov.gr/index.php/forbidden-access-black-list/ -RU http://eais.rkn.gov.ru/ -IN The page you have requested has been blocked -BE that is considered illegal according to Belgian legislation -IT GdF Stop Page -CY nba.com.cy/Eas/eas.nsf/All/6F7F17A7790A55C8C2257B130055C86F -DK lagt at blokere for adgang til siden. -FR xtpage = "page-blocage-terrorisme" -MY Makluman/Notification -KR http://warning.or.kr -RO Accesul dumneavoastră către acest site a fost restricționat -GF xtpage = "page-blocage-terrorisme" -\. - -COPY fingerprint (origin_cc, header, header_prefix) FROM STDIN; -ID Location http://internet-positif.org -UK Location http://blocked.nb.sky.com -RU Location http://warning.rt.ru -SA Server Protected by WireFilter -AE Server Protected by WireFilter -\. - -COPY fingerprint (origin_cc, header, header_value) FROM STDIN; -SD Location http://196.1.211.6:8080/alert/ -QA Location http://www.vodafone.qa/alu.cfm -KR Location http://www.warning.or.kr -PT Location http://mobilegen.vodafone.pt/denied/dn -NO Location http://block-no.altibox.net/ -\. 
- -COMMIT; diff --git a/af/oometa/003-fingerprints.rollback.sql b/af/oometa/003-fingerprints.rollback.sql deleted file mode 100644 index 39326cda..00000000 --- a/af/oometa/003-fingerprints.rollback.sql +++ /dev/null @@ -1,9 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '003-fingerprints' ); - -drop table fingerprint; -drop table measurement_exc; -drop table http_request_fp; - -COMMIT; diff --git a/af/oometa/003-msm-index.install.sql b/af/oometa/003-msm-index.install.sql deleted file mode 100644 index f0b53267..00000000 --- a/af/oometa/003-msm-index.install.sql +++ /dev/null @@ -1,24 +0,0 @@ -BEGIN; - -select _v.register_patch( '003-msm-index', ARRAY[ '002-features' ], NULL ); - --- Everything goes to `public` schema. - --- These indexes are done to ease DELETE of measurements features on partial --- table update. It's questionable of partial update is good idea or not --- (maybe, writing to separate table is actually better), but that's current --- decision that seems to be good enough. It's not unique and it's not primary --- key as most of measurements may have several corresponding feature rows. -create index dns_a_msm_no_idx on dns_a (msm_no); -create index http_control_msm_no_idx on http_control (msm_no); -create index http_request_msm_no_idx on http_request (msm_no); -create index http_verdict_msm_no_idx on http_verdict (msm_no); -create index tcp_msm_no_idx on tcp (msm_no); - --- Fix ancient bugs :) -alter table http_control alter column msm_no set not null; -alter table http_request alter column msm_no set not null; -alter table http_verdict alter column msm_no set not null; -alter table tcp alter column msm_no set not null; - -COMMIT; diff --git a/af/oometa/003-msm-index.rollback.sql b/af/oometa/003-msm-index.rollback.sql deleted file mode 100644 index 6d749808..00000000 --- a/af/oometa/003-msm-index.rollback.sql +++ /dev/null @@ -1,18 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '003-msm-index' ); - --- Bring our bugs back! 
-alter table http_control alter column msm_no drop not null; -alter table http_request alter column msm_no drop not null; -alter table http_verdict alter column msm_no drop not null; -alter table tcp alter column msm_no drop not null; - --- MOAR full-scans! -drop index dns_a_msm_no_idx; -drop index http_control_msm_no_idx; -drop index http_request_msm_no_idx; -drop index http_verdict_msm_no_idx; -drop index tcp_msm_no_idx; - -COMMIT; diff --git a/af/oometa/004-measurements-index.install.sql b/af/oometa/004-measurements-index.install.sql deleted file mode 100644 index 2df7803d..00000000 --- a/af/oometa/004-measurements-index.install.sql +++ /dev/null @@ -1,13 +0,0 @@ -BEGIN; - -select _v.register_patch( '004-measurements-index', ARRAY[ '003-msm-index' ], NULL ); - --- Everything goes to `public` schema. - --- Almost blind copy-paste from https://github.com/TheTorProject/ooni-measurements/pull/22#issue-236675611 -CREATE INDEX measurement_report_no_idx ON measurement (report_no); -CREATE INDEX measurement_input_no_idx ON measurement (input_no); -CREATE INDEX measurement_id_idx ON measurement (id); -CREATE INDEX report_autoclaved_no_idx ON report (autoclaved_no); - -COMMIT; diff --git a/af/oometa/004-measurements-index.rollback.sql b/af/oometa/004-measurements-index.rollback.sql deleted file mode 100644 index 486215dc..00000000 --- a/af/oometa/004-measurements-index.rollback.sql +++ /dev/null @@ -1,10 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '004-measurements-index' ); - -DROP INDEX measurement_report_no_idx; -DROP INDEX measurement_input_no_idx; -DROP INDEX measurement_id_idx; -DROP INDEX report_autoclaved_no_idx; - -COMMIT; diff --git a/af/oometa/005-anomaly-flags.install.sql b/af/oometa/005-anomaly-flags.install.sql deleted file mode 100644 index 563ea828..00000000 --- a/af/oometa/005-anomaly-flags.install.sql +++ /dev/null @@ -1,32 +0,0 @@ -BEGIN; - -SELECT _v.register_patch( '005-anomaly-flags', ARRAY[ '004-measurements-index' ], NULL ); - --- Everything 
goes to `public` schema. - -ALTER TABLE measurement - ADD COLUMN msm_failure boolean NULL, - ADD COLUMN anomaly boolean NULL, - ADD COLUMN confirmed boolean NULL -; - -CREATE TABLE label ( - msm_no integer NOT NULL, - msm_failure boolean NULL, - anomaly boolean NULL, - confirmed boolean NULL -); - -COMMENT ON TABLE label IS 'Manual markup overriding corresponding flags in measurement table'; - --- API declares following flags: --- --- failure = COALESCE(label.msm_failure, measurement.msm_failure, false) OR measurement.exc IS NOT NULL OR measurement.residual_no IS NOT NULL --- anomaly = COALESCE(label.anomaly, measurement.anomaly, false) --- confirmed = COALESCE(label.confirmed, measurement.confirmed, false) - --- Following updates are not part of pipeline, these are just sample data. -UPDATE measurement SET anomaly = TRUE, confirmed = TRUE WHERE msm_no IN (SELECT msm_no FROM http_request_fp); -UPDATE measurement SET anomaly = TRUE WHERE msm_no IN (SELECT msm_no FROM http_verdict WHERE NOT body_length_match OR NOT headers_match OR NOT status_code_match); - -COMMIT; diff --git a/af/oometa/005-anomaly-flags.rollback.sql b/af/oometa/005-anomaly-flags.rollback.sql deleted file mode 100644 index 9f486673..00000000 --- a/af/oometa/005-anomaly-flags.rollback.sql +++ /dev/null @@ -1,8 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '005-anomaly-flags' ); - -ALTER TABLE measurement DROP COLUMN confirmed, DROP COLUMN anomaly, DROP COLUMN msm_failure; -DROP TABLE label; - -COMMIT; diff --git a/af/oometa/005-badrow.install.sql b/af/oometa/005-badrow.install.sql deleted file mode 100644 index e68e01a6..00000000 --- a/af/oometa/005-badrow.install.sql +++ /dev/null @@ -1,11 +0,0 @@ -BEGIN; - -select _v.register_patch( '005-badrow', ARRAY[ '004-measurements-index' ], NULL ); - --- Everything goes to `public` schema. 
-CREATE TABLE badrow ( - tbl text NOT NULL, - code_ver integer NOT NULL, - datum bytea NOT NULL); - -COMMIT; diff --git a/af/oometa/005-badrow.rollback.sql b/af/oometa/005-badrow.rollback.sql deleted file mode 100644 index 622c589f..00000000 --- a/af/oometa/005-badrow.rollback.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '005-badrow' ); - -DROP TABLE badrow; - -COMMIT; diff --git a/af/oometa/005-repeated-report.install.sql b/af/oometa/005-repeated-report.install.sql deleted file mode 100644 index c4f3631e..00000000 --- a/af/oometa/005-repeated-report.install.sql +++ /dev/null @@ -1,45 +0,0 @@ -BEGIN; - -select _v.register_patch( '005-repeated-report', ARRAY[ '004-measurements-index' ], NULL ); - -create domain sha512 as bytea check (octet_length(value) = 64); - -create sequence dupgrp_no_seq; - --- Everything goes to `public` schema. -CREATE TABLE repeated_report ( - -- group_no < 0 is indication of manual data entry - dupgrp_no integer NOT NULL, - used boolean NOT NULL, - textname text UNIQUE NOT NULL, - -- NB: `filename` is part of "private" information, it's not stored intentionally - orig_sha1 sha1 NOT NULL, - orig_sha512 sha512 NOT NULL); - --- These two files differ, but the first one includes EVERY `datum` from the second one. --- Also, measurements are _reordered_ within the file. It looks like strange in-place editing. 
--- You can compare output of following commands: --- $ zcat autoclaved/2016-02-11/index.json.gz | awk '/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml/ {a = 1} (a == 1 && /"datum"/) {print} /"\/report"/ {a = 0}' | jq .orig_sha1 | sort --- Corresponding `canned` entries are: --- {"text_crc32": 413542756, "text_sha1": "xVNkUEwQmauT/F6uO1X6ICvCRN0=", "text_size": 25559440, "textname": "2016-02-11/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml"} --- {"text_crc32": -1449913125, "text_sha1": "BI75eyhWrwX/BdVOzLA8GIu/Pxg=", "text_size": 15786443, "textname": "2016-02-23/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml"} -COPY repeated_report (dupgrp_no, used, orig_sha1, orig_sha512, textname) FROM STDIN; --1 true \\xc55364504c1099ab93fc5eae3b55fa202bc244dd \\x18f8c9cca8b76c53063acce30ec328bd54cff5fd399cfef4ab0666aece0b5f8681e6fab6ca73113ace4f0ff10389451f3ccd737d69a23db258eea73cccdc9798 2016-02-11/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml --1 false \\x048ef97b2856af05ff05d54eccb03c188bbf3f18 \\xb042c07fab30a8d855c1b60017b10235f0fabe76e75da25422d66c004fa4b6d8e23ebb48c5cc8ac31b0bf93e9140f3ec95d9702fff8e5744973959b0ec4cd237 2016-02-23/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml -\. 
- -CREATE FUNCTION ooconstraint_repeated_report() RETURNS VOID STABLE AS $$ -BEGIN - PERFORM 1 FROM repeated_report LEFT JOIN report USING (textname) WHERE NOT used AND report.textname IS NOT NULL; - IF FOUND THEN - RAISE EXCEPTION 'Unused repeated_report in `report` table'; - END IF; - - PERFORM 1 FROM repeated_report LEFT JOIN report USING (textname) WHERE used AND report.textname IS NULL; - IF FOUND THEN - RAISE EXCEPTION 'Used repeated_report not in `report` table'; - END IF; -END; -$$ LANGUAGE plpgsql; - -COMMIT; diff --git a/af/oometa/005-repeated-report.rollback.sql b/af/oometa/005-repeated-report.rollback.sql deleted file mode 100644 index 2c5fe44b..00000000 --- a/af/oometa/005-repeated-report.rollback.sql +++ /dev/null @@ -1,10 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '005-repeated-report' ); - -drop function ooconstraint_repeated_report(); -drop table repeated_report; -drop domain sha512; -drop sequence dupgrp_no_seq; - -COMMIT; diff --git a/af/oometa/006-reingestion.install.sql b/af/oometa/006-reingestion.install.sql deleted file mode 100644 index e3bef12b..00000000 --- a/af/oometa/006-reingestion.install.sql +++ /dev/null @@ -1,32 +0,0 @@ -BEGIN; - -select _v.register_patch( '006-reingestion', ARRAY[ '005-repeated-report' ], NULL ); - -ALTER SEQUENCE autoclaved_autoclaved_no_seq RENAME TO autoclaved_no_seq; - -DROP TABLE report_meta; -DROP TABLE report_blob; -DROP TABLE measurement_meta; -DROP TABLE measurement_blob; -DROP TABLE measurement_exc; -DROP TABLE badmeta_tpl; -DROP TABLE dns_a_tpl; - --- `badblob` is a nice table, but we can't properly ingest it right now and --- it's something that does not actually exist after `autoclaved` stage, these --- bad blobs are lost during autoclaving, only canning preserves them. -DROP TABLE badblob; - -ALTER TABLE badmeta DROP COLUMN textname; - --- re-ingest `vanilla_tor` --- 1. bump global CODE_VER --- 2. 
prevent reprocessing of other files -UPDATE autoclaved SET code_ver = 4 -WHERE code_ver = 3 AND NOT EXISTS ( - SELECT 1 FROM report - WHERE autoclaved_no = autoclaved.autoclaved_no - AND test_name = 'vanilla_tor' -); - -COMMIT; diff --git a/af/oometa/006-repeated-report.install.sql b/af/oometa/006-repeated-report.install.sql deleted file mode 100644 index 4fb5dcf2..00000000 --- a/af/oometa/006-repeated-report.install.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.register_patch( '006-repeated-report', ARRAY[ '005-repeated-report' ], NULL ); - -comment on table repeated_report is 'Duplicate report files from private/canned'; - -COMMIT; diff --git a/af/oometa/006-repeated-report.rollback.sql b/af/oometa/006-repeated-report.rollback.sql deleted file mode 100644 index 48582efc..00000000 --- a/af/oometa/006-repeated-report.rollback.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '006-repeated-report' ); - -comment on table repeated_report is null; - -COMMIT; diff --git a/af/oometa/006-vanilla-tor.install.sql b/af/oometa/006-vanilla-tor.install.sql deleted file mode 100644 index e1b8bc9d..00000000 --- a/af/oometa/006-vanilla-tor.install.sql +++ /dev/null @@ -1,21 +0,0 @@ -BEGIN; - -select _v.register_patch( '006-vanilla-tor', ARRAY[ '005-repeated-report' ], NULL ); - --- Everything goes to `public` schema. 
- -CREATE TABLE vanilla_tor ( - msm_no int4 not null, -- references measurement, - timeout int4 not null, - error text, - tor_progress int2 not null, - success bool not null, - tor_progress_tag text, - tor_progress_summary text, - tor_version text not null, - tor_log jsonb -); - -COMMENT ON TABLE vanilla_tor IS 'Features: data from `vanilla_tor` measurements'; - -COMMIT; diff --git a/af/oometa/006-vanilla-tor.rollback.sql b/af/oometa/006-vanilla-tor.rollback.sql deleted file mode 100644 index 49b634f0..00000000 --- a/af/oometa/006-vanilla-tor.rollback.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '006-vanilla-tor' ); - -DROP TABLE vanilla_tor; - -COMMIT; diff --git a/af/oometa/007-test-name-hotfix.install.sql b/af/oometa/007-test-name-hotfix.install.sql deleted file mode 100644 index 09920fbd..00000000 --- a/af/oometa/007-test-name-hotfix.install.sql +++ /dev/null @@ -1,22 +0,0 @@ --- ALTER TYPE ... ADD cannot run inside a transaction block -ALTER TYPE ootest ADD VALUE IF NOT EXISTS 'dash'; -ALTER TYPE ootest ADD VALUE IF NOT EXISTS 'telegram'; - -BEGIN; - -select _v.register_patch( '007-test-name-hotfix', ARRAY[ '006-reingestion' ], NULL ); - --- That's temporary backward-compatible hotfix till API is updated to use --- relevant tables and relevant indexes are crated as a part of relevant process. 
- -UPDATE report rpt -SET test_name = 'telegram' -FROM software sw -WHERE rpt.software_no = sw.software_no AND sw.test_name = 'telegram'; - -UPDATE report rpt -SET test_name = 'dash' -FROM software sw -WHERE rpt.software_no = sw.software_no AND sw.test_name = 'dash'; - -COMMIT; diff --git a/af/oometa/008-measurements-index.install.sql b/af/oometa/008-measurements-index.install.sql deleted file mode 100644 index c07ed732..00000000 --- a/af/oometa/008-measurements-index.install.sql +++ /dev/null @@ -1,12 +0,0 @@ -BEGIN; - -select _v.register_patch( '008-measurements-index', ARRAY[ '007-test-name-hotfix' ], NULL ); - -CREATE INDEX measurement_measurement_start_time_idx ON measurement (measurement_start_time); - -CREATE EXTENSION pg_trgm; - -CREATE INDEX input_input_trgm_idx ON input USING GIST (input gist_trgm_ops); -CREATE INDEX report_test_start_time_idx ON report (test_start_time); - -COMMIT; diff --git a/af/oometa/009-simhash.install.sql b/af/oometa/009-simhash.install.sql deleted file mode 100644 index 79bd5363..00000000 --- a/af/oometa/009-simhash.install.sql +++ /dev/null @@ -1,22 +0,0 @@ -BEGIN; - -select _v.register_patch( '009-simhash', ARRAY[ '008-measurements-index' ], NULL ); - --- This table is empty, "inline" computation turned out to be as efficient as --- more complex scheme with caching of body_sha256 values that tried to avoid --- duplicate simhash computation. Moreover, inline storage is more efficient as --- it has no row overhead. 
- -DROP TABLE http_body_simhash; - -ALTER TABLE http_control - ADD COLUMN body_simhash bigint NULL, - ADD COLUMN body_text_simhash bigint NULL -; - -ALTER TABLE http_request - ADD COLUMN body_simhash bigint NULL, - ADD COLUMN body_text_simhash bigint NULL -; - -COMMIT; diff --git a/af/oometa/010-badblob.install.sql b/af/oometa/010-badblob.install.sql deleted file mode 100644 index 4ade7b23..00000000 --- a/af/oometa/010-badblob.install.sql +++ /dev/null @@ -1,30 +0,0 @@ -BEGIN; - -select _v.register_patch( '010-badblob', ARRAY[ '009-simhash' ], NULL ); - --- Original reason for `badblob` table was to preserve a `msm_no` for possible --- re-generation of measurements later. It's not longer relevant as OOID is --- going to be stamped on historical measurements: --- https://github.com/TheTorProject/ooni-pipeline/blob/master/docs/ooid-hash-prob.ipynb -create table badblob ( - -- Imagine a report that has no measurements! E.g. a half-empty broken - -- badblob as a report. Alike report will unlikely have a report_no. - -- So `filename` and `textname` are stored as canned files have no STRICT - -- mapping to `autoclaved_no` and `report_no`, although names of compressed - -- and uncompressed files usually match 1:1. Also, cardinality of the table - -- is going to be low, so storage optimisations do not matter. 
- filename text not null, - textname text not null, - canned_off size4 not null, - canned_size size4 not null, - bucket_date date not null, - orig_sha1 sha1 not null, - exc_str text not null, - PRIMARY KEY (filename, textname, canned_off) -); - -create index badblob_bucket_date_idx on badblob (bucket_date); - -comment on table badblob is 'Debug: accounting exceptions during (canned -> autoclaved) process'; - -COMMIT; diff --git a/af/oometa/011-cleanup.install.sql b/af/oometa/011-cleanup.install.sql deleted file mode 100644 index 3de0e937..00000000 --- a/af/oometa/011-cleanup.install.sql +++ /dev/null @@ -1,16 +0,0 @@ -BEGIN; - -SELECT _v.register_patch( '011-cleanup', ARRAY[ '010-badblob' ], NULL ); - -COMMENT ON TABLE badrow IS 'Debug: accounting exceptions while feeding DB with COPY FROM STDIN'; - --- `label` is a nice table, but we don't fill it right now and it creates --- useless complexity without any extra value. Moreover, the hack with manual --- labeling is no longer useful at all as OONI Pipeline can do partial --- re-ingestions as the indicators-of-censorship are updated in the pipeline code. -DROP TABLE label; - -ALTER TABLE report ALTER COLUMN autoclaved_no SET NOT NULL; -ALTER TABLE measurement ALTER COLUMN report_no SET NOT NULL; - -COMMIT; diff --git a/af/oometa/012-sha256-input-uniq.install.sql b/af/oometa/012-sha256-input-uniq.install.sql deleted file mode 100644 index 403ac384..00000000 --- a/af/oometa/012-sha256-input-uniq.install.sql +++ /dev/null @@ -1,12 +0,0 @@ -BEGIN; - -select _v.register_patch( '012-sha256-input-uniq', ARRAY[ '011-cleanup' ], NULL ); - --- ERROR: index row size 6512 exceeds maximum 2712 for index "input_input_key" --- See https://github.com/ooni/pipeline/issues/139 for details. 
- --- pgcrypto is already loaded -alter table input drop constraint input_input_key; -create unique index input_input_sha256_key on input (digest(input::text, 'sha256')); - -COMMIT; diff --git a/af/oometa/013-ooexpl-tables.install.sql b/af/oometa/013-ooexpl-tables.install.sql deleted file mode 100644 index 021bac06..00000000 --- a/af/oometa/013-ooexpl-tables.install.sql +++ /dev/null @@ -1,27 +0,0 @@ -BEGIN; - -select _v.register_patch( '013-ooexpl-tables', ARRAY[ '012-sha256-input-uniq' ], NULL ); - -CREATE TABLE ooexpl_bucket_msm_count ( - "count" integer, - "probe_asn" integer, - "probe_cc" character(2), - "bucket_date" date, - CONSTRAINT ooexpl_bucket_msm_count_pkey PRIMARY KEY (probe_asn, probe_cc, bucket_date) -); - -comment on table ooexpl_bucket_msm_count is 'OONI Explorer stats table for counting the total number of measurements since the beginning of time by probe_cc and probe_asn'; - -CREATE TABLE ooexpl_daily_msm_count ( - "count" integer, - "probe_cc" character(2), - "probe_asn" integer, - "test_name" ootest, - "test_day" date, - "bucket_date" date, - CONSTRAINT ooexpl_daily_msm_count_pkey PRIMARY KEY (probe_asn, probe_cc, test_name, test_day, bucket_date) -); - -comment on table ooexpl_daily_msm_count is 'OONI Explorer stats table for counting measurements by probe_cc, probe_asn from the past 30 days'; - -COMMIT; diff --git a/af/oometa/013-ooexpl-tables.rollback.sql b/af/oometa/013-ooexpl-tables.rollback.sql deleted file mode 100644 index 93ccfd48..00000000 --- a/af/oometa/013-ooexpl-tables.rollback.sql +++ /dev/null @@ -1,8 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '013-ooexpl-tables'); - -DROP TABLE ooexpl_bucket_msm_count; -DROP TABLE ooexpl_daily_msm_count; - -COMMIT; diff --git a/af/oometa/014-ooexpl-insert.install.sql b/af/oometa/014-ooexpl-insert.install.sql deleted file mode 100644 index 33f31dc8..00000000 --- a/af/oometa/014-ooexpl-insert.install.sql +++ /dev/null @@ -1,30 +0,0 @@ -BEGIN; - -select _v.register_patch( '014-ooexpl-insert', 
ARRAY[ '013-ooexpl-tables' ], NULL ); - -INSERT INTO ooexpl_bucket_msm_count ("count", "probe_asn", "probe_cc", "bucket_date") -SELECT -COUNT(msm_no) as "count", -probe_asn, -probe_cc, -bucket_date -FROM measurement -JOIN report ON report.report_no = measurement.report_no -JOIN autoclaved ON autoclaved.autoclaved_no = report.autoclaved_no -GROUP BY 2,3,4; - -INSERT INTO ooexpl_daily_msm_count ("count", "probe_cc", "probe_asn", "test_name", "test_day", "bucket_date") -SELECT -COUNT(msm_no) as "count", -probe_cc, -probe_asn, -test_name, -date_trunc('day', measurement_start_time) as test_day, -bucket_date -FROM measurement -JOIN report ON report.report_no = measurement.report_no -JOIN autoclaved ON autoclaved.autoclaved_no = report.autoclaved_no -WHERE measurement_start_time > current_date - interval '31 day' AND test_name IS NOT NULL -GROUP BY 2,3,4,5,6; - -COMMIT; diff --git a/af/oometa/014-ooexpl-insert.rollback.sql b/af/oometa/014-ooexpl-insert.rollback.sql deleted file mode 100644 index 3ef399c7..00000000 --- a/af/oometa/014-ooexpl-insert.rollback.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '014-ooexpl-insert'); - -TRUNCATE ooexpl_bucket_msm_count, ooexpl_daily_msm_count; - -COMMIT; diff --git a/af/oometa/015-fingerprint-fix.install.sql b/af/oometa/015-fingerprint-fix.install.sql deleted file mode 100644 index 7cee1689..00000000 --- a/af/oometa/015-fingerprint-fix.install.sql +++ /dev/null @@ -1,8 +0,0 @@ -BEGIN; - -select _v.register_patch( '015-fingerprint-fix', ARRAY[ '014-ooexpl-insert' ], NULL ); - -UPDATE fingerprint -SET "origin_cc"='GB' WHERE "origin_cc"='UK'; - -COMMIT; diff --git a/af/oometa/015-fingerprint-fix.rollback.sql b/af/oometa/015-fingerprint-fix.rollback.sql deleted file mode 100644 index a5e50e3f..00000000 --- a/af/oometa/015-fingerprint-fix.rollback.sql +++ /dev/null @@ -1,8 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '015-fingerprint-fix'); - -UPDATE fingerprint -SET "origin_cc"='UK' WHERE "origin_cc"='GB'; - 
-COMMIT; diff --git a/af/oometa/016-ooexpl_wc_confirmed.install.sql b/af/oometa/016-ooexpl_wc_confirmed.install.sql deleted file mode 100644 index 3a752966..00000000 --- a/af/oometa/016-ooexpl_wc_confirmed.install.sql +++ /dev/null @@ -1,48 +0,0 @@ -BEGIN; - -select _v.register_patch( '016-ooexpl_wc_confirmed', ARRAY[ '015-fingerprint-fix' ], NULL ); - -/* Store precomputed `confirmed` count and `msm_count` -from the web_connectivity table. Used by OONI Explorer. */ - -/* -INSERT INTO ooexpl_wc_confirmed -SELECT -COALESCE(SUM(CASE WHEN confirmed = TRUE THEN 1 ELSE 0 END), 0) as confirmed_count, -COUNT(*) as msm_count, -test_day, -bucket_date, -probe_cc, -probe_asn -FROM ( - SELECT - DISTINCT input as input, - date_trunc('day', test_start_time) as test_day, - probe_cc, - probe_asn, - bucket_date, - bool_or(confirmed) as confirmed - FROM measurement - JOIN input ON input.input_no = measurement.input_no - JOIN report ON report.report_no = measurement.report_no - JOIN autoclaved ON autoclaved.autoclaved_no = report.autoclaved_no - WHERE test_start_time < current_date - interval '1 day' - AND test_start_time > current_date - interval '31 day' - AND test_name = 'web_connectivity' - GROUP BY input, test_start_time, probe_cc, probe_asn, bucket_date -) as wc -GROUP BY test_day, probe_cc, probe_asn, bucket_date; -*/ - -CREATE TABLE ooexpl_wc_confirmed ( - confirmed_count BIGINT NOT NULL, - msm_count BIGINT NOT NULL, - test_day TIMESTAMP NOT NULL, - bucket_date DATE, - probe_cc CHARACTER(2) NOT NULL, - probe_asn INTEGER NOT NULL, - CONSTRAINT unique_day_bucket_cc_asn UNIQUE (test_day, bucket_date, probe_cc, probe_asn) - ) ; - - -COMMIT; diff --git a/af/oometa/016-ooexpl_wc_confirmed.rollback.sql b/af/oometa/016-ooexpl_wc_confirmed.rollback.sql deleted file mode 100644 index 0006731b..00000000 --- a/af/oometa/016-ooexpl_wc_confirmed.rollback.sql +++ /dev/null @@ -1,7 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '016-ooexpl_wc_confirmed'); - -DROP TABLE ooexpl_wc_confirmed; - 
-COMMIT; diff --git a/af/oometa/017-ooexpl_wc_input_counts.install.sql b/af/oometa/017-ooexpl_wc_input_counts.install.sql deleted file mode 100644 index 1cfcf437..00000000 --- a/af/oometa/017-ooexpl_wc_input_counts.install.sql +++ /dev/null @@ -1,44 +0,0 @@ -BEGIN; - -select _v.register_patch( '017-ooexpl_wc_input_counts', ARRAY[ '016-ooexpl_wc_confirmed' ], NULL ); - -/* -INSERT INTO ooexpl_wc_input_counts -(test_day, anomaly_count, confirmed_count, failure_count, total_count, input, bucket_date, probe_cc, probe_asn) -SELECT -date_trunc('day', test_start_time) as test_day, -COALESCE(sum(CASE WHEN anomaly = TRUE AND confirmed = FALSE AND msm_failure = FALSE THEN 1 ELSE 0 END), 0) AS anomaly_count, -COALESCE(sum(CASE WHEN confirmed = TRUE THEN 1 ELSE 0 END), 0) AS confirmed_count, -COALESCE(sum(CASE WHEN msm_failure = TRUE THEN 1 ELSE 0 END), 0) AS failure_count, COUNT(*) as total_count, -input, -bucket_date, -probe_cc, -probe_asn -FROM measurement -JOIN input ON input.input_no = measurement.input_no -JOIN report ON report.report_no = measurement.report_no -JOIN autoclaved ON report.autoclaved_no = autoclaved.autoclaved_no -WHERE test_start_time >= current_date - interval '31 day' -AND test_start_time < current_date -AND test_name = 'web_connectivity' -GROUP BY test_day, input, bucket_date, probe_cc, probe_asn; -*/ - -CREATE TABLE ooexpl_wc_input_counts ( - input TEXT, - confirmed_count BIGINT NOT NULL, - anomaly_count BIGINT NOT NULL, - failure_count BIGINT NOT NULL, - total_count BIGINT NOT NULL, - test_day TIMESTAMP NOT NULL, - bucket_date DATE, - probe_cc CHARACTER(2) NOT NULL, - probe_asn INTEGER NOT NULL, - CONSTRAINT ooexpl_wc_input_unique_day_bucket_cc_asn_input UNIQUE (test_day, bucket_date, probe_cc, probe_asn, input) -); - -CREATE INDEX "ooexpl_wc_input_counts_probe_cc_idx" ON "ooexpl_wc_input_counts"("probe_cc"); -CREATE INDEX "ooexpl_wc_input_counts_probe_asn_idx" ON "ooexpl_wc_input_counts"("probe_asn"); -CREATE INDEX "ooexpl_wc_input_counts_input_idx" 
ON "ooexpl_wc_input_counts"("input"); - -COMMIT; diff --git a/af/oometa/017-ooexpl_wc_input_counts.rollback.sql b/af/oometa/017-ooexpl_wc_input_counts.rollback.sql deleted file mode 100644 index 433db9e3..00000000 --- a/af/oometa/017-ooexpl_wc_input_counts.rollback.sql +++ /dev/null @@ -1,11 +0,0 @@ -BEGIN; - -select _v.unregister_patch( '017-ooexpl_wc_input_counts'); - -DROP TABLE ooexpl_wc_input_counts; - -DROP INDEX "ooexpl_wc_input_counts_probe_cc_idx"; -DROP INDEX "ooexpl_wc_input_counts_probe_asn_idx"; -DROP INDEX "ooexpl_wc_input_counts_input_idx"; - -COMMIT; diff --git a/af/oometa/018-fastpath.install.sql b/af/oometa/018-fastpath.install.sql deleted file mode 100644 index 3fcd8321..00000000 --- a/af/oometa/018-fastpath.install.sql +++ /dev/null @@ -1,64 +0,0 @@ --- Create fastpath tables --- Formatted with pgformatter 3.3 - -BEGIN; - -SELECT - _v.register_patch ('018-fastpath', ARRAY['017-ooexpl_wc_input_counts'], NULL); - -CREATE TABLE fastpath ( - "tid" TEXT PRIMARY KEY, - "report_id" text NOT NULL, - "input" TEXT, - "probe_cc" character (2) NOT NULL, - "probe_asn" integer NOT NULL, - "test_name" ootest, - "test_start_time" timestamp without time zone NOT NULL, - "measurement_start_time" timestamp without time zone, - "platform" text, - "filename" text, -- will be NULL after files are deleted - - "scores" JSON NOT NULL -); - -CREATE INDEX report_id_idx ON fastpath (report_id); - -CREATE INDEX input_idx ON fastpath (input); - -CREATE INDEX measurement_start_time_idx ON fastpath (measurement_start_time); - -COMMENT ON TABLE fastpath IS 'Measurements created by fastpath'; - -COMMENT ON COLUMN fastpath.tid IS 'Trivial ID'; - -COMMENT ON COLUMN fastpath.filename IS 'File served by the fastpath host containing the raw measurement'; - -COMMENT ON COLUMN fastpath.scores IS 'Scoring metadata'; - --- TODO add these to ansible role --- Skip grants during tests on Travis CI -DO $$ -BEGIN - IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'amsapi') THEN - GRANT 
SELECT ON fastpath TO amsapi; - END IF; -END -$$; - -DO $$ -BEGIN - IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'reader') THEN - GRANT SELECT ON tasks TO reader; - END IF; -END -$$; - -DO $$ -BEGIN - IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'readonly') THEN - GRANT SELECT ON fastpath TO readonly; - END IF; -END -$$; - -COMMIT; diff --git a/af/oometa/018-fastpath.rollback.sql b/af/oometa/018-fastpath.rollback.sql deleted file mode 100644 index ce3c16a4..00000000 --- a/af/oometa/018-fastpath.rollback.sql +++ /dev/null @@ -1,6 +0,0 @@ -BEGIN; -SELECT - _v.unregister_patch ('018-fastpath'); -DROP TABLE fastpath; -COMMIT; - diff --git a/af/oometa/019-domain-and-citizenlab-table.install.sql b/af/oometa/019-domain-and-citizenlab-table.install.sql deleted file mode 100644 index 148e87c5..00000000 --- a/af/oometa/019-domain-and-citizenlab-table.install.sql +++ /dev/null @@ -1,41 +0,0 @@ --- Create domain_input and citizenlab table --- Formatted with pgformatter 3.3 - -BEGIN; - -SELECT - _v.register_patch ('019-domain-and-citizenlab-table', - ARRAY['018-fastpath'], - NULL); - -CREATE TABLE domain_input ( - "domain" TEXT NOT NULL, - "input" TEXT NOT NULL, - "input_no" integer -); - -CREATE INDEX domain_input_domain_idx ON domain_input (domain); - -CREATE UNIQUE INDEX domain_input_input_sha256_key on domain_input (digest(input::text, 'sha256')); - -CREATE INDEX domain_input_input_no_idx ON domain_input (input_no); - -COMMENT ON COLUMN domain_input.domain IS 'FQDN or ipaddr without http and port number'; - -CREATE TABLE citizenlab ( - "domain" TEXT NOT NULL, - "url" TEXT NOT NULL, - "cc" CHARACTER(2), - "category_code" TEXT, - "priority" SMALLINT DEFAULT 256 -); - --- indexing is more efficient on the leftmost columns -CREATE INDEX citizenlab_multi_idx ON citizenlab (url, domain, category_code, cc); - -COMMENT ON COLUMN citizenlab.domain IS 'FQDN or ipaddr without http and port number'; - -COMMENT ON COLUMN citizenlab.category_code IS 'Category from Citizen Lab'; 
- -COMMIT; - diff --git a/af/oometa/019-domain-and-citizenlab-table.rollback.sql b/af/oometa/019-domain-and-citizenlab-table.rollback.sql deleted file mode 100644 index f1f7e277..00000000 --- a/af/oometa/019-domain-and-citizenlab-table.rollback.sql +++ /dev/null @@ -1,8 +0,0 @@ --- Drop domain_input and citizenlab tables --- Formatted with pgformatter 3.3 -BEGIN; -SELECT - _v.unregister_patch ('019-domain-and-citizenlab-table') - DROP TABLE domain_input; -DROP TABLE citizenlab; -COMMIT; diff --git a/af/oometa/019-fastpath-confirmed.install.sql b/af/oometa/019-fastpath-confirmed.install.sql deleted file mode 100644 index d9871a2d..00000000 --- a/af/oometa/019-fastpath-confirmed.install.sql +++ /dev/null @@ -1,24 +0,0 @@ --- Add confirmed and anomaly columns to fastpath table --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.register_patch ('019-fastpath-confirmed', - ARRAY['018-fastpath'], - NULL); -ALTER TABLE fastpath - ADD COLUMN anomaly boolean, - ADD COLUMN confirmed boolean, - ADD COLUMN msm_failure boolean; - --- Switch to BRIN index for measurement_start_time -DROP INDEX measurement_start_time_idx; -CREATE INDEX fastpath_measurement_start_time_idx ON fastpath -USING BRIN (measurement_start_time) WITH (pages_per_range = 128); - --- Rename indexes -ALTER INDEX input_idx RENAME TO fastpath_input_idx; -ALTER INDEX report_id_idx RENAME TO fastpath_report_id_idx; - -COMMIT; - diff --git a/af/oometa/019-fastpath-confirmed.rollback.sql b/af/oometa/019-fastpath-confirmed.rollback.sql deleted file mode 100644 index 52555c1e..00000000 --- a/af/oometa/019-fastpath-confirmed.rollback.sql +++ /dev/null @@ -1,17 +0,0 @@ --- Remove confirmed and anomaly columns to fastpath table --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.unregister_patch ('019-fastpath-confirmed'); -ALTER TABLE fastpath - DROP COLUMN anomaly, - DROP COLUMN confirmed, - DROP COLUMN msm_failure; - -DROP INDEX fastpath_measurement_start_time_idx; -CREATE INDEX measurement_start_time_idx 
ON fastpath (measurement_start_time); -ALTER INDEX fastpath_input_idx RENAME TO input_idx; -ALTER INDEX fastpath_report_id_idx RENAME TO report_id_idx; -COMMIT; - diff --git a/af/oometa/020-new-test-names.install.sql b/af/oometa/020-new-test-names.install.sql deleted file mode 100644 index ef77d85b..00000000 --- a/af/oometa/020-new-test-names.install.sql +++ /dev/null @@ -1,14 +0,0 @@ --- Create new test names --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.register_patch ('020-new-test-names', - ARRAY['019-domain-and-citizenlab-table'], - NULL); -COMMIT; - --- This needs to be done outside of a transaction :-/ -ALTER TYPE ootest ADD VALUE 'psiphon'; -ALTER TYPE ootest ADD VALUE 'tor'; - diff --git a/af/oometa/020-new-test-names.rollback.sql b/af/oometa/020-new-test-names.rollback.sql deleted file mode 100644 index dc079a66..00000000 --- a/af/oometa/020-new-test-names.rollback.sql +++ /dev/null @@ -1,35 +0,0 @@ --- Delete new test names --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.unregister_patch ('020-new-test-names'); --- rename type, add old one and switch the colums --- This all work in a transaction, contrarily --- to the addition of new values -ALTER TYPE ootest RENAME TO ootest_old; -CREATE TYPE ootest AS enum ( - 'web_connectivity', - 'http_requests', - 'dns_consistency', - 'http_invalid_request_line', - 'bridge_reachability', - 'tcp_connect', - 'http_header_field_manipulation', - 'http_host', - 'multi_protocol_traceroute', - 'meek_fronted_requests_test', - 'whatsapp', - 'vanilla_tor', - 'facebook_messenger', - 'ndt' -); -ALTER TABLE report - ALTER COLUMN test_name TYPE ootest - USING test_name::text::ootest; -ALTER TABLE fastpath - ALTER COLUMN test_name TYPE ootest - USING test_name::text::ootest; -DROP TYPE ootest_old; -COMMIT; - diff --git a/af/oometa/021-add-report-index.install.sql b/af/oometa/021-add-report-index.install.sql deleted file mode 100644 index 91845970..00000000 --- a/af/oometa/021-add-report-index.install.sql +++ 
/dev/null @@ -1,9 +0,0 @@ --- Add index to report table ooni/backend#327 -BEGIN; -SELECT - _v.register_patch ('021-add-report-index', - ARRAY['020-new-test-names'], - NULL); -CREATE INDEX report_probe_asn_idx ON report (probe_asn); -CREATE INDEX report_probe_cc_probe_asn_idx ON report (probe_cc, probe_asn); -COMMIT; diff --git a/af/oometa/021-add-report-index.rollback.sql b/af/oometa/021-add-report-index.rollback.sql deleted file mode 100644 index 3b2253b5..00000000 --- a/af/oometa/021-add-report-index.rollback.sql +++ /dev/null @@ -1,9 +0,0 @@ --- Remove report_probe_asn_idx and report_probe_cc_probe_asn_idx --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.unregister_patch ('021-add-report-index'); -DROP INDEX report_probe_asn_idx; -DROP INDEX report_probe_cc_probe_asn_idx; -COMMIT; diff --git a/af/oometa/021-create-counters-table.install.sql b/af/oometa/021-create-counters-table.install.sql deleted file mode 100644 index eb6081c1..00000000 --- a/af/oometa/021-create-counters-table.install.sql +++ /dev/null @@ -1,37 +0,0 @@ --- Create counters table --- See analysis/counters_table.adoc --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.register_patch ('021-counters-table', - ARRAY['020-new-test-names'], - NULL); - -CREATE UNLOGGED TABLE counters ( - "measurement_start_day" DATE, - "test_name" TEXT, - "probe_cc" CHARACTER (2) NOT NULL, - "probe_asn" INTEGER NOT NULL, - "input" TEXT, - "anomaly_count" INTEGER, - "confirmed_count" INTEGER, - "failure_count" INTEGER, - "measurement_count" INTEGER -); - -CREATE INDEX counters_brin_multi_idx ON counters - USING BRIN ( - measurement_start_day, - test_name, - probe_cc, - probe_asn, - input, - anomaly_count, - confirmed_count, - failure_count, - measurement_count - ) - WITH (pages_per_range = 32); - -COMMIT; diff --git a/af/oometa/021-create-counters-table.rollback.sql b/af/oometa/021-create-counters-table.rollback.sql deleted file mode 100644 index 28d384dd..00000000 --- 
a/af/oometa/021-create-counters-table.rollback.sql +++ /dev/null @@ -1,6 +0,0 @@ -BEGIN; -SELECT - _v.unregister_patch ('021-counters-table'); -DROP TABLE counters; -COMMIT; - diff --git a/af/oometa/022-create-fastpath-btree-idx.install.sql b/af/oometa/022-create-fastpath-btree-idx.install.sql deleted file mode 100644 index dc15190d..00000000 --- a/af/oometa/022-create-fastpath-btree-idx.install.sql +++ /dev/null @@ -1,10 +0,0 @@ --- Create fastpath btree index --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.register_patch ( '022-create-fastpath-btree-idx', ARRAY['021-counters-table'], NULL); - -CREATE INDEX measurement_start_time_btree_idx ON fastpath (measurement_start_time); - -COMMIT; diff --git a/af/oometa/022-create-fastpath-btree-idx.rollback.sql b/af/oometa/022-create-fastpath-btree-idx.rollback.sql deleted file mode 100644 index 0266be31..00000000 --- a/af/oometa/022-create-fastpath-btree-idx.rollback.sql +++ /dev/null @@ -1,5 +0,0 @@ -BEGIN; -SELECT - _v.unregister_patch ('022-create-fastpath-btree-idx'); -DROP INDEX measurement_start_time_btree_idx -COMMIT; diff --git a/af/oometa/023-grant-select-counters-amsapi.install.sql b/af/oometa/023-grant-select-counters-amsapi.install.sql deleted file mode 100644 index c4a06396..00000000 --- a/af/oometa/023-grant-select-counters-amsapi.install.sql +++ /dev/null @@ -1,15 +0,0 @@ --- Grant SELECT on counters to amsapi - -BEGIN; -SELECT - _v.register_patch ( '023-grant-select-counters-amsapi', ARRAY['022-create-fastpath-btree-idx'], NULL); - -DO $$ -BEGIN - IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'amsapi') THEN - GRANT SELECT ON counters TO amsapi; - END IF; -END -$$; - -COMMIT; diff --git a/af/oometa/023-grant-select-counters-amsapi.rollback.sql b/af/oometa/023-grant-select-counters-amsapi.rollback.sql deleted file mode 100644 index 140cefac..00000000 --- a/af/oometa/023-grant-select-counters-amsapi.rollback.sql +++ /dev/null @@ -1,13 +0,0 @@ -BEGIN; -SELECT - _v.unregister_patch 
('023-grant-select-counters-amsapi'); - -DO $$ -BEGIN - IF EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'amsapi') THEN - REVOKE SELECT ON counters FROM amsapi; - END IF; -END -$$; - -COMMIT; diff --git a/af/oometa/024-create-counters-btree-idx.install.sql b/af/oometa/024-create-counters-btree-idx.install.sql deleted file mode 100644 index 8990cf86..00000000 --- a/af/oometa/024-create-counters-btree-idx.install.sql +++ /dev/null @@ -1,10 +0,0 @@ --- Create counters btree index --- Formatted with pgformatter 3.3 - -BEGIN; -SELECT - _v.register_patch ( '024-create-counters-btree-idx', ARRAY['023-grant-select-counters-amsapi'], NULL); - -CREATE INDEX measurement_start_day_btree_idx ON counters (measurement_start_day); - -COMMIT; diff --git a/af/oometa/024-create-counters-btree-idx.rollback.sql b/af/oometa/024-create-counters-btree-idx.rollback.sql deleted file mode 100644 index 5e30773a..00000000 --- a/af/oometa/024-create-counters-btree-idx.rollback.sql +++ /dev/null @@ -1,6 +0,0 @@ - -BEGIN; -SELECT - _v.unregister_patch ('024-create-counters-btree-idx'); -DROP INDEX measurement_start_time_btree_idx -COMMIT; diff --git a/af/oometa/dump-schema b/af/oometa/dump-schema deleted file mode 100755 index 8b81a169..00000000 --- a/af/oometa/dump-schema +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -ex - -id=$(docker run --rm --detach --net=none -e POSTGRES_USER=oopguser postgres:9.6) -until docker exec -i $id psql -U oopguser -c 'select 1' >/dev/null; do - sleep 0.5 -done -if [ $# -eq 0 ]; then - cat *.install.sql -else - cat "$@" -fi | docker exec -i $id psql -U oopguser >/dev/null -docker exec -i $id pg_dump -U oopguser --schema-only -docker stop $id >/dev/null diff --git a/af/oometa/psql b/af/oometa/psql deleted file mode 100755 index 416ffb7f..00000000 --- a/af/oometa/psql +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -exec psql -U oopguser -h $(docker inspect spbmeta | jq --raw-output '.[].NetworkSettings.Networks.bridge.IPAddress') "$@" diff --git a/af/oometa/run-pg 
b/af/oometa/run-pg deleted file mode 100755 index dbee5e4c..00000000 --- a/af/oometa/run-pg +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh -ex - -id=$(docker run --rm --detach --hostname spbmeta --name spbmeta -v "$PWD/tsv:/tsv:ro" -e POSTGRES_USER=oopguser postgres:9.6) -until docker exec -i $id psql -U oopguser -c 'select 1' >/dev/null; do - sleep 0.5 -done - -cat *.install.sql | docker exec -i $id psql -U oopguser - -for fname in tsv/*.tsv; do - if [ -r "$fname" ]; then - tbl=$(basename "$fname" .tsv) - docker exec -i $id psql -U oopguser -c "TRUNCATE ${tbl}; COPY ${tbl} FROM '/tsv/${tbl}.tsv'" - fi -done - -docker inspect $id | jq --raw-output '.[].NetworkSettings.Networks.bridge.IPAddress' diff --git a/af/oometa/stop-pg b/af/oometa/stop-pg deleted file mode 100755 index eaaf1fb0..00000000 --- a/af/oometa/stop-pg +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -exec docker stop spbmeta diff --git a/af/oometa/tsvdiffstat b/af/oometa/tsvdiffstat deleted file mode 100755 index 29c2e820..00000000 --- a/af/oometa/tsvdiffstat +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# -# The script is used to estimate the difference between table slice fetched -# from remote MetaDB instance via `tsvfetch` and corresponding tables dumped -# from local MetaDB instance via `tsvdump`. -# - -for tbl in autoclaved report measurement; do - echo "==> ${tbl}" - diff -u --label ${tbl}.xxx_no <(cut -f 1 tsv/${tbl}.tsv | sort -n) --label ${tbl}.xxx_no <(cut -f 1 tsvdump.dir/${tbl}.tsv | sort -n) | diffstat -done - -for fname in tsv/*.tsv; do - tbl=$(basename "$fname" .tsv) - echo "==> ${tbl}" - diff -u --label ${tbl}.tsv <(sort tsv/${tbl}.tsv) --label ${tbl}.tsv <(sort tsvdump.dir/${tbl}.tsv) | diffstat -done diff --git a/af/oometa/tsvdump b/af/oometa/tsvdump deleted file mode 100755 index 77f37b09..00000000 --- a/af/oometa/tsvdump +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -ex - -mkdir -p tsvdump.dir -for fname in tsv/*.tsv; do - tbl=$(basename "$fname" .tsv) - if [ ! 
-f "tsvdump.dir/${tbl}.tsv" ]; then - ./psql -c "\\copy ${tbl} to 'tsvdump.dir/${tbl}.tsv'" - fi -done diff --git a/af/oometa/tsvfetch b/af/oometa/tsvfetch deleted file mode 100755 index ddb5a913..00000000 --- a/af/oometa/tsvfetch +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -ex -# -# The script fetches tables slices corresponding to single bucket -# from remote # MetaDB instance. -# - -set -o pipefail - -# Create tables to fetch only one-bucked-sized slice of the data: -suffix="181230" -date="2018-12-30" - -psql.hkgmetadb < Fri, 05 Jun 2020 21:02:07 +0100 - -prio (0.2) unstable; urgency=medium - - * Update conf - - -- Federico Ceratto Fri, 05 Jun 2020 15:51:00 +0100 - -prio (0.1) unstable; urgency=medium - - * URL prioritization MVP - https://github.com/ooni/backend/issues/361 - https://github.com/ooni/pipeline/pull/311 - - -- Federico Ceratto Tue, 14 Apr 2020 22:09:35 +0100 diff --git a/af/prio/debian/control b/af/prio/debian/control deleted file mode 100644 index 9a4619b0..00000000 --- a/af/prio/debian/control +++ /dev/null @@ -1,25 +0,0 @@ -Source: prio -Section: python -Priority: optional -Maintainer: Federico Ceratto -Build-Depends: debhelper-compat (= 12), - python3, - dh-systemd (>= 1.5), - dh-python, - python3-psycopg2, - python3-setuptools, -Standards-Version: 4.5.0 - -Package: prio -Architecture: all -Depends: ${misc:Depends}, - ${python3:Depends}, - python3-bottle, - python3-prometheus-client, - python3-psycopg2, - python3-sdnotify, - python3-systemd -Suggests: - python3-pytest, -Description: OONI Prio - URL prioritization diff --git a/af/prio/debian/postinst b/af/prio/debian/postinst deleted file mode 100644 index 7bcd06dc..00000000 --- a/af/prio/debian/postinst +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh -# postinst script for prio - -set -e - -case "$1" in - configure) - addgroup --system --quiet prio - adduser --system --quiet --ingroup prio --home /var/lib/prio prio - chmod 1777 /run/nodeexp/ - ;; - - abort-upgrade|abort-remove|abort-deconfigure) - ;; 
- - *) - echo "postinst called with unknown argument \`$1'" >&2 - exit 1 - ;; -esac - -# dh_installdeb will replace this with shell code automatically -# generated by other debhelper scripts. - -#DEBHELPER# - -exit 0 diff --git a/af/prio/debian/prio.service b/af/prio/debian/prio.service deleted file mode 100644 index 415ed0af..00000000 --- a/af/prio/debian/prio.service +++ /dev/null @@ -1,41 +0,0 @@ -[Unit] -Description=OONI Prio -Wants=network-online.target -After=network-online.target - -[Service] -ExecStart=/usr/bin/prio -Restart=on-failure -Type=simple -RestartSec=2s -WorkingDirectory=/var/lib/prio - -WatchdogSec=600s - -User=prio -Group=prio -ReadOnlyDirectories=/ -ReadWriteDirectories=/proc/self -ReadWriteDirectories=/var/lib/prio -ReadWriteDirectories=/run/nodeexp/ - -StandardOutput=syslog+console -StandardError=syslog+console - -PermissionsStartOnly=true -LimitNOFILE=65536 - -# Hardening -CapabilityBoundingSet=CAP_SETUID CAP_SETGID -SystemCallFilter=~@cpu-emulation @debug @keyring @module @mount @obsolete @raw-io @resources @clock @debug @keyring @mount @privileged @reboot @setuid @swap @memlock - -NoNewPrivileges=yes -PrivateDevices=yes -PrivateTmp=yes -ProtectHome=yes -ProtectSystem=full -ProtectKernelModules=yes -ProtectKernelTunables=yes - -[Install] -WantedBy=multi-user.target diff --git a/af/prio/debian/rules b/af/prio/debian/rules deleted file mode 100755 index 1ddb91ec..00000000 --- a/af/prio/debian/rules +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/make -f -export DH_VERBOSE = 1 - -%: - dh $@ --with python3 --buildsystem=pybuild - -override_dh_installsystemd: - dh_installsystemd --no-restart-on-upgrade --name prio diff --git a/af/prio/debian/source/format b/af/prio/debian/source/format deleted file mode 100644 index 163aaf8d..00000000 --- a/af/prio/debian/source/format +++ /dev/null @@ -1 +0,0 @@ -3.0 (quilt) diff --git a/af/prio/etc/ooni/prio.conf b/af/prio/etc/ooni/prio.conf deleted file mode 100644 index a6570ed8..00000000 --- 
a/af/prio/etc/ooni/prio.conf +++ /dev/null @@ -1,9 +0,0 @@ -[DEFAULT] -apiport = 8788 -dbhost = amsmetadb.ooni.nu -dbname = metadb -# already public -dbpassword = yEqgNr2eXvgG255iEBxVeP -dbport = 5432 -dbuser = shovel -refresh_interval_s = 300 diff --git a/af/prio/prio.py b/af/prio/prio.py deleted file mode 100755 index 3bf3e33c..00000000 --- a/af/prio/prio.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -"""OONI URL prioritization service - -Configured with /etc/ooni/prio.conf - -Runs as a system daemon - -Inputs: - Database tables: - `citizenlab` db table - country codes are in the `cc` column, lowercase, with the exception of ZZ - -Outputs: - Files in /var/lib/analysis - Node exporter / prometheus metrics - Dedicated unlogged database tables and charts - tables: - currently_blocked - -Special country code values: - ZZ: unknown - XX: all -""" - -from collections import namedtuple -from configparser import ConfigParser -from typing import List -import logging -import random -import time - -from systemd.journal import JournalHandler # debdeps: python3-systemd - -from bottle import route -import bottle - -from psycopg2.extras import RealDictCursor -import psycopg2 - -conf = None -test_items = {} -last_update_time = 0 - -log = logging.getLogger("prio") -log.addHandler(JournalHandler(SYSLOG_IDENTIFIER="prio")) - - -def connect_db(c): - conn = psycopg2.connect( - dbname=c.dbname, - user=c.dbuser, - host=c.dbhost, - port=c.dbport, - password=c.dbpassword, - ) - return conn - - -# @metrics.timer("update_url_prioritization") -def update_url_prioritization(): - """ - """ - log.info("Started update_url_prioritization") - conn = connect_db(conf) - cur = conn.cursor(cursor_factory=RealDictCursor) - - log.info("Regenerating URL prioritization file") - sql = """SELECT priority, domain, url, cc, category_code FROM citizenlab""" - cur.execute(sql) - entries = list(cur.fetchall()) - conn.rollback() - conn.close() - - # Create dict: cc -> 
category_code -> [entry, ... ] - entries_by_country = {} - for e in entries: - country = e["cc"].upper() - if country not in entries_by_country: - entries_by_country[country] = {} - ccode = e["category_code"] - entries_by_country[country].setdefault(ccode, []).append(e) - - # merge ZZ into each country - zz = entries_by_country.pop("ZZ") - for ccode, country_dict in entries_by_country.items(): - for category_code, test_items in zz.items(): - country_dict.setdefault(category_code, []).extend(test_items) - - log.info("Update done") - return entries_by_country - - -def algo_chao(s: List, k: int) -> List: - """Chao weighted random sampling - """ - n = len(s) - assert len(s) >= k - wsum = 0 - r = s[:k] - assert len(r) == k - for i in range(0, n): - wsum = wsum + s[i]["priority"] - if i < k: - continue - p = s[i]["priority"] / wsum # probability for this item - j = random.random() - if j <= p: - pos = random.randint(0, k - 1) - r[pos] = s[i] - - return r - - -def generate_test_list(country_code: str, category_codes: str, limit: int): - global test_items, last_update_time - - if last_update_time < time.time() - 100: # conf.refresh_interval: - last_update_time = time.time() - try: - test_items = update_url_prioritization() - except Exception as e: - log.error(e, exc_info=1) - - candidates_d = test_items[country_code] # category_code -> [test_item, ... 
] - - if category_codes: - category_codes = [c.strip().upper() for c in category_codes.split(",")] - else: - category_codes = candidates_d.keys() - - candidates = [] - for ccode in category_codes: - s = candidates_d.get(ccode, []) - candidates.extend(s) - - log.info("%d candidates", len(candidates)) - - if limit == -1: - limit = 100 - limit = min(limit, len(candidates)) - selected = algo_chao(candidates, limit) - - out = [] - for entry in selected: - out.append( - { - "category_code": entry["category_code"], - "url": entry["url"], - "country_code": "XX" if entry["cc"] == "ZZ" else entry["cc"], - } - ) - return out - - -@route("/api/v1/test-list/urls") -def list_urls(): - """ - https://orchestrate.ooni.io/api/v1/test-list/urls?country_code=IT - """ - try: - country_code = bottle.request.query.country_code.upper() or "ZZ" - category_codes = bottle.request.query.category_code - limit = int(bottle.request.query.limit or -1) - test_items = generate_test_list(country_code, category_codes, limit) - out = { - "metadata": { - "count": len(test_items), - "current_page": -1, - "limit": -1, - "next_url": "", - "pages": 1, - }, - "results": test_items, - } - return out - except Exception as e: - log.error(e, exc_info=1) - return {} - - -def main(): - global conf - conffile = "/etc/ooni/prio.conf" - cp = ConfigParser() - with open(conffile) as f: - cp.read_file(f) - d = cp.defaults() # parsed values from DEFAULT section - conf = namedtuple("Conf", d.keys())(*d.values()) - bottle.run(host="localhost", port=conf.apiport) - - -if __name__ == "__main__": - main() diff --git a/af/prio/setup.py b/af/prio/setup.py deleted file mode 100644 index e93083f7..00000000 --- a/af/prio/setup.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from setuptools import setup - -setup( - entry_points={"console_scripts": ["prio=prio:main",]}, - include_package_data=True, - install_requires=[], - name="prio", - py_modules=["prio"], - python_requires=">=3.7.0", - 
zip_safe=False, -) diff --git a/af/prio/tests/test_functional.py b/af/prio/tests/test_functional.py deleted file mode 100644 index 2cdce499..00000000 --- a/af/prio/tests/test_functional.py +++ /dev/null @@ -1,89 +0,0 @@ -import time -import random -from collections import Counter - -import pytest - -import prio - - -def test_algo_chao(): - random.seed(3) - entries = [ - {"priority": 500, "name": "heavy"}, - {"priority": 200, "name": "b"}, - {"priority": 150, "name": "c"}, - {"priority": 50, "name": "uncommon"}, - ] + [{"priority": 100, "name": "None"},] * 95 - - c = Counter() - for x in range(1, 10000): - selected = prio.algo_chao(entries, 1) - c.update([s["name"] for s in selected]) - - assert c.most_common() == [ - ("None", 9060), - ("heavy", 494), - ("b", 221), - ("c", 173), - ("uncommon", 51), - ] - - -def test_generate_test_list_no_country(): - prio.last_update_time = time.time() - prio.test_items = {} - with pytest.raises(Exception): - prio.generate_test_list("XY", [], 10) - - -def test_generate_test_list_bug(): - prio.last_update_time = time.time() - prio.test_items = {"IE": {"NEWS": []}} - tl = prio.generate_test_list("IE", "NEWS", 10) - assert not tl - - -def test_generate_test_list_3(): - random.seed(3) - prio.last_update_time = time.time() - prio.test_items = { - "IE": { - "NEWS": [ - {"priority": 100, "category_code": "NEWS", "url": "url1", "cc": "IE"}, - {"priority": 200, "category_code": "NEWS", "url": "url2", "cc": "IE"}, - {"priority": 300, "category_code": "NEWS", "url": "url3", "cc": "IE"}, - ] - } - } - tl = prio.generate_test_list("IE", "NEWS", 2) - assert [i["url"] for i in tl] == ["url3", "url2"] - - tl = prio.generate_test_list("IE", "NEWS", 2) - assert [i["url"] for i in tl] == ["url1", "url3"] - - -def test_generate_test_list_categories(): - random.seed(3) - prio.last_update_time = time.time() - prio.test_items = { - "IE": { - "NEWS": [ - {"priority": 100, "category_code": "NEWS", "url": "url1", "cc": "IE"}, - ], - "ANON": [ - {"priority": 
200, "category_code": "ANON", "url": "url2", "cc": "IE"}, - ], - "FILE": [ - {"priority": 300, "category_code": "FILE", "url": "url3", "cc": "IE"}, - ], - } - } - tl = prio.generate_test_list("IE", "NEWS", 3) - assert [i["url"] for i in tl] == ["url1"], tl - - tl = prio.generate_test_list("IE", "NEWS,ANON", 3) - assert [i["url"] for i in tl] == ["url1", "url2"], tl - - tl = prio.generate_test_list("IE", "ANON,FILE", 3) - assert [i["url"] for i in tl] == ["url2", "url3"], tl diff --git a/analysis/analysis/counters_table.adoc b/analysis/analysis/counters_table.adoc deleted file mode 100644 index ce3b1cd9..00000000 --- a/analysis/analysis/counters_table.adoc +++ /dev/null @@ -1,16 +0,0 @@ - -== Counters table - -Table definition in../oometa/021-create-counters-table.install.sql - -==== Use cases - -* Showing global statistics and counts in Explorer, replacing the ooexp* tables -* list_measurements in Explorer: estimate the number of results before running heavy queries -* Manually run aggregation queries -* Converted into a dataframe with Pandas on jupiter for analysis - -==== Update process - -It is updated by the analysis process - diff --git a/api/.babelrc b/api/.babelrc deleted file mode 100644 index c13c5f62..00000000 --- a/api/.babelrc +++ /dev/null @@ -1,3 +0,0 @@ -{ - "presets": ["es2015"] -} diff --git a/api/LICENSE.md b/api/LICENSE.md deleted file mode 100644 index 9249ef41..00000000 --- a/api/LICENSE.md +++ /dev/null @@ -1,26 +0,0 @@ -Copyright 2019 Open Observatory of Network Interference (OONI), The Tor Project - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. 
Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/api/build_runner.sh b/api/build_runner.sh index 837cad43..053f3816 100755 --- a/api/build_runner.sh +++ b/api/build_runner.sh @@ -18,7 +18,7 @@ locale-gen en_US.UTF-8 # Set up OONI archive echo 'deb http://deb-ci.ooni.org unstable main' \ > /etc/apt/sources.list.d/ooni.list -apt-key adv --keyserver hkp://keyserver.ubuntu.com \ +apt-key adv --keyserver hkps://keys.openpgp.org \ --recv-keys "B5A08F01796E7F521861B449372D1FF271F2DD50" apt-get update diff --git a/api/ooniapi/markdown/api_docs.md b/api/ooniapi/markdown/api_docs.md deleted file mode 100644 index 4f16ab2a..00000000 --- a/api/ooniapi/markdown/api_docs.md +++ /dev/null @@ -1,248 +0,0 @@ -# OONI Measurements API - -This is the documentation for version 1 of the OONI measurements API. - -All the API endpoints start with the URL `/api/v1/`. 
- -# Pagination - -Some API endpoints support pagination. In these cases the response will have - the following structure: - -``` -{ - "metadata": { - "offset": "an integer specifying the current offset into the data", - "limit": "an integer specifying how many results should be presented", - "count": "an integer expressing the total number of items", - "pages": "the number of pages, or the number of requests you will" - "have to do with the current value of limit to obtain the" - "full set of records", - "next_url": "the url to be used to fetch the next set of items" - }, - "results": [ - "a list containing generally dictionaries of the result in question" - ] -} -``` - -## Search files - -Returns a listing of the files matching the given search criteria. - -This API endpoints supports pagination and will by default return 100 -results per response. - -### Request - -**URL** - - /api/v1/files - -**Method** - - `GET` - -**URL Params** - - `probe_cc=[string]` - the two letter country code. - - `probe_asn=[string]` - the - [Autonomous system](https://en.wikipedia.org/wiki/Autonomous_system_(Internet)) - number in the format "ASXXX" - - `test_name=[string]` - the name of the test - - `since=[string]` - the start date of when measurements were run (ex. - "2016-10-20T10:30:00") - - `until=[string]` - the end date of when measurement were run (ex. - "2016-10-20T10:30:00") - - `since_index=[integer]` - return results only strictly greater than the - provided index. - - `order_by=[string]` - by which key the results should be ordered by (default: test_start_time) - - `order=[string] ("desc", "asc")` - if the order should be ascending or descending. - - `offset=[integer]` - offset into the result set (default: 0) - - `limit=[integer]` - number of records to return (default: 100) - -**Data Params** - - None - -### Response - -#### Success - -**Code:** 200
-**Content:** - -``` -{ - "metadata": { - "count": "[integer] total number of rows", - "limit": "[integer] current limit to returned results", - "next_url": "[string] URL pointing to next page of results or none if no more pages are available", - "offset": "[integer] the current offset into the result set", - "pages": "[integer] total number of pages" - "current_page": "[integer] current page" - }, - "results": [ - { - "probe_asn": "[string] the Autonomous system number of the result", - "probe_cc": "[string] the country code of the result", - "test_name": "[string] the name of the test that was run", - "index": "[integer] the index of this result (useful when using since_index)", - "test_start_time": "[string] start time for the measurement is ISO 8601 format", - "download_url": "[string] url to the download. Note: if the download URL ends with '.gz' it should be considered compressed with gzip." - } - ] -} -``` - -#### Error - -**Code:** 400 BAD REQUEST
-**Content:** - -``` -{ - "error_code": 400, - "error_message": "Some error message" -} -``` - -## Search measurements - -Returns the IDs for the measurements that match the specified search -criteria. - -### Request - -**URL** - - /api/v1/measurements - -**Method** - - `GET` - -**URL Params** - - `report_id=[string]` - the report ID of the requested measurement - - `input=[string]` - the input for the requested measurement - - `probe_cc=[string]` - the two letter country code. - - `probe_asn=[string]` - the - [Autonomous system](https://en.wikipedia.org/wiki/Autonomous_system_(Internet)) - number in the format "ASXXX" - - `test_name=[string]` - the name of the test - - `since=[string]` - the start date of when measurements were run (ex. - "2016-10-20T10:30:00") - - `until=[string]` - the end date of when measurement were run (ex. - "2016-10-20T10:30:00") - - `order_by=[string]` - by which key the results should be ordered by (default: test_start_time) - - `order=[string] ("desc", "asc")` - if the order should be ascending or descending. - - `offset=[integer]` - offset into the result set (default: 0) - - `limit=[integer]` - number of records to return (default: 100) - -**Data Params** - - None - -### Response - -#### Success - -**Code:** 200
-**Content:** - -``` -{ - "metadata": { - "count": "[integer] total number of rows", - "limit": "[integer] current limit to returned results", - "next_url": "[string] URL pointing to next page of results or none if no more pages are available", - "offset": "[integer] the current offset into the result set", - "pages": "[integer] total number of pages" - "current_page": "[integer] current page" - }, - "results": [ - { - "measurement_id": "[string] the ID of the measurement returned", - "measurement_url": "[string] link to fetch the measurement (probably in the form of $BASEURL/api/v1/measurement/)" - } - ] -} -``` - -#### Error - -**Code:** 400 BAD REQUEST
-**Content:** - -``` -{ - "error_code": 400, - "error_message": "Some error message" -} -``` - - -## Fetch measurement - -Returns the specified measurement. - -### Request - -**URL** - - `/api/v1/measurement/` - -**Method** - - `GET` - -### Response - -#### Success - -**Code:** 200
-**Content:** - -``` -{ - "id": "XXXX", - "data": { - "probe_cc": "XX", - "probe_asn": "XX", - ... - "test_keys": {}, - } -} -``` - -#### Error - -**Code:** 400 BAD REQUEST
-**Content:** - -``` -{ - "error_code": 400, - "error_message": "Some error message" -} -``` diff --git a/api/rate_limit_quotas.py b/api/rate_limit_quotas.py deleted file mode 100644 index 9f95a2fa..00000000 --- a/api/rate_limit_quotas.py +++ /dev/null @@ -1,220 +0,0 @@ -""" -Rate limiter and quota system. - -Framework-independent rate limiting mechanism that provides: - * IP address and token-based accounting - * customizable quotas based on IP address and token - * late limiting based on resource usage (time spent on API calls) - * bucketing based on day, week, month - * statistics - * metrics - * fast in-memory storage - -Also provides a connector for Flask - -""" - -import time -import ipaddress -from typing import Dict, List, Optional, Tuple, Union - -IpAddress = Union[ipaddress.IPv4Address, ipaddress.IPv6Address] -IpAddrBucket = Dict[IpAddress, float] -IpAddrBuckets = Tuple[IpAddrBucket, IpAddrBucket, IpAddrBucket] -TokenBucket = Dict[str, float] -TokenBuckets = Tuple[TokenBucket, TokenBucket, TokenBucket] - - -class Limiter: - def __init__( - self, - limits: dict, - token_check_callback=None, - ipaddr_methods=["X-Real-Ip", "socket"], - whitelisted_ipaddrs=Optional[List[str]], - ): - # Bucket sequence: month, week, day - self._ipaddr_limits = [ - limits.get(l, None) - for l in ("ipaddr_per_month", "ipaddr_per_week", "ipaddr_per_day") - ] - self._token_limits = [ - limits.get(l, None) - for l in ("token_per_month", "token_per_week", "token_per_day") - ] - self._ipaddr_buckets = ({}, {}, {}) # type: IpAddrBuckets - self._token_buckets = ({}, {}, {}) # type: TokenBuckets - self._token_check_callback = token_check_callback - self._ipaddr_extraction_methods = ipaddr_methods - self._last_quota_update_time = time.monotonic() - self._whitelisted_ipaddrs = set() - for ipa in whitelisted_ipaddrs or []: - self._whitelisted_ipaddrs.add(ipaddress.ip_address(ipa)) - - self.increment_quota_counters(1) - self.refresh_quota_counters_if_needed() - - def 
increment_quota_counters(self, tdelta: int): - """Delta: time from previous run in seconds""" - if tdelta <= 0: - return - - iterable = ( - (30 * 24, self._ipaddr_limits[0], self._ipaddr_buckets[0]), - (7 * 24, self._ipaddr_limits[0], self._ipaddr_buckets[0]), - (1 * 24, self._ipaddr_limits[0], self._ipaddr_buckets[0]), - (30 * 24, self._ipaddr_limits[0], self._ipaddr_buckets[0]), - (7 * 24, self._ipaddr_limits[0], self._ipaddr_buckets[0]), - (1 * 24, self._ipaddr_limits[0], self._ipaddr_buckets[0]), - ) - for hours, limit, bucket in iterable: - vdelta = limit / hours / 3600 * tdelta - to_delete = [] - for k, v in bucket.items(): - v += vdelta - if v >= limit: - to_delete.append(k) - else: - bucket[k] = v - - for k in to_delete: - del bucket[k] - - def refresh_quota_counters_if_needed(self): - t = time.monotonic() - delta = t - self._last_quota_update_time - if delta > 3600: - self.increment_quota_counters(delta) - - self._last_quota_update_time = t - - def consume_quota(self, elapsed: float, ipaddr: Optional[IpAddress]=None, token=None) -> None: - """Consume quota in seconds - """ - assert ipaddr or token - if ipaddr: - assert isinstance(ipaddr, ipaddress.IPv4Address) - for n, limit in enumerate(self._ipaddr_limits): - b = self._ipaddr_buckets[n] - b[ipaddr] = b.get(ipaddr, limit) - elapsed - - else: - raise NotImplementedError() - - def get_minimum_across_quotas(self, ipaddr=None, token=None) -> float: - assert ipaddr or token - if ipaddr: - iterable = zip(self._ipaddr_limits, self._ipaddr_buckets) - return min(bucket.get(ipaddr, limit) for limit, bucket in iterable) - - else: - raise NotImplementedError() - - def is_quota_available(self, ipaddr=None, token=None) -> bool: - """Check if all quota buckets for an ipaddr/token are > 0 - """ - # return False if any bucket reached 0 - for bucket in self._ipaddr_buckets: - if ipaddr in bucket: - if bucket[ipaddr] <= 0: - return False - - return True - - def is_ipaddr_whitelisted(self, ipaddr: IpAddress) -> bool: - 
return ipaddr in self._whitelisted_ipaddrs - - def get_lowest_daily_quotas_summary(self, n=20) -> List[Tuple[int, float]]: - """Returns a summary of daily quotas with the lowest values - """ - li = sorted((val, ipa) for ipa, val in self._ipaddr_buckets[2].items()) - li = li[:n] - return [(int(ipa.packed[0]), val) for val, ipa in li] - - -# # Flask-specific code # # - -from flask import request, current_app -import flask - - -class FlaskLimiter: - def _get_client_ipaddr(self) -> IpAddress: - # https://github.com/alisaifee/flask-limiter/issues/41 - for m in self._limiter._ipaddr_extraction_methods: - if m == "X-Forwarded-For": - raise NotImplementedError("X-Forwarded-For ") - - elif m == "X-Real-Ip": - ipaddr = request.headers.get("X-Real-Ip", None) - if ipaddr: - return ipaddress.ip_address(ipaddr) - - elif m == "socket": - return ipaddress.ip_address(request.remote_addr) - - else: - raise NotImplementedError(f"IP address method {m} is unknown") - - methods = ",".join(self._limiter._ipaddr_extraction_methods) - raise Exception(f"Unable to detect IP address using {methods}") - - def _check_limits_callback(self): - """Check rate limits before processing a request - Refresh quota counters when needed - """ - self._limiter.refresh_quota_counters_if_needed() - ipaddr = self._get_client_ipaddr() - # token = request.headers.get("Token", None) - # if token: - # check token validity - if not self._limiter.is_quota_available(ipaddr=ipaddr): - flask.abort(429) - self._request_start_time = time.monotonic() - log = current_app.logger - log.error("_check_limits_callback called") - - def _after_request_callback(self, response): - """Consume quota and injects HTTP headers when responding to a request - """ - log = current_app.logger - try: - assert response - tdelta = time.monotonic() - self._request_start_time - ipaddr = self._get_client_ipaddr() - if not self._limiter.is_ipaddr_whitelisted(ipaddr): - self._limiter.consume_quota(tdelta, ipaddr=ipaddr) - q = 
self._limiter.get_minimum_across_quotas(ipaddr=ipaddr) - response.headers.add("X-RateLimit-Remaining", q) - - except Exception as e: - log.error(str(e), exc_info=True) - - finally: - return response - - def __init__( - self, - app, - limits: dict, - token_check_callback=None, - ipaddr_methods=["X-Real-Ip", "socket"], - whitelisted_ipaddrs=None, - ): - """ - """ - self._limiter = Limiter( - limits, - token_check_callback=token_check_callback, - ipaddr_methods=ipaddr_methods, - whitelisted_ipaddrs=whitelisted_ipaddrs, - ) - if app.extensions.get("limiter"): - raise Exception("The Flask app already has an extension named 'limiter'") - - app.before_request(self._check_limits_callback) - app.after_request(self._after_request_callback) - app.extensions["limiter"] = self - - def get_lowest_daily_quotas_summary(self, n=20) -> List[Tuple[int, float]]: - return self._limiter.get_lowest_daily_quotas_summary(n) diff --git a/api/refresh_deps b/api/refresh_deps deleted file mode 100755 index a40fd9fd..00000000 --- a/api/refresh_deps +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -exu -rm .tox/integ -rf -tox -e integ -v --notest -./.tox/integ/bin/pip3 freeze | sort > /tmp/freeze -sort requirements/main.txt | grep -v '^#.*$' | grep -v '^$' | sort > /tmp/main -meld /tmp/freeze /tmp/main diff --git a/api/scripts/init_db.sh b/api/scripts/init_db.sh deleted file mode 100755 index ba90d8ee..00000000 --- a/api/scripts/init_db.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -set -exuo pipefail - -export POSTGRES_HOST=db -export PGPASSWORD=$POSTGRES_PASSWORD - -tmpdir=$(mktemp -d) -cd $tmpdir -git clone --depth 1 https://github.com/ooni/pipeline.git - -echo "Create amsapi and readonly roles" -psql -U $POSTGRES_USER -h $POSTGRES_HOST $POSTGRES_USER -c "CREATE ROLE amsapi;" -psql -U $POSTGRES_USER -h $POSTGRES_HOST $POSTGRES_USER -c "CREATE ROLE readonly;" - -echo "Creating database tables using SQL files:" -ls pipeline/af/oometa/*.install.sql -cat pipeline/af/oometa/*.install.sql | psql -U 
$POSTGRES_USER -h $POSTGRES_HOST $POSTGRES_USER -v ON_ERROR_STOP=1 diff --git a/api/scripts/restore-dump.sh b/api/scripts/restore-dump.sh deleted file mode 100644 index 9ef3a5e6..00000000 --- a/api/scripts/restore-dump.sh +++ /dev/null @@ -1,2 +0,0 @@ -lz4cat meta-closure.sql.lz4 | psql -U postgres -h localhost -p 5433 measurements -f sample-dump.sql - diff --git a/api/tests/integ/test_aggregation.py b/api/tests/integ/test_aggregation.py new file mode 100644 index 00000000..dda68475 --- /dev/null +++ b/api/tests/integ/test_aggregation.py @@ -0,0 +1,640 @@ +import pytest + +from textwrap import dedent +from urllib.parse import urlencode +from ..utils import fjd + + +def api(client, subpath, **kw): + url = f"/api/v1/{subpath}" + if kw: + assert "?" not in url + url += "?" + urlencode(kw) + + response = client.get(url) + assert response.status_code == 200, response.data + assert response.is_json + return response.json + + +def test_aggregation_no_axis_with_caching(client, log): + # 0-dimensional data + url = "aggregation?probe_cc=CH&probe_asn=AS3303&since=2021-07-09&until=2021-07-10" + resp = client.get(f"/api/v1/{url}") + assert resp.status_code == 200 + assert resp.is_json + r = resp.json + r.pop("db_stats", None) + expected = { + "dimension_count": 0, + "result": { + "anomaly_count": 187, + "confirmed_count": 0, + "failure_count": 5, + "measurement_count": 1689, + "ok_count": 1497, + }, + "v": 0, + } + assert r == expected, fjd(r) + h = dict(resp.headers) + # FIXME: caching is currently disabled + # assert h["Cache-Control"] == "max-age=86400" + + +def test_aggregation_no_axis_csv(client, log): + # 0-dimensional data + url = "aggregation?probe_cc=CH&probe_asn=AS3303&since=2021-07-09&until=2021-07-10&format=CSV" + r = client.get(f"/api/v1/{url}") + assert not r.is_json + expected = dedent( + """\ + anomaly_count,confirmed_count,failure_count,measurement_count,ok_count + 187,0,5,1689,1497 + """ + ) + assert r.data.decode().replace("\r", "") == expected + assert 
r.content_type == "text/csv" + assert "Content-Disposition" not in r.headers # not a download + + +def test_aggregation_no_axis_csv_dload(client, log): + # 0-dimensional data + url = "aggregation?probe_cc=CH&probe_asn=AS3303&since=2021-07-09&until=2021-07-10&format=CSV&download=true" + r = client.get(f"/api/v1/{url}") + assert not r.is_json + assert r.content_type == "text/csv" + exp = "attachment; filename=ooni-aggregate-data.csv" + assert r.headers["Content-Disposition"] == exp + + +def test_aggregation_no_axis_domain(client): + # 0-dimensional data + url = "aggregation?probe_cc=BR&domain=www.cabofrio.rj.gov.br&since=2021-07-09&until=2021-07-10" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 21, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 21, + "ok_count": 0, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_domain_ipaddr(client): + # 0-dimensional data + url = "aggregation?domain=8.8.4.4&since=2021-07-01&until=2021-07-10" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 1, + "confirmed_count": 0, + "failure_count": 1, + "measurement_count": 10, + "ok_count": 8, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_input_ipaddr(client): + # 0-dimensional data + url = "aggregation?input=109.105.109.146:22&since=2021-07-08&until=2021-07-10" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 2, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 2, + "ok_count": 0, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_filter_by_category_code(client): + # 0-dimensional data + url = ( + "aggregation?probe_cc=BR&category_code=CULTR&since=2021-07-09&until=2021-07-10" + ) + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + 
"anomaly_count": 0, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 14, + "ok_count": 14, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_filter_multi_domain(client): + # 0-dimensional data + url = ( + "aggregation?domain=twitter.com,facebook.com&since=2021-07-09&until=2021-07-10" + ) + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 0, + "confirmed_count": 0, + "failure_count": 1, + "measurement_count": 16, + "ok_count": 15, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_filter_multi_probe_asn(client): + # 0-dimensional dat + url = "aggregation?probe_asn=AS3303,AS8167&since=2021-07-09&until=2021-07-10" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 187, + "confirmed_count": 0, + "failure_count": 5, + "measurement_count": 1689, + "ok_count": 1497, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_filter_multi_probe_cc(client): + # 0-dimensional data + url = "aggregation?probe_cc=BR,GB&since=2021-07-09&until=2021-07-10" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 123, + "confirmed_count": 0, + "failure_count": 113, + "measurement_count": 2435, + "ok_count": 2199, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_filter_multi_test_name(client): + # 0-dimensional data + url = "aggregation?test_name=web_connectivity,whatsapp&since=2021-07-09&until=2021-07-10" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 319, + "confirmed_count": 42, + "failure_count": 340, + "measurement_count": 8547, + "ok_count": 7846, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_filter_multi_test_name_1_axis(client): + # 1-dimensional: test_name + url = 
"aggregation?test_name=web_connectivity,whatsapp&since=2021-07-09&until=2021-07-10&axis_x=test_name" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 1, + "result": [ + { + "anomaly_count": 317, + "confirmed_count": 42, + "failure_count": 339, + "measurement_count": 8488, + "ok_count": 7790, + "test_name": "web_connectivity", + }, + { + "anomaly_count": 2, + "confirmed_count": 0, + "failure_count": 1, + "measurement_count": 59, + "ok_count": 56, + "test_name": "whatsapp", + }, + ], + "v": 0, + }, fjd(r) + + +def test_aggregation_no_axis_filter_multi_oonirun(client): + # 0-dimensional data + url = "aggregation?ooni_run_link_id=1234,2345&since=2021-07-09&until=2021-07-10" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 0, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 0, + "ok_count": 0, + }, + "v": 0, + }, fjd(r) + + +def test_aggregation_x_axis_only(client, log): + # 1 dimension: X + url = "aggregation?probe_cc=CH&probe_asn=AS3303&since=2021-07-09&until=2021-07-11&time_grain=day&axis_x=measurement_start_day" + r = api(client, url) + r.pop("db_stats", None) + expected = { + "dimension_count": 1, + "result": [ + { + "anomaly_count": 187, + "confirmed_count": 0, + "failure_count": 5, + "measurement_count": 1689, + "measurement_start_day": "2021-07-09", + "ok_count": 1497, + }, + ], + "v": 0, + } + assert r == expected, fjd(r) + + +def test_aggregation_x_axis_only_invalid_range(client, log): + # 1 dimension: X + url = "aggregation?since=2022-07-09&until=2021-07-11&time_grain=day&axis_x=measurement_start_day" + r = client.get(f"/api/v1/{url}") + assert r.status_code == 400 + + +def test_aggregation_x_axis_only_invalid_time_grain_too_small(client, log): + # 1 dimension: X + url = "aggregation?since=2020-07-09&until=2022-07-11&time_grain=hour&axis_x=measurement_start_day" + r = client.get(f"/api/v1/{url}") + assert r.status_code == 400 + exp 
= "Choose time_grain between day, week, month, year, auto for the given time range" + assert r.json["error"] == exp + + +def test_aggregation_x_axis_only_invalid_time_grain_too_large(client, log): + # 1 dimension: X + url = "aggregation?since=2022-07-09&until=2022-07-11&time_grain=year&axis_x=measurement_start_day" + r = client.get(f"/api/v1/{url}") + assert r.status_code == 400 + exp = "Choose time_grain between hour, day, auto for the given time range" + assert r.json["error"] == exp + + +def test_aggregation_x_axis_only_hour(client, log): + # 1 dimension: X + url = "aggregation?since=2021-07-09&until=2021-07-11&axis_x=measurement_start_day" + r = api(client, url) + r.pop("db_stats", None) + expected = { + "dimension_count": 1, + "result": [ + { + "anomaly_count": 686, + "confirmed_count": 42, + "failure_count": 777, + "measurement_count": 9990, + "measurement_start_day": "2021-07-09T00:00:00Z", + "ok_count": 8485, + }, + { + "anomaly_count": 0, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 1, + "measurement_start_day": "2021-07-09T01:00:00Z", + "ok_count": 1, + }, + ], + "v": 0, + } + assert r == expected, fjd(r) + + +def test_aggregation_x_axis_domain(client, log): + # 1 dimension: X + url = "aggregation?probe_cc=CH&probe_asn=AS3303&since=2021-07-09&until=2021-07-10&axis_x=domain" + r = api(client, url) + r.pop("db_stats", None) + assert r["dimension_count"] == 1 + for x in r["result"]: + if x["domain"] == "www.theregister.co.uk": + assert x == { + "anomaly_count": 0, + "confirmed_count": 0, + "domain": "www.theregister.co.uk", + "failure_count": 0, + "measurement_count": 1, + "ok_count": 1, + } + return + + assert False, "Msmt not found" + + +def test_aggregation_x_axis_without_since(client, log): + # 1 dimension: X + url = "aggregation?probe_cc=CH&probe_asn=AS3303&until=2021-07-10&axis_x=measurement_start_day" + r = client.get(f"/api/v1/{url}") + assert r.status_code == 400 + + +@pytest.mark.skip("To be fixed in future") +def 
test_aggregation_y_axis_only_blocking_type(client, log): + # 1 dimension: Y: blocking_type + url = "aggregation?since=2021-07-09&until=2021-07-10&axis_y=blocking_type" + r = api(client, url) + r.pop("db_stats", None) + expected = { + "dimension_count": 1, + "result": [ + # FIXME + ], + "v": 0, + } + assert r == expected, fjd(r) + + +def test_aggregation_x_axis_only_probe_cc(client, log): + # 1 dimension: X + url = "aggregation?since=2021-07-09&until=2021-07-10&axis_x=probe_cc" + r = api(client, url) + assert r["dimension_count"] == 1 + assert len(r["result"]) == 33 + + +@pytest.mark.skipif(not pytest.proddb, reason="use --proddb to run") +def test_aggregation_x_axis_only_category_code(client): + # 1-dimensional data + url = "aggregation?probe_cc=IE&category_code=HACK&since=2021-07-09&until=2021-07-10&axis_x=measurement_start_day" + r = api(client, url) + expected = { + "dimension_count": 1, + "result": [ + { + "anomaly_count": 32, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 1302, + "measurement_start_day": "2021-07-10", + }, + { + "anomaly_count": 13, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 1236, + "measurement_start_day": "2021-07-10", + }, + ], + "v": 0, + } + assert r == expected, fjd(r) + + +@pytest.mark.skipif(not pytest.proddb, reason="use --proddb to run") +def test_aggregation_x_axis_only_csv(client, log): + # 1-dimensional data + url = "aggregation?probe_cc=BR&probe_asn=AS8167&since=2021-07-09&until=2021-07-10&format=CSV&axis_x=measurement_start_day" + r = api(client, url) + expected = dedent( + """\ + anomaly_count,confirmed_count,failure_count,measurement_count,measurement_start_day + 0,0,0,5,2021-07-10 + 1,0,0,37,2020-01-04 + 2,0,0,46,2020-01-08 + 2,0,0,26,2020-01-13 + 0,0,0,20,2020-01-16 + 2,0,0,87,2020-01-20 + 0,0,0,6,2020-01-21 + 6,0,0,87,2020-01-23 + 0,0,0,11,2020-01-26 + 0,0,0,25,2020-01-27 + """ + ) + assert r.replace("\r", "") == expected + + +@pytest.mark.skipif(not pytest.proddb, reason="use 
--proddb to run") +def test_aggregation_x_axis_y_axis(client, log): + # 2-dimensional data + url = "aggregation?since=2021-07-09&until=2021-07-10&axis_x=measurement_start_day&axis_y=probe_cc&test_name=web_connectivity" + r = api(client, url) + + assert "error" not in r + assert r["dimension_count"] == 2 + assert len(r["result"]) == 2140 + + +def test_aggregation_x_axis_y_axis_are_the_same(client, log): + # 2-dimensional data + url = "aggregation?since=2021-07-09&until=2021-07-10&axis_x=probe_cc&axis_y=probe_cc&test_name=web_connectivity" + r = api(client, url) + assert r == {"error": "Axis X and Y cannot be the same", "v": 0} + + +@pytest.mark.skipif(not pytest.proddb, reason="use --proddb to run") +def test_aggregation_two_axis_too_big(client, log): + url = "aggregation?since=2021-10-14&until=2021-10-15&test_name=web_connectivity&axis_x=measurement_start_day&axis_y=input" + r = api(client, url) + assert r == {} + + +def test_aggregation_foo(client): + url = "aggregation?test_name=web_connectivity&since=2021-07-09&axis_x=probe_cc&until=2021-07-10" + r = api(client, url) + assert sorted(r["result"][0]) == [ + "anomaly_count", + "confirmed_count", + "failure_count", + "measurement_count", + "ok_count", + "probe_cc", + ] + + +def test_aggregation_x_axis_only_csv_2d(client, log): + # 2-dimensional data: day vs ASN + dom = "www.cabofrio.rj.gov.br" + url = f"aggregation?probe_cc=BR&domain={dom}&since=2021-07-09&until=2021-07-10&time_grain=day&axis_x=measurement_start_day&axis_y=probe_asn&format=CSV" + r = client.get(f"/api/v1/{url}") + assert r.status_code == 200 + assert not r.is_json + expected = dedent( + """\ + anomaly_count,confirmed_count,failure_count,measurement_count,measurement_start_day,ok_count,probe_asn + 1,0,0,1,2021-07-09,0,18881 + 1,0,0,1,2021-07-09,0,28154 + 1,0,0,1,2021-07-09,0,28183 + 1,0,0,1,2021-07-09,0,28210 + 1,0,0,1,2021-07-09,0,28343 + 3,0,0,3,2021-07-09,0,28573 + 1,0,0,1,2021-07-09,0,53029 + 1,0,0,1,2021-07-09,0,53089 + 
1,0,0,1,2021-07-09,0,53209 + 1,0,0,1,2021-07-09,0,262616 + 1,0,0,1,2021-07-09,0,262644 + 1,0,0,1,2021-07-09,0,262970 + 2,0,0,2,2021-07-09,0,262983 + 1,0,0,1,2021-07-09,0,264146 + 1,0,0,1,2021-07-09,0,264510 + 1,0,0,1,2021-07-09,0,264592 + 1,0,0,1,2021-07-09,0,268821 + 1,0,0,1,2021-07-09,0,269246 + """ + ) + assert r.data.decode().replace("\r", "") == expected + + +aggreg_over_category_code_expected = [ + { + "anomaly_count": 77, + "category_code": "ALDR", + "confirmed_count": 0, + "failure_count": 116, + "measurement_count": 250, + }, + { + "anomaly_count": 118, + "category_code": "ANON", + "confirmed_count": 0, + "failure_count": 184, + "measurement_count": 405, + }, + { + "anomaly_count": 35, + "category_code": "COMM", + "confirmed_count": 0, + "failure_count": 54, + "measurement_count": 107, + }, +] + + +@pytest.mark.skip("FIXME citizenlab") +def test_aggregation_x_axis_category_code(client, log): + # 1d data over a special column: category_code + url = ( + "aggregation?probe_cc=DE&since=2021-07-09&until=2021-07-10&axis_x=category_code" + ) + r = api(client, url) + assert r["dimension_count"] == 1, fjd(r) + # shortened to save space + assert r["result"][:3] == aggreg_over_category_code_expected, fjd(r) + + +# @pytest.mark.skipif(not pytest.proddb, reason="use --proddb to run") +@pytest.mark.skip("FIXME citizenlab") +def test_aggregation_y_axis_category_code(client, log): + # 1d data over a special column: category_code + url = ( + "aggregation?probe_cc=DE&since=2021-07-09&until=2021-07-10&axis_y=category_code" + ) + r = api(client, url) + assert "dimension_count" in r, fjd(r) + assert r["dimension_count"] == 1, fjd(r) + # shortened to save space. 
The query should be identical to + # test_aggregation_x_axis_category_code + assert r["result"][:3] == aggreg_over_category_code_expected, fjd(r) + + +@pytest.mark.skip("FIXME citizenlab") +def test_aggregation_xy_axis_category_code(client, log): + # 2d data over a special column: category_code + url = "aggregation?since=2021-07-09&until=2021-07-10&axis_x=category_code&axis_y=category_code" + r = api(client, url) + assert "dimension_count" in r, fjd(r) + assert r["dimension_count"] == 2, fjd(r) + # shortened to save space. The query should be identical to + # test_aggregation_x_axis_category_code + assert r["result"][:3] == [], fjd(r) + + +def test_aggregation_psiphon(client): + url = "aggregation?probe_cc=BR&since=2021-07-09&until=2021-07-10&test_name=psiphon" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 0, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 20, + "ok_count": 20, + }, + "v": 0, + } + + +def test_aggregation_test_name(client): + r = api(client, "aggregation?test_name=BOGUS") + assert r == {"error": "Invalid characters", "v": 0} + + +def test_aggregation_input(client): + url = "aggregation?since=2021-07-09&until=2021-07-10&input=http://www.cabofrio.rj.gov.br/" + r = api(client, url) + r.pop("db_stats", None) + assert r == { + "dimension_count": 0, + "result": { + "anomaly_count": 21, + "confirmed_count": 0, + "failure_count": 0, + "measurement_count": 21, + "ok_count": 0, + }, + "v": 0, + } + + +def test_aggregation_invalid_input(client): + url = "aggregation?since=2021-07-09&until=2021-07-10&input=~!^{}" + r = api(client, url) + assert r == {"error": "Invalid characters in input field", "v": 0} + + +def test_aggregation_invalid_input_2(client): + url = "aggregation?since=2021-07-09&until=2021-07-10&input=foo.org;" + r = api(client, url) + assert r == {"error": "Invalid characters in input field", "v": 0} + + +def test_aggregation_invalid_input_3(client): + 
url = "aggregation?since=2021-07-09&until=2021-07-10&input=foo.org%3D%27" + r = api(client, url) + assert r == {"error": "Invalid characters in input field", "v": 0} + + +def test_aggregation_bug_585(client): + url = "aggregation?test_name=web_connectivity&since=2022-01-24&until=2022-02-24&axis_x=measurement_start_day&category_code=LGBT" + r = api(client, url) diff --git a/build_docs.ini b/build_docs.ini deleted file mode 100644 index 12b4916b..00000000 --- a/build_docs.ini +++ /dev/null @@ -1,158 +0,0 @@ -[DEFAULT] -# Template to generate GitHub URLs -# variables: {action} {path} {lineno} -github_url_template = https://github.com/ooni/pipeline/{action}/master/{path}#L{lineno} - -# markdown or asciidoc -markup_format = markdown - -# files / dirs to be ignored -ignore_paths_substr = - build_docs.py - build_docs_output - af/shovel - af/prio - LICENSE.md - -outdir = build_docs_output - -# Author: name -author = OONI project - -# CSS: multiline field (use indentation > 0) -css = - html, body { - margin: 0; - font-family: "Fira Sans", sans-serif; - font-weight: 400; - } - .header { - padding: 2rem; - background-color: #0588cb; - color: #fff; - margin-bottom: 2rem; - } - a { - color: #0588cb; - } - pre { - border-left: none !important; - padding: 0.5rem 1rem; - } - a.headerlink { - color: #868e96 - } - a.headerlink:hover { - color: #0588cb; - } - div#pagepath { padding-bottom: 1em } - h1 { font-size: 48px; font-weight: 300; } - h2 { font-size: 36px; font-weight: 600; } - h3 { font-size: 28px; font-weight: 600; } - h4 { font-size: 22px; font-weight: 600; } - h5 { font-size: 20px; font-weight: 600; } - h6 { font-size: 20px; font-weight: 600; } - h7 { font-size: 20px; font-weight: 600; } - div.toc { - border: 1px solid #ccc; - margin: 1em; - padding: 1em; - border-radius: 10px; - font-size: 115%%; - } - footer { - background-color: rgb(0, 54, 91); - padding: 2rem; - color: #fff; - font-size: 14px; - font-family: "Fira Sans", sans-serif; - margin-top: 2rem; - } - .pt-1 { - 
padding-top: 0.25rem; - } - .pb-2 { - padding-bottom: 0.5rem; - } - .pb-4 { - padding-bottom: 1rem; - } - .footer-section-title { - font-weight: bold; - } - footer a { - color: #fff; - opacity: 0.5; - padding: 2px 0; - margin: 2px 0; - } - footer a:hover { - color: #fff; - opacity: 1; - } - .button { - color: #0588cb !important; - border-color: #0588cb !important; - } - .button:hover,.button:focus { - color: #343a40 !important; - border-color: #343a40 !important; - - } - .button-small { - padding: 0 2rem; - height: 3.5rem; - line-height: 3.5rem; - } - - -# CSS: multiline field (use indentation > 0) -html_imports = - - - -# footer: multiline field (use indentation > 0) -footer = - diff --git a/build_docs.py b/build_docs.py deleted file mode 100755 index cac97653..00000000 --- a/build_docs.py +++ /dev/null @@ -1,405 +0,0 @@ -#!/usr/bin/env python3 - -""" -Generate docs from module docstrigs -Link # to github issue -Link # to github issue -List #TODO and #FIXME - -debdeps: asciidoc-base (>= 8.6.9) -debdeps: python3-markdown -""" - -from configparser import ConfigParser -from io import StringIO -from pathlib import Path -from subprocess import check_call -from tempfile import NamedTemporaryFile -from textwrap import dedent -from typing import List -import ast -import base64 -import sys -import zlib - - -try: - # debdeps: asciidoc-base (>= 9.0.0) - sys.path.append("/usr/share/asciidoc") - import asciidocapi - - asciidoc_available = True -except ImportError: - asciidoc_available = False - -try: - # debdeps: python3-markdown - import markdown - from markdown.extensions.toc import TocExtension - from markdown.extensions.codehilite import CodeHiliteExtension - from markdown.extensions.fenced_code import FencedCodeExtension - - markdown_available = True -except ImportError: - markdown_available = False - -# debdeps: python3-blockdiag - -HTMLTPL = dedent( - """ - - - - - - {title} - {head_links} - - - -
-
-
-

Project documentation

-
-
-
-
- """ -) - -conf = None - - -def load_conf(): - confp = ConfigParser() - with open("build_docs.ini") as f: - confp.read_file(f) - return confp["DEFAULT"] - - -def glob_ext(ignored, ext): - for f in sorted(Path(".").glob(f"**/*.{ext}")): - if any(i in f.as_posix() for i in ignored): - continue - yield f - - -def _scan_ast(i, skipfirst=True): - for y in ast.iter_child_nodes(i): - if isinstance(y, ast.Expr) and isinstance(y.value, ast.Str): - if skipfirst: - skipfirst = False - else: - yield y.value.s, y.lineno - - -def extract_python_doc(inputf) -> List: - """Extract documentation strings from a Python file""" - a = ast.parse(inputf.read_text()) - out = [] - - def unroll(g): - for item in g: - s = str(item).strip() - if s: - out.append(s) - - out.extend(_scan_ast(a, skipfirst=False)) - - for i in ast.iter_child_nodes(a): - if isinstance(i, ast.FunctionDef): - out.extend(_scan_ast(i)) - - elif isinstance(i, ast.ClassDef): - out.extend(_scan_ast(i)) - for x in ast.iter_child_nodes(i): - if isinstance(x, ast.FunctionDef): - out.extend(_scan_ast(x)) - - return out - - -def render_adoc(orig_source_f: Path, infile: StringIO): - outfile = conf.outdir / orig_source_f.with_suffix(".html") - outfile.parent.mkdir(parents=True, exist_ok=True) - ad = asciidocapi.AsciiDocAPI() - ad.attributes["author"] = conf.get("author", "") - infile.seek(0) - with outfile.open("w") as outf: - ad.execute(infile, outf, backend="html5") - - -def render_markdown(orig_source_f: Path, inp: str): - outfile = conf.outdir / orig_source_f.with_suffix(".html") - outfile.parent.mkdir(parents=True, exist_ok=True) - print(outfile) - content = markdown.markdown( - inp, - extensions=[ - TocExtension(baselevel=3, permalink=' #'), - CodeHiliteExtension(), - FencedCodeExtension(), - ], - ) - html = wrap_page(orig_source_f, content) - outfile.write_text(html) - - -def generate_github_link(action: str, f: Path): - # action: blob edit - url_tpl = conf.get("github_url_template") - return url_tpl.format(action=action, 
path=f.as_posix(), lineno=0) - - -def generate_github_link_unused(action, f, lineno): - # action: blob edit - url_tpl = conf.get("github_url_template") - url = url_tpl.format(action=action, path=f.as_posix(), lineno=lineno) - adoc_tpl = f"""image:{action}.svg[link="{url}"]""" - # return f"""\nimage::https://asciidoctor.org/images/octocat.jpg[link="{url}"]\n""" - return adoc_tpl - - -def generate_badge(url, text): - tpl = """{text}""" - return tpl.format(url=url, text=text) - - -def generate_header_path(inputf: Path) -> str: - s = [] - backticker = "/".join([".."] * len(inputf.parents)) - if backticker: - backticker += "/" - pc = len(inputf.parents) - - for depth, x in enumerate(reversed(inputf.parents)): - item = "link:{}{}[{}]".format(backticker, str(x), x.name) - - backticker = "/".join([".."] * (pc - depth - 1)) - item = "link:{}[{}]".format(backticker, x.name) - - s.append(item) - last = "link:[{}]\n".format(inputf.name) - s.append(last) - out = " -> ".join(s) - return out - - -def generate_header_path_html(inputf: Path) -> str: - s = [] - backticker = "/".join([".."] * len(inputf.parents)) - if backticker: - backticker += "/" - pc = len(inputf.parents) - - for depth, x in enumerate(reversed(inputf.parents)): - backticker = "/".join([".."] * (pc - depth - 1)) - item = "{}".format(backticker, x.name) - s.append(item) - - # last = "[{}](.)".format(inputf.name) - last = "{}".format(inputf.name) - s.append(last) - out = " » ".join(s) - return """
""" + out + "
" - - -def generate_view_badge(f: Path): - url = generate_github_link("blob", f) - return generate_badge(url, "view") - - -def generate_edit_badge(f: Path): - url = generate_github_link("edit", f) - return generate_badge(url, "edit") - - -def generate_python_adoc(inputf: Path, pdoc: List): - adoc = [] - adoc.append(generate_header_path(inputf)) - for content, lineno in pdoc: - # gh_b = generate_github_link("blob", inputf, lineno) - # f.write(gh_b) - adoc.append("++++") - adoc.append(generate_view_badge(inputf)) - adoc.append(generate_edit_badge(inputf)) - adoc.append("++++") - adoc.append("\n" + content + "\n") - - return StringIO("\n".join(adoc)) - - -def generate_html_begin(orig_source_f): - hl = conf.get("html_imports", "") - css = conf.get("css", "") - return HTMLTPL.format(title=orig_source_f.name, head_links=hl, css=css) - - -def wrap_page(orig_source_f, content): - begin = generate_html_begin(orig_source_f) - header = generate_header_path_html(orig_source_f) - footer = conf.get("footer", "") - end = "
" + footer + "" - return begin + header + content + end - - -def generate_python_markdown(inputf: Path, pdoc: List): - lines = [] - for content, lineno in pdoc: - lines.append(generate_view_badge(inputf)) - lines.append(generate_edit_badge(inputf)) - lines.append("\n" + content + "\n") - - return "\n".join(lines) - - -def render_blockdiag(diag: str) -> str: - """Render blockdiag to SVG""" - print("Rendering blockdiag") - inp = NamedTemporaryFile("w") - inp.write(diag) - inp.flush() - out = NamedTemporaryFile("r") - cmd = ["/usr/bin/blockdiag3", "-T", "svg", "-o", out.name, inp.name] - try: - check_call(cmd) - except Exception as e: - print(f"Unable to render diagram: {e}") - print(f"------ diagram code ------\n{diag}\n------- end ------") - sys.exit(1) - svg = out.read() - _, _, svg = svg.split("\n", 2) - return svg - - -def process_diagrams(md: str) -> str: - """Extract diagrams and replace them with SVG/PNG images""" - out = "" - blocks = md.split("\nblockdiag {")[1:] - for block in blocks: - try: - diag, post = block.split("\n}", 1) - diag = "blockdiag {\n" + diag + "\n}\n" - # url = generate_kroki_url(diag, "blockdiag") - # exp = "https://kroki.io/blockdiag/svg/eNpdzDEKQjEQhOHeU4zpPYFoYesRxGJ9bwghMSsbUYJ4d10UCZbDfPynolOek0Q8FsDeNCestoisNLmy-Qg7R3Blcm5hPcr0ITdaB6X15fv-_YdJixo2CNHI2lmK3sPRA__RwV5SzV80ZAegJjXSyfMFptc71w==" - # assert url == exp, url - # url = f"""""" - # out += url - svg = render_blockdiag(diag) - out += f"\n
{svg}
" - out += post - - except ValueError as e: - out += block - - return out - - -# assert process_diagrams("") == "" -# svg = process_diagrams("a\nblockdiag {\n}\nb") -# assert svg == """a -# -# -# -# -# -# -# blockdiag -# blockdiag { -# -# } -# -# -# -# b""", svg - - -def generate_kroki_url(content, method: str) -> str: - """Generate URL for https://kroki.io/""" - # FIXME: Broken - if method != "blockdiag": - raise NotImplementedError - - baseurl = "https://kroki.io/graphviz/svg/" - content = content.encode() - path = base64.urlsafe_b64encode(zlib.compress(content, 9)) - path = path.decode() - return baseurl + path - - -def create_index_html(basedir: Path): - """Recursively create index.html files""" - for d in basedir.iterdir(): - if d.is_dir(): - create_index_html(d) - - out = generate_html_begin(basedir) - out += generate_header_path_html(basedir) - out += "
    " - for f in sorted(basedir.iterdir()): - n = f.with_suffix("").name - if f.is_dir(): - out += "
  • » {}
  • ".format(f.name, n) - elif f.suffix == ".html": - out += "
  • {}
  • ".format(f.name, n) - - footer = conf.get("footer", "") - footer = "
" + footer + "" - out += footer - indexf = basedir / "index.html" - indexf.write_text(out) - - -def main(): - global conf - conf = load_conf() - ignored = conf.get("ignore_paths_substr", "").split() - markup_format = conf.get("markup_format", "markdown") - conf.outdir = Path(conf.get("outdir", "build_docs_output")) - conf.outdir.mkdir(parents=True, exist_ok=True) - - if markup_format == "asciidoc": - print("Rendering AsciiDoc files") - for adocf in glob_ext(ignored, "adoc"): - # render_adoc(renderer, adocf) - pass - - elif markup_format == "markdown": - print("Rendering MarkDown files") - for f in glob_ext(ignored, "md"): - print(f"Reading {f}") - md = f.read_text() - md = process_diagrams(md) - render_markdown(f, md) - pass - - print("Rendering Python files") - for pyfile in glob_ext(ignored, "py"): - try: - pdoc = extract_python_doc(pyfile) - if not len(pdoc): - continue - - if markup_format == "asciidoc": - raise NotImplementedError - adocf = generate_python_adoc(pyfile, pdoc) - render_adoc(pyfile, adocf) - - elif markup_format == "markdown": - md = generate_python_markdown(pyfile, pdoc) - md = process_diagrams(md) - render_markdown(pyfile, md) - - except Exception as e: - print(e) - - create_index_html(conf.outdir) - - print("Done") - - -if __name__ == "__main__": - main() diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index c302654b..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -pipeline-16.10.png: pipeline-16.10.dot - dot -Tpng $^ >$@ diff --git a/docs/aggregate-task.pdf b/docs/aggregate-task.pdf deleted file mode 100644 index 15186c9c..00000000 Binary files a/docs/aggregate-task.pdf and /dev/null differ diff --git a/docs/airflow-pipeline.png b/docs/airflow-pipeline.png deleted file mode 100644 index bc7e3349..00000000 Binary files a/docs/airflow-pipeline.png and /dev/null differ diff --git a/docs/airflow-pools.png b/docs/airflow-pools.png deleted file mode 100644 index 1ee51964..00000000 Binary files 
a/docs/airflow-pools.png and /dev/null differ diff --git a/docs/cpu-daily.png b/docs/cpu-daily.png deleted file mode 100644 index e2dd885d..00000000 Binary files a/docs/cpu-daily.png and /dev/null differ diff --git a/docs/cpu-pipeline.png b/docs/cpu-pipeline.png deleted file mode 100644 index 6dfcfb1b..00000000 Binary files a/docs/cpu-pipeline.png and /dev/null differ diff --git a/docs/delete-report.md b/docs/delete-report.md deleted file mode 100644 index 67f04c50..00000000 --- a/docs/delete-report.md +++ /dev/null @@ -1,128 +0,0 @@ -This document describes, how to remove reports from historical data with minimal resources waste and minimal damage to the -on-going data processing. - -This document is valid as of February 2019, when the [Airflow data processing graph](./pipeline-16.10.md#data-flow) looked like following one: - -![hist_canning DAG](./airflow-pipeline.png) - -## "Modern" buckets - -Here is the checklist of places to clean up the report (for buckets >= 2018): - -- `s3://ooni-data-private/archives-raw/yaml/YYYY-MM-DD.tar.gz` -- `s3://ooni-data-private/archives-raw/yaml/YYYY-MM-DD.index.json.gz` -- `datacollector.infra.ooni.io:/data/ooni/private/reports-tgz/YYYY-MM-DD.index.json.gz` -- `s3://ooni-data-private/canned/YYYY-MM-DD/test_name.42.tar.lz4` -- `s3://ooni-data-private/canned/YYYY-MM-DD/index.json.gz` -- `datacollector.infra.ooni.io:/data/ooni/private/canned/YYYY-MM-DD.index.json.gz` -- `s3://ooni-data/autoclaved/jsonl.tar.lz4/YYYY-MM-DD/test_name.42.tar.lz4` -- `s3://ooni-data/autoclaved/jsonl.tar.lz4/YYYY-MM-DD/index.json.gz` -- `datacollector.infra.ooni.io:/data/ooni/public/autoclaved/YYYY-MM-DD/test_name.42.tar.lz4` -- `datacollector.infra.ooni.io:/data/ooni/public/autoclaved/YYYY-MM-DD/index.json.gz` -- `s3://ooni-data/autoclaved/jsonl/YYYY-MM-DD/yyyymmddThhmmssZ-ZZ-AS0-test_name-...-probe.json` - -## "Legacy" buckets - -Older reports may also(!) 
be stored in legacy folders and archives under following roots -in addition to aforementioned places: -- `s3://ooni-data-private/` -- `s3://ooni-data/sanitised/` - -Dealing with legacy data is out of the scope of this document. - -## Archive re-compression - -Both `archives-raw/yaml` and `canned` archives have to be re-compressed deleting affected report files. - -To keep `index.json.gz` file in-sync with actual archives, `delete_canned_report.py` -script has to be used. The script is conceptually a wrapper around -`gzip -d out/test_name.42.tar.gz` -that also maintains `index.json.gz`. - -Let's assume that: -- `affected.textname` lists all report files to delete from archives -- `affected.bucket_date` lists all affected buckets -- all affected `autoclaved` files were alread removed both from S3 and from datacollector.infra.ooni.io filesystem - -First, `reports-tgz` (huge, beware of disk usage!) and `canned` (smaller) archives have to be fetched from private S3 bucket: - -``` -user@datacollector:~$ sudo docker pull openobservatory/pipeline-shovel:latest -user@datacollector:~$ sudo mkdir /data/ooni/private/fix-{tgz,can} && sudo chown 1000:1000 /data/ooni/private/fix-{tgz,can} -user@datacollector:~$ sudo docker run --rm -ti --env-file=/srv/etc/af-worker/s3_ooni_datacollector.env -u 1000:1000 -v /data/ooni/private:/p:rw -v $PWD:/opt:ro openobservatory/pipeline-shovel:latest -$ aws s3 sync s3://ooni-data-private/archives-raw/yaml/ /p/reports-tgz/ --exclude '*' $(sed 's,.*,--include &.tar.gz,' /opt/affected.bucket_date) -$ for b in $(cat /opt/affected.bucket_date); do for f in $(delete_canned_report.py --files-from /opt/affected.textname --canned /p/canned --bucket "$b" --list); do test ! 
-f /p/canned/"$f" && aws s3 cp s3://ooni-data-private/canned/"$f" /p/canned/"$f"; done; done -``` - -Second, the archives have to be cleaned up: - -``` -user@datacollector:~$ sudo docker run --rm -ti -u 1000:1000 -v /data/ooni/private:/p:rw -v $PWD:/opt:ro openobservatory/pipeline-shovel:latest -$ for b in $(cat /opt/affected.bucket_date); do delete_canned_report.py --files-from /opt/affected.textname --reports-tgz /p/reports-tgz --bucket "$b" --dst /p/fix-tgz; done -$ for b in $(cat /opt/affected.bucket_date); do delete_canned_report.py --files-from /opt/affected.textname --canned /p/canned --bucket "$b" --dst /p/fix-can; done; date -``` - -Third, `autoclaved` files deleted earlier have to be re-created from `canned` files: - -``` -user@datacollector:~$ sudo docker run --rm -ti -u 1000:1000 -v /data/ooni/private:/p:ro -v /data/ooni/public:/pub:rw -v $PWD:/opt:ro openobservatory/pipeline-shovel:latest -$ for b in $(cat /opt/affected.bucket_date); do chmod u+w "/pub/autoclaved/${b}" && autoclaving.py --canned-root /p/fix-can --bridge-db /p/bridge_db/bridge_db.json --autoclaved-root /pub/autoclaved --missing --start ${b}T00:00:00 --end $(date -d "$b + 1 day" --rfc-3339=date)T00:00:00; done -``` - -Fourth, changed `autoclaved` files in the affected buckets have to be re-ingested into MetaDB. That's done under GNU `make` control as Airflow scheduler is not happy about hundreds of DAGs being active. 
Also, exit-code of the TaskInstances should be inspected to verify that all the buckets are processed correctly: - -``` -user@datacollector:~$ tmux -user@datacollector:~$ ./pipeline-reprocess reprocess -user@datacollector:~$ ./pipeline-reprocess rc -``` - -Fifth, the `autoclaved` files should be published to S3 now, API already tries to fetch them as metadata in now updated in the MetaDB: - -``` -user@datacollector:~$ sudo docker run --rm -ti --env-file=/srv/etc/af-worker/s3root.env -u 1000:1000 -v /data/ooni/public:/pub:ro -v $PWD:/opt:ro openobservatory/pipeline-shovel:latest -$ cd /pub/autoclaved -$ for b in $(cat /opt/affected.bucket_date); do aws s3 sync ${b}/ s3://ooni-data/autoclaved/jsonl.tar.lz4/${b}/; done -``` - -Sixth, archives stored in private S3 bucket are updated: - -``` -user@datacollector:~$ sudo docker run --rm -ti --env-file=/srv/etc/af-worker/s3_ooni_datacollector.env -u 1000:1000 -v /data/ooni/private:/p:ro -v $PWD:/opt:ro openobservatory/pipeline-shovel:latest -$ cd /p/fix-tgz -$ for f in *; do aws s3 cp ${f} s3://ooni-data-private/archives-raw/yaml/${f}; done -$ cd /p/fix-can -$ for b in *; do aws s3 sync ${b}/ s3://ooni-data-private/canned/${b}/; done -``` - -At this point `VACUUM FULL` should be run on MetaDB tables and -all the previous MetaDB snapshots and WALs should be probably deleted from -[public MetaDB archive](https://github.com/ooni/sysadmin/issues/272) -depending on reason for report deletion. 
- -Last, file trees on `datacollector` should be updated and stale cashes should be purged: - -``` -user@datacollector:~$ cd /data/ooni/private/reports-tgz/ -user@datacollector:~$ sudo rm /data/ooni/private/fix-tgz/*.tar.gz -user@datacollector:~$ sudo mv /data/ooni/private/fix-tgz/*.index.json.gz /data/ooni/private/reports-tgz/ -user@datacollector:~$ sudo rm $(sed 's,$,.tar.gz,' ~/affected.bucket_date) - -user@datacollector:~$ cd /data/ooni/private/fix-can/ -user@datacollector:~$ for b in $(cat ~/affected.bucket_date); do sudo mv $b/index.json.gz /data/ooni/private/canned/$b/index.json.gz; done -user@datacollector:~$ find . -type f | sed 's,^,/data/ooni/private/canned/,' | xargs sudo rm -user@datacollector:~$ sudo find . -type f -delete -user@datacollector:~$ sudo rmdir * - -user@datacollector:~$ cd /data/ooni/private/reports-tgz-s3-ls/ -user@datacollector:~$ sudo rm $(sed 's,$,.json.gz,' ~/affected.bucket_date) - -user@datacollector:~$ cd /data/ooni/private/canned-s3-ls/ -user@datacollector:~$ sudo rm $(sed 's,$,.json.gz,' ~/affected.bucket_date) -``` - -## Logs - -It's possible to estimate if the report was ever accessed or not using logs of: -- `api.ooni.io` webserver storing nginx logs -- `s3://ooni-data-logs/` storing logs of `s3://ooni-data/` access diff --git a/docs/links.txt b/docs/links.txt deleted file mode 100644 index 5167423f..00000000 --- a/docs/links.txt +++ /dev/null @@ -1,2 +0,0 @@ -http://blog.treasuredata.com/blog/2015/02/25/managing-the-data-pipeline-with-git-luigi/ - diff --git a/docs/ooid-hash-prob.ipynb b/docs/ooid-hash-prob.ipynb deleted file mode 100644 index 4d757153..00000000 --- a/docs/ooid-hash-prob.ipynb +++ /dev/null @@ -1,261 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from scipy.special import binom\n", - "from scipy.misc import factorial\n", - "from math import log" - ] - }, - { - "cell_type": 
"code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "data = pd.read_csv('autoclaved-1529600920.coincidence-stat', sep='\\t', names=['rep', 'msm'])" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
repmsm
021021
121023
22999
341002
441002
\n", - "
" - ], - "text/plain": [ - " rep msm\n", - "0 2 1021\n", - "1 2 1023\n", - "2 2 999\n", - "3 4 1002\n", - "4 4 1002" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of colliding timestamps: 316084\n", - "Number of reports: 753235\n", - "Number of measurements: 40440216\n" - ] - } - ], - "source": [ - "print 'Number of colliding timestamps:', len(data.rep)\n", - "print 'Number of reports:', data.rep.sum()\n", - "print 'Number of measurements:', data.msm.sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "def prob(bits):\n", - " space = (2.**bits)**data.rep\n", - " k = data.rep\n", - " n = (2.**bits) - data.msm\n", - " good = binom(n + k , k) * factorial(data.rep)\n", - " perexp = good / space\n", - " return perexp" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "32 0.0 0.968998231107\n", - "31 0.1 0.938957560662\n", - "30 0.2 0.881641258555\n", - "29 0.4 0.777291160072\n", - "28 0.7 0.604181085148\n", - "27 1.5 0.365033666199\n", - "26 2.9 0.133247945814\n", - "25 5.8 0.0177541454083\n", - "24 11.6 0.000315147922694\n", - "23 23.3 9.92403912673e-08\n", - "22 46.5 9.81781524666e-15\n", - "21 93.1 9.51871872619e-29\n", - "20 186.2 8.6164609409e-57\n" - ] - } - ], - "source": [ - "for i in [32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20]:\n", - " perexp = prob(i)\n", - " p = perexp.prod() # all at once\n", - " print i, '%6.1f' % (-log(p)/log(2)), p" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - 
"text/plain": [ - "count 316084.000000\n", - "mean 0.999593\n", - "std 0.001830\n", - "min 0.906941\n", - "25% 0.999957\n", - "50% 0.999999\n", - "75% 0.999999\n", - "max 1.000020\n", - "dtype: float64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# per-experiment probability of success is still quite high, so some set of keys can be bruteforced for 20bit tail\n", - "pd.Series(perexp).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of buckets 256\n" - ] - }, - { - "data": { - "text/plain": [ - "count 257.000000\n", - "mean 0.668918\n", - "std 0.220726\n", - "min 0.002041\n", - "25% 0.533609\n", - "50% 0.687229\n", - "75% 0.842643\n", - "max 0.989087\n", - "dtype: float64" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "step = 1234\n", - "print 'Number of buckets', len(perexp) / step\n", - "pd.Series([perexp[i:i+step].product() for i in range(0, len(perexp), step)]).describe()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/docs/ooni-pipeline-architecture.png b/docs/ooni-pipeline-architecture.png deleted file mode 100644 index f938c336..00000000 Binary files a/docs/ooni-pipeline-architecture.png and /dev/null differ diff --git a/docs/ooni-uuid.md b/docs/ooni-uuid.md deleted file mode 100644 index 1a00b86f..00000000 --- a/docs/ooni-uuid.md +++ /dev/null @@ -1,186 +0,0 @@ -# On `OOID` OONI measurement 
UUID - -Every measurement already has some set of identifiers described in [OONI Spec](https://github.com/ooni/spec/blob/master/data-formats/df-000-base.md): - -- `id` — UUID stamped by client, fallback to `UUID(bytes=(sha1(measurement_blob)[:16]))` at pipeline -- `report_id` — generated by collector on opening `/report` as `{server_now|iso8601}_AS{asn}_{urandom_alnum(50)}`, stamped by client. Old yaml reports have it [generated by pipeline](https://github.com/TheTorProject/ooni-pipeline/blob/4ddc40b6ab5eafc5759eeaf4f23dcf05bdbf6e65/af/shovel/autoclaving.py#L225-L235) as `{start_time}_{hash_cc_asn_test_ver_city_alnum(50)}` that has _at most_ 34 bits entropy for 50-byte string. Some json reports have `null` values, some yaml reports have `urandom_alnum(64)`. Collector and pipeline do not enforce report body and filename to contain same `report_id`! -- `bucket_date` — stamped by pipeline while processing daily bucket -- `${report_filename}` — generated by collector on `/report/{report_id}/close` as `{start_time or test_start_time|iso8601}-{test_name}-{report_id}-AS{asn}-{probe_cc}-….{ext}`. That enforces `report_id` being stored as part of `${report_filename}`. Both `start_time` and `test_start_time` come from client as part of opening `/report` -- `report_filename` — generated by pipeline as `{bucket_date}/${report_filename}` preserving extension of original report file -- `test_start_time` — [pipeline-messed](https://github.com/TheTorProject/ooni-pipeline/blob/4ddc40b6ab5eafc5759eeaf4f23dcf05bdbf6e65/af/shovel/daily_workflow.py#L475-L493) mix of `test_start_time` and `start_time` from raw file -- `measurement_start_time` — pipeline-messed mix of `measurement_start_time`, `test_start_time` and `start_time` from raw file -- `test_started` — float(time_t) present in 19519 measurements as seen by `select count(*) from measurement join residual using (residual_no) where residual->'test_keys' ? 
'test_started'` -- `input` — stamped by client, was part of OONI Explorer URL scheme together with `report_id` - -All these identifiers are not nice due to following reasons: - -- most of them are generated by client, `report_id` may be server-generated, but it's not enforced by the pipeline -- ones having timestamp use client-side timestamp that is not protected from _"time travellers"_ -- most of them have low entropy-per-byte ratio -- `test_start_time` and `measurement_start_time` are overriden in pipeline and are different in _canned_ and _autoclaved_ data, `test_started` is not even respected (e.g. it's not promoted to `measurement_start_time` in [this file](https://api.ooni.io/files/download/2012-12-30/20121230T142923Z-RU-AS57668-http_requests-no_report_id-0.1.0-probe.yaml) that has `measurement_start_time` and `test_start_time` equal for all measurements) - -It's nice to have int64 identifier having 32 bits allocated for `time_t` as an unique identifier of every measurement collected. It has to respect following constraints and corner-cases: - -- max number of measurements per report file is 1000003 (20 bits) for 2014-11-22/20141122T040940Z-US-AS1968-tcp_connect-no_report_id-0.1.0-probe.yaml (top5 is 1000003, 65007, 41889, 40875, 30949) -- max number of measurements per report file in 2018-01-01 … 2018-06-15 time window is ~5000 -- client may have wrong wall-clock date, e.g. [2018-06-02 13:05:08](https://api.ooni.io/files/download/2018-06-03/20180802T130309Z-LY-AS37284-ndt-20180602T130508Z_AS37284_Vl7cO6V33OkYBoJgQL403dM2L4arYk7WEAeiPizIW6au6aVfV5-0.2.0-probe.json) reports `test_start_time` fro2018-08-02 13:03:09 from future and [2017-11-13 15:13:05](https://api.ooni.io/files/download/2017-11-14/20031106T094115Z-IQ-AS50710-ndt-20171113T151305Z_AS50710_beuliHbl2zzV3F05or7NIt4ynhZFUCCOjKf1okz1zTov3lvLJU-0.2.0-probe.json) reports from past 2003-11-06 09:41:15. 
-- client may write wrong timezone in `test_start_time` and `measurement_start_time`, see [2018-06-06 01:56:13](https://api.ooni.io/files/download/2018-06-07/20180606T015613Z-SG-AS7472-http_requests-g14IPb8Zf91k1IUMhj9GfMDoXcOXnjywlCuaxEv0WdHHgRbA6ORUEiezcooJUddc-0.1.0-probe.yaml) having `test_start_time` 2018-06-06 08:56:05 and `measurement_start_time` 2018-06-06 08:56:13 for the first measurement. Correctness of filename-based guess relies on on `Date` HTTP headers in server responses saying 06 Jun 2018 01:56:14 GMT -- report_id may have no timestamp. E.g. [bMr4ueruR8QGX1fjw3dBIOHekiQ2Yxv3prXOO8zw4Qu0NEb0XYU9Em9LyyVaBK3b](https://api.ooni.io/files/download/2018-06-19/20180618T043905Z-RU-AS58191-http_header_field_manipulation-bMr4ueruR8QGX1fjw3dBIOHekiQ2Yxv3prXOO8zw4Qu0NEb0XYU9Em9LyyVaBK3b-0.1.0-probe.yaml) has following _raw_ fields: `start_time`: 1529285941.0 (01:39:01 UTC) , `test_start_time`: 2018-06-18 04:39:05, and `measurement/test_start_time`: 1529285945.0 (01:39:05 UTC). Processed _autoclaved_ file has `test_start_time`: 2018-06-18 01:39:01 and `measurement_start_time`: 2018-06-18 01:39:05. So It suggests that 01:39 is right UTC timestamp. But it's wrong assumption, there is a report coming from _alike_ probe (same version, same AS, close time), [it shows that](https://api.ooni.io/files/download/2018-06-19/20180618T043906Z-RU-AS58191-http_requests-WqMmGyBUMXRxSDGde6Wb8lZMR5vlls0rInMeAo3ro4K6nlSjVwrrBa90fIpNmLzQ-0.1.0-probe.yaml) that also has `test_start_time`: 01:39:01, `measurement_start_time`: 01:46:40, but HTTP `Date` headers explicitly say `Mon, 18 Jun 2018 04:46:35 GMT`. So it means that client-generated-server-serialised timestamp still preserves date and time better than fields within message body. **NB**: UNIX timestamp in raw file is incorrect in this example, it's adjusted for timezone offset! -- generated report_id may still have correct timestamp as well for some yaml files, e.g. 
[2012-12-30 14:29:23](https://api.ooni.io/files/download/2012-12-30/20121230T142923Z-RU-AS57668-http_requests-no_report_id-0.1.0-probe.yaml) -- report_id may be missing, e.g. [2016-07-31 10:39:22](https://api.ooni.io/files/download/2016-08-01/20160731T103922Z-US-AS14618-dns_injection-no_report_id-0.2.0-probe.json) has generated `report_id` EGTL4PNEIF5K3yNuLA55_gb935U as base64(sha1(raw_file)) in database, but it's not stamped in filename or autoclaved file -- report_id may be duplicate and indicate several retries to submit report, e.g. `2018-05-06/20180505T000008Z-NL-AS9143-web_connectivity-20180505T000008Z_AS9143_YXblHbyqIlBUxqzkwQ344hJM4O19Nx9q2E90RUv4W6yFTi4QyS-0.2.0-probe.json` and `2018-05-10/20180505T000008Z-NL-AS9143-web_connectivity-20180505T000008Z_AS9143_YXblHbyqIlBUxqzkwQ344hJM4O19Nx9q2E90RUv4W6yFTi4QyS-0.2.0-probe.json` -- report_id may be duplicate across _different(!)_ report files, e.g. `2016-02-11/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml` and `2016-02-23/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml` -- `bucket_date` can be more than a year away from HTTP `Date` timestamps while `report_id` timestamp is reasonable, e.g. [2015-02-01](https://api.ooni.io/files/download/2016-02-27/20150201T095956Z-BE-AS12392-http_requests-no_report_id-0.1.0-probe.yaml), that's likely upload of ancient measurements -- the dataset has 3.6M reports. 
80% have unique timestamps, but 753k reports have 316k coincident timestamps, having at most ~20 reports per timestamp -- the dataset has 147M measurements, at least 28 bits are needed to numerate them - -Moreover we would like the OONI UUID to have the following properties: - -- It's a 64 bit integer (8 bytes) so that it can fits into postgres `bigint` fixed-width native type unlike larger fields -- A sort of "namespace" separation to distinguish pipeline-backfilled OOIDs from collector-stamped OOIDs during rollout of stamping collector -- There is some loose ordering over the ID so that measurements that are close in time have an ID that is of similar cardinality (ex. all measurements from 2018 should have an ID that is `<<` measurements from 2019). - -And that's probably incomplete list of corner cases. - -## Backfilled OOID - -So, given that: - -* Timestamps in the measurement body (i.e. `test_start_time`, `measurement_start_time`, etc.) are unreliable -* We have bucket date and filename as a part of "golden" dataset -* We have quite precise timestamp as part of dataset - -The proposed schema for _backfilled_ `OOID` (OONI UUID) is the following: - -- 32 bits representing time_t stored as `report_id` part of textname, fallback to time_t stored as prefix of the textname basename that usually represents server-side interpretation of client-side `test_start_time` (i.e. we try to get from the data we have available the time that is closest to when the measurement was submitted to the collector, approximating, in the worst case, to the time in which the measurement was added to a bucket). 
-- 4 bits set to `1` forming nibble `f` (which is a reserved magic value to indicate the `ooid` was backfilled) -- 28 bits of counter indicating measurement index within report file initialised with 28 least significant bits of `sha1(b'2014-11-18/2014…-probe.yaml')` - -Note: we tested with historical data up to 2018-06-20, that using the 28 least significant bits of the output of `sha1` does not lead to a collision, but we may change this offset if we notice a collision while we roll-out this feature. See the below sections for more details on this. - -Amount of static bits may be reduced to single `1` bit, but 28 bits of entropy are enough to avoid collisions and single `f` nibble looks nice in logs. - -### `textname` to `time_t` test vectors - -Here is the Python code implementing suggested textname to time_t transformation: - -```python -import re, calendar -rex = re.compile( - r'^(?P20[0-9]{2})-(?P[01][0-9])-(?P[0123][0-9])/' - r'(?P20[0-9]{2})(?P[01][0-9])(?P[0123][0-9])T(?P
[012][0-9])(?P[0-5][0-9])(?P[0-5][0-9])Z-' - r'[A-Z][A-Z]-' # can be replaced with list of ISO country codes - r'AS(?P[0-9]{1,10})-' # ASN is 32bit at most - r'[^-]+-' # test name - r'(?Pno_report_id' - r'|(?P20[0-9]{2})(?P[01][0-9])(?P[0123][0-9])T(?P[012][0-9])(?P[0-5][0-9])(?P[0-5][0-9])Z_AS(?P=asn)_[0-9A-Za-z]{50}' - r'|[A-Za-z0-9]{64}' - r')-' - r'0\.[12]\.0-probe\.(?:yaml|json)$') # trailer -def ts(textname): - m = rex.match(textname) - if m.group('ridyear') is not None: - keys = ('ridyear', 'ridmon', 'ridday', 'ridhr', 'ridmin', 'ridsec') - else: - keys = ('year', 'mon', 'day', 'hr', 'min', 'sec') - return calendar.timegm(tuple(int(m.group(_)) for _ in keys)) -``` - -Test vectors: - -``` ->>> ts('2016-02-11/20160210T163242Z-IR-AS201227-http_requests-yZthLDkKNe6IdePf7B1gMgNvRxSMDwNGWD6BB1MWcuY2T3q7oLmDQkjhZARARuic-0.1.0-probe.yaml') -1455121962 # 2016-02-10 16:32:42 UTC, bucket date is ignored ->>> ts('2017-11-14/20031106T094115Z-IQ-AS50710-ndt-20171113T151305Z_AS50710_beuliHbl2zzV3F05or7NIt4ynhZFUCCOjKf1okz1zTov3lvLJU-0.2.0-probe.json') -1510585985 # 2017-11-13 15:13:05 UTC, time from `report_id` is used -``` - -### Hash for 32:4:28 scheme - -Table describes sha1.hexdigest() offsets producing collision-free OOIDs for all the collected reports up to 2018-06-20 bucket: - -counter bits | 7-digit nibble-aligned offset within sha1 --------------|--------------------------------------------- -28 | 4 5 6 7 9 11 12 14 16 17 18 19 20 23 26 28 30 31 32 33 -27 | 4 5 6 9 11 12 14 16 19 20 23 28 30 31 32 33 -26 | 4 5 12 14 19 20 23 30 31 32 33 -25 | 19 33 -24 | not enough entropy within any offset of sha1(textname).hexdigest() - -The smallest known timestamp in the current dataset is 0x50bef44d (2012-12-05 07:14:21 UTC), so OOID with first nibble [0-4] may have different binary meaning. 
-The largest one is 0x5b29a005 (2018-06-20 00:29:57), but that's subject to change :-) - -Here is the Python code implementing suggested OOID: - -```python -import hashlib -def ooid3(ts, textname, ndx): - assert textname.startswith(b'20') and textname.endswith((b'.yaml', b'.json')) and ndx >= 0 - ts = ts * 2**32 - colid = 0xf0000000 - cnt = (int(hashlib.sha1(textname).hexdigest()[-7:], 16) + ndx) & 0x0fffffff - assert (ts & colid) == (ts & cnt) == (colid & cnt) == 0 - return hex(ts | colid | cnt)[2:] -def ooid(textname, ndx): - return ooid3(ts(textname), textname.encode('ascii'), ndx) -``` - -Test vectors: - -``` ->>> ooid('2012-12-05/20121205T071421Z-MM-AS18399-http_invalid_request_line-no_report_id-0.1.0-probe.yaml', 0) -'50bef44df29c69e2' ->>> ooid('2012-12-05/20121205T071421Z-MM-AS18399-http_invalid_request_line-no_report_id-0.1.0-probe.yaml', 1) -'50bef44df29c69e3' ->>> ooid('2018-06-20/20180620T002915Z-DE-AS28753-http_header_field_manipulation-20180620T002917Z_AS28753_ZryhjoYMtU6jEx9TOjDCRuBo5z5te2fLWWj7gkvmkMkbLlnFTi-0.2.0-probe.json', 0) -'5b299fddf5c34544' -``` - -### Using less bits for counter - -If the OOID prefix is the same for all the measurements in the report file, then the minimal possible bit length for the counter is 20 bits. There is a report having 1000003 measurements that needs at least 20 bits. - -It's not trivial to have numbering schema that depends only on report file and does not provide collisions across different reports. The outline of collision probability for $time:$static:$counter schema over the whole dataset is estimated below. - -It's practical to brute-force a sha1-hmac or siphash key to make counter 24 bit, so it'll be aligned at byte boundary. -The probability of collision of single hash function truncated to 24 bits among those 316k coincident timestamps is ~3e-4, so it's like brute-forcing ~11 bits. - -It's not practical to make counter 20 bit with _single_ hash function as it's -equivalent to brute-forcing 186-bit key. 
But it's practical to have ~128...256 -_independent_ keys for hash function to have collision-free 20 bit counter for backfilling. - -Estimates of those probabilities are available in [jupyter notebook](./ooid-hash-prob.ipynb). - -## Collector-stamped OOID - -Collector should use following schema to stamp OOID on every measurement within report file while closing report: - -- 32 bits to represent time when the measurement is _received_ according to collector's wall clock -- 8 bits in {0...0xEF} range representing Collector-ID -- 24 bits of counter - -int24 counter is enough to stamp 16M measurements per second. Smallest possible -measurement with probe_asn, probe_cc, test_runtime and alike fields is at least -286 bytes. That makes at least ~38Gbit/s stream. That's way higher than -throughput of LZ4 decompressor we've seen (11Gbit/s) and/or available wire -speeds (10Gbit/s). - -`Collector-ID` is a value stored in configuration file representing unique -collector OS process running somewhere. That limits overall number of -concurrently running collectors with ~240 instances. That's enough for now and -foreseeable future. - -Report may be submitted through different collector instances. Collector-ID and -timestamp of the collector receiving the message should be used. - -Collector should ensure both during run-time and start-time, that... -- wall clock for specific Collector-ID does not tick backwards -- counter does not overflow within specific second - -## OOID representation - -Canonical representation of OOID should be hex string. -It's nice to be able to strip first 8 characters and feed them into `date -d -@$(( 0x5b2ce5f4 ))`. -Int64 may be transparently converted to float64 by javascript / some json -libraries leading to annoying errors. 
diff --git a/docs/pipeline-16.10.dot b/docs/pipeline-16.10.dot deleted file mode 100644 index 9d900050..00000000 --- a/docs/pipeline-16.10.dot +++ /dev/null @@ -1,53 +0,0 @@ -digraph { - // that's DATA dependency, not DAG - subgraph { - node [shape = folder]; - report_incomplete; - subgraph { - node [fillcolor="#ffffe0" style=filled]; - report_raw [label = "RAW report\n1 Tb"]; - backup_raw [label = "RAW backup\n1 Tb"]; - measurement_sanitized [label = "Sanitised report\n1 Tb"]; // newline-separated JSON BLOB - } - measurement_index; // measurement_id, report, target, type, Data-URL, pointer{encoding, offset, size} - measurement_features [label = "Features for\naggregation"]; // extracted from data for aggregation - measurement_colours [label = "Known anomalies:\ngreen, yellow, orange, red"]; // measurement_id, anomaly_type, reason{json} - blockpages_candidates; - blockpages_known; - geoip; - asn; - } - - "Closed by probe" [shape = cds]; - report_incomplete -> "Closed by probe" -> report_raw; - report_incomplete -> "Timeout" -> report_raw; - - report_raw -> "Backup to S3" -> backup_raw; - - Normalise [label = "Normalise,\ndrop IP addr"]; - { asn geoip } -> Normalise; - report_raw -> Normalise; - Normalise -> measurement_sanitized; - - Index [label = "Build index\n(TODO: during normalisation)"]; - measurement_sanitized -> Index -> measurement_index; - - { measurement_features measurement_sanitized } -> "Possible blockpages:\nlook-alike non-mirrors" -> blockpages_candidates; - - "Divine intervention" [shape = rarrow]; - blockpages_candidates -> "Divine intervention" -> blockpages_known; - - "WEB extractor" [label = "WEB extractor\nper-measurement"]; - blockpages_known -> "WEB extractor"; - measurement_index -> "WEB extractor"; - measurement_sanitized -> "WEB extractor" -> measurement_colours; - "WEB extractor" -> measurement_features; - - "Tor reader" [label = "Tor logs reader\nper-measurement"]; - measurement_index -> "Tor reader"; - measurement_sanitized -> "Tor 
reader" -> measurement_colours; - "Tor reader" -> measurement_features; - - measurement_index -> "Latency anomaly\nagainst history"; - measurement_features -> "Latency anomaly\nagainst history" -> measurement_colours; -} diff --git a/docs/pipeline-16.10.png b/docs/pipeline-16.10.png deleted file mode 100644 index 6feec246..00000000 Binary files a/docs/pipeline-16.10.png and /dev/null differ diff --git a/docs/readme-2.0.0.md b/docs/readme-2.0.0.md deleted file mode 100644 index 11447560..00000000 --- a/docs/readme-2.0.0.md +++ /dev/null @@ -1,217 +0,0 @@ -# Open Observatory Pipeline - -This is the Open Observatory data processing pipeline. Actually, two of them. - -The legacy one is based on the [luigi workflow engine](https://github.com/spotify/luigi) -and described below. - -The modern one is based on the [Apache Airflow](https://airflow.incubator.apache.org/) and it's described in [docs](docs/pipeline-16.10.md). - -## Setup - -Edit the `client.cfg` based on `client.cfg.example`. See the -[configuration](#configuration) section for more information on how to -configure the data processing pipeline. - -Install also all the python requirements in `requirements.txt`. - -Install or build -[PyYAML](http://pyyaml.org/wiki/PyYAML#DownloadandInstallation) with C -bindings. - -*Note*: You should not use more than 1 worker (luigi `--workers` option) per core -otherwise you could be wasting a bunch of CPU time in context switching. - -## How to run the pipeline tasks - -Ensure that the `pipeline` module is within your `sys.path`. This can be done -by exporting the `PYTHONPATH` environment variable to the directory where -ooni-pipeline is copied to. - - -### Daily workflow - -This `daily_workflow` is the main workflow that consists of the following steps: - -* Performs normalisation of the reports to adhere to the 0.2.0 data format and - converts them to JSON (`NormaliseReport`). 
- -* Performs sanitisation of the reports that contain private bridge addresses - (`SanitiseReport`). - -* Inserts the measurements inside of the postgresql `metrics-table` - (`InsertMeasurementsIntoPostgres`). - -The dependency graph is built from a master task called `ListReportsAndRun` -that takes as arguments: - -* `date_interval` the range of dates that should be operated on - -* `task` the name of the task that should be run (if you choose to run - `SanitiseReport` only `NormaliseReport` will be run, while - `InsertMeasurementsIntoPostgres` will run `NormaliseReport` that in turn runs - also `SanitiseReport`). By default task is set to - `InsertMeasurementsIntoPostgres`. - -* `test_names` a space separated list of test names that the task should - operate on. - -It is possible to specify an optional boolean parameter with the -`--update-views true` command line argument to indicate that the materialised -views should also be updated. To learn how to generate the materialised views -see below. - -Here is an example of how to run the daily workflow: - -``` -luigi --module pipeline.batch.daily_workflow ListReportsAndRun --task NormaliseReport --test-names 'http_requests dns_consistency' --date-interval 2016-01-01-2016-02-01 --workers 10 -``` - -To generate the materialised views the `sql_tasks` module shall be used. In here there are two main tasks: - -* `CreateMaterialisedViews` is used to create the materialised views used to - count the number of blockpages detected and the number of identified vendors. - -* `CreateIndexes` is used to create database indexes on certain keys. - -To create indexes run: - -``` -luigi --module pipeline.batch.sql_tasks CreateIndexes -``` - -To create the materialised views run: - -``` -luigi --module pipeline.batch.sql_tasks CreateMaterialisedViews -``` - -## Domain intelligence - -These tasks are used to update the tables related to the categories of domains being tested.
Currently we support extracting the categories for the urls in the citizenlab repository. - -Moreover there are also tasks in here related to updating information pertaining to ASNs. - -To update the citizen-lab categories of URLs you shall run: - -``` -luigi --module pipeline.batch.domain_intelligence InsertCitizenLabURLS -``` - -To update the ASN information run: - -``` -luigi --module pipeline.batch.domain_intelligence UpdateASNPostgres -``` - -## Configuration - -Before running the pipeline you should configure it by editing the -`client.cfg` file. An example configuration file is provided inside of -`client.cfg.example`. - -The sections you should probably be editing are the following: - -### core - -* **tmp_dir** What directory should be used to store temporary files. - -* **ssh_private_key_file** What ssh private key shall be used by luigi for sshing into ssh:// machines. - -* **ooni_pipeline_path** The location on the ec2 instance where to look for the ooni-pipeline repository. - -### aws - -* **access_key_id** This is your AWS access key ID for spinning up EC2 instances. - -* **secret_access_key** This is your AWS secret token. - -### postgres - -* **host** The hostname of your postgres instance. - -* **database** The database name. - -* **username** The username to use when logging in. - -* **password** The password to use when logging in. - -* **table** The database table to use for writing measurements to. - -### ooni - -* **bridge-db-path** A path to where you have a bridge_db.json file that - contains mappings between bridge IPs, their hashes and the ring they were - taken for (this is required for the sanitisation of bridge_reachability - reports). - -* `raw-reports-dir` is the directory where the raw reports are stored. They - should be placed inside of directories that contain the date of when they were - gathered.
- An example of the layout of this directory is: - - ``` - yaml - ├── 2016-01-31 - │   ├── 20121209T051845Z-MM-AS9988-dns_consistency-no_report_id-0.1.0-probe.yaml - │   ├── 20121209T052108Z-MM-AS9988-dns_consistency-no_report_id-0.1.0-probe.yaml - │   ├── 20121209T052248Z-MM-AS9988-dns_consistency-no_report_id-0.1.0-probe.yaml - │   ├── 20121209T055811Z-MM-AS9988-dns_consistency-no_report_id-0.1.0-probe.yaml - │   ├── 20121209T055945Z-MM-AS9988-dns_consistency-no_report_id-0.1.0-probe.yaml - │   └── 20121209T060215Z-MM-AS9988-dns_consistency-no_report_id-0.1.0-probe.yaml - ... - ├── 2012-12-23 - │   ├── 20121222T221931Z-RU-AS57668-tcp_connect-no_report_id-0.1.0-probe.yaml - │   ├── 20121223T155557Z-RU-AS57668-tcp_connect-no_report_id-0.1.0-probe.yaml - │   └── 20121223T160913Z-RU-AS57668-tcp_connect-no_report_id-0.1.0-probe.yaml - ``` - - The structure of the filenames should be: - `{timestamp}-{probe_cc}-{probe_asn}-{test_name}-{report_id}-{data_format_version}-probe.yaml`. - -* `public-dir` is the directory where the sanitised reports will end up in - nested inside of the sanitised directory. - -* `private-dir` is the directory where the normalised and JSON converted report - files will end up in. 
- -## (Re)build ooni-pipeline - -*Recommended*: Create a virtual Python instance: - -`virtualenv venv` - -### Start luigi server ([script](scripts/start-luigid.sh)) - -``` -luigid --address 127.0.0.1 \ - --port 8082 --pidfile luigid.pid \ - --logdir luigid.log \ - --state-path luigi-state.pickle \ - --background -``` - -### Run the pipeline tasks - -- Run the main (`daily_workflow`) pipeline batch tasks since 2012 in weekly -batches: - -``` -for year in `seq -w 12 $(date +%g)`; do - for week in `seq -w 1 52`; do - PYTHONPATH=${HOME}/ooni-pipeline/ luigi \ - --module pipeline.batch.daily_workflow ListReportsAndRun \ - --workers 16 \ - --ignore-asn 'AS2856 AS20712 AS5607' \ - --parallel-scheduling \ - --date-interval 20${year}-W${week} - echo "[*] Finished processing 20${year}-W${week}" - done -done -``` - -- Create the database indexes. - -- Create the materialised views. - -- Run the [Domain intelligence](#domain-intelligence) batch tasks. diff --git a/docs/reprocess-report.md b/docs/reprocess-report.md deleted file mode 100644 index 02cf1676..00000000 --- a/docs/reprocess-report.md +++ /dev/null @@ -1,134 +0,0 @@ -This document describes how to reprocess historical data ingesting new features while minimizing resources waste and negative effect on the on-going data processing. - -This document is valid as of May 2019. - -Reading [overall pipeline design document](./pipeline-16.10.md) is useful to understand the following text. - -## Preface - -There are a few problems that make reingestion and reprocessing a non-instant and non-trivial process: - -- reprocessing **all** the data is slow: fresh data is ingested at ≈1.0 MByte/s. Throughput is measured per CPU core processing autoclaved files. So at least ≈46 CPU-days are needed to ingest 3.8 TB dataset + PostgreSQL [may double](https://github.com/ooni/pipeline/issues/140) that estimate. -- rewriting **all** feature tables on reprocessing produces unnecessary _PostgreSQL table bloat_. 
Features are deleted from feature tables on reprocessing and re-inserted back instead of the minimal possible update to avoid mistakes caused by incremental computation. Full-bucket _update_ is equivalent to _delete + insert_ as that's the way for PostgreSQL to implement MVCC. -- airflow 1.8 scheduler fails to schedule tasks properly when 2'300 DAGs are started at once to reprocess all the buckets. It starts hogging CPU, that negatively affects both reprocessing speed and ingestion of new data. - -There are a few hacks that make reingestion and reprocessing more "instant" in various cases: - -- minimal reprocessing "unit" is an autoclaved file that is 20 MB on average instead of 5.5 GB bucket. -- `code_ver` allows to reprocess files updating just a subset of feature-tables according to `min_compat_code_ver` instead of updating all of them. -- `body_sha256`, `body_simhash` and `body_text_simhash` allow to select a subset of autoclaved files for reprocessing when new blockpage fingerprint is discovered. -- GNU Make can be used to [run airflow tasks](https://github.com/ooni/sysadmin/blob/8224b4627dd2e16529b98f9907f0fbd280814035/scripts/pipeline-reprocess) with pre-defined concurrency level to limit pressure on Airflow's scheduler. -- `SimhashCache` fetches subset of `sha256(body)` to `simhash(text(body)), simhash(body)` mapping from the MetaDB before reingestion, that speeds reingestion up from 1.0 MB/s to 4.3 MB/s -- one-pass ingestion of streamed json input into _separate_ tables is not trivial. It's achieved maintaining [write buffer](https://github.com/ooni/pipeline/blob/1b2688d75a7abc09e446a7d965dd8011f5b5564d/af/shovel/oonipl/pg.py) for each table and flushing the buffer with `COPY` when few megabytes of data are accumulated. - -Currently following fingerprints to _"confirm"_ cases of network interference are implemented: HTTP Body substring, HTTP Header prefix, HTTP Header value. 
NB: HTTP Bodies are _not_ stored in the MetaDB, so those are not feature-based fingerprints. - -## Case: new HTML blockpage fingerprint - -Identify new blockpage, e.g. one from [homeline.kg ISP](https://explorer.ooni.torproject.org/measurement/20180126T000430Z_AS8449_pk15Mr2LgOhNOk9NfI2EarhUAM64DZ3R85nh4Z3q2m56hflUGh?input=http:%2F%2Farchive.org) coming from [#122](https://github.com/ooni/pipeline/issues/122). - -Identify corresponding measurement and `msm_no`, e.g. with `select * from report join measurement using (report_no) join input using (input_no) where report_id = '20180126T000430Z_AS8449_pk15Mr2LgOhNOk9NfI2EarhUAM64DZ3R85nh4Z3q2m56hflUGh' and input = 'http://archive.org'`. - -Identify, if possible, if the blockpage is a _static_ or a _dynamic_ one. Static page usually does not include URL of blocked page in HTML body while dynamic does. For a static blockpage `body_sha256` can be reliably used to identify all the measurements referencing it. For a dynamic blockpage low hamming distance between the blockpage and `body_simhash` (or `body_text_simhash`) of the measurement can be used to reliably identify most of the candidates containing the blockpage. E.g. [Cloudflare blockpage cluster](https://gist.github.com/darkk/e2b2762c4fe053a3cf8a299520f0490e) (see `In[18]`) has diameter of 15 for 64-bit `body_simhash`. ISP blockpages are often static as it's significantly cheaper to serve them from computational perspective. CDN server-side blockpages are often dynamic as they include some small bits of tracking those are useful for customer support. - -Sidenote: having a blockpage at hand is an opportunity to mine blocked URLs showing same blockpage and mine more blockpages, as different ISPs may show different blockpage for the same blocked URL. - -Then _human intelligence task_ should be solved to extract a fingerprint for the blockpage. 
The fingerprint should be added to the set of fingerprints and `openobservatory/pipeline-shovel` should be rolled out before reprocessing of historical data. - -If the blockpage is a static one, there is a fast-path alternative to reprocessing: it's possible to update MetaDB directly without actual reprocessing as SHA256 collision is very unlikely and `body_sha256` may be used as a feature _identifying_ the blockpage server (at the current stage of OONI Methodology development). See feature-based fingerprint case for more on the fast-path. Keep in mind that the HTTP Body substring fingerprint is still _derived_ from the body, so avoiding full-dataset reprocessing may lead to false negatives. - -Overall steps needed to mark existing & future measurements are: - -- pause ongoing ingestion and ensure that there are no `meta_pg` TaskInstances running -- update `fingerprint` table in [the database schema](https://github.com/ooni/pipeline/blob/065cccdfeb531e93a22d2aacc05ec05e990f99ee/af/oometa/) following [an example](https://github.com/ooni/pipeline/blob/065cccdfeb531e93a22d2aacc05ec05e990f99ee/af/oometa/003-fingerprints.install.sql#L31-L62) and [roll it out](https://github.com/ooni/sysadmin/blob/4defab8e92a2e53e2679a17214162ed058089e7f/ansible/deploy-pipeline-ddl.yml). The fingerprints are stored in the schema to generate `fingerprint_no serial`. -- create a temporary table having `msm_no` of the measurements matching the fingerprint _with confidence_ according to the _derived_ features existing in database (e.g. 
`select msm_no from http_request where body_sha256 = '\x833b2fb8887eed1c0d496670148efa8b6a6e65b89f8df42dbd716464e3cf47a6'` for static blockpages) -- insert those `msm_no` together with matching `fingerprint_no` into `http_request_fp` table as if those were actually ingested by `centrifugation.py` -- update `anomaly` and `confirmed` flags in `measurement` table for the affected measurement according to the logic codified in `calc_measurement_flags()` -- update `centrifugation.py`: 1) set up-to-date `fingerprint` table checksum in `HttpRequestFPFeeder.__init__()`, 2) bump global `CODE_VER` and `HttpRequestFPFeeder.min_compat_code_ver` (and only it, to avoid rewriting other tables) -- roll out `openobservatory/pipeline-shovel` and unpause data ingestion -- reprocess all the previous buckets under GNU Make control - -It's possible to try to use `body_simhash` to reprocess _likely-affected_ buckets first to reduce time-to-publication latency, but that's out of the scope of the document. - -## Case: new feature-based fingerprint - -The goal of special handling of feature-based case is that the case does not depend on voluminous HTTP bodies. So the flags for the dataset can be updated within couple of hours given quite modest computing resources (4 vCPU, 16 GiB RAM, HDD) compared to ≈46 CPU-days needed ingest whole dataset from scratch. - -Examples are `Location` redirects and DNS-based redirects to blockpage servers. - -E.g. aforementioned [homeline.kg ISP](https://explorer.ooni.torproject.org/measurement/20180126T000430Z_AS8449_pk15Mr2LgOhNOk9NfI2EarhUAM64DZ3R85nh4Z3q2m56hflUGh?input=http:%2F%2Farchive.org) actually serves redirect for a blocked http URI with no `Date` and no `Server` headers that clearly looks like injected HTTP redirect. - -This case is almost the same one as the case of a static blockpage: the MetaDB has all the data to follow fast-path updating measurement metadata (`http_request_fp` table, `confirmed` and `anomaly` flags, etc.) 
with direct DB queries. The downside of fast-path is that it'll lead to duplication of logic between the queries and `centrifugation.py` that may (by mistake) lead to inconsistencies if the logic is not perfectly equivalent. - -Overall steps needed to mark existing & future measurements are the same as for HTML blockpage fingerprint with small alterations: - -- _(same)_ pause ongoing ingestion and ensure that there are no `meta_pg` TaskInstances running -- _(same)_ update `fingerprint` table in [the database schema](https://github.com/ooni/pipeline/blob/065cccdfeb531e93a22d2aacc05ec05e990f99ee/af/oometa/) following [an example](https://github.com/ooni/pipeline/blob/065cccdfeb531e93a22d2aacc05ec05e990f99ee/af/oometa/003-fingerprints.install.sql#L31-L62) and [roll it out](https://github.com/ooni/sysadmin/blob/4defab8e92a2e53e2679a17214162ed058089e7f/ansible/deploy-pipeline-ddl.yml). The fingerprints are stored in the schema to generate `fingerprint_no serial`. -- create a temporary table having `msm_no` of the measurements matching the fingerprint _perfectly_ according to features existing in database (e.g. `select msm_no from http_request where headers->>'Location' = 'http://homeline.kg/access/blockpage.html'`, keep in mind that keys of headers are case-sensitive) -- _(same)_ insert those `msm_no` together with matching `fingerprint_no` into `http_request_fp` table as if those were actually ingested by `centrifugation.py` -- _(same)_ update `anomaly` and `confirmed` flags in `measurement` table for the affected measurement according to the logic codified in `calc_measurement_flags()` -- update `centrifugation.py`: set up-to-date `fingerprint` table checksum in `HttpRequestFPFeeder.__init__()`. There is no need to bump `CODE_VER` for feature-based fingerprints as we are 100% confident that reprocessing is not needed and ongoing data processing is paused. 
-- _(same)_ roll out `openobservatory/pipeline-shovel` and unpause data ingestion -- there is no need for reprocessing as there is no possibility for false negative here - -_A temporary table_ is not necessarily a result of `CREATE TEMPORARY TABLE`, it may also be a query executed on a read-only replica with faster disk drives with the output of the query directed to a local file that becomes `UNLOGGED` table on a master via out-of-band data transfer or via _Foreign Data Wrapper_. - -Unfortunately, it's not trivial to give a concrete example of the queries as these examples have to be kept in-sync with the rest of the code and, what's more important, different cardinality of the tables may need different strategies for UPDATE. E.g. [CREATE TABLE + rename](https://github.com/ooni/pipeline/pull/144#issuecomment-483365330) strategy may be an order of magnitude more performant than `UPDATE` when the UPDATE touches _many_ rows (it was touching ≈5% of rows in the case). - -## Case: new feature table - -The rule of thumb is: if you are not going to use the extracted features for search or aggregation, you should rather consider leaving JSON as-is without bloating the MetaDB. Maintained table should be an asset, not just a liability of maintenance for the sake of maintenance. - -One may want to use [commit adding `vanilla_tor` stats](https://github.com/ooni/pipeline/commit/902e6751340dd515096214f74c739751c9ddca55) for inspiration, but the code evolved a bit since then. - -- Add new feature table. Avoid foreign keys, those are very slow to verify during batch ingestion (as of PostgreSQL 9.6).
-- Bump `CODE_VER`, set `min_compat_code_ver` for the new feeder -- `TheFeeder.row()` creates a string that is suitable for sending to the table via `COPY` -- `TheFeeder.pop()` removes fields from the JSON object those are completely ingested by the feeder and should NOT be considered a part of the _residual_ -- test, deploy, reprocess all (or the affected) buckets under GNU Make control - -One may save significant amount of CPU time marking old _autoclaved_ files as already processed by the new version of code bumping their corresponding `code_ver` in the database. It may be useful in a case when a feature has to be extracted **only(!)** from a known subset of reports, so the reports that have no data on the specific feature may be skipped safely. Example is extracting a feature of a "low-volume" test. E.g. `web_connectivity` test takes 99.4% of data volume of 2019Q1, so _any_ other test is a low-volume one. Another example is a extracting a feature that was shipped as a part of some specific `software` version, so `autoclaved` having no records coming from the new software may be manually labeled with a newer `code_ver` and skipped safely. - -For example if you were to need to reprocess only measurements for `"test_name": "telegram"`, you could run the following query on the db: - -``` -UPDATE autoclaved SET code_ver = 6 WHERE code_ver = 5 AND autoclaved_no IN ( - SELECT autoclaved_no FROM autoclaved WHERE autoclaved_no NOT IN ( - SELECT DISTINCT autoclaved_no FROM report WHERE test_name = 'telegram')); -``` - -Assuming the current code_ver is 5 and the next code_ver is going to be 6, as per https://github.com/ooni/pipeline/pull/177. - -## Case: adding new feature to existing table - -Let's use [commit adding `body_simhash` extraction](https://github.com/ooni/pipeline/commit/8e14b20ec368572c0bb831fb958bcc70eb9108a6) as an example. Things to do are the following: - -- Add new feature as a nullable column. 
Adding a `NOT NULL` column will trigger an early table rewrite that is a waste of CPU and Disk IO bandwidth. -- Bump `CODE_VER` -- Bump `min_compat_code_ver` to the new value of `CODE_VER` for affected "feeders" (`HttpRequestFeeder` and `HttpControlFeeder` in this case) -- Append new feature columns to `columns` -- Write code to extract the needed feature for `TheFeeder.row()`, drop those fields in `TheFeeder.pop()` (if needed), test it and deploy. -- Reprocess all (or the affected) buckets under GNU Make control. -- Alter the feature column to be `NOT NULL` if needed. - -## Marking autoclaved files for reprocessing - -The _autoclaved_ files are selected for reingestion and reprocessing based on their `code_ver`. If `autoclaved.code_ver` matches `centrifugation.py:CODE_VER` then the file is skipped altogether (file is not read and decompressed, json is not parsed). If `autoclaved.code_ver` is _compatible_ with `Feeder.min_compat_code_ver` (greater-equal-than) then the corresponding PostgreSQL table is not re-written during a centrifugation pass. So it can be used to reduce amount of burned CPU and database disk IO. - -_autoclaved_ file may be marked with `code_ver` equal to 0 (`CODE_VER_REPROCESS`) to force reprocessing of all the feature-tables for this file. There should be no reasons for that besides, maybe, clean-up after manual database modifications. - -Reingestion is different from reprocessing as it may handle changes to autoclaved files themselves and update `autoclaved`, `report`, `measurement` tables accordingly. The easiest way to force reingestion manually is to set `autoclaved.file_sha1` to all-zeros or something like `digest('', 'sha1')`. One of the possible reasons for that is [report deletion](./delete-report.md). - -## GNU Make crutch for Airflow - -Airflow has an issue in a scheduler, it starts consuming unreasonable amount of resources if there are thousands of _running_ DAGs.
So, reprocessing of ≈2300 daily buckets of OONI data has to be micro-managed. One of the usual Linux tools to execute parallel processes is GNU Make, so, it was taken for the [`pipeline-reprocess`](https://github.com/ooni/sysadmin/blob/4defab8e92a2e53e2679a17214162ed058089e7f/scripts/pipeline-reprocess) script. - -The way to use the script is the following: - -- download it to your $HOME at `datacollector.infra.ooni.io` running Airflow -- edit `PRJ` with a slug representing a reprocessing session (ex: https://github.com/ooni/sysadmin/commit/bf9c967da7b2e2cc0c5efca0351cdf679d861b2f#diff-0bd5c245bef1335609715572487a3117) -- choose a way to list buckets-to-reprocess with `TYPEOF_DEPS` -- edit `$(PRJ)/...-deps` target in the makefile to reflect the desired logic to select buckets to reprocess -- run `tmux` and `./pipeline-reprocess reprocess` within tmux session - -The script will execute TaskInstances via `airflow run` one-by-one within predefined concurrency limits. diff --git a/docs/simhash-cache-hit.ipynb b/docs/simhash-cache-hit.ipynb deleted file mode 100644 index d983e3ea..00000000 --- a/docs/simhash-cache-hit.ipynb +++ /dev/null @@ -1,1179 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import binascii\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Populating the interactive namespace from numpy and matplotlib\n" - ] - } - ], - "source": [ - "%pylab inline" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "FILES = range(213, 334+1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The procedure to generate the files was following:\n", - "\n", - "```bash\n", - "for d in 2018-08-{01..31} 2018-09-{01..30} 2018-10-{01..31} 2018-11-{01..30}; do\n", - " if [ -s 
\"$d\" ]; then\n", - " echo \"$d\"\n", - " time psql.amsmetadb --no-align --tuples-only -c \"with msm as (select distinct msm_no from autoclaved join report using (autoclaved_no) join measurement using (report_no) where bucket_date = '$d'), lim as (select min(msm_no), max(msm_no) from msm) select body_sha256, body_simhash, body_text_simhash, body_length from http_request where msm_no in (select * from msm) and msm_no >= (select min from lim) and msm_no <= (select max from lim) and body_sha256 is not null union all select body_sha256, body_simhash, body_text_simhash, body_length from http_control where msm_no in (select * from msm) and msm_no >= (select min from lim) and msm_no <= (select max from lim) and body_sha256 is not null\" >\"$d\"\n", - " fi\n", - "done\n", - "\n", - "for f in 2018-*; do mv $f $(date -d \"$f\" +%j); done\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "CACHEENTRY_SIZE = int((256 + 64 + 64) / 8 / 0.268) # 26.8% is an estimate of useful space in the dict structure\n", - "MB = 1024. 
* 1024" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The estimate was done with following `test.py` script:\n", - "\n", - "```python\n", - "import binascii\n", - "import sys\n", - "import resource\n", - "import struct\n", - "import array\n", - "def load_cache():\n", - " cache = {}\n", - " s = struct.Struct('qq')\n", - " with open(sys.argv[1]) as fd:\n", - " for line in fd:\n", - " line = line.strip().split('|')\n", - " key = binascii.a2b_hex(line[0][2:])\n", - " # Py2.7: 0.197, Py3.5: 0.180, Py3.6: 0.187, Py3.7: 0.187\n", - " #value = (int(line[1]), int(line[2]))\n", - " # Py2.7: 0.236, Py3.5: 0.223, Py3.6: 0.231, Py3.7: 0.231\n", - " #value = line[1] + '|' + line[2]\n", - " # Py2.7: 0.268, Py3.5: 0.268, Py3.6: 0.282, Py3.7: 0.281\n", - " value = s.pack(int(line[1]), int(line[2]))\n", - " # Py2.7: 0.288, Py3.5: 0.287, Py3.6: 0.303, Py3.7: 0.301\n", - " #value = int(line[1]) << 64 | int(line[2]) # maybe that's not bit-perfect\n", - " # Py2.7: 0.394, Py3.5: 0.396, Py3.6: 0.424, Py3.7: 0.423\n", - " #value = None # theoretical maxium of density with dict and specific key representation\n", - " cache[key] = value\n", - " return cache\n", - "def main():\n", - " base = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss\n", - " cache = load_cache()\n", - " data = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss\n", - " rss = data - base\n", - " print('{:d} items, {:d} KiB, {:.3f} %util'.format(len(cache), rss,\n", - " len(cache) * ((256 + 64 + 64) / 8) / (1024. * rss)))\n", - "if __name__ == '__main__':\n", - " main()\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# What's the best depth?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def df_row_with_cache(x, cache, dyn):\n", - " total, hit, btotal, bhit = 0, 0, 0, 0\n", - " start_len = len(cache)\n", - " with open('{:d}'.format(x)) as fd:\n", - " for _ in fd:\n", - " sz = int(_.rsplit('|', 1)[1])\n", - " total += 1\n", - " btotal += sz\n", - " key = binascii.unhexlify(_[2:66])\n", - " if key in cache:\n", - " hit += 1\n", - " bhit += sz\n", - " elif dyn:\n", - " cache.add(key)\n", - " return (x, hit, total, bhit, btotal, start_len, len(cache))\n", - " \n", - "def df_from_rows(row):\n", - " d = pd.DataFrame(row, columns=['x', 'hit', 'total', 'bhit', 'btotal', 'start_len', 'end_len'])\n", - " d['ratio'] = d.hit / d.total\n", - " d['bratio'] = d.bhit / d.btotal\n", - " d.set_index('x', drop=False, inplace=True)\n", - " return d " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def calc(depth):\n", - " row = []\n", - " for x in FILES[depth:]:\n", - " cache = set()\n", - " for prev in xrange(depth):\n", - " with open('{:d}'.format(x - prev - 1)) as fd:\n", - " cache.update({binascii.unhexlify(_[2:66]) for _ in fd})\n", - " row.append(df_row_with_cache(x, cache, dyn=False))\n", - " return df_from_rows(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "d1 = calc(depth=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean 0.453759\n", - "std 0.061638\n", - "min 0.000000\n", - "25% 0.452917\n", - "50% 0.461423\n", - "75% 0.468988\n", - "max 0.496665\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAG/NJREFUeJzt3X+QXWd93/H3V6uVWQP10ngzwSslUoLjVA5gwUbxjNukOCSyoUgaMLHc0JppMp7SqokNUSMPHZu4nUGgJM404w5xB2ZoQmIZcNVNUKo0tekPZmy0QrKNsFW25oe0osOmeElTbe1d+ds/7jnS2aPz47n3nrv33n0+rxmN7j0/7n2e+zzne57zPM85a+6OiIjEYV2/EyAiIqtHQV9EJCIK+iIiEVHQFxGJiIK+iEhEFPRFRCKioC8iEhEFfRGRiCjoi4hEZH2/vvjqq6/2zZs39+vrRUSG0vHjx//S3Sc63b9vQX/z5s3MzMz06+tFRIaSmX2rm/3VvSMiEhEFfRGRiCjoi4hEREFfRCQiCvoiIhFR0BcRiYiCvohIRBT0RUQioqAvIhKRoKBvZreY2WkzmzWz/QXr329m82Z2Mvn3K80nVUREulX7GAYzGwEeAn4eOAscM7Npd/9abtND7r63B2mUBh0+McfBo6c5t7DINeNj7NtxHbu3TfY7WSKySkKevbMdmHX3FwDM7BFgF5AP+jLgDp+Y497HnmVx6QIAcwuL3PvYswDBgV8nDZHhFtK9Mwmcybw/myzLe4+ZPWNmnzOzTUUfZGZ3mdmMmc3Mz893kFzpxsGjpy8G/NTi0gUOHj0dtH960phbWMS5dNI4fGKuB6kVkV4ICfpWsMxz7/8E2OzubwL+Avh00Qe5+8PuPuXuUxMTHT8ZVDp0bmGxreV53Z40RLpx+MQcNx14nC37v8BNBx5XY6NDId07Z4Fsy30jcC67gbv/78zbfwt8rPukCazsTrlqbBQzWDi/1FHXyjXjY8wVBPhrxseC9u/2pCHDZ1C685rompSWkKB/DLjWzLYAc8Ae4O9nNzCz17v7d5K3O4HnGk1lm5qqqP3+nHxFX1hcuriuk0q/b8d1Kz4PYGx0hH07rgvav+yksc6MwyfmdPA1YFCCbJqWQQm0VVeZqnftqQ367r5sZnuBo8AI8Cl3P2VmDwAz7j4N/KqZ7QSWge8B7+9hmis1VVG7/Zz04J1bWMS41B/WzucUVfSsdit9ul2nQaXopAFwwX2oW12DEmgHbaB9kAKtrjKbY+757vnVMTU15b34y1k3HXi8sDU6OT7Gl/bfvCqfkz94i4R8zpb9X7hs8CTPgG8ceGfNVs05fGKODz36NBcK6k27v/EgKCqrsdERPvruN7Yd2LoNuN3W3bJ697orR7n/Xde3nZ+y+ldU53p94mzquF4LzOy4u091uv+auyO3qRZBN59T10IP/ZyQvvbQ/vim7N42ySslDYVhbHU1NTjdxMymXgy0A7x4fqmjWVZldSu/fDVmde3bcR1joyMrlrXTNSmX9O1v5PbC4RNzrDMrbIW2Gxy7GfRsKqCXdaek+lXpux0Qbko3rcts91uRdk9g3XaFNFF3q9LcSbdM6BjQanQDdds1mdVEvclOrHjx/BIjSdlNDsG9K2sm6KetjaKDppPg2M2gZ1lQbPdz8hW929k7Tel2QLgJ3fR/h3S/tXsC66aV3lTdrat37Z7IQgNtr/vb80H6wdtv6LjeN1lvshMr0rIbhllFa6ZPv6zPb8SM3/7FN/d11g1wcTA3bRGM9zCAF6Ubqg/edvPa5FTSTnTTx1u2b6qTPv1epKeu7ubL7G0/McHnj8+VnszaGRtopy6UpX98bJRXX7G+rXpYlJYmx1y6GY+qqzftfl6nuu3TH6qgX1UZ2xl06kTVd4cE2boDstOKXJTO/EEyus7AYOnCpV9obHSE97x1kieen79shlF+fdUB2uRB2Y6qQe7JmgDTzb5
5ZbO0UiGDqJ3U3bLf/T1vneRPn/7OilZoui6kTELKM+RkU1TnyuphVbqaGsCtu7oLiRMhEyva+bxOdRv0h6Z7p+6yLKSfuamWe/a7gcJ1H333G1dUypsOPN7o9MsyRf2rS69cXlUXly7wmSe/fbES57fIry+7bO3HtL6q/u80rfs++/SKAJNNf1ldSQNJaBdAfrui1KSDqPl9s9oZI6kai1hcusATz89z8v5fKG2I3HTg8cr6X1eeRb/N54/PXdZAOP/yMi+eX3niKauHVXUlpOso5Lium1zhtH6bqsbc+JWjl+WpzGqPb7VjaIJ+WWW8+9BJDh49XdjaGBsd4W0/MXGxtRA6Xz5f2OdfXq6c4RES9EL6N0MrclXXSujlJxQHqar17eSrVzN5qvq/s6oCTNmYRLaulO1bd8IrUhfY6tITerUIl3733dsmK09QZfW/rjzLjsMnnp9f0cjZsv8LpWks++w0nSGBNg2q3eYrq64xFyodh8meoAdpoHdogn5VoZW1NvIHSVEQ+9CjTwOsCKihhV2Vpvy6kICctjby6a6qjPm7dIu6GJoUmq9etXRCA22ZuYXFi5+RPRBDAurcwuKKINxOICiqK/mT96tG1108eRfVgT988tu131N2d3RZsP7I9Km2gmzoSb6d36cqgBcJOUGH5qto37LGXN6Vo+u4YnSkcPYOrDxOB2mgd2iCfl0lWly6wB8/dWbFwFddlwpcupt05lvfu9i/3U6aoLhy5oNe3fTLVNnBHVoZ2+lzrNq2bH0+qKz2TJ5uryCMS+V1wf1iWkNOJtl92z3B5rsPimaCjI2OXJyZElJ3i5TdHV32uy0sLl1sOMwtLDK6zhgdscv63dPyDD3JF9WLsj797BhKVZ+7Q/AJOiRfZULr2OtefUXpuEJV+fX78RFDc3NW0c0ZeWmFT28KCS28tP+6k0u4onSlwSH7JMDd2yb56LvfyOT4GEZrZsPrrhwN/j5o5aeJbpPJ8TF+6cYfLkw3tGaNOMWPV83/xvl8TY6P9XQQt+wKoiitRduUdVnV/a5F+xb9RmlwKZK9aanuprBuyrno5rLQK6+lV5ylC86ItfKQliewops0K3+Sz+Yv+zkH3/tmDt725svqSvazy6QB/0v7b+aJ5+fbPiEuveK8esP6Fd9ddvxdMz7W9f04deXXzxsZh6aln50zXNfiT8+i7Vxi1p3/q6afZbsKqsYN8v2s0N6MgKori1DZWQ9TP/I3K2dhlKUr31JJ85Ue7Pck4ywhfZftDq6XXVm8562Tld0fkxV1oa67Znxs9LLZMKnsNNz8zJ+qsYG6bpJ2u4/KPgdav/H/fWm5rf2zV0HAZQPW2ZZ3frwp362Rfk62vqTbfmT6VOlvW5anTgPmwuISH9l5fWlagdI8F6k6MdSVXz8HeoempQ+tyvKl/Tfzu7ffUNnqTytFyNVBiLHRET6y83q+tP9mHrz9BgDuPnSSew6dXNFVUNWSLBNa+FVXFqHyLbL09/zGgXe23YLKH3id3IrfyT5lVxb/andrWZH0RFe2Pj3ZFN3m/74bf5iXll8pTQ9cHtjS37Xs6iM9yZSlBcLq7ohZZWsVLv3GoYE1K627RVcl2ZZ36GyurE7Sleap6phJryzKlF2lpvtm05pdV3d1k1dVfv1+fMRQBf1UWlhlBZxWiqIA8b6Kbo0i2e6KbJCC4sv9IlUtk5CDO5uGom6idSUZGB8bbavbpZ0WVP7Aq5pdVfYHLzrZBy4/WWXHF6qez1K1vuxkEnoibKdLpeokk6Y1H5CKgs5v/+Kbuf9d11d+TrcD31VdikXLQ7dtN1115Zh2q9XN6sqX0+5tkxc/LzvYes+hk9x96CQAv3v7DTx4+w1tHUtFJxQC9+21oeneyUt/tLpBxKIulWy3xlVjo7y8fIHzSytbc0U3jXR6AFW1TNLPL7rMLbtxpW46Xrpv9lI2NJ1Fl6RFN27lWyp1s6vanUY3t7DI3YdO8pt/cor733U9UH+
zVNljA+DS/PT8LJl8t0P+M+9JDvwQ+fxUDXKHPOIgm566brCydd32HbczWSFdFrJtXbqKulPz3UP5adWhc+hDTkD5Ltr8fTchiurTIBiqO3KLdPvwpHYeRRv6qON8gAw9szf1IKhOH4VQdZdn3Z25Ibeo5++iDL2tvZM7OevyFFom3d563+9n9Yc+IqHsPpd0oDX0Nwz9vat+107u6G5nbCxfTiH7NvVYhSbqQzR35Jbp5mxa1nK/csP6ws8MeZBaSIAs001emmhVhLQ+y4RMSQ1pCRfp5E7OVLd3DHcy9TCr3629squNoqvA/MB+6NVEVmgdKiv7Tp/9381VasigeegVU91NlYPwl8iGPuh3o927SYsqatkshmHVaZAKmV2Vv8QPnZFVpt27nNvdF6q7jPrZgg/Vzom8quzbqRch23bTwChSNaurrhEW0vgImXBRF9T78ciSIlEH/XbvJm26oq412amboTdsVe1TJ+RAbOKO4bIgNizl3u+rjTJNpqubYzPf+Ai5OihSF9RX+5ElZaIO+p3cTTqoB9Ag6eQArBrQbqc7JW8Qnv2/lvV7zCKrqe7RTvPU6b0Xqz1nf+gHcrs1SJVWWpr+60Qq497o12O1B1XdY6Cb+r2iep6+xEVBZbDpj5Wv1MnfItDsHZGMQRn4kmKD0kc9KNq996JfFPRlYCmoDLZB6aMeJIMQ1OsM5WMYJA51z6eR/qp7lIQMJgV96anDJ+a46cDjbNn/hcrn6RRRUBls2efLrMZjtaUZ6t6Rnun2DkTdFzH4hqE7Q1ZS0JeeaWIgVkFFpFnq3pGe0UCsyOAJCvpmdouZnTazWTPbX7HdbWbmZtbxHFJZOzQQKzJ4aoO+mY0ADwG3AluBO8xsa8F2rwV+FXiq6UTKcNJArMjgCWnpbwdm3f0Fd38ZeATYVbDdvwQ+Dvy/BtMnQ0yzO0QGT8hA7iRwJvP+LPDT2Q3MbBuwyd3/1Mx+vcH0yZDTQKzIYAlp6Rf9BdaLD+wxs3XAg8CHaj/I7C4zmzGzmfn5+fBUiohII0KC/llgU+b9RuBc5v1rgZ8Evmhm3wRuBKaLBnPd/WF3n3L3qYmJic5TLSIiHQkJ+seAa81si5ltAPYA0+lKd/++u1/t7pvdfTPwJLDT3fUITRGRAVMb9N19GdgLHAWeAx5191Nm9oCZ7ex1AkVEpDlBd+S6+xHgSG7ZfSXb/t3ukyUia4n+kM3g0GMYRKSnun0GkzRLj2EQkZ6qegaTrD4FfRHpKT2DabAo6ItIT+kZTINFQV9EekrPYBosGsgVkZ7SH8MZLAr6ItJzegbT4FD3johIRBT0RUQioqAvIhIRBX0RkYgo6IuIRERBX0QkIgr6IiIRUdAXEYmIgr6ISEQU9EVEIqKgLyISEQV9EZGIKOiLiEREQV9EJCIK+iIiEVHQFxGJiIK+iEhEFPRFRCKioC8iEhEFfRGRiCjoi4hEREFfRCQiQUHfzG4xs9NmNmtm+wvW/2Mze9bMTprZfzezrc0nVUREulUb9M1sBHgIuBXYCtxRENT/yN3f6O43AB8HfqfxlIqISNdCWvrbgVl3f8HdXwYeAXZlN3D3v8q8fTXgzSVRRESasj5gm0ngTOb9WeCn8xuZ2T8FPghsAG5uJHUiItKokJa+FSy7rCXv7g+5+48BvwH8i8IPMrvLzGbMbGZ+fr69lIqISNdCgv5ZYFPm/UbgXMX2jwC7i1a4+8PuPuXuUxMTE+GpFBGRRoQE/WPAtWa2xcw2AHuA6ewGZnZt5u07ga83l0QREWlKbZ++uy+b2V7gKDACfMrdT5nZA8CMu08De83s7cAS8CJwZy8TLSIinQkZyMXdjwBHcsvuy7z+tYbTJSIiPaA7ckVEIqKgLyISEQV9EZGIKOiLiEREQV9EJCIK+iIiEVHQFxGJiIK+iEhEFPRFRCKioC8iEhEFfRGRiCjoi4hEREFfRCQiCvoiIhFR0BcRiYiCvohIRBT0RUQioqAvIhI
RBX0RkYgo6IuIRERBX0QkIgr6IiIRUdAXEYmIgr6ISEQU9EVEIqKgLyISEQV9EZGIKOiLiEREQV9EJCJBQd/MbjGz02Y2a2b7C9Z/0My+ZmbPmNl/NrMfaT6pIiLSrdqgb2YjwEPArcBW4A4z25rb7AQw5e5vAj4HfLzphIqISPdCWvrbgVl3f8HdXwYeAXZlN3D3J9z9fPL2SWBjs8kUEZEmhAT9SeBM5v3ZZFmZXwb+rGiFmd1lZjNmNjM/Px+eShERaURI0LeCZV64odn7gCngYNF6d3/Y3afcfWpiYiI8lSIi0oj1AducBTZl3m8EzuU3MrO3Ax8GftbdX2omeSIi0qSQlv4x4Foz22JmG4A9wHR2AzPbBvw+sNPdv9t8MkVEpAm1Qd/dl4G9wFHgOeBRdz9lZg+Y2c5ks4PAa4DPmtlJM5su+TgREemjkO4d3P0IcCS37L7M67c3nC4REekB3ZErIhIRBX0RkYgo6IuIRERBX0QkIgr6IiIRUdAXEYmIgr6ISEQU9EVEIqKgLyISEQV9EZGIKOiLiEREQV9EJCIK+iIiEVHQFxGJiIK+iEhEFPRFRCKioC8iEhEFfRGRiCjoi4hEREFfRCQiCvoiIhFR0BcRiYiCvohIRBT0RUQioqAvIhIRBX0RkYgo6IuIRERBX0QkIkFB38xuMbPTZjZrZvsL1v+MmX3FzJbN7LbmkykiIk2oDfpmNgI8BNwKbAXuMLOtuc2+Dbwf+KOmEygiIs1ZH7DNdmDW3V8AMLNHgF3A19IN3P2bybpXepBGERFpSEj3ziRwJvP+bLJMRESGTEjQt4Jl3smXmdldZjZjZjPz8/OdfISIiHQhJOifBTZl3m8EznXyZe7+sLtPufvUxMREJx8hIiJdCAn6x4BrzWyLmW0A9gDTvU2WiIj0Qm3Qd/dlYC9wFHgOeNTdT5nZA2a2E8DMfsrMzgLvBX7fzE71MtEiItKZkNk7uPsR4Ehu2X2Z18dodfuIiMgA0x25IiIRUdAXEYmIgr6ISEQU9EVEIqKgLyISEQV9EZGIKOiLiEREQV9EJCIK+iIiEVHQFxGJiIK+iEhEFPRFRCKioC8iEhEFfRGRiCjoi4hEREFfRCQiCvoiIhFR0BcRiYiCvohIRBT0RUQioqAvIhIRBX0RkYgo6IuIRERBX0QkIgr6IiIRUdAXEYmIgr6ISEQU9EVEIqKgLyISkaCgb2a3mNlpM5s1s/0F668ws0PJ+qfMbHPTCRURke6tr9vAzEaAh4CfB84Cx8xs2t2/ltnsl4EX3f0NZrYH+Bhwey8SLMPv8Ik5Dh49zbmFRa4ZH2PfjuvYvW2y38laNbHnX/orpKW/HZh19xfc/WXgEWBXbptdwKeT158Dfs7MrLlkylpx+MQc9z72LHMLizgwt7DIvY89y+ETc/1O2qqIPf/SfyFBfxI4k3l/NllWuI27LwPfB36giQTK2nLw6GkWly6sWLa4dIGDR0/3KUWrK/b8S/+FBP2iFrt3sA1mdpeZzZjZzPz8fEj6ZI05t7DY1vK1Jvb8S/+FBP2zwKbM+43AubJtzGw9cBXwvfwHufvD7j7l7lMTExOdpViG2jXjY20tX2tiz7/0X0jQPwZca2ZbzGwDsAeYzm0zDdyZvL4NeNzdL2vpi+zbcR1joyMrlo2NjrBvx3V9StHqij3/0n+1s3fcfdnM9gJHgRHgU+5+ysweAGbcfRr4JPAHZjZLq4W/p5eJluGVzlKJdfZK7PmX/rN+NcinpqZ8ZmamL98tIjKszOy4u091ur/uyBURiYiCvohIRBT0RUQioqAvIhIRBX0RkYgo6IuIRERBX0QkIgr6IiIRUdAXEYmIgr6ISET69hgGM5sHvtWXL4ergb/s03c3aa3kA5SXQbRW8gFrKy/XuftrO9259oFrveLufXu2spnNdPPsikGxVvIByssgWiv5gLWXl272V/eOiEhEFPRFRCISa9B/uN8JaMhayQcoL4NoreQDlJe
L+jaQKyIiqy/Wlr6ISJTWXNA3s01m9oSZPWdmp8zs15LlB83seTN7xsz+vZmNZ/a518xmzey0me3oX+pXKstLZv2vm5mb2dXJezOzf53k5Rkze0t/Ur5SVT7M7J8lv/spM/t4ZvlQlYmZ3WBmT5rZSTObMbPtyfKBLBMAM3uVmX3ZzJ5O8vKbyfItZvaUmX3dzA4lfxsbM7sieT+brN/cz/SnKvLxmaT+fNXMPmVmo8nyoSuTzPrfM7O/zrxvv0zcfU39A14PvCV5/VrgfwBbgV8A1ifLPwZ8LHm9FXgauALYAvxPYKTf+ajKS/J+E62/W/wt4Opk2TuAPwMMuBF4qt95qCmTtwF/AVyRrPvBYS0T4M+BWzPl8MVBLpMkbQa8Jnk9CjyVpPFRYE+y/BPAB5LX/wT4RPJ6D3Co33moycc7knUG/HEmH0NXJsn7KeAPgL/ObN92may5lr67f8fdv5K8/j/Ac8Cku/+5uy8nmz0JbExe7wIecfeX3P0bwCywfbXTXaQsL8nqB4F/DmQHZXYB/85bngTGzez1q5nmIhX5+ABwwN1fStZ9N9llGMvEgb+RbHYVcC55PZBlApCkKW01jib/HLgZ+Fyy/NPA7uT1ruQ9yfqfMzNbpeSWKsuHux9J1jnwZVYe80NVJmY2AhykdcxntV0may7oZyWXOttonS2z/hGtMz20DtgzmXVnuRRYB0Y2L2a2E5hz96dzmw18XnJl8uPA30kuS/+Lmf1UstnA5wMuy8vdwEEzOwP8FnBvstlA58XMRszsJPBd4D/RuqpayDSQsum9mJdk/feBH1jdFBfL58Pdn8qsGwX+AfAfk0VDVSZJXvYC0+7+ndzmbZfJmg36ZvYa4PPA3e7+V5nlHwaWgc+kiwp2H6gpTdm80Er7h4H7ijYtWDYweSkok/XA62hdYu8DHk1aKQOdDyjMyweAe9x9E3AP8Ml004LdByYv7n7B3W+g1QreDvytos2S/wc2L/l8mNlPZlb/G+C/uvt/S94PbD6gMC8/A7wX+L2CzdvOy5oM+smZ/fPAZ9z9sczyO4G/B/xScskHrbP8pszuG7l0ad53BXn5MVr93E+b2TdppfcrZvZDDHBeSsrkLPBYckn7ZeAVWs9IGdh8QGle7gTS15/lUnfUQOcl5e4LwBdpnYDHzSx9REs2vRfzkqy/Cvje6qa0WiYftwCY2f3ABPDBzGbDViZvA94AzCbH/JVmNpts1naZrLmgn7QUPwk85+6/k1l+C/AbwE53P5/ZZRrYk4yCbwGupdX/13dFeXH3Z939B919s7tvplXob3H3/0UrL/8wmZ1wI/D9gsvBVVdWJsBhWv3HmNmPAxtoPRRrqMokcQ742eT1zcDXk9cDWSYAZjZhySw2MxsD3k5rjOIJ4LZkszuB/5C8nk7ek6x/PNN46puSfDxvZr8C7ADucPdXMrsMW5kcd/cfyhzz5939Dcku7ZdJ3UjvsP0D/jaty5tngJPJv3fQGgw8k1n2icw+H6bVl3maZAbGIPwry0tum29yafaOAQ8leXkWmOp3HmrKZAPwh8BXga8ANw9rmSTLj9OadfQU8NZBLpMkbW8CTiR5+SpwX7L8R2mdZGdpXbWks6telbyfTdb/aL/zUJOP5eR3T8spXT50ZZLbJjt7p+0y0R25IiIRWXPdOyIiUk5BX0QkIgr6IiIRUdAXEYmIgr6ISEQU9EVEIqKgLyISEQV9EZGI/H9GELH3W/NxkgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scatter(d1.x, d1.ratio)\n", - "d1.ratio.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "d2 = calc(depth=2)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 118.000000\n", - "mean 0.466060\n", - "std 0.016722\n", - "min 0.387390\n", - "25% 0.457964\n", - "50% 0.466787\n", - "75% 0.475006\n", - "max 0.499960\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD8CAYAAACVZ8iyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X+QXWWd5/H3J00HG8wYR9pSmrCJCuwgWESukSpnHMWBMLqTZAV3otYM7JbFrjtZVgZYQ+n6A2pLhCnZ2Vm2XMbR0tE1oGK2RxijljK7Sy2YjglgAtGIKN1xy1YMLkuEBL77xz03nNzcH+fec/qec29/XlWp9Dn3nO7nuc+95/uc59dRRGBmZovbkrITYGZm5XMwMDMzBwMzM3MwMDMzHAzMzAwHAzMzw8HAzMzIGAwkXSRpr6R9kja3eP0ySfOSdiX/3pPsP0fS/5a0W9IDkv646AyYmVl+6jbpTNIY8APgAmAW2A68MyL2pI65DKhFxKamc08HIiJ+KOlkYAfwOxFxoNBcmJlZLsdlOGYNsC8iHgGQtAVYD+zpeBYQET9I/bxf0s+BSaBtMDjppJNi5cqVGZJlZmYNO3bs+EVETPZ7fpZgMAU8ltqeBV7f4riLJb2R+l3ElRGRPgdJa4ClwI+aT5R0OXA5wKmnnsrMzEy21JuZGQCSfpLn/Cx9Bmqxr7lt6e+AlRHxGuBbwGeP+gXSy4G/Bf55RDx3zC+LuDUiahFRm5zsO7CZmVmfsgSDWWBFavsUYH/6gIj4ZUQ8nWz+NXBu4zVJvwXcCXwwIu7Nl1wzM1sIWYLBduA0SaskLQU2AtPpA5Kaf8M64KFk/1Lgq8DnIuJLxSTZzMyK1rXPICIOS9oEbAPGgE9HxG5J1wEzETENXCFpHXAYeBy4LDn9nwFvBF6SjDgCuCwidhWbDTMzy6Pr0NJBq9Vq4Q5kM7PeSNoREbV+z/cMZDMzczAwMzMHAzMzw8HAzMxwMDAzMxwMzMwMBwMzM8PBwMzMcDAwMzMcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMxwMzMwMBwMzMyNjMJB0kaS9kvZJ2tzi9cskzUvalfx7T+q1SyX9MPl3aZGJNzOzYhzX7QBJY8AtwAXALLBd0nRE7Gk69LaI2NR07m8DHwZqQAA7knN/VUjqrRBbd85x07a97D9wkJOXT3DN2jPYsHqq7GSZ2QB1DQbAGmBfRDwCIGkLsB5oDgatrAW+GRGPJ
+d+E7gI+GJ/ybW0Ii7iW3fOce0dD3Lw0LMAzB04yLV3PAjggGC2iGRpJpoCHkttzyb7ml0s6QFJX5a0osdzrUeNi/jcgYMEz1/Et+6c6+n33LRt75FA0HDw0LPctG1vgak1s6rLEgzUYl80bf8dsDIiXgN8C/hsD+ci6XJJM5Jm5ufnMyTJirqI7z9wsKf9zbbunOMNN3ybVZvv5A03fLvnYGRm1ZAlGMwCK1LbpwD70wdExC8j4ulk86+Bc7Oem5x/a0TUIqI2OTmZNe2LWt6LeMPJyyd62p9W1N2JmZUvSzDYDpwmaZWkpcBGYDp9gKSXpzbXAQ8lP28DLpT0YkkvBi5M9llOeS7iadesPYOJ8bGj9k2Mj3HN2jO6nusmJrPR0TUYRMRhYBP1i/hDwO0RsVvSdZLWJYddIWm3pPuBK4DLknMfB66nHlC2A9c1OpMtnzwX8bQNq6f42NvPZmr5BAKmlk/wsbefnanzuKi7EzMrnyKOacIvVa1Wi5mZmbKTMRTKHhL6hhu+zVyLC//U8gnu2Xz+wNJhi1vZ34OqkLQjImr9np9laKlV1IbVU6V+6K9Ze8ZRw1Khv7sTs355aHRxvByF9S1PE5NZEdxvVRzfGVguZd+d2OLmfqviOBgMIbeRmtWdvHyiZb9Vr6PqzMFg6FS5jdRBavGoSlm736o47jMYMlVtI/UEtMWjSmXtfqviLIo7g6JqMVWoDVW1jbRTkPIXc7RUrazdb1WMkQ8GRTWrVKV5pqptpFUNUna0Iio0LuvRNPLNREU1qxTZPJNncbeiZh4XrajlMWzhFNW847IeTSMfDIqqxRT1e/J+IavaRlrVIDUqilgdtqgKjct6NI18M1FRzSpF/Z4i2lur2EbaSE/ZfSqjqKgmyqIqNC7r0TTywaCooWdF/Z5Rbm+tYpAaBUV12BbZ3+SyHj0j30xUVLNKEb9n6845lqjV837c3lo1VXpoT1EVCDfvWCcjf2cAxdVi8vyexq3+sy1WifUXslqqMnKsIW+NPj2C6EUT47xgfAkHnjrk5h07ysjfGVRFq1t9gDGpEh3A9ryqTezLU6NvHrBw4OAhfnPoOW7+43O4Z/P5/tzZEYvizqAK2t3SPxfhL2TFFNmvU8S4/jwdtlWbIGbV5WAwIFWdLGbHKqqsimxu6reJcpQHLFix3Ew0IO68Gx5FlVUVmps8Qcyy8p3BgHhs9vAoqqyqUCsve1XPKqznZdlkCgaSLgL+EhgDPhURN7Q57hLgS8DrImJG0jjwKeC1yd/6XER8rJCUDyGPzR4eRZRVFZoGy6yEVG1UlnXWNRhIGgNuAS4AZoHtkqYjYk/TccuAK4D7UrvfARwfEWdLOgHYI+mLEfFoURkwq6qya+UNZVVC3Hk9XLLcGawB9kXEIwCStgDrgT1Nx10P3AhcndoXwImSjgMmgGeAX+dNtNlCKLpJY7E3DVahmawMzfM6JIZiXkeWYDAFPJbangVenz5A0mpgRUR8TVI6GHyZeuD4GXACcGVEPJ4vydm5vXLx6bfMF6pJYzE3DVahmawXea8XW3fO8ZHp3Rw4eOjIvvTPVW8myzKaqNX6CUem0UpaAtwMXNXiuDXAs8DJwCrgKkmvOOYPSJdLmpE0Mz8/nynh3VTpaUw2GHnKvAojf0bNsIyg27pzjnM++g3ed9uuvq8Xjc9e+uLfSpU/U1mCwSywIrV9CrA/tb0MOAu4W9KjwHnAtKQa8C7g6xFxKCJ+DtwD1Jr/QETcGhG1iKhNTk72l5Mm/nIvPnnKvFOTRlnrFFVpfaR+VHW59bROF/FerhftVhhoparNZFmaibYDp0laBcwBG6lf5AGIiCeAkxrbku4Grk5GE70FOF/S56k3E50H/Mfikt/eYm2vXMzylHm7Jo0XTYyXMiJmVEbiFNVMtlBNvt0u4lmvF71cV6raTNb1ziAiDgObgG3AQ8DtEbFb0nWS1nU5/RbghcD3qQeVz0TEAznTnIkn2yw+ecq8X
ZOGRCl3mL6zfV675r8Pbn0w951Tt4t41utF1uOq2EzWkGkGckTcFRGnR8QrI+I/JPs+FBHTLY59U0TMJD8/GRHviIhXR8SZEXFTsclvb1jaK0dRWc0becq8XZPGgadatwEv9B2m72yf1y4wfuHeny7YIzyht+tFq88ewAnjS3jxCeOVbSZLG9kZyIt9WF83C3XbXWbzRt4yb9WkcdO2vaWMiOl3JM4ojqBrFwCbF4PvZw5Dq7kgAC8+YZwP/9Gre/rswHBfbxQt1tcvU61Wi5mZmbKTMTBlfHmbL9hQrwUVUWt5ww3fbnkRm1o+wT2bz8/1u8uwkO9V0X+3rLT2qtfPfLvPVCsCfnzD2xY0PVUlaUdEHDNAJ6uRvTNYaEV8gMqqRXdrj86Tr1Fr3iirxtfu70L94tgqLQs947esz3yr2rs49s4A/AjPPHxn0IeiamBF1qJ7+aKu2nxnyy8S1PORJ1+jdmeQlvU9HlQTHNTL5+Jzp/jOw/Nta8/91Jaz/u30ZyNLvvv9fDT/7jf/40m+smOu8ndBg+Q7gxJ0q4FlvRi0qy3PJWPbs36oe61ttWuPHpNy1yyrsh4PtL44QX+1/Kzv8ULe7XXqSO1UpSuinyHLZz5Lvjt95tvd8TR+R3Oaav/ot0eieacqHAz60G2CUtaLQbuLMtDTBaTX4NSuVtVuvHUvwakqHWmtyuGaL90PgkPPxpF9Wd/nrE0w7Y77yPTuBVsSu1Mg6BaI817EG/uzvj/tPvNK/nanNDRrFSBGpf2/DH64TR86jWfvZXx4u+Fonc5pJUtwSg/B+8qOOS4+d+qYYZRTHWqQvQzb27B6ins2n8+Pb3hbac/ZbVUOh56LI4GgoYgZylmOO3Dw0IIOg2wly1DGrJ/XbnM4sr4/rT7zrdr/+5lT4SVo8nEw6EOn8ey9dKA2xra3k7XTtZ/g9Pl7fwpw1IPRuwWnj0zvHprlEXrpsM46QznL/qwX7H4udu0upK002uC7BeI8F/H0XUfW96fVfI52dza9DjrwRL18HAz60GnNlV5nwW5YPdW2Rp4+p9NErn6CExxbc+oWnIqo3Q5KL7XoPDOUm5tgOgXUZr1e7Fp97t593qm5JlfmuYin7zp6mfDXfOeY5fOfxaiNZBs09xn0qd1wtH46ULud061dt1M7fbtJUw3N7bpZzml3bpW0ek/Hl+ioPgPobYYydO8LaXXcU88c5lctZjIXNQwyT0dqL5/XTkMw8/QVFTXooMglsxdj38PIDC2tUuH1k5ZO5+QZrtlqSGCz5qGHWc5pd25Zso4carVvmCf5FaEK352FmMMA/b3PVS+vdvIOLR2JYNBL4VXhg9+rdvMCsl6IG3luV9tvFVSa36d2tdsqzB/oVv5VKPMqpGExKOJ9Hta5Mp5nQPZhbf2OAS/7i5z39rdxe9/uopmlSaCXcwetW8dhFZaC9izXwSjifc7S91D2NWEhjEQwyNpx1M90/SqsK19Um2qedt2qzB9opVP5L+aHso/iBasX/ea/W+Ur7zUhfac+JvFsBFMVKJ+RCAZZa86dZj+uvu4bLR9aXYWLSZEX4jw1p6rWbjuV/2IdYVKFSkyZ2uV/5ieP852H5zt+j7pVvvJcE5rT9Wz0PgFyoYxEMMhac+4047fRHt5cKGVcTNrVaHqpdSym2mCn8i9rCeqyVaESU6YsS3e0uwB3q3zlWUam05PVyi6fkQgGWWvO7dYub5YulCKHq2WRp0a3WGuD3cq/qn0dC2mx3hE15H0GQqfKV55lZLq9/2WWz0gEA8jWhNF4/X237er6+xqFMuiF1/LU6BZzbbBd+Ve5r2MhDboSUzWdLtjNer0Ad6pUdvu+dUtXmeUzMsEgq6yTqhqFMuiLSZ4a3WKvDbbTraKQp2mtqs1yVVo9tgwL+QyEbpXKTt+3ToGk7PLJFAwkXQT8JTAGfCoibmhz3CXAl4DXNZ6DLOk1wH8Ffgt4LnntNwWkvW/dmouaC2WQHad5anSLv
TbYj1FtlstbialqkMuqVf7brdbbzwW4U6Wy0/ctna6qjSbqOulM0hjwA+ACYBbYDrwzIvY0HbcMuBNYCmyKiBlJxwHfA/4kIu6X9BLgQES0bbQf1MNt0h/2F02MI9FyNNGg5Zn92OrcRm2oCh+2KsozwajsyUmDfojOoCZxdhp6Cfnu0otMb9VmKg9i0tkaYF9EPJL8wS3AemBP03HXAzcCV6f2XQg8EBH3A0TEL/tNaNGqOkyyqLkAcwcOHnVbXHStddhrjg3D2iy3EHclnWaqFzmJs1sa2g29zPM8ioYiv/ej1h+VJRhMAY+ltmeB16cPkLQaWBERX5OUDganAyFpGzAJbImIG3OmeSDKvNgVMRegVa21qM7kKjeP9KrfprWtO+dYktRaez23CEUPFsiyHlURkzi76TT08tBzx77XZQ+QqGqlsh9ZlrButWT6kVKRtAS4GbiqxXHHAb8LvDv5/59Kessxf0C6XNKMpJn5+flMCV9IVXlIRqdlq7tZyFrrKK0b38vSyw2Nz0erQNB4Ylen8upWrlnKvejy7XQRbgg4Kj0L8Rnr59zFPkCiKFmCwSywIrV9CrA/tb0MOAu4W9KjwHnAtKRacu4/RMQvIuIp4C7gtc1/ICJujYhaRNQmJyf7y0mBqnCxyxuQen2uQi9GadTShtWd1+lvpdOFs7lZrtWFvlO5Zi33oss3a9ml07MQn7F+zvUAiWJkCQbbgdMkrZK0FNgITDdejIgnIuKkiFgZESuBe4F1yWiibcBrJJ2QdCb/Psf2NRQqT226oQoXu7wBqZ8abzvN7+nyE8ZbHjesX8oNq3t7TGfWz0Gr8upWrlnLvcjyhd7KrpGeotMAnR8ONL5EjI8d3VBR9nDMUdK1zyAiDkvaRP3CPgZ8OiJ2S7oOmImI6Q7n/krSJ6gHlADuiog7C0r7MYpqy67CEM28Aamozq1W72njS5nlITGj0tGclmdCU7dyzVruRQ8dbTfsst0d0P4DBxekA7Xb0Mui/549L9M8g4i4i3oTT3rfh9oc+6am7c8Dn+8zfT0pqkOr3wk7WR+wUsTKiVkU0bnV7sHyyyfGOfH44zrma5Q6mtPyTGjqVq69lHu/5duqXL6yY46Lz506ZhG3bmPpF2LNrG6/c5g/O1U2UjOQi2re6afG0+oLlmcoXFVmkLZ77544eIhdH76w47mjujxGnglN3cp1EOXerly+8/B8y/kRedMzqpWCUTNSwaDI5p1ea13tatDNsl4MqzKGOc97WoW+l4XS6vOR5VnE3cp1EOXeS7kUkZ5RrRSMmpEKBmXWpnu5wPXS7l/2lyXPe1qFvpdBylpeWZpBFrLcey2XvOmpYqVgFPuy8hqpYFBGbbrxoerlSdLDdDHM855WpanLjjbocim7UtCts9zNVnVd1yYatEGtTVSEbrM2x5foqD4DKHftkjK4BlZNgyyXMtfw6bReV7OqP/C+m7xrEzkYNOnlS9JuoTLAQ+FayPreOoCMnrLKtNN3tJmAH9/wtoVN0AIaxEJ1i0avox7atXkKjqph+EKW/b3N8+xaq66y+r966ZcYpubbhZBlBvKi0eus34Vc8mHUZH1vOz27tuy1omz4tPsuNi+41mnSZN4VDYaFg0FKr6MeFmI6/qjK+t72+uxas07afUfffd6pXdejqsqClYPiZqKUfobcgfsEssj63vay1ENjdVC/99ZOnu/oIOdHVKGfzB3IKVV7ctEoyfreZllXv6F5VIjLyoq0avOdbYeMF/nkwKKuO3k7kN1MlNLPcsaWTbf3ttE2e+Vtuzj+uCUsafUUjZRWwwPddGRF6tT3V2STURWWzAc3Ex2jCrN+R1W797a5ZnTg4KGOv2eqQ1PSKCx1YdXQanJeWlFNRlWZoe07AytdlqdsNTQmBk15JJctsPTdbDtFXLCrMirRwcBKl/UL1by6Z56RXItpyKD1r/Hgo4WsfFRlVKKDgZWu3Rdq+cR42z6GPP07i23IoOW3kBfsq
vRVejSRlW7Qo7jaLVEw7GvT2MKqwvDPTrwchQ29Qc/XqEqHnQ2XUR9c4mBglTDIL1rZSyqbVZH7DGzRqUqHnVmVZAoGki6StFfSPkmbOxx3iaSQVGvaf6qkJyVdnTfBZnlVpcPOrEq6NhNJGgNuAS4AZoHtkqYjYk/TccuAK4D7Wvyam4G/z59cs2KMevuvWa+y9BmsAfZFxCMAkrYA64E9TcddD9wIHFX7l7QBeAT4f7lTa2Y2hKo+EgmyNRNNAY+ltmeTfUdIWg2siIivNe0/EXg/8NFOf0DS5ZJmJM3Mz89nSriZ2TAYlnktWYJBqyXDjkxOkLSEejPQVS2O+yhwc0Q82ekPRMStEVGLiNrk5GSGJJmZDYeqLETXTZZmollgRWr7FGB/ansZcBZwtySAlwHTktYBrwcukXQjsBx4TtJvIuI/F5F4W3yG4XbbLG1Y5rVkCQbbgdMkrQLmgI3AuxovRsQTwEmNbUl3A1dHxAzwe6n9HwGedCCwfvX6jGqzKhiWeS1dm4ki4jCwCdgGPATcHhG7JV2X1P7NBmJYbrfN0oZlXkumGcgRcRdwV9O+D7U59k1t9n+kx7SZHWVYbrfN0obl8bhejsKGxrDcbps1G4Z5LV6OwobGsNxuL3Z+VsRw8p2BDY1hud1ezNzJP7wcDGyoDMPt9mLWqZPf5VZtbiYys8K4k394ORiYWWGq8nB3652DgZkVxp38w8t9BmZWGHfyDy8HAzMrlDv5h5ObiczMzMHAzMwcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMzIGA0kXSdoraZ+kzR2Ou0RSSKol2xdI2iHpweT/84tKuJmZFafr2kSSxoBbgAuAWWC7pOmI2NN03DLgCuC+1O5fAH8UEfslnQVsA7xoiZlZxWS5M1gD7IuIRyLiGWALsL7FcdcDNwK/aeyIiJ0RsT/Z3A28QNLxOdNsZmYFyxIMpoDHUtuzNNXuJa0GVkTE1zr8nouBnRHxdPMLki6XNCNpZn5+PkOSbJj5gelm1ZNlCWu12BdHXpSWADcDl7X9BdKrgY8DF7Z6PSJuBW4FqNVq0eoYGw1+YLpZNWW5M5gFVqS2TwH2p7aXAWcBd0t6FDgPmE51Ip8CfBX404j4URGJtuHV6YHpZlaeLMFgO3CapFWSlgIbgenGixHxREScFBErI2IlcC+wLiJmJC0H7gSujYh7FiD9NmT8wHSzauoaDCLiMLCJ+kigh4DbI2K3pOskrety+ibgVcC/l7Qr+ffS3Km2oeUHpptVkyKq1URfq9ViZmam7GTYAmnuM4D6A9M/9vaz3WdgloOkHRFR6/d8PwPZBsoPTDerJgcDGzg/MN2serw2kZmZORiYmZmbicwqYevOOfejWKkcDMxK5lnZVgVuJjIrmWdlWxU4GJiVzLOyrQocDMxK5lnZVgUOBmYlu2btGUyMjx21b2J8jGvWnlFSimwxcgeyWck8K9uqwMHArAI8K9vK5mYiMzNzMDAzMwcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzI2MwkHSRpL2S9kna3OG4SySFpFpq37XJeXslrS0i0WZmVqyuk84kjQG3ABcAs8B2SdMRsafpuGXAFcB9qX1nAhuBVwMnA9+SdHpEHL1Eo5mZlSrLncEaYF9EPBIRzwBbgPUtjrseuBH4TWrfemBLRDwdET8G9iW/z8zMKiRLMJgCHkttzyb7jpC0GlgREV/r9VwzMytflmCgFvviyIvSEuBm4Kpez039jsslzUiamZ+fz5AkMzMrUpZgMAusSG2fAuxPbS8DzgLulvQocB4wnXQidzsXgIi4NSJqEVGbnJzsLQdmZpZblmCwHThN0ipJS6l3CE83XoyIJyLipIhYGRErgXuBdRExkxy3UdLxklYBpwHfLTwXZmaWS9fRRBFxWNImYBswBnw6InZLug6YiYjpDufulnQ7sAc4DPyZRxKZmVWPIo5pwi9VrVaLmZmZs
pNhZjZUJO2IiFr3I1vzDGQzM3MwMDMzBwMzM8PBwMzMcDAwMzMcDMzMDAcDMzPDwcDMzHAwMDMzMixHYWa2ULbunOOmbXvZf+AgJy+f4Jq1Z7BhtVe5L4ODgZmVYuvOOa6940EOHqovVzZ34CDX3vEggANCCdxMZGaluGnb3iOBoOHgoWe5adveklK0uDkYmFkp9h842NN+W1gOBmZWipOXT/S03xaWg4GZleKatWcwMT521L6J8TGuWXtGSSla3NyBbGalaHQSezRRNTgYmFlpNqye8sW/ItxMZGZmDgZmZpYxGEi6SNJeSfskbW7x+r+S9KCkXZL+l6Qzk/3jkj6bvPaQpGuLzoCZmeXXNRhIGgNuAf4QOBN4Z+Nin/LfIuLsiDgHuBH4RLL/HcDxEXE2cC7wLyWtLCjtZmZWkCx3BmuAfRHxSEQ8A2wB1qcPiIhfpzZPBKLxEnCipOOACeAZIH2smZlVQJbRRFPAY6ntWeD1zQdJ+jPgz4GlwPnJ7i9TDxw/A04AroyIx/Mk2MzMipflzkAt9sUxOyJuiYhXAu8HPpjsXgM8C5wMrAKukvSKY/6AdLmkGUkz8/PzmRNvZmbFyBIMZoEVqe1TgP0djt8CbEh+fhfw9Yg4FBE/B+4Bas0nRMStEVGLiNrk5GS2lJuZWWGyBIPtwGmSVklaCmwEptMHSDottfk24IfJzz8FzlfdicB5wMP5k21mZkXq2mcQEYclbQK2AWPApyNit6TrgJmImAY2SfoD4BDwK+DS5PRbgM8A36fe3PSZiHhgAfJhZmY5KOKY5v9S1Wq1mJmZKTsZZmZDRdKOiDimGT4rz0A2MzMHAzMzczAwMzMcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMxwMzMwMBwMzM8PBwMzMqOAS1pLmgZ8M8E+eBPxigH9voY1SfpyXanJequmMiFjW78ldH24zaBEx0OdeSprJswZ41YxSfpyXanJeqklSrgfBuJnIzMwcDMzMzMEA4NayE1CwUcqP81JNzks15cpL5TqQzcxs8HxnYGZmox8MJK2Q9B1JD0naLenfJvtvkvSwpAckfVXS8tQ510raJ2mvpLXlpf5o7fKSev1qSSHppGRbkv5TkpcHJL22nJQfq1NeJP2b5L3fLenG1P6hKhdJ50i6V9IuSTOS1iT7q1wuL5D0XUn3J3n5aLJ/laT7JP1Q0m2Slib7j0+29yWvrywz/c065OcLyefo+5I+LWk82T90ZZN6/a8kPZna7q1sImKk/wEvB16b/LwM+AFwJnAhcFyy/+PAx5OfzwTuB44HVgE/AsbKzkenvCTbK4Bt1OdonJTseyvw94CA84D7ys5DhnJ5M/At4PjktZcOa7kA3wD+MFUWdw9BuQh4YfLzOHBfksbbgY3J/k8C701+/tfAJ5OfNwK3lZ2HjPl5a/KagC+m8jN0ZZNs14C/BZ5MHd9T2Yz8nUFE/Cwivpf8/H+Bh4CpiPhGRBxODrsXOCX5eT2wJSKejogfA/uANYNOdyvt8pK8fDPw74B0J9B64HNRdy+wXNLLB5nmdjrk5b3ADRHxdPLaz5NThrFcAvit5LAXAfuTn6tcLhERjdrlePIvgPOBLyf7PwtsSH5en2yTvP4WSRpQcrtql5+IuCt5LYDvcvT3f6jKRtIYcBP1739aT2Uz8sEgLblNWk09oqb9C+q1Aah/iR9LvTbL8xfcykjnRdI6YC4i7m86bOjyApwO/F5yW/sPkl6XHDaMeXkfcJOkx4C/AK5NDqt0XiSNSdoF/Bz4JvW7sAOpylM6vUfykrz+BPCSwaa4s+b8RMTYL26tAAACmElEQVR9qdfGgT8Bvp7sGqqySfKyCZiOiJ81Hd5T2SyaYCDphcBXgPdFxK9T+z8AHAa+0NjV4vRKDblK54V62j8AfKjVoS32VTYvSbkcB7yY+i36NcDtSW1mGPPyXuDKiFgBXAn8TePQFqdXJi8R8WxEn
EO9trwG+J1WhyX/VzovcGx+JJ2Vevm/AP8jIv5nsl3p/LTIyxuBdwB/1eLwnvKyKIJBEv2/AnwhIu5I7b8U+CfAu5PbRajXBFakTj+F52/vS9ciL6+k3oZ+v6RHqaf3e5JexvDlBeppviO5Jf4u8Bz19WOGMS+XAo2fv8TzzVqVzktDRBwA7qYemJdLaixfk07vkbwkr78IeHywKc0mlZ+LACR9GJgE/jx12LCVzZuBVwH7ku//CZL2JYf1VDYjHwySWuXfAA9FxCdS+y8C3g+si4inUqdMAxuTnvhVwGnU2xRL1yovEfFgRLw0IlZGxErqH4DXRsT/oZ6XP01GSJwHPNHiVrIU7coF2Eq9fRpJpwNLqS8kNlTlktgP/H7y8/nAD5Ofq1wuk0pG1kmaAP6Aeh/Id4BLksMuBf578vN0sk3y+rdTFavStcnPw5LeA6wF3hkRz6VOGbay2RERL0t9/5+KiFclp/RWNp16l0fhH/C71G+NHgB2Jf/eSr0D8rHUvk+mzvkA9XbSvSSjQarwr11emo55lOdHEwm4JcnLg0Ct7DxkKJelwOeB7wPfA84f1nJJ9u+gPgrqPuDcISiX1wA7k7x8H/hQsv8V1IPvPup3OY3RXi9Itvclr7+i7DxkzM/h5P1vlFdj/9CVTdMx6dFEPZWNZyCbmdnoNxOZmVl3DgZmZuZgYGZmDgZmZoaDgZmZ4WBgZmY4GJiZGQ4GZmYG/H/YoVgE2XZ46QAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scatter(d2.x, d2.ratio)\n", - "d2.ratio.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 118.000000\n", - "mean 0.012398\n", - "std 0.055022\n", - "min 0.000000\n", - "25% 0.003244\n", - "50% 0.004825\n", - "75% 0.006493\n", - "max 0.446525\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAEABJREFUeJzt3G+MXOV1x/HvCVtIYPnvskK2VZPGaougVeIpokVCs3FUkZACL0AF0dahllZpSYgKlYCmElIrVCillKKozQpoHcliIQTJVgJJkeMpygu7xYRgwI0wxAIH105kcLpAg5yevthLteMsnvG9Ozvex9+PtNp77zx3njNH5jeXZ+9MZCaSpHJ9YNgFSJIGy6CXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFW6k14CIeBD4NLAvM8+rjt0F/C7wLvAycF1mvlk9diuwFvgZcENmfqvXHEuWLMkVK1bUegFvvfUWJ510Uq1zS2VPutmPbvaj22Lux7Zt236cmb/Yc2BmHvYHuBj4GPD8rGO/A4xU23cCd1bb5wLfA04AzmHmTeC4XnOsWrUq69q8eXPtc0tlT7rZj272o9ti7gfwdPbI18zsvXSTmU8B+w859q+ZebDa3QIsq7YvB6Yy86eZ+QNgJ3BBz3cbSdLAzMca/R8BT1TbS4HXZj22uzomSRqSnmv0hxMRXwQOAuvfOzTHsDm/HjMiJoAJgLGxMTqdTq0apqena59bKnvSzX50sx/djoV+1A76iFjDzB9pV1drRTBzBb981rBlwOtznZ+Zk8AkQKvVyna7XauOTqdD3XNLZU+62Y9u9qPbsdCPWks3EXEJcDNwWWa+PeuhjcDVEXFCRJwDrAT+vXmZkqS6+rm98iGgDSyJiN3AbcCtzNxZ82REAGzJzM9m5gsR8QjwIjNLOtdn5s8GVbwkqbeeQZ+Z18xx+IHDjL8duL1JUZKk+eMnYyWpcAa9JBWu0e2VR4PtPzzAZ275xlDm3nXHpUOZV5KOhFf0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwvUM+oh4MCL2RcTzs46dERFPRsRL1e/Tq+MREf8QETsj4rmI+Nggi5ck9dbPFf2/AJcccuwWYFNmrgQ2VfsAnwRWVj8TwD/OT5mSpLp6Bn1mPgXsP+Tw5cC6ansdcMWs41/JGVuA0yLi7PkqVpJ05EZqnjeWmXsAMnNPRJxVHV8KvDZr3O7q2J5DnyAiJpi56mdsbIxOp1OvkA/BTecfrHVuU3VrHrTp6
emjtrZhsB/d7Ee3Y6EfdYP+/cQcx3KugZk5CUwCtFqtbLfbtSa8b/0G7t4+3y+jP7uubQ9l3l46nQ51+1ki+9HNfnQ7FvpR966bve8tyVS/91XHdwPLZ41bBrxevzxJUlN1g34jsKbaXgNsmHX8D6u7by4EDry3xCNJGo6eax4R8RDQBpZExG7gNuAO4JGIWAu8ClxVDX8c+BSwE3gbuG4ANUuSjkDPoM/Ma97nodVzjE3g+qZFSZLmj5+MlaTCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVLhGQR8RfxoRL0TE8xHxUER8MCLOiYitEfFSRDwcEcfPV7GSpCNXO+gjYilwA9DKzPOA44CrgTuBezJzJfAGsHY+CpUk1dN06WYE+FBEjAAnAnuAjwOPVo+vA65oOIckqYHaQZ+ZPwT+FniVmYA/AGwD3szMg9Ww3cDSpkVKkuqLzKx3YsTpwNeA3wPeBL5a7d+WmR+pxiwHHs/M8+c4fwKYABgbG1s1NTVVq459+w+w951apzZ2/tJThzNxD9PT04yOjg67jKOG/ehmP7ot5n6Mj49vy8xWr3EjDeb4BPCDzPwRQEQ8Bvw2cFpEjFRX9cuA1+c6OTMngUmAVquV7Xa7VhH3rd/A3dubvIz6dl3bHsq8vXQ6Her2s0T2o5v96HYs9KPJGv2rwIURcWJEBLAaeBHYDFxZjVkDbGhWoiSpiSZr9FuZ+aPrM8D26rkmgZuBGyNiJ3Am8MA81ClJqqnRmkdm3gbcdsjhV4ALmjyvJGn++MlYSSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwjYI+Ik6LiEcj4j8jYkdE/FZEnBERT0bES9Xv0+erWEnSkWt6RX8v8M3M/FXgN4AdwC3ApsxcCWyq9iVJQ1I76CPiFOBi4AGAzHw3M98ELgfWVcPWAVc0LVKSVF+TK/oPAz8C/jkivhsR90fEScBYZu4BqH6fNQ91SpJqisysd2JEC9gCXJSZWyPiXuAnwOcz87RZ497IzJ9bp4+ICWACYGxsbNXU1FStOvbtP8Ded2qd2tj5S08dzsQ9TE9PMzo6Ouwyjhr2o5v96LaY+zE+Pr4tM1u9xo00mGM3sDszt1b7jzKzHr83Is7OzD0RcTawb66TM3MSmARotVrZbrdrFXHf+g3cvb3Jy6hv17XtoczbS6fToW4/S2Q/utmPbsdCP2ov3WTmfwGvRcSvVIdWAy8CG4E11bE1wIZGFUqSGml6Kfx5YH1EHA+8AlzHzJvHIxGxFngVuKrhHJKkBhoFfWY+C8y1PrS6yfNKkuaPn4yVpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgrXOOgj4riI+G5EfL3aPycitkbESxHxcEQc37xMSVJd83FF/wVgx6z9O4F7MnMl8Aawdh7mkCTV1CjoI2IZcClwf7UfwMeBR6sh64ArmswhSWomMrP+yRGPAn8NnAz8GfAZYEtmfqR6fDnwRGaeN8e5E8AEwNjY2KqpqalaNezbf4C979Q6tbHzl546nIl7mJ6eZnR0dNhlHDXsRzf70W0x92N8fHxbZrZ6jRupO0FEfBrYl5nbIqL93uE5hs75TpKZk8AkQKvVyna7Pdewn
u5bv4G7t9d+GY3surY9lHl76XQ61O1niexHN/vR7VjoR5OEvAi4LCI+BXwQOAX4e+C0iBjJzIPAMuD15mVKkuqqvUafmbdm5rLMXAFcDXw7M68FNgNXVsPWABsaVylJqm0Q99HfDNwYETuBM4EHBjCHJKlP87K4nZkdoFNtvwJcMB/PK0lqzk/GSlLhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKlztoI+I5RGxOSJ2RMQLEfGF6vgZEfFkRLxU/T59/sqVJB2pJlf0B4GbMvPXgAuB6yPiXOAWYFNmrgQ2VfuSpCGpHfSZuSczn6m2/xvYASwFLgfWVcPWAVc0LVKSVN+8rNFHxArgo8BWYCwz98DMmwFw1nzMIUmqJzKz2RNEjAL/BtyemY9FxJuZedqsx9/IzJ9bp4+ICWACYGxsbNXU1FSt+fftP8Ded+rV3tT5S08dzsQ9TE9PMzo6Ouwyjhr2o5v96LaY+zE+Pr4tM1u9xo00mSQifgH4GrA+Mx+rDu+NiLMzc09EnA3sm+vczJwEJgFarVa22+1aNdy3fgN3b2/0MmrbdW17KPP20ul0qNvPEtmPbvaj27HQjyZ33QTwALAjM/9u1kMbgTXV9hpgQ/3yJElNNbkUvgj4A2B7RDxbHftz4A7gkYhYC7wKXNWsRElSE7WDPjO/A8T7PLy67vNKkuaXn4yVpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgo3MqgnjohLgHuB44D7M/OOQc0lSU2suOUbQ5t71x2XDnyOgVzRR8RxwJeATwLnAtdExLmDmEuSdHiDWrq5ANiZma9k5rvAFHD5gOaSJB3GoIJ+KfDarP3d1TFJ0gIb1Bp9zHEsuwZETAAT1e50RHy/5lxLgB/XPLeRuHMYs/ZlaD05StmPbvaj21D70TBHfqmfQYMK+t3A8ln7y4DXZw/IzElgsulEEfF0ZraaPk9J7Ek3+9HNfnQ7FvoxqKWb/wBWRsQ5EXE8cDWwcUBzSZIOYyBX9Jl5MCI+B3yLmdsrH8zMFwYxlyTp8AZ2H31mPg48Pqjnn6Xx8k+B7Ek3+9HNfnQrvh+Rmb1HSZIWLb8CQZIKt2iCPiIuiYjvR8TOiLhljsdPiIiHq8e3RsSKha9y4fTRj4sj4pmIOBgRVw6jxoXURz9ujIgXI+K5iNgUEX3dlraY9dGTz0bE9oh4NiK+U/qn13v1Y9a4KyMiI6KcO3Ey86j/YeYPui8DHwaOB74HnHvImD8B/qnavhp4eNh1D7kfK4BfB74CXDnsmo+CfowDJ1bbf1zyv48j6Mkps7YvA7457LqH2Y9q3MnAU8AWoDXsuufrZ7Fc0ffzlQqXA+uq7UeB1REx1we3StCzH5m5KzOfA/53GAUusH76sTkz3652tzDz2Y6S9dOTn8zaPYlDPtRYmH6/luWvgL8B/mchixu0xRL0/Xylwv+PycyDwAHgzAWpbuH5FRPdjrQfa4EnBlrR8PXVk4i4PiJeZibcblig2oahZz8i4qPA8sz8+kIWthAWS9D3/EqFPseU4lh6rf3oux8R8ftAC7hroBUNX189ycwvZeYvAzcDfzHwqobnsP2IiA8A9wA3LVhFC2ixBH3Pr1SYPSYiRoBTgf0LUt3C66cfx5K++hERnwC+CFyWmT9doNqG5Uj/jUwBVwy0ouHq1Y+TgfOATkTsAi4ENpbyB9nFEvT9fKXCRmBNtX0l8O2s/rpSIL9iolvPflT/W/5lZkJ+3xBqX
Gj99GTlrN1LgZcWsL6Fdth+ZOaBzFySmSsycwUzf8e5LDOfHk6582tRBH215v7eVyrsAB7JzBci4i8j4rJq2APAmRGxE7gReN/bpxa7fvoREb8ZEbuBq4AvR0SxX0HR57+Pu4BR4KvV7YRFvzH22ZPPRcQLEfEsM//NrHmfp1v0+uxHsfxkrCQVblFc0UuS6jPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkq3P8B+nv1Eh7kxG0AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "diff = d2.ratio - d1.ratio\n", - "diff.hist(bins=10)\n", - "diff.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFENJREFUeJzt3X+wXGddx/H3NzfbciPYW+1loLepqVKqwUoC19IZf1KQtqBNpgJtBcVRpyNaRZFqOzhF6zgUolB16mBHmEFFS4VaMxqNv8AfzLT0lhZKKJFYqvmhQxACMgk0Sb/+sWfTk83eu2fv3Xt3b573a+ZO9pzz7Nnne55zPnt29+wmMhNJUhnWjLoDkqSVY+hLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCrJ2VA98zjnn5IYNG0b18JK0Kj344IOfz8zpxd5/ZKG/YcMG5ubmRvXwkrQqRcR/LuX+vr0jSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVZGS/vaMy3fvQfrbt3M2BQ0c4d2qSGy+/iK2bZ0bdLakYhr5WzL0P7efmex7hyNHjAOw/dISb73kEwOCXVohv72jFbNu5+0Tgdxw5epxtO3ePqEdSeQx9rZgDh44MNF/S8Bn6WjHnTk0ONF/S8Bn6WjE3Xn4Rk62Jk+ZNtia48fKLRtQjqTx+kKsV0/mw1qt3pNEx9LWitm6eMeSlEfLtHUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCuJ1+tIK8qelNWqGvrRC/GlpjQPf3pFWiD8trXFg6EsrxJ+W1jhoFPoRcUVE7I6IPRFx0wLtXhkRGRGzw+uidHrwp6U1DvqGfkRMAHcAVwIbgesiYmOPds8Afh64f9idlE4H/rS0xkGTM/1LgD2Z+VhmPgHcBWzp0e43gLcDXx1i/6TTxtbNM7z16ouZmZokgJmpSd569cV+iKsV1eTqnRlgb216H/CieoOI2Aysz8y/iog3zbeiiLgeuB7g/PPPH7y30irnT0tr1Jqc6UePeXliYcQa4J3AL/VbUWbemZmzmTk7PT3dvJeSpKFoEvr7gPW16fOAA7XpZwDfDnw4Ih4HLgW2+2GuJI2fJqH/AHBhRFwQEWcA1wLbOwsz80uZeU5mbsjMDcB9wFWZObcsPZYkLVrf0M/MY8ANwE7gUeDuzNwVEbdGxFXL3UFJ0vA0+hmGzNwB7Oiad8s8bb9/6d2SJC0Hv5ErSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKkij0I+IKyJid0TsiYibeiz/6Yh4JCIejoh/i4iNw++qJGmp+oZ+REwAdwBXAhuB63qE+p9m5sWZuQl4O/COofdU
krRkTc70LwH2ZOZjmfkEcBewpd4gM79cm/w6IIfXRUnSsKxt0GYG2Fub3ge8qLtRRPws8EbgDOCyofROkjRUTc70o8e8U87kM/OOzPwW4FeAX+25oojrI2IuIuYOHjw4WE8lSUvWJPT3Aetr0+cBBxZofxewtdeCzLwzM2czc3Z6erp5LyVJQ9Ek9B8ALoyICyLiDOBaYHu9QURcWJt8BfCZ4XVRkjQsfd/Tz8xjEXEDsBOYAN6Tmbsi4lZgLjO3AzdExEuBo8AXgdctZ6clSYvT5INcMnMHsKNr3i21228Ycr8kScvAb+RKUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBGoV+RFwREbsjYk9E3NRj+Rsj4lMR8YmI+MeI+Kbhd1WStFR9Qz8iJoA7gCuBjcB1EbGxq9lDwGxmfgfwAeDtw+6oJGnpmpzpXwLsyczHMvMJ4C5gS71BZn4oMw9Xk/cB5w23m5KkYWgS+jPA3tr0vmrefH4S+JuldEqStDzWNmgTPeZlz4YRrwVmge+bZ/n1wPUA559/fsMuSpKGpcmZ/j5gfW36POBAd6OIeCnwZuCqzPxarxVl5p2ZOZuZs9PT04vpryRpCZqE/gPAhRFxQUScAVwLbK83iIjNwB/QDvzPDb+bkqRh6Bv6mXkMuAHYCTwK3J2ZuyLi1oi4qmq2DXg68OcR8XBEbJ9ndZKkEWrynj6ZuQPY0TXvltrtlw65X5KkZeA3ciWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCNQj8iroiI3RGxJyJu6rH8eyPiYxFxLCJeOfxuSpKGoW/oR8QEcAdwJbARuC4iNnY1+y/gx4E/HXYHJUnDs7ZBm0uAPZn5GEBE3AVsAT7VaZCZj1fLnlyGPkqShqTJ2zszwN7a9L5q3sAi4vqImIuIuYMHDy5mFZKkJWgS+tFjXi7mwTLzzsyczczZ6enpxaxCkrQETUJ/H7C+Nn0ecGB5uiNJWk5NQv8B4MKIuCAizgCuBbYvb7ckScuhb+hn5jHgBmAn8Chwd2buiohbI+IqgIj4zojYB7wK+IOI2LWcnZYkLU6Tq3fIzB3Ajq55t9RuP0D7bR9J0hjzG7mSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkHWjroD0ji496H9bNu5mwOHjnDu1CQ3Xn4RWzfPjLpbRXEMVoahvwp4MCyvex/az833PMKRo8cB2H/oCDff8wiA23mFDDoGHhOLF5k5kgeenZ3Nubm5kTz2atphug+GjrPXtXjLDz1vbPu9VCs5Rt912z+x/9CRU+bPTE3ykZsuW5bH7Gcp9a+G/bu7j4efOMYXDx89pV2vMeh1TEy2Jnjr1RePXZ3dhjE2EfFgZs4utg+nVejXN+hZky0i4NDhoydt3HHfYZoeDDB/v1fqoF+Ox7n3of382vZdHDpycs0BJO0QmO9xFtufC276a+Y7CmamJk9aH9B3H1uq+fbRH37hDB/69MF5+3Pu1CQv/tZpPvjg/kb3HaSvwxzr+U5kegngs7e94qR5
/Z6ke/UVGPkT4bCyp6jQX2jH67cjdXb8P7t/L8d71Dw12eLrzlx70sH8xcNHmYjgeCZTtQN80IO96U4IND4YevV7voO+OzDne+xB5u0/dOTEersfp7PNFgroXtun1zp7qR8o/e7b6xVR98nBl796lCeHcBgsNlybPtE32TZN2tTbLbRP9Du21rXWcGZrou9xMMiJTLf5jsuF2nefMLTWBAQcPf7UlunsQwvV3UuTE8v57tPriQoGf0VZTOj32vG6Q2aUOgdA9xPFE8eOc/jok33v3/RgXW36naEPctbXvd7J1pqBtu0g4zEsy1X/clvssVV/IjhrBNt7UL2Ou6Ucy4upv9ermQXbLzH0G32QGxFXAL8DTAB/mJm3dS0/E/gj4IXA/wLXZObji+1UL9t27j7lwOgM1qgDH+Dw0SdPDG6nP91nHAsZfQXLo1NX9wdz/c5+mqy3aZh0+jDIeAzLctW/3BZ7bNWPg1Fs70H1qm4px/Ji6j93arJRu2HpG/oRMQHcAfwAsA94ICK2Z+anas1+EvhiZj4nIq4F3gZcM8yOHhjTg0PNHTl6nG07dwODv411Oii9fp1qsjVx4u21ldLky1mXAHsy87HMfAK4C9jS1WYL8N7q9geAl0REDK+bK/9sOI6mJlvMrPLtcODQkZ6v2kpRev16yszU5EguIGkS+jPA3tr0vmpezzaZeQz4EvCN3SuKiOsjYi4i5g4ePDhQR2+8/CImWxON209NtljT52lnqM9Ky2yyNcGvXfU8PnLTZWMV/NH1bz/nVlfDNFnnzNQkt1+ziduv2cTEcM8h5tVaE7Qmmj3WutYazl7XGmj9TeqvW2rdq2Ef75zMBO0xb7pNV0NtvUy2Jrj9mk185KbLRnLFYJPQ77Vtu98Ka9KGzLwzM2czc3Z6erpJ/07YunmGt1598YnAm2/AOxv04be8jHe8etMpTxT1QHnNpef3fCKpH8ydg25qssXZ61pEdXtda/l+wWKyNcFrLz3/pAOhfkbQ9Amwvh4Y/kFy9roW77xmE4/f9greec2mE4/T2Wbdj9d5KbvQq7aZqckT6+wcFFs3z/Dbr35+35rPXtc68STRtObOWHe287ZXPZ9tr3z+Sdu+vs76vE/9xpU8dMvLuP2aU/ezXprU393+uhetP2XdvfaP1/bYlydbE7ymq92g22ehxx3GcVA/mflsNeZv+aHnLXhcdmrpdfzWj+9e22SQJ/V+1rXWDFz/2etaI788vMkHufuA9bXp84AD87TZFxFrgbOALwylhzWdAICTL4Oa7xLBzr8LXZI1+03fMJQvwfS7zHPQSyMX6sN8dfVbzyCXjg667vrYLPR4nTaDXq/cXXO/y+W695NBr6vvNb9p35pss6bX4W/dPNN4Hx1kX+61fXpdltvk8tv5tnGv7b3QNum3PRdTc6/l/caq6bHcbx9bru9xLFXfSzarEP934CXAfuAB4Ecyc1etzc8CF2fmT1cf5F6dma9eaL2j/EauRm81fGt0OZVevxZvRa7Tj4iXA7fTvmTzPZn5mxFxKzCXmdsj4mnAHwObaZ/hX5uZjy20TkNfkga3ItfpZ+YOYEfXvFtqt78KvGqxnZAkrQx/T1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQUb2f+RGxEHgP1fwIc8BPr+Cj7ecrGV8nU71WMt4uigzn7HYOzf67Z3lkJmD/aD+EkXE3FJ+pGicWMv4Op3qsZbxFBFL+qVK396RpIIY+pJUkJJC/85Rd2CIrGV8nU71WMt4WlItI/sgV5K08ko605ek4p0WoR8R6yPiQxHxaETsiog3VPO3RcSnI+ITEfEXETFVu8/NEbEnInZHxOWj6/2p5quntvxNEZERcU41HRHxu1U9n4iIF4ym56daqJaI+Llq+++KiLfX5o/l2Cywn22KiPsi4uGImIuIS6r54zwuT4uIj0bEx6tafr2af0FE3B8Rn4mI90fE
GdX8M6vpPdXyDaPsf90Ctbyv2oc+GRHviYhWNX/VjUtt+e9FxFdq04OPS2au+j/g2cALqtvPoP0fuW8EXgasrea/DXhbdXsj8HHgTOAC4D+AiVHX0a+eano9sJP2dxzOqea9HPgbIIBLgftHXUODsXkx8A/AmdWyZ4772CxQy98BV9bG4sOrYFwCeHp1uwXcX/Xxbtr/xzXAu4DXV7d/BnhXdfta4P2jrqFBLS+vlgXwZ7VaVt24VNOztP8v8q/U2g88LqfFmX5m/ndmfqy6/X/Ao8BMZv5dZh6rmt0HnFfd3gLclZlfy8zPAnuAS1a63/OZr55q8TuBXwbqH8ZsAf4o2+4DpiLi2SvZ5/ksUMvrgdsy82vVss9VdxnbsVmglgS+vmp2FnCguj3O45KZ2TljbFV/CVwGfKCa/15ga3V7SzVNtfwlEREr1N0FzVdLZu6oliXwUU4+/lfVuETEBLCN9rFfN/C4nBahX1e9vNlM+xmy7idoP7tD+0DdW1u2j6dCdazU64mIq4D9mfnxrmarop6usXku8D3VS9J/jojvrJqtxlp+AdgWEXuB3wJurpqNdS0RMRERDwOfA/6e9quqQ7UTpXp/T9RSLf8S8I0r2+P5ddeSmffXlrWAHwX+tpq1qsalquUGYHtm/ndX84HH5bQK/Yh4OvBB4Bcy88u1+W8GjgHv68zqcfexu4ypXg/t/r8ZuKVX0x7zxqqeHmOzFjib9svrG4G7qzOU1VjL64FfzMz1wC8C7+407XH3saklM49n5ibaZ8CXAN/Wq1n176qqJSK+vbb494F/ycx/raZXWy3fC7wK+L0ezQeu5bQJ/erZ/IPA+zLzntr81wE/CLymepkH7Wf29bW7n8dTL8nHQo96voX2e9wfj4jHaff5YxHxLMa8nnnGZh9wT/Vy9qPAk7R/H2U11vI6oHP7z3nq7aixrqUjMw8BH6b9BDwVEZ2fZ6n390Qt1fKzgC+sbE/7q9VyBUBEvAWYBt5Ya7baxuXFwHOAPdWxvy4i9lTNBh6X0yL0qzPEdwOPZuY7avOvAH4FuCozD9fush24tvrk+wLgQtrv+Y2FXvVk5iOZ+czM3JCZG2gP9gsy839o1/Nj1VUJlwJf6vEycCTmGxvgXtrvHxMRzwXOoP2DWGM7NgvUcgD4vur2ZcBnqtvjPC7TUV3NFhGTwEtpf0bxIeCVVbPXAX9Z3d5eTVMt/6faSdRIzVPLpyPip4DLgesy88naXVbbuDyYmc+qHfuHM/M51V0GH5d+n/Suhj/gu2m/pPkE8HD193LaHwLurc17V+0+b6b9HuZuqisvxuVvvnq62jzOU1fvBHBHVc8jwOyoa2gwNmcAfwJ8EvgYcNm4j80CtXw38CDtq47uB164CsblO4CHqlo+CdxSzf9m2k+ye2i/aulcXfW0anpPtfybR11Dg1qOVdu+M1ad+atuXLra1K/eGXhc/EauJBXktHh7R5LUjKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JB/h8teBQdlpRcPAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scatter(diff.index, diff)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So, there is no reason to take more than one bucket as Simhash cache as IQR ~ 0.003 — `depth=1` is good enough." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Is week-old bucket as good as the yesterday's one?" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_week():\n", - " row = []\n", - " for x in FILES[7:]:\n", - " with open('{:d}'.format(x - 7)) as fd:\n", - " cache = {binascii.unhexlify(_[2:66]) for _ in fd}\n", - " row.append(df_row_with_cache(x, cache, dyn=False))\n", - " return df_from_rows(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "d17 = calc_week()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 113.000000\n", - "mean 0.323854\n", - "std 0.044418\n", - "min 0.000000\n", - "25% 0.323235\n", - "50% 0.329020\n", - "75% 0.335014\n", - "max 0.352001\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHDxJREFUeJzt3X+Q3PV93/Hni+MQZ7tB2FymQT8s4QhqKCmy14pnqO2JDUh2GolxSC03aZUpHQ0u1DVOqGHIBEf5wxilTtMZJaCJmXFjuxK2Kb1O6iokYDfTqUArJMASVjhkG53kjpWIg3p0FXfHu3/s98RXq+/ufvdu73b3Pq/HzM3t9/v9fL/7+eznu+/v5/P5fnZXEYGZmaXhgm5nwMzMFo6DvplZQhz0zcwS4qBvZpYQB30zs4Q46JuZJcRB38wsIQ76ZmYJcdA3M0vIhd3OQL3LLrssVq1a1e1smJn1lf379/9tRAy3StdzQX/VqlVUq9VuZ8PMrK9I+lGZdB7eMTNLiIO+mVlCHPTNzBLioG9mlhAHfTOzhDjom5klxEHfzCwhDvpmZglx0DczS4iDvplZQhz0zcwS4qBvZpaQnvvCtZQ8duA42/cc4cT4BJcvHeKu9Vdx89pl3c6WmS1ipYK+pA3AHwEDwJ9GxP11228DbgemgZ8CWyPisKRVwAvAkSzp3oi4rTNZ72+PHTjOPY8+z8TkNADHxye459Hnqf7oFE9+/6QvBGY2L1oGfUkDwA7gRmAM2CdpJCIO55J9PSIezNJvBL4EbMi2vRQR13U22/1v+54jZwP+jInJab6292UiW565EAAO/HYe9xRtNsqM6a8DRiPiaES8DuwCNuUTRMRrucW3wtm4ZQ2cGJ8oXF//wk1MTrN9z5HCtJaumZ7i8fEJgjcbCI8dON7trFmPKxP0lwHHcstj2bpzSLpd0kvAA8Cnc5tWSzog6buSPlD0BJK2SqpKqp48ebKN7Pevy5cOlU7b6AJh6WrUU3QDwVopE/RVsO68lnxE7IiIdwGfA34nW/1jYGVErAU+C3xd0s8U7LszIioRURkebvlrX4vCXeuvYmhwoFTay5cO8diB41x//xOsvvvPuf7+J9yiS1yjhoAbCNZKmRu5Y8CK3PJy4EST9LuAPwGIiDPAmezx/qwncCXQc7+HuNDjozPH3r7nCMfHJxDFY2Ki1nW/c/fBJMb6F6Ieen0svEz+Ll86xPGCAJ/vQfZ6Oa07FNF8+F3ShcDfAB8BjgP7gH8WEYdyadZExIvZ418B7ouIiqRh4FRETEu6Avhr4NqIONXo+SqVSiz0b+TWz6QBGBoc4Asfv3ZOb5Kyb7rr73+i8A3cyoDEv/+n/2jRvJHnqx4W+jnK5GHmYj8gMR3Bsuz8AErlr1k5AD4/cojxiclznrdsOX2x6E+S9kdEpVW6li39iJiSdAewh9qUzYcj4pCkbUA1IkaAOyTdAEwCrwBbst0/CGyTNEVtOudtzQJ+tzQbH60/2cu+IRpNyYTzW+ez7ZJPRyyqFn879dCufKCtNzE5zW898iww/69j/XkxnTW6Zs6PiwcvKPUa5HuK+XMRzr9o1B+naL+Z47UzlbjZcRZCv16cup3vli39hbaQLf1mgWDGf/jEdQ3fENC49dSs9V7fumuVh1aWLR3if9394Vnv3ytW3/3nDYe4fnD/L8/6uEX1VmQhWvyz7dWVfQ3KHH9ocKDhOdxo//rhx8ELBILJ6Sg8znzrhR7bbMxnvjvW0l8sirrUjcbR82Za0gC/9cizZ1tmM+pbT/njN1LfuvvV9y7jW/uPn3MilMnbjBPjE231QHq1dVRmnDqvbFmKehBFOtWraGa2vbqys71aHX9AatqTKDuVePKN88/OZu+FpUODSDB+erIj59189grnUy/kO4nv3snPaYY3g26ZoDoxOc3nRw5xz6PPNwzkM8G7/vhlTExO8+T3T/KFj1/LsqVDiFrL/dffv7L
07J4A7tx9sOWc7V6f2100o2locODsUEJeO2VpJ9DOXEDna6ZUq+C9dGiw9GvQ7vGHBgcanpszr1E7U4mLNHovjE9M8srpybN1defug6yaw+vbr7OXeiHfSQzvzLZLvVAadd3rW7K/9A+Gz+sRNLN0aJC3LrmQE+MTXDI0yGv/b5KCBlrD4aFu9ArmevM7X+bLZzF8tnRokDNTb5zX/f7V9y5r++sxisoCjcfc8zdi220p53uyRb3EtwxewJLBAV45PUmRmXOg7FBYJ81meKNR/ZcZ6uxmb7dRvjsxKaPs8E4SQb/RWHGvaGdMvsx9iNmoDypFF5heGjMtW6czAbu+LEVj0u0MqbV6LVrNrmkV1Ftd4PPPX/RcM2VZVrKxcOlbBrnvV645e7yi2T/zScDStwy2vKg1awCVOT8X8l5APs+XZPX7yunJhufZXPPhoJ9TtqVfFAg6oaj1OGO2FT3fF7JGJ2Z976GT47TtaKf3lr9hXjT7pNnnJFodt9HFup2WaLOgXeb5Wz1X2dcqf6E4/fpUw17BQsj3rorqZ7a9rzL10omewGx7THOZlJHsjdyiCrtr/VUNKyB/otcHgk4YGhzg8xuvOee49bN3ZhMoG9307JRGAWd8YvJsCzDfElyoD4y1GsYocnx8gu17jhS+1s1mrLTSbBy2nbHbopt7Zco1c6xWz1V2vDj/4b9um5ic5qt7Xz67XPSdVF/d+zLLlg7xh7kZdtD8cxCtXquiKauf2X2Q3/tvh872hFp57MDxwkkfZSzE2P6iauk3azEtzXWvygTdZi3pVi2hTgT1Vrox9lrWfJV7ti3iGY16VXPpNdU3FmZz/2S2z9/pln4ZRb3WZnVQP3zVbHhjtlr1CvLpLh68oPB9OzOm3qzBV99AbOeeSllu6bepWYtpfGKSocGBc+bdN9OoJd3shtdCjnkXfTinnS55p994eZ1q9df32k6/PlVYv62myM5oNDVuLr2m4+MT3PWNZ88ZFmw0Ft5oFk6j529WR/ljFfVkW22fTf3X91pnO74+l5ZwkfqvJG901InJaSYmpwvLPh1xzledFMn3hO7cfZDP7D5YeM+kbKmKhqzKztKai0XV0i/TYip7JS0T1HttznuZnk7ZG4WdkJ+R0O5r1W5Ppv4DR40UzZQqeq653uit12x2RqNzLT9m3er+SavXdzYzwYpmQjW7cT2f9bvYzPaeRDNJ3sgt041t59OdvRbUy2gnz/WzC+Zjtkaj2TOtekVzvVHbqNfTzvRUOP9rBlq1Bhtpdd51c3psoxul891rbTS7pZH57J3Ot3ZnJ82m/pML+mWnmS2WryyYD83mvhfNPmo193tGo+GXZi3JdqZkFgWn+Rp+m+34eK+fd73SwGnUA5iZUgrnf86h1y8Gc5ne204dJDWm3853qyzEmFm/ajQ2XDSO2+xLuuo1GrvNzwSqvw/QaJy77JBDoy8km2sgazYTrJF+OO9uXrusJ3qxZeut1VDVzIWg2f2eVheLmXNtNjdnm930rbfQX82wKFr67Xw6sxdO7F422xZfs5tzZW+0Qu/cKG+m1Y3ITn/PjLXW7Lxtdc+kzPBWO7Nz2p291qkvGkyqpd9obuurE5McvO+mBc5Nf5tti29mn0ZvrrI3jWfqcr5a6p3QrKy9cFFKUbPztsy51Kqxkz9+o6Hk2dZ/u180OFeLIugv9ItmxZq9uSrvfHupaXr5OuuVIYcivXxRsvO1OpfaOddm0nbqPkirKbedtiiGd3p5KMDe1OrmbDvjoGaLyULO3lkULX23uvpDsw9B5cdJF/NvAJsVWchebamWvqQNwB9R+7nEP42I++u23wbcTu0nEX8KbI2Iw9m2e4Bbs22fjog9zZ6rG7+RawujUY+s0Ufje32ao1kvKdvSb/kjKpIGgB3AR4GrgU9Kurou2dcj4tqIuA54APhStu/VwGbgGmAD8MfZ8SxBN69ddt6PxXzh49cy3mCef6//IIZZPyozvLMOGI2IowC
SdgGbgMMzCSLitVz6t/JmT30TsCsizgA/kDSaHe9/dyDv1oeKurGNvuTKN+LNOq/MzyUuA47llseydeeQdLukl6i19D/dzr6WtnZ+JtHM5qZM0FfBuvNuBETEjoh4F/A54Hfa2VfSVklVSdWTJ0+WyJItJo2GfXwT16zzygzvjAErcsvLgRNN0u8C/qSdfSNiJ7ATajdyS+TJFplenpNvtpiUaenvA9ZIWi3pImo3ZkfyCSStyS3+MvBi9ngE2CxpiaTVwBrg6bln28zMZqNlSz8ipiTdAeyhNmXz4Yg4JGkbUI2IEeAOSTcAk8ArwJZs30OSHqF203cKuD0i0vwCbTOzHrAoPpFrZpa6js3TNzOzxcNB38wsIQ76ZmYJcdA3M0uIg76ZWUIc9M3MEuKgb2aWEAd9M7OEOOibmSXEQd/MLCEO+mZmCXHQNzNLiIO+mVlCHPTNzBLioG9mlhAHfTOzhDjom5klpFTQl7RB0hFJo5LuLtj+WUmHJT0n6a8kvTO3bVrSwexvpH5fMzNbOC1/I1fSALADuBEYA/ZJGomIw7lkB4BKRJyW9CngAeAT2baJiLiuw/k2M7NZKNPSXweMRsTRiHgd2AVsyieIiCcj4nS2uBdY3tlsmplZJ5QJ+suAY7nlsWxdI7cC384tXyypKmmvpJtnkUczM+uQlsM7gArWRWFC6TeACvCh3OqVEXFC0hXAE5Kej4iX6vbbCmwFWLlyZamMm5lZ+8q09MeAFbnl5cCJ+kSSbgDuBTZGxJmZ9RFxIvt/FPgOsLZ+34jYGRGViKgMDw+3VQAzMyuvTNDfB6yRtFrSRcBm4JxZOJLWAg9RC/g/ya2/VNKS7PFlwPVA/gawmZktoJbDOxExJekOYA8wADwcEYckbQOqETECbAfeBnxDEsDLEbEReDfwkKQ3qF1g7q+b9WNmZgtIEYXD811TqVSiWq12OxtmZn1F0v6IqLRK50/kmpklxEHfzCwhDvpmZglx0DczS4iDvplZQhz0zcwS4qBvZpYQB30zs4Q46JuZJcRB38wsIQ76ZmYJcdA3M0uIg76ZWUIc9M3MEuKgb2aWEAd9M7OEOOibmSWkVNCXtEHSEUmjku4u2P5ZSYclPSfpryS9M7dti6QXs78tncy8mZm1p2XQlzQA7AA+ClwNfFLS1XXJDgCViPgF4JvAA9m+bwfuA34RWAfcJ+nSzmXfzMzaUaalvw4YjYijEfE6sAvYlE8QEU9GxOlscS+wPHu8Hng8Ik5FxCvA48CGzmTdzMzaVSboLwOO5ZbHsnWN3Ap8e5b7mpnZPLqwRBoVrIvChNJvABXgQ+3sK2krsBVg5cqVJbJkZmazUaalPwasyC0vB07UJ5J0A3AvsDEizrSzb0TsjIhKRFSGh4fL5t3MzNpUJujvA9ZIWi3pImAzMJJPIGkt8BC1gP+T3KY9wE2SLs1u4N6UrTMzsy5oObwTEVOS7qAWrAeAhyPikKRtQDUiRoDtwNuAb0gCeDkiNkbEKUm/T+3CAbAtIk7NS0nMzKwlRRQOz3dNpVKJarXa7WyYmfUVSfsjotIqnT+Ra2aWEAd9M7OEOOibmSXEQd/MLCEO+mZmCXHQNzNLiIO+mVlCHPTNzBLioG9mlhAHfTOzhDjom5klxEHfzCwhDvpmZglx0DczS4iDvplZQhz0zcwS4qBvZpaQUkFf0gZJRySNSrq7YPsHJT0jaUrSLXXbpiUdzP5G6vc1M7OF0/I3ciUNADuAG4ExYJ+kkYg4nEv2MvCbwG8XHGIiIq7rQF7NzGyOWgZ9YB0wGhFHASTtAjYBZ4N+RPww2/bGPOTRzMw6pMzwzjLgWG55LFtX1sWSqpL2Srq5rdyZmVlHlWnpq2BdtPEcKyPihKQrgCckPR8RL53zBNJWYCvAypUr2zi0mZm1o0xLfwxYkVteDpwo+wQRcSL7fxT4DrC2IM3OiKhERGV4eLjsoc3MrE1lgv4+YI2k1ZIuAjYDpWbhSLpU0pL
s8WXA9eTuBZiZ2cJqGfQjYgq4A9gDvAA8EhGHJG2TtBFA0vskjQG/Bjwk6VC2+7uBqqRngSeB++tm/ZiZ2QJSRDvD8/OvUqlEtVrtdjbMzPqKpP0RUWmVzp/INTNLiIO+mVlCHPTNzBLioG9mlhAHfTOzhDjom5klxEHfzCwhDvpmZglx0DczS4iDvplZQhz0zcwS4qBvZpYQB30zs4Q46JuZJcRB38wsIQ76ZmYJcdA3M0tIqaAvaYOkI5JGJd1dsP2Dkp6RNCXplrptWyS9mP1t6VTGzcysfS2DvqQBYAfwUeBq4JOSrq5L9jLwm8DX6/Z9O3Af8IvAOuA+SZfOPdtmZjYbZVr664DRiDgaEa8Du4BN+QQR8cOIeA54o27f9cDjEXEqIl4BHgc2dCDfZmY2C2WC/jLgWG55LFtXxlz2NTOzDisT9FWwLkoev9S+krZKqkqqnjx5suShzcysXWWC/hiwIre8HDhR8vil9o2InRFRiYjK8PBwyUObmVm7ygT9fcAaSaslXQRsBkZKHn8PcJOkS7MbuDdl68zMrAtaBv2ImALuoBasXwAeiYhDkrZJ2ggg6X2SxoBfAx6SdCjb9xTw+9QuHPuAbdk6MzPrAkWUHZ5fGJVKJarVarezYWbWVyTtj4hKq3T+RK6ZWUIc9M3MEuKgb2aWEAd9M7OEOOibmSXEQd/MLCEO+mZmCXHQNzNLiIO+mVlCHPTNzBLioG9mlhAHfTOzhDjom5klxEHfzCwhDvpmZglx0DczS4iDvplZQkoFfUkbJB2RNCrp7oLtSyTtzrY/JWlVtn6VpAlJB7O/BzubfTMza8eFrRJIGgB2ADcCY8A+SSMRcTiX7FbglYj4eUmbgS8Cn8i2vRQR13U432ZmNgtlWvrrgNGIOBoRrwO7gE11aTYBX8kefxP4iCR1LptmZtYJZYL+MuBYbnksW1eYJiKmgFeBd2TbVks6IOm7kj4wx/yamdkctBzeAYpa7FEyzY+BlRHxd5LeCzwm6ZqIeO2cnaWtwFaAlStXlsiSmZnNRpmW/hiwIre8HDjRKI2kC4FLgFMRcSYi/g4gIvYDLwFX1j9BROyMiEpEVIaHh9svhZmZlVIm6O8D1khaLekiYDMwUpdmBNiSPb4FeCIiQtJwdiMYSVcAa4Cjncm6mZm1q+XwTkRMSboD2AMMAA9HxCFJ24BqRIwAXwb+TNIocIrahQHgg8A2SVPANHBbRJyaj4KYmVlriqgfnu+uSqUS1Wq129kwM+srkvZHRKVVOn8i18wsIQ76ZmYJcdA3M0uIg76ZWUIc9M3MEuKgb2aWEAd9M7OEOOibmSXEQd/MLCEO+mZmCXHQNzNLiIO+mVlCHPTNzBLioG9mlhAHfTOzhDjom5klxEHfzCwhpYK+pA2SjkgalXR3wfYlknZn25+StCq37Z5s/RFJ6zuXdTMza1fL38jNfth8B3AjMAbskzQSEYdzyW4FXomIn5e0Gfgi8AlJV1P7vdxrgMuBv5R0ZURMd7ogtrg9duA42/cc4cT4BJcvHeKu9Vdx89pl3c5WV/i1sLko09JfB4xGxNGIeB3YBWyqS7MJ+Er2+JvARyQpW78rIs5ExA+A0ex4ZqU9duA49zz6PMfHJwjg+PgE9zz6PI8dON7trC04vxY2V2WC/jLgWG55LFtXmCYipoBXgXeU3Nesqe17jjAxeW7ncGJymu17jnQpR93j18LmqkzQV8G6KJmmzL5I2iqpKql68uTJElmylJwYn2hr/WLm18LmqkzQHwNW5JaXAycapZF0IXAJcKrkvkTEzoioRERleHi4fO4tCZcvHWpr/WLm18LmqkzQ3weskbRa0kXUbsyO1KUZAbZkj28BnoiIyNZvzmb3rAbWAE93JuuWirvWX8XQ4MA564YGB7hr/VVdylH3+LWwuWo5eycipiTdAewBBoCHI+KQpG1ANSJGgC8DfyZplFoLf3O27yFJjwCHgSngds/csXbNzEzxjBW/FjZ
3qjXIe0elUolqtdrtbJiZ9RVJ+yOi0iqdP5FrZpYQB30zs4Q46JuZJcRB38wsIQ76ZmYJcdA3M0uIg76ZWUIc9M3MEuKgb2aWEAd9M7OEOOibmSXEQd/MLCE994Vrkk4CP5rDIS4D/rZD2ekVLlP/WIzlWoxlgsVXrndGRMsfJOm5oD9Xkqplvmmun7hM/WMxlmsxlgkWb7la8fCOmVlCHPTNzBKyGIP+zm5nYB64TP1jMZZrMZYJFm+5mlp0Y/pmZtbYYmzpm5lZA30V9CWtkPSkpBckHZL0b7P12yV9X9Jzkv6LpKW5fe6RNCrpiKT13ct9sUZlym3/bUkh6bJsWZL+Y1am5yS9pzs5b65ZuST9m6w+Dkl6ILe+L+tK0nWS9ko6KKkqaV22vl/q6mJJT0t6NivX72XrV0t6StKLknZLuihbvyRbHs22r+pm/os0KdPXsvPre5IeljSYre+LuuqIiOibP+DngPdkj/8e8DfA1cBNwIXZ+i8CX8weXw08CywBVgMvAQPdLkeZMmXLK4A91D63cFm27mPAtwEB7wee6nYZ2qyrXwL+EliSbfvZfq8r4C+Aj+bq5zt9VlcC3pY9HgSeyvL7CLA5W/8g8Kns8b8GHswebwZ2d7sMbZTpY9k2Af85V6a+qKtO/PVVSz8ifhwRz2SP/y/wArAsIv4iIqayZHuB5dnjTcCuiDgTET8ARoF1C53vZhqVKdv8h8C/A/I3XjYB/ylq9gJLJf3cQua5jCbl+hRwf0Scybb9JNuln+sqgJ/Jkl0CnMge90tdRUT8NFsczP4C+DDwzWz9V4Cbs8ebsmWy7R+RpAXKbimNyhQR/z3bFsDTnBsrer6uOqGvgn5e1qVcS+0KnvcvqV2xofaGPJbbNsabAbXn5MskaSNwPCKerUvWV2WC8+rqSuAD2bDAdyW9L0vWV+WqK9NngO2SjgF/ANyTJeubMkkakHQQ+AnwOLWe1niuMZXP+9lyZdtfBd6xsDlurb5MEfFUbtsg8M+B/5Gt6pu6mqu+DPqS3gZ8C/hMRLyWW38vMAV8bWZVwe49OV0pXyZqZbgX+N2ipAXrerJMUFhXFwKXUutC3wU8krUS+6ZcBWX6FHBnRKwA7gS+PJO0YPeeLFNETEfEddRavuuAdxcly/73RbnqyyTpH+Y2/zHwPyPir7PlvihTJ/Rd0M+u0N8CvhYRj+bWbwH+CfDrWdcNalfrFbndl/Nm17tnFJTpXdTGtZ+V9ENq+X5G0t+nT8oEDetqDHg060Y/DbxB7TtQ+qJcDcq0BZh5/A3eHJbqizLlRcQ48B1qF+Wlki7MNuXzfrZc2fZLgFMLm9PycmXaACDpPmAY+GwuWd/V1Wz1VdDPWoRfBl6IiC/l1m8APgdsjIjTuV1GgM3ZbIPVwBpq43g9o6hMEfF8RPxsRKyKiFXUTsj3RMT/oVamf5HNNng/8GpE/Lhb+W+kUV0Bj1EbK0bSlcBF1L70qi/rKnMC+FD2+MPAi9njfqmrYWUz3iQNATdQu1/xJHBLlmwL8F+zxyPZMtn2J3INrZ7QoEzfl/SvgPXAJyPijdwufVFXHdHtO8nt/AH/mFqX6zngYPb3MWo3/Y7l1j2Y2+deauOTR8hmWPTSX6My1aX5IW/O3hGwIyvT80Cl22Vos64uAr4KfA94Bvhwv9dVtn4/tdlHTwHv7bO6+gXgQFau7wG/m62/gtqFd5RaD2ZmxtXF2fJotv2KbpehjTJNZfUxU38z6/uirjrx50/kmpklpK+Gd8zMbG4c9M3MEuKgb2aWEAd9M7OEOOibmSXEQd/MLCEO+mZmCXHQNzNLyP8HOsc+MKhPP28AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scatter(d17.x, d17.ratio)\n", - "d17.ratio.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 113.000000\n", - "mean -0.129231\n", - "std 0.077366\n", - "min -0.463717\n", - "25% -0.141660\n", - "50% -0.132295\n", - "75% -0.122428\n", - "max 0.339009\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(d17.ratio - d1.ratio).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Yes, it's worse than fresh bucket, but that's still quite good." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Does dynamic cache update help?" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_dyn():\n", - " row = []\n", - " for x in FILES[1:]:\n", - " with open('{:d}'.format(x - 1)) as fd:\n", - " cache = {binascii.unhexlify(_[2:66]) for _ in fd}\n", - " row.append(df_row_with_cache(x, cache, dyn=True))\n", - " return df_from_rows(row) " - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "dd = calc_dyn()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean 0.608656\n", - "std 0.008876\n", - "min 0.556575\n", - "25% 0.604430\n", - "50% 0.609102\n", - "75% 0.613458\n", - "max 0.626283\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYMAAAD8CAYAAACVZ8iyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X+wXGd93/H3R1dX5joE5MQisWW5EkVWQoDa+MYhdfhhJUYuaW0XWjDQxp4OeAbGpThFrTR0wDHTYqJSOkncYUzjCYQfNhhHEYUgQ20o9cRGV0i2kYyCsEOkK1orti+p4xt8JX/7x56Vj47O7p7dPbvn7O7nNaPR3bNnd59nzznP93me8zzPKiIwM7PJtqzqBJiZWfUcDMzMzMHAzMwcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzA5ZXnYCsM888M9auXVt1MszMRsru3bv/OiJW9fr62gWDtWvXMjc3V3UyzMxGiqQf9vN6dxOZmZmDgZmZORiYmRkOBmZmhoOBmZlRMBhIukzSAUkHJW1psc+bJe2XtE/SZzPPvUDSvKQ/KCPRZmZWro5DSyVNATcDlwKHgV2SdkTE/tQ+64GtwMUR8aSkF2Xe5kPAN8tLtpmZlalIy+Ai4GBEPBIRzwC3AVdk9nkncHNEPAkQEY81n5B0IfBzwF3lJNnMzMpWJBisBg6lHh9OtqWdB5wn6V5J90m6DEDSMuCjwOZ2HyDpWklzkuaOHj1aPPVmZlaKIsFAOdsi83g5sB54HfBW4L9LWgm8G/hKRByijYi4JSJmI2J21aqeZ1ObmVmPiixHcRhYk3p8DnAkZ5/7ImIJeFTSARrB4VeBV0t6N/B8YIWkpyIi9ya0mZlVo0jLYBewXtI6SSuAq4AdmX22A5cASDqTRrfRIxHx9og4NyLWAu8DPuVAYGZWPx2DQUQcA64DdgIPA5+PiH2SbpR0ebLbTuBxSfuBe4DNEfH4oBJtZmblUkS2+79as7Oz4VVLzcy6I2l3RMz2+nrPQDYzMwcDMzNzMDAzMxwMzMwMBwMzM8PBwMzMcDAwMzMcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMxwMzMwMBwMzM8PBwMzMcDAwMzNgedUJMLPRs33PPNt2HuDIwiJnr5xh86YNXHnB6qqTZX1wMDCzrmzfM8/WOx9icek4APMLi2y98yEAB4QR5m4iM+vKtp0HTgSCpsWl42zbeaCiFFkZHAzMrCtHFha72m6jwcHAzLpy9sqZrrbbaHAwMLOubN60gZnpqZO2zUxPsXnThopSZGXwDWSzCdPvSKDmvh5NNF4cDMwmSFkjga68YLUL/zHjYGAneOz4+Gs3EsjHerJNXDBwgZfPY8cng0cCWSsTdQO5WeDNLywSPFfgbd8zX3XSKjduY8e375nn4pvuZt2WL3PxTXf7GCc8EshamahgMG4FXpnGqcbooH+qZnCcX1hEmec8EsigYDCQdJmkA5IOStrSYp83S9ovaZ+kzybbzpf058m2ByW9pczEd6tuBV6daq/jVGN00D9ZOjgCBJwICKtXzvDhN77cXYHW+Z6BpCngZuBS4DCwS9KOiNif2mc9sBW4OCKelPSi5Kmngd+KiO9LOhvYLWlnRCyUnpMCzl45c+KCyG4ftjL66Mu8/7F504aT0gO91xirvi9Tt6BftbzgGDQCwb1bNlaTqDaqPn8mVZGWwUXAwYh4JCKeAW4Drsjs807g5oh4EiAiHkv+/4uI+H7y9xHgMWBVWYnvVp0my/Rbey27K+TKC1bz4Te+nNUrZxC91xjr0EUzTq2cMoxScKzD+TOpiowmWg0cSj0+DPxKZp/zACTdC0wBN0TEV9M7SLoIWAH8oOfU9qlOk2X6vUAHMUSwjLHjdRi6WGYrpwxl1XR7fZ8yW8SDrrXX4fyZVEWCQfZ+EzRamdn3WQ+8DjgH+JaklzW7gyS
dBfwxcHVEPHvKB0jXAtcCnHvuuYUT34u6TJbp9wKta22vynSlC6oXzkzzvOllLDy9VGnQL2vIbj/vU1ZwHMbw47qe15OgSDfRYWBN6vE5wJGcff40IpYi4lHgAI3ggKQXAF8G/kNE3Jf3ARFxS0TMRsTsqlWV9SJ1pd+bv/12WdWtK6T5fWRrCU2DTle2e2FhcYm/W3qWj73lfO7dsrGyCkBZN7P7eZ+yugCHcWO+buf1JCnSMtgFrJe0DpgHrgLeltlnO/BW4I8knUmj2+gRSSuAPwE+FRFfKC/Z1SqjhtRvl1WdukKy30fWMNJV1+6Fsmq6/b5PGS3iYdTa63ReT5qOwSAijkm6DthJ437ArRGxT9KNwFxE7Eiee72k/cBxYHNEPC7pXwCvAX5W0jXJW14TEXsHkZlWyu7n7LfgyabnY285v+v0pIPJ/MIiU9JJtbRhFoB530fT6iF10dS1e6Gs/vo6jIQbRhrqdF+vW6M+CqrQchQR8RXgK5ltH0j9HcBvJ//S+3wa+HT/yezdIPo5+yl4ykxPc/+ql5FolW/B0IYu1qGwzFNWTbcONeZhpaFTK6aOhe44LOcydmsTZU+Up5851nf3QfY9V54+zZNPL52yX5GCp+zujDp0j9ShIK5DYZmnrJpuFTXmvEL3w298eaUFcV0L3Tpch/0aq2CQd6K0UrT7IO89p5eJ6SmxdPy526VFC56yuzPq0D1Sh4K4zt0LZY1gG+ZIuFaF7off+PJKJ6rVpdDNBspWZU3V3ZTdGKtg0K7vOqtorTXvPZeeDVbOTPNTpy2vdMz3IN4Pum+G16Ugrsuw4XFQl0I3qw6Vn7xAKU4dbw/Vd1N2Y2yCwfY9821bAmnd1FpbnWQLi0vccPkvdX1hlF2LLvv9em2GuyAeL3UodPPUoUuy1fIe2YBQh27KbozFqqXNAqyVlTPTPY+xbneS9TJNvqwx34N6Py/yZlDf8f51WFKmVUBsrvdUxnVYhbFoGbTrHpqZnuqpBt+UV/Nu6rXZXHYtOu/9eh1xUdcaYRXqOGplWOpwHyhPHbokW7VO6rrwX1FjEQzaFVT9Rufma997e/7UiDoWkv2MuKhDM7wO6jpqZVDqOHKolaq7JOsaKPs1FsGgXaQuaxRHc3JX3mfXTauunht27Dvl4m7u39x2yS+s4ou758fuRO9WXW+gDkJdRw7VVafWyai2KMciGAwjUvf6Gc0TozlL+HjEwGfltrvpvbDYmB8xv7DI5i88AOLEENn5hUW+uHueN124mnu+d3TkTuaytBuMUMeWYL8mKfCVpVXrZJRblGMRDPrtRyxSYPfyGdkT43g8V+gO8gRpN+45benZUwfDLS4d557vHT1RI2x+N9ffvnciAkOnwQjdtARHpYY46feJ+jlOg5jkWpWxCAbQ+03Ubgrsbvsq293YHuQJ0u6mdxHNQmCUazm96jQYoWhrc5S+u0m+T9TPcepmkuv8wiIX33R3rbuTxiYYZBU9yIMssDvVrHqpeRU5gfJaMU8/cyx3CY08zUJgErsPyhqMMOzvrp+CZZDdrK3SVVVBWGZNvptJrvBcGTT3wyf4Hw/86ESXbfo5qK6yMLbBoOjFOIgCu6lTd023Na9uajHZVkynZaab0oXAJHYflDUYYZjfXd55cf3te3nv7XsL3Z8a1A3RVufr3A+fOGmQwrAKwn5r8lm9HMvFpeN85r6/yp2tXHVFaywmneUpejF2KpD7aSrnTZBpalXzavejOWX9wEkrU9JJtd+6TjwapLImNQ3zu2s1IxaK/4bwlRes5t4tG3n0pt886ceA+vlN4lbn6+fuP1TJxMZea/Kt8trqWDYnubbS6gegoNqK1tgGg6IXYy8FdlHZAnhKjV8QzZuduH3PPOf/zl289/a9LS+8Mn7g5N4tG/mvbzk/t8D76Jv/wUlpqsNsz2HrZUZ3XgAf5nfX6fj3U9D2UwFpla7mfbmi+5el15p8q7y2OsY3XP5L3LtlY9uA0Eq
VFa2x7SYq2g+abiIPYvhnkZvO7bpw0k3Hsm70FR0ZVYfZnlXoZqBAuzH6ZUzaKtJFU2T0WBWr4rZKV/May9t/kFqlp7noZLfDiTtdH90O5Ki6oqVoEaWrMjs7G3Nzc6W8V93u1rdy8U13t72YBTx602/mBo2Z6amhr4EyKt/rMLQ6dmUsTVD0eBe5H9RrevrJX6v0v+nC1bkTG7s9j7s9Dzt9n4M4ltv3zPNvP/9Ay9ZQ0+nTyzhteoqFp5d6vqYk7Y6I2Z4Syhi3DGA409bLKBg71bKaNaY61NS7uYk9SkGjjms5FR0EkW3dlrl6ZjcjjbpZ0mL27/1MX+dGLzfNe6nJl9FVfH2LpWygEWias/6bo/2qGlk01i2DQSurpt6uZVBFzT9PemJenmztqS6tmCL6SesgWwbrtnw592Zjs6XYStlBuJf5OjDY492pNd3rZw+iAtPpHCnrHHLLoEJljSVv1bd4xunTfPCf9L7ialmKdENka8J1nqOQvuBfODPN3/zdEtnJ2K3Smi0sBrmWU6/3iMpuERd5vzKOdzcFcZGb5nlrcWW71/KeL/v87NTiqMsQ7okNBoPs3un2INah+6edIkPysgVUv9/NIGpo2/fMc8OOfSdN9kn/3SmteV0TzTHjRQcedJOvUVods4zj3c1M4CI3zbNrcaXfr9PnlXn+dbq+6zIDfCKDQVlLBZR5ENvVSKrue+90QecVUEW+m3azU8teyqHopLtWaYX24/mPR5z4Htodx27vtywuHT8l0ECj66FOFYd+r4VuWxa9LLmSfr9OQ2bLPv/aXd91CfpjO8+gnbJ+zWsYY8n7mfRTlnYXdKtx+J2+m3b5GsSvrXU74SjvOPY7nr9ovtLfDZwcaIBTvrfrb9/L2pxJisPU77XQbcsiO4dHBdPZfL92nzfsX/vrZW7LIExky2CUuneG3Rebp1XNpd0J2+m7aZevQfShdvPa7Ezspn7H87d6bn5hke175gt9N82/07KzjWH469v0ey300rJI17bz1hzKW4ur+X7tPq+KPvxhjHzsZCKDwbC6d/pVxrr6ZXS59Hqht/tu2l1wg+hDLbqsd7sgV6Rrol0a26UhfUz6KYyqvEnfz7XQb1dJ9rNbjW5qvl+7z2s1am6cl2GBCe0mGoVlFspaV7+sJm9zKYvs2jW9ardcyCCOT6tlR06fXsYZp093bJ5n+/Dh1K6JTmlst/RJ+pi0+26KHPdRXEiw7K6STu/X7vlW598lv7Cq5bph42AiWwZ1H70D5a2rX5dha1ntamZlHp/sMNLnTS/repZn3m9eNGfSZn8RDlrf3G3+3+n3tDvVkvtpndRZ2YMoOrVUWj2fd/5lhxDXYcnpsk1kMIDOJ0ovJ1+Zo356XVc/m4aVp0+37TutSqcCv4zut2whvrC4xMz0FB97y/ldvXer1lX6F+HyPq/VDyR16oYoEgyLzDauehRaWaqY9Z49/y6+6e7azpspy8QGg3Z66WcvezhkL+vq56VhepmYntKJ3zmG+nSJDfqmWVkT34q2rop+XpH+8XbfTbsbp4McntvKoINO0e91kHmuawu7TBN5z6CTXvrZyx6O1ku/eV4alp4NfmrF8sqHrRXV7vcculXWBVx0OfSin1dm/3ireznDGh45jKHPZQTjfk3Cb3u4ZZCjl0Kk7JpDL/3mrT7rx4tL7P3g63tKR1l6Wd9mUK2rbi/goiNduvm8QbeKhlWTHcayI0W/10Hlefueef72J8dO2V6XFnZZHAxy9FKIDGI4ZLcFRhXT2sss5MsuWMqa2Vk0MNdlJikM71wYdNDppiAeRJ5bzVyvy7phZSrUTSTpMkkHJB2UtKXFPm+WtF/SPkmfTW2/WtL3k39Xl5XwQeqli6YOw1WHnYaiXQRFm++DaF0NujtmUJ/Xr2GdC4PsPmmeX9n1o844fbqnWe+9aDWq7/QVy8cqEECBloGkKeBm4FLgMLBL0o6I2J/aZz2wFbg4Ip6U9KJk+88AHwR
maQx62J289snys1KeXrpo6jBcddhpKFqTL1rIt6rZvXBmuuVwzU4tk2HP7KzDTNJmOmDw58IgW0PdFsSDyPMk3DhuKtJNdBFwMCIeAZB0G3AFsD+1zzuBm5uFfEQ8lmzfBHwtIp5IXvs14DLgc+Ukf3B6uajrUBAMMw39FvLLpJOWYcgrWKaXib995lju6pNQ/oJi42QY58Igg04vBXHZea7LiqLDUCQYrAYOpR4fBn4ls895AJLuBaaAGyLiqy1e66t0TBS9UFot43A84qTCO69gyVtjpt06PeM29nuQBjUmv6zPq0NBXKf7QINWJBjkLQiY/QGm5cB64HXAOcC3JL2s4GuRdC1wLcC5555bIEn1Ni6TfTopeqE08573W7DZwjtbsKzb8uXcz+5lZJc9Z5jzEHr9vLIK4n6uxzp0/w5LkWBwGFiTenwOcCRnn/siYgl4VNIBGsHhMI0AkX7tN7IfEBG3ALdA42cvC6a9loZ9kVWpmwvlygta/xZsu8K7U+2w6prjqBr2L9H18nllFMRlLdQ4btduniLBYBewXtI6YB64CnhbZp/twFuBP5J0Jo1uo0eAHwD/SdIZyX6vp3GjeWzV+eceB6GbC6WXZn+36/SMaxO+bMO+Mdrr5/VbEE/a9diPjsEgIo5Jug7YSeN+wK0RsU/SjcBcROxInnu9pP3AcWBzRDwOIOlDNAIKwI3Nm8njapJGH3Srl2Z/0XV6xr0JX7ZeRm4N4vMG3Yrz9VicIurVKzM7Oxtzc3NVJ6NnF990d8s1hdKLmk2qSbmfUnd5k6mmlwnEKetY9TJXInucs6t+9vPe3Zik61HS7oiY7fX1noFcskkafdCLSel/rbtuR27120//xd3zuUt+D/pc8PVYnINBySZp9IGNtjJGbuUpuuT3MPh6LM7BYABc+7VRVFa/ft366X09FuMlrM0MKG9tn0lY7nkcORiYGVDeQnt1WLTRuuduIrOaqMNIqzK6VNxPP5ocDGxk1KGwHJRxm7nufvrR424iGwnD+HnFKg3rZyrNWnEwsJEw7oVl3Ubg2ORxMLCRMO6FpUfgWNUcDGwkjHth6RE4VjUHAxsJ415Y1un3k20yeTSRjYRJGK7oEThWJQcDGxkuLM0Gx91EZmbmYGBmZg4GZmaGg4GZmeFgYGZmOBiYmRkOBmZmhoOBmZnhYGBmZjgYmJkZDgZmZoaDgZmZ4WBgZmY4GJiZGQ4GZmaGg4GZmeFgYGZmOBiYmRkFg4GkyyQdkHRQ0pac56+RdFTS3uTfO1LP/a6kfZIelvR7klRmBszMrH8dfwNZ0hRwM3ApcBjYJWlHROzP7Hp7RFyXee0/BC4GXpFs+t/Aa4Fv9JluMzMrUZGWwUXAwYh4JCKeAW4Drij4/gE8D1gBnAZMA/+3l4SamdngFAkGq4FDqceHk21Zb5L0oKQ7JK0BiIg/B+4BfpT82xkRD2dfKOlaSXOS5o4ePdp1JszMrD9FgkFeH39kHn8JWBsRrwC+DnwSQNJLgF8EzqERQDZKes0pbxZxS0TMRsTsqlWrukm/mZmVoEgwOAysST0+BziS3iEiHo+InyQPPwFcmPz9T4H7IuKpiHgK+DPgVf0l2czMylYkGOwC1ktaJ2kFcBWwI72DpLNSDy8Hml1BfwW8VtJySdM0bh6f0k1kZmbV6jiaKCKOSboO2AlMAbdGxD5JNwJzEbEDeI+ky4FjwBPANcnL7wA2Ag/R6Fr6akR8qfxsmJlZPxSR7f6v1uzsbMzNzVWdDDOzkSJpd0TM9vr6ji0DszravmeebTsPcGRhkbNXzrB50wauvCBvkJuZFeFgYCNn+555tt75EItLxwGYX1hk650PATggmPXIaxPZyNm288CJQNC0uHScbTsPVJQis9HnYGAj58jCYlfbzawzBwMbOWevnOlqu5l15mBgI2fzpg3MTE+dtG1meorNmzZUlCKz0ecbyDZymjeJPZrIrDwOBjaSrrxgtQt/sxK5m8jMzBw
MzMzMwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMxwMzMwMBwMzM8PBwMzMcDAwMzMcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMxwMzMwMBwMzM8PBwMzMKBgMJF0m6YCkg5K25Dx/jaSjkvYm/96Reu5cSXdJeljSfklry0u+mZmVYXmnHSRNATcDlwKHgV2SdkTE/syut0fEdTlv8SngP0bE1yQ9H3i230SbmVm5irQMLgIORsQjEfEMcBtwRZE3l/RSYHlEfA0gIp6KiKd7Tq2ZmQ1EkWCwGjiUenw42Zb1JkkPSrpD0ppk23nAgqQ7Je2RtC1paZiZWY0UCQbK2RaZx18C1kbEK4CvA59Mti8HXg28D/hl4MXANad8gHStpDlJc0ePHi2YdDMzK0uRYHAYWJN6fA5wJL1DRDweET9JHn4CuDD12j1JF9MxYDvwyuwHRMQtETEbEbOrVq3qNg9mZtanIsFgF7Be0jpJK4CrgB3pHSSdlXp4OfBw6rVnSGqW8BuB7I1nMzOrWMfRRBFxTNJ1wE5gCrg1IvZJuhGYi4gdwHskXQ4cA54g6QqKiOOS3gf8T0kCdtNoOZiZWY0oItv9X63Z2dmYm5urOhlmNgTb98yzbecBjiwscvbKGTZv2sCVF+SNT7FOJO2OiNleX9+xZWBmNgjb98yz9c6HWFw6DsD8wiJb73wIwAGhAl6OwswqsW3ngROBoGlx6Tjbdh6oKEWTzcHAzCpxZGGxq+02WA4GZlaJs1fOdLXdBsvBwMwqsXnTBmamT16QYGZ6is2bNlSUosnmG8hmVonmTWKPJqoHBwMzq8yVF6x24V8T7iYyMzMHAzMzczAwMzMcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMxwMzMwMBwMzM8PBwMzMcDAwMzMcDMzMDAcDMzPDwcDMzHAwMDMzHAzMzAwHAzMzw8HAzMxwMDAzMxwMzMwMBwMzM6NgMJB0maQDkg5K2pLz/DWSjkram/x7R+b5F0ial/QHZSXczMzKs7zTDpKmgJuBS4HDwC5JOyJif2bX2yPiuhZv8yHgm32l1MzMBqZIy+Ai4GBEPBIRzwC3AVcU/QBJFwI/B9zVWxLNzGzQigSD1cCh1OPDybasN0l6UNIdktYASFoGfBTY3HdKzcxsYIoEA+Vsi8zjLwFrI+IVwNeBTybb3w18JSIO0YakayXNSZo7evRogSSZmVmZOt4zoNESWJN6fA5wJL1DRDyeevgJ4CPJ378KvFrSu4HnAyskPRURWzKvvwW4BWB2djYbaMzMbMCKBINdwHpJ64B54CrgbekdJJ0VET9KHl4OPAwQEW9P7XMNMJsNBGZmVr2OwSAijkm6DtgJTAG3RsQ+STcCcxGxA3iPpMuBY8ATwDUDTLOZmZVMEfXqlZF0FPhhRR9/JvDXFX122cYlL+OSD3Be6mpc8rIhIn661xcX6SYaqohYVdVnS5qLiNmqPr9M45KXcckHOC91NS55kTTXz+u9HIWZmTkYmJmZg0HWLVUnoETjkpdxyQc4L3U1LnnpKx+1u4FsZmbD55aBmZlNTjCQtEbSPZIelrRP0r9Jtm+T9L1kXaU/kbQy9ZqtybLdByRtqi71J2uVl9Tz75MUks5MHkvS7yV5eVDSK6tJ+ana5UXSv06++32Sfje1vXbHpc35db6k+5Kl3eckXZRsr/MxeZ6kb0t6IMnL7yTb10m6X9L3Jd0uaUWy/bTk8cHk+bVVpj+tTV4+k5w/35V0q6TpZPvIHZfU878v6anU4+6OS0RMxD/gLOCVyd8/DfwF8FLg9cDyZPtHgI8kf78UeAA4DVgH/ACYqjof7fKSPF5DY4LgD4Ezk21vAP6MxjpTrwLurzoPBY7LJTTWuTotee5FdT4ubfJxF/CPUsfhGyNwTAQ8P/l7Grg/SePngauS7R8H3pX8/W7g48nfV9FYzr7yfHTIyxu
S5wR8LpWXkTsuyeNZ4I+Bp1L7d3VcJqZlEBE/iojvJH//PxpLZqyOiLsi4liy23001l6CxjLdt0XETyLiUeAgjeW8K9cqL8nTHwP+HScvJngF8KlouA9YKemsYaa5lTZ5eRdwU0T8JHnuseQltTwubfIRwAuS3V7Ic+t61fmYREQ0a5jTyb8ANgJ3JNs/CVyZ/H0Fzy1OeQfw65LyFrgculZ5iYivJM8F8G1Ovu5H6rio8Zsz22hc92ldHZeJCQZpSXPpAhqRNe1f0agVQPGluyuVzkuyJMh8RDyQ2W3k8gKcR2ORw/slfVPSLye71T4vmXy8F9gm6RDwn4GtyW61zoekKUl7gceAr9FogS2kKk7p9J7IS/L8j4GfHW6KW8vmJSLuTz03DfxL4KvJppE6LklergN2xHPrwzV1dVwmLhhIej7wReC9EfE3qe3vp7G20meam3JeXquhV+m80Ej7+4EP5O2as622eUmOy3LgDBpN9c3A55NaTa3zkpOPdwHXR8Qa4HrgD5u75ry8NvmIiOMRcT6NGvNFwC/m7Zb8P1J5kfSy1NP/DfhfEfGt5PGo5eU1wD8Hfj9n967yMlHBIKkFfBH4TETcmdp+NfCPgbcnzUYosHR3lXLy8vdp9KE/IOkvaaT3O5J+ntHLCzTSfGfSNP428CyNNWRqm5cW+bgaaP79BZ7r0qptPtIiYgH4Bo2gvFJScwmbdHpP5CV5/oU0FqyslVReLgOQ9EFgFfDbqd1G7bhcArwEOJhc96dLOpjs1tVxmZhgkNQq/xB4OCL+S2r7ZcC/By6PiKdTL9kBXJXckV8HrKfRt1i5vLxExEMR8aKIWBsRa2mcCK+MiP9DIy+/lYyUeBXw45wmZSVaHRdgO40+aiSdB6ygsZhYLY9Lm3wcAV6b/L0R+H7yd52PySolo+okzQC/QeMeyD3AP0t2uxr40+TvHcljkufvTlWqKtUiL9+T9A5gE/DWiHg29ZJROy67I+LnU9f90xHxkuQl3R2XdneXx+kf8Gs0mkgPAnuTf2+gcQPyUGrbx1OveT+NvtIDJCNC6vCvVV4y+/wlz40mEnBzkpeHaPyuROX56HBcVgCfBr4LfAfYWOfj0iYfvwaXzEsVAAAAdUlEQVTspjEC6n7gwhE4Jq8A9iR5+S7wgWT7i2kE3oM0WjnNkV7PSx4fTJ5/cdV5KJCXY8l33zxWze0jd1wy+6RHE3V1XDwD2czMJqebyMzMWnMwMDMzBwMzM3MwMDMzHAzMzAwHAzMzw8HAzMxwMDAzM+D/A7hkxgVTlkJAAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scatter(dd.x, dd.ratio)\n", - "dd.ratio.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGn1JREFUeJzt3X+QXWd93/H3R6uVWQfKuvUyQWspUohQa2JqwcbxjNskqBAZaGQP0CAPaWHa1FNaNbEhauQhY6j7hxWUhkwz6hC3YSZJSS0DrrItSkRTQ394xkYrJNnIRkUxJtYqHZTghWS04JX87R/3XOns1bn3nnN/7L13n89rRqO955577/Occ+7nPM9zflxFBGZmloY1gy6AmZmtHIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWkLWD+uDrr78+Nm3aNKiPNzMbSceOHfvziJjq9PUDC/1NmzYxNzc3qI83MxtJkr7Zzes9vGNmlhCHvplZQhz6ZmYJceibmSXEoW9mlhCHvplZQhz6ZmYJceibmSXEoW9mlhCHvplZQhz6ZmYJKRX6km6XdFrSGUl7m8zzs5KekXRK0u/3tpi2mhw6Ps9t+x5j897Pc9u+xzh0fH7QRTJLRtsbrkkaAw4AbwPOAkclzUbEM7l5tgD3AbdFxIuSXtOvAttoO3R8nvsefZrFpUsAzC8sct+jTwNw57bpQRbNLAllWvq3AGci4rmIeAl4GLijYZ5/ChyIiBcBIuJbvS2mrRb7j5y+HPh1i0uX2H/k9IBKZJaWMrdWngZeyD0+C/x4wzyvB5D0ODAGfCwi/qjxjSTdDdwNsHHjxk7KayPu3MJipemr0aHj8+w/cppzC4usn5xgz46t7uXYiinT0lfBtGh4vBbYAvwUcBfwHyVNXvWiiIciYiYiZqamOv4NABth6ycnKk1fberDW/MLiwRXhrd8XMNWSpnQPwtsyD2+AThXMM8fRMRSRHwDOE1tJ2C2zJ4dW5kYH1s2bWJ8jD07tg6oRCvLw1s2aGVC/yiwRdJmSeuAXcBswzyHgLcASLqe2nDPc70sqK0Od26b5sF33cT05AQCpicnePBdNyUzvOHhLRu0tmP6EXFR0m7gCLXx+k9FxClJDwBzETGbPffTkp4BLgF7IuIv+llwG113bptOJuQbrZ+cYL4g4FMZ3rLBK/UbuRFxGDjcMO3+3N8BfCj7Z2ZN7Nmxddkpq5DW8JYN3sB+GN0sRfUejs/esUFx6JutsJSHt2zwfO8dM7OEOPTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS0ip0Jd0u6TTks5I2lvw/AcknZd0Ivv3870vqpmZdWttuxkkjQEHgLcBZ4GjkmYj4pmGWQ9GxO4+lNHMzHqkTEv/FuBMRDwXES8BDwN39LdYZmbWD2VCfxp4Iff4bDat0bslPSXps5I29KR0ZmbWU2VCXwXTouHxfwU2RcQbgT8GfqfwjaS7Jc1Jmjt//ny1kpqZWdfKhP5ZIN9yvwE4l58hIv4iIr6fPfwPwJuL3igiHoqImYiYmZq
a6qS8ZmbWhTKhfxTYImmzpHXALmA2P4Ok1+Ye7gSe7V0RzcysV9qevRMRFyXtBo4AY8CnIuKUpAeAuYiYBX5B0k7gIvBt4AN9LLOZmXVIEY3D8ytjZmYm5ubmBvLZZmajStKxiJjp9PW+ItfMLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhDj0zcwS4tA3M0uIQ9/MLCEOfTOzhJQKfUm3Szot6YykvS3me4+kkDTTuyKamVmvtA19SWPAAeDtwI3AXZJuLJjvVcAvAE/2upBmZtYbZVr6twBnIuK5iHgJeBi4o2C+fwN8HPheD8tnZmY9VCb0p4EXco/PZtMuk7QN2BAR/62HZTMzsx4rE/oqmBaXn5TWAJ8APtz2jaS7Jc1Jmjt//nz5UpqZWU+UCf2zwIbc4xuAc7nHrwJ+FPiSpOeBW4HZooO5EfFQRMxExMzU1FTnpTYzs46UCf2jwBZJmyWtA3YBs/UnI+I7EXF9RGyKiE3AE8DOiJjrS4nNzKxjbUM/Ii4Cu4EjwLPAIxFxStIDknb2u4BmZtY7a8vMFBGHgcMN0+5vMu9PdV8sMzPrB1+Ra2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCSoW+pNslnZZ0RtLeguf/maSnJZ2Q9H8k3dj7opqZWbfahr6kMeAA8HbgRuCuglD//Yi4KSJuBj4O/HrPS2pmZl0r09K/BTgTEc9FxEvAw8Ad+Rki4ru5hz8ARO+KaGZmvbK2xDzTwAu5x2eBH2+cSdK/AD4ErAO2F72RpLuBuwE2btxYtaxmZtalMi19FUy7qiUfEQci4nXALwO/UvRGEfFQRMxExMzU1FS1kpqZWdfKhP5ZYEPu8Q3AuRbzPwzc2U2hzMysP8qE/lFgi6TNktYBu4DZ/AyStuQevhP4eu+KaGZmvdJ2TD8iLkraDRwBxoBPRcQpSQ8AcxExC+yW9FZgCXgReH8/C21mZp0pcyCXiDgMHG6Ydn/u71/scbnMzKwPfEWumVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCHPpmZglx6JuZJcShb2aWEIe+mVlCSoW+pNslnZZ0RtLeguc/JOkZSU9J+h+Sfqj3RTUzs261DX1JY8AB4O3AjcBdkm5smO04MBMRbwQ+C3y81wU1M7PulWnp3wKciYjnIuIl4GHgjvwMEfHFiLiQPXwCuKG3xTQzs14oE/rTwAu5x2ezac38E+APuymUmZn1x9oS86hgWhTOKP0cMAP8ZJPn7wbuBti4cWPJItooO3R8nv1HTnNuYZH1kxPs2bGVO7e1ajOYWT+VaemfBTbkHt8AnGucSdJbgY8AOyPi+0VvFBEPRcRMRMxMTU11Ul4bIYeOz3Pfo08zv7BIAPMLi9z36NMcOj4/6KKZJatM6B8FtkjaLGkdsAuYzc8gaRvwW9QC/1u9L6aNov1HTrO4dGnZtMWlS+w/cnpAJTKztqEfEReB3cAR4FngkYg4JekBSTuz2fYDrwQ+I+mEpNkmb2cJObewWGm6mfVfmTF9IuIwcLhh2v25v9/
a43LZKrB+coL5goBfPzkxgNKYGfiKXOujPTu2MjE+tmzaxPgYe3ZsHVCJzKxUS9+sE/WzdHz2jvksruHh0Le+unPbtL/ciaufxVU/qF8/iwtYddvGKOzcHPpm1lfNzuL62OypoQ/IKkZl5+bQN7OOlWnZNjtba2FxiYXFJWB4A7KKVqco1+s0DD0Bh76tesPwRVuNyrZsm53F1agxIEdNu1OUh6Un4LN3bFXzVcH9U/biu6KzuJoZ5Ws4mp2KXJ8+LBcrJt/Sr7cC5xcWGZO4FMF0F61BtyqHS5kut3Wm7MV3RWdxXXjpIi9eWLrqtfWAHOT3qNPP3rNj67KWPCw/RXlYLlYc+dDvZuNo7G5ditp95Drtdg1L92016nQ9D8sXbVh18/2pcvFd41lcjd8VuBKQvfweVa1fN5/d7hTlYblYURGFN8zsu5mZmZibm+v49YeOz/Ox2VOXDwTVTYyP8eC7biq1cdy277G2Y42Nrf5WG1Gz95uenODxvdvLVm2gVrqFlf+8V0+MI8HChaVln10UEKJ2q9d2vbJerJPV2nvrZrk2e32V71+z5dqr71GV8uV7/EV68R3udnnVSToWETOdlmMkW/pFC6+uTNe93QrOy+/pgZatgEG0KnsZSCvdwmr8vPwOPP/ZRUM0UTBfURnbdbnL1KNomcx989t88WvnR3pH0M1yzU/vdPtrdg1HL75Hh47P8+FHTl7uvdcV5UOrPKmbX1hk2wNfuKpBUsWwXKw4Ui39smEt4Bv73ln4+qLeQRnTWResVUug2Thl2VZCmVZv4/xFG+t1147z0Z95A1BtA+u2hZVfP/UWY11Ri6ZMT6vxfZppVcaqyzWvTBmhfQtyGHYOjWUpU68xiX/7s38bWLmw6nQ7bLX95TXmQ9l1nNeLHk2nkmnpl9kb1+XHyMpuCO20a2XMLywyvkaMj4mlS1c+pWyrsmyrN7+xFLXUAF68sMSez5wEcbksRe9RNgTKtLAay9+4nItaWGXet+z6yp8WV/QFKxomKtOaLdu6XFy6xIcfObnsvdp9XpUw6OaEg6LGTtnvw6WIUttSNxqXw1v+5hSfOzZfqXfWbvvLaxxD76QnXnZEoWi5D/o438iEfrOAa5TfOKpsCMDlL1OR9W1a+gBLLwei1tKu0g1s1hXNqxqaSy9f/V759ygKpGYh8OqJcW7b91jLcCqzfhrLW7a1WcYaiV859PSysGj8gnVyJk+VMl6KKP150HqoEJo3WBpPOGg11NSqsRSU60m125YaVd2ZNS6Hzx2b591vnm45fNbYe/vu95YoKOZVinYenW6H8wuLHDo+33QZdDME3U8jM7yzee/n226c146v4ZrxscuB22y4pUi969jqYAtQqrdRpuvXSQ+kF91SaD0UVUZR/cqsn8bueZXeWxnNlmP9c5uVsb5ci8IKyq3zqp/XLGhabYdlTYyPXQ7NskM4lyIq94SLhlGrHqxstQ0368l0umwETDY0yIDCId+yy6JZ3dp9N5sNQZfR7fDOyFyc1eq0punJCX7u1o0E4sULS5cvwikbavm9/53bpnnwXTcxPTmBsveuB3695Tamop8NvqLZBReHjs9z277H2LT389x78MTljaLsFy1/DnN9o2pdkmJVlk2RxvodOj7PmjbLpKiF1bisJyfGWVOiQpMT44XroNlynF9Y5LZ9jzF57Xjh8+snJwov4rr34AnuOXiCa9auKVWu/Oe97r7DTcuzfnKi7cHKsj3bIotLl/j0E39aqYcyMT7G+27d2Hbbzgtq4Za/0K1Z7+bDj5wsvCCuVW+12YV0nSyb8TVi7djyfNjzmZPs+ezJqwL/umvHed+tG0tdULa4dIl7Dp64ajm0GzIa5G9KjExLv10LotNWb/2gZ9VT08bXaNk4Z6PGPXm3rdqqvY1+y7eOm5Wnyul/de2WU3053HvwROXjM83W2XXZzqDVjrDd+i6rXv5mJyS06yX0U309Vd2+8t/DVuXO90DaXaSVNzk
xzg9cs7bSAWhYvv112uvPDyGVOQGknietTjjp5DTNvG5b+iMT+tB6rLDsl6STIGq2Q5mcGOcvv3excCw+v6FWGXPMl3Py2nFevLC07MBds423/nm9GiNvp93n1c/66Paq5mZn23S6k5+cqAV8J2dwNYZP0QHHVvLbXKtz5FsdW+qnxmGuKsu3HpRlhjXyNetkZ1pm6KVx+6uyE2029FLlTK53v3m6cNso08hsW76UQr+VTi60aqfdxi/gE++9uXIvoJ1OWvWtWt7NviSd7ijK1K+bMcsyuhnX7fTAXbMx7E5PI656XKcXO4UqpxZX7W1MZ8u16rGBTrbDVp/R6enBdc1ODa2yzdWzph+nuSYzpt9Ou5s6CXh87/bKt2hotaGsn5woPAbwylesrRz49ZHU+jGEZmd/tCoLFB+TKBqfnBgf42M738Dje7fzG++9+arn66efNrru2vFS9ev3mGW+nlW0Gk8v89qicjy+d3vbcrR7bZmD4J947808v++d/MmD7yhcZ61MjI/xG++9mcf3buejP/OGUj9jWWUdiitntlXdHX1ncanpdthMvbdePxZ03bXjy47BNX7Py970rdWpoVW2uXMLi5fX7zf2vbNS9vTbyJyy2U59gTY79bFqCLUL3MaDv41nspTRbqipbDg1bqhFVzrO/NBfX3aed/5gbLMrBYum1cdtq5SnX+r1bNaKK7pArN766uRinFZ1ajUW3u617dZzvcGSV+UmZmPSsiAse2VoUZ2KennNWt2N05vNl2+wlK1T1dsi1N/7noMnms5TZiSg2TUfzeo0jFZN6MOVFdvNZfd1rb6I7TaOMsMHZca8y75PmYNCRcum8dzwovcomtaqXN3cobRTzW610Opc7zLd9OmsV1Cma54PrKoXULVbz80CpHGdVTldstn6blanVo2BZmWvN2iqXHRVtk6dNCrqvede3NenXsZm9/9aiUZPp1bNmH5eLy577uaWBGXPQClz0Va7cKoydt6rm491e6O7fujkbor9vsFWWa3Wc9XlOojbPlTZrjopXz/vLwXdb7srvcx9ILdPenkHwar3eyl6n16EU7uLk8qUpdW9foZlzLKKfoRAp+Xo5e86rKRhWYZlDdP9kDrh0O+jYdo4evHF6ralvxpuHV1kmNbzqFqJZej1VOPQT0i3G323O45uewpmnRq13kQ/JXOXTSt38K3d66HzW+QOyy//WHr8s5e949BPTDc7jm5/kMSsU/7Zy95ZNRdnWf81uxmdW1rWb816k+5lVueWvlXS7RCTWSfcy+ydUi19SbdLOi3pjKS9Bc//hKSvSLoo6T29L6aZpcy9zN5p29KXNAYcAN4GnAWOSpqNiGdys/0p8AHgl/pRSDMz9zJ7o8zwzi3AmYh4DkDSw8AdwOXQj4jns+de7kMZzcysR8oM70wDL+Qen82mVSbpbklzkubOnz/fyVuYmVkXyoR+0e+ndXRFV0Q8FBEzETEzNTXVyVuYmVkXyoT+WWBD7vENwLn+FMfMzPqpTOgfBbZI2ixpHbALmO1vsczMrB/ahn5EXAR2A0eAZ4FHIuKUpAck7QSQ9GOSzgL/APgtSaf6WWgzM+tMqYuzIuIwcLhh2v25v49SG/YxM7Mh5tswmJklxKFvZpYQh76ZWUIG9iMqks4D3xzIh8P1wJ8P6LN7abXUA1yXYbRa6gGrqy5bI+JVnb54YHfZjIiBXZ0laa6bX54ZFqulHuC6DKPVUg9YfXXp5vUe3jEzS4hD38wsIamG/kODLkCPrJZ6gOsyjFZLPcB1uWxgB3LNzGzlpdrSNzNL0qoLfUkbJH1R0rOSTkn6xWz6fklfk/SUpP8iaTL3mvuyn4I8LWnH4Eq/XLO65J7/JUkh6frssST9u6wuT0l602BKvlyrekj6l9lyPyXp47npI7VOJN0s6QlJJ7LfjLglmz6U6wRA0iskfVnSyawu/zqbvlnSk5K+LulgdqNFJF2TPT6TPb9pkOWva1GPT2fbz1c
lfUrSeDZ95NZJ7vnflPRXucfV10lErKp/wGuBN2V/vwr4v8CNwE8Da7Ppvwr8avb3jcBJ4BpgM/AnwNig69GqLtnjDdRugvdN4Pps2juAP6T2Gwi3Ak8Oug5t1slbgD8Grsmee82orhPgC8Dbc+vhS8O8TrKyCXhl9vc48GRWxkeAXdn0TwIfzP7+58Ans793AQcHXYc29XhH9pyA/5yrx8itk+zxDPB7wF/l5q+8TlZdSz8i/iwivpL9/ZfU7gw6HRFfiNodQwGe4MoN4u4AHo6I70fEN4Az1H4icuCa1SV7+hPAv2L5D9rcAfxu1DwBTEp67UqWuUiLenwQ2BcR38+e+1b2klFcJwH8tWy2V3PlNyeGcp0AZGWqtxrHs38BbAc+m03/HeDO7O87ssdkz/89SUU/srSimtUjIg5nzwXwZZZ/50dqnaj2W+X7qX3n8yqvk1UX+nlZV2cbtb1l3j+mtqeHHv4cZD/l66LaLa3nI+Jkw2xDX5eGdfJ64O9m3dL/KenHstmGvh5wVV3uAfZLegH4NeC+bLahroukMUkngG8B/51ar2oh10DKl/dyXbLnvwP8jZUtcbHGekTEk7nnxoF/CPxRNmmk1klWl93AbET8WcPsldfJqg19Sa8EPgfcExHfzU3/CHAR+HR9UsHLh+qUpnxdqJX9I8D9RbMWTBuauhSsk7XAddS62HuAR7JWylDXAwrr8kHg3ojYANwL/HZ91oKXD01dIuJSRNxMrRV8C/C3imbL/h/aujTWQ9KP5p7+98D/ioj/nT0e2npAYV1+gtpvlfxmweyV67IqQz/bs38O+HREPJqb/n7g7wPvy7p8MOQ/B1lQl9dRG+c+Kel5auX9iqQfZIjr0mSdnAUezbq0XwZepnaPlKGtBzSty/uB+t+f4cpw1FDXpS4iFoAvUdsBT0qq36IlX97LdcmefzXw7ZUtaWu5etwOIOmjwBTwodxso7ZO3gL8CHAm+85fK+lMNlvldbLqQj9rKf428GxE/Hpu+u3ALwM7I+JC7iWzwK7sKPhmYAu18b+BK6pLRDwdEa+JiE0RsYnaSn9TRPw/anX5R9nZCbcC3ynoDq64ZusEOERt/BhJrwfWUbsp1kitk8w54Cezv7cDX8/+Hsp1AiBpStlZbJImgLdSO0bxReA92WzvB/4g+3s2e0z2/GO5xtPANKnH1yT9PLADuCsiXs69ZNTWybGI+MHcd/5CRPxI9pLq66Tdkd5R+wf8HWrdm6eAE9m/d1A7GPhCbtonc6/5CLWxzNNkZ2AMw79mdWmY53munL0j4EBWl6eBmUHXoc06WQf8J+CrwFeA7aO6TrLpx6iddfQk8OZhXidZ2d4IHM/q8lXg/mz6D1PbyZ6h1mupn131iuzxmez5Hx50HdrU42K23OvrqT595NZJwzz5s3cqrxNfkWtmlpBVN7xjZmbNOfTNzBLi0DczS4hD38wsIQ59M7OEOPTNzBLi0DczS4hD38wsIf8fcJUcqxQptpYAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean 0.154897\n", - "std 0.059096\n", - "min 0.117732\n", - "25% 0.140775\n", - "50% 0.147136\n", - "75% 0.153087\n", - "max 0.602036\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "diff = dd.ratio - d1.ratio\n", - "scatter(diff.index, diff)\n", - "show()\n", - "diff.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121.000000\n", - "mean 16.120223\n", - "std 4.232953\n", - "min 0.000000\n", - "25% 14.892881\n", - "50% 15.705279\n", - "75% 16.768788\n", - "max 44.992405\n", - "Name: start_len, dtype: float64" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(dd.start_len * CACHEENTRY_SIZE / MB).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121.000000\n", - "mean 31.385163\n", - "std 4.862132\n", - "min 16.811636\n", - "25% 28.985656\n", - "50% 30.843297\n", - "75% 32.528865\n", - "max 58.756055\n", - "Name: end_len, dtype: float64" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(dd.end_len * CACHEENTRY_SIZE / MB).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Doing dynamic cache update improves cache-hit from ~46% to ~60%." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Is there easy way to reduce cache size, keeping cache-hit ratio?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_onlydup():\n", - " row = []\n", - " for x in FILES[1:]:\n", - " known, cache = set(), set()\n", - " with open('{:d}'.format(x - 1)) as fd:\n", - " for _ in fd:\n", - " key = binascii.unhexlify(_[2:66])\n", - " if key in known:\n", - " cache.add(key)\n", - " known.add(key)\n", - " row.append(df_row_with_cache(x, cache, dyn=False))\n", - " return df_from_rows(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "od = calc_onlydup()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean 0.432306\n", - "std 0.058772\n", - "min 0.000000\n", - "25% 0.432889\n", - "50% 0.439739\n", - "75% 0.446418\n", - "max 0.474412\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAG7VJREFUeJzt3X+QHOdd5/H316uVs06C1+ClEq8kJBJjEDix4o2gyndAnIDsGCRV4mCZ43DqoFQXMGAHdMgVygHfXUWJAAcoU8FAqgIYLCcxYiECBc4Od5fCjlaRbEdxRBbHRLuCiiBe51La2Cv5e39Mt9Q76l/T07PTM8/nVaXSTE/37PP0j08//fSPMXdHRETCcFG/CyAiIitHoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQBT6IiIBUeiLiARkVb/+8OWXX+7r16/v158XERlIhw8f/jd3n6g6fd9Cf/369czMzPTrz4uIDCQz++dupi/VvWNmN5jZcTObNbPdKZ+/08xOmdnR6N9Pd1MoERHpjcKWvpmNAPcBPwTMAYfMbNrdP9826j53v70HZRQRkZqUaelvBmbd/Rl3fxF4ENjW22KJiEgvlAn9SeBE4v1cNKzd283sSTP7mJmtraV0IiJSqzKhbynD2h/C/5fAend/HfB3wEdSv8hsp5nNmNnMqVOnOiupiIh0rUzozwHJlvsa4GRyBHf/d3d/IXr7+8C1aV/k7ve7+5S7T01MVL7iSEREKioT+oeAK81sg5mtBnYA08kRzOzVibdbgafrK6KIiNSl8Ooddz9jZrcDB4ER4MPufszM7gFm3H0a+Hkz2wqcAb4KvLOHZRYRkYqsX7+ROzU15bo5S0SkM2Z22N2nqk6vZ++IiAREoS8iEpC+PXtH+mP/kXn2HjzOyYVFrhgfY9eWq9i+Ke22i95MLyL9pdAPyP4j89z18FMsLp0FYH5hkbsefgqgVHB3O70U005Vek3dOytg/5F5rtvzCBt2f4Lr9jzC/iPzfSnH3oPHzwV2bHHpLHsPHl+R6SVfvFOdX1jEOb9T7df6IsNpKFv6TWotNal1fHJhsaPhdU8v+fJ2qv1cf5uyLUk9hi70mxSyUO+G3O0GeMX4GPMpAX3F+FhX019kxv4j88GHQbfLp46dap0h3bRtSTugegxd907TuiDqah3Xcei/a8tVjI2OLBs2NjrCri1XVZ4e4Kx7X7ohmtJtFpel2+WTtfMtu1NOK8Md+46y6Z5PVpo3TdqW1PVVn6EL/Tq7IOoIlW435FgdG+D2TZO8721XMzk+hgGT42O8721Xl24txdOP2IXP4FvpMKgzBOpYznUsn253ymllAHju9FKledOk7rwm7YAG3VB17+w/Ms9FZpxNucu405Ct40qXvQePM7+wiLH8saSdbMjJ70nT6Qa4fdNkV4fE2zdNcue+o7WUpRt1dZvV1YVRR0DGf69qF0be36oyb7rtDqxTk3ZAsUHtbhqa0I833rTA7yRkY92ESnuQOJwL/skOVo7270nTjw2wCWFQVwjUtfOoa550s1POKkOs03mza8tVF6x/VbalOjRhnYPsxly/z3d0Ymi6d7IObUfMOurCiHUTKmlliQP/07uvL12WrDrF+rUBdtsNUYe6us3q2nk0YZ5knXOJVdkBle0O7PX5lSbM32SXIlz4oyKLS2f5xYeeaPx5hqFp6WdtpC+5V9rzdtOyqCtI8sbv5Iihbt12Q9QhrRVqtFpc1+15pLA8cYst63GDVQISqs+TOroK4vF/dfoYC4tLyz6rGpBljjxW4iqfOte5qvO6qBEG5y9qSJa5aQbqKZt5C+u6PY+khnTcuu5UWtfK2OhIakunvVynXzzDc6eX2r+S8bFRXn7xKk4uLHLp2ChmsHB6KXPFq7NOafMOzm9EZcrTyXevxApfdN4kr1Wa1202NjrC26+d5NEvnOppnaqUPzmv85bZSi6TTtbTKuW
qqy77j8xn7hDL9AZs2P2JzEZCu6q5U0a3T9kcmNAvCuGsz7vZeMusbGl/d/QiA4Ols547LKl9xetmBW0v95u+c4KPH54vLGOnfyer/mWnrUunO8es8aHVHXjWvaOdSBVlztckGwlZyzGpqIxlw7PTkM0Lw8nxsdzylylzHetXmfndfvRctjGXxoAv7bmpdPk6EUzo522okykt16orWZasDSGrXO0bbJkVJg6prBX0sktGee+Pfndh4Kd1e1RZymVaK3UfYVVRNnTiZdZJi639u8q0XKG4GyJvfc5SZjlmzfey4VlmvLJh2F7erPIny1z2u/Om6WZ+x3UFCncSeeJM6sXRVrehPzB9+nn923Ef4vvedvWyFf66PY/0/LK+rHI9v7jE0ff+8Ln3G3Z/ovDvxP3RWSvnJatXnTuqaT/Ef+700rlWaruqu/W4PHkrbRMupcs6/xL38cPyZVZ0lUuW9jqlrRe7PvrEsiOo+YVF7tx3lDv2HV3Wkqwyf8osx6zvLXuVUtF4aXUevcgYHbFlR41pAZ9V/rjMad9dVM+y5xPKzu/ktf9FgX/ZJaPc9LpXpzYs3/SdE426mzlpYEK/aEONz5zD+Zm6Epf1lT3hWzZoilb09pU82f2TFvjdSobmHfuO8mt/eWzZ0UY/LqUr030F6VdX7D14PPUkcBntdUpbL5Zeyt7p1rHjKZL1SIysdb59x55VprwGydJLfsH5qvZuyTzxfC1zorTMNItLZ/nV6WPL1pHxS0ZLd82UzYdLVq/if2y/mqlv++YLWvRNfI5SbGAu2Sy6HA0ufBxAJ5f15V1ylrfzSCtX8iqS+HvKlL/IFeNjHW0YeeLWWafa7+7MqtfpF8+UunSt00v90u7E/fjhed5+7STjY6OFf29+YfHcPIzvLJ4cH+OyS/Knbb/6Zf+R+UqhndzxpK03ReUpWmJpj8SIb1rMkpyXWazg8+cXl/j07uu595ZreOHMS7nfk5Scr2XDtsw0C4tLy+r19W+cKb2+XzE+1tFVets3TfLp3dfzpT03nbskO28n2+9LOgempZ+8ZKuoxR/vTfMu63vNXQc4685kSkux/VAsrzXbXq6iGzbSumXKiFf0rLthO5F2DuTSsVG+9o0lUhqqF0jO46zLBOOdA2Qfzla51C+rBfXoF07x8otXFbYwk+F11n1ZgGSdCxmPltUd+47yiw89ce5Eb1UnFxYLL0HMOzcTd+ONZyyztO6Ybo4Cy5xLKNNab7+wIt4G7tx3lL0Hj5dqjSfX3ev2PFK667L9aCTvnF/W+pBV56zPsnKq3908A3MiN6noTHzyzHneZXHJ8fNOMJU9sVrlpGaZk4rJvuAqJwDrLEus/eqEKnWvc37FIZxX/jLLOe2kbCddQUVXRSX/XpEyl3Teue9o5vz40p6bul5fss4TJSVP9OatQx+85ZrcnVqRuE5Vpk1On5Scx3Fd2xtFnV4SXKZ+3VzoEOQPo+c9+AuW74HjQ6/J8bHMlTFreNxFA/C+t119QffBc6eXuHPfUdZHXRNVno+T11oYGx3hg7dcs+wu3rLdRGndN0U36HTSD98+bpXzJ0X9zO1dFHktu6JD8rzln3eY3ml32t53vJ69N7+eyagsed0ZRfLW3eQ5pTTx8G5PqL8UhWCW9rt0s8ozmTgqhs7672NV+v7Tpk+KewTGRkfO7dySR5yf3n09z+65iXtvuaajBxXGGZWln88MGsjQh9ZM/Y0fe33pW7OrzuTkCvDyiy/sDUt25WQd8ueFUVaIX3bJaOqK1X5r/PjY6Ln+32QfdTJ8yq6oZXcoafM4q44OmX31efMl+VjgX9n/1LLb37PKk3WrfrzjzAqvvHJ0st7EwRaHdZWw6KQMWeeUksun2xPq8RFP3nxN1qfs4xKK5msdff/tss4zlXmCZ1qDoMj2TZOV1rleG5g+/TSd3JpddLVEXr9lvAIUrWzJB6vFilp2VW4v7+ShXJ0ETFZZypQv74qYrL76MlfRPHd
6iQce+3Ludfjt5Ukrf3wk1unyKXuVTdb3dPMAtaIytJ9TSls+afO4TBcUnK9TJ+to2XGL5mv8rKq076h65VPWeaaiI85ubugsOm/QDwPZp19FXh9b+wmyNEb5lS1rZR12RY+Bzru5qcpGXOaux7wTomWeX1Rmven1c5C6vSu17A1kacNWsk5JeX3eZe+CL3MzGBTfuNXt3el1P9IjmDty65B20qaTuwbLtExX8k7Upso72ZoV0lVOOK7UHcN5J/tWaoc+qM9uz1Pno0bSdlpZ61P7elj2EQ1NuTs9mDty69B+qJ22kPK6aPIuz0yOF7oqN2wV7VCrzuu6ftyk3wHbhDLULa5TlR1a1vwo2r7hwvWwzOXgZdaXJtydXkZQod8ua2Hk9ScmV7ZhbH3VocqPb2Rd7x9PW/UQuSk/viHZerVD62Q9jMtQdkeRpsy61oTMCDr0sxZS2cOxYWx91aHKyel4uqxnCz3w2Je5YnyMexPXepfRpF9/GkZNCLEsVdbDbtaXomlX4ncHygiqT79dEx4LLNnqfKxuU4NpkA3r9tPN+rISv/mxIidyzewG4LeAEeAP3H1Pxng3Ax8F3ujuuYnehNAHBUKTNeGxzZJNy6czVS5wSNPzE7lmNgLcB/wQMAccMrNpd/9823ivBH4eeLxqYfpBXTTNNSgnxkKl5dOZppxfKnNH7mZg1t2fcfcXgQeBbSnj/XfgA8A3aiyfDLhufjC7k6ekysrT8ulME37cHcqF/iRwIvF+Lhp2jpltAta6+1/VWDYZcGmPQW5/7G+epmwkkk7LpzPtj1Cp+liObpW5eiftkTLnuqbM7CLgXuCdhV9kthPYCbBu3bpyJZSB1e0PSVS9CkhWhpZP55rQnVwm9OeAtYn3a4CTifevBL4H+JS1Hvj1KmDazLa2n8x19/uB+6F1IreLcssAGJYboySbls/gKdO9cwi40sw2mNlqYAcwHX/o7s+7++Xuvt7d1wOPARcEvoRHfb4izVMY+u5+BrgdOAg8DTzk7sfM7B4z29rrAsrgUp+vSPOUuiPX3Q8AB9qG3Z0x7g92XywZBurzFWmeoB/DIL2nPl+RZhnYX84SEZHOKfRFRAKi0BcRCYhCX0QkIDqRKyI9p6fZNodCX0R6qik/HiIt6t4RkZ7KewaTrDyFvoj0lJ673ywKfRHpKT2DqVkU+iLSU3oGU7PoRK6I9JSewdQsCn0R6Tk9g6k51L0jIhIQhb6ISEAU+iIiAVHoi4gERKEvIhIQhb6ISEAU+iIiAVHoi4gERKEvIhIQhb6ISEAU+iIiAVHoi4gERKEvIhIQhb6ISEAU+iIiAVHoi4gERKEvIhIQhb6ISEBKhb6Z3WBmx81s1sx2p3z+X83sKTM7amb/18w21l9UERHpVmHom9kIcB9wI7ARuDUl1P/U3a9292uADwC/WXtJRUSka2Va+puBWXd/xt1fBB4EtiVHcPevJd6+HPD6iigiInVZVWKcSeBE4v0c8L3tI5nZzwLvBlYD16d9kZntBHYCrFu3rtOyiohIl8q09C1l2AUteXe/z91fA/wy8CtpX+Tu97v7lLtPTUxMdFZSERHpWpnQnwPWJt6vAU7mjP8gsL2bQomISG+UCf1DwJVmtsHMVgM7gOnkCGZ2ZeLtTcAX6yuiiIjUpbBP393PmNntwEFgBPiwux8zs3uAGXefBm43s7cAS8BzwG29LLSIiFRT5kQu7n4AONA27O7E61+ouVwiItIDuiNXRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYCUCn0zu8HMjpvZrJntTvn83Wb2eTN70sz+l5l9W/1FFRG
RbhWGvpmNAPcBNwIbgVvNbGPbaEeAKXd/HfAx4AN1F1RERLpXpqW/GZh192fc/UXgQWBbcgR3f9TdT0dvHwPW1FtMERGpQ5nQnwROJN7PRcOy/BTw190USkREemNViXEsZZinjmj2E8AU8AMZn+8EdgKsW7euZBFFRKQuZVr6c8DaxPs1wMn2kczsLcB7gK3u/kLaF7n7/e4+5e5TExMTVcorIiJdKBP6h4ArzWyDma0GdgDTyRHMbBPwe7QC/yv1F1NEROpQGPrufga4HTgIPA085O7HzOweM9sajbYXeAXwUTM7ambTGV8nIiJ9VKZPH3c/ABxoG3Z34vVbai6XiIj0gO7IFREJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgCn0RkYAo9EVEAqLQFxEJiEJfRCQgpULfzG4ws+NmNmtmu1M+/34z+6yZnTGzm+svpoiI1KEw9M1sBLgPuBHYCNxqZhvbRvsy8E7gT+suoIiI1GdViXE2A7Pu/gyAmT0IbAM+H4/g7s9Gn73UgzKKiEhNynTvTAInEu/nomEiIjJgyoS+pQzzKn/MzHaa2YyZzZw6darKV4iISBfKhP4csDbxfg1wssofc/f73X3K3acmJiaqfIWIiHShTOgfAq40sw1mthrYAUz3tlgiItILhaHv7meA24GDwNPAQ+5+zMzuMbOtAGb2RjObA94B/J6ZHetloUVEpJoyV+/g7geAA23D7k68PkSr20dERBpMd+SKiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQFaVGcnMbgB+CxgB/sDd97R9fjHwR8C1wL8Dt7j7s/UWVYbF/iPz7D14nJMLi1wxPsauLVexfdNkv4u1YkKvv/RXYUvfzEaA+4AbgY3ArWa2sW20nwKec/fXAvcC76+7oDIc9h+Z566Hn2J+YREH5hcWuevhp9h/ZL7fRVsRoddf+q9M985mYNbdn3H3F4EHgW1t42wDPhK9/hjwZjOz+oopw2LvweMsLp1dNmxx6Sx7Dx7vU4lWVuj1l/4rE/qTwInE+7loWOo47n4GeB74lvYvMrOdZjZjZjOnTp2qVmIZaCcXFjsaPmxCr7/0X5nQT2uxe4VxcPf73X3K3acmJibKlE+GzBXjYx0NHzah11/6r0zozwFrE+/XACezxjGzVcClwFfrKKAMl11brmJsdGTZsLHREXZtuapPJVpZoddf+q/M1TuHgCvNbAMwD+wAfrxtnGngNuAfgJuBR9z9gpa+SHyVSqhXr4Ref+k/K5PNZvZW4IO0Ltn8sLv/TzO7B5hx92kzexnwx8AmWi38He7+TN53Tk1N+czMTNcVEBEJiZkddvepqtOXuk7f3Q8AB9qG3Z14/Q3gHVULISIiK0N35IqIBEShLyISEIW+iEhAFPoiIgFR6IuIBEShLyISEIW+iEhAFPoiIgFR6IuIBEShLyISkFLP3unJHzY7BfxzX/44XA78W5/+dp2GpR6gujTRsNQDhqsuV7n7K6tOXOrZO73g7n17oL6ZzXTzwKKmGJZ6gOrSRMNSDxi+unQzvbp3REQCotAXEQlIqKF/f78LUJNhqQeoLk00LPUA1eWcvp3IFRGRlRdqS19EJEhDF/pmttbMHjWzp83smJn9QjR8r5l9wcyeNLM/N7PxxDR3mdmsmR03sy39K/1yWXV
JfP5LZuZmdnn03szst6O6PGlmb+hPyZfLq4eZ/Vw034+Z2QcSwwdqmZjZNWb2mJkdNbMZM9scDW/kMgEws5eZ2WfM7ImoLr8WDd9gZo+b2RfNbJ+ZrY6GXxy9n40+X9/P8sdy6vFAtP58zsw+bGaj0fCBWyaJz3/HzL6eeN/5MnH3ofoHvBp4Q/T6lcA/AhuBHwZWRcPfD7w/er0ReAK4GNgA/BMw0u965NUler8WOEjrXofLo2FvBf4aMOD7gMf7XYeCZfIm4O+Ai6PPvnVQlwnwSeDGxHL4VJOXSVQ2A14RvR4FHo/K+BCt37kG+BDwruj1zwAfil7vAPb1uw4F9Xhr9JkBf5aox8Atk+j9FK3fIv96YvyOl8nQtfTd/V/c/bPR6/8HPA1Muvsn3f1MNNpjwJro9TbgQXd/wd2/BMwCm1e63Gmy6hJ9fC/w34DkSZltwB95y2PAuJm9eiXLnCanHu8C9rj7C9FnX4kmGcRl4sA3RaNdCpyMXjdymQBEZYpbjaPRPweuBz4WDf8IsD16vS16T/T5m83MVqi4mbLq4e4Hos8c+AzLt/mBWiZmNgLspbXNJ3W8TIYu9JOiQ51NtPaWSf+F1p4eWhvsicRnc5wP1sZI1sXMtgLz7v5E22iNr0vbMvkO4D9Gh6V/b2ZvjEZrfD3ggrrcAew1sxPArwN3RaM1ui5mNmJmR4GvAH9L66hqIdFASpb3XF2iz58HvmVlS5yuvR7u/njis1HgPwN/Ew0aqGUS1eV2YNrd/6Vt9I6XydCGvpm9Avg4cIe7fy0x/D3AGeCBeFDK5I26pClZF1plfw9wd9qoKcMaU5eUZbIKuIzWIfYu4KGoldLoekBqXd4F3Onua4E7gT+MR02ZvDF1cfez7n4NrVbwZuC70kaL/m9sXdrrYWbfk/j4d4H/7e7/J3rf2HpAal2+H3gH8Dspo3dcl6EM/WjP/nHgAXd/ODH8NuBHgP8UHfJBay+/NjH5Gs4fmvddSl1eQ6uf+wkze5ZWeT9rZq+iwXXJWCZzwMPRIe1ngJdoPSOlsfWAzLrcBsSvP8r57qhG1yXm7gvAp2jtgMfNLH5ES7K85+oSfX4p8NWVLWm+RD1uADCz9wITwLsTow3aMnkT8FpgNtrmLzGz2Wi0jpfJ0IV+1FL8Q+Bpd//NxPAbgF8Gtrr76cQk08CO6Cz4BuBKWv1/fZdWF3d/yt2/1d3Xu/t6Wgv9De7+r7Tq8pPR1QnfBzyfcji44rKWCbCfVv8xZvYdwGpaD8UaqGUSOQn8QPT6euCL0etGLhMAM5uw6Co2MxsD3kLrHMWjwM3RaLcBfxG9no7eE33+SKLx1DcZ9fiCmf00sAW41d1fSkwyaMvksLu/KrHNn3b310aTdL5Mis70Dto/4D/QOrx5Ejga/XsrrZOBJxLDPpSY5j20+jKPE12B0YR/WXVpG+dZzl+9Y8B9UV2eAqb6XYeCZbIa+BPgc8BngesHdZlEww/TuuroceDaJi+TqGyvA45EdfkccHc0/Ntp7WRnaR21xFdXvSx6Pxt9/u39rkNBPc5E8z1eTvHwgVsmbeMkr97peJnojlwRkYAMXfeOiIhkU+iLiAREoS8iEhCFvohIQBT6IiIBUeiLiAREoS8iEhCFvohIQP4/wpgnNtBPve4AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scatter(od.x, od.ratio)\n", - "od.ratio.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAD8CAYAAABkbJM/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3X+wXGd93/H3B0k2MiSWhGVbvrYqJRhTiIMFt4aO09Y2tmSgwRqXH84wrWYCowmddGqSGORxglxDahGHkiahZVSgoxAKNuDaNwNEkS27P5ix8RW2MQ52pPLLklVQKotALLAkvv1jz0qr1Tm7z9lzdvfsvZ/XzJ27e87ZPc+z58f3PD/OcxQRmJmZ9fOCcSfAzMwmgwOGmZklccAwM7MkDhhmZpbEAcPMzJI4YJiZWRIHDDMzS+KAYWZmSRwwzMwsycJxJ2AQZ511VqxatWrcyTAzmyi7du3624hYPujnJzJgrFq1itnZ2XEnw8xsokj6bpXPu0rKzMySOGCYmVkSBwwzM0vigGFmZkkcMMzMLIkDhpmZJXHAMDOzJA4YZmaWxAHDzMySOGCYmVkSBwwzM0vigGFmZkkcMMzMLIkDhpmZJXHAMDOzJA4YZmaWpJaAIekaSU9J2iNpU8780yXdkc1/SNKqbPpLJN0v6ceS/rSOtJiZ2XBUDhiSFgAfBd4AvAL4NUmv6FrsncCzEfFS4CPAh7LpPwF+D/idqukwM7PhqqOEcSmwJyK+FRHPA58Fru1a5lpgW/b688DrJSki/j4i/jetwGFmZg1WR8CYAp7ueL83m5a7TEQcBX4IvKSGdZuZ2YjUETCUMy0GWKb3SqSNkmYlzR44cKDMR83MrAZ1BIy9wAUd788HnilaRtJC4EzgYJmVRMTWiJiOiOnly5dXSK6ZmQ2ijoDxMHChpNWSTgOuB2a6lpkBNmSv3wLsjIhSJQwzMxuvhVW/ICKOSvpNYDuwAPhkRDwh6VZgNiJmgE8An5K0h1bJ4vr25yV9B/h54DRJ64G1EfHXVdNlZmb1qhwwACLiS8CXuqa9v+P1T4C3Fnx2VR1pMDOz4fKd3mZmlsQBw8zMkjhgmJlZEgcMMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZEgcMMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZEgcMMzNL4oBhZmZJHDDMzCyJA4aZmSWpJWBIukbSU5L2SNqUM/90SXdk8x+StKpj3k3Z9KckrasjPTZ33f3IPi7bspPVm77IZVt2cvcj+8adpJGZz3m3ZlhY9QskLQA+ClwN7AUeljQTEX/dsdg7gWcj4qWSrgc+BLxd0iuA64FXAucB90p6WUQcq5oum3vufmQfN931OIePtHaPfYcOc9NdjwOwfs3UOJM2dPM579YcdZQwLgX2RMS3IuJ54LPAtV3LXAtsy15/Hni9JGXTPxsRP42IbwN7su8zO8Xt2586fsJsO3zkGLdvf2pMKRqd+Zx3a446AsYU8HTH+73ZtNxlIuIo8EPgJYmfBUDSRkmzkmYPHDhQQ7Jt0jxz6HCp6XPJfM67NUcdAUM50yJxmZTPtiZGbI2I6YiYXr58eckk2lxw3pLFpabPJfM579YcdQSMvcAFHe/PB54pWkbSQuBM4GDiZ80AuHHdRSxetOCkaYsXLeD
GdReNKUWjM5/zbs1RR8B4GLhQ0mpJp9FqxJ7pWmYG2JC9fguwMyIim3591otqNXAh8NUa0mRz0Po1U9x23cVMLVmMgKkli7ntuovnRaPvfM67NYda5+2KXyK9EfgjYAHwyYj4fUm3ArMRMSPphcCngDW0ShbXR8S3ss/eDPw6cBS4ISK+3G9909PTMTs7WzndZmbziaRdETE98OfrCBij5oBhZlZe1YDhO73NzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZEgcMMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZEgcMMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZkkoBQ9IySTsk7c7+Ly1YbkO2zG5JGzqm/76kpyX9uEo6zMxs+KqWMDYB90XEhcB92fuTSFoGbAZeC1wKbO4ILH+RTTMzs4arGjCuBbZlr7cB63OWWQfsiIiDEfEssAO4BiAiHoyI/RXTYGZmI1A1YJzTPuFn/8/OWWYKeLrj/d5sWimSNkqalTR74MCBgRJrZmaDW9hvAUn3AufmzLo5cR3KmRaJnz3xgYitwFaA6enp0p83M7Nq+gaMiLiqaJ6k70taERH7Ja0AfpCz2F7g8o735wMPlEynmXW4+5F93L79KZ45dJjzlizmxnUXsX5N6YK7WSlVq6RmgHavpw3APTnLbAfWSlqaNXavzaaZ2QDufmQfN931OPsOHSaAfYcOc9Ndj3P3I/vGnTSb46oGjC3A1ZJ2A1dn75E0LenjABFxEPgA8HD2d2s2DUl/IGkvcIakvZJuqZgesznv9u1PcfjIsZOmHT5yjNu3PzWmFNl80bdKqpeI+H/A63OmzwLv6nj/SeCTOcu9F3hvlTSYzTfPHDpcarpZXXynt9mEOW/J4lLTzerigGE2YW5cdxGLFy04adriRQu4cd1FY0qRzReVqqTMbPTavaG6e0kBXLZlp3tO2dA4YJhNoPVrpk4KBu2eU+3G8HbPqfayk8rdh5vFVVJmc8Bc7Dnl7sPN44BhNgfMxZ5TczEITjoHDLM5YC72nJqLQXDSOWCYzQFzsefUXAyCvdz9yD4u27KT1Zu+yGVbdjay6s0Bw+alSTg4y1i/ZorbrruYqSWLETC1ZDG3XXfxRDcQz8UgWGRS2mvcS8rmnbnao6i759SkK+o+PJfy2NarvaZJ+XXAsHnXdXFSDk6be0GwyKS01zhgzHOTfLU9aKCblIPT5o/zlixmX87+17T2GgeMea7Oq+1RllTKBLrudC05YxHPPnfklO9s2sFp88eN6y46aX+GZrbXOGD0MB+qauq62h51SSU10OWla9ELxKIF4sixEw9uHOfBOR/2M+ttUtprHDAKTHJVTRl1FYWrllTKnjRTA11euo78LFiyeBEvOn3h2A/Oov1s9rsHuf/JA2NPn43OJLTXOGAUaErDaNkTadnl6yoKVympDBKcUwNd0fp/ePgIj25e2zdtw1a0n336we8df/D9uC5W6ir5pH6PS1rN54BRoAkNo2Xr6W+ZeYJDh0/UzaecaOoqClcpqRSdNG+ZeaIwXamBrumNiUX7U3S9H/XFSl0l7NTvGXeJ3sEqjW/c69K+oav7gG0b5YkmdSyd9sHWGSx6Ld9t/ZopvrLpSr695U18ZdOVAx0oVW6yKjppHjp8pPBGps4b1QAWSMfz2nmzU9Nv/iqzP43yYqWucZxSv2ec40ZNyk1zTTCvSxjdVxVXvHw5X9i175Qdt23YJ5ru9ORdGUNaPX2v5YehSkmlV147dV9lt//3ujJtWmNiyj4nTi1hwHAvVgbd9/pJLamPs0Q/jurnSS3RzNuAkVcE7qw37jY1hm6iqSeOfgfVqEpFgzba5VUvFUkJlnmBpQkHY942/sKuffyL10yd1MCdF0TKXqzknZAgP3DmpatI2X0ptUpwnFWHVYPVIO2Mk9qhZt4GjLwTTVG
wEPCVTVeOJT3dQaNMPX3R8k2TVwp47vmjSfdKjPpgr/I9RcHt/icPnLJ/Tf+DZQOnK++EdOPnHgNxvCtx50mqXwm1bZB9KbWtaZz3IVQJVoOc/JvSoWYQ8yZgpBa589R1ldPrpNKr8XNqyeKeJ46iK/SlZyxi86++svE7IfR/ghyUC5ZB63GlvU60w27Y7e4aW6aap0qpqKgrcbf2SSoluC6QBhrMsMzjZG+77uKx9KaqEqwGOfk3oUPNoBRRdF2d8GFpGXAHsAr4DvC2iHg2Z7kNwO9mbz8YEdsknQF8DvhF4BjwFxGxKWW909PTMTs7m5zOvJNPUXVP3hV9HaN+Fp0A29992ZaduSeUqSWLk0o3k1on2ktKnvJ+125FgbPqb97ve7r3paJ9ruz6+lm96YuFpeVuIq0NScC3t7wpd17VKhlIP86qfLbf9w5y/PT6rYsu9Ora7wYhaVdETA/6+aoljE3AfRGxRdKm7P37uhK4DNgMTNM6XnZJmgF+CvxhRNwv6TTgPklviIgvV0zTKVKre9rvF0gci6i13aLflUjVInlT6um7VQlkKXnqvIItOuk9+9yR3JJDypVeSvpTu8amVjFWVaYE3c5Tv6BbVMruVUqD/DaTKlUyw6rOGfT4KfqtxYm2oH2HDvOeOx7lhjseZaqmNqpxqRowrgUuz15vAx6gK2AA64AdEXEQQNIO4JqI+AxwP0BEPC/pa8D5FdOTq191T3cD87GI4xuwrpNwv5NT03rz9DPI1f+w7mJuH+y9rvbyTir96q5Tq6zKnKBTqhirygsAi16gk9owgFP28XbQzQtqV7x8+SlVSL1O/rfMPMFPj/4s97erUiXTtOqcvN86ryTZeRPmnz/4Pc5Y9AKWnrGIQ88dafyx3qlqwDgnIvYDRMR+SWfnLDMFPN3xfm827ThJS4BfBf5j0YokbQQ2AqxcubJUIosO6AUSN667KPfqtO5GqJSGtaaWErqlnkhHfRdzvxN390mlX6ku9Wq2TC+vUVQ79Go3KArynftev66/KSf/XvcEVWlkLvrsmYsX5Qa0Ycv7rVMuHp478jMC8ZG3XzIRx3xb34Ah6V7g3JxZNyeuQznTjgdgSQuBzwB/HBHfKvqSiNgKbIVWG0biuoHiA/pYRM8Dvc6rljp7gYy7vSL1RDrqu5j7nbi7T0j9SnWpV7Mp1WIw2mqHoouPlN+4+7OXbdlZuL3LdiB55tBhPvL2SwY+FopKT3///NHjQSr1IqSu4yjv9xrkvqJJ0DdgRMRVRfMkfV/Siqx0sQL4Qc5iezlRbQWtaqcHOt5vBXZHxB8lpXgA7Q3y23c+xrGuRv7DR44db7PoVmcf8LqqnJrQhzv1RFrmZFJHcG7nv3uIFCg+IfUq1ZW5Ek6pFpvUR6b22t5FJ/8XLnpBYbfoKsdCahfsfifjYR5HVe4rarqqVVIzwAZgS/b/npxltgP/XtLS7P1a4CYASR8EzgTeVTEdfa1fM8V77ng0d167zWLYjVB1VDk1oQ936om0zIHT2W5QJai2f+M6rh5TSoWpz9qY6jhRTppe27tX9Vev367KsdD92dWbvpi7XK+T8TCPo35tQp2aMqZZqqoBYwtwp6R3At8D3gogaRr4jYh4V0QclPQB4OHsM7dm086nVa31JPA1SQB/GhEfr5imQkU7frs3VJUbpUb12SY0+hUFgueeP8rdj+w7qV4c4IaCQN3WPpHUedVXR3DudyU8Cc/a6Cdl/0vZ3r1uUht21ekgbSLDvuGzu02oTKl33FXOvVS6D2Ncyt6H0VZnH+72Ri3qVVJ3n/LO9eUZZmNq0TATRQdBd/p71el2dl0eZ//0QRSltynP2uin7P6Xur1Hrdd9VkVd46vsa4OeRwa9r6jO33jc92FMlGG1IwzaiJtaLO53c1qZK9i6xr257bqLedHpC085gaT2Iso7CJpQeiqj6c/aKNLr4qNo3213oU3Z3qPWqwqoqJQ66ru72+sf130mdZlXAQO
G147Qrc4+5b3WV+bmwkGeg110UsnrQFCU/l713J1dISftWdtNf9ZGnpQ748sG7nEG9O4LoKU5+1DeCbfKxeMwf4cm/sad5l3AqEPKxqvSpzx1gL2ygyLWVaKBVkeBMsNwdwfquVD/P84B8waVcrFTtO82LUCWGWU3dbyulBL4MO8Fadpv3M0PUBpAv41Xpk95ysN9itbX3kFXb/oil23ZWfjAl/ZDoep6vkZbe6iLfunPUzRA3otOW8jUksWIVulp3PXjvaxfc+IhTpOQXuh/sdNr++Xtr+0hMHrtf8OSup9CuZFnux+k9Lt3P37ScXbFy5ef8ju07wWp+hCmpj/wa141etdlkEa2Xt81SENY0VAP3SeslNLCAokPv+1Vxz9XZvA6GGyoi6J19BrkzqpL7YBQpI7OHoMadMTp1HSlDiK5eNGCU55hUjQcf79G9DLPLamDG73HoGrjedmG5yo3K6VchbXveG+vq8zBOGgPpiYWvXttlyZ3dSyTttQOCEXa1Th5J9dhNs6WecDYoL3UyoxM0P0Mk0HuBenVoaSJvQLBAWNggzaeD3qvwaA3K6U2lvUbObfX4HWDaFr9f79RV4dxV3AdQajs/lRXT8FRN86WecDYLW8e7BkwVUYmGOQCqOk9ovK4DWPEeu0kZfRqmExZLk/nyLnddfO3v/VV3P6WV9VWX9+0+v9e26Vo3g13PDpw3X1RfXnZ7xpkf1q/ZoqvbLqSb295E1/ZdOXAJ9cy06vqN+J0HftQURtNnrxRDcq2PTS9R1QelzBGrK6dJPUKPXX4ZUjr5VHnCb2OLs51GXS7DFra6DUs+CTc+T/qEmKvURrqqr7JK32lPrtikJJbE6tl+3HAGLG6dpLUHTT1IBh0xM+5ot926VVVMUg1Qq9hwctsg3GddOqq2ko1qgCVdxGT+nz1shdATauWTeFeUkNSVD897Fv/B03foL085oqi3mRLz1jEm355xSkBtlvZ3l2pQ2BD723QlP1pFJrc8WBQo85T1V5SDhhD0O8gbuKO726uvcdLanejrGssr5Tuzm39tkET96emqPu3mfTf2t1qG6iofvq373wMaFbdfdsk1qfWrdd4Se1ulEUXA2WrEVK7SkP/bdDE/akJ6n7mRROeRTNuDhhDUFQ/3X2/Q5FxXMVMYn3qMPRrRK6z7r7fcCkw97ZB6r5dxzGQcuFWJq3PPX905N1gm1aiccAYgl79ucf5JLBeRt2I2VQpJa1BruhTDvxhbINRn3D63fyYt2/PfvfgSXdN93qGeB0dC1Iu3KqOU1WHJpZo3IYxBP3qp3vVSU/aMyHmmmE0Io+rYXrU6+23vtShN4q6fZc9Bvp1LOj1fXV1SqhiGOeCqm0YvnFvCNo3pS1Q/m0/w3wSmFUzjBsK67pZs+nr7be+1KE3ii5hB7lXqftmutTvS13XMKsMm3gucJXUkLRPMGXrpN34PDyp1TN1NyKP68Af9Xr7ra/M0Bt5Br1XqejZLb2+ryito3yaYhPPBS5hlNQeKrzfkOIw2NVq04c3nlR1DcUxiFEPozGu9fZbX5mhNwYdNr/b+jVTfPhtryp9TBUdh7e8+ZWVh1VJ1cRzgQNGCYOcdMqO29O0MZbminFVC8H4DvxRr7ff+vL27Xe8bmXuZ97xupVjHbesCcdhE9LQzY3eJbhBenKN+8bEcXWPbFIvqUlO47jVlWbf6T1C4z7p2OAc7A0mcyiVOtM81l5SkpZJ2iFpd/Z/acFyG7Jldkva0DH9LyU9JukJSR+TVNyloQHGVRc9acq084xKE+uDbfTGWTU5qCaluWobxibgvoi4ELgve38SScuAzcBrgUuBzR2B5W0R8Srgl4DlwFsrpmeoJu2kM44T9zgbl3tpYn2wjV4Tu6r206Q0V+1Wey1wefZ6G/AA8L6uZdYBOyLiIICkHcA1wGci4u860nEaxV2wG2GS7oYe112iTX6KmMdcsiZ2Ve2nSWmuWsI4JyL2A2T/z85ZZgp4uuP
93mwaAJK2Az8AfgR8vmJ6hq6Op5WNwriKsU26GjLrNmm1BNCsNPctYUi6Fzg3Z9bNievI62p9vCQREeskvRD4NHAlsKMgHRuBjQArV65MXPX8Na4Td5Ouhsy6TVItQVuT0tw3YETEVUXzJH1f0oqI2C9pBa2SQre9nKi2AjifVtVV5zp+ImmGVhVXbsCIiK3AVmj1kuqX7jpNYje8cZ24PeqtNd0kVk02Jc1Vq6RmgHavpw3APTnLbAfWSlqaNXavBbZLenEWZJC0EHgj8GTF9NSuqY24/YyrGOvGZbO5q2qj9xbgTknvBL5H1stJ0jTwGxHxrog4KOkDwMPZZ27Npp0DzEg6HVgA7AQ+VjE9tWtyI24v4yzGNuVqyMzq5Rv3+vDNemY2V3h48yHzzXpmZi0OGH00qUubmdk4+XkYfTSpS5uZ2Tg5YCRwI66ZmaukzMwskQOGmZklccAwM7MkDhhmZpbEAcPMzJI4YJiZWRJ3q63RJI5qa2aWygGjJuN6wp2Z2ai4SqomTXpQu5nZMDhg1MSPJjWzuc4BoyYe1dbM5joHjJp4VFszm+vc6F0Tj2prZnOdA0aNPKqtmc1lrpIyM7MkDhhmZpbEAcPMzJI4YJiZWZJKAUPSMkk7JO3O/i8tWG5DtsxuSRty5s9I+kaVtJiZ2XBVLWFsAu6LiAuB+7L3J5G0DNgMvBa4FNjcGVgkXQf8uGI6zMxsyKoGjGuBbdnrbcD6nGXWATsi4mBEPAvsAK4BkPRi4LeAD1ZMh5mZDVnVgHFOROwHyP6fnbPMFPB0x/u92TSADwAfBp6rmA4zMxuyvjfuSboXODdn1s2J61DOtJB0CfDSiHiPpFUJ6dgIbARYuXJl4qrNzKwufQNGRFxVNE/S9yWtiIj9klYAP8hZbC9wecf784EHgH8MvEbSd7J0nC3pgYi4nBwRsRXYCjA9PR390m1mZvWqWiU1A7R7PW0A7slZZjuwVtLSrLF7LbA9Iv5zRJwXEauAXwH+pihYmJnZ+FUNGFuAqyXtBq7O3iNpWtLHASLiIK22ioezv1uzaWZmNkEUMXm1O9PT0zE7OzvuZJiZTRRJuyJietDP+05vMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZEgcMMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZEgcMMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlqRSwJC0TNIOSbuz/0sLltuQLbNb0oaO6Q9IekrSo9nf2VXSY2Zmw1O1hLEJuC8iLgTuy96fRNIyYDPwWuBSYHNXYHlHRFyS/f2gYnrMzGxIqgaMa4Ft2ettwPqcZdYBOyLiYEQ8C+wArqm4XjMzG7GqAeOciNgPkP3Pq1KaAp7ueL83m9b2X7PqqN+TpIrpMTOzIVnYbwFJ9wLn5sy6OXEdeUEgsv/viIh9kn4O+ALwL4E/K0jHRmAjwMqVKxNXbWZmdekbMCLiqqJ5kr4vaUVE7Je0Ashrg9gLXN7x/nzggey792X/fyTpv9Fq48gNGBGxFdgKMD09HXnLmJnZ8FStkpoB2r2eNgD35CyzHVgraWnW2L0W2C5poaSzACQtAv458I2K6TEzsyGpGjC2AFdL2g1cnb1H0rSkjwNExEHgA8DD2d+t2bTTaQWOrwOPAvuA/1IxPWZmNiSKmLzanenp6ZidnR13MszMJoqkXRExPejnfae3mZklccAwM7MkDhhmZpbEAcPMzJJMZKO3pAPAd8ew6rOAvx3DeofBeWmeuZIPcF6a6CzgRRGxfNAvmMiAMS6SZqv0MGgS56V55ko+wHlpojry4SopMzNL4oBhZmZJHDDK2TruBNTIeWmeuZIPcF6aqHI+3IZhZmZJXMIwM7MkDhgZSRdIul/SNyU9IenfZtNvl/SkpK9L+u+SlnR85iZJe7Lnkq8bX+pPVpSXjvm/Iyk6Rgu
WpD/O8vJ1Sa8eT8pP1Ssvkv5N9ts/IekPOqZP1HaRdImkB7MHic1KujSb3sjtIumFkr4q6bEsH/8um75a0kOSdku6Q9Jp2fTTs/d7svmrxpn+Tj3y8uls//mGpE9mI2o3dptAcV465v+JpB93vC+/XSLCf61quRXAq7PXPwf8DfAKWsOxL8ymfwj4UPb6FcBjtEbdXQ38H2DBuPPRKy/Z+wtoDTn/XeCsbNobgS/TetjV64CHxp2HhO1yBXAvcHo27+xJ3S7AXwFv6NgWDzR5u2TpeXH2ehHwUJa+O4Hrs+kfA96dvf7XwMey19cDd4w7Dwl5eWM2T8BnOvLSyG3SKy/Z+2ngU8CPO5YvvV1cwshExP6I+Fr2+kfAN4GpiPiriDiaLfYgrQdAQet55p+NiJ9GxLeBPbQeADV2RXnJZn8EeC8nnnoIrbz8WbQ8CCzJHog1dj3y8m5gS0T8NJvXfnjXJG6XAH4+W+xM4JnsdSO3S5ae9pXqouwvgCuBz2fTtwHrs9fXZu/J5r9easbjmIvyEhFfyuYF8FVOPu4bt02gOC+SFgC30zruO5XeLg4YObKi2RpaEbrTr9O6uoD+zypvhM68SHozsC8iHutabOLyArwM+CdZUfp/SPpH2WKTmJcbgNslPQ38IXBTtlhj8yJpgaRHaT1lcwetktyhjourzrQez0c2/4fAS0ab4mLdeYmIhzrmLaL16Oi/zCY1dptAYV5+E5iJiP1di5feLg4YXSS9mNbzxW+IiL/rmH4zcBT4dHtSzscb1eWsMy+00n4z8P68RXOmNTYv2XZZCCylVS1wI3BndnU0iXl5N/CeiLgAeA/wifaiOR9vRF4i4lhEXELryvtS4B/mLZb9b2w+4NS8SPqljtn/CfifEfG/sveTlpd/CrwV+JOcxUvnxQGjQ3Y18QXg0xFxV8f0DbQeIfuOrIgKrSuLCzo+fj4nqhLGLicvv0irTv8xSd+hld6vSTqXycsLtNJ8V1YM/yrwM1pj5UxiXjYA7def40QVWqPzAhARh4AHaAXuJZIWZrM603o8H9n8M4GDo01pfx15uQZA0mZgOfBbHYs1fpvASXm5AngpsCc77s+QtCdbrPR2ccDIZFennwC+GRH/oWP6NcD7gDdHxHMdH5kBrs96GqwGLqRV1zl2eXmJiMcj4uyIWBURq2jtLK+OiP9LKy//KusB8jrghznF17Eo2i7A3bTqzJH0MuA0WgPETdR2yTwD/LPs9ZXA7ux1I7eLpOXKegtKWgxcRas95n7gLdliG4B7stcz2Xuy+Ts7LrzGqiAvT0p6F7AO+LWI+FnHRxq5TaAwL7si4tyO4/65iHhp9pHy26Vfq/h8+QN+hVZxrP2M8Udp9YjYQ6uerz2djJ7BAAAAs0lEQVTtYx2fuZlW3e1TZL1cmvBXlJeuZb7DiV5SAj6a5eVxYHrceUjYLqcBfw58A/gacOWkbpds+i5avbseAl7T5O0C/DLwSJaPbwDvz6b/Aq3gvIdWSandg+2F2fs92fxfGHceEvJyNPvd29upPb2R26RXXrqW6ewlVXq7+E5vMzNL4iopMzNL4oBhZmZJHDDMzCyJA4aZmSVxwDAzsyQOGGZmlsQBw8zMkjhgmJlZkv8PF+CIo15UM44AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean -0.021453\n", - "std 0.004945\n", - "min -0.035891\n", - "25% -0.024240\n", - "50% -0.021191\n", - "75% -0.018773\n", - "max 0.000000\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "diff = od.ratio - d1.ratio\n", - "scatter(diff.index, diff)\n", - "show()\n", - "diff.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Cache-hit becomes slightly worse with \"only duplicates from previous bucket\" strategy: from ~46% to ~44%." - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121.000000\n", - "mean 1.919296\n", - "std 0.505534\n", - "min 0.000000\n", - "25% 1.735415\n", - "50% 1.877614\n", - "75% 2.015887\n", - "max 5.161518\n", - "Name: end_len, dtype: float64" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(od.end_len * CACHEENTRY_SIZE / MB).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121.000000\n", - "mean 16.120223\n", - "std 4.232953\n", - "min 0.000000\n", - "25% 14.892881\n", - "50% 15.705279\n", - "75% 16.768788\n", - "max 44.992405\n", - "Name: end_len, dtype: float64" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(d1.end_len * CACHEENTRY_SIZE / MB).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121.000000\n", - "mean -14.200927\n", - "std 3.758579\n", - "min -39.830887\n", - "25% -14.869665\n", - "50% -13.897655\n", - "75% -12.996148\n", - "max 0.000000\n", - "Name: end_len, 
dtype: float64" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "((od.end_len - d1.end_len) * CACHEENTRY_SIZE / MB).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean 0.119339\n", - "std 0.010292\n", - "min 0.101720\n", - "25% 0.111425\n", - "50% 0.117940\n", - "75% 0.124021\n", - "max 0.150828\n", - "Name: end_len, dtype: float64" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(od.end_len / d1.end_len).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But memory savings are quite measurable! From ~16 Mib (45 max) down to ~2 MiB (5 max)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# How does _\"only-dup + dynamic\"_ behave?" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_combo():\n", - " row = []\n", - " for x in FILES[1:]:\n", - " known, cache = set(), set()\n", - " with open('{:d}'.format(x - 1)) as fd:\n", - " for _ in fd:\n", - " key = binascii.unhexlify(_[2:66])\n", - " if key in known:\n", - " cache.add(key)\n", - " known.add(key)\n", - " row.append(df_row_with_cache(x, cache, dyn=True))\n", - " return df_from_rows(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "cc = calc_combo()" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean 0.598836\n", - "std 0.008614\n", - "min 0.550029\n", - "25% 0.594257\n", - "50% 0.598882\n", - "75% 0.603401\n", - "max 0.615940\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { 
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD8CAYAAACVZ8iyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAH4dJREFUeJzt3X+QH/V93/HnW6cTHHHsI0Fu4CQquRZysE0lc1HdEttBrZHiJEI1KWBnGpiOw4w9qguN1UrjjKGkreWoid2kdDy4YYoTNwhjRZHHiQ9ccOoyAetkSciSuFiGONLJLQpwbqlu4CTe/eO7X2nve7v73f3u7nf3+/2+HjMa3Xe/+/1+P5/98Xnv58d+1twdEREZbIuqToCIiFRPwUBERBQMREREwUBERFAwEBERFAxERAQFAxERQcFARERQMBAREWBx1Qloddlll/mKFSuqToaISE/Zv3//37j70k4/X7tgsGLFCiYnJ6tOhohITzGzH+T5vJqJREREwUBERBQMREQEBQMREUHBQEREUDAQEREUDEREBAUDERFBwUBERFAwEBERFAxERAQFAxERQcFARERQMBARERQMREQEBQMREUHBQEREUDAQEREUDEREBAUDERFBwUBERFAwEBERFAxERAQFAxERQcFARERIGQzMbKOZTZnZcTPbFrPOzWZ21MyOmNl/C5atMbO/CJY9Y2a3FJl4EREpxuJ2K5jZEHAf8H7gJLDPzPa6+9HQOquA7cB17v6ymb05eOsM8Kvu/j0zuwLYb2YT7j5TeE5ERKRjaWoG64Dj7v6cu78GPATc2LLOrwH3ufvLAO7+QvD/X7r794K/TwEvAEuLSryIiBQjTTAYA06EXp8MloVdBVxlZk+a2VNmtrH1S8xsHbAE+H6niRURkXK0bSYCLGKZR3zPKuDngGXAt8zsHc3mIDO7HPgD4DZ3f33BD5jdAdwBcOWVV6ZOvIiIFCNNzeAksDz0ehlwKmKdP3H3OXd/HpiiERwwszcCXwN+w92fivoBd7/f3cfdfXzpUrUiiYh0W5pgsA9YZWYrzWwJcCuwt2WdPcD1AGZ2GY1mo+eC9f8Y+KK7f7m4ZIuISJHaBgN3PwtsASaAY8DD7n7EzO41s03BahPAi2Z2FHgC2OruLwI3A+8Fbjezg8G/NaXkREREOmburc3/1RofH/fJycmqkyEi0lPMbL+7j3f6+TQdyCIi8+w5MM3OiSlOzcxyxegIWzesZvPa1kGG0ksUDEQkkz0Hptm++zCzc+cAmJ6ZZfvuwwAKCD1McxOJSCY7J6bOB4Km2blz7JyYqihFUgQFAxHJ5NTMbKbl0hvUTCQyYPK2918xOsJ0RMF/xehIkcmULlMwkL6kDs75mttjemYW48IUAp2092/dsHpenwHAyPAQWzesLjjV0k0DFwxUSPQ/dXDO17o9WgeTN9v7026b5no6j/rLQAUDFRKDIamDcxD3c9T2aJW1vX/z2rGB3Jb9bKCCgQqJZP1Sa+rXDs5O90+afKu9XwYqGPRrIVGEvLWmOgWSfuzgzLN/4rZHk9r7BQZsaGlcYVBVIbHnwDTX7Xicldu+xnU7HmfPgelK0gH5xo43C6rpmVmcRkF1166DrKgoX1s3rGZkeGjesl4v8PLsn6jt0ZyXfmx0hE9/8J09WQOUYg1UzaBOoyDq1n+Rp9YUVVDlGa2SVz92cObZP/24PaR4AxUM6nRS1K3/Ik/TSrsCqYp89VsHZ96mr37bHlK8gQgGdWrPbqpb/0WeWlO7NmlQv0xedarVlq2O5+sg6Ps+g6j27O27D1faPg/167/YvHaMT3/wnYyNjmBka0uOapNu1cudt3WQZ//0krqer4Og759ncN2OxyOvWsdGR3hy2/rCfier1j4DaFzp9eoJHneHK/R2vqS76nq+9gI9z6CNujXHNNWp/6II4TZpVfOlU3U9XwdB3wSDuAKozmPO
+7VTr1/zJeWr8/na7/qizyCpnbGOY87rdH+BSJ3U8XwdFH3RZxDXzjhkxm/f/HeB+jTHlNFXoGaZetP+yUbbqzN5+wz6Ihis3Pa1BTMxNtWt87LoDrJ+64juN9o/0i15g0FfNBMltSfW5XF8zaahuPH4nXaQ6RGE9dZv+0dNnP2rLzqQo27ICat6JELU1WGrTjrI9hyYLjy4NL9X1fRi9NPomLpMoVLX47Ou6UqrL4JBc4P/+sOHOBfR7FX1SIR288l30kHWPDHjdJrnupzwdVDEyd1Po2PqMIVKXY/PuqYri74IBnBhg5dxy37eQiHpKnCsw0ImKcDkyXMdTvg6KOrk7pVpJKKOcZg/8KKMWmhWdT0+65quLPomGEA5N3IVUSjEnUh57qpMOgHzdE72U7NGHkWd3L1wc2HUMb71y4fAYO6cn1/Wemd5UzdrOXU9Puuariz6KhhAvhueoq6O8hYKew5M8/9ePbtged6rw6QAk6eg6admjTyKPLnrfhNe1DE+9/rCYt8hcqqRbtZy6np81jVdWfTFaKIixN24lqdq3PzOmdm5ecsvvWQ499DCvDfnxI0K0U0/DXWbSLBMWQKcQ6WT5dX1+IybrPHMa2cXjLiq64isvqsZhGVp64+rAQyZddwpHdeuf8mSxblPoDzND2mavurcrBGnyNEcvdLWX4Q0U5A3VT1hXF2Oz6hj7dMffCf37D0y7+Lv5TNz886tOnc098VNZ1Gy3uzT7sa1Tm4aivtOA57f8QspctG5pIIxy41vvTJcTnd2dy5q2w0vsnl9BqCb5ZqSjrXmzL2tRkeG+bGLFscG3SKCrGYtjZG1rT+pDb7Zd5C1UOikHbGIAiju6mPyBy/xxLOnUzd9VXEV02n+yxjNUfe2/rA8x03c1XbUsl7ZHmVKOtbimtxmZucWNBeH1aGjuW+DQdYOwKRmgU4LhaxNDUUVvnEH65ee+uvY2g8sDFLdHi6XJ//9MJojLEvhXsRxE3eM91vhn2YIbbugl3SsZWlyC6tDX1TfdiBn7QDM+iSpNJ1AWb8zzdQFaX437mBNCgRRQarbBWyeqRv6qcM369O++m3Ki7JEbdetXz7E1kcOZXqyWtwxtciM69+2tO1T/1rVpS8qVc3AzDYC/xEYAv6Lu++IWOdm4B4aZc4hd/9wsPw24DeC1f6tuz9YQLrb6qQDMG0NIMuVWJZaRbvCN+3vZr06ibvxrezhcq1XaXlGbvVTh2/WGlm/1YrKknYIbbvab9z0N+fc+cr+aW66downnj19/rg+89pZXj4T3UQ0ZDYvcFdZE2tbMzCzIeA+4OeBq4EPmdnVLeusArYD17n724E7g+U/AdwN/D1gHXC3mV1aaA5ilPnM2LKuxNpd3ab93ahhbhbzm82Oq6jtUuYwvqirtLg0pgk+WfZ3XYf2NWUt3PupVlSmLMExad3msTZkC4/Y2blzPPHsaZ7ctp7nd/wCT25bz92/9PYF59HwImN46MJIxTo86zlNzWAdcNzdnwMws4eAG4GjoXV+DbjP3V8GcPcXguUbgMfc/aXgs48BG4E/Kib5ycrqACzrSqzd1W3a343qELz+bUv5yv7pBd99/duWct2OxyPbS8PfMz0zW+hVTFRgy3tTU5r9XeehfU1Za2Rl14r6ZVRVlhpzu0C6ee0Yd+06GPle+HxsbrvwMPWxmNpC1dNXpAkGY8CJ0OuTNK70w64CMLMnaTQl3ePuX4/5bO8dRYHmjo1re897JdZuDHWWQiKqYBz/2z+RGCCiCsaoOZ+KKECT+jXGRkdKK3jSNMEUVfh1+j1ZC/fW4+ZNI8OYwV27DrJzYirXNuyF4BmndftHXRDFDaFNE0jbnY+t2+6c+/nvThNIui1NMIiqvbeWh4uBVcDPAcuAb5nZO1J+FjO7A7gD4Morr0yRpO5rNw11UVdiSVe3ea8AW7/7uh2Pp2qbLmNUURnzNaURd7JNz8yev/8iXDvptPDLU4h2cmNVc98WXXj3
6gRsUdshqj0/zxDadudj0rar4/QVaYLBSWB56PUy4FTEOk+5+xzwvJlN0QgOJ2kEiPBnv9n6A+5+P3A/NG46S5n2rkqaJbTTmUfD0lxFFn33ZdpmpzKaxarq8E1qKmgubz0Aw4VfeD81r8Bnzswt2Bd5C9FOmziLLrzLahItu+kpbjs02/NbdfLb7c7HpG332VvW1G7AQ5pgsA9YZWYrgWngVuDDLevsAT4E/Fczu4xGs9FzwPeBfx/qNL6BRkdz5bIejHE71iD3lWxZo5PC3x+V17RXJ2VcxVQ1rUC7ByHFOTUzu2A/hW8iat1naUaGlZH3ogvvLPs+7Rh+KL7ZsVW3RlglnY9J264u02qEtQ0G7n7WzLYAEzT6Ax5w9yNmdi8w6e57g/duMLOjwDlgq7u/CGBmv0kjoADc2+xMrlInVekyq3XtrubyFBxJeU17dV7WVXwVd/i2doqndcXoSNuHFM3OnePXHz50fv2446XMdviij9O0+z7tNNjbdx/m4uFFpTc91aEZpt22Cx//zXP8rl0HKwsMfTs3UZJOHkpf5oPNk+YwiqtOpv3ddnlNG2j6ZURJWNJ8VGHN7X3XroOp17/p2rHI0VtJ89cU0V9S1RxNSc/3ziLL4IGkdHW6HYo+ztN8X1H7LO/cRAMZDJIKgaT2/7IKxKQCG8hVcNR1srw6SCrAmp3IY6GmjbjHqkZJmtMq7z5pt12r2O5pA2uSqGHFSfeKtCtAs26HMi/4knRycRpFE9V1oF0nYqft9WUMJcw7BK2M6nInVztZm0K6UaDF9R9ceskwd//S2xdcZaYNBNDIb9ywzjz7JM12raL5LcsY/tGRYV49+/q87R71FLWkpqM0HeVZt0NVI6fqcgd5385NlCTuQRRNndxNnHU+mbCku2fz3l1a9F3EafOZ5y7tPNsyi6jt/rlb1nDgUze0HVoLjQJsUdxt0wnpzrNP6joPUVSemnfZho0MD3HPprcv2O5xYTZrQZmnAK2qUK7LHeQDWTNI04mY9QAoayhhEfcWNNNXxFV22nzmObHKukKLq220+86kNP/OzQv7dNqlO8s+KXL+pjLF5SlqWesNjRDfVJJUUBZd4437zkVm7DkwXVrtoC7zag1kMIALhW/WgzBOWVcVRRTmRTYbpM1nnpO1jG2Zp9kq7RDBNAV1a+H+2VvWxP5+VJrr8FD6OHHHWRE3cKVZ37hw82AnFzxJE9CVedd1XYaZDmwwaCoqKpc5lC1LYV52W3vafObZrkVuy+b2iPq+tLWNtEME211YZA1IZczfVFdZC8TWIFzEXePNdaMGCpTdd1BFP0+rgewzCEtqr0+jOQNm1Kyb3T5Ju9HWHjcjavOKrPlbebZrUf0c4e0RJ01tI21e2qW7XXt/62yqcekOP5R+dGSYi4cXcdeug7WcgTWLzWvH5s322e5Yaa4f1efQaT/K5rVjvB4zUKDqpriyDXzNAKKjcicjZsJXbUVMUZFVN0ZDZLki6/Rqp6hqc7ubxCB9bSNNXtqlO6n5K0uTUPgekaSaRreGmFY9hLjKu677iYJBhLTV+bhqfFGTrRU1ZUZZt+BHXb0WFXyKqDa3y3cZNbekdCcVMp00CbWraXRjttE6zGraaeEdd36V0R/RCwa+mShK2uF7ZRa+nTT5dHuIWpb8V/FAmaR8F/mwo7SSmpHaTekd1TyVtP27NQS1DkNdO2lWTDq/ws2CQGTtt5eb4+IoGETIMmImStnzFcUp+p6CdtLmP+rEu3PXQdbe+2ipJ1Xc9vjcLWtStUkXrZP7SZq1zKh29KTt361aYh1umOqkf6rd+VVGf0TdqZkoQjdGzLTTyUnW7SFqafMf13b/8pm5gRiy15qmou4nSfpM3AiqomuJdWlfz9qsmHf69umgn6efmosUDCKkPTGTbrSJe5RkWp2eZN0copa2sE0KYGn6GNpNSJb0+3UYspdGJ4Gr3We6cSNTXW6Yyirv9O1AzzzxLa2BnKgujU5HSBQ1
2VVVk2aVod2MlkkTtCVtB4gu8HpxG5WhWxPadWM0URmziaY5dto94bDsJ/NloVlLa6aoGQih+iF7RclzQpU5o+sgq+Jio+oLrE7Ts+fANHfGTBjZjdl/09KspTVTZIdarzRxtNPMwz17j8x7Ohi0b1LoZHv2+81BRYjrQL1n75FSLkDyDEEt6/6ZtOfX5rVjXeuDqZJGExWsLjMQ1s3mtWMcvPsGPnfLmkyjPpK2p7Z15+IC5szsXCl3sHc6BHXPgelaTM7X7ZF6VVDNoGC92qHWLVlrO+22p7Z1Z9I+f6Comwg7qeE1axNx0gb9Ippb6zgyrWgKBgUbhIOmm9JsT23r7OJm6IzSyRV4awE8eskwL5+ZW7BeUoGeNJ1I2qBf5B3S/dJsG0cdyCIDqrXAPvPa2cgCO2uHfFSH7/AiA4O5cxfKm3adwEmP0vxcwtTfYUUO6Kg7dSCLSEdar3TjRu1kbXaLuqKfe90ZHRnmxy5anLoWF9eUNRZ6jkQ7dbhDulcoGIgIUFwTZ1xB+6PZOQ7efUPq7ymi/60ud0j3AgUDETmviHbxogrgIoKTBnSkp2AgIoUqsgDOG5w0oCM9BQMRKVSVBXDcMFIV/u0pGIhI4aoogOvwoJ1epjuQRaQv1OFBO71MwUBE+oKGkeajYCAifUFzVeWjYCA9o4rnKEvvGITJ5MqkDmTpCeoclHY0jDQfBQPpCWXNaS/9RcNIO6dmIukJ6hwUKZeCgfQEdQ6KlCtVMDCzjWY2ZWbHzWxbxPu3m9lpMzsY/PtI6L3fMrMjZnbMzH7XzKzIDMhgUOegSLna9hmY2RBwH/B+4CSwz8z2uvvRllV3ufuWls/+A+A64Jpg0f8E3gd8M2e6ZcCoc1CkXGk6kNcBx939OQAzewi4EWgNBlEcuBhYAhgwDPzvzpIqg06dgyLlSdNMNAacCL0+GSxrdZOZPWNmj5jZcgB3/wvgCeCHwb8Jdz/W+kEzu8PMJs1s8vTp05kzISIi+aQJBlFt/K1Po/sqsMLdrwG+ATwIYGZvBX4aWEYjgKw3s/cu+DL3+9193N3Hly5dmiX9IiJSgDTB4CSwPPR6GXAqvIK7v+jurwYvvwBcG/z9j4Gn3P0Vd38F+DPg3fmSLCIiRUsTDPYBq8xspZktAW4F9oZXMLPLQy83Ac2moL8G3mdmi81smEbn8YJmIhERqVbbDmR3P2tmW4AJYAh4wN2PmNm9wKS77wU+bmabgLPAS8DtwccfAdYDh2k0LX3d3b9afDZERCQPc29t/q/W+Pi4T05OVp0MEZGeYmb73X2808/rDmQREVEwEBERBQMREUHBQEREUDAQEREUDEREBAUDERFBwUBERFAwEBERFAxERAQFAxERQcFARERQMBARERQMREQEBQMREUHBQERESPGkM5E62nNgmp0TU5yameWK0RG2bljN5rVjVSdLpGcpGEjP2XNgmu27DzM7dw6A6ZlZtu8+DKCAINIhNRNJz9k5MXU+EDTNzp1j58RURSkS6X0KBtJzTs3MZlouIu0pGEjPuWJ0JNNyEWlPwUB6ztYNqxkZHpq3bGR4iK0bVleUIpHepw5k6TnNTmKNJhIpjoKB9KTNa8dU+IsUSM1EIiKiYCAiIgoGIiKCgoGIiKBgICIiKBiIiAgKBiIigoKBiIigYCAiIqQMBma20cymzOy4mW2LeP92MzttZgeDfx8JvXelmT1qZsfM7KiZrSgu+SIiUoS201GY2RBwH/B+4CSwz8z2uvvRllV3ufuWiK/4IvDv3P0xM3sD8HreRIuISLHS1AzWAcfd/Tl3fw14CLgxzZeb2dXAYnd/DMDdX3H3Mx2nVkRESpEmGIwBJ0KvTwbLWt1kZs+Y2SNmtjxYdhUwY2a7zeyAme0MahoiIlIjaYKBRSzzltdfBVa4+zXAN4AHg+WLgfcAnwB+BngLcPuCHzC7w8wmzWzy9OnTKZMuIiJFSRMMTgLLQ6+XAafCK7j7i+7+avDyC8C1
oc8eCJqYzgJ7gHe1/oC73+/u4+4+vnTp0qx5EBGRnNIEg33AKjNbaWZLgFuBveEVzOzy0MtNwLHQZy81s2YJvx5o7XgWEZGKtR1N5O5nzWwLMAEMAQ+4+xEzuxeYdPe9wMfNbBNwFniJoCnI3c+Z2SeA/25mBuynUXMQEZEaMffW5v9qjY+P++TkZNXJEBHpKWa2393HO/287kAWEREFAxERUTAQEREUDEREBAUDERFBwUBERFAwEBERFAxERAQFAxERQcFARERQMBARERQMREQEBQMRESHFFNYiImXZc2CanRNTnJqZ5YrREbZuWM3mtVFP1ZWyKRiISCX2HJhm++7DzM6dA2B6Zpbtuw8DKCBUQM1EIlKJnRNT5wNB0+zcOXZOTFWUosGmYCAilTg1M5tpuZRLwUBEKnHF6Eim5VIuBQMRqcTWDasZGR6at2xkeIitG1ZXlKLBpg5kEalEs5NYo4nqQcFARCqzee2YCv+aUDORiIgoGIiIiIKBiIigYCAiIigYiIgICgYiIoKCgYiIoGAgIiIoGIiICAoGIiKCgoGIiKBgICIipAwGZrbRzKbM7LiZbYt4/3YzO21mB4N/H2l5/41mNm1m/6mohIuISHHazlpqZkPAfcD7gZPAPjPb6+5HW1bd5e5bYr7mN4E/z5VSEREpTZqawTrguLs/5+6vAQ8BN6b9ATO7FvhbwKOdJVFERMqWJhiMASdCr08Gy1rdZGbPmNkjZrYcwMwWAb8NbM2dUhERKU2aYGARy7zl9VeBFe5+DfAN4MFg+ceAP3X3EyQwszvMbNLMJk+fPp0iSSIiUqQ0Tzo7CSwPvV4GnAqv4O4vhl5+AfhM8PffB95jZh8D3gAsMbNX3H1by+fvB+4HGB8fbw00IiJSsjTBYB+wysxWAtPArcCHwyuY2eXu/sPg5SbgGIC7/0ponduB8dZAICIi1WsbDNz9rJltASaAIeABdz9iZvcCk+6+F/i4mW0CzgIvAbeXmGYRESmYuderVWZ8fNwnJyerToaISE8xs/3uPt7p53UHsoiIKBiIiIiCgYiIoGAgIiIoGIiICAoGIiKCgoGIiKBgICIiKBiIiAgKBiIigoKBiIigYCAiIigYiIgINZy11MxOAz+o6OcvA/6mot8uWr/kpV/yAcpLXfVLXla7+493+uE0D7fpKndfWtVvm9lknilg66Rf8tIv+QDlpa76JS9mlmvufzUTiYiIgoGIiCgYtLq/6gQUqF/y0i/5AOWlrvolL7nyUbsOZBER6T7VDEREZHCCgZktN7MnzOyYmR0xs38RLN9pZs+a2TNm9sdmNhr6zHYzO25mU2a2obrUzxeXl9D7nzAzN7PLgtdmZr8b5OUZM3tXNSlfKCkvZvbPg21/xMx+K7S8dvsl4fhaY2ZPmdlBM5s0s3XB8jrvk4vN7NtmdijIy78Jlq80s6fN7HtmtsvMlgTLLwpeHw/eX1Fl+sMS8vKl4Pj5rpk9YGbDwfKe2y+h93/PzF4Jvc62X9x9IP4BlwPvCv7+ceAvgauBG4DFwfLPAJ8J/r4aOARcBKwEvg8MVZ2PpLwEr5cDEzTu1bgsWPYB4M8AA94NPF11HlLsl+uBbwAXBe+9uc77JSEfjwI/H9oP3+yBfWLAG4K/h4GngzQ+DNwaLP888NHg748Bnw/+vhXYVXUeUuTlA8F7BvxRKC89t1+C1+PAHwCvhNbPtF8Gpmbg7j909+8Ef/9f4Bgw5u6PuvvZYLWngGXB3zcCD7n7q+7+PHAcWNftdEeJy0vw9meBfwWEO4NuBL7oDU8Bo2Z2eTfTHCchLx8Fdrj7q8F7LwQfqeV+SciHA28MVnsTcCr4u877xN29eYU5HPxzYD3wSLD8QWBz8PeNwWuC9/+hmVmXkpsoLi/u/qfBew58m/nnfU/tFzMbAnbSOO/DMu2XgQkGYUF1aS2NyBr2z2hcFUDjRD4Reu8kFwrc2gjnxcw2AdPufqhltZ7LC3AV8J6gevvn
ZvYzwWq1z0tLPu4EdprZCeA/ANuD1WqdDzMbMrODwAvAYzRqYDOhC6dwes/nJXj/R8BPdjfF8Vrz4u5Ph94bBv4p8PVgUU/tlyAvW4C97v7DltUz7ZeBCwZm9gbgK8Cd7v5/Qss/CZwFvtRcFPHxWg29CueFRto/CXwqatWIZbXNS7BfFgOX0qiqbwUeDq5qap2XiHx8FLjL3ZcDdwG/31w14uO1yYe7n3P3NTSumNcBPx21WvB/T+XFzN4Revs/A//D3b8VvO61vLwX+CfA70WsnikvAxUMgquArwBfcvfdoeW3Ab8I/EpQbYTGFcHy0MeXcaGKX7mIvPwdGm3oh8zsr2ik9ztm9lP0Xl6gkebdQdX428DrNOaQqW1eYvJxG9D8+8tcaNKqbT7C3H0G+CaNoDxqZs0pbMLpPZ+X4P03AS91N6XthfKyEcDM7gaWAv8ytFqv7ZfrgbcCx4Pz/hIzOx6slmm/DEwwCK4qfx845u6/E1q+EfjXwCZ3PxP6yF7g1qBHfiWwikbbYuWi8uLuh939ze6+wt1X0DgQ3uXu/4tGXn41GCnxbuBHEVXKSsTtF2APjTZqzOwqYAmNycRquV8S8nEKeF/w93rge8Hfdd4nSy0YVWdmI8A/otEH8gTwy8FqtwF/Evy9N3hN8P7joYuqSsXk5Vkz+wiwAfiQu78e+kiv7Zf97v5TofP+jLu/NfhItv2S1LvcT/+An6VRRXoGOBj8+wCNDsgToWWfD33mkzTaSqcIRoTU4V9cXlrW+SsujCYy4L4gL4eB8arzkGK/LAH+EPgu8B1gfZ33S0I+fhbYT2ME1NPAtT2wT64BDgR5+S7wqWD5W2gE3uM0ajnNkV4XB6+PB++/peo8pMjL2WDbN/dVc3nP7ZeWdcKjiTLtF92BLCIig9NMJCIi8RQMREREwUBERBQMREQEBQMREUHBQEREUDAQEREUDEREBPj/ZaRKL81cPccAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "scatter(cc.x, cc.ratio)\n", - "cc.ratio.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAD8CAYAAABkbJM/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAHfVJREFUeJzt3X+sHeV95/H3F/sCF7LlQjABrmHtNoSUlhInd2kk9kdCAia0jS2apK6qXUtthZTdrBY2ZWuENqRkd3HqdlO1TYWsJBJNUWI2ZJ1bkdY1v9rdShCusZ3ECV67+bH2hQ2ujGkTO8F2vvvHeY4ZH8+PZ87M3Jlzz+clWT5nZs6c57kzZ77Pr3nG3B0REZEiZ7WdABERGQ0KGCIiEkUBQ0REoihgiIhIFAUMERGJooAhIiJRFDBERCSKAoaIiERRwBARkShL207AMC6++GJfsWJF28kQERkpO3bs+Ht3Xzbs50cyYKxYsYK5ubm2kyEiMlLM7LtVPq8mKRERiaKAISIiURQwREQkigKGiIhEUcAQEZEoChgiIhJFAUNERKIoYIiISBQFDBERiaKAISIiURQwREQkigKGiIhEUcAQEZEoChgiIhJFAUNERKIoYIiISBQFDBERiaKAISIiURQwREQkigKGiIhEUcAQEZEotQQMM7vFzPaa2X4z25Cy/hwz2xLWP2NmK8Ly15vZk2b2fTP74zrSIiIizagcMMxsCfBJ4D3ANcCvmtk1A5v9BvCyu78R+ATw8bD8h8B/Bn6rajpERKRZddQwrgf2u/u33P1V4PPAmoFt1gAPhtdfAN5lZubuP3D3/00vcIiISIfVETCmgQOJ9wfDstRt3P0E8Arw+hq+W0REFkgdAcNSlvkQ2+R/idntZjZnZnOHDh0q81EREalBHQHjIHBF4v1y4IWsbcxsKXABcLjMl7j7ZnefcfeZZcuWVUiuiIgMo46A8SxwlZmtNLOzgXXA7MA2s8D68Pp9wBPuXqqGISIi7VpadQfufsLMPgRsA5YAn3H3PWZ2HzDn7rPAp4HPmtl+ejWLdf3Pm9l3gJ8AzjaztcDN7v6NqukSEZF6VQ4YAO7+ZeDLA8s+knj9Q+D9GZ9dUUcaRESkWbrTW0REoihgiIhIFAUMERGJooAhIiJRFDBERCSKAoaIiERRwBARkSgKGCIiEkUBQ0REoihgiIhIFAUMERGJooAhIiJRFDBERCSKAoaIiERRwBARkSgKGCIiEkUBQ0REoihgiIhIFAUMERGJooAhIiJRFDBERCSKAoaIiERRwBARkSgKGCIiEkUBQ0REoihgiIhIFAUMERGJooAhIiJRFDBERCSKAoaIiERRwBARkSgKGCIiEkUBQ0REoihgiIhIFAUMERGJUkvAMLNbzGyvme03sw0p688xsy1h/TNmtiKx7u6wfK+Zra4jPSIiUr+lVXdgZkuATwI3AQeBZ81s1t2/kdjsN4CX3f2NZrYO+DjwK2Z2DbAO+BngcuAxM3uTu5+smi5ZnLbunGfTtr28cOQYl09Nctfqq1m7arrtZC2Icc67dEMdNYzrgf3u/i13fxX4PLBmYJs1wIPh9ReAd5mZheWfd/cfufu3gf1hfyJn2Lpznru/+DXmjxzDgfkjx7j7i19j6875tpPWuHHOu3RHHQFjGjiQeH8wLEvdxt1PAK8Ar4/8rAgAm7bt5djx0yufx46fZNO2vS2laOG
Mc96lO+oIGJayzCO3iflsbwdmt5vZnJnNHTp0qGQSZTF44cixUssXk3HOu3RHHQHjIHBF4v1y4IWsbcxsKXABcDjyswC4+2Z3n3H3mWXLltWQbBk1l09Nllq+mIxz3qU76ggYzwJXmdlKMzubXif27MA2s8D68Pp9wBPu7mH5ujCKaiVwFfCVGtIki9Bdq69mcmLJacsmJ5Zw1+qrW0rRwhnnvEt3VB4l5e4nzOxDwDZgCfAZd99jZvcBc+4+C3wa+KyZ7adXs1gXPrvHzB4GvgGcAP6dRkhJlv6IoHEcKTTOeZfusF5Bf7TMzMz43Nxc28kQERkpZrbD3WeG/bzu9BYRkSgKGCIiEkUBQ0REoihgiIhIFAUMERGJooAhIiJRFDBERCSKAoaIiERRwBARkSgKGCIiEkUBQ0REoihgiIhIlMqz1YrIwtPzvaUNChgiI6b/fO/+I1v7z/cGFDSkUWqSEhkxer63tEUBQ2TE6Pne0hY1SUlnqZ0+3eVTk8ynBAc931uaphpGh2zdOc8NG59g5YZHuWHjE2zdOd92klrTb6efP3IM57V2+nH+m/Tp+d7SFgWMjtAF8nRqp8+2dtU09992LdNTkxgwPTXJ/bddq9qXNE5NUh2Rd4FcDBeCss1LaqfPt3bV9KI4L2S0jE3A6Hp7+GK+QA4zDLTNdvqunyvjRMeiW8aiSWoUmnuyLoSLoSNzmOalttrpR+FcybLY+sBG+VgsVmMRMGIuWG3/2BZzR+Ywtae22ulHte9kMV5cR/VYDKvta1CMsWiSKrpgdeHO2f73tFH9brraP2zzUhvt9F1pGix7TKr2gXWx6acrx2IhdOEaFGMsAkbRBWuhOpyLfpRtXCCbPlG37pznBz86ccbyrtaeunCPwzDHpMrFtasXqy4ci4WSdQ368MO7ge4EjbEIGHetvvq0HwScfsFaiJJMnT/KOkuDdQbLwXS9883LeGTH/Bn7v/C8Ce79pZ9J3X9deSvaT9b6onNl2O8rs90wx6TKxbWrI/SGPRaxulSryrrWnHTvRPDuG4uAUdTcsxAlmbp+lHWXBusKlmnpeujp/4unbHve2UszL6Z15K1oPzHfU+ZCkrW/ue8e5snnD2UG0Kz8DXNMqlxcu9r002QzbddqVVnXIOhG8O4bi4AB6c09/RLG/JFjGJx2ccv6sQ1bKsn68c0fOcbWnfPRJ0PdpcG6gmVautKCBWT/LerKW9F+itaXbRrM2l8yYGYF0LT8DXNMqlxcy3zfQpfKm2qm7VqtKi3gJ7UdvPvGJmAMGixhOJwKGtMZP4QqpZK8EsSdW3Zxx5Zdmd+blBd4btj4ROkfclbJ9J1vXlZqf2VO6KwLX10l3aL9lPmemAtk1v4Gg0NsAB22tjDsxTX2+7pWKq+ia7Wq/t/vww/v5qSfeaZ0pd9mLIbVpskqEU9PTfK3G25M/QFUGeaXNmw2+b0QNxQy68Sx8PmyQyrThq/+8tumeWTHfKn95aUrKRmMBocP1nUvStF+Yr8nZqjq1p3znGWDuSxn8HsXekhxzPdt3TnPhx/evWiGuXbxvqe1q6b5/Q9c1+nh9eYp0azrZmZmfG5urtI+Vm54NLPENz01yQtHjnHB5ARmcOTo8dwaggHf3vgLhd+5dec8d2zZVbhdP2j1P1PUkTzYnJa2nzJu2PhEal7z9jdY+oTeif7Lb5vObcfvb3f/bdcCpO6j7MUyKy39/RStj/07pO2nSFrTZ9fngSrKZ+z5P7jPNjucY8+BNjT5tzGzHe4+M+znx7ZJKisA9EvqAEeOHT+1PK2fI7mvGP3286zA05d3f8gjO+bPuAhn7a/uJpaiG+2guA39ho1PpJZSPzq7h/PPWcqx4ydZYsZJ96gmupi09AP/nVt2sWnbXu5afTX333bt0H+Hfr9TWo0zT1oAXcgLZdkLUbKPL0/ZUnnsIIEm/zZ1dqjXfYHv8jxhYxcw8jq6swJCn6dsU7a6WNS5BcX3hzz5/KHTSvp
ZJeGsJpaiNugmb7TLuggfOXb8VIA+6X7q7zrsD6eflqw833/btYW1r7xgXLZmAbRaei3T/7B15zwfnd1zWoEpyzBDjo++eiJqkEDT/SN5A2GqjpBrMt1tGqs+jGSbNLwWAKDXzBDTONfv5xi2bTnZXgzpbfxl7w/J6h85+uqJ09rbY/tgYqYp6U9jsGLDo/zU3V9mReR0BrGl0TJt43lTKhTlOe+zef1O/ZpQrOmpyVYvILHHvv8biQkWS8wKz/+0fqCXj6bvO2sE2UIZZnqVYfs1R2EakDRjVcMo6ujOKqknDdsvkJQs2aSVaKBXa8gKYGmdpMAZpcKXjx4/rbQTG4CKquuDpar+qI6Y0lVMDSsrXWmKSnh5eS76bD8PWf1O/ZpQMi8TZxkYHD/52tFro9Ny8LzKOq8HR9ellf7TxLb3l222G7SQo5aGGWo7TPPtKNdKKtUwzOwiM9tuZvvC/xdmbLc+bLPPzNYnlv9XMztgZt+vko5YRQc3r0QJzfzw166a5m833Mi3N/7CqUCUrAXFpmHtqmnOP+fM+J8s7ZQZGTKYruSJnHcRKCpdpY3IufC8ieh0DSoq4eXlOaZ0uHbV9Kna4KB+DTOZl03vv45N77uu1YcbpZWUs+pCg6Prskr/SWXyFHvBz0pf8vg1XSof5uI/zGirUZ5UsWoNYwPwuLtvNLMN4f1vJzcws4uAe4EZegX6HWY26+4vA38O/DGwr2I6ohS1zWd1lvZHSQ2WspsYyZB3MS7qBI4JiHVMtVB0EShaP9h2nDVipY67lPPyfGdGzaHMfRFZ/TZda35K638r6rMbNMwoorzaTXJwQ9bouf45sBBznp0V0jPogsmJzHuShvlNde0ekDKqBow1wDvC6weBpxgIGMBqYLu7HwYws+3ALcDn3P3psKxiMuLEHNyYjtsmT96sk8Zg6E7a5Al/weQE506clRoEi/SDZNFFpuyomSbuUnZ6zXp5I6KyRgBlNfl1Zd6hInk3EvaHjOddyNPkzf+VJWviyb7BwQ0z//SizL9xk5Pz9X/PacFi4izjB6+eONXUm9VsWWbesqnzJlJrcl25OS9P1YDxBnd/EcDdXzSzS1K2mQYOJN4fDMtKMbPbgdsBrrzyyiGSWt8Pv8lpBapM1ZEWEAdP+CPHjjM5sYRP/MpbSo0Qib3nYNhmu2GGEhZdkIpGRJUpHXZ5qOOgrHNosP8tq89uanKC889ZWnr47WA/XMz5EjslS5OT82XV6peY8bpzl55xcR/8reelO61wOXGWMbHEovq52r5fZVBhwDCzx4BLU1bdE/kdadWH0ncLuvtmYDP0btwr+/m+On74TVYpq0zVkRYQj756ovCEh+JaU15TWdX7JoYRG8DyAvmo1RxiFQXCojnUPvre+JpE1nlz7sRZ0Z3dMb+bJifny/r+H7tzJKNPJ/a3nva7Of5jjwrKaX/bO7bs4nf+fE/p2l5dCgOGu787a52Zfc/MLgu1i8uAl1I2O8hrzVYAy+k1XY2smGaQKvcPwOkXsdhZTvvvk8tWbng09XsGT/iiWlNeU9nf3X9rfAZrUmb0TdENh6MeIAblBcJh5lDLk3XelBkZNWztOSl5jMuWyotq9VUm58w69145dpxd996c+9msc3xw9ONCqtokNQusBzaG/7+Uss024L8lRlDdDNxd8XtblXfy1tGfMXgRy7o7OqZUFdvEVVRramIK+CrV7TomO6wzPV2TFQiLhpaXsXXnfKl+EBj+xtd+Xoom50srlRdN7llUI6syOWeV303eOd7WzLpVb9zbCNxkZvuAm8J7zGzGzD4FEDq7PwY8G/7dl+gA/10zOwicZ2YHzeyjFdOzIAZvvhtU9xC5ohlq84YZZt2ENzgB4FTB0Na6nzle9RnUZSY7jEnjYnwmdpq6n3+SZWpyIvV8+bW3Xzn0kOO1q4on58ubZj/rmKYN9e6nK21dmck5i343eUOFi4JKG6OqxnbywbpkTWI4zIRsWbI6J2M
nsouZwDDrhrPk/uosgQ8zweFgnmImO4xNY9X0dFXatBxpI3TK5jPvJtfkZJJNPfwoa795k4r2VT2mZc+VZJ9RzFDivEkyq+ZDkw+2rImmmkFpVea0MfRZ1dSYJq6Yjrg62/urlnTr7rBuemx8nY9wLfOdVUbo5Mn7uyQLGU00meSdhzHDhase07LnSj+tMU+kHBw1BmfO4ADtTXmugFFR088dhvSLY5kZamO3iemIixFz0asj0NYZwJoM/DGPjE0btVS1P6zKCJ0ieUN3F/qu9qLa86Cqx3SYc2XYJ1L2z/Gu9K8pYFS0UEMz02oJw17g2rw49i1EoC2jyfQUTQUxOGopbbthzqcmCwZdOH5F0/+XefRyGXXe3Z0ma6qeLgzAUMCoQRsHs8oPtq2L42DzVn/7tktNTacnb9BC1qifmM8XabJgkPX3AoZ6VPCgmBJ1zPT/TZTM67y7u4mA1iR1eo+wKj+Gpqq4CzEIYNTEDlrIMmwn7UI/Va6u74vdT14H9x+kzGTQtLwbIrMGlSz0A7XU6T3GqtRsyny2THBZiEEAoyZ20EKaKiXOha7FDTtlTuwDlgb3U/SAK1i4G9vSbohMqqvvqG0KGJKr7ESLXWjbblrZ2lmZQQtQ7c7rtO9eqItSXc+GiN1/3g20gwGmrhp11n5iZh6oa1BJmxQwJFfZUmPX+ibqNuxMxbGDFpaY8fsfuG4k/151jR7K239S/2+U9YCrfoCpa3bpvP3Ezoc16sbqEa1S3jClxrWrsh++NOrqevhN1h3AwwSLrjzuc5jZAGI79PNmEc6acaF/ga7rmOXtpygYLJZatgKG5BrmiWKLWV03+OVNR1FGl6Y0GSZPWefR1ORE9H6KAtWwx2wwEOfd+5SWhv40NW08dbEpapJqWJs33NTx3ePQJ1FGnZ36dfQvNPlslmGUzVPW+VVmivWiZtBhjlla81PWQIXLEzcrLtam2D4FjAa1+bD3ur57XH4IsboWQEf5cZ9Q3/mVF6iGOWaxj7lN7qcrN9c1SQGjQW2W/ur87nH4IcTqWgBdDMOYmz6/hjlmsY+5HbfCkwJGg9os/Y16ybPLuhRAu1bj6aqyxyz2MbfjRp3eDWqzw1id1eOhrs5zOV3dz39ZLFTDaFCbpT+VPMdHl2o8i0XXmh67QgGjQW2edDrhRapRID6TJh+M0MbDb6R+Oj4y7jT5YMNih6e2OYRWiun4iFSnTu8CsdMK1DX9gDRDx0ekOgWMArHDUzWMtdt0fESqU8AoEDs8NWs7h1YnhJMeDTMWqU4Bo0DseOy07franBBOejSuXqQ6BYwCsTdGJbdLo/bydukGN5HqNKy2AXqutYh0kYbVdtBimBBOZLHS/TjDU5NUA9ReLtJNXXrg1ChSwGiA2stFukn341SjJqmGaB4ake7R/TjVqIYhImND9+NUo4AhImND/YvVqElKRMaGpv2vRgFDRMaK+heHpyYpERGJooAhIiJRKgUMM7vIzLab2b7w/4UZ260P2+wzs/Vh2Xlm9qiZPW9me8xsY5W0iIhIs6rWMDYAj7v7VcDj4f1pzOwi4F7g54HrgXsTgeX33P3NwCrgBjN7T8X0iIhIQ6oGjDXAg+H1g8DalG1WA9vd/bC7vwxsB25x96Pu/iSAu78KPAcsr5geERFpSNWA8QZ3fxEg/H9JyjbTwIHE+4Nh2SlmNgX8Er1aSiozu93M5sxs7tChQxWTLSIiZRUOqzWzx4BLU1bdE/kdlrLs1OzfZrYU+Bzwh+7+rayduPtmYDP0pjeP/G4REalJYcBw93dnrTOz75nZZe7+opldBryUstlB4B2J98uBpxLvNwP73P0PolIsIiKtqNokNQusD6/XA19K2WYbcLOZXRg6u28OyzCz/wJcANxRMR0iItKwqgFjI3CTme0DbgrvMbMZM/sUgLsfBj4GPBv+3efuh81sOb1mrWuA58xsl5n9ZsX0iIhIQ/SIVhGRMVH1Ea2601tERKIoYIiISBQ
FDBERiaKAISIiURQwREQkigKGiIhEUcAQEZEoChgiIhJFAUNERKIoYIiISBQFDBERiaKAISIiURQwREQkigKGiIhEUcAQEZEoChgiIhJFAUNERKIoYIiISBQFDBERiaKAISIiURQwREQkigKGiIhEUcAQEZEoChgiIhJFAUNERKIoYIiISBQFDBERiaKAISIiURQwREQkigKGiIhEUcAQEZEoChgiIhJFAUNERKIoYIiISJRKAcPMLjKz7Wa2L/x/YcZ268M2+8xsfWL5X5rZbjPbY2YPmNmSKukREZHmVK1hbAAed/ergMfD+9OY2UXAvcDPA9cD9yYCywfc/TrgZ4FlwPsrpkdERBpSNWCsAR4Mrx8E1qZssxrY7u6H3f1lYDtwC4C7/0PYZilwNuAV0yMiIg2pGjDe4O4vAoT/L0nZZho4kHh/MCwDwMy2AS8B/wh8oWJ6RESkIUuLNjCzx4BLU1bdE/kdlrLsVE3C3Veb2bnAQ8CN9Gogaem4Hbgd4Morr4z8ahERqUthwHD3d2etM7Pvmdll7v6imV1Gr6Yw6CDwjsT75cBTA9/xQzObpdfElRow3H0zsBlgZmZGTVciIgusapPULNAf9bQe+FLKNtuAm83swtDZfTOwzcxeF4IMZrYUuBV4vmJ6RESkIVUDxkbgJjPbB9wU3mNmM2b2KQB3Pwx8DHg2/LsvLDsfmDWzrwK76dVOHqiYHhERaYi5j17rzszMjM/NzbWdDBGRkWJmO9x9ZtjP605vERGJooAhIiJRFDBERCSKAoaIiERRwBARkSgKGCIiEkUBQ0REoihgiIhIFAUMERGJooAhIiJRFDBERCTKSM4lZWaHgO+28NUXA3/fwvc2QXnpnsWSD1Beuuhi4Hx3XzbsDkYyYLTFzOaqTNzVJcpL9yyWfIDy0kV15ENNUiIiEkUBQ0REoihglLO57QTUSHnpnsWSD1BeuqhyPtSHISIiUVTDEBGRKAoYgZldYWZPmtk3zWyPmf2HsHyTmT1vZl81s/9pZlOJz9xtZvvNbK+ZrW4v9afLykti/W+ZmZvZxeG9mdkfhrx81cze2k7Kz5SXFzP79+Fvv8fMfjexfKSOi5m9xcyeNrNdZjZnZteH5Z08LmZ2rpl9xcx2h3z8Tli+0syeMbN9ZrbFzM4Oy88J7/eH9SvaTH9STl4eCufP183sM2Y2EZZ38phAdl4S6//IzL6feF/+uLi7/vWa5S4D3hpe/xPg/wDXADcDS8PyjwMfD6+vAXYD5wArgb8DlrSdj7y8hPdXANvo3cdycVh2K/AXgAFvB55pOw8Rx+WdwGPAOWHdJaN6XIC/At6TOBZPdfm4hPS8LryeAJ4J6XsYWBeWPwB8MLz+t8AD4fU6YEvbeYjIy61hnQGfS+Slk8ckLy/h/QzwWeD7ie1LHxfVMAJ3f9Hdnwuv/xH4JjDt7n/l7ifCZk8Dy8PrNcDn3f1H7v5tYD9w/UKnO01WXsLqTwD/CUh2Xq0B/tR7ngamzOyyhUxzlpy8fBDY6O4/CuteCh8ZxePiwE+EzS4AXgivO3lcQnr6JdWJ8M+BG4EvhOUPAmvD6zXhPWH9u8zMFii5ubLy4u5fDusc+Aqn/+47d0wgOy9mtgTYRO93n1T6uChgpAhVs1X0InTSr9MrXUDvh34gse4gr12UOyOZFzN7LzDv7rsHNhu5vABvAv5FqEr/tZn9s7DZKOblDmCTmR0Afg+4O2zW2byY2RIz2wW8BGynV5M7kihcJdN6Kh9h/SvA6xc2xdkG8+LuzyTWTQD/GvjLsKizxwQy8/IhYNbdXxzYvPRxUcAYYGavAx4B7nD3f0gsvwc4ATzUX5Ty8U4NOUvmhV7a7wE+krZpyrLO5iUcl6XAhfSaBe4CHg6lo1HMyweBO939CuBO4NP9TVM+3om8uPtJd38LvZL39cBPp20W/u9sPuDMvJjZzyZW/wnwN+7+v8L7UcvLvwTeD/xRyual86KAkRBKE48AD7n
7FxPL1wO/CPxaqKJCr2RxReLjy3mtKaF1KXn5KXpt+rvN7Dv00vucmV3K6OUFemn+YqiGfwX4Mb25ckYxL+uB/uv/wWtNaJ3OC4C7HwGeohe4p8xsaViVTOupfIT1FwCHFzalxRJ5uQXAzO4FlgH/MbFZ548JnJaXdwJvBPaH3/15ZrY/bFb6uChgBKF0+mngm+7+3xPLbwF+G3ivux9NfGQWWBdGGqwErqLX1tm6tLy4+9fc/RJ3X+HuK+idLG919/9HLy//JowAeTvwSkr1tRVZxwXYSq/NHDN7E3A2vQniRuq4BC8A/yq8vhHYF1538riY2TILowXNbBJ4N73+mCeB94XN1gNfCq9nw3vC+icSBa9WZeTleTP7TWA18Kvu/uPERzp5TCAzLzvc/dLE7/6ou78xfKT8cSnqFR+Xf8A/p1cd+yqwK/y7lV6n6YHEsgcSn7mHXtvtXsIoly78y8rLwDbf4bVRUgZ8MuTla8BM23mIOC5nA38GfB14DrhxVI9LWL6D3uiuZ4C3dfm4AD8H7Az5+DrwkbD8J+kF5/30akr9EWznhvf7w/qfbDsPEXk5Ef7u/ePUX97JY5KXl4FtkqOkSh8X3ektIiJR1CQlIiJRFDBERCSKAoaIiERRwBARkSgKGCIiEkUBQ0REoihgiIhIFAUMERGJ8v8BAk7GOspTOp0AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "count 119.000000\n", - "mean -0.009820\n", - "std 0.002781\n", - "min -0.017557\n", - "25% -0.011426\n", - "50% -0.009475\n", - "75% -0.008116\n", - "max 0.000000\n", - "Name: ratio, dtype: float64" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "diff = cc.ratio - dd.ratio\n", - "scatter(diff.index, diff)\n", - "show()\n", - "diff.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_\"Only-dup + dynamic-update\"_ is ~1% worse than _\"prev-bucket + dynamic-update\"_.\n", - "\n", - "Is memory saving measurable?..." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "58.8 -> 46.2\n" - ] - }, - { - "data": { - "text/plain": [ - "count 121.000000\n", - "mean -13.828503\n", - "std 3.759391\n", - "min -39.489471\n", - "25% -14.576219\n", - "50% -13.533365\n", - "75% -12.564087\n", - "max 0.000000\n", - "Name: end_len, dtype: float64" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print('{:.1f} -> {:.1f}'.format(dd.end_len.max() * CACHEENTRY_SIZE / MB, cc.end_len.max() * CACHEENTRY_SIZE / MB))\n", - "((cc.end_len - dd.end_len) * CACHEENTRY_SIZE / MB).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121.000000\n", - "mean 0.559057\n", - "std 0.090338\n", - "min 0.113575\n", - "25% 0.548500\n", - "50% 0.560755\n", - "75% 0.571382\n", - "max 1.000000\n", - "Name: end_len, dtype: float64" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "(cc.end_len / dd.end_len).describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 
Stats" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " name | hit0 h25 h50 h75 h100 | mem0 m25 m50 m75 m100\n", - " d1-gap7 | 0.0 32.3 32.9 33.5 35.2 | 0.0 14.8 15.7 16.7 45.0\n", - " only-dup | 0.0 43.3 44.0 44.6 47.4 | 0.0 1.7 1.9 2.0 5.2\n", - " d1 | 0.0 45.3 46.1 46.9 49.7 | 0.0 14.9 15.7 16.8 45.0\n", - " d2 | 38.7 45.8 46.7 47.5 50.0 | 16.8 29.0 30.8 32.6 58.8\n", - "only-dup + dyn | 55.0 59.4 59.9 60.3 61.6 | 2.2 16.3 17.2 18.2 46.2\n", - " d1 + dyn | 55.7 60.4 60.9 61.3 62.6 | 16.8 29.0 30.8 32.5 58.8\n" - ] - } - ], - "source": [ - "def _():\n", - " print '{:>14s} | {:>4s} {:>4s} {:>4s} {:>4s} {:>4s} | {:>4s} {:>4s} {:>4s} {:>4s} {:>4s}'.format(\n", - " 'name',\n", - " 'hit0', 'h25', 'h50', 'h75', 'h100',\n", - " 'mem0', 'm25', 'm50', 'm75', 'm100'\n", - " )\n", - " \n", - " for name, stat in (\n", - " ('d1-gap7', d17),\n", - " ('only-dup', od),\n", - " ('d1', d1),\n", - " ('d2', d2),\n", - " ('only-dup + dyn', cc),\n", - " ('d1 + dyn', dd),\n", - " ):\n", - " r = stat.ratio.describe()\n", - " m = stat.end_len.describe() * CACHEENTRY_SIZE / MB\n", - " \n", - " print '{:>14s} | {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f} | {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f}'.format(\n", - " name,\n", - " r['min'] * 100, r['25%'] * 100, r['50%'] * 100, r['75%'] * 100, r['max'] * 100,\n", - " m['min'], m['25%'], m['50%'], m['75%'], m['max']\n", - " )\n", - "_()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Why is speedup so small if hit-rate is so good?!\n", - "\n", - "Something like ~15% speedup with ~50% hit-rate on a test set... Or ~33% speedup with ~60% hit-rate full bucket.\n", - "\n", - "Seems, CPU is spent on \"bytes\", not not \"pages\", so we should count hit-rate in bytes..." 
- ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " name | hit0 h25 h50 h75 h100 | mem0 m25 m50 m75 m100\n", - " d1-gap7 | 0.0 6.4 6.7 7.0 8.5 | 0.0 14.8 15.7 16.7 45.0\n", - " only-dup | 0.0 17.2 17.9 18.7 21.0 | 0.0 1.7 1.9 2.0 5.2\n", - " d1 | 0.0 19.0 19.7 20.8 23.1 | 0.0 14.9 15.7 16.8 45.0\n", - " d2 | 12.4 19.4 20.1 21.2 23.3 | 16.8 29.0 30.8 32.6 58.8\n", - "only-dup + dyn | 29.8 36.8 37.4 38.2 41.2 | 2.2 16.3 17.2 18.2 46.2\n", - " d1 + dyn | 30.4 37.5 38.2 39.0 42.0 | 16.8 29.0 30.8 32.5 58.8\n" - ] - } - ], - "source": [ - "def _():\n", - " print '{:>14s} | {:>4s} {:>4s} {:>4s} {:>4s} {:>4s} | {:>4s} {:>4s} {:>4s} {:>4s} {:>4s}'.format(\n", - " 'name',\n", - " 'hit0', 'h25', 'h50', 'h75', 'h100',\n", - " 'mem0', 'm25', 'm50', 'm75', 'm100'\n", - " )\n", - " \n", - " for name, stat in (\n", - " ('d1-gap7', d17),\n", - " ('only-dup', od),\n", - " ('d1', d1),\n", - " ('d2', d2),\n", - " ('only-dup + dyn', cc),\n", - " ('d1 + dyn', dd),\n", - " ):\n", - " r = stat.bratio.describe()\n", - " m = stat.end_len.describe() * CACHEENTRY_SIZE / MB\n", - " \n", - " print '{:>14s} | {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f} | {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f} {:>4.1f}'.format(\n", - " name,\n", - " r['min'] * 100, r['25%'] * 100, r['50%'] * 100, r['75%'] * 100, r['max'] * 100,\n", - " m['min'], m['25%'], m['50%'], m['75%'], m['max']\n", - " )\n", - "_()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/old_backend/AUTHORS b/old_backend/AUTHORS deleted file mode 100644 
index e395e880..00000000 --- a/old_backend/AUTHORS +++ /dev/null @@ -1,4 +0,0 @@ -Jacob Appelbaum -Arturo Filasto -Linus Nordberg -Isis Lovecruft diff --git a/old_backend/ChangeLog.rst b/old_backend/ChangeLog.rst deleted file mode 100644 index 428b4456..00000000 --- a/old_backend/ChangeLog.rst +++ /dev/null @@ -1,118 +0,0 @@ -ChangeLog -========= - -1.3.6 (Mon, 25 Mar 2019) ------------------------- - -* Implement the OONI bouncer v2.0.0 spec - -1.3.5 (Thu, 21 Feb 2019) ------------------------- - -* Disable collecting reports from ooniprobe-android 2.0.0 - -1.3.4 (Tue, 26 Sep 2017) ------------------------- - -* fix(report/handlers): accept more semver versions (#111) - -* README.rst: also apt-get install libdumbnet-dev (#108) - -1.3.3 (Thu, 2 Feb 2017) -------------------------- -* Add support for allows clients to send HTTP request headers - -* Ignore redirects to localhost - -1.3.2 (Mon, 30 Jan 2017) -------------------------- - -* Fix backward compatibility with legacy clients when stripping invalid tcp_connect fields - -1.3.1 (Thu, 26 Jan 2017) -------------------------- - -* Add support for intermediate certificate (#95) - -* Add support for specifying collector-alternate field (#92) - -* Move state of reports to filesystem (#91) - -* Strip invalid tcp_connect fields in web_connectivity test helper - -1.3.0 (Mon, 30 May 2016) -------------------------- - -* Add web connectivity test helper - -* Add support for HTTPS collectors and bouncers - -* Fix problems with priviledge shedding and daemonisation - https://github.com/TheTorProject/ooni-backend/issues/65 - -1.2.0 (Wed, 27 Apr 2016) -------------------------- - -* Add support for receiving JSON based reports - -1.1.4 (Wed, 1 Oct 2014) ------------------------ - -* Fix bug that lead test helpers to not being started - -1.1.3 (Mon, 29 Sep 2014) ------------------------ - -* Add support for specifying the report archive path from config file - -* Write tor notice level logs to file - -1.1.2 (Wed, 3 Sep 2014) 
------------------------ - -* Fix bug that lead oonib not running when a test helper was disabled. - -1.1.1 (Wed, 3 Sep 2014) ------------------------ - -* Fix daemonize API breakage when upgrading from Twisted <= 13.1 to >= 13.2 - https://trac.torproject.org/projects/tor/ticket/12644 - -* Make it possible to use a reports directory on a different volume than the - archive directory. - -1.1.0 (Tue, 2 Sep 2014) ------------------------ - -* Make changes to the bouncer API to make it aware of the policy of collectors. - -* Improve the bouncer API to make it more RESTful. - -* Add test helper that can be used to discover the DNS resolver being used by - the probe. - -* Code coverage and unittesting improvements. - -* Fix compatibility with latest txtorcon versions. - -1.0.2 (Wed, 21 May 2014) ------------------------- - -Various code improvements and fixes following the Least Authority release -engineering work. - -1.0.0 (Wed, 26 Mar 2014) ------------------------- - -First public release of oonibackend - -* Implements collector for receiver reports. - -* Implements HTTPReturnJSONHeaders HTTP helper - -* Implement DNS resolver helper - -* Implements TCP echo helper - -* Implements bouncer for directing directing probes to an adequate collector - and test helper. 
diff --git a/old_backend/Dockerfile b/old_backend/Dockerfile deleted file mode 100644 index 43ac22cc..00000000 --- a/old_backend/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -# Build: run ooni-sysadmin.git/scripts/docker-build from this directory - -FROM python:2.7.15-slim -ENV PYTHONUNBUFFERED 1 - -ENV PYTHONPATH /app/ - -# Setup the locales in the Dockerfile -RUN set -x \ - && apt-get update \ - && apt-get install locales -y \ - && locale-gen en_US.UTF-8 - -RUN set -x \ - && apt-get install gcc g++ make python-dev -y - -COPY requirements.txt /tmp/requirements.txt - -# Install Python dependencies -RUN set -x \ - && pip install -U pip setuptools \ - && pip install -r /tmp/requirements.txt - -# Install tor - -RUN set -x \ - && apt-get install tor -y - -# Copy the directory into the container -COPY . /app/ - -# Set our work directory to our app directory -WORKDIR /app/ diff --git a/old_backend/HACKING b/old_backend/HACKING deleted file mode 100644 index 44dfe015..00000000 --- a/old_backend/HACKING +++ /dev/null @@ -1,196 +0,0 @@ -Hacking on OONI -*************** - -This documents gives guidelines on where to start looking -for helping out in developing OONI and what guidelines you -should follow when writing code. - -We try to follow the general python best practices and styling -guides as specified in PEP. - - Beautiful is better than ugly. - Explicit is better than implicit. - Simple is better than complex. - Complex is better than complicated. - Flat is better than nested. - Sparse is better than dense. - Readability counts. - Special cases aren't special enough to break the rules. - Although practicality beats purity. - Errors should never pass silently. - Unless explicitly silenced. - In the face of ambiguity, refuse the temptation to guess. - There should be one-- and preferably only one --obvious way to do it. - Although that way may not be obvious at first unless you're Dutch. - Now is better than never. - Although never is often better than *right* now. 
- If the implementation is hard to explain, it's a bad idea. - If the implementation is easy to explain, it may be a good idea. - Namespaces are one honking great idea -- let's do more of those! - - - Tim Peters, The Zen of Python - -Code Structure ---------- - -- HACKING - The document you are currently reading. - -- oonib/ - Contains the OONI probe backend to be run on the ooni-net - -- ooniprobe.conf - The main OONI-probe configuration file. This can be used - to configure your OONI CLI, tell it where it should report - to, where the asset files are located, what should be used - for control, etc. - - -Style guide ------------ - -This is an extract of the most important parts of PEP-8. When in doubt on -what code style should be followed first consult this doc, then PEP-8 and -if all fails use your best judgement or ask for help. - -The most important part to read is the following as it contains the guidelines -of naming of variables, functions and classes, as it does not follow pure -PEP-8. - -Naming convention -................. - -Class names should follow the CapWords convention. -Note: When using abbreviations in CapWords, capitalize all the letters - of the abbreviation. Thus HTTPServerError is better than - HttpServerError. - -Exception names should follow the class names convention as exceptions -should be classes. - -Method names should follow camelCase with the first letter non-capital. - -Class attributes should also follow camelCase with the first letter non-capital. - -Functions should follow camelCase with the first letter non-capital. - -Functions and variables that are inside the local scope of a class or method -should be all lowercase separated by an underscore. - -Indentation -........... - - Use 4 spaces per indentation level. 
- - This can be set up in vi with: - set tabstop=4 - set shiftwidth=4 - set expandtab - - - Continuation lines should be wrapped like this: - - foo = long_function_name(var_one, var_two, - var_three, var_four) - - or this: - - def long_function_name(var_one, - var_two, var_three, - var_four): - print(var_one) - - - They should NOT be wrapped like this: - - foo = long_function_name(var_one, var_two, - var_three, var_four) - - and NOT like this: - - # See how it creates confusion with what is inside the function? - def long_function_name(var_one, - var_two, var_three, - var_four): - print(var_one) - - -Tabs or Spaces? -............... - -Every time you insert a \t into any piece of code a kitten dies. - -Only spaces. Please. - -(code should be run with python -tt) - -Maximum Line Length -................... - -Maximum of 79 characters. 72 characters for long blocks of text is recommended. - -Blank Lines -........... - -Separate top-level function and class definitions with two blank lines. - -Method definitions inside of a class are separated by a single blank line. - - -Encoding -........ - -Always use UTF-8 encoding. This can be specified by adding the encoding cookie -to the beginning of your python files: - - # -*- coding: UTF-8 -*- - -All identifiers should be ASCII-only. All doc strings and comments should also -only be in ASCII. Non ASCII characters are allowed when they are related to -testing non-ASCII features or for the names of authors. - - -Imports -....... - -Imports should be one per line as so: - - import os - import sys - from subprocess import Popen, PIPE - -Imports are always at the top of the file just after any module comments -and docstrings, before module globals and constants. - -Imports should be grouped in the following order: - -1. standard library imports -2. related third party imports -3. local application/library specific imports - -You should put a blank line between each group of imports. - - -Comments -........
- -Comments should always be up to date with the code. Don't have -comments that contradict the code. - -Comments should always be written in English. - -Block comments are indented to the same level as the code that -they refer to. They start with # and are followed by a single space. - -Use inline comments sparingly. # Gotcha? - - -Documentation strings -..................... - -Write docstrings for all public modules, functions, classes and -methods. Even better if you write them also for non-public methods. - -Place docstrings under the def. - -For a better overview on how to write docstrings consult: PEP-257 diff --git a/old_backend/LICENSE b/old_backend/LICENSE deleted file mode 100644 index a0c3c188..00000000 --- a/old_backend/LICENSE +++ /dev/null @@ -1,26 +0,0 @@ -Copyright (c) 2012, Jacob Appelbaum, Arturo Filastò, Isis Lovecruft -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The views and conclusions contained in the software and documentation are those -of the authors and should not be interpreted as representing official policies, -either expressed or implied, of the FreeBSD Project. diff --git a/old_backend/MANIFEST.in b/old_backend/MANIFEST.in deleted file mode 100644 index 18db7db7..00000000 --- a/old_backend/MANIFEST.in +++ /dev/null @@ -1,15 +0,0 @@ -include data/decks/README -include data/inputs/Makefile -include data/reports/empty.txt -include data/archive/empty.txt -include data/tor/empty.txt -include data/bouncer.yaml -include data/policy.yaml -include oonib.conf.example -include requirements.txt -include AUTHORS -include HACKING -include LICENSE -include MANIFEST.in -include README.rst -include ChangeLog.rst diff --git a/old_backend/README.rst b/old_backend/README.rst deleted file mode 100644 index 9c6d2c34..00000000 --- a/old_backend/README.rst +++ /dev/null @@ -1,307 +0,0 @@ -oonibackend: backend infrastructure for ooniprobe -================================================= - -.. image:: https://travis-ci.org/ooni/backend-legacy.png?branch=master - :target: https://travis-ci.org/ooni/backend-legacy - -.. 
image:: https://coveralls.io/repos/TheTorProject/ooni-backend/badge.png?branch=master - :target: https://coveralls.io/r/TheTorProject/ooni-backend - -oonibackend is used by ooniprobe to discover the addresses of test helpers (via -the bouncer) to submit reports to (via the collector) and to perform some -measurements that require a backend system to talk to (via test helpers). - -If you are interested in supporting the OONI project by running this backend -infrastructure follow this guide and then inform OONI developers of the address -of your collector and test helper by sending an email to -ooni-talk@lists.torproject.org. - -Dependencies and Installation -============================= - -Distro dependencies (Debian) ----------------------------- - -There are a few dependencies which we recommend you get from your -distribution's archives:: - - sudo apt-get install build-essential python-dev python-setuptools openssl libsqlite3-dev libffi-dev git curl libdumbnet-dev - -Tor -... - -You will need a Tor binary on your system. For complete instructions, see -also:: - - https://www.torproject.org/docs/tor-doc-unix.html.en - https://www.torproject.org/docs/rpms.html.en - -If you've already got Tor, or plan to compile it yourself from source, great! -You can skip this step. Otherwise, if you're installing Tor (or reinstalling), -you'll want to make sure to get our keyring package in Debian:: - - echo "deb http://deb.torproject.org/torproject.org wheezy main" | \ - sudo tee -a /etc/apt/sources.list - gpg --keyserver keys.gnupg.net --recv EE8CBC9E886DDD89 - gpg --export A3C4F0F979CAA22CDBA8F512EE8CBC9E886DDD89 | sudo apt-key add - - sudo apt-get update - sudo apt-get install deb.torproject.org-keyring tor tor-geoipdb - -Pip (>=7.0.0) -............. - -We recommend using the Pip>=7.0.0 because it included several important -security and privacy related patches: - - * It forces the use of HTTPS for [PyPI](pypi.python.org). 
- * It checks package hash sums before installation, with support for hashes - more collision-resistant than MD5. - * It does not fetch insecure metadata from external sources by default. - * It does not support an insecure index without explicit opt-in. - -The least painful way (that we know of) to install a newer Pip is to use Pip's -get-pip script:: - - # Grab the get-pip installer to make sure we have pip>=7.0.0 - curl -O https://bootstrap.pypa.io/get-pip.py - sudo python ./get-pip.py ## pip (>=7.0.0) is recommended for security reasons - # And make sure we're actually using the newer one: - sudo update-alternatives --install /usr/bin/pip pip /usr/local/bin/pip 0 - -Virtualenv -.......... - -We recommend that you use a python virtualenv. The recommended commands for -setting this up and installing are:: - - sudo pip install --upgrade virtualenv virtualenvwrapper - # Setup the virtualenv directory: - export WORKON_HOME=~/.virtualenvs && mkdir -p $WORKON_HOME - source /usr/local/bin/virtualenvwrapper.sh - # Clone ooni-backend: - git clone https://github.com/TheTorProject/ooni-backend.git && cd ooni-backend - # Create the virtualenv for ooni-backend... - mkvirtualenv -a $PWD --unzip-setuptools --setuptools --no-site-packages oonib - # ...and install ooni-backend (sudo is not necessary since we're in a virtualenv): - pip install -r requirements.txt - # Note: it is important that you install the requirements before you run - # the setup.py script. If you fail to do so they will be downloaded over - # plaintext. - python setup.py install - -Running an OONI collector -========================= - -Configure oonib ---------------- - -Copy the example config file to ``oonib.conf``:: - - cp oonib.conf.example oonib.conf - -Then edit your configuration to fit your needs.
The fields you should probably -end up changing are ``report_dir`` (the public web server directory where you -would like ooni-probe clients to be able to submit reports to, for example, if -the clients should submit POSTs to https://abcdef0123456789.onion/report then -this would simply be ``'report'``) and ``tor_datadir`` (where you would -like the spawned Tor process to keep its data). If you compiled Tor yourself, -you'll likely want to specify it for the ``tor_binary`` option. - -To configure the format of the log for the bouncer, collector and the HTTP based -test helpers you can specify the `log_format` option in the main section. - -By default this is the logging policy adopted:: - - [{protocol}] {status} {request_method} {request_uri} 127.0.0.1 {request_time}ms - -The supported keys are ``protocol``, ``status``, ``request_method``, ``remote_ip``, -``request_uri``, ``request_time``. - -Configure bouncer and collector endpoints -......................................... - -The bouncer and collector are HTTP applications ("protocols" in twisted terminology) that can be configured to run on top of plain TCP, TLS, or onion service endpoints. -Here is an example of the relevant part of the configuration:: - - bouncer_endpoints: - - {type: tls, port: 10443, cert: "private/fullchain.pem", privkey: "private/privkey.pem"} - - {type: tcp, port: 10080} - - {type: onion, hsdir: "/some/private/bouncer"} - - collector_endpoints: - - {type: tls, port: 11443, fullchain: "private/fullchain.pem", privkey: "private/privkey.pem"} - - {type: onion, hsdir: "/some/private/collector"} - -`scripts/gen-ssl-key-cert.sh` in this repo contains the openssl command to generate a self-signed certificate which you can use for the tls endpoint. -txtorcon will use the hostname/private_key from the configured hsdir to start an onion service, or generate a new key if hsdir is empty. - - -Bouncer configuration -..................... 
- -The bouncer.yaml file contains the list of collectors and test-helpers that are -available to ooniprobe for receiving network measurement results. - -In our deployment of oonibackend the bouncer.yaml file is generated -automatically every 24 hours via a cronjob that runs `update-bouncer.py`. -What this script does is it fetches the collector addresses and IP address of -mlab nodes and joins them with the base bouncer information stored in -`/data/bouncer/bouncer-base.yaml`. - -To specify additional test helpers (for example when they change address or -when a new test helper comes out) you will need to edit -`/data/bouncer/bouncer-base.yaml`.:: - - collector: - httpo://ihiderha53f36lsd.onion: - test-helper: {dns: '213.138.109.232:57004', ssl: 'https://213.138.109.232', tcp-echo: '213.138.109.232', traceroute: '213.138.109.232', web-connectivity: 'httpo://ckjj3ra6456muu7o.onion'} - -You need to edit the content of the dictionary `test-helper`. The keys are the -names of the test helpers. -The value is the address of the test helper and this depends on the type of -test helper. - -Here is a list of test helpers: - -* dns (value: xxx.xxx.xxx.xxx) - -* ssl (value: https://xxx.xxx.xxx.xxx) - -* tcp-echo (xxx.xxx.xxx.xxx) - -* traceroute (xxx.xxx.xxx.xxx) - -* web-connectivity (httpo://xxxxxxxxx.onion) - -Moreover it is possible to specify test-helper-alternate addresses that are -used to determine alternative names for a given test helper. - -Currently only `web-connectivity` supports the test-helper-alternate field. 
- -This can be specified like follows:: - - collector: - httpo://ihiderha53f36lsd.onion: - test-helper: {dns: '213.138.109.232:57004', ssl: 'https://213.138.109.232', tcp-echo: '213.138.109.232', traceroute: '213.138.109.232', web-connectivity: 'httpo://ckjj3ra6456muu7o.onion'} - test-helper-alternate: - web-connectivity: - - {address: 'httpo://ckjj3ra6456muu7o.onion', type: 'onion'} - - {address: 'https://web-connectivity.ooni.io', type: 'https'} - - {address: 'http://web-connectivity.ooni.io', type: 'http'} - - {address: 'https://xxxxxxxxx.cloudfront.net', type: 'cloudfront', front: 'a0.awsstatic.com'} - -Also collectors can have a set of alternate addresses. These can be -specified inside of the `collector-alternate` key under the collector -address like so:: - - collector: - httpo://thirteenchars123.onion: - collector-alternate: - - {address: 'https://a.collector.ooni.io', type: 'https'} - - {address: 'http://a.collector.ooni.io', type: 'http'} - - {address: 'https://xxxxxxxxx.cloudfront.net', type: 'cloudfront', front: 'a0.awsstatic.com'} - - -The currently supported types are 'https' and 'http'. - -Generate self signed certs for OONIB -.................................... -If you want to use the HTTPS test helper, you will need to create a -certificate:: - - openssl genrsa -des3 -out private.key 4096 - openssl req -new -key private.key -out server.csr - cp private.key private.key.org - # Remove passphrase from key - openssl rsa -in private.key.org -out private.key - openssl x509 -req -days 365 -in server.csr -signkey private.key -out certificate.crt - rm private.key.org - rm server.csr - -If you decide to put your certificate and key somewhere else, don't forget to -update oonib.conf options ```helpers.ssl.private_key``` and ```helpers.ssl.certificate``` ! - -Redirect low ports with iptables -................................ 
- -The following iptables commands will map connections on low ports to those -bound by oonib:: - - # Map port 80 to config.helpers['http-return-json-headers'].port (default: 57001) - iptables -t nat -A PREROUTING -p tcp -m tcp --dport 80 -j REDIRECT --to-ports 57001 - # Map port 443 to config.helpers.ssl.port (default: 57006) - iptables -t nat -A PREROUTING -p tcp -m tcp --dport 443 -j REDIRECT --to-ports 57006 - # Map port 53 udp to config.helpers.dns.udp_port (default: 57004) - iptables -t nat -A PREROUTING -p udp -m udp --dport 53 -j REDIRECT --to-ports 57004 - # Map port 53 tcp to config.helpers.dns.tcp_port (default: 57005) - iptables -t nat -A PREROUTING -p tcp -m tcp --dport 53 -j REDIRECT --to-ports 57005 - -(For Experts Only) Tor2webmode -.............................. - -**WARNING**: provides no anonymity! Use only if you know what you are doing! -Tor2webmode will improve the performance of the collector Hidden Service by -discarding server-side anonymity. - -You will need to build Tor from source. At the time of writing, the latest -stable Tor is tor-0.2.3.25. You should use the most recent stable Tor. - -Example:: - - git clone https://git.torproject.org/tor.git && cd tor - git checkout tor-0.2.3.25 - git verify-tag -v tor-0.2.3.25 - -You should see:: - - object 17c24b3118224d6536c41fa4e1493a831fb29f0a - type commit - tag tor-0.2.3.25 - tagger Roger Dingledine 1353399116 -0500 - - tag 0.2.3.25 - gpg: Signature made Tue 20 Nov 2012 08:11:59 UTC - gpg: using RSA key C218525819F78451 - gpg: Good signature from "Roger Dingledine " - gpg: aka "Roger Dingledine " - gpg: aka "Roger Dingledine " - -It is always a good idea to verify:: - - $ gpg --recv-keys C218525819F78451 - [...]
- $ gpg --fingerprint C218525819F78451 - pub 4096R/C218525819F78451 2010-05-07 - Key fingerprint = F65C E37F 04BA 5B36 0AE6 EE17 C218 5258 19F7 8451 - uid [ full ] Roger Dingledine - uid [ full ] Roger Dingledine - uid [ full ] Roger Dingledine - sub 4096R/690234AC0DCC0FE1 2013-05-09 [expires: 2014-05-09] - -Build Tor with enable-tor2web-mode:: - - ./autogen.sh ; ./configure --enable-tor2web-mode ; make - -Copy the tor binary from src/or/tor somewhere and set the corresponding -options in oonib.conf. - -To launch oonib on system boot ------------------------------- -To launch oonib on startup, you may want to use supervisord (www.supervisord.org) -The following supervisord config will use the virtual environment in -/home/ooni/venv_oonib and start oonib on boot:: - - [program:oonib] - command=/home/ooni/venv_oonib/bin/python /home/ooni/ooni-probe/bin/oonib - autostart=true - user=oonib - directory=/home/oonib/ - -Testing with vagrant --------------------- - -To test the deployment of oonib you may use [vagrant](http://www.vagrantup.com). - -Once installed you will be able to install oonib in the virtual machine via:: - - vagrant up diff --git a/old_backend/Vagrantfile b/old_backend/Vagrantfile deleted file mode 100644 index a2abe0d3..00000000 --- a/old_backend/Vagrantfile +++ /dev/null @@ -1,82 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : - -Vagrant.configure("2") do |config| - config.vm.box = "ubuntu/precise32" - - # Create a forwarded port mapping which allows access to a specific port - # within the machine from a port on the host machine. In the example below, - # accessing "localhost:8080" will access port 80 on the guest machine. 
- config.vm.network :forwarded_port, guest: 57001, host: 57001 - config.vm.network :forwarded_port, guest: 57001, host: 57002 - config.vm.network :forwarded_port, guest: 57001, host: 57003 - config.vm.network :forwarded_port, guest: 57004, host: 57004 - config.vm.network :forwarded_port, guest: 57005, host: 57005 - config.vm.network :forwarded_port, guest: 57006, host: 57006 - config.vm.network :forwarded_port, guest: 57007, host: 57007 - - # Create a private network, which allows host-only access to the machine - # using a specific IP. - # config.vm.network :private_network, ip: "192.168.33.10" - - # Create a public network, which generally matched to bridged network. - # Bridged networks make the machine appear as another physical device on - # your network. - # config.vm.network :public_network - - # Share an additional folder to the guest VM. The first argument is - # the path on the host to the actual folder. The second argument is - # the path on the guest to mount the folder. And the optional third - # argument is a set of non-required options. - config.vm.synced_folder ".", "/data/oonib" -end - -$setup_script = <