#! /usr/bin/env python

# run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
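# The merged blacklist is written to standard output; progress messages and
# errors are written to standard error.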

import argparse
import re
import sys
import urllib2


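# Extract domain names from the content of a downloaded list. Several common
# formats are recognized: AdBlock-style rules (||example.com^), plain lists of
# names, hosts-file entries (IP address followed by a name), CSV rows whose
# second field is a quoted name, and comma-separated rows starting with a name.
# When `trusted` is set (local files), entries are taken as-is and may include
# the "*" wildcard.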
def parse_blacklist(content, trusted=False):
    rx_comment = re.compile(r'^(#|$)')
    rx_inline_comment = re.compile(r'\s*#\s*[a-z0-9-].*$')
    rx_u = re.compile(r'^@*\|\|([a-z0-9.-]+[.][a-z]{2,})\^?(\$(popup|third-party))?$')
    rx_l = re.compile(r'^([a-z0-9.-]+[.][a-z]{2,})$')
    rx_h = re.compile(r'^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9.-]+[.][a-z]{2,})$')
    rx_mdl = re.compile(r'^"[^"]+","([a-z0-9.-]+[.][a-z]{2,})",')
    rx_b = re.compile(r'^([a-z0-9.-]+[.][a-z]{2,}),.+,[0-9: /-]+,')
    rx_trusted = re.compile(r'^([*a-z0-9.-]+)$')

    names = set()
    rx_set = [rx_u, rx_l, rx_h, rx_mdl, rx_b]
    if trusted:
        rx_set = [rx_trusted]
    for line in content.splitlines():
        line = str.lower(str.strip(line))
        if rx_comment.match(line):
            continue
        line = rx_inline_comment.sub('', line)
        for rx in rx_set:
            matches = rx.match(line)
            if not matches:
                continue
            name = matches.group(1)
            names.add(name)
    return names


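# Download a single source and return the set of names it contains. URLs with
# the "file:" scheme are considered trusted and parsed with the relaxed rules
# above.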
def list_from_url(url):
    sys.stderr.write("Loading data from [{}]\n".format(url))
    req = urllib2.Request(url)
    trusted = False
    if req.get_type() == "file":
        trusted = True
    response = None
    try:
        response = urllib2.urlopen(req, timeout=10)
    except urllib2.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
    if trusted is False and response.getcode() != 200:
        raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
    content = response.read()

    return parse_blacklist(content, trusted)


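# Sort key: reverse the labels of a name ("example.com" -> "com.example") so
# that related names end up next to each other in the output.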
def name_cmp(name):
    parts = name.split(".")
    parts.reverse()
    return str.join(".", parts)


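# Return True if any parent domain of `name` is already present in `names`.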
def has_suffix(names, name):
    parts = str.split(name, ".")
    while parts:
        parts = parts[1:]
        if str.join(".", parts) in names:
            return True

    return False


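# The whitelist is optional; an empty URL yields an empty set.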
def whitelist_from_url(url):
    if not url:
        return set()

    return list_from_url(url)


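# Read the configuration file (one source URL per line, "#" for comments),
# download every blacklist, then print the merged result with duplicates,
# subdomains of already-listed names, and whitelisted names removed.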
def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure):
    blacklists = {}
    all_names = set()
    unique_names = set()

    if whitelist and not re.match(r'^[a-z0-9]+:', whitelist):
        whitelist = "file:" + whitelist

    whitelisted_names = whitelist_from_url(whitelist)

    with open(file) as fd:
        for line in fd:
            line = str.strip(line)
            if str.startswith(line, "#") or line == "":
                continue
            url = line
            try:
                names = list_from_url(url)
                blacklists[url] = names
                all_names |= names
            except Exception as e:
                sys.stderr.write(str(e))
                if not ignore_retrieval_failure:
                    exit(1)

    for url, names in blacklists.items():
        print("\n\n########## Blacklist from {} ##########\n".format(url))
        ignored, whitelisted = 0, 0
        list_names = list()
        for name in names:
            if has_suffix(all_names, name) or name in unique_names:
                ignored = ignored + 1
            elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
                whitelisted = whitelisted + 1
            else:
                list_names.append(name)
                unique_names.add(name)

        list_names.sort(key=name_cmp)
        if ignored:
            print("# Ignored duplicates: {}\n".format(ignored))
        if whitelisted:
            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
        for name in list_names:
            print(name)


argp = argparse.ArgumentParser(description="Create a unified blacklist from a set of local and remote files")
argp.add_argument("-c", "--config", default="domains-blacklist.conf",
                  help="file containing blacklist sources")
argp.add_argument("-w", "--whitelist", default="domains-whitelist.txt",
                  help="file containing a set of names to exclude from the blacklist")
argp.add_argument("-i", "--ignore-retrieval-failure", action='store_true',
                  help="generate list even if some urls couldn't be retrieved")
args = argp.parse_args()

conf = args.config
whitelist = args.whitelist
ignore_retrieval_failure = args.ignore_retrieval_failure

blacklists_from_config_file(conf, whitelist, ignore_retrieval_failure)