forked from DNSCrypt/dnscrypt-proxy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate-domains-blacklist.py
executable file
·140 lines (113 loc) · 4.45 KB
/
generate-domains-blacklist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#! /usr/bin/env python
# run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
import argparse
import re
import sys
import urllib2
def parse_blacklist(content, trusted=False):
    """Extract the set of names listed in the text of a block list.

    Every line is stripped and lower-cased.  Empty lines and lines starting
    with ``#`` are skipped, and a trailing ``# comment`` is removed before
    the line is matched.

    content -- full text of the list, as a single string.
    trusted -- when true, a line is accepted only if it consists solely of
               the characters ``[*a-z0-9.-]`` (so wildcards and single
               labels are allowed); otherwise the line has to match one of
               the recognised third-party list formats.

    Returns a set with the first capture group of every matching line.
    """
    blank_or_comment = re.compile(r'^(#|$)')
    trailing_comment = re.compile(r'\s*#\s*[a-z0-9-].*$')

    if trusted:
        patterns = (
            re.compile(r'^([*a-z0-9.-]+)$'),
        )
    else:
        patterns = (
            # "||domain^" rules, optionally prefixed with "@" and suffixed
            # with "$popup" or "$third-party".
            re.compile(r'^@*\|\|([a-z0-9.-]+[.][a-z]{2,})\^?(\$(popup|third-party))?$'),
            # A bare domain name on its own line.
            re.compile(r'^([a-z0-9.-]+[.][a-z]{2,})$'),
            # Dotted-quad address, whitespace, then the domain name.
            re.compile(r'^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9.-]+[.][a-z]{2,})$'),
            # Quoted comma-separated fields; the domain is the second field.
            re.compile(r'^"[^"]+","([a-z0-9.-]+[.][a-z]{2,})",'),
            # Comma-separated fields; the domain is the first field and a
            # later field contains only digits and date/time punctuation.
            re.compile(r'^([a-z0-9.-]+[.][a-z]{2,}),.+,[0-9: /-]+,'),
        )

    found = set()
    for raw in content.splitlines():
        entry = str.strip(raw).lower()
        if blank_or_comment.match(entry) is not None:
            continue
        entry = trailing_comment.sub('', entry)
        for pattern in patterns:
            hit = pattern.match(entry)
            if hit is not None:
                found.add(hit.group(1))
    return found
def list_from_url(url):
    """Download a list from `url` and return the set of names it contains.

    A progress line is written to stderr before the request is made.
    ``file:`` URLs are considered trusted: the HTTP status check is skipped
    for them and their content is parsed in trusted mode.

    url -- URL of the list to load.

    Returns the set produced by parse_blacklist().
    Raises Exception (with a newline-terminated message) if the URL cannot
    be opened or if a non-file source answers with a status other than 200.
    """
    sys.stderr.write("Loading data from [{}]\n".format(url))
    req = urllib2.Request(url)
    trusted = req.get_type() == "file"

    try:
        response = urllib2.urlopen(req, timeout=10)
    except urllib2.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))

    # Always release the underlying file/socket, including when the status
    # check below raises or the read fails.
    try:
        if not trusted and response.getcode() != 200:
            raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
        content = response.read()
    finally:
        response.close()

    return parse_blacklist(content, trusted)
def name_cmp(name):
    """Return `name` with its dot-separated labels in reverse order.

    Used as a sort key so that names sharing a suffix end up next to each
    other, e.g. "www.example.com" becomes "com.example.www".
    """
    return ".".join(name.split(".")[::-1])
def has_suffix(names, name):
    """Tell whether a proper parent suffix of `name` is present in `names`.

    Leading labels are dropped one at a time and each remaining suffix is
    looked up in `names`.  `name` itself is never tested.  The last suffix
    tested is the empty string, so an empty entry in `names` matches
    every name.

    names -- container of names supporting the ``in`` operator.
    name  -- dot-separated name to test.
    """
    labels = str.split(name, ".")
    return any(
        ".".join(labels[start:]) in names
        for start in range(1, len(labels) + 1)
    )
def whitelist_from_url(url):
    """Load the whitelist found at `url`.

    Returns an empty set when `url` is empty or None; otherwise returns
    whatever list_from_url() produces for it.
    """
    return list_from_url(url) if url else set()
def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure):
    """Print a unified block list built from the sources listed in `file`.

    file      -- path of the configuration file: one source URL per line;
                 empty lines and lines starting with "#" are skipped.
    whitelist -- URL or path of the whitelist.  A value without a
                 "scheme:" prefix is turned into a "file:" URL.  May be
                 empty, in which case nothing is whitelisted.
    ignore_retrieval_failure -- when false, the program exits with status 1
                 as soon as one source cannot be loaded; when true, the
                 failure is reported on stderr and the source is skipped.

    For every source that was loaded, a header and then the sorted names
    are printed to stdout.  A name is left out when one of its parent
    suffixes appears in any loaded list, when it was already printed for
    another source, or when it (or one of its parent suffixes) is
    whitelisted.

    Errors raised while loading the whitelist itself are not caught here.
    """
    blacklists = {}
    all_names = set()
    unique_names = set()

    # A bare path (no "scheme:" prefix) is loaded as a local file.
    if whitelist and not re.match(r'^[a-z0-9]+:', whitelist):
        whitelist = "file:" + whitelist

    whitelisted_names = whitelist_from_url(whitelist)

    with open(file) as fd:
        for line in fd:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            url = line
            try:
                names = list_from_url(url)
            except Exception as e:
                # str(e) instead of the deprecated e.message, which is empty
                # for many exception types and absent on Python 3.
                message = str(e)
                if not message.endswith("\n"):
                    message += "\n"
                sys.stderr.write(message)
                if not ignore_retrieval_failure:
                    # sys.exit rather than the site-provided exit() helper,
                    # which is not guaranteed to be available.
                    sys.exit(1)
                continue
            blacklists[url] = names
            all_names |= names

    for url, names in blacklists.items():
        print("\n\n########## Blacklist from {} ##########\n".format(url))
        ignored, whitelisted = 0, 0
        list_names = []
        for name in names:
            if has_suffix(all_names, name) or name in unique_names:
                ignored += 1
            elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
                whitelisted += 1
            else:
                list_names.append(name)
                unique_names.add(name)

        # Sort on the reversed labels so related names are grouped together.
        list_names.sort(key=name_cmp)
        if ignored:
            print("# Ignored duplicates: {}\n".format(ignored))
        if whitelisted:
            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
        for name in list_names:
            print(name)
# Command-line entry point: read the options, then print the unified list.
argp = argparse.ArgumentParser(
    description="Create a unified blacklist from a set of local and remote files"
)
argp.add_argument(
    "-c", "--config",
    default="domains-blacklist.conf",
    help="file containing blacklist sources"
)
argp.add_argument(
    "-w", "--whitelist",
    default="domains-whitelist.txt",
    help="file containing a set of names to exclude from the blacklist"
)
argp.add_argument(
    "-i", "--ignore-retrieval-failure",
    action="store_true",
    help="generate list even if some urls couldn't be retrieved"
)

args = argp.parse_args()
conf, whitelist, ignore_retrieval_failure = (
    args.config,
    args.whitelist,
    args.ignore_retrieval_failure,
)

blacklists_from_config_file(conf, whitelist, ignore_retrieval_failure)