Skip to content

Commit 35e32b8

Browse files
committed
Import the generate-domains-blacklists tool
1 parent 6ad53c7 commit 35e32b8

5 files changed

+391
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
2+
##################################################################################
3+
# #
4+
# Generate a black list of domains using public data sources, and the local #
5+
# domains-blacklist-local-additions.txt file. #
6+
# #
7+
# Comment out the URLs of the sources you want to disable, and run the script to #
8+
# build the dnscrypt-blacklist-domains.txt file: #
9+
# #
10+
# $ generate-domains-blacklist.py > dnscrypt-blacklist-domains.txt #
11+
# #
12+
# That blacklist file can then be used in the dnscrypt-proxy configuration: #
13+
# #
14+
# BlackList domains:/etc/dnscrypt-blacklist-domains.txt #
15+
# #
16+
##################################################################################
17+
18+
# Local additions
19+
file:domains-blacklist-local-additions.txt
20+
21+
# Bambenek malware C2s
22+
http://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt
23+
24+
# hpHosts’ Ad and tracking servers
25+
http://hosts-file.net/.%5Cad_servers.txt
26+
27+
# Malware domains
28+
http://mirror1.malwaredomains.com/files/justdomains
29+
30+
# Abuse.ch Ransomware Tracker
31+
http://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt
32+
33+
# Malware Domain List
34+
http://www.malwaredomainlist.com/mdlcsv.php?inactive=off
35+
36+
# Adblock Warning Removal List
37+
https://easylist-downloads.adblockplus.org/antiadblockfilters.txt
38+
39+
# EasyList
40+
https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt
41+
42+
# EasyList China
43+
https://easylist-downloads.adblockplus.org/easylistchina.txt
44+
45+
# Fanboy’s Social Blocking List
46+
https://easylist-downloads.adblockplus.org/fanboy-social.txt
47+
48+
# Peter Lowe’s Ad and tracking server list
49+
https://pgl.yoyo.org/adservers/serverlist.php
50+
51+
# Spam404
52+
https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt
53+
54+
# CJX Annoyance List
55+
https://raw.githubusercontent.com/cjx82630/cjxlist/master/cjxlist.txt
56+
57+
# EU: Prebake - Filter Obtrusive Cookie Notices
58+
https://raw.githubusercontent.com/liamja/Prebake/master/obtrusive.txt
59+
60+
# Malvertising filter list by Disconnect
61+
https://s3.amazonaws.com/lists.disconnect.me/simple_malvertising.txt
62+
63+
# Malware filter list by Disconnect
64+
https://s3.amazonaws.com/lists.disconnect.me/simple_malware.txt
65+
66+
# Basic tracking list by Disconnect
67+
https://s3.amazonaws.com/lists.disconnect.me/simple_tracking.txt
68+
69+
# Quidsup NoTrack
70+
https://raw.githubusercontent.com/quidsup/notrack/master/trackers.txt
71+
72+
# Sysctl list (ads)
73+
http://sysctl.org/cameleon/hosts
74+
75+
# KAD host file (fraud/adware) - https://github.com/azet12/KADhosts
76+
https://raw.githubusercontent.com/azet12/KADhosts/master/KADhosts.txt
77+
78+
# Fake news sites
79+
https://raw.githubusercontent.com/marktron/fakenews/master/fakenews
80+
81+
# Dynamic DNS services, sadly often used by malware
82+
http://mirror2.malwaredomains.com/files/dynamic_dns.txt
83+
84+
# Block pornography
85+
https://raw.githubusercontent.com/Clefspeare13/pornhosts/master/0.0.0.0/hosts
86+
https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts
87+
http://securemecca.com/Downloads/hosts.txt
88+
89+
# Block gambling sites
90+
https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/gambling-hosts
91+
92+
# Block social media sites
93+
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/social-hosts
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
2+
# Local set of patterns to block
3+
4+
ad.*
5+
ads.*
6+
banner.*
7+
banners.*
8+
creatives.*
9+
oas.*
10+
oascentral.*
11+
stats.*
12+
tag.*
13+
telemetry.*
14+
tracker.*
15+
16+
# My MacBook constantly sends a lot of useless queries to *.local,
17+
# so I block them. *.lan is apparently another common one, and
18+
# *.localdomain and *.workgroup are common on Windows.
19+
20+
*.lan
21+
*.local
22+
*.localdomain
23+
*.workgroup
24+
25+
# eth0.me is hardcoded in tools such as Archey, but is not available any
26+
# more, causing issues such as terminal sessions taking a long time to
27+
# start.
28+
29+
eth0.me
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
2+
##################################################################################
3+
# #
4+
# Generate a black list of domains using public data sources, and the local #
5+
# domains-blacklist-local-additions.txt file. #
6+
# #
7+
# The default configuration is just indicative, and corresponds to the one #
8+
# used to produce the public "mybase" set. #
9+
# #
10+
# Comment out the URLs of the sources you wish to disable, leave the ones #
11+
# you would like enabled uncommented. Then run the script to build the #
12+
# dnscrypt-blacklist-domains.txt file: #
13+
# #
14+
# $ generate-domains-blacklist.py > dnscrypt-blacklist-domains.txt #
15+
# #
16+
# Domains that should never be blocked can be put into a file named #
17+
# domains-whitelist.txt. #
18+
# #
19+
# That blacklist file can then be used in the dnscrypt-proxy configuration: #
20+
# #
21+
# BlackList domains:/etc/dnscrypt-blacklist-domains.txt #
22+
# #
23+
##################################################################################
24+
25+
# Local additions
26+
file:domains-blacklist-local-additions.txt
27+
28+
# Bambenek malware C2s
29+
http://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt
30+
31+
# hpHosts’ Ad and tracking servers
32+
http://hosts-file.net/.%5Cad_servers.txt
33+
34+
# Malware domains
35+
http://mirror1.malwaredomains.com/files/justdomains
36+
37+
# Abuse.ch Ransomware Tracker
38+
http://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt
39+
40+
# Malware Domain List
41+
http://www.malwaredomainlist.com/mdlcsv.php?inactive=off
42+
43+
# Adblock Warning Removal List
44+
https://easylist-downloads.adblockplus.org/antiadblockfilters.txt
45+
46+
# EasyList
47+
https://easylist-downloads.adblockplus.org/easylist_noelemhide.txt
48+
49+
# EasyList China
50+
https://easylist-downloads.adblockplus.org/easylistchina.txt
51+
52+
# Fanboy’s Social Blocking List
53+
https://easylist-downloads.adblockplus.org/fanboy-social.txt
54+
55+
# Peter Lowe’s Ad and tracking server list
56+
https://pgl.yoyo.org/adservers/serverlist.php
57+
58+
# Spam404
59+
https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt
60+
61+
# CJX Annoyance List
62+
https://raw.githubusercontent.com/cjx82630/cjxlist/master/cjxlist.txt
63+
64+
# EU: Prebake - Filter Obtrusive Cookie Notices
65+
https://raw.githubusercontent.com/liamja/Prebake/master/obtrusive.txt
66+
67+
# Malvertising filter list by Disconnect
68+
https://s3.amazonaws.com/lists.disconnect.me/simple_malvertising.txt
69+
70+
# Malware filter list by Disconnect
71+
https://s3.amazonaws.com/lists.disconnect.me/simple_malware.txt
72+
73+
# Basic tracking list by Disconnect
74+
https://s3.amazonaws.com/lists.disconnect.me/simple_tracking.txt
75+
76+
# Sysctl list (ads)
77+
http://sysctl.org/cameleon/hosts
78+
79+
# KAD host file (fraud/adware) - https://github.com/azet12/KADhosts
80+
https://raw.githubusercontent.com/azet12/KADhosts/master/KADhosts.txt
81+
82+
# BarbBlock list (spurious and invalid DMCA takedowns)
83+
https://ssl.bblck.me/blacklists/domain-list.txt
84+
85+
# Dan Pollock's hosts list
86+
http://someonewhocares.org/hosts/hosts
87+
88+
# Websites potentially publishing fake news
89+
# https://raw.githubusercontent.com/marktron/fakenews/master/fakenews
90+
91+
# Quidsup NoTrack - Contains too many false positives to be enabled by default
92+
# https://raw.githubusercontent.com/quidsup/notrack/master/trackers.txt
93+
94+
# Dynamic DNS services, sadly often used by malware
95+
# http://mirror2.malwaredomains.com/files/dynamic_dns.txt
96+
97+
# Block pornography
98+
# https://raw.githubusercontent.com/Clefspeare13/pornhosts/master/0.0.0.0/hosts
99+
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/pornography-hosts
100+
# http://securemecca.com/Downloads/hosts.txt
101+
102+
# Block gambling sites
103+
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/gambling-hosts
104+
105+
# Block social media sites
106+
# https://raw.githubusercontent.com/Sinfonietta/hostfiles/master/social-hosts
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
a-msedge.net
2+
amazon.com
3+
appsflyer.com
4+
azurewebsites.net
5+
cdnetworks.com
6+
cloudapp.net
7+
edgekey.net
8+
elasticbeanstalk.com
9+
invalid
10+
j.mp
11+
l-msedge.net
12+
lan
13+
localdomain
14+
microsoft.com
15+
msedge.net
16+
nsatc.net
17+
ovh.net
18+
pusher.com
19+
pusherapp.com
20+
spotify.com
21+
tagcommander.com
22+
tracker.debian.org
23+
windows.net
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#! /usr/bin/env python
2+
3+
# run with python generate-domains-blacklist.py > list.txt.tmp && mv -f list.txt.tmp list
4+
5+
import argparse
6+
import re
7+
import sys
8+
import urllib2
9+
10+
11+
def parse_blacklist(content, trusted=False):
    """Return the set of names found in the text of a blacklist.

    Each line of `content` is stripped and lower-cased. Lines that are
    empty or start with `#` are skipped, and a trailing ` # comment` is
    removed before matching.

    When `trusted` is true, a line is only tested against a single
    permissive pattern (letters, digits, `.`, `-` and `*`). Otherwise it
    is tested against the adblock-style (`||name^`), bare name, hosts-file
    (`address name`), quoted-CSV and comma-separated patterns. The first
    capture group of every pattern that matches is added to the result.
    """
    skip_line = re.compile(r'^(#|$)')
    trailing_comment = re.compile(r'\s*#\s*[a-z0-9-].*$')

    if trusted:
        patterns = (
            re.compile(r'^([*a-z0-9.-]+)$'),
        )
    else:
        patterns = (
            re.compile(r'^@*\|\|([a-z0-9.-]+[.][a-z]{2,})\^?(\$(popup|third-party))?$'),
            re.compile(r'^([a-z0-9.-]+[.][a-z]{2,})$'),
            re.compile(r'^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9.-]+[.][a-z]{2,})$'),
            re.compile(r'^"[^"]+","([a-z0-9.-]+[.][a-z]{2,})",'),
            re.compile(r'^([a-z0-9.-]+[.][a-z]{2,}),.+,[0-9: /-]+,'),
        )

    found = set()
    for raw in content.splitlines():
        entry = raw.strip().lower()
        if skip_line.match(entry):
            continue
        entry = trailing_comment.sub('', entry)
        for pattern in patterns:
            hit = pattern.match(entry)
            if hit:
                found.add(hit.group(1))
    return found
37+
38+
39+
def list_from_url(url):
    """Load `url` and return the set of names parsed from its content.

    A progress message is written to stderr before the request is made.
    Content loaded through the `file:` scheme is parsed in trusted mode;
    anything else is parsed in untrusted mode and must answer with HTTP
    status 200.

    Raises:
        Exception: if the URL cannot be opened, or if a non-file source
            returns a status code other than 200.
    """
    sys.stderr.write("Loading data from [{}]\n".format(url))
    req = urllib2.Request(url)
    trusted = req.get_type() == "file"

    try:
        response = urllib2.urlopen(req, timeout=10)
    except urllib2.URLError as err:
        raise Exception("[{}] could not be loaded: {}\n".format(url, err))

    # Always release the underlying socket/file handle, including when the
    # status check or the read fails.
    try:
        # The status code is only checked for non-file sources.
        if not trusted and response.getcode() != 200:
            raise Exception("[{}] returned HTTP code {}\n".format(url, response.getcode()))
        content = response.read()
    finally:
        response.close()

    return parse_blacklist(content, trusted)
55+
56+
57+
def name_cmp(name):
    """Return `name` with its dot-separated labels in reverse order.

    Used as a sort key so that names are grouped by their rightmost labels
    first (e.g. "www.example.com" becomes "com.example.www").
    """
    return ".".join(name.split(".")[::-1])
61+
62+
63+
def has_suffix(names, name):
    """Tell whether a proper suffix of `name` is a member of `names`.

    Leading labels are dropped one at a time ("a.b.c" -> "b.c" -> "c" -> "")
    and each remainder is looked up in `names`. The full name itself is
    never tested, but the empty remainder is.
    """
    labels = name.split(".")
    return any(
        ".".join(labels[start:]) in names
        for start in range(1, len(labels) + 1)
    )
71+
72+
73+
def whitelist_from_url(url):
    """Return the names loaded from `url`, or an empty set if `url` is falsy."""
    return list_from_url(url) if url else set()
78+
79+
80+
def blacklists_from_config_file(file, whitelist, ignore_retrieval_failure):
    """Build the unified blacklist and print it to standard output.

    Args:
        file: path of the configuration file. Every line that is neither
            empty nor starting with `#` is taken as the URL of a source.
        whitelist: location of the whitelist. When it has no `scheme:`
            prefix, `file:` is prepended. A falsy value disables the
            whitelist.
        ignore_retrieval_failure: when true, a source that fails to load
            is reported on stderr and skipped; otherwise the program exits
            with status 1 on the first failure.

    For each source that loaded, a header is printed followed by its names,
    sorted with `name_cmp`. A name is counted as a duplicate when one of
    its parent suffixes appears in any loaded list or when it has already
    been printed for another source. It is counted as whitelisted when it,
    or one of its parent suffixes, is in the whitelist. Non-zero counts are
    printed as comment lines ahead of the names.
    """
    blacklists = {}
    all_names = set()
    unique_names = set()

    if whitelist and not re.match(r'^[a-z0-9]+:', whitelist):
        whitelist = "file:" + whitelist

    whitelisted_names = whitelist_from_url(whitelist)

    with open(file) as fd:
        for line in fd:
            line = line.strip()
            if line.startswith("#") or line == "":
                continue
            url = line
            try:
                names = list_from_url(url)
                blacklists[url] = names
                all_names |= names
            except Exception as e:
                # str(e) instead of the deprecated e.message, which is
                # empty for multi-argument exceptions such as IOError.
                sys.stderr.write(str(e))
                if not ignore_retrieval_failure:
                    # sys.exit rather than the site-provided exit(), which
                    # is meant for interactive use and may be missing.
                    sys.exit(1)

    for url, names in blacklists.items():
        print("\n\n########## Blacklist from {} ##########\n".format(url))
        ignored, whitelisted = 0, 0
        list_names = []
        for name in names:
            if has_suffix(all_names, name) or name in unique_names:
                ignored += 1
            elif has_suffix(whitelisted_names, name) or name in whitelisted_names:
                whitelisted += 1
            else:
                list_names.append(name)
                unique_names.add(name)

        list_names.sort(key=name_cmp)
        if ignored:
            print("# Ignored duplicates: {}\n".format(ignored))
        if whitelisted:
            print("# Ignored entries due to the whitelist: {}\n".format(whitelisted))
        for name in list_names:
            print(name)
125+
126+
127+
# Command-line entry point: read the options, then print the merged blacklist.
argp = argparse.ArgumentParser(
    description="Create a unified blacklist from a set of local and remote files")
argp.add_argument(
    "-c", "--config",
    default="domains-blacklist.conf",
    help="file containing blacklist sources")
argp.add_argument(
    "-w", "--whitelist",
    default="domains-whitelist.txt",
    help="file containing a set of names to exclude from the blacklist")
argp.add_argument(
    "-i", "--ignore-retrieval-failure",
    action="store_true",
    help="generate list even if some urls couldn't be retrieved")
args = argp.parse_args()

conf, whitelist, ignore_retrieval_failure = (
    args.config,
    args.whitelist,
    args.ignore_retrieval_failure,
)

blacklists_from_config_file(conf, whitelist, ignore_retrieval_failure)

0 commit comments

Comments
 (0)