check_urls.py (forked from mattmakai/fullstackpython.com)
#!/usr/bin/env python
from concurrent import futures
import multiprocessing as mp
import os
import uuid

import requests
import urllib3

# Suppress urllib3's InsecureRequestWarning: certificate verification is
# skipped deliberately below, and the HTTPS endpoints are assumed trustworthy.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# A unique User-Agent per run makes the checker less likely to be rate limited.
_URL_BOT_ID = 'Bot {id}'.format(id=str(uuid.uuid4()))
URL_HEADERS = {'User-Agent': _URL_BOT_ID}
URL_TIMEOUT = 10.0

# Input/output files: the extracted URL list and the failure report.
IN_PATH = os.path.join(os.getcwd(), 'urlin.txt')
OUT_PATH = os.path.join(os.getcwd(), 'urlout.txt')

# Shell pipeline that collects candidate URLs from every file in the current
# tree (-print0/-0 keeps filenames with spaces intact). Inside an ERE bracket
# expression no escaping is needed, so the pattern is a plain raw string
# matching proto://host+path+params.
_URL_RE = r'https?://[=a-zA-Z0-9_/?&.-]+'
_FIND_URLS = "find . -type f -print0 | xargs -0 grep -hEo '{regex}'".format(
    regex=_URL_RE)
_FILTER_URLS = "sed '/Binary/d' | sort | uniq > {urlin}".format(urlin=IN_PATH)
COMMAND = '{find} | {filter}'.format(find=_FIND_URLS, filter=_FILTER_URLS)
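# For illustration only, the assembled pipeline resembles:
#   find . -type f -print0 | xargs -0 grep -hEo 'https?://...' \
#       | sed '/Binary/d' | sort | uniq > ./urlin.txt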


def run_workers(work, data, worker_threads=mp.cpu_count() * 4):
    """Map `work` over `data` on a thread pool, yielding results as they finish."""
    with futures.ThreadPoolExecutor(max_workers=worker_threads) as executor:
        future_to_result = {
            executor.submit(work, arg): arg for arg in data}
        for future in futures.as_completed(future_to_result):
            yield future.result()
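# A minimal usage sketch (the example URL is hypothetical):
#   for url, status in run_workers(get_url_status, ['https://example.com']):
#       print(url, status)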


def get_url_status(url):
    """Fetch `url` and return (url, status_code); 0 marks skipped local hosts."""
    for local in ('localhost', '127.0.0.1', 'app_server'):
        if url.startswith('http://' + local):
            return (url, 0)
    # Trim trailing punctuation that the grep pattern may have captured.
    clean_url = url.strip('?.')
    try:
        response = requests.get(
            clean_url, verify=False, timeout=URL_TIMEOUT,
            headers=URL_HEADERS)
        return (clean_url, response.status_code)
    except requests.exceptions.Timeout:
        return (clean_url, 504)
    except requests.exceptions.ConnectionError:
        return (clean_url, -1)


def bad_url(url_status):
    """Report connection errors and 4xx/5xx, but tolerate auth walls
    (401/403) and temporary outages (503)."""
    if url_status == -1:
        return True
    if url_status in (401, 403, 503):
        return False
    return url_status >= 400
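# A few concrete classifications this yields:
#   bad_url(-1) -> True, bad_url(404) -> True,
#   bad_url(403) -> False, bad_url(200) -> False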


def main():
    print('Extracting URLs...')
    os.system(COMMAND)
    with open(IN_PATH, 'r') as fr:
        urls = [line.strip('\n') for line in fr]
    with open(OUT_PATH, 'w') as fw:
        url_id = 1
        max_strlen = -1
        for url_path, url_status in run_workers(get_url_status, urls):
            output = 'Currently checking: id={uid} host={uhost}'.format(
                uid=url_id, uhost=urllib3.util.parse_url(url_path).host)
            # Pad to the longest width seen so shorter status lines fully
            # overwrite longer ones on the same terminal row.
            if max_strlen < len(output):
                max_strlen = len(output)
            print(output.ljust(max_strlen), end='\r')
            if bad_url(url_status):
                fw.write('{}: {}\n'.format(url_path, url_status))
            url_id += 1
    print('\nDone.')


if __name__ == '__main__':
    main()
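

# Usage sketch, assuming the script sits at the root of the tree to scan:
#   $ python check_urls.py
# The deduplicated URL list lands in urlin.txt and failing URLs in urlout.txt.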