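"""Helper functions for analyzing advertools crawls.

Includes tidy summaries of redirects, links, and images from a crawl
DataFrame, plus utilities for working with .jl (jsonlines) and parquet
crawl files.
"""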
import re

import advertools as adv
import pandas as pd
import pyarrow.parquet as pq


def redirect_summary(crawldf):
"""Create a tidy DataFrame for redirects with the columns:
url: All the URLs in the redirect chain.
status: The status code of each URL.
type: "requested", "inermediate", or "crawled".
order: 1, 2, 3... up to the number of urls in the redirect chain.
redirect_times: The number of redirects in the chain (URLs in the chain minus one).
Parameters:
-----------
crawldf : pandas.DataFrame
A DataFrame of an advertools crawl file
"""
if 'redirect_urls' not in crawldf.columns:
return pd.DataFrame()
    # Keep only the rows that actually redirected, and split the '@@'-joined
    # redirect URLs and reasons into lists.
    redirect_df = (crawldf[['url', 'status', 'redirect_urls', 'redirect_reasons']]
                   .dropna(subset=['redirect_urls', 'redirect_reasons']))
    redirect_df['redirect_urls'] = redirect_df['redirect_urls'].str.split('@@')
    redirect_df['redirect_reasons'] = redirect_df['redirect_reasons'].str.split('@@')
    # Complete each redirect chain by appending the final (crawled) URL and
    # its status code to the lists of intermediate URLs and reasons.
    for url, redirect_urls in redirect_df[['url', 'redirect_urls']].values:
        redirect_urls.append(url)
    for status, redirect_reasons in redirect_df[['status', 'redirect_reasons']].values:
        redirect_reasons.append(status)
redirect_df['order'] = [list(range(1, len(x)+1)) for x in redirect_df['redirect_reasons']]
redirect_df['type'] = [
['requested' if o == min(order) else 'crawled' if o == max(order) else 'intermediate'
for o in order]
for order in redirect_df['order']]
    # The original 'url' and 'status' columns are no longer needed; rename them
    # to placeholders and promote the chain columns to 'url' and 'status'.
    redirect_df.columns = ['NA1', 'NA2', 'url', 'status', 'order', 'type']
exploded = redirect_df[['url', 'status', 'order', 'type']].apply(pd.Series.explode)
final_df = pd.merge(exploded, crawldf[['download_latency', 'redirect_times']], left_index=True, right_index=True)
final_df['redirect_times'] = final_df['redirect_times'].astype(int)
return final_df
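
# Example usage (a minimal sketch; the crawl file name "output.jl" is an
# assumption for illustration):
#
#     crawldf = pd.read_json('output.jl', lines=True)
#     redirects = redirect_summary(crawldf)
#     redirects.head()
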
def link_summary(crawldf, internal_url_regex=None):
"""Get a DataFrame summary of links from a crawl DataFrame
Parameters:
-----------
crawldf : DataFrame
A DataFrame of a website crawled with advertools.
internal_url_regex : str
A regular expression for identifying if a link is internal or not.
For example if your website is example.com, this would be "example.com".
Returns:
--------
link_df : pandas.DataFrame
"""
    # Each links_* column stores multiple values joined by '@@'; split and
    # explode them so that each link gets its own row.
    link_df = pd.merge(
        crawldf[['url']],
        crawldf.filter(regex='^links_').apply(lambda s: s.str.split('@@').explode()),
        left_index=True, right_index=True)
link_df['links_nofollow'] = link_df['links_nofollow'].replace({
'True': True, 'False': False, pd.NA: False})
link_df.columns = ['url', 'link', 'text', 'nofollow']
    if internal_url_regex is not None:
        link_df['internal'] = link_df['link'].fillna('').str.contains(internal_url_regex, regex=True)
    return link_df
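
# Example usage (a minimal sketch; the crawl file name and the domain regex
# are assumptions for illustration):
#
#     crawldf = pd.read_json('output.jl', lines=True)
#     links = link_summary(crawldf, internal_url_regex='example\.com')
#     links['internal'].mean()  # share of internal links
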
def jl_to_parquet(jl_filepath, parquet_filepath):
"""Convert a jsonlines crawl file to the parquet format.
Parameters
----------
jl_filepath : str
The path of an existing .jl file.
parquet_fileapth : str
The pather where you want the new file to be saved.
"""
    status = 'not done'
    crawldf = pd.read_json(jl_filepath, lines=True)
    while status == 'not done':
        try:
            crawldf.to_parquet(parquet_filepath, index=False, version='2.6')
            status = 'done'
        except Exception as e:
            # pyarrow reports the offending column in the error message;
            # convert that column to string and retry until the write succeeds.
            error = e.args[-1]
            column = re.findall(r'column (\S+)', error)
            print(f'converting to string: {column[0]}')
            crawldf[column[0]] = crawldf[column[0]].astype(str).replace('nan', pd.NA)
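
# Example usage (a minimal sketch; both file names are assumptions):
#
#     jl_to_parquet('output.jl', 'output.parquet')
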
def parquet_columns(filepath):
"""Get column names and datatypes of a parquet file.
Parameters
----------
filepath : str
The path of the file that you want to get columns names and types.
Returns
-------
columns_types : pandas.DataFrame
A DataFrame with two columns "column" and "type".
"""
pqdataset = pq.ParquetDataset(filepath)
columns_df = pd.DataFrame(
zip(
pqdataset.schema.names,
pqdataset.schema.types),
columns=['column', 'type'])
return columns_df
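
# Example usage (a minimal sketch; the file name is an assumption):
#
#     parquet_columns('output.parquet')
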
def image_summary(crawldf):
"""Get a DataFrame summary of images in a crawl DataFrame.
Parameters
----------
crawldf : pandas.DataFrame
A crawl DataFrame as a result of the advertools.crawl function.
Returns
-------
img_summary : pandas.DataFrame
A DataFrame containing all available img tags mapped to their respective URLs
where each image data is represented in a row.
"""
dfs = []
img_df = crawldf.filter(regex='^url$|img_')
for index, row in img_df.iterrows():
notna = row.dropna().index
        if len(notna) == 1:
            # The page has no img data; keep a single row with just its URL.
            temp = pd.DataFrame({'url': row['url']}, index=[index])
        else:
            # Split the '@@'-joined img attributes and explode them so that
            # each image on the page gets its own row.
            temp = row.to_frame().T.set_index('url').apply(lambda s: s.str.split('@@')).explode(notna.tolist()[1:])
            temp = temp.reset_index()
temp.index = [index for i in range(len(temp))]
dfs.append(temp)
final_df = pd.concat(dfs)
return final_df
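
# Example usage (a minimal sketch; the crawl file name is an assumption):
#
#     crawldf = pd.read_json('output.jl', lines=True)
#     images = image_summary(crawldf)
#     images.head()
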
def jl_subset(filepath, columns):
    """Read only the given columns from a .jl (jsonlines) file, in chunks."""
    # Build a regex that matches any of the requested column names exactly.
    regex = '^' + '$|^'.join(columns) + '$'
    dfs = []
    for chunk in pd.read_json(filepath, lines=True, chunksize=1000):
        chunk_subset = chunk.filter(regex=regex)
        dfs.append(chunk_subset)
    final_df = pd.concat(dfs, ignore_index=True)
    return final_df
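
# Example usage (a minimal sketch; the file name and column list are
# assumptions for illustration):
#
#     df = jl_subset('output.jl', columns=['url', 'status', 'title'])
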