-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathutil.py
More file actions
71 lines (59 loc) · 1.83 KB
/
util.py
File metadata and controls
71 lines (59 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import httpx
import sys
from bs4 import BeautifulSoup
import os
import re
import pickle
def get_content(link):
    """Fetch ``link`` with an HTTP GET and return the response body as text.

    The status code is not inspected here, so the body of an error response
    is returned like any other. Exceptions raised by ``httpx.get``
    propagate to the caller.
    """
    return httpx.get(link).text
def write_content(*args):
    """Write one record to ``<directory>/<count>.html`` and return the new count.

    Positional layout, as read by the body:

    * ``args[0]``, ``args[1]`` -- strings written as the first two lines.
    * ``args[2]`` -- any value; written as ``str(args[2])`` on the third line.
    * ``args[4]`` -- the current counter; the file is named after counter + 1.
    * ``args[-2]`` -- the target directory (this is ``args[3]`` when exactly
      five arguments are passed).

    Returns:
        The incremented counter, i.e. the number used in the file name.

    Raises:
        IndexError: if fewer than five arguments are supplied.
        OSError: if the file cannot be opened or written.
    """
    # TODO(review): the meaning of each position is only implied by usage;
    # consider replacing *args with named parameters.
    count = args[4] + 1
    path = args[-2] + "/" + str(count) + ".html"
    # Build the payload before opening the file so that a bad argument does
    # not leave an empty file behind.
    data = args[0] + "\n" + args[1] + "\n" + str(args[2])
    # The context manager closes the handle even if write() raises.
    with open(path, "w") as out:
        out.write(data)
    return count
def find_links(content, URLList):
    """Append every not-yet-seen ``href`` found in ``content`` to ``URLList``.

    ``content`` is parsed as HTML and each ``<a>`` element is inspected.
    ``URLList`` is mutated in place; nothing is returned.

    The duplicate check compares the ``href`` string (not the tag object)
    against what is already in the list, so a URL is appended at most once.
    Anchors without an ``href`` attribute are skipped rather than adding
    ``None`` to the list.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # Set mirror of the list for O(1) membership tests while appending.
    seen = set(URLList)
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None or href in seen:
            continue
        seen.add(href)
        URLList.append(href)
def store_to_file(file, map):
    """Serialize ``map`` with pickle and write it to the path ``file``.

    Any existing file at that path is overwritten.

    Raises:
        OSError: if the file cannot be opened or written.
        pickle.PicklingError: if ``map`` contains an unpicklable object.
    """
    # Pickle output is bytes, so the file must be opened in binary mode;
    # writing it to a text-mode handle raises TypeError on Python 3.
    with open(file, 'wb') as f:
        pickle.dump(map, f)
def read_from_file(file):
    """Load and return the pickled object stored at the path ``file``.

    Raises:
        OSError: if the file cannot be opened.
        pickle.UnpicklingError / EOFError: if the content is not a valid pickle.
    """
    # NOTE(security): unpickling can execute arbitrary code; only use this
    # on files this program wrote itself, never on untrusted input.
    # pickle.load needs a binary handle, and "with" makes sure it is closed.
    with open(file, 'rb') as f:
        return pickle.load(f)
def find_occurance(word, words):
    """Return how many times ``word`` occurs in ``words``.

    The counting is delegated to ``words.count``, so ``words`` may be any
    object providing that method.
    """
    hits = words.count(word)
    return hits
# Build the word index: for each word, its total count and its count per file
def store_to_ds(words, file, map):
    """Merge the word frequencies of one file into the index ``map``.

    For every distinct word in ``words`` the entry ``map[word]`` is a dict
    holding:

    * ``'tot_occur'`` -- running total of the word's occurrences, increased
      by this call's count;
    * ``file`` -- the number of occurrences in ``words`` for this call.

    ``map`` is mutated in place; nothing is returned. An empty ``words``
    leaves ``map`` unchanged.
    """
    from collections import Counter

    # One pass over ``words`` instead of calling ``count`` once per distinct
    # word, which was quadratic in the number of words.
    for word, occurance in Counter(words).items():
        if word in map:
            entry = map[word]
            entry['tot_occur'] = entry['tot_occur'] + occurance
            entry[file] = occurance
        else:
            map[word] = {}
            map[word]['tot_occur'] = occurance
            map[word][file] = occurance
# parsing data into list of words
def parse_data_into_words(data):
    """Split ``data`` into a list of whitespace-separated words.

    Runs of spaces, tabs and line breaks all act as a single separator, and
    leading/trailing whitespace is ignored, so blank lines contribute
    nothing. An empty or whitespace-only string yields ``[]``.
    """
    # str.split() with no argument already splits on any whitespace run
    # (newlines included) and never yields empty strings, which is exactly
    # what the earlier line-by-line strip/split loop produced.
    return data.split()
#Remove html tags from a string
def get_data_without_tags(file):
    """Return the string ``file`` with every ``<...>`` span removed.

    Matching is non-greedy and ``.`` does not match a newline, so a tag
    whose ``<`` and ``>`` sit on different lines is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', file)