-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexpresiones_regulares.py
66 lines (56 loc) · 2.59 KB
/
expresiones_regulares.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
def parser_sub(path='web_scraping/web.html'):
    """
    Print the text of every single-line list item found in an HTML file.

    A line counts as a list item only when it contains both the literal
    opening tag '<li>' and the literal closing tag '</li>'. Every occurrence
    of those two tags is removed from such a line and the remainder is
    printed with leading/trailing whitespace stripped. Tags that carry
    attributes (for example '<li class="x">') are not recognised, and list
    items spread over several lines are skipped.

    Args:
        path: Location of the HTML file to read. Defaults to
            'web_scraping/web.html', resolved against the current working
            directory.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # One pattern covers both the opening and the closing tag; compiled once
    # so the loop does not go through the module-level pattern lookup per line.
    li_tag = re.compile(r'</?li>')

    # Explicit encoding: without it the decoding depends on the platform locale.
    with open(path, mode='r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front
        for line in file:
            if '<li>' in line and '</li>' in line:
                print(li_tag.sub('', line).strip())
def parser_findall(path='web_scraping/web.html'):
    """
    Print the '<html>' matches for every line that begins with that tag.

    Each line of the file is checked against the regex '^<html>'. When a line
    starts with '<html>' (no leading whitespace), the list returned by
    ``findall`` is printed, i.e. the output for such a line is
    ``['<html>']``. Lines that do not match print nothing.

    Args:
        path: Location of the HTML file to read. Defaults to
            'web_scraping/web.html', resolved against the current working
            directory.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Compiled once, outside the loop; '^' anchors at the start of each line
    # because the pattern is applied to one line at a time.
    html_at_start = re.compile(r'^<html>')

    # Explicit encoding: without it the decoding depends on the platform locale.
    with open(path, mode='r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front
        for line in file:
            matches = html_at_start.findall(line)
            if matches:
                print(matches)
def parser_search(path='web_scraping/web.html'):
    """
    Print every line of an HTML file that contains the text 'Lorem'.

    The search is case-sensitive and matches 'Lorem' anywhere in the line,
    including inside a longer word. Matching lines are printed with
    leading/trailing whitespace stripped.

    Args:
        path: Location of the HTML file to read. Defaults to
            'web_scraping/web.html', resolved against the current working
            directory.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The pattern never changes, so build it once rather than on each iteration
    word = re.compile(r'Lorem')

    # Explicit encoding: without it the decoding depends on the platform locale.
    with open(path, mode='r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front
        for line in file:
            if word.search(line):
                print(line.strip())
def main():
    """
    Entry point: run each of the three HTML parsers, one after another.

    The parsers are invoked without arguments in this order:
    parser_search, parser_findall, parser_sub.
    """
    # Word search first, then the <html> lookup, then list-item extraction
    for run_parser in (parser_search, parser_findall, parser_sub):
        run_parser()
# Run the parsers only when this file is executed directly, not when imported
if __name__ == "__main__":
    main()