-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparsear_contenido_sin_modulos.py
74 lines (60 loc) · 2.26 KB
/
parsear_contenido_sin_modulos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import urllib3
class GobeaSoup:
    """
    Fetch the HTML content of a URL and keep it as text.

    The page is downloaded once, when the instance is created, with an
    HTTP GET request issued through urllib3. The decoded body is stored
    and exposed through get_soup().
    """

    def __init__(self, url: str):
        """
        Initializes the GobeaSoup object and downloads the page.

        Args:
            url (str): The URL from which to fetch HTML content.
        """
        self._url = url
        # The request is performed eagerly, so constructing an instance
        # does network I/O.
        self._soup = self._generate_soup()

    @staticmethod
    def _decode_body(body: bytes, content_type: str = '') -> str:
        """
        Decodes a raw response body into text.

        The charset declared in the Content-Type header is used when it
        is present and known to Python. Otherwise UTF-8 is assumed, and
        undecodable bytes become U+FFFD instead of raising.

        Args:
            body (bytes): The raw response payload.
            content_type (str): The Content-Type header value, if any.

        Returns:
            str: The decoded text.
        """
        # Local import: only this helper needs the header parser.
        from email.message import Message

        header = Message()
        header['Content-Type'] = content_type or 'text/html'
        charset = header.get_content_charset() or 'utf-8'
        try:
            return body.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name, or the body does not match the declared
            # charset: fall back to a lossy but non-failing UTF-8 decode.
            return body.decode('utf-8', errors='replace')

    def _generate_soup(self) -> str:
        """
        Performs an HTTP GET request and retrieves the HTML content.

        Returns:
            str: The decoded body of the response for the stored URL.
        """
        http = urllib3.PoolManager()
        response = http.request('GET', self.get_url())
        return self._decode_body(
            response.data,
            response.headers.get('Content-Type', ''),
        )

    def get_url(self) -> str:
        """
        Returns the stored URL.

        Returns:
            str: The URL of the GobeaSoup instance.
        """
        return self._url

    def get_soup(self) -> str:
        """
        Returns the fetched HTML content.

        Returns:
            str: The HTML content retrieved when the instance was created.
        """
        return self._soup

    def set_url(self, url: str):
        """
        Updates the stored URL.

        Only the URL is replaced; the page is not fetched again, so
        get_soup() keeps returning the content downloaded at construction.

        Args:
            url (str): The new URL to set.
        """
        self._url = url
def main():
    """
    Demonstrates the GobeaSoup class.

    Downloads a sample page, removes carriage returns and tab characters
    from the text, and prints the remaining content as a list of lines.
    """
    fetcher = GobeaSoup(url='https://lorem2.com/')
    html = fetcher.get_soup()
    # Delete every '\r' and '\t' in one pass, then break on newlines.
    without_cr_tab = html.translate(str.maketrans('', '', '\r\t'))
    print(without_cr_tab.split('\n'))
if __name__ == '__main__':
    try:
        main()  # Run the demo only when executed as a script
    except KeyboardInterrupt:
        # Quit quietly on Ctrl-C with exit status 0. SystemExit is raised
        # directly because the exit() helper is installed by the site
        # module and is not guaranteed to exist (e.g. under `python -S`).
        raise SystemExit