# web_scraping/parsear_contenido_sin_modulos.py

from email.message import Message

import urllib3

class GobeaSoup:
    """
    Fetch the HTML document behind a URL and keep it as text.

    The page is downloaded with an HTTP GET request when the instance is
    created and again whenever the URL is replaced through ``set_url``, so
    the stored "soup" always belongs to the stored URL. No HTML parsing is
    performed: the soup is simply the decoded response body as a string.
    """

    def __init__(self, url: str):
        """
        Initializes the GobeaSoup object and downloads the page.

        Args:
            url (str): The URL from which to fetch HTML content.
        """
        self._url = url
        self._soup = self._generate_soup()  # Eager fetch: may raise on network errors

    def _generate_soup(self) -> str:
        """
        Performs an HTTP GET request against the stored URL.

        Returns:
            str: The decoded body of the HTTP response.
        """
        # The context manager clears the pool on exit, so the connections
        # opened for this single request are not left behind.
        with urllib3.PoolManager() as http:
            response = http.request('GET', self.get_url())
            return self._decode_body(
                response.data,
                response.headers.get('Content-Type', ''),
            )

    @staticmethod
    def _decode_body(raw: bytes, content_type: str = '') -> str:
        """
        Decodes a response body using the charset announced by the server.

        Args:
            raw (bytes): The undecoded response body.
            content_type (str): Value of the ``Content-Type`` response
                header; may be empty when the server did not send one.

        Returns:
            str: The decoded text. UTF-8 is used when no charset is
            declared. If the declared charset is unknown or does not match
            the bytes, UTF-8 with replacement characters is used so that a
            mislabelled page still yields text instead of an exception.
        """
        charset = None
        if content_type:
            # Reuse the stdlib MIME header parser instead of hand-splitting
            # "text/html; charset=...".
            header = Message()
            header['Content-Type'] = content_type
            charset = header.get_content_charset()
        try:
            return raw.decode(charset or 'utf-8')
        except (LookupError, UnicodeDecodeError):
            return raw.decode('utf-8', errors='replace')

    def get_url(self) -> str:
        """
        Returns the stored URL.

        Returns:
            str: The URL of the GobeaSoup instance.
        """
        return self._url

    def get_soup(self) -> str:
        """
        Returns the fetched HTML content.

        Returns:
            str: The HTML content retrieved from the current URL.
        """
        return self._soup

    def set_url(self, url: str) -> None:
        """
        Updates the stored URL and re-downloads the page.

        The content is fetched again so that ``get_soup`` never returns the
        page of a previous URL. If the download fails, the previous URL and
        content are kept and the error is re-raised.

        Args:
            url (str): The new URL to set.
        """
        previous_url = self._url
        self._url = url  # _generate_soup reads the URL from the instance
        try:
            self._soup = self._generate_soup()
        except BaseException:
            # Roll back so URL and soup stay consistent with each other.
            self._url = previous_url
            raise

def main():
    """
    Demonstrates the GobeaSoup class.

    Downloads a sample page, strips carriage returns and tabs from the
    HTML text, splits it on newlines, and prints the resulting list.
    """
    page = GobeaSoup(url='https://lorem2.com/')
    html = page.get_soup()
    # Drop the whitespace characters that would clutter the printed lines.
    for unwanted in ("\r", "\t"):
        html = html.replace(unwanted, "")
    print(html.split('\n'))

if __name__ == '__main__':
    try:
        main()  # Run the demo only when executed as a script
    except KeyboardInterrupt:
        # Leave quietly on Ctrl-C. SystemExit is raised directly because
        # the exit() helper is injected by the site module for interactive
        # use and is not guaranteed to exist (e.g. with `python -S`).
        raise SystemExit