# web_scraping/expresiones_regulares.py

import re

def parser_sub(path='web_scraping/web.html'):
    """
    Print the text of every single-line ``<li>...</li>`` item in an HTML file.

    A line is selected only when it contains both the literal ``<li>`` and
    ``</li>`` tags. Every occurrence of either tag is then removed with a
    regex substitution and the remainder is printed without leading or
    trailing whitespace. Opening tags that carry attributes (for example
    ``<li class="x">``) are not matched, and list items that span several
    lines are skipped.

    Args:
        path: Location of the HTML file to read. Defaults to
            'web_scraping/web.html', resolved against the current working
            directory.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # One pattern covers both the opening and the closing tag, so a single
    # substitution pass strips them; compiled once, outside the loop.
    li_tags = re.compile(r'</?li>')

    # Explicit encoding keeps the result independent of the platform locale.
    with open(path, mode='r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line in file:
            if '<li>' in line and '</li>' in line:
                print(li_tags.sub('', line).strip())

def parser_findall(path='web_scraping/web.html'):
    """
    Print the ``re.findall`` result for each line that begins with ``<html>``.

    The pattern ``^<html>`` is applied to every line separately. For a line
    that starts with ``<html>`` the list of matches (``['<html>']``) is
    printed -- the match list, not the line itself. Lines where the tag is
    preceded by anything, including whitespace, produce no output.

    Args:
        path: Location of the HTML file to read. Defaults to
            'web_scraping/web.html', resolved against the current working
            directory.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Compiled once, outside the loop; '^' anchors the tag to the line start.
    html_start = re.compile(r'^<html>')

    # Explicit encoding keeps the result independent of the platform locale.
    with open(path, mode='r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line in file:
            found = html_start.findall(line)
            if found:
                print(found)

def parser_search(path='web_scraping/web.html'):
    """
    Print every line of an HTML file that contains the word 'Lorem'.

    The search is case-sensitive and matches 'Lorem' anywhere in the line.
    Matching lines are printed without leading or trailing whitespace.

    Args:
        path: Location of the HTML file to read. Defaults to
            'web_scraping/web.html', resolved against the current working
            directory.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The pattern never changes, so build it once instead of on every line.
    lorem = re.compile(r'Lorem')

    # Explicit encoding keeps the result independent of the platform locale.
    with open(path, mode='r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line in file:
            if lorem.search(line):
                print(line.strip())

def main():
    """
    Entry point: run each HTML parser in turn.

    The parsers are invoked with no arguments, in this order:
    parser_search, parser_findall, parser_sub.
    """
    # Order matters for the printed output: 'Lorem' lines first, then the
    # <html> matches, then the list items.
    for run_parser in (parser_search, parser_findall, parser_sub):
        run_parser()

# Run the parsers only when this file is executed as a script; importing the
# module defines the functions without producing any output.
if __name__ == '__main__':
    main()