requests-beautifulsoup-scraper.py
import requests
from bs4 import BeautifulSoup
import csv

# URL of the page to scrape
url = "https://en.wikipedia.org/wiki/Web_scraping"

# Send a GET request to the URL and get the response
response = requests.get(url)

# Get the HTML content of the page
html = response.text

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

# List where to store the scraped titles
titles = []

# List of header levels (h1, h2, h3, h4, h5)
title_level_list = [1, 2, 3, 4, 5]

# Loop through each header level (h1, h2, h3, h4, h5)
for title_level in title_level_list:
    # Find all elements of the current header level
    title_elements = soup.find_all(f"h{title_level}")

    # Loop through each title element found
    for title_element in title_elements:
        # Data extraction logic
        tag = title_element.name
        text = title_element.text

        # Create a dictionary with the tag and the title text
        title = {
            "tag": tag,
            "title": text,
        }

        # Append the dictionary to the titles list
        titles.append(title)

# Open a CSV file to write the data
with open("titles.csv", mode="w", newline="", encoding="utf-8") as file:
    # Create a CSV writer object and specify the fieldnames (columns)
    writer = csv.DictWriter(file, fieldnames=["tag", "title"])

    # Write the header (column names) to the CSV file
    writer.writeheader()

    # Write each row (dictionary) to the CSV file
    for row in titles:
        writer.writerow(row)
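
# Optional sanity check (a minimal sketch, not required by the scraper itself):
# re-open "titles.csv" with the standard csv module and confirm that the rows
# written above can be read back as dictionaries with "tag" and "title" keys.
with open("titles.csv", mode="r", newline="", encoding="utf-8") as file:
    reader = csv.DictReader(file)
    rows = list(reader)
    # Print how many headings were saved and show the first one, if any
    print(f"Saved {len(rows)} headings")
    if rows:
        print("First row:", rows[0])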