Skip to content

Commit fc5e8dd

Browse files
committed
BeautifulSoup Tutorial
1 parent aff3362 commit fc5e8dd

File tree

2 files changed

+68
-0
lines changed

2 files changed

+68
-0
lines changed

BeautifulSoup/scrape.py

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from bs4 import BeautifulSoup
2+
import requests
3+
import csv
4+
5+
# Scrape every <article> on coreyms.com and record its headline, summary
# and embedded YouTube link (when present) in cms_scrape.csv.

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() avoids silently scraping an HTTP error page.
response = requests.get('http://coreyms.com', timeout=10)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')

# newline='' is required by the csv module so that it controls line endings
# itself (otherwise blank rows appear on Windows). The with-block guarantees
# the file is closed even if parsing one of the articles raises.
with open('cms_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'video_link'])

    for article in soup.find_all('article'):
        headline = article.h2.a.text
        print(headline)

        summary = article.find('div', class_='entry-content').p.text
        print(summary)

        try:
            vid_src = article.find('iframe', class_='youtube-player')['src']

            # The video id is taken as the fifth '/'-separated piece of the
            # src URL with any query string removed (presumably an embed URL
            # of the form https://www.youtube.com/embed/<id>?... -- confirm).
            vid_id = vid_src.split('/')[4]
            vid_id = vid_id.split('?')[0]

            yt_link = f'https://youtube.com/watch?v={vid_id}'
        except (TypeError, KeyError, IndexError):
            # TypeError: no matching iframe (find() returned None);
            # KeyError: iframe without a src attribute;
            # IndexError: src URL has fewer path segments than expected.
            # In all of these cases the article simply has no video link.
            yt_link = None

        print(yt_link)

        print()

        csv_writer.writerow([headline, summary, yt_link])

BeautifulSoup/simple.html

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
<!doctype html>
<html class="no-js" lang="en">
<!-- Minimal sample page: a site title, two articles and a footer. -->
<head>
    <title>Test - A Sample Website</title>
    <meta charset="utf-8">
    <link rel="stylesheet" href="css/normalize.css">
    <link rel="stylesheet" href="css/main.css">
</head>
<body>
    <h1 id='site_title'>Test Website</h1>
    <!-- <hr> is a void element: it takes no closing tag. -->
    <hr>
    <div class="article">
        <h2><a href="article_1.html">Article 1 Headline</a></h2>
        <p>This is a summary of article 1</p>
    </div>
    <hr>
    <div class="article">
        <h2><a href="article_2.html">Article 2 Headline</a></h2>
        <p>This is a summary of article 2</p>
    </div>
    <hr>

    <div class='footer'>
        <p>Footer Information</p>
    </div>

    <script src="js/vendor/modernizr-3.5.0.min.js"></script>
    <script src="js/plugins.js"></script>
    <script src="js/main.js"></script>
</body>
</html>

0 commit comments

Comments
 (0)