File tree 2 files changed +68
-0
lines changed
2 files changed +68
-0
lines changed Original file line number Diff line number Diff line change
1
+ from bs4 import BeautifulSoup
2
+ import requests
3
+ import csv
4
+
5
# Scrape the article listing on coreyms.com and save each article's headline,
# summary and YouTube link (when one is embedded) to a CSV file.

SITE_URL = 'http://coreyms.com'
OUTPUT_PATH = 'cms_scrape.csv'
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without one


def youtube_watch_url(embed_src):
    """Convert a YouTube embed URL into a regular watch URL.

    The video id is taken from the fifth '/'-separated segment of the URL
    (``https://www.youtube.com/embed/<id>?params``) with any query string
    removed.

    Args:
        embed_src: The ``src`` attribute of a YouTube player iframe.

    Returns:
        ``'https://youtube.com/watch?v=<id>'``, or ``None`` when the URL has
        no such segment or the id is empty.
    """
    segments = embed_src.split('/')
    if len(segments) < 5:
        return None

    video_id = segments[4].split('?')[0]
    if not video_id:
        return None

    return f'https://youtube.com/watch?v={video_id}'


def article_video_link(article):
    """Return the watch URL for the video embedded in *article*, if any.

    Args:
        article: A parsed ``<article>`` element.

    Returns:
        The watch URL string, or ``None`` when the article has no
        ``iframe.youtube-player`` or the iframe has no usable ``src``.
    """
    iframe = article.find('iframe', class_='youtube-player')
    if iframe is None:
        return None

    try:
        embed_src = iframe['src']
    except KeyError:
        return None

    return youtube_watch_url(embed_src)


def main():
    """Download the page, print each article's details and write the CSV."""
    response = requests.get(SITE_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of scraping an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')

    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows); the context manager closes the file even if a row fails.
    with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['headline', 'summary', 'video_link'])

        for article in soup.find_all('article'):
            headline = article.h2.a.text
            print(headline)

            summary = article.find('div', class_='entry-content').p.text
            print(summary)

            yt_link = article_video_link(article)
            print(yt_link)

            print()

            csv_writer.writerow([headline, summary, yt_link])


if __name__ == '__main__':
    main()
Original file line number Diff line number Diff line change
1
<!doctype html>
<!--
    Sample page: a site title, two article entries and a footer.
    Each article is a div.article holding a linked h2 headline and a
    one-paragraph summary.
-->
<html class="no-js" lang="en">
    <head>
        <!-- Declare the encoding before any other head content. -->
        <meta charset="utf-8">
        <title>Test - A Sample Website</title>
        <link rel="stylesheet" href="css/normalize.css">
        <link rel="stylesheet" href="css/main.css">
    </head>
    <body>
        <h1 id="site_title">Test Website</h1>
        <!-- hr is a void element: it takes no closing tag. -->
        <hr>
        <div class="article">
            <h2><a href="article_1.html">Article 1 Headline</a></h2>
            <p>This is a summary of article 1</p>
        </div>
        <hr>
        <div class="article">
            <h2><a href="article_2.html">Article 2 Headline</a></h2>
            <p>This is a summary of article 2</p>
        </div>
        <hr>

        <div class="footer">
            <p>Footer Information</p>
        </div>

        <script src="js/vendor/modernizr-3.5.0.min.js"></script>
        <script src="js/plugins.js"></script>
        <script src="js/main.js"></script>
    </body>
</html>
You can’t perform that action at this time.
0 commit comments