-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathimdb_reviews.py
More file actions
57 lines (50 loc) · 2.48 KB
/
imdb_reviews.py
File metadata and controls
57 lines (50 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
################# IMDB reviews extraction ########################
# This is a time-consuming process, because the program operates the web page
# in a real browser while extracting the reviews.
# Selenium for Python must be installed first:
#     pip install selenium
# The time library is used to make the extraction process sleep for a few
# seconds so that the reviews for the current page can be loaded.
# --- Dependencies --------------------------------------------------------
import time

from bs4 import BeautifulSoup as bs
from selenium import webdriver
# Selenium exceptions that the review-loading loop catches so that a missing
# or hidden "load more" button ends the loop instead of raising an error.
from selenium.common.exceptions import (
    ElementNotVisibleException,
    NoSuchElementException,
)

# --- Target page ---------------------------------------------------------
# URL of the page where the movie reviews reside. Alternative titles that can
# be swapped in by uncommenting one of the lines below:
#page = "http://www.imdb.com/title/tt0944947/reviews?ref_=tt_urv"
#page = "http://www.imdb.com/title/tt6294822/reviews?ref_=tt_urv"
#page = "http://www.imdb.com/title/tt2704998/reviews?ref_=tt_urv"
#page = "http://www.imdb.com/title/tt2873282/reviews?ref_=tt_urv"
## Moana Movie #####
page = "http://www.imdb.com/title/tt3521164/reviews?ref_=tt_urv"

# --- Browser session -----------------------------------------------------
browser = webdriver.Chrome()  # opens the Chrome browser
browser.get(page)             # navigate to the reviews page

# --- Initial state -------------------------------------------------------
reviews = []  # will hold the extracted review texts
i = 1         # initial (positive) value of the loop variable
# Keep clicking the "load more" button until it disappears, so that all the
# reviews are loaded into the browser before the page source is read.
#
# These two names are imported here so this step carries the extra Selenium
# dependencies it needs.
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.common.by import By

while True:
    try:
        # Locate the "load more" button by XPath and click it to load a few
        # more reviews. find_element(By.XPATH, ...) is used because the
        # find_element_by_* helpers are not available in current Selenium
        # releases.
        button = browser.find_element(By.XPATH, '//*[@id="load-more-trigger"]')
        button.click()
        # Sleep for a few seconds so the next batch of reviews can load.
        time.sleep(5)
    except (
        NoSuchElementException,
        ElementNotVisibleException,
        ElementNotInteractableException,
    ):
        # The button is gone, hidden, or cannot be clicked: nothing more to
        # load, so stop. ElementNotInteractableException is what W3C-compliant
        # drivers report for a hidden element.
        break
    except KeyboardInterrupt:
        # Ctrl+C stops the loading early; the reviews loaded so far stay in
        # the browser and can still be extracted afterwards.
        break
# Get the page source of the IMDB page after the reviews have been loaded,
# then close the browser so the Chrome/driver processes are not left running
# (quit() runs even if reading the page source fails).
try:
    ps = browser.page_source
finally:
    browser.quit()

# Convert the page source into a Beautiful Soup object.
soup = bs(ps, "html.parser")

# Extract the text of every <div> whose class attribute contains "text".
# attrs must be a dict ({"class": "text"}); the previous set literal
# {"class", "text"} was treated as a list of class names and therefore also
# matched class="class".
reviews = [tag.text for tag in soup.find_all("div", attrs={"class": "text"})]
##### If only the most recent reviews are needed, the loading loop can be
##### interrupted midway with Ctrl+C; the reviews loaded into the browser up
##### to that point are what the remaining steps extract and store.
# Put the extracted review texts into a one-column data frame ("reviews")
# and write it to movie_reviews.csv, encoded as UTF-8.
import pandas as pd

movie_reviews = pd.DataFrame({"reviews": reviews})
movie_reviews.to_csv("movie_reviews.csv", encoding="utf-8")