File tree 4 files changed +21
-1
lines changed
4 files changed +21
-1
lines changed Original file line number Diff line number Diff line change 12
12
SPIDER_MODULES = ['quotes.spiders' ]
13
13
NEWSPIDER_MODULE = 'quotes.spiders'
14
14
15
+ # PROXY_POOL_ENABLED = True
15
16
16
17
# Crawl responsibly by identifying yourself (and your website) on the user-agent
17
18
#USER_AGENT = 'quotes (+http://www.yourdomain.com)'
19
+ DOWNLOADER_MIDDLEWARES = {
20
+ #The below two lines are for user agents
21
+ 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware' : None ,
22
+ 'scrapy_user_agents.middlewares.RandomUserAgentMiddleware' : 400 ,
23
+ # Enable the below line to use proxies
24
+ # 'scrapy_proxy_pool.middlewares.ProxyPoolMiddleware': 610,
25
+ # 'scrapy_proxy_pool.middlewares.BanDetectionMiddleware': 620,
26
+ }
18
27
19
28
# Obey robots.txt rules
20
29
ROBOTSTXT_OBEY = True
Original file line number Diff line number Diff line change @@ -10,7 +10,7 @@ class QuotesScraper(scrapy.Spider):
10
10
11
11
def _parse (self , response , ** kwargs ):
12
12
item = QuotesItem ()
13
- for quote in response .css (".quote" ):
13
+ for quote in response .css (".quote" )[: 2 ] :
14
14
title = quote .css (".quoteText::text" ).extract_first ()
15
15
author = quote .css (".authorOrTitle::text" ).extract_first ()
16
16
item ["title" ] = title
Original file line number Diff line number Diff line change
1
+ ## **QuotesScrapy**
2
+ This scraper is based on the Scrapy framework and supports pagination. It rotates fake user agents to bypass basic bot detection.
3
+
4
+ Steps to run the project:
5
+ 1. Activate the virtual environment with `. env/bin/activate`
6
+ 2. Install the requirements with `pip install -r requirements.txt`
7
+ 3. Run the following command:
8
+ <br>`scrapy crawl QuotesScraper`
Original file line number Diff line number Diff line change
1
+ Scrapy == 2.4.0
2
+ scrapy-proxy-pool == 0.1.9
3
+ scrapy-user-agents == 0.1.1
You can’t perform that action at this time.
0 commit comments