Skip to content

Commit ca2f7e3

Browse files
Settings
1 parent 2b6b225 commit ca2f7e3

File tree

3 files changed

+101
-186
lines changed

3 files changed

+101
-186
lines changed

src/board_game_scraper/middlewares.py

-133
This file was deleted.

src/board_game_scraper/pipelines.py

-14
This file was deleted.

src/board_game_scraper/settings.py

+101-39
Original file line number | Diff line number | Diff line change
@@ -1,92 +1,154 @@
1-
# Scrapy settings for board_game_scraper project
2-
#
3-
# For simplicity, this file contains only settings considered important or
4-
# commonly used. You can find more settings consulting the documentation:
5-
#
6-
# https://docs.scrapy.org/en/latest/topics/settings.html
7-
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8-
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
1+
import os
2+
from pathlib import Path
93

10-
BOT_NAME = "board_game_scraper"
4+
BOT_NAME = "board-game-scraper"
115

126
SPIDER_MODULES = ["board_game_scraper.spiders"]
137
NEWSPIDER_MODULE = "board_game_scraper.spiders"
148

9+
LOG_LEVEL = os.getenv("LOG_LEVEL") or "INFO"
10+
LOG_FORMATTER = "scrapy_extensions.QuietLogFormatter"
11+
LOG_SCRAPED_ITEMS = os.getenv("LOG_SCRAPED_ITEMS")
12+
13+
BASE_DIR = Path(__file__).resolve().parent.parent.parent
1514

1615
# Crawl responsibly by identifying yourself (and your website) on the user-agent
17-
USER_AGENT = "board_game_scraper (+https://recommend.games)"
16+
USER_AGENT = "board-game-scraper (+https://recommend.games)"
1817

1918
# Obey robots.txt rules
2019
ROBOTSTXT_OBEY = True
2120
ROBOTSTXT_PARSER = "scrapy.robotstxt.PythonRobotParser"
2221

2322
# Configure maximum concurrent requests performed by Scrapy (default: 16)
24-
# CONCURRENT_REQUESTS = 32
23+
CONCURRENT_REQUESTS = 8
2524

2625
# Configure a delay for requests for the same website (default: 0)
2726
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
2827
# See also autothrottle settings and docs
29-
# DOWNLOAD_DELAY = 3
28+
DOWNLOAD_DELAY = 3
3029
# The download delay setting will honor only one of:
31-
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
30+
CONCURRENT_REQUESTS_PER_DOMAIN = 8
3231
# CONCURRENT_REQUESTS_PER_IP = 16
3332

3433
# Enable or disable cookies (enabled by default)
35-
# COOKIES_ENABLED = False
34+
COOKIES_ENABLED = True
3635

3736
# Enable or disable the Telnet Console (enabled by default)
38-
# TELNETCONSOLE_ENABLED = False
37+
TELNETCONSOLE_ENABLED = True
3938

4039
# Override the default request headers:
41-
# DEFAULT_REQUEST_HEADERS = {
42-
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
43-
# "Accept-Language": "en",
44-
# }
40+
DEFAULT_REQUEST_HEADERS = {
41+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
42+
"Accept-Language": "en",
43+
}
4544

4645
# Enable or disable spider middlewares
4746
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48-
# SPIDER_MIDDLEWARES = {
49-
# "board_game_scraper.middlewares.BoardGameScraperSpiderMiddleware": 543,
50-
# }
47+
SPIDER_MIDDLEWARES = {
48+
"scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
49+
"scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
50+
"scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
51+
"scrapy.spidermiddlewares.depth.DepthMiddleware": 900,
52+
}
5153

5254
# Enable or disable downloader middlewares
5355
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54-
# DOWNLOADER_MIDDLEWARES = {
55-
# "board_game_scraper.middlewares.BoardGameScraperDownloaderMiddleware": 543,
56-
# }
56+
DOWNLOADER_MIDDLEWARES = {
57+
"scrapy.downloadermiddlewares.offsite.OffsiteMiddleware": 50,
58+
"scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
59+
"scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
60+
"scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
61+
"scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
62+
"scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
63+
"scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
64+
"scrapy_extensions.DelayedRetryMiddleware": 555,
65+
"scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
66+
"scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
67+
"scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
68+
"scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
69+
"scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
70+
"scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
71+
"scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
72+
"scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900,
73+
}
5774

5875
# Enable or disable extensions
5976
# See https://docs.scrapy.org/en/latest/topics/extensions.html
60-
# EXTENSIONS = {
61-
# "scrapy.extensions.telnet.TelnetConsole": None,
62-
# }
77+
EXTENSIONS = {
78+
"scrapy.extensions.corestats.CoreStats": 0,
79+
"scrapy.extensions.telnet.TelnetConsole": 0,
80+
"scrapy.extensions.memusage.MemoryUsage": 0,
81+
"scrapy.extensions.memdebug.MemoryDebugger": 0,
82+
"scrapy.extensions.closespider.CloseSpider": 0,
83+
"scrapy.extensions.feedexport.FeedExporter": 0,
84+
"scrapy.extensions.logstats.LogStats": 0,
85+
"scrapy.extensions.spiderstate.SpiderState": 0,
86+
"scrapy.extensions.throttle.AutoThrottle": None,
87+
"scrapy_extensions.NicerAutoThrottle": 0,
88+
}
6389

6490
# Configure item pipelines
6591
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
66-
# ITEM_PIPELINES = {
67-
# "board_game_scraper.pipelines.BoardGameScraperPipeline": 300,
68-
# }
92+
ITEM_PIPELINES = {
93+
"scrapy.pipelines.images.ImagesPipeline": 600,
94+
"scrapy_extensions.BlurHashPipeline": 700,
95+
}
96+
97+
# See https://docs.scrapy.org/en/latest/topics/extensions.html#module-scrapy.extensions.closespider
98+
# Read the spider timeout from the environment. An unset OR empty variable
# (e.g. `CLOSESPIDER_TIMEOUT=` in a .env file) is normalised to None so the
# setting is treated as "not configured" rather than handing an empty string
# to Scrapy's numeric settings parsing.
CLOSESPIDER_TIMEOUT = os.getenv("CLOSESPIDER_TIMEOUT") or None
6999

70100
# Enable and configure the AutoThrottle extension (disabled by default)
71101
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
72-
# AUTOTHROTTLE_ENABLED = True
102+
AUTOTHROTTLE_ENABLED = True
73103
# The initial download delay
74-
# AUTOTHROTTLE_START_DELAY = 5
104+
AUTOTHROTTLE_START_DELAY = max(DOWNLOAD_DELAY * 2, 5)
75105
# The maximum download delay to be set in case of high latencies
76-
# AUTOTHROTTLE_MAX_DELAY = 60
106+
AUTOTHROTTLE_MAX_DELAY = 60
77107
# The average number of requests Scrapy should be sending in parallel to
78108
# each remote server
79-
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
109+
AUTOTHROTTLE_TARGET_CONCURRENCY = CONCURRENT_REQUESTS_PER_DOMAIN
80110
# Enable showing throttling stats for every response received:
81-
# AUTOTHROTTLE_DEBUG = False
111+
AUTOTHROTTLE_DEBUG = False
112+
AUTOTHROTTLE_HTTP_CODES = (429, 503, 504)
82113

83114
# Enable and configure HTTP caching (disabled by default)
84115
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85-
# HTTPCACHE_ENABLED = True
86-
# HTTPCACHE_EXPIRATION_SECS = 0
116+
HTTPCACHE_ENABLED = True
117+
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24 * 7 # 1 week
87118
# HTTPCACHE_DIR = "httpcache"
88-
# HTTPCACHE_IGNORE_HTTP_CODES = []
119+
HTTPCACHE_IGNORE_HTTP_CODES = (202, 408, 429, 500, 502, 503, 504)
89120
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
121+
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
122+
123+
# Retry settings
124+
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#retrymiddleware-settings
125+
RETRY_ENABLED = True
126+
RETRY_TIMES = 2
127+
RETRY_HTTP_CODES = (408, 429, 500, 502, 503, 504, 522, 524)
128+
RETRY_PRIORITY_ADJUST = -1
129+
130+
# Delayed retry settings
131+
DELAYED_RETRY_HTTP_CODES = (202,)
132+
DELAYED_RETRY_TIMES = -1
133+
DELAYED_RETRY_PRIORITY_ADJUST = 0
134+
DELAYED_RETRY_DELAY = 10.0
135+
DELAYED_RETRY_BACKOFF = True
136+
DELAYED_RETRY_BACKOFF_MAX_DELAY = 100.0
137+
138+
MEDIA_ALLOW_REDIRECTS = True
139+
140+
# Image processing
141+
# https://docs.scrapy.org/en/latest/topics/media-pipeline.html#using-the-images-pipeline
142+
IMAGES_STORE = BASE_DIR / "images"
143+
IMAGES_URLS_FIELD = "image_url_download"
144+
IMAGES_RESULT_FIELD = "image_file"
145+
IMAGES_EXPIRES = 360
146+
# IMAGES_THUMBS = {"thumb": (1024, 1024)}
147+
148+
# BlurHash
149+
BLURHASH_FIELD = "image_blurhash"
150+
BLURHASH_X_COMPONENTS = 4
151+
BLURHASH_Y_COMPONENTS = 4
90152

91153
# Set settings whose default value is deprecated to a future-proof value
92154
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"

0 commit comments

Comments
 (0)