"""Scrapy settings for the board_game_scraper project.

Only settings that deviate from Scrapy's defaults (or are commonly tuned)
are listed here. Full reference:

    https://docs.scrapy.org/en/latest/topics/settings.html
    https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    https://docs.scrapy.org/en/latest/topics/spider-middleware.html
"""

import os
from pathlib import Path

BOT_NAME = "board-game-scraper"

SPIDER_MODULES = ["board_game_scraper.spiders"]
NEWSPIDER_MODULE = "board_game_scraper.spiders"

# Logging — level and per-item logging are overridable via environment
# variables; the quiet formatter suppresses noisy per-item log lines.
LOG_LEVEL = os.getenv("LOG_LEVEL") or "INFO"
LOG_FORMATTER = "scrapy_extensions.QuietLogFormatter"
LOG_SCRAPED_ITEMS = os.getenv("LOG_SCRAPED_ITEMS")

# Project root: three levels up from this settings module.
BASE_DIR = Path(__file__).resolve().parent.parent.parent

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "board-game-scraper (+https://recommend.games)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_PARSER = "scrapy.robotstxt.PythonRobotParser"

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies and the Telnet console are both explicitly enabled
# (these match Scrapy's defaults, stated here for clarity).
COOKIES_ENABLED = True
TELNETCONSOLE_ENABLED = True

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
    "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
    "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
    "scrapy.spidermiddlewares.depth.DepthMiddleware": 900,
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# NOTE: the stock RetryMiddleware is disabled (None) in favour of
# scrapy_extensions.DelayedRetryMiddleware at a similar position (555).
DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.offsite.OffsiteMiddleware": 50,
    "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
    "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
    "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
    "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
    "scrapy_extensions.DelayedRetryMiddleware": 555,
    "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
    "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
    "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
    "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
    "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
    "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
    "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# NOTE: the built-in AutoThrottle is disabled (None) and replaced by
# scrapy_extensions.NicerAutoThrottle.
EXTENSIONS = {
    "scrapy.extensions.corestats.CoreStats": 0,
    "scrapy.extensions.telnet.TelnetConsole": 0,
    "scrapy.extensions.memusage.MemoryUsage": 0,
    "scrapy.extensions.memdebug.MemoryDebugger": 0,
    "scrapy.extensions.closespider.CloseSpider": 0,
    "scrapy.extensions.feedexport.FeedExporter": 0,
    "scrapy.extensions.logstats.LogStats": 0,
    "scrapy.extensions.spiderstate.SpiderState": 0,
    "scrapy.extensions.throttle.AutoThrottle": None,
    "scrapy_extensions.NicerAutoThrottle": 0,
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "scrapy.pipelines.images.ImagesPipeline": 600,
    "scrapy_extensions.BlurHashPipeline": 700,
}

# See https://doc.scrapy.org/en/latest/topics/extensions.html#module-scrapy.extensions.closespider
# Taken from the environment as a string (or None when unset); Scrapy reads
# numeric settings with getfloat() and falls back to its default on None.
CLOSESPIDER_TIMEOUT = os.getenv("CLOSESPIDER_TIMEOUT")

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay: twice the fixed delay, but never below 5s.
AUTOTHROTTLE_START_DELAY = max(DOWNLOAD_DELAY * 2, 5)
# The maximum download delay to be set in case of high latencies
AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server (kept in sync with CONCURRENT_REQUESTS_PER_DOMAIN).
AUTOTHROTTLE_TARGET_CONCURRENCY = CONCURRENT_REQUESTS_PER_DOMAIN
# Enable showing throttling stats for every response received:
AUTOTHROTTLE_DEBUG = False
AUTOTHROTTLE_HTTP_CODES = (429, 503, 504)

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 60 * 60 * 24 * 7  # 1 week
# HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_HTTP_CODES = (202, 408, 429, 500, 502, 503, 504)
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"

# Retry settings
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#retrymiddleware-settings
RETRY_ENABLED = True
RETRY_TIMES = 2
RETRY_HTTP_CODES = (408, 429, 500, 502, 503, 504, 522, 524)
RETRY_PRIORITY_ADJUST = -1

# Delayed retry settings (consumed by scrapy_extensions.DelayedRetryMiddleware):
# HTTP 202 responses are retried indefinitely (-1) with exponential backoff.
DELAYED_RETRY_HTTP_CODES = (202,)
DELAYED_RETRY_TIMES = -1
DELAYED_RETRY_PRIORITY_ADJUST = 0
DELAYED_RETRY_DELAY = 10.0
DELAYED_RETRY_BACKOFF = True
DELAYED_RETRY_BACKOFF_MAX_DELAY = 100.0

MEDIA_ALLOW_REDIRECTS = True

# Image processing
# https://docs.scrapy.org/en/latest/topics/media-pipeline.html#using-the-images-pipeline
IMAGES_STORE = BASE_DIR / "images"
IMAGES_URLS_FIELD = "image_url_download"
IMAGES_RESULT_FIELD = "image_file"
IMAGES_EXPIRES = 360
# IMAGES_THUMBS = {"thumb": (1024, 1024)}

# BlurHash placeholder generation (scrapy_extensions.BlurHashPipeline)
BLURHASH_FIELD = "image_blurhash"
BLURHASH_X_COMPONENTS = 4
BLURHASH_Y_COMPONENTS = 4

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"