Skip to content

Commit a25ef6d

Browse files
Added board_game_scraper.pipelines.LimitImagesPipeline
1 parent dafb2d2 commit a25ef6d

File tree

4 files changed

+94
-0
lines changed

4 files changed

+94
-0
lines changed

src/board_game_scraper/items.py

+3
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ class GameItem:
2929
url: str | None = None
3030
official_url: list[str] | None = None
3131
image_url: list[str] | None = None
32+
image_url_download: list[str] | None = None
3233
image_file: list[dict[str, str]] | None = None
3334
image_blurhash: list[dict[str, str]] | None = None
3435
video_url: list[str] | None = None
@@ -112,6 +113,7 @@ class RankingItem:
112113
bayes_rating: float | None = None
113114

114115
image_url: list[str] | None = None
116+
image_url_download: list[str] | None = None
115117
image_file: list[dict[str, str]] | None = None
116118
image_blurhash: list[dict[str, str]] | None = None
117119

@@ -140,6 +142,7 @@ class UserItem:
140142

141143
external_link: list[str] | None = None
142144
image_url: list[str] | None = None
145+
image_url_download: list[str] | None = None
143146
image_file: list[dict[str, str]] | None = None
144147
image_blurhash: list[dict[str, str]] | None = None
145148

src/board_game_scraper/pipelines.py

+85
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,85 @@
1+
"""Scrapy item pipelines"""
2+
3+
from __future__ import annotations
4+
5+
from itertools import islice
6+
from typing import TYPE_CHECKING
7+
8+
from itemadapter import ItemAdapter
9+
from scrapy.exceptions import NotConfigured
10+
from scrapy.utils.misc import arg_to_iter
11+
12+
if TYPE_CHECKING:
13+
from typing import Self, TypeVar
14+
15+
from scrapy import Spider
16+
from scrapy.crawler import Crawler
17+
18+
Typed = TypeVar("Typed")
19+
20+
21+
class LimitImagesPipeline:
    """
    Item pipeline which fills a target field with image URLs taken from a
    source field, optionally keeping only the first ``limit`` of them.
    """

    source_field: str
    target_field: str
    limit: int | None = None

    def __init__(
        self,
        source_field: str,
        target_field: str,
        limit: int | None = None,
    ):
        """
        Remember the field to read URLs from, the field to write them to, and
        the maximum number of URLs to copy (``None`` or negative: no limit).
        """

        self.source_field = source_field
        self.target_field = target_field
        self.limit = limit

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """
        Build the pipeline from the crawler settings.

        Reads ``LIMIT_IMAGES_URLS_FIELD`` (source), ``IMAGES_URLS_FIELD``
        (target) and ``LIMIT_IMAGES_TO_DOWNLOAD`` (limit, defaults to -1).
        Raises ``NotConfigured`` if either field name is missing or empty.
        """

        settings = crawler.settings
        source_field = settings.get("LIMIT_IMAGES_URLS_FIELD")
        target_field = settings.get("IMAGES_URLS_FIELD")

        if not (source_field and target_field):
            raise NotConfigured

        return cls(
            source_field=source_field,
            target_field=target_field,
            limit=settings.getint("LIMIT_IMAGES_TO_DOWNLOAD", -1),
        )

    def process_item(
        self,
        item: Typed,
        spider: Spider,  # noqa: ARG002
    ) -> Typed:
        """
        Write up to ``limit`` image URLs from the source field into the target
        field and return the (same) item.
        """

        adapter = ItemAdapter(item)
        destination = self.target_field

        # Items which do not declare the target field cannot have it assigned
        # without an error, so they are passed through untouched.
        if destination not in adapter.field_names():
            return item

        unlimited = self.limit is None or self.limit < 0

        # A limit of zero means "download nothing": no need to read the source.
        if not unlimited and not self.limit:
            adapter[destination] = []
            return item

        # islice with stop=None yields every element, i.e. copies everything;
        # otherwise only the first `limit` URLs are kept.
        stop = None if unlimited else self.limit
        candidates = arg_to_iter(adapter.get(self.source_field))
        adapter[destination] = list(islice(candidates, stop))
        return item

src/board_game_scraper/settings.py

+5
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@
121121
# Configure item pipelines
122122
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
123123
ITEM_PIPELINES = {
124+
"board_game_scraper.pipelines.LimitImagesPipeline": 500,
124125
"scrapy.pipelines.images.ImagesPipeline": 600,
125126
"scrapy_extensions.BlurHashPipeline": 700,
126127
}
@@ -176,6 +177,10 @@
176177
IMAGES_EXPIRES = 360
177178
# IMAGES_THUMBS = {"thumb": (1024, 1024)}
178179

180+
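# Limit images to download: at most LIMIT_IMAGES_TO_DOWNLOAD URLs are copied
# from LIMIT_IMAGES_URLS_FIELD to IMAGES_URLS_FIELD (0: none; negative: all)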
181+
LIMIT_IMAGES_TO_DOWNLOAD = 0
182+
LIMIT_IMAGES_URLS_FIELD = "image_url"
183+
179184
# BlurHash
180185
BLURHASH_FIELD = "image_blurhash"
181186
BLURHASH_X_COMPONENTS = 4

src/board_game_scraper/spiders/bgg.py

+1
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@ class BggSpider(SitemapSpider):
7777
custom_settings = { # noqa: RUF012
7878
"DOWNLOAD_DELAY": 2.0,
7979
"AUTOTHROTTLE_TARGET_CONCURRENCY": 4,
80+
"LIMIT_IMAGES_TO_DOWNLOAD": 1,
8081
}
8182

8283
def __init__(

0 commit comments

Comments
 (0)