Skip to content

Commit 1c8f64a

Browse files
Actually start scraping something
1 parent 5496de3 commit 1c8f64a

File tree

3 files changed

+47
-8
lines changed

3 files changed

+47
-8
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ venv.bak/
8181
# Project specific
8282
.scrapy/
8383
feeds/
84+
feeds_v3/
8485
images/
8586
jobs/
8687
logs/

src/board_game_scraper/settings.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,24 @@
1212

1313
BASE_DIR = Path(__file__).resolve().parent.parent.parent
1414

15+
GAME_ITEM_URI = f"{BASE_DIR}/feeds_v3/%(name)s/GameItem/%(time)s.jl"
16+
USER_ITEM_URI = f"{BASE_DIR}/feeds_v3/%(name)s/UserItem/%(time)s.jl"
17+
COLLECTION_ITEM_URI = f"{BASE_DIR}/feeds_v3/%(name)s/CollectionItem/%(time)s.jl"
18+
1519
FEEDS = {
16-
BASE_DIR / "feeds_v3" / "GameItem" / "%(time)s.jl": {
20+
GAME_ITEM_URI: {
1721
"item_classes": ["board_game_scraper.items.GameItem"],
1822
"format": "jsonlines",
1923
"overwrite": False,
2024
"store_empty": False,
2125
},
22-
BASE_DIR / "feeds_v3" / "UserItem" / "%(time)s.jl": {
26+
USER_ITEM_URI: {
2327
"item_classes": ["board_game_scraper.items.UserItem"],
2428
"format": "jsonlines",
2529
"overwrite": False,
2630
"store_empty": False,
2731
},
28-
BASE_DIR / "feeds_v3" / "CollectionItem" / "%(time)s.jl": {
32+
COLLECTION_ITEM_URI: {
2933
"item_classes": ["board_game_scraper.items.CollectionItem"],
3034
"format": "jsonlines",
3135
"overwrite": False,

src/board_game_scraper/spiders/bgg.py

+39-5
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
1+
from __future__ import annotations
2+
13
import re
2-
from collections.abc import Generator, Iterable
3-
from typing import Any
4+
from datetime import datetime, timezone
5+
from typing import TYPE_CHECKING, Any
46

5-
from scrapy.http import Response
67
from scrapy.spiders import SitemapSpider
78

9+
from board_game_scraper.items import CollectionItem, GameItem
10+
11+
if TYPE_CHECKING:
12+
from collections.abc import Generator, Iterable
13+
14+
from scrapy.http import Response
15+
816

917
class BggSpider(SitemapSpider):
1018
name = "bgg"
@@ -44,5 +52,31 @@ def sitemap_filter(
4452
)
4553
yield entry
4654

47-
def parse(self, response: Response) -> None:
48-
pass # TODO: Parse XML response
55+
def parse(
    self,
    response: Response,
) -> Generator[GameItem | CollectionItem, None, None]:
    """Extract game and collection items from an XML response.

    Iterates over every ``/items/item`` element of the response. Elements
    whose ``type`` attribute is not ``"boardgame"`` are logged and skipped.
    For each remaining element one ``GameItem`` is yielded, followed by one
    ``CollectionItem`` per ``comments/comment`` child element.

    Args:
        response: The response whose ``/items/item`` elements are parsed.

    Yields:
        A ``GameItem`` for each board game, then a ``CollectionItem`` for
        each of that game's comments.
    """
    for game in response.xpath("/items/item"):
        bgg_item_type = game.xpath("@type").get()
        if bgg_item_type != "boardgame":
            self.logger.info("Skipping item type <%s>", bgg_item_type)
            continue

        # Looked up once per game: shared by the game item and by every
        # collection item derived from its comments.
        bgg_id = game.xpath("@id").get()

        yield GameItem(
            name=game.xpath("name[@type='primary']/@value").get(),
            bgg_id=bgg_id,
            year=game.xpath("yearpublished/@value").get(),
            description=game.xpath("description/text()").get(),
            image_url=game.xpath("image/text()").getall(),  # TODO: <thumbnail>
            scraped_at=datetime.now(timezone.utc),
        )

        for comment in game.xpath("comments/comment"):
            bgg_user_name = comment.xpath("@username").get()
            yield CollectionItem(
                # Built from locals rather than nested same-quote f-string
                # expressions, which only parse on Python 3.12+.
                item_id=f"{bgg_user_name}:{bgg_id}",
                bgg_id=bgg_id,
                bgg_user_name=bgg_user_name,
                bgg_user_rating=comment.xpath("@rating").get(),
                comment=comment.xpath("@value").get(),
                scraped_at=datetime.now(timezone.utc),
            )

0 commit comments

Comments
 (0)