+from __future__ import annotations
+
 import re
-from collections.abc import Generator, Iterable
-from typing import Any
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any

-from scrapy.http import Response
 from scrapy.spiders import SitemapSpider

+from board_game_scraper.items import CollectionItem, GameItem
+
+if TYPE_CHECKING:
+    from collections.abc import Generator, Iterable
+
+    from scrapy.http import Response
+

 class BggSpider(SitemapSpider):
     name = "bgg"
@@ -44,5 +52,31 @@ def sitemap_filter(
            )
            yield entry

-    def parse(self, response: Response) -> None:
-        pass  # TODO: Parse XML response
+    def parse(
+        self,
+        response: Response,
+    ) -> Generator[GameItem | CollectionItem, None, None]:
+        for game in response.xpath("/items/item"):
+            bgg_item_type = game.xpath("@type").get()
+            if bgg_item_type != "boardgame":
+                self.logger.info("Skipping item type <%s>", bgg_item_type)
+                continue
+
+            yield GameItem(
+                name=game.xpath("name[@type='primary']/@value").get(),
+                bgg_id=game.xpath("@id").get(),
+                year=game.xpath("yearpublished/@value").get(),
+                description=game.xpath("description/text()").get(),
+                image_url=game.xpath("image/text()").getall(),  # TODO: <thumbnail>
+                scraped_at=datetime.now(timezone.utc),
+            )
+
+            for comment in game.xpath("comments/comment"):
+                yield CollectionItem(
+                    item_id=f"{comment.xpath('@username').get()}:{game.xpath('@id').get()}",
+                    bgg_id=game.xpath("@id").get(),
+                    bgg_user_name=comment.xpath("@username").get(),
+                    bgg_user_rating=comment.xpath("@rating").get(),
+                    comment=comment.xpath("@value").get(),
+                    scraped_at=datetime.now(timezone.utc),
+                )
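The import of CollectionItem and GameItem assumes matching definitions in board_game_scraper/items.py. A minimal sketch of what those classes might look like, with field names taken from the parse() call sites above; the dataclass style, types, and defaults are assumptions, not the repository's actual definitions:

# Hypothetical sketch of board_game_scraper/items.py; field names come from
# the parse() call sites, everything else is assumed.
from __future__ import annotations

from dataclasses import dataclass, field
from datetime import datetime


@dataclass
class GameItem:
    name: str | None = None
    bgg_id: str | None = None
    year: str | None = None
    description: str | None = None
    image_url: list[str] = field(default_factory=list)  # parse() passes getall(), i.e. a list
    scraped_at: datetime | None = None


@dataclass
class CollectionItem:
    item_id: str | None = None
    bgg_id: str | None = None
    bgg_user_name: str | None = None
    bgg_user_rating: str | None = None
    comment: str | None = None
    scraped_at: datetime | None = None

Scrapy's ItemAdapter accepts plain dataclasses, so items of roughly this shape can pass through pipelines unchanged.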
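The XPath expressions in parse() presuppose the layout of a BoardGameGeek XML API2 "thing" response with comments included. A quick, self-contained way to exercise those expressions against a hand-written sample payload with Scrapy's Selector (all sample values below are made up for illustration, not captured from the live API):

# Hand-written sample in the shape parse() expects; values are illustrative.
from scrapy.selector import Selector

SAMPLE_XML = """
<items>
  <item type="boardgame" id="13">
    <thumbnail>https://example.com/thumb.jpg</thumbnail>
    <image>https://example.com/image.jpg</image>
    <name type="primary" value="Catan"/>
    <yearpublished value="1995"/>
    <description>Trade, build, settle.</description>
    <comments page="1" totalitems="1">
      <comment username="alice" rating="8" value="A classic."/>
    </comments>
  </item>
</items>
"""

sel = Selector(text=SAMPLE_XML, type="xml")
for game in sel.xpath("/items/item"):
    assert game.xpath("@type").get() == "boardgame"
    assert game.xpath("@id").get() == "13"
    assert game.xpath("name[@type='primary']/@value").get() == "Catan"
    assert game.xpath("yearpublished/@value").get() == "1995"
    assert game.xpath("image/text()").getall() == ["https://example.com/image.jpg"]
    for comment in game.xpath("comments/comment"):
        assert comment.xpath("@username").get() == "alice"
        assert comment.xpath("@rating").get() == "8"
        assert comment.xpath("@value").get() == "A classic."

To run the spider method itself instead, the same XML can be wrapped in a scrapy.http.XmlResponse and passed to spider.parse(response), which iterates with the identical selectors.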