-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathitems.py
More file actions
79 lines (61 loc) · 2.58 KB
/
items.py
File metadata and controls
79 lines (61 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
import soscan.utils
def serializeDateTime(dt):
    """
    Serialize a date-time value by delegating to soscan.utils.

    Used as the ``serializer`` for the date-time fields of the items
    declared in this module.

    Args:
        dt: The value to serialize; handed unchanged to
            ``soscan.utils.datetimeToJsonStr``.

    Returns:
        Whatever ``soscan.utils.datetimeToJsonStr`` returns for ``dt``.
    """
    to_json_str = soscan.utils.datetimeToJsonStr
    return to_json_str(dt)
class SitemapItem(scrapy.Item):
    """
    A single entry discovered in a sitemap.

    The ldsitemapspider emits items of this type instead of downloading
    the item itself.

    Attributes:
        source: URL of the sitemap the entry came from
        time_retrieved: When the entry was found in the sitemap
        url: URL of the entry listed in the sitemap
        time_loc: Timestamp from the sitemap lastmod value, if available
        changefreq: String value of the changefreq element, if available
        priority: Value of the priority element, if available
    """

    # Where and when the entry was found.
    source = scrapy.Field()
    time_retrieved = scrapy.Field(serializer=serializeDateTime)

    # What the sitemap says about the entry.
    url = scrapy.Field()
    time_loc = scrapy.Field(serializer=serializeDateTime)
    changefreq = scrapy.Field()
    priority = scrapy.Field()
class JsonLDItem(SitemapItem):
    """
    JSON-LD pulled from a page that was found by crawling a sitemap.

    The content retrieved from a web page is treated as an RDF dataset,
    so ``jsonld`` is always a list, even when only one JSON-LD block
    was retrieved from the page.

    Attributes:
        elapsed: Seconds taken to retrieve the page
        time_modified: Value of the Last-Modified response header, if available
        jsonld: list of JSON-LD structures extracted from a page
    """

    # Retrieval details, in addition to the inherited sitemap fields.
    elapsed = scrapy.Field()
    time_modified = scrapy.Field(serializer=serializeDateTime)

    # Extracted content.
    jsonld = scrapy.Field()
class SoscanItem(scrapy.Item):
    """
    All of the properties recorded for a JSON-LD thing.

    Every property that needs to be preserved in the database etc.
    should be declared here.
    """

    url = scrapy.Field()
    status = scrapy.Field()  # http status
    time_retrieved = scrapy.Field()
    time_loc = scrapy.Field()  # From the sitemap, if available
    time_modified = scrapy.Field()  # From the HTTP response header, if available
    time_dsmodified = scrapy.Field()  # dateModified value in JSON-LD object, if available
    jsonld = scrapy.Field()  # The JSON-LD object (de-serialized)
    normalized = scrapy.Field()  # the normalized JSON-LD object
    identifier = scrapy.Field()  # PID to be used for the item
    series_id = scrapy.Field()  # Series ID to be used for the item
    alt_identifiers = scrapy.Field()  # alternative identifiers extracted from the item
    format_id = scrapy.Field()

    def __repr__(self):
        """
        Only print out url after exiting the Pipeline.

        Returns:
            The repr of a one-entry dict holding the item's ``url``; the
            value is ``None`` when ``url`` has not been populated.
        """
        # Use .get() rather than indexing: a field that was never set raises
        # KeyError on item["url"], and __repr__ should not raise.
        return repr({"url": self.get("url")})