Update import options for versions (#861)
Many years ago, we made a number of changes to the versions table in the DB and to the corresponding format of import records, in order to make them clearer and to more closely resemble a record of an HTTP response (since that is effectively what they wound up being). We never updated the corresponding import code here, though! That means we've had backwards-compatibility code sitting around in the DB import script for years. This finally updates the import code to match, and after it ships, we can clean up the DB as well.

This is extracted from #858, which I thought would land sooner.
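
In practice, the change is a set of field renames that make each import record read like a stored HTTP response. A minimal before/after sketch of a single import record (the URLs, hash, and length below are illustrative placeholders, not data from this commit):

    # Renames in this commit: page_url -> url, uri -> body_url,
    # hash/version_hash -> body_hash; response headers move out of
    # source_metadata into a top-level 'headers' field, and a top-level
    # 'content_length' field is added.
    old_record = {
        'page_url': 'https://example.gov/page',
        'uri': 'https://web.archive.org/web/20200101000000id_/https://example.gov/page',
        'version_hash': 'e3b0c44298fc1c149afbf4c8996fb924...',
        'source_metadata': {
            'headers': {'content-type': 'text/html'},
            'view_url': 'https://web.archive.org/web/20200101000000/https://example.gov/page',
        },
    }
    new_record = {
        'url': 'https://example.gov/page',
        'body_url': 'https://web.archive.org/web/20200101000000id_/https://example.gov/page',
        'body_hash': 'e3b0c44298fc1c149afbf4c8996fb924...',
        'content_length': 1024,
        'headers': {'content-type': 'text/html'},
        'source_metadata': {
            'view_url': 'https://web.archive.org/web/20200101000000/https://example.gov/page',
        },
    }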
Mr0grog authored Feb 18, 2025
1 parent bac83d7 commit 99ce792
Showing 4 changed files with 81 additions and 74 deletions.
21 changes: 11 additions & 10 deletions web_monitoring/cli/cli.py
@@ -413,13 +413,13 @@ def process_record(self, record):
         with memento:
             version = self.format_memento(memento, record, self.maintainers,
                                           self.tags)
-            if self.archive_storage and version['version_hash']:
+            if self.archive_storage and version['body_hash']:
                 url = self.archive_storage.store(
                     memento.content,
-                    hash=version['version_hash'],
+                    hash=version['body_hash'],
                     content_type=version['media_type']
                 )
-                version['uri'] = url
+                version['body_url'] = url

         return version

@@ -434,7 +434,6 @@ def format_memento(self, memento, cdx_record, maintainers, tags):
         iso_date = f'{no_tz_date}Z'

         metadata = {
-            'headers': dict(memento.headers),
             'view_url': cdx_record.view_url
         }

@@ -459,19 +458,21 @@ def format_memento(self, memento, cdx_record, maintainers, tags):

         return dict(
             # Page-level info
-            page_url=cdx_record.url,
+            url=cdx_record.url,
             page_maintainers=maintainers,
             page_tags=tags,
             title=title,

             # Version/memento-level info
             capture_time=iso_date,
-            uri=cdx_record.raw_url,
+            body_url=cdx_record.raw_url,
             media_type=media_type or None,
-            version_hash=utils.hash_content(memento.content),
+            content_length=len(memento.content),
+            body_hash=utils.hash_content(memento.content),
             source_type='internet_archive',
             source_metadata=metadata,
-            status=memento.status_code
+            status=memento.status_code,
+            headers=dict(memento.headers),
         )

     def get_memento_media(self, memento):
@@ -770,8 +771,8 @@ def _filter_unchanged_versions(versions):
     """
     last_hashes = {}
     for version in versions:
-        if last_hashes.get(version['page_url']) != version['version_hash']:
-            last_hashes[version['page_url']] = version['version_hash']
+        if last_hashes.get(version['url']) != version['body_hash']:
+            last_hashes[version['url']] = version['body_hash']
             yield version
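
To make the behavior above concrete, here is a self-contained sketch of the same deduplication logic with the renamed fields (the sample records are made up for illustration):

    def _filter_unchanged_versions(versions):
        # Yield only versions whose body_hash differs from the previous
        # version of the same URL; consecutive duplicates are dropped.
        last_hashes = {}
        for version in versions:
            if last_hashes.get(version['url']) != version['body_hash']:
                last_hashes[version['url']] = version['body_hash']
                yield version

    samples = [
        {'url': 'http://example.com', 'body_hash': 'a'},
        {'url': 'http://example.com', 'body_hash': 'a'},  # unchanged, dropped
        {'url': 'http://example.com', 'body_hash': 'b'},
    ]
    assert [v['body_hash'] for v in _filter_unchanged_versions(samples)] == ['a', 'b']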


42 changes: 24 additions & 18 deletions web_monitoring/db.py
@@ -101,8 +101,9 @@ def _time_range_string(start_date, end_date):
     return f'{start_str}..{end_str}'


-def _build_version(*, page_id, uuid, capture_time, uri, hash, source_type,
-                   title, source_metadata=None, media_type=None):
+def _build_version(*, page_id, uuid, capture_time, body_url, body_hash,
+                   source_type, title, source_metadata=None, media_type=None,
+                   headers=None, content_length=None):
     """
     Build a Version dict from parameters, performing some validation.
     """
@@ -113,42 +114,47 @@ def _build_version(*, page_id, uuid, capture_time, uri, hash, source_type,
     version = {'page_id': page_id,
                'uuid': uuid,
                'capture_time': capture_time,
-               'uri': str(uri),
-               'hash': str(hash),
+               'body_url': str(body_url),
+               'body_hash': str(body_hash),
                'source_type': str(source_type),
                'title': str(title),
                'source_metadata': source_metadata,
-               'media_type': media_type}
+               'media_type': media_type,
+               'headers': headers,
+               'content_length': content_length}
     return version


-def _build_importable_version(*, page_url, uuid=None, capture_time, uri,
-                              version_hash, source_type, title,
+def _build_importable_version(*, url, uuid=None, capture_time, body_url,
+                              body_hash, source_type, title,
                               page_maintainers=None, page_tags=None,
                               source_metadata=None, status=None,
-                              media_type=None):
+                              media_type=None, headers=None,
+                              content_length=None):
     """
     Build a Version dict from parameters, performing some validation.
-    This is different than _build_version because it needs ``page_url`` instead
+    This is different than _build_version because it needs ``url`` instead
     of ``page_id`` of an existing Page.
     """
     if not isinstance(capture_time, str):
         capture_time = _tzaware_isoformat(capture_time)
     if source_metadata is None:
         source_metadata = {}
-    version = {'page_url': page_url,
+    version = {'url': url,
                'uuid': uuid,
                'capture_time': capture_time,
-               'uri': str(uri),
-               'hash': str(version_hash),
+               'body_url': str(body_url),
+               'body_hash': str(body_hash),
                'source_type': str(source_type),
                'title': str(title),
                'source_metadata': source_metadata,
                'status': str(status),
                'page_maintainers': page_maintainers,
                'page_tags': page_tags,
-               'media_type': media_type}
+               'media_type': media_type,
+               'headers': headers,
+               'content_length': content_length}
     return version


@@ -676,7 +682,7 @@ def get_version(self, version_id, include_change_from_previous=None,
         result = self.request_json(GET, url, params=params)
         return result

-    def add_version(self, *, page_id, capture_time, uri, hash,
+    def add_version(self, *, page_id, capture_time, body_url, body_hash,
                     source_type, title, uuid=None, source_metadata=None):
         """
         Submit one new Version.
@@ -687,9 +693,9 @@ def add_version(self, *, page_id, capture_time, uri, hash,
         ----------
         page_id : string
             Page to which the Version is associated
-        uri : string
+        body_url : string
            URI of content (such as an S3 bucket or InternetArchive URL)
-        hash : string
+        body_hash : string
            SHA256 hash of Version content
         source_type : string
            such as 'versionista' or 'internetarchive'
@@ -710,8 +716,8 @@ def add_version(self, *, page_id, capture_time, uri, hash,
             page_id=page_id,
             uuid=uuid,
             capture_time=capture_time,
-            uri=uri,
-            hash=hash,
+            body_url=body_url,
+            body_hash=body_hash,
             source_type=source_type,
             title=title,
             source_metadata=source_metadata)
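
For callers, the visible change to Client.add_version is the renamed keyword arguments (uri becomes body_url, hash becomes body_hash). A hypothetical call under the new signature; the page ID, URL, and hash below are placeholders, and AUTH stands in for whatever credentials mapping your Client needs, as with Client(**AUTH) in the tests below:

    from web_monitoring.db import Client

    # AUTH is a placeholder credentials mapping, as in test_db.py below.
    client = Client(**AUTH)
    client.add_version(
        page_id='6bfd2e1c-0000-0000-0000-000000000000',  # placeholder ID of an existing Page
        capture_time='2025-02-18T00:00:00Z',
        body_url='https://example-storage.test/abc123',  # was `uri`
        body_hash='e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',  # was `hash`
        source_type='internet_archive',
        title='Example Page')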
82 changes: 41 additions & 41 deletions web_monitoring/tests/test_cli.py
@@ -31,23 +31,23 @@

 def test_filter_unchanged_versions():
     versions = (
-        {'page_url': 'http://example.com', 'version_hash': 'a'},
-        {'page_url': 'http://example.com', 'version_hash': 'b'},
-        {'page_url': 'http://example.com', 'version_hash': 'b'},
-        {'page_url': 'http://other.com', 'version_hash': 'b'},
-        {'page_url': 'http://example.com', 'version_hash': 'b'},
-        {'page_url': 'http://example.com', 'version_hash': 'c'},
-        {'page_url': 'http://other.com', 'version_hash': 'd'},
-        {'page_url': 'http://other.com', 'version_hash': 'b'},
+        {'url': 'http://example.com', 'body_hash': 'a'},
+        {'url': 'http://example.com', 'body_hash': 'b'},
+        {'url': 'http://example.com', 'body_hash': 'b'},
+        {'url': 'http://other.com', 'body_hash': 'b'},
+        {'url': 'http://example.com', 'body_hash': 'b'},
+        {'url': 'http://example.com', 'body_hash': 'c'},
+        {'url': 'http://other.com', 'body_hash': 'd'},
+        {'url': 'http://other.com', 'body_hash': 'b'},
     )

     assert list(_filter_unchanged_versions(versions)) == [
-        {'page_url': 'http://example.com', 'version_hash': 'a'},
-        {'page_url': 'http://example.com', 'version_hash': 'b'},
-        {'page_url': 'http://other.com', 'version_hash': 'b'},
-        {'page_url': 'http://example.com', 'version_hash': 'c'},
-        {'page_url': 'http://other.com', 'version_hash': 'd'},
-        {'page_url': 'http://other.com', 'version_hash': 'b'},
+        {'url': 'http://example.com', 'body_hash': 'a'},
+        {'url': 'http://example.com', 'body_hash': 'b'},
+        {'url': 'http://other.com', 'body_hash': 'b'},
+        {'url': 'http://example.com', 'body_hash': 'c'},
+        {'url': 'http://other.com', 'body_hash': 'd'},
+        {'url': 'http://other.com', 'body_hash': 'b'},
     ]


@@ -75,24 +75,24 @@ def test_format_memento():

     assert isinstance(version, dict)

-    assert version['page_url'] == url
+    assert version['url'] == url
     assert version['page_maintainers'] == ['maintainer']
     assert version['page_tags'] == ['tag']
     assert version['title'] == "U.S. Fish & Wildlife Service - Migratory Bird Program | Conserving America's Birds"

     assert version['capture_time'] == '2017-11-24T15:13:15Z'
-    assert version['uri'] == f'https://web.archive.org/web/20171124151315id_/{url}'
-    assert version['version_hash'] == 'ae433414499f91630983fc379d9bafae67250061178930b8779ee76c82485491'
+    assert version['body_url'] == f'https://web.archive.org/web/20171124151315id_/{url}'
+    assert version['body_hash'] == 'ae433414499f91630983fc379d9bafae67250061178930b8779ee76c82485491'
     assert version['source_type'] == 'internet_archive'
     assert version['status'] == 200
     assert version['media_type'] == 'text/html'
+    assert version['headers'] == {
+        'content-type': 'text/html',
+        'date': 'Fri, 24 Nov 2017 15:13:14 GMT',
+        'strict-transport-security': 'max-age=31536000; includeSubDomains; preload',
+        'transfer-encoding': 'chunked'
+    }
     assert version['source_metadata'] == {
-        'headers': {
-            'content-type': 'text/html',
-            'date': 'Fri, 24 Nov 2017 15:13:14 GMT',
-            'strict-transport-security': 'max-age=31536000; includeSubDomains; preload',
-            'transfer-encoding': 'chunked'
-        },
         'view_url': 'https://web.archive.org/web/20171124151315/https://www.fws.gov/birds/'
     }

@@ -131,32 +131,32 @@ def test_format_memento_pdf():

     assert isinstance(version, dict)

-    assert version['page_url'] == url
+    assert version['url'] == url
     assert version['page_maintainers'] == ['maintainer']
     assert version['page_tags'] == ['tag']
     assert version['title'] == "EPA Office of Air and Radiation Climate Change Adaptation Implementation Plan, June 2014"
     assert version['capture_time'] == '2020-04-30T02:42:32Z'
-    assert version['uri'] == f'https://web.archive.org/web/20200430024232id_/{url}'
-    assert version['version_hash'] == 'bdfd8c1ee22b70cd1b8bd513989822e066a9656f4578606ef3d5feb6204e3dc6'
+    assert version['body_url'] == f'https://web.archive.org/web/20200430024232id_/{url}'
+    assert version['body_hash'] == 'bdfd8c1ee22b70cd1b8bd513989822e066a9656f4578606ef3d5feb6204e3dc6'
     assert version['source_type'] == 'internet_archive'
     assert version['status'] == 200
     assert version['media_type'] == 'application/pdf'
+    assert version['headers'] == {
+        'accept-ranges': 'bytes',
+        'cache-control': 'max-age=572',
+        'connection': 'close',
+        'content-length': '375909',
+        'content-type': 'application/pdf',
+        'date': 'Thu, 30 Apr 2020 02:42:32 GMT',
+        'etag': '"12c958e520c9ff580f52ee11446c5e0c:1579909999.298098"',
+        'expires': 'Thu, 30 Apr 2020 02:52:04 GMT',
+        'last-modified': 'Tue, 16 Aug 2016 15:43:21 GMT',
+        'server': 'AkamaiNetStorage',
+        'server-timing': 'cdn-cache; desc=HIT',
+        'strict-transport-security': 'max-age=31536000; preload;',
+        'x-content-type-options': 'nosniff'
+    }
     assert version['source_metadata'] == {
-        'headers': {
-            'accept-ranges': 'bytes',
-            'cache-control': 'max-age=572',
-            'connection': 'close',
-            'content-length': '375909',
-            'content-type': 'application/pdf',
-            'date': 'Thu, 30 Apr 2020 02:42:32 GMT',
-            'etag': '"12c958e520c9ff580f52ee11446c5e0c:1579909999.298098"',
-            'expires': 'Thu, 30 Apr 2020 02:52:04 GMT',
-            'last-modified': 'Tue, 16 Aug 2016 15:43:21 GMT',
-            'server': 'AkamaiNetStorage',
-            'server-timing': 'cdn-cache; desc=HIT',
-            'strict-transport-security': 'max-age=31536000; preload;',
-            'x-content-type-options': 'nosniff'
-        },
         'view_url': 'https://web.archive.org/web/20200430024232/https://www.epa.gov/sites/production/files/2016-08/documents/oar-climate-change-adaptation-plan.pdf'
     }

10 changes: 5 additions & 5 deletions web_monitoring/tests/test_db.py
@@ -266,8 +266,8 @@ def test_add_version():
     cli = Client(**AUTH)
     cli.add_version(page_id=PAGE_ID, uuid=NEW_VERSION_ID,
                     capture_time=TIME,
-                    uri='http://example.com',
-                    hash='hash_placeholder',
+                    body_url='http://example.com',
+                    body_hash='hash_placeholder',
                     title='title_placeholder',
                     source_type='test')

@@ -301,10 +301,10 @@ def test_add_versions():
                    'b8cc3d0f-f2eb-43ef-bfc7-d0b589ee7f49']
     versions = [dict(uuid=version_id,
                      # Notice the importer needs url instead of page_id.
-                     page_url='http://example.com',
+                     url='http://example.com',
                      capture_time=TIME,
-                     uri='http://example.com',
-                     version_hash='hash_placeholder',
+                     body_url='http://example.com',
+                     body_hash='hash_placeholder',
                      title='title_placeholder',
                      page_maintainers=['agency_placeholder'],
                      page_tags=['site:site_placeholder'],
