Skip to content

Commit 6019071

Browse files
committed
feat: Ignore resource_sort_order config if resource_regex is set
1 parent 5ac4dfa commit 6019071

3 files changed

Lines changed: 87 additions & 9 deletions

File tree

ckanext/switzerland/harvester/base_sbb_harvester.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,13 @@ def get_config_validation_schema(self):
194194
def load_config(self, config_str):
195195
schema = self.get_config_validation_schema()
196196
data = json.loads(config_str)
197+
# If resource_regex is in the config, ignore resource_sort_order.
198+
if "resource_regex" in data and "resource_sort_order" in data:
199+
del data["resource_sort_order"]
200+
log.info(
201+
"resource_regex is set in config: ignoring resource_sort_order and "
202+
"using default value (desc)"
203+
)
197204
return schema(data)
198205

199206
# tested
@@ -1006,7 +1013,7 @@ def _get_ordered_resources(self, package):
10061013
else:
10071014
reverse = True
10081015
log.debug(
1009-
f"The configured resource_sort_order is {self.config['resource_sort_order']}"
1016+
f"resource_sort_order is {self.config['resource_sort_order']}"
10101017
f" and reverse is {reverse}"
10111018
)
10121019

@@ -1033,9 +1040,11 @@ def finalize(self, harvest_object, harvest_object_data): # noqa: C901
10331040
# Deleting old resources, generate permalink, order resources:
10341041
# We do this by matching a regex, defined in the `resource_regex` key of the
10351042
# harvester json config, against the identifier (filename) of the resources of
1036-
# the dataset. The ones that matched are thrown in a list and sorted by name,
1037-
# descending. This makes the newest file appear first when the filesnames have
1038-
# the correct format (YYYY-MM-DD-*).
1043+
# the dataset. The ones that matched are thrown in a list and sorted by name.
1044+
# When resource_regex is omitted from the config JSON, resource_sort_order
1045+
# selects ascending vs descending; when resource_regex is included in the
1046+
# config, resource_sort_order is ignored and matched resources are always sorted
1047+
# descending (newest name first for YYYYMMDD-* style names).
10391048
# In case filenames have a different structure, e.g., *_YYYY-MM-DD.csv,
10401049
# `date_pattern` should be specified in the harvester configuration, which is
10411050
# used to list the newest files on the top of the list.

ckanext/switzerland/tests/base_ftp_harvester_tests.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def run_harvester(
2424
self,
2525
force_all=False,
2626
resource_regex=None,
27+
resource_sort_order=None,
2728
max_resources=None,
2829
dataset=data.dataset_name,
2930
timetable_regex=None,
@@ -47,6 +48,8 @@ def run_harvester(
4748
config["force_all"] = True
4849
if resource_regex:
4950
config["resource_regex"] = resource_regex
51+
if resource_sort_order is not None:
52+
config["resource_sort_order"] = resource_sort_order
5053
if max_resources:
5154
config["max_resources"] = max_resources
5255
if timetable_regex:

ckanext/switzerland/tests/test_sbb_harvester.py

Lines changed: 71 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -363,11 +363,77 @@ def test_order_permalink_regex(self):
363363
),
364364
)
365365

366-
# TODO: add tests for resource_sort_order:
367-
# - default resource_sort_order is "desc"
368-
# - resource_sort_order set as "desc" is respected
369-
# - resource_sort_order set as "asc" is respected
370-
# - resource_sort_order is not used if resource_regex is set
366+
def test_resource_sort_order_default_is_desc(self):
367+
"""If resource_sort_order is omitted from config, resources are ordered desc."""
368+
filesystem = self.get_filesystem(filename="20160901.csv")
369+
MockFTPStorageAdapter.filesystem = filesystem
370+
path = os.path.join(data.environment, data.folder, "20160902.csv")
371+
filesystem.writetext(path, data.dataset_content_2)
372+
self.run_harvester(ftp_server="testserver")
373+
374+
package = self.get_package()
375+
self.assertEqual(package.resources[0].extras["identifier"], "20160902.csv")
376+
self.assertEqual(package.resources[1].extras["identifier"], "20160901.csv")
377+
378+
def test_resource_sort_order_desc_explicit(self):
379+
"""resource_sort_order 'desc' sorts identifiers descending (newest name first)."""
380+
filesystem = self.get_filesystem(filename="20160901.csv")
381+
MockFTPStorageAdapter.filesystem = filesystem
382+
path = os.path.join(data.environment, data.folder, "20160902.csv")
383+
filesystem.writetext(path, data.dataset_content_2)
384+
self.run_harvester(ftp_server="testserver", resource_sort_order="desc")
385+
386+
package = self.get_package()
387+
self.assertEqual(package.resources[0].extras["identifier"], "20160902.csv")
388+
self.assertEqual(package.resources[1].extras["identifier"], "20160901.csv")
389+
390+
def test_resource_sort_order_asc(self):
391+
"""resource_sort_order 'asc' sorts identifiers ascending."""
392+
filesystem = self.get_filesystem(filename="20160901.csv")
393+
MockFTPStorageAdapter.filesystem = filesystem
394+
path = os.path.join(data.environment, data.folder, "20160902.csv")
395+
filesystem.writetext(path, data.dataset_content_2)
396+
self.run_harvester(ftp_server="testserver", resource_sort_order="asc")
397+
398+
package = self.get_package()
399+
self.assertEqual(package.resources[0].extras["identifier"], "20160901.csv")
400+
self.assertEqual(package.resources[1].extras["identifier"], "20160902.csv")
401+
self.assertEqual(
402+
package.extras["permalink"],
403+
"http://odp.test/dataset/{}/resource/{}/download/20160901.csv".format(
404+
package.id, package.resources[0].id
405+
),
406+
)
407+
408+
def test_resource_sort_order_ignored_when_resource_regex_in_config(self):
409+
"""If resource_regex is present in the harvester config, resource_sort_order is
410+
ignored: matched files keep descending identifier order even when asc is set.
411+
Unmatched files still precede the matched block (unchanged).
412+
"""
413+
filesystem = self.get_filesystem(filename="20160901.csv")
414+
MockFTPStorageAdapter.filesystem = filesystem
415+
path = os.path.join(data.environment, data.folder, "20160902.csv")
416+
filesystem.writetext(path, data.dataset_content_2)
417+
path = os.path.join(data.environment, data.folder, "1111Resource.csv")
418+
filesystem.writetext(path, data.dataset_content_3)
419+
path = os.path.join(data.environment, data.folder, "9999Resource.csv")
420+
filesystem.writetext(path, data.dataset_content_3)
421+
422+
self.run_harvester(
423+
resource_regex=r"\d{8}.csv",
424+
resource_sort_order="asc",
425+
ftp_server="testserver",
426+
)
427+
package = self.get_package()
428+
self.assertEqual(
429+
[r.extras["identifier"] for r in package.resources],
430+
[
431+
"1111Resource.csv",
432+
"9999Resource.csv",
433+
"20160902.csv",
434+
"20160901.csv",
435+
],
436+
)
371437

372438
# cleanup tests
373439
def test_max_resources(self):

0 commit comments

Comments
 (0)