Skip to content

Commit 65a5370

Browse files
authored
Merge pull request #200 from liip/feat/add-sort-order-option-for-harvester-config
feat: Add resource_sort_order option in harvester
2 parents 3b6bd48 + 6019071 commit 65a5370

3 files changed

Lines changed: 104 additions & 6 deletions

File tree

ckanext/switzerland/harvester/base_sbb_harvester.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,22 @@ def get_config_validation_schema(self):
185185
"storage_adapter": str,
186186
"bucket": str,
187187
voluptuous.Required("date_pattern", default=""): str,
188+
voluptuous.Required(
189+
"resource_sort_order", default="desc"
190+
): voluptuous.In(["asc", "desc"]),
188191
}
189192
)
190193

191194
def load_config(self, config_str):
192195
schema = self.get_config_validation_schema()
193196
data = json.loads(config_str)
197+
# If resource_regex is in the config, ignore resource_sort_order.
198+
if "resource_regex" in data and "resource_sort_order" in data:
199+
del data["resource_sort_order"]
200+
log.info(
201+
"resource_regex is set in config: ignoring resource_sort_order and "
202+
"using default value (desc)"
203+
)
194204
return schema(data)
195205

196206
# tested
@@ -988,23 +998,34 @@ def _get_ordered_resources(self, package):
988998
unmatched_resources = []
989999

9901000
# get filename regex for permalink from harvester config or fall back to a
991-
# catch-all
1001+
# catch-all that matches all filenames (.*)
9921002
identifier_regex = self.config["resource_regex"]
9931003
for resource in package["resources"]:
9941004
if re.match(identifier_regex, resource["identifier"], re.IGNORECASE):
9951005
ordered_resources.append(resource)
9961006
else:
1007+
# We only add to unmatched_resources if the resource_regex exists and a
1008+
# filename doesn't match it
9971009
unmatched_resources.append(resource)
9981010

1011+
if self.config["resource_sort_order"] == "asc":
1012+
reverse = False
1013+
else:
1014+
reverse = True
1015+
log.debug(
1016+
f"resource_sort_order is {self.config['resource_sort_order']}"
1017+
f" and reverse is {reverse}"
1018+
)
1019+
9991020
if self.config["date_pattern"]:
10001021
ordered_resources.sort(
10011022
key=lambda r: re.search(
10021023
self.config["date_pattern"], r["identifier"]
10031024
).group(),
1004-
reverse=True,
1025+
reverse=reverse,
10051026
)
10061027
else:
1007-
ordered_resources.sort(key=lambda r: r["identifier"], reverse=True)
1028+
ordered_resources.sort(key=lambda r: r["identifier"], reverse=reverse)
10081029

10091030
return ordered_resources, unmatched_resources
10101031

@@ -1019,9 +1040,11 @@ def finalize(self, harvest_object, harvest_object_data): # noqa: C901
10191040
# Deleting old resources, generate permalink, order resources:
10201041
# We do this by matching a regex, defined in the `resource_regex` key of the
10211042
# harvester json config, against the identifier (filename) of the resources of
1022-
# the dataset. The ones that matched are thrown in a list and sorted by name,
1023-
# descending. This makes the newest file appear first when the filesnames have
1024-
# the correct format (YYYY-MM-DD-*).
1043+
# the dataset. The ones that matched are thrown in a list and sorted by name.
1044+
# When resource_regex is omitted from the config JSON, resource_sort_order
1045+
# selects ascending vs descending; when resource_regex is included in the
1046+
# config, resource_sort_order is ignored and matched resources are always sorted
1047+
# descending (newest name first for YYYYMMDD-* style names).
10251048
# In case filenames have a different structure, e.g., *_YYYY-MM-DD.csv,
10261049
# `date_pattern` should be specified in the harvester configuration, which is
10271050
# used to list the newest files at the top of the list.

ckanext/switzerland/tests/base_ftp_harvester_tests.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def run_harvester(
2424
self,
2525
force_all=False,
2626
resource_regex=None,
27+
resource_sort_order=None,
2728
max_resources=None,
2829
dataset=data.dataset_name,
2930
timetable_regex=None,
@@ -47,6 +48,8 @@ def run_harvester(
4748
config["force_all"] = True
4849
if resource_regex:
4950
config["resource_regex"] = resource_regex
51+
if resource_sort_order is not None:
52+
config["resource_sort_order"] = resource_sort_order
5053
if max_resources:
5154
config["max_resources"] = max_resources
5255
if timetable_regex:

ckanext/switzerland/tests/test_sbb_harvester.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,78 @@ def test_order_permalink_regex(self):
363363
),
364364
)
365365

366+
def test_resource_sort_order_default_is_desc(self):
367+
"""If resource_sort_order is omitted from config, resources are ordered desc."""
368+
filesystem = self.get_filesystem(filename="20160901.csv")
369+
MockFTPStorageAdapter.filesystem = filesystem
370+
path = os.path.join(data.environment, data.folder, "20160902.csv")
371+
filesystem.writetext(path, data.dataset_content_2)
372+
self.run_harvester(ftp_server="testserver")
373+
374+
package = self.get_package()
375+
self.assertEqual(package.resources[0].extras["identifier"], "20160902.csv")
376+
self.assertEqual(package.resources[1].extras["identifier"], "20160901.csv")
377+
378+
def test_resource_sort_order_desc_explicit(self):
379+
"""resource_sort_order 'desc' sorts identifiers descending (newest name first)."""
380+
filesystem = self.get_filesystem(filename="20160901.csv")
381+
MockFTPStorageAdapter.filesystem = filesystem
382+
path = os.path.join(data.environment, data.folder, "20160902.csv")
383+
filesystem.writetext(path, data.dataset_content_2)
384+
self.run_harvester(ftp_server="testserver", resource_sort_order="desc")
385+
386+
package = self.get_package()
387+
self.assertEqual(package.resources[0].extras["identifier"], "20160902.csv")
388+
self.assertEqual(package.resources[1].extras["identifier"], "20160901.csv")
389+
390+
def test_resource_sort_order_asc(self):
391+
"""resource_sort_order 'asc' sorts identifiers ascending."""
392+
filesystem = self.get_filesystem(filename="20160901.csv")
393+
MockFTPStorageAdapter.filesystem = filesystem
394+
path = os.path.join(data.environment, data.folder, "20160902.csv")
395+
filesystem.writetext(path, data.dataset_content_2)
396+
self.run_harvester(ftp_server="testserver", resource_sort_order="asc")
397+
398+
package = self.get_package()
399+
self.assertEqual(package.resources[0].extras["identifier"], "20160901.csv")
400+
self.assertEqual(package.resources[1].extras["identifier"], "20160902.csv")
401+
self.assertEqual(
402+
package.extras["permalink"],
403+
"http://odp.test/dataset/{}/resource/{}/download/20160901.csv".format(
404+
package.id, package.resources[0].id
405+
),
406+
)
407+
408+
def test_resource_sort_order_ignored_when_resource_regex_in_config(self):
409+
"""If resource_regex is present in the harvester config, resource_sort_order is
410+
ignored: matched files keep descending identifier order even when asc is set.
411+
Unmatched files still precede the matched block (unchanged).
412+
"""
413+
filesystem = self.get_filesystem(filename="20160901.csv")
414+
MockFTPStorageAdapter.filesystem = filesystem
415+
path = os.path.join(data.environment, data.folder, "20160902.csv")
416+
filesystem.writetext(path, data.dataset_content_2)
417+
path = os.path.join(data.environment, data.folder, "1111Resource.csv")
418+
filesystem.writetext(path, data.dataset_content_3)
419+
path = os.path.join(data.environment, data.folder, "9999Resource.csv")
420+
filesystem.writetext(path, data.dataset_content_3)
421+
422+
self.run_harvester(
423+
resource_regex=r"\d{8}.csv",
424+
resource_sort_order="asc",
425+
ftp_server="testserver",
426+
)
427+
package = self.get_package()
428+
self.assertEqual(
429+
[r.extras["identifier"] for r in package.resources],
430+
[
431+
"1111Resource.csv",
432+
"9999Resource.csv",
433+
"20160902.csv",
434+
"20160901.csv",
435+
],
436+
)
437+
366438
# cleanup tests
367439
def test_max_resources(self):
368440
filesystem = self.get_filesystem(filename="20160901.csv")

0 commit comments

Comments
 (0)