diff --git a/ckanext/switzerland/harvester/base_sbb_harvester.py b/ckanext/switzerland/harvester/base_sbb_harvester.py index 89d201ff..9d186fca 100644 --- a/ckanext/switzerland/harvester/base_sbb_harvester.py +++ b/ckanext/switzerland/harvester/base_sbb_harvester.py @@ -185,12 +185,22 @@ def get_config_validation_schema(self): "storage_adapter": str, "bucket": str, voluptuous.Required("date_pattern", default=""): str, + voluptuous.Required( + "resource_sort_order", default="desc" + ): voluptuous.In(["asc", "desc"]), } ) def load_config(self, config_str): schema = self.get_config_validation_schema() data = json.loads(config_str) + # If resource_regex is in the config, ignore resource_sort_order. + if "resource_regex" in data and "resource_sort_order" in data: + del data["resource_sort_order"] + log.info( + "resource_regex is set in config: ignoring resource_sort_order and " + "using default value (desc)" + ) return schema(data) # tested @@ -988,23 +998,34 @@ def _get_ordered_resources(self, package): unmatched_resources = [] # get filename regex for permalink from harvester config or fallback to a - # catch-all + # catch-all that matches all filenames (.*) identifier_regex = self.config["resource_regex"] for resource in package["resources"]: if re.match(identifier_regex, resource["identifier"], re.IGNORECASE): ordered_resources.append(resource) else: + # We only add to unmatched_resources if the resource_regex exists and a + # filename doesn't match it unmatched_resources.append(resource) + if self.config["resource_sort_order"] == "asc": + reverse = False + else: + reverse = True + log.debug( + f"resource_sort_order is {self.config['resource_sort_order']}" + f" and reverse is {reverse}" + ) + if self.config["date_pattern"]: ordered_resources.sort( key=lambda r: re.search( self.config["date_pattern"], r["identifier"] ).group(), - reverse=True, + reverse=reverse, ) else: - ordered_resources.sort(key=lambda r: r["identifier"], reverse=True) + ordered_resources.sort(key=lambda r: r["identifier"], reverse=reverse) return ordered_resources, unmatched_resources @@ -1019,9 +1040,11 @@ def finalize(self, harvest_object, harvest_object_data): # noqa: C901 # Deleting old resources, generate permalink, order resources: # We do this by matching a regex, defined in the `resource_regex` key of the # harvester json config, against the identifier (filename) of the resources of - # the dataset. The ones that matched are thrown in a list and sorted by name, - # descending. This makes the newest file appear first when the filesnames have - # the correct format (YYYY-MM-DD-*). + # the dataset. The ones that matched are thrown in a list and sorted by name. + # When resource_regex is omitted from the config JSON, resource_sort_order + # selects ascending vs descending; when resource_regex is included in the + # config, resource_sort_order is ignored and matched resources are always sorted + # descending (newest name first for YYYYMMDD-* style names). # In case filesnames have different structure, e.g., *_YYYY-MM-DD.csv, # `date_pattern` should be specified in the harvester configuration, which is # used to list the newest files on the top of the list. diff --git a/ckanext/switzerland/tests/base_ftp_harvester_tests.py b/ckanext/switzerland/tests/base_ftp_harvester_tests.py index ec802c26..9fe8b23a 100644 --- a/ckanext/switzerland/tests/base_ftp_harvester_tests.py +++ b/ckanext/switzerland/tests/base_ftp_harvester_tests.py @@ -24,6 +24,7 @@ def run_harvester( self, force_all=False, resource_regex=None, + resource_sort_order=None, max_resources=None, dataset=data.dataset_name, timetable_regex=None, @@ -47,6 +48,8 @@ def run_harvester( config["force_all"] = True if resource_regex: config["resource_regex"] = resource_regex + if resource_sort_order is not None: + config["resource_sort_order"] = resource_sort_order if max_resources: config["max_resources"] = max_resources if timetable_regex: diff --git a/ckanext/switzerland/tests/test_sbb_harvester.py b/ckanext/switzerland/tests/test_sbb_harvester.py index 85317227..ea2a9480 100644 --- a/ckanext/switzerland/tests/test_sbb_harvester.py +++ b/ckanext/switzerland/tests/test_sbb_harvester.py @@ -363,6 +363,78 @@ def test_order_permalink_regex(self): ), ) + def test_resource_sort_order_default_is_desc(self): + """If resource_sort_order is omitted from config, resources are ordered desc.""" + filesystem = self.get_filesystem(filename="20160901.csv") + MockFTPStorageAdapter.filesystem = filesystem + path = os.path.join(data.environment, data.folder, "20160902.csv") + filesystem.writetext(path, data.dataset_content_2) + self.run_harvester(ftp_server="testserver") + + package = self.get_package() + self.assertEqual(package.resources[0].extras["identifier"], "20160902.csv") + self.assertEqual(package.resources[1].extras["identifier"], "20160901.csv") + + def test_resource_sort_order_desc_explicit(self): + """resource_sort_order 'desc' sorts identifiers descending (newest name first).""" + filesystem = self.get_filesystem(filename="20160901.csv") + MockFTPStorageAdapter.filesystem = filesystem + path = os.path.join(data.environment, data.folder, "20160902.csv") + filesystem.writetext(path, data.dataset_content_2) + self.run_harvester(ftp_server="testserver", resource_sort_order="desc") + + package = self.get_package() + self.assertEqual(package.resources[0].extras["identifier"], "20160902.csv") + self.assertEqual(package.resources[1].extras["identifier"], "20160901.csv") + + def test_resource_sort_order_asc(self): + """resource_sort_order 'asc' sorts identifiers ascending.""" + filesystem = self.get_filesystem(filename="20160901.csv") + MockFTPStorageAdapter.filesystem = filesystem + path = os.path.join(data.environment, data.folder, "20160902.csv") + filesystem.writetext(path, data.dataset_content_2) + self.run_harvester(ftp_server="testserver", resource_sort_order="asc") + + package = self.get_package() + self.assertEqual(package.resources[0].extras["identifier"], "20160901.csv") + self.assertEqual(package.resources[1].extras["identifier"], "20160902.csv") + self.assertEqual( + package.extras["permalink"], + "http://odp.test/dataset/{}/resource/{}/download/20160901.csv".format( + package.id, package.resources[0].id + ), + ) + + def test_resource_sort_order_ignored_when_resource_regex_in_config(self): + """If resource_regex is present in the harvester config, resource_sort_order is + ignored: matched files keep descending identifier order even when asc is set. + Unmatched files still precede the matched block (unchanged). + """ + filesystem = self.get_filesystem(filename="20160901.csv") + MockFTPStorageAdapter.filesystem = filesystem + path = os.path.join(data.environment, data.folder, "20160902.csv") + filesystem.writetext(path, data.dataset_content_2) + path = os.path.join(data.environment, data.folder, "1111Resource.csv") + filesystem.writetext(path, data.dataset_content_3) + path = os.path.join(data.environment, data.folder, "9999Resource.csv") + filesystem.writetext(path, data.dataset_content_3) + + self.run_harvester( + resource_regex=r"\d{8}.csv", + resource_sort_order="asc", + ftp_server="testserver", + ) + package = self.get_package() + self.assertEqual( + [r.extras["identifier"] for r in package.resources], + [ + "1111Resource.csv", + "9999Resource.csv", + "20160902.csv", + "20160901.csv", + ], + ) + # cleanup tests def test_max_resources(self): filesystem = self.get_filesystem(filename="20160901.csv")