Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions ckanext/switzerland/harvester/base_sbb_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,12 +185,22 @@ def get_config_validation_schema(self):
"storage_adapter": str,
"bucket": str,
voluptuous.Required("date_pattern", default=""): str,
voluptuous.Required(
"resource_sort_order", default="desc"
): voluptuous.In(["asc", "desc"]),
}
)

def load_config(self, config_str):
    """Parse the harvester JSON config string and validate it.

    If ``resource_regex`` is present, any ``resource_sort_order`` entry is
    dropped before validation so the schema default (``desc``) applies —
    matched resources are always sorted descending in that case.
    """
    schema = self.get_config_validation_schema()
    parsed = json.loads(config_str)
    if "resource_regex" in parsed and "resource_sort_order" in parsed:
        # resource_regex takes precedence over any explicit sort order.
        del parsed["resource_sort_order"]
        log.info(
            "resource_regex is set in config: ignoring resource_sort_order and "
            "using default value (desc)"
        )
    return schema(parsed)

# tested
Expand Down Expand Up @@ -988,23 +998,34 @@ def _get_ordered_resources(self, package):
unmatched_resources = []

# get filename regex for permalink from harvester config or fallback to a
# catch-all
# catch-all that matches all filenames (.*)
identifier_regex = self.config["resource_regex"]
for resource in package["resources"]:
if re.match(identifier_regex, resource["identifier"], re.IGNORECASE):
ordered_resources.append(resource)
else:
# We only add to unmatched_resources if the resource_regex exists and a
# filename doesn't match it
unmatched_resources.append(resource)

if self.config["resource_sort_order"] == "asc":
reverse = False
else:
reverse = True
log.debug(
f"resource_sort_order is {self.config['resource_sort_order']}"
f" and reverse is {reverse}"
)

if self.config["date_pattern"]:
ordered_resources.sort(
key=lambda r: re.search(
self.config["date_pattern"], r["identifier"]
).group(),
reverse=True,
reverse=reverse,
)
else:
ordered_resources.sort(key=lambda r: r["identifier"], reverse=True)
ordered_resources.sort(key=lambda r: r["identifier"], reverse=reverse)

return ordered_resources, unmatched_resources

Expand All @@ -1019,9 +1040,11 @@ def finalize(self, harvest_object, harvest_object_data): # noqa: C901
# Deleting old resources, generate permalink, order resources:
# We do this by matching a regex, defined in the `resource_regex` key of the
# harvester json config, against the identifier (filename) of the resources of
# the dataset. The ones that matched are thrown in a list and sorted by name,
# descending. This makes the newest file appear first when the filenames have
# the correct format (YYYY-MM-DD-*).
# the dataset. The ones that matched are thrown in a list and sorted by name.
# When resource_regex is omitted from the config JSON, resource_sort_order
# selects ascending vs descending; when resource_regex is included in the
# config, resource_sort_order is ignored and matched resources are always sorted
# descending (newest name first for YYYYMMDD-* style names).
# In case filenames have different structure, e.g., *_YYYY-MM-DD.csv,
# `date_pattern` should be specified in the harvester configuration, which is
# used to list the newest files on the top of the list.
Expand Down
3 changes: 3 additions & 0 deletions ckanext/switzerland/tests/base_ftp_harvester_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def run_harvester(
self,
force_all=False,
resource_regex=None,
resource_sort_order=None,
max_resources=None,
dataset=data.dataset_name,
timetable_regex=None,
Expand All @@ -47,6 +48,8 @@ def run_harvester(
config["force_all"] = True
if resource_regex:
config["resource_regex"] = resource_regex
if resource_sort_order is not None:
config["resource_sort_order"] = resource_sort_order
if max_resources:
config["max_resources"] = max_resources
if timetable_regex:
Expand Down
72 changes: 72 additions & 0 deletions ckanext/switzerland/tests/test_sbb_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,78 @@ def test_order_permalink_regex(self):
),
)

def test_resource_sort_order_default_is_desc(self):
    """Omitting resource_sort_order from the config sorts resources descending."""
    fs = self.get_filesystem(filename="20160901.csv")
    MockFTPStorageAdapter.filesystem = fs
    newer_file = os.path.join(data.environment, data.folder, "20160902.csv")
    fs.writetext(newer_file, data.dataset_content_2)

    self.run_harvester(ftp_server="testserver")

    resources = self.get_package().resources
    self.assertEqual(resources[0].extras["identifier"], "20160902.csv")
    self.assertEqual(resources[1].extras["identifier"], "20160901.csv")

def test_resource_sort_order_desc_explicit(self):
    """An explicit resource_sort_order of 'desc' puts the newest name first."""
    fs = self.get_filesystem(filename="20160901.csv")
    MockFTPStorageAdapter.filesystem = fs
    newer_file = os.path.join(data.environment, data.folder, "20160902.csv")
    fs.writetext(newer_file, data.dataset_content_2)

    self.run_harvester(ftp_server="testserver", resource_sort_order="desc")

    resources = self.get_package().resources
    self.assertEqual(resources[0].extras["identifier"], "20160902.csv")
    self.assertEqual(resources[1].extras["identifier"], "20160901.csv")

def test_resource_sort_order_asc(self):
    """resource_sort_order 'asc' sorts identifiers ascending; the permalink
    points at the first (oldest-named) resource."""
    fs = self.get_filesystem(filename="20160901.csv")
    MockFTPStorageAdapter.filesystem = fs
    newer_file = os.path.join(data.environment, data.folder, "20160902.csv")
    fs.writetext(newer_file, data.dataset_content_2)

    self.run_harvester(ftp_server="testserver", resource_sort_order="asc")

    package = self.get_package()
    self.assertEqual(package.resources[0].extras["identifier"], "20160901.csv")
    self.assertEqual(package.resources[1].extras["identifier"], "20160902.csv")
    expected_permalink = (
        "http://odp.test/dataset/{}/resource/{}/download/20160901.csv".format(
            package.id, package.resources[0].id
        )
    )
    self.assertEqual(package.extras["permalink"], expected_permalink)

def test_resource_sort_order_ignored_when_resource_regex_in_config(self):
    """If resource_regex is present in the harvester config, resource_sort_order is
    ignored: matched files keep descending identifier order even when asc is set.
    Unmatched files still precede the matched block (unchanged).
    """
    fs = self.get_filesystem(filename="20160901.csv")
    MockFTPStorageAdapter.filesystem = fs
    fs.writetext(
        os.path.join(data.environment, data.folder, "20160902.csv"),
        data.dataset_content_2,
    )
    # Two files that will NOT match the 8-digit regex below.
    for unmatched_name in ("1111Resource.csv", "9999Resource.csv"):
        fs.writetext(
            os.path.join(data.environment, data.folder, unmatched_name),
            data.dataset_content_3,
        )

    self.run_harvester(
        ftp_server="testserver",
        resource_regex=r"\d{8}.csv",
        resource_sort_order="asc",
    )

    package = self.get_package()
    identifiers = [r.extras["identifier"] for r in package.resources]
    self.assertEqual(
        identifiers,
        [
            "1111Resource.csv",
            "9999Resource.csv",
            "20160902.csv",
            "20160901.csv",
        ],
    )

# cleanup tests
def test_max_resources(self):
filesystem = self.get_filesystem(filename="20160901.csv")
Expand Down
Loading