Skip to content

Commit

Permalink
Merge pull request #16311 from davelopez/fix_tool_remote_test_data
Browse files Browse the repository at this point in the history
[23.1] Fix tool remote test data
  • Loading branch information
mvdbeek committed Jun 28, 2023
2 parents 197b954 + d8937d4 commit 71d4823
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 162 deletions.
16 changes: 6 additions & 10 deletions lib/galaxy/tool_util/verify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,24 +51,20 @@ def verify(
Throw an informative assertion error if any of these tests fail.
"""
use_default_test_data_resolver = get_filecontent is None
if get_filename is None:
get_filecontent_: Callable[[str], bytes]
if get_filecontent is None:
get_filecontent_ = DEFAULT_TEST_DATA_RESOLVER.get_filecontent
else:
get_filecontent_ = get_filecontent

def get_filename(filename: str) -> str:
file_content = _retrieve_file_content(filename)
file_content = get_filecontent_(filename)
local_name = make_temp_fname(fname=filename)
with open(local_name, "wb") as f:
f.write(file_content)
return local_name

def _retrieve_file_content(filename: str) -> bytes:
if use_default_test_data_resolver:
file_content = DEFAULT_TEST_DATA_RESOLVER.get_filecontent(filename, context=attributes)
else:
assert get_filecontent is not None
file_content = get_filecontent(filename)
return file_content

# Check assertions...
assertions = attributes.get("assert_list", None)
if attributes is not None and assertions is not None:
Expand Down
82 changes: 71 additions & 11 deletions lib/galaxy/tool_util/verify/interactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
TestCollectionOutputDef,
)
from galaxy.util.bunch import Bunch
from galaxy.util.hash_util import (
memory_bound_hexdigest,
parse_checksum_hash,
)
from . import verify
from .asserts import verify_assertions
from .wait import wait_on
Expand Down Expand Up @@ -315,7 +319,7 @@ def wait_for_jobs(self, history_id, jobs, maxseconds):

def verify_output_dataset(self, history_id, hda_id, outfile, attributes, tool_id, tool_version=None):
fetcher = self.__dataset_fetcher(history_id)
test_data_downloader = self.__test_data_downloader(tool_id, tool_version)
test_data_downloader = self.__test_data_downloader(tool_id, tool_version, attributes)
verify_hid(
outfile,
hda_id=hda_id,
Expand Down Expand Up @@ -478,10 +482,7 @@ def test_data_download(self, tool_id, filename, mode="file", is_output=True, too
local_path = self.test_data_path(tool_id, filename, tool_version=tool_version)

if result is None and (local_path is None or not os.path.exists(local_path)):
for test_data_directory in self.test_data_directories:
local_path = os.path.join(test_data_directory, filename)
if os.path.exists(local_path):
break
local_path = self._find_in_test_data_directories(filename)

if result is None and local_path is not None and os.path.exists(local_path):
if mode == "file":
Expand All @@ -503,6 +504,14 @@ def test_data_download(self, tool_id, filename, mode="file", is_output=True, too

return result

def _find_in_test_data_directories(self, filename: str) -> Optional[str]:
local_path = None
for test_data_directory in self.test_data_directories:
local_path = os.path.join(test_data_directory, filename)
if os.path.exists(local_path):
break
return local_path

def __output_id(self, output_data):
# Allow data structure coming out of tools API - {id: <id>, output_name: <name>, etc...}
# or simple id as comes out of workflow API.
Expand Down Expand Up @@ -551,15 +560,24 @@ def stage_data_async(
)
name = test_data["name"]
else:
name = os.path.basename(fname)
file_name = None
file_name_exists = False
location = self._ensure_valid_location_in(test_data)
if fname:
file_name = self.test_data_path(tool_id, fname, tool_version=tool_version)
file_name_exists = os.path.exists(f"{file_name}")
upload_from_location = not file_name_exists and location is not None
name = os.path.basename(location if upload_from_location else fname)
tool_input.update(
{
"files_0|NAME": name,
"files_0|type": "upload_dataset",
}
)
files = {}
if force_path_paste:
if upload_from_location:
tool_input.update({"files_0|url_paste": location})
elif force_path_paste:
file_name = self.test_data_path(tool_id, fname, tool_version=tool_version)
tool_input.update({"files_0|url_paste": f"file://{file_name}"})
else:
Expand All @@ -584,6 +602,13 @@ def stage_data_async(
assert len(jobs) > 0, f"Invalid response from server [{submit_response}], expecting a job."
return lambda: self.wait_for_job(jobs[0]["id"], history_id, maxseconds=maxseconds)

def _ensure_valid_location_in(self, test_data: dict) -> Optional[str]:
location: Optional[str] = test_data.get("location")
has_valid_location = location and util.is_url(location)
if location and not has_valid_location:
raise ValueError(f"Invalid `location` URL: `{location}`")
return location

def run_tool(self, testdef, history_id, resource_parameters=None) -> RunToolResponse:
# We need to handle the case where we've uploaded a valid compressed file since the upload
# tool will have uncompressed it on the fly.
Expand Down Expand Up @@ -825,11 +850,47 @@ def ensure_user_with_email(self, email, password=None):
test_user = self._post("users", data, key=admin_key).json()
return test_user

def __test_data_downloader(self, tool_id, tool_version=None):
def test_data_download(filename, mode="file"):
def __test_data_downloader(self, tool_id, tool_version=None, attributes: Optional[dict] = None):
location = None
checksum = attributes.get("checksum") if attributes else None

def test_data_download_from_galaxy(filename, mode="file"):
return self.test_data_download(tool_id, filename, mode=mode, tool_version=tool_version)

return test_data_download
def test_data_download_from_location(filename: str):
# try to find the file in the test data directories first
local_path = self._find_in_test_data_directories(filename)
if local_path and os.path.exists(local_path):
with open(local_path, mode="rb") as f:
return f.read()
# if not found, try to download it from the location to the test data directory
# to be reused in subsequent tests
if local_path:
util.download_to_file(location, local_path)
self._verify_checksum(local_path, checksum)
with open(local_path, mode="rb") as f:
return f.read()
# otherwise, download it to a temporary file
with tempfile.NamedTemporaryFile() as file_handle:
util.download_to_file(location, file_handle.name)
self._verify_checksum(file_handle.name, checksum)
return file_handle.file.read()

if attributes:
location = self._ensure_valid_location_in(attributes)
if location:
return test_data_download_from_location
return test_data_download_from_galaxy

def _verify_checksum(self, file_path: str, checksum: Optional[str] = None):
if checksum is None:
return
hash_function, expected_hash_value = parse_checksum_hash(checksum)
calculated_hash_value = memory_bound_hexdigest(hash_func_name=hash_function, path=file_path)
if calculated_hash_value != expected_hash_value:
raise AssertionError(
f"Failed to verify checksum with [{hash_function}] - expected [{expected_hash_value}] got [{calculated_hash_value}]"
)

def __dataset_fetcher(self, history_id):
def fetcher(hda_id, base_name=None):
Expand Down Expand Up @@ -1695,7 +1756,6 @@ def test_data_iter(required_files):
ftype=extra.get("ftype", DEFAULT_FTYPE),
dbkey=extra.get("dbkey", DEFAULT_DBKEY),
location=extra.get("location", None),
md5=extra.get("md5", None),
)
edit_attributes = extra.get("edit_attributes", [])

Expand Down
105 changes: 10 additions & 95 deletions lib/galaxy/tool_util/verify/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,12 @@
import re
import subprocess
from string import Template
from typing import (
Any,
Dict,
Optional,
)

from typing_extensions import Protocol

from galaxy.util import (
asbool,
download_to_file,
in_directory,
is_url,
smart_str,
)
from galaxy.util.hash_util import (
memory_bound_hexdigest,
parse_checksum_hash,
)

UPDATE_TEMPLATE = Template(
"git --work-tree $dir --git-dir $dir/.git fetch && " "git --work-tree $dir --git-dir $dir/.git merge origin/master"
Expand All @@ -34,8 +21,6 @@

LIST_SEP = re.compile(r"\s*,\s*")

TestDataContext = Dict[str, Any]


class TestDataResolver:
__test__ = False # Prevent pytest from discovering this class (issue #12071)
Expand All @@ -44,23 +29,23 @@ def __init__(self, file_dirs=None, env_var="GALAXY_TEST_FILE_DIR", environ=os.en
if file_dirs is None:
file_dirs = environ.get(env_var, None)
if file_dirs is None:
file_dirs = "test-data,url-location,https://github.com/galaxyproject/galaxy-test-data.git"
file_dirs = "test-data,https://github.com/galaxyproject/galaxy-test-data.git"
if file_dirs:
self.resolvers = [build_resolver(u, environ) for u in LIST_SEP.split(file_dirs)]
else:
self.resolvers = []

def get_filename(self, name: str, context: Optional[TestDataContext] = None) -> str:
def get_filename(self, name: str) -> str:
for resolver in self.resolvers or []:
if not resolver.exists(name, context):
if not resolver.exists(name):
continue
filename = resolver.path(name)
if filename:
return os.path.abspath(filename)
raise TestDataNotFoundError(f"Failed to find test file {name} against any test data resolvers")

def get_filecontent(self, name: str, context: Optional[TestDataContext] = None) -> bytes:
filename = self.get_filename(name=name, context=context)
def get_filecontent(self, name: str) -> bytes:
filename = self.get_filename(name=name)
with open(filename, mode="rb") as f:
return f.read()

Expand All @@ -72,32 +57,18 @@ class TestDataNotFoundError(ValueError):
pass


class TestDataChecksumError(ValueError):
pass


def build_resolver(uri: str, environ):
if uri.startswith("http") and uri.endswith(".git"):
return GitDataResolver(uri, environ)
elif uri == "url-location":
return RemoteLocationDataResolver(environ)
else:
return FileDataResolver(uri)


class DataResolver(Protocol):
def exists(self, filename: str, context: Optional[TestDataContext] = None):
raise NotImplementedError

def path(self, filename: str):
raise NotImplementedError


class FileDataResolver(DataResolver):
class FileDataResolver:
def __init__(self, file_dir: str):
self.file_dir = file_dir

def exists(self, filename: str, context: Optional[TestDataContext] = None):
def exists(self, filename: str):
path = os.path.abspath(self.path(filename))
return os.path.exists(path) and in_directory(path, self.file_dir)

Expand All @@ -118,12 +89,12 @@ def __init__(self, repository: str, environ):
# will leave it as true for now.
self.fetch_data = asbool(environ.get("GALAXY_TEST_FETCH_DATA", "true"))

def exists(self, filename: str, context: Optional[TestDataContext] = None):
exists_now = super().exists(filename, context)
def exists(self, filename: str):
exists_now = super().exists(filename)
if exists_now or not self.fetch_data or self.updated:
return exists_now
self.update_repository()
return super().exists(filename, context)
return super().exists(filename)

def update_repository(self):
self.updated = True
Expand Down Expand Up @@ -151,59 +122,3 @@ def execute(self, cmd):
"stderr": stderr,
}
print(UPDATE_FAILED_TEMPLATE.substitute(**kwds))


class RemoteLocationDataResolver(FileDataResolver):
def __init__(self, environ):
self.fetch_data = asbool(environ.get("GALAXY_TEST_FETCH_DATA", True))
repo_cache = environ.get("GALAXY_TEST_DATA_REPO_CACHE", "test-data-cache")
repo_path = os.path.join(repo_cache, "from-location")
super().__init__(repo_path)

def exists(self, filename: str, context: Optional[TestDataContext] = None):
exists_now = super().exists(filename, context)
if exists_now or not self.fetch_data or context is None:
return exists_now
self._try_download_from_location(filename, context)
exists_now = super().exists(filename, context)
if exists_now:
self._verify_checksum(filename, context)
return exists_now

def _try_download_from_location(self, filename: str, context: TestDataContext):
location = context.get("location")
if location is None:
return
if not is_url(location):
raise ValueError(f"Invalid 'location' URL for remote test data provided: {location}")
if not self._is_valid_filename(filename):
raise ValueError(f"Invalid 'filename' provided: '{filename}'")
self._ensure_base_dir_exists()
dest_file_path = self.path(filename)
download_to_file(location, dest_file_path)

def _ensure_base_dir_exists(self):
if not os.path.exists(self.file_dir):
os.makedirs(self.file_dir)

def _verify_checksum(self, filename: str, context: Optional[TestDataContext] = None):
if context is None or is_url(filename):
return
checksum = context.get("checksum")
if checksum is None:
return
hash_function, expected_hash_value = parse_checksum_hash(checksum)
file_path = self.path(filename)
calculated_hash_value = memory_bound_hexdigest(hash_func_name=hash_function, path=file_path)
if calculated_hash_value != expected_hash_value:
raise TestDataChecksumError(
f"Failed to validate test data '{filename}' with [{hash_function}] - expected [{expected_hash_value}] got [{calculated_hash_value}]"
)

def _is_valid_filename(self, filename: str):
"""
Checks that the filename does not contain the following
characters: <, >, :, ", /, \\, |, ?, *, or any control characters.
"""
pattern = r"^[^<>:\"/\\|?*\x00-\x1F]+$"
return bool(re.match(pattern, filename))
14 changes: 3 additions & 11 deletions lib/galaxy/tool_util/xsd/galaxy.xsd
Original file line number Diff line number Diff line change
Expand Up @@ -1544,17 +1544,9 @@ of ``type`` ``data``.</xs:documentation>
Please use this option only when is not possible to include the files in the `test-data` folder, since
this is more error prone due to external factors like remote availability.
You can use it in two ways:
- In combination with `value` it will look for the input file specified in `value`, if it's not available on disk it will
download the file pointed by `location` using the same name as in `value` and then use it as input.
- Specifiying the `location` without a `value`, it will download the file and use it as an alias of `value`. The name of the file
will be infered from the last component of the location URL. For example, `location="https://my_url/my_file.txt"` will be equivalent to `value="my_file.txt"`.
If you specify a `checksum`, it will be also used to check the integrity of the download.</xs:documentation>
</xs:annotation>
</xs:attribute>
<xs:attribute name="checksum" type="xs:string" gxdocs:added="23.1">
<xs:annotation>
<xs:documentation xml:lang="en">If specified in combination with `location`, after downloading the test input file, the checksum should match the value specified
here. This value should have the form ``hash_type$hash_value`` (e.g. ``sha1$8156d7ca0f46ed7abac98f82e36cfaddb2aca041``).</xs:documentation>
- If only ``location`` is given (and `value` is absent), the input file will be uploaded directly to Galaxy from the URL specified by the ``location`` (same as regular pasted URL upload).
- If ``location`` as well as ``value`` are given, the input file specified in ``value`` will be used from the tes data directory, if it's not available on disk it will use the ``location`` to upload the input as the previous case.
</xs:documentation>
</xs:annotation>
</xs:attribute>
</xs:complexType>
Expand Down
Loading

0 comments on commit 71d4823

Please sign in to comment.