diff --git a/lib/galaxy/tool_util/verify/__init__.py b/lib/galaxy/tool_util/verify/__init__.py index 4c11e1dddbbb..364aa46489b4 100644 --- a/lib/galaxy/tool_util/verify/__init__.py +++ b/lib/galaxy/tool_util/verify/__init__.py @@ -51,24 +51,20 @@ def verify( Throw an informative assertion error if any of these tests fail. """ - use_default_test_data_resolver = get_filecontent is None if get_filename is None: + get_filecontent_: Callable[[str], bytes] + if get_filecontent is None: + get_filecontent_ = DEFAULT_TEST_DATA_RESOLVER.get_filecontent + else: + get_filecontent_ = get_filecontent def get_filename(filename: str) -> str: - file_content = _retrieve_file_content(filename) + file_content = get_filecontent_(filename) local_name = make_temp_fname(fname=filename) with open(local_name, "wb") as f: f.write(file_content) return local_name - def _retrieve_file_content(filename: str) -> bytes: - if use_default_test_data_resolver: - file_content = DEFAULT_TEST_DATA_RESOLVER.get_filecontent(filename, context=attributes) - else: - assert get_filecontent is not None - file_content = get_filecontent(filename) - return file_content - # Check assertions... assertions = attributes.get("assert_list", None) if attributes is not None and assertions is not None: diff --git a/lib/galaxy/tool_util/verify/interactor.py b/lib/galaxy/tool_util/verify/interactor.py index 7a4019b1d15e..effb63cfdcb9 100644 --- a/lib/galaxy/tool_util/verify/interactor.py +++ b/lib/galaxy/tool_util/verify/interactor.py @@ -42,6 +42,10 @@ TestCollectionOutputDef, ) from galaxy.util.bunch import Bunch +from galaxy.util.hash_util import ( + memory_bound_hexdigest, + parse_checksum_hash, +) from . import verify from .asserts import verify_assertions from .wait import wait_on @@ -315,7 +319,7 @@ def wait_for_jobs(self, history_id, jobs, maxseconds): def verify_output_dataset(self, history_id, hda_id, outfile, attributes, tool_id, tool_version=None): fetcher = self.__dataset_fetcher(history_id) - test_data_downloader = self.__test_data_downloader(tool_id, tool_version) + test_data_downloader = self.__test_data_downloader(tool_id, tool_version, attributes) verify_hid( outfile, hda_id=hda_id, @@ -478,10 +482,7 @@ def test_data_download(self, tool_id, filename, mode="file", is_output=True, too local_path = self.test_data_path(tool_id, filename, tool_version=tool_version) if result is None and (local_path is None or not os.path.exists(local_path)): - for test_data_directory in self.test_data_directories: - local_path = os.path.join(test_data_directory, filename) - if os.path.exists(local_path): - break + local_path = self._find_in_test_data_directories(filename) if result is None and local_path is not None and os.path.exists(local_path): if mode == "file": @@ -503,6 +504,14 @@ def test_data_download(self, tool_id, filename, mode="file", is_output=True, too return result + def _find_in_test_data_directories(self, filename: str) -> Optional[str]: + local_path = None + for test_data_directory in self.test_data_directories: + local_path = os.path.join(test_data_directory, filename) + if os.path.exists(local_path): + break + return local_path + def __output_id(self, output_data): # Allow data structure coming out of tools API - {id: , output_name: , etc...} # or simple id as comes out of workflow API. @@ -551,7 +560,14 @@ def stage_data_async( ) name = test_data["name"] else: - name = os.path.basename(fname) + file_name = None + file_name_exists = False + location = self._ensure_valid_location_in(test_data) + if fname: + file_name = self.test_data_path(tool_id, fname, tool_version=tool_version) + file_name_exists = os.path.exists(f"{file_name}") + upload_from_location = not file_name_exists and location is not None + name = os.path.basename(location if upload_from_location else fname) tool_input.update( { "files_0|NAME": name, @@ -559,7 +575,9 @@ def stage_data_async( } ) files = {} - if force_path_paste: + if upload_from_location: + tool_input.update({"files_0|url_paste": location}) + elif force_path_paste: file_name = self.test_data_path(tool_id, fname, tool_version=tool_version) tool_input.update({"files_0|url_paste": f"file://{file_name}"}) else: @@ -584,6 +602,13 @@ def stage_data_async( assert len(jobs) > 0, f"Invalid response from server [{submit_response}], expecting a job." return lambda: self.wait_for_job(jobs[0]["id"], history_id, maxseconds=maxseconds) + def _ensure_valid_location_in(self, test_data: dict) -> Optional[str]: + location: Optional[str] = test_data.get("location") + has_valid_location = location and util.is_url(location) + if location and not has_valid_location: + raise ValueError(f"Invalid `location` URL: `{location}`") + return location + def run_tool(self, testdef, history_id, resource_parameters=None) -> RunToolResponse: # We need to handle the case where we've uploaded a valid compressed file since the upload # tool will have uncompressed it on the fly. @@ -825,11 +850,47 @@ def ensure_user_with_email(self, email, password=None): test_user = self._post("users", data, key=admin_key).json() return test_user - def __test_data_downloader(self, tool_id, tool_version=None): - def test_data_download(filename, mode="file"): + def __test_data_downloader(self, tool_id, tool_version=None, attributes: Optional[dict] = None): + location = None + checksum = attributes.get("checksum") if attributes else None + + def test_data_download_from_galaxy(filename, mode="file"): return self.test_data_download(tool_id, filename, mode=mode, tool_version=tool_version) - return test_data_download + def test_data_download_from_location(filename: str): + # try to find the file in the test data directories first + local_path = self._find_in_test_data_directories(filename) + if local_path and os.path.exists(local_path): + with open(local_path, mode="rb") as f: + return f.read() + # if not found, try to download it from the location to the test data directory + # to be reused in subsequent tests + if local_path: + util.download_to_file(location, local_path) + self._verify_checksum(local_path, checksum) + with open(local_path, mode="rb") as f: + return f.read() + # otherwise, download it to a temporary file + with tempfile.NamedTemporaryFile() as file_handle: + util.download_to_file(location, file_handle.name) + self._verify_checksum(file_handle.name, checksum) + return file_handle.file.read() + + if attributes: + location = self._ensure_valid_location_in(attributes) + if location: + return test_data_download_from_location + return test_data_download_from_galaxy + + def _verify_checksum(self, file_path: str, checksum: Optional[str] = None): + if checksum is None: + return + hash_function, expected_hash_value = parse_checksum_hash(checksum) + calculated_hash_value = memory_bound_hexdigest(hash_func_name=hash_function, path=file_path) + if calculated_hash_value != expected_hash_value: + raise AssertionError( + f"Failed to verify checksum with [{hash_function}] - expected [{expected_hash_value}] got [{calculated_hash_value}]" + ) def __dataset_fetcher(self, history_id): def fetcher(hda_id, base_name=None): @@ -1695,7 +1756,6 @@ def test_data_iter(required_files): ftype=extra.get("ftype", DEFAULT_FTYPE), dbkey=extra.get("dbkey", DEFAULT_DBKEY), location=extra.get("location", None), - md5=extra.get("md5", None), ) edit_attributes = extra.get("edit_attributes", []) diff --git a/lib/galaxy/tool_util/verify/test_data.py b/lib/galaxy/tool_util/verify/test_data.py index c1b28e785506..68bf408543e1 100644 --- a/lib/galaxy/tool_util/verify/test_data.py +++ b/lib/galaxy/tool_util/verify/test_data.py @@ -3,25 +3,12 @@ import re import subprocess from string import Template -from typing import ( - Any, - Dict, - Optional, -) - -from typing_extensions import Protocol from galaxy.util import ( asbool, - download_to_file, in_directory, - is_url, smart_str, ) -from galaxy.util.hash_util import ( - memory_bound_hexdigest, - parse_checksum_hash, -) UPDATE_TEMPLATE = Template( "git --work-tree $dir --git-dir $dir/.git fetch && " "git --work-tree $dir --git-dir $dir/.git merge origin/master" @@ -34,8 +21,6 @@ LIST_SEP = re.compile(r"\s*,\s*") -TestDataContext = Dict[str, Any] - class TestDataResolver: __test__ = False # Prevent pytest from discovering this class (issue #12071) @@ -44,23 +29,23 @@ def __init__(self, file_dirs=None, env_var="GALAXY_TEST_FILE_DIR", environ=os.en if file_dirs is None: file_dirs = environ.get(env_var, None) if file_dirs is None: - file_dirs = "test-data,url-location,https://github.com/galaxyproject/galaxy-test-data.git" + file_dirs = "test-data,https://github.com/galaxyproject/galaxy-test-data.git" if file_dirs: self.resolvers = [build_resolver(u, environ) for u in LIST_SEP.split(file_dirs)] else: self.resolvers = [] - def get_filename(self, name: str, context: Optional[TestDataContext] = None) -> str: + def get_filename(self, name: str) -> str: for resolver in self.resolvers or []: - if not resolver.exists(name, context): + if not resolver.exists(name): continue filename = resolver.path(name) if filename: return os.path.abspath(filename) raise TestDataNotFoundError(f"Failed to find test file {name} against any test data resolvers") - def get_filecontent(self, name: str, context: Optional[TestDataContext] = None) -> bytes: - filename = self.get_filename(name=name, context=context) + def get_filecontent(self, name: str) -> bytes: + filename = self.get_filename(name=name) with open(filename, mode="rb") as f: return f.read() @@ -72,32 +57,18 @@ class TestDataNotFoundError(ValueError): pass -class TestDataChecksumError(ValueError): - pass - - def build_resolver(uri: str, environ): if uri.startswith("http") and uri.endswith(".git"): return GitDataResolver(uri, environ) - elif uri == "url-location": - return RemoteLocationDataResolver(environ) else: return FileDataResolver(uri) -class DataResolver(Protocol): - def exists(self, filename: str, context: Optional[TestDataContext] = None): - raise NotImplementedError - - def path(self, filename: str): - raise NotImplementedError - - -class FileDataResolver(DataResolver): +class FileDataResolver: def __init__(self, file_dir: str): self.file_dir = file_dir - def exists(self, filename: str, context: Optional[TestDataContext] = None): + def exists(self, filename: str): path = os.path.abspath(self.path(filename)) return os.path.exists(path) and in_directory(path, self.file_dir) @@ -118,12 +89,12 @@ def __init__(self, repository: str, environ): # will leave it as true for now. self.fetch_data = asbool(environ.get("GALAXY_TEST_FETCH_DATA", "true")) - def exists(self, filename: str, context: Optional[TestDataContext] = None): - exists_now = super().exists(filename, context) + def exists(self, filename: str): + exists_now = super().exists(filename) if exists_now or not self.fetch_data or self.updated: return exists_now self.update_repository() - return super().exists(filename, context) + return super().exists(filename) def update_repository(self): self.updated = True @@ -151,59 +122,3 @@ def execute(self, cmd): "stderr": stderr, } print(UPDATE_FAILED_TEMPLATE.substitute(**kwds)) - - -class RemoteLocationDataResolver(FileDataResolver): - def __init__(self, environ): - self.fetch_data = asbool(environ.get("GALAXY_TEST_FETCH_DATA", True)) - repo_cache = environ.get("GALAXY_TEST_DATA_REPO_CACHE", "test-data-cache") - repo_path = os.path.join(repo_cache, "from-location") - super().__init__(repo_path) - - def exists(self, filename: str, context: Optional[TestDataContext] = None): - exists_now = super().exists(filename, context) - if exists_now or not self.fetch_data or context is None: - return exists_now - self._try_download_from_location(filename, context) - exists_now = super().exists(filename, context) - if exists_now: - self._verify_checksum(filename, context) - return exists_now - - def _try_download_from_location(self, filename: str, context: TestDataContext): - location = context.get("location") - if location is None: - return - if not is_url(location): - raise ValueError(f"Invalid 'location' URL for remote test data provided: {location}") - if not self._is_valid_filename(filename): - raise ValueError(f"Invalid 'filename' provided: '{filename}'") - self._ensure_base_dir_exists() - dest_file_path = self.path(filename) - download_to_file(location, dest_file_path) - - def _ensure_base_dir_exists(self): - if not os.path.exists(self.file_dir): - os.makedirs(self.file_dir) - - def _verify_checksum(self, filename: str, context: Optional[TestDataContext] = None): - if context is None or is_url(filename): - return - checksum = context.get("checksum") - if checksum is None: - return - hash_function, expected_hash_value = parse_checksum_hash(checksum) - file_path = self.path(filename) - calculated_hash_value = memory_bound_hexdigest(hash_func_name=hash_function, path=file_path) - if calculated_hash_value != expected_hash_value: - raise TestDataChecksumError( - f"Failed to validate test data '{filename}' with [{hash_function}] - expected [{expected_hash_value}] got [{calculated_hash_value}]" - ) - - def _is_valid_filename(self, filename: str): - """ - Checks that the filename does not contain the following - characters: <, >, :, ", /, \\, |, ?, *, or any control characters. - """ - pattern = r"^[^<>:\"/\\|?*\x00-\x1F]+$" - return bool(re.match(pattern, filename)) diff --git a/lib/galaxy/tool_util/xsd/galaxy.xsd b/lib/galaxy/tool_util/xsd/galaxy.xsd index 242a009d0369..455acdcc1e30 100644 --- a/lib/galaxy/tool_util/xsd/galaxy.xsd +++ b/lib/galaxy/tool_util/xsd/galaxy.xsd @@ -1544,17 +1544,9 @@ of ``type`` ``data``. Please use this option only when is not possible to include the files in the `test-data` folder, since this is more error prone due to external factors like remote availability. You can use it in two ways: -- In combination with `value` it will look for the input file specified in `value`, if it's not available on disk it will -download the file pointed by `location` using the same name as in `value` and then use it as input. -- Specifiying the `location` without a `value`, it will download the file and use it as an alias of `value`. The name of the file -will be infered from the last component of the location URL. For example, `location="https://my_url/my_file.txt"` will be equivalent to `value="my_file.txt"`. -If you specify a `checksum`, it will be also used to check the integrity of the download. - - - - - If specified in combination with `location`, after downloading the test input file, the checksum should match the value specified -here. This value should have the form ``hash_type$hash_value`` (e.g. ``sha1$8156d7ca0f46ed7abac98f82e36cfaddb2aca041``). +- If only ``location`` is given (and `value` is absent), the input file will be uploaded directly to Galaxy from the URL specified by the ``location`` (same as regular pasted URL upload). +- If ``location`` as well as ``value`` are given, the input file specified in ``value`` will be used from the tes data directory, if it's not available on disk it will use the ``location`` to upload the input as the previous case. + diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index 8f0be21acfd5..943c62c1f3e5 100644 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -1355,31 +1355,13 @@ def test_data_path(self, filename): if not test_data: # Fallback to Galaxy test data directory for builtin tools, tools # under development, and some older ToolShed published tools that - # used stock test data. Also possible remote test data using `location`. + # used stock test data. try: - test_data_context = self._find_test_data_context(filename) - test_data = self.app.test_data_resolver.get_filename(filename, test_data_context) + test_data = self.app.test_data_resolver.get_filename(filename) except TestDataNotFoundError: test_data = None return test_data - def _find_test_data_context(self, filename: str): - """Returns the attributes (context) associated with a required file for test inputs or outputs.""" - # We are returning the attributes of the first test input or output that matches the filename - # Could there be multiple different test data files with the same filename and different attributes? - # I hope not... otherwise we need a way to match a test file with its test definition. - for test in self.tests: - # Check for input context attributes - for required_file in test.required_files: - if len(required_file) > 1 and required_file[0] == filename: - return required_file[1] - # Check for outputs context attributes too - for output in test.outputs: - value = output.get("value", None) - if value and value == filename: - return output.get("attributes") - return None - def __walk_test_data(self, dir, filename): for root, dirs, _ in os.walk(dir): if ".hg" in dirs: diff --git a/lib/galaxy_test/driver/driver_util.py b/lib/galaxy_test/driver/driver_util.py index 4a638f0c9fb8..39089f846595 100644 --- a/lib/galaxy_test/driver/driver_util.py +++ b/lib/galaxy_test/driver/driver_util.py @@ -56,7 +56,7 @@ galaxy_root = galaxy_directory() DEFAULT_CONFIG_PREFIX = "GALAXY" GALAXY_TEST_DIRECTORY = os.path.join(galaxy_root, "test") -GALAXY_TEST_FILE_DIR = "test-data,url-location,https://github.com/galaxyproject/galaxy-test-data.git" +GALAXY_TEST_FILE_DIR = "test-data,https://github.com/galaxyproject/galaxy-test-data.git" TOOL_SHED_TEST_DATA = os.path.join(galaxy_root, "lib", "tool_shed", "test", "test_data") TEST_WEBHOOKS_DIR = os.path.join(galaxy_root, "test", "functional", "webhooks") FRAMEWORK_TOOLS_DIR = os.path.join(GALAXY_TEST_DIRECTORY, "functional", "tools") diff --git a/test/functional/tools/remote_test_data_location.xml b/test/functional/tools/remote_test_data_location.xml index b6c6204d35fe..c8aa35252eea 100644 --- a/test/functional/tools/remote_test_data_location.xml +++ b/test/functional/tools/remote_test_data_location.xml @@ -21,8 +21,7 @@ + to the test data cache directory (GALAXY_TEST_DATA_REPO_CACHE) and then uploaded to Galaxy as usual. --> @@ -30,15 +29,6 @@ - - - - - - - - - @@ -51,13 +41,13 @@ - + - + diff --git a/test/functional/tools/sample_tool_conf.xml b/test/functional/tools/sample_tool_conf.xml index 0d081142b6ec..83a70258cc6a 100644 --- a/test/functional/tools/sample_tool_conf.xml +++ b/test/functional/tools/sample_tool_conf.xml @@ -87,7 +87,6 @@ -