diff --git a/Jenkinsfile b/Jenkinsfile index 001e6a3f7..8d7379d76 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -40,7 +40,20 @@ pipeline steps { sh ''' - git submodule update --init --recursive + if [ "${CHOLLA_MAKE_TYPE}" = "cosmology" ] || + [ "${CHOLLA_MAKE_TYPE}" = "mhd" ] || + [ "${CHOLLA_MAKE_TYPE}" = "hydro" ] || + [ "${CHOLLA_MAKE_TYPE}" = "gravity" ]; then + ./tools/ci-setup-submodule.py \ + --color \ + --fallback-manual-lfs-download + else + # we skip the download because it's not currently + # necessary & we want to minimize calls to + # downloads from GitHub's raw-urls (when git-lfs + # commonly fails) + echo "hard-coded to skip submodule download" + fi make clobber ''' } diff --git a/docker/rocm/Dockerfile b/docker/rocm/Dockerfile index 2c48d3655..e9413883a 100644 --- a/docker/rocm/Dockerfile +++ b/docker/rocm/Dockerfile @@ -30,7 +30,7 @@ RUN apt-get -y install rocrand # Needed by Cholla Makefile ENV CHOLLA_MACHINE=github -ENV HIPCONFIG=/opt/rocm-5.2.3 -ENV ROCM_PATH=/opt/rocm-5.2.3 +ENV HIPCONFIG=/opt/rocm-5.5.1 +ENV ROCM_PATH=/opt/rocm-5.5.1 ENV HDF5_ROOT=/usr/lib/x86_64-linux-gnu/hdf5/serial ENV MPI_ROOT=/usr/lib/x86_64-linux-gnu/openmpi diff --git a/pyproject.toml b/pyproject.toml index d522bc51c..10d4c8826 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,12 +65,20 @@ testpaths = [ "python/tests", ] +[tool.ruff.per-file-target-version] +# By default, ruff uses project.requires-python to infer the project-wide +# python version (certain linting/formatting options may be enabled or disabled +# based on this choice). 
Here, we are overriding this version for our developer/CI
+ +> Aside: an object-id is a generic git concept - its a checksum based on file +> contents that is used internally as a unique identifier of the file. + +`git-lfs` provides a smudge-filter that uses the contents of each +"pointer-file" to download and replace it with the corresponding large file. + +Under normal operation (and when `git-lfs` is installed), the process of a +`git-checkout` will seamlessly trigger `git-lfs` behind the scenes to replace +all of the pointer files being checked out. (It will be relevant later that +we can temporarily disable `git-lfs`) + +Setting up the Submodule +======================== +In an ideal world, we would simply call + +```sh +$ git submodule update --init +``` + +because it should do everything for us behind the scenes. Unfortunately, this +can trigger 2 distinct errors on the CRC cluster (one error pertains to +``git-submodule`` and the other pertains to ``git-lfs``). + +We instead adopt a more manual procedure that achieves equivalent results. +The steps to our procedure include: + +1. pull the submodule data, via ``git submodule update --init``, while + explicitly disabling ``git-lfs``. + + - In more detail, we use an environment variable to disable all "smudging" + pertaining to ``git-lfs`` when ``git submodule update`` internally + triggers machinery equivalent to ``git-clone``, ``git-fetch``, and + ``git-checkout`` for each submodule. + + - Aside: While `git-lfs` provides a few ways to disable smudging, a lot of + trial-and-error suggests that the environment variable seems to be the + only way to do it in the context of git submodule. + + - To be clear: at the end of this step, the submodule **SHOULD** hold + "pointer files" for each large file tracked by ``git-lfs`` + +2. Now we check if the previous step failed (this is a common failure point on + the CRC cluster). If it failed we try to "fix things" + + - to "fix things," we call ``git -C ./cholla-tests-data restore .``. 
+ Importantly, we need to explicitly disable ``git-lfs`` (we use the same + environment variable). If we don't disable it, ``git-lfs`` will try to + replace all of the pointer files with the corresponding large file (and + could produce errors). + + - I don't fully understand exactly how/why ``git-submodule-update`` fails. + It always seems to fail when it invokes machinery equivalent to + ``git-checkout`` (if it instead failed when invoking machinery equivalent + to ``git-clone`` or ``git-fetch``, our attempt to "fix things," will not + work). + + - At the end of this step, the submodule **MUST** hold "pointer files" for + each large file tracked by ``git-lfs``. If this isn't true, then the + script should abort with a failure + +3. pre-fetch all of the relevant git-lfs data and then checkout that data + +4. If step 3 failed (common on the CRC cluster), we can switch to our + fallback strategy. This strategy must be enabled by passing a command line + option. We describe the strategy in the next section. + +A Manual Fallback Strategy +========================== +A lot of machinery in this script exists to support a (crude) fallback +strategy to retrieve the test-data after git-lfs fails. The strategy involves + +1. iterating over all pointer-files in the git-submodule + +2. using information about the file path to construct urls, where you can + directly download files from GitHub + +3. downloading the file from the url and validating its checksum (the checksum + is provided by the pointer-file) + +4. 
replacing the pointer-file with the downloaded file + +This definitely "gets the job done," there are some concerns: +- If we aren't careful, we could potentially hit GitHub's internal limits + for these kinds of downloads (see https://stackoverflow.com/a/74960542) +- frankly, it doesn't like a great idea to directly replace a file in the git + repository + +(We can probably overcome both issues) +""" + +# for portability: only use standard-library modules present in older python versions +import argparse +import functools +import hashlib +import logging +import os +import re +import shutil +import subprocess +import sys +import tempfile +from typing import ( + Container, + Dict, + IO, + Iterable, + Mapping, + NamedTuple, + Optional, + Tuple, + Union, +) + +# Handle some global stuff +# ======================== +if sys.version_info < (3, 6, 1): # 3.6.0 doesn't support all NamedTuple features + raise RuntimeError("python 3.6.1 or newer is required") + +logger = logging.getLogger("setup") +logger.setLevel(logging.DEBUG) + +_CHUNKSIZE = 8192 # default chunksize used for file operations + + +def _configure_logger(color=False): + global logger + + color_start = "" + color_stop = "" + if color: + color_start = "\x1b[36;20m" + color_stop = "\x1b[0m" + + fmt = f"{color_start}%(name)s{color_stop} > %(message)s" + + console_handler = logging.StreamHandler() + console_handler.setFormatter(logging.Formatter(fmt)) + logger.addHandler(console_handler) + + +class ScriptError(RuntimeError): + pass + + +def _fmt_env_args( + include_outer_env: bool = True, + env: Optional[Mapping[str, str]] = None, +) -> str: + """ + Format a string representation conveying env variables as concisely as possible + """ + # this assumes that the env-overwrites are short + kv_pairs = [] if env is None else (f"{k}={v}" for k, v in env.items()) + if include_outer_env and env is None: + return "" + elif include_outer_env: + return f".update({'; '.join(kv_pairs)})" + elif env is None: + return "" + else: + 
return f"{{{'; '.join(kv_pairs)}}}" + + +def _get_subprocess_run_env_kwarg( + include_outer_env: bool = True, + env: Optional[Mapping[str, str]] = None, +) -> Optional[Dict[str, str]]: + """Construct the env kwarg for subprocess.run""" + if include_outer_env and env is None: + return None # subprocess simply inherits the environment variables + elif include_outer_env: + out = os.environ.copy() + out.update(env) + return out + elif env is None: + return {} # subprocess is run with no environment variables + else: + return env + + +class CmdRslt(NamedTuple): + returncode: int # the exit code + stdout: Optional[str] # the stdout stream (if captured) + + +def _run( + *args: str, + log: bool = True, + silent: bool = False, + cwd: Optional[str] = None, + timeout: Optional[float] = None, + include_outer_env: bool = True, + env: Optional[Mapping[str, str]] = None, + stdout: Union[IO[str], int, None] = None, + stderr: Union[IO[str], int, None] = None, + success_codes: Optional[Container[int]] = (0,), +) -> CmdRslt: + """Invoke a command + + The interface is loosely inspired by the nox API + + Parameters + ---------- + *args + The command and its arguments + log : bool + When ``True``, we log the command. Default is ``True`` + silent : bool + Default is ``False``. When ``True``, silences command output and + returns the output from this function. This is accomplished by + combining stdout & stderr into a single stream. + cwd : str, optional + Optionally specifies a directory to invoke the command from + timeout : float, optional + If the timeout expires, the subprocess will be killed and after it + is done terminating, an exception is raised + include_outer_env: bool = True, + When True (the default), the subprocess inherits the environment of + the current process + env : dict, optional + When specified, it's used to specify the subprocess's env variables. + When include_outer_env is True, we overwrite variables. 
+ stdout + Optionally specifies an open file object or a file descriptor where + the contents of stdout are written. Incompatible with silent=True + stderr + Optionally specifies an open file object or a file descriptor where + the contents of stderr are written. Incompatible with silent=True + + Returns + ------- + CmdRslt + Holdds the return code and stdout (if it was captured) + """ + # some argument checking: + if len(args) == 0: + raise ValueError("args was not specified") + elif not isinstance(args[0], str): + raise TypeError(f"args[0], {args[0]!r}, isn't a str") + + if log: + _msg = " ".join(args) + _meta_list = [] + if cwd is not None: + _meta_list.append(f"exec_dir: {cwd}") + _env_str = _fmt_env_args(include_outer_env=include_outer_env, env=env) + _meta_list.append(f"ENV: {_env_str}") + logger.info(f"$ {_msg}; ({'; '.join(_meta_list)})") + + # adjust stdout & stderr if necessary + if silent: + if stdout is not None: + raise ValueError("Can't specify stdout kwarg with silent==True") + elif stderr is not None: + raise ValueError("Can't specify stderr kwarg with silent==True") + # combine stdout and stder into a single stream + stdout = subprocess.PIPE + stderr = subprocess.STDOUT + elif stderr == subprocess.PIPE: + raise ValueError("currently no support for stderr=subprocess.PIPE") + + tmp_rslt = subprocess.run( + args, + cwd=cwd, + stdout=stdout, + stderr=stderr, + env=_get_subprocess_run_env_kwarg(include_outer_env=include_outer_env, env=env), + timeout=timeout, + ) + sys.stdout.flush() + + # repackage the result + _stdout = tmp_rslt.stdout.decode("utf8") if tmp_rslt.stdout is not None else None + rslt = CmdRslt(returncode=tmp_rslt.returncode, stdout=_stdout) + + if (success_codes is not None) and (rslt.returncode not in success_codes): + if silent and rslt.stdout: + print(rslt.stdout, file=sys.stderr, flush=True) + cwd = "./" if cwd is None else cwd + raise ScriptError( + "subprocess exited with nonzero code\n" + f" command: {' '.join(args)}\n exec_dir: 
{cwd!r}\n" + f" env: {_fmt_env_args(include_outer_env=include_outer_env, env=env)}\n" + f" code: {rslt.returncode}\n" + ) + return rslt + + +# define the actual CI logic +# -------------------------- +# -> the plan is to gradually script more and more CI log in python and move away +# from shell-scripting. But, we are starting out with EXTREMELY simple logic + +_keyvalue_regex = re.compile(r"(?P[._a-z0-9]+) (?P[^\r\n]+)\n") + + +class PointerFileInfo(NamedTuple): + """ + Specifies information about a large file tracked by git-lfs + """ + + # path (on disk) to the repository holding the file + full_file_path: str + # path of the file relative to the root of the repository holding the file + relative_to_repo_path: str + # oid is the standard git abbreviation for object id. It is checksum that is + # used to uniquely identify the file's contents + oid: str + # size specifies the full file's size + size: int + + +def _parse_ptr_file(repo_location: str, relative_to_repo_path: str) -> PointerFileInfo: + """ + Parse the contents of a git-lfs pointer file. + + Raises an exception if the file doesn't follow the specification: + https://github.com/git-lfs/git-lfs/blob/8e6e9f1894d8ec89b74222c3fc00cb183959afd9/docs/spec.md + """ + # to simplify code, we're slightly more permissive than the spec in 3 regards: + # 1. technically, the spec states that pointer files don't exceed 1024 bytes + # (we allow 1025 bytes since there is some ambiguity about whether a + # trailing newline would count) + # 2. 
we aren't strict about key ordering + + path = os.path.join(repo_location, relative_to_repo_path) + + def _mk_err(msg): + return RuntimeError(f"`{path}` isn't a git-lfs pointer file: {msg}") + + with open(path, "rb") as f: + # unsure if an extra trailing newline is allowed to be appended + # (for safety, assume that it is allowed and doesn't affect size) + buf = f.read(1026) + if buf.endswith(b"\n\n"): + buf = buf[:-1] + if len(buf) > 1024: + raise _mk_err("too large") + try: + contents = buf.decode(encoding="utf-8", errors="strict") + except UnicodeDecodeError: + raise _mk_err("not utf-8 encoded") from None + + tmp = {} + cur_pos = 0 + for match in _keyvalue_regex.finditer(contents): + if cur_pos != match.start(): + raise _mk_err(f"line {len(tmp)} isn't a standard key-value pair") + key = match["key"] + if key in tmp: + raise _mk_err(f"the key, {key!r}, appears more than once") + tmp[key] = match["value"] + cur_pos = match.end() + if cur_pos != len(contents): + raise _mk_err(f"line {1 + len(tmp)} isn't a standard key-value pair") + return PointerFileInfo( + full_file_path=path, + relative_to_repo_path=relative_to_repo_path, + oid=tmp["oid"], + size=tmp["size"], + ) + + +@functools.lru_cache(maxsize=1) +def _git_lfs_version() -> Tuple[int, int, int]: + """returns the major, minor, and patch version numbers for ``git-lfs``""" + string = _run("git", "lfs", "--version", log=False, silent=True).stdout.rstrip() + m = re.match(r"^git-lfs/(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)", string) + if m is None: + raise ScriptError( + f"git-lfs returns a string with an unexpected format: {string!r}" + ) + return (int(m.group(1)), int(m.group(2)), int(m.group(3))) + + +def _scan_lfs_tracked_paths( + repo_path: str, *, relative_to_repo_path: bool = True +) -> Iterable[str]: + """ + Return an iterable of all paths in a repository that correspond to files + tracked by git-lfs + + Parameters + ---------- + repo_path + Path to the local repository that we are querying + 
relative_to_repo_path + When ``True`` (the default), yielded paths are relative to the root + of the repository. Otherwise, absolute paths are yielded + + Yields + ------ + str + path tracked by ``git-lfs`` (see ``relative_to_repo_path`` kwarg for + more details) + """ + version = _git_lfs_version() + if version < (2, 6, 0): + raise ScriptError( + "Can't query files tracked by git-lfs since the git-lfs version, " + f"{'.'.join(version)}, is older than 2.6.0" + ) + else: + # get a list of tracked file names (use -n flag rather than --name-only since + # past release notes suggests that the latter was originally --names) + path_list_string = _run( + "git", "-C", repo_path, "lfs", "ls-files", "-n", log=False, silent=True + ).stdout + paths = path_list_string.rstrip().splitlines() + + if relative_to_repo_path: + yield from paths + else: + yield from (os.path.join(repo_path, p) for p in paths) + + +def _iterate_over_ptr_file(submodule_path: str) -> Iterable[PointerFileInfo]: + """ + Returns an iterable ``PointerFileInfo`` corresponding for each git-lfs + pointer file in the specified submodule. This does not include entries in + cases where the pointer file has been replaced with the large file. The + behavior is undefined (it may be inconsistent) if no file can be found at + the path to an lfs-tracked file. + """ + if _git_lfs_version() < (3, 7, 0): + for p in _scan_lfs_tracked_paths(submodule_path, relative_to_repo_path=True): + # test whether each path hold a "pointer file" or the file itself + try: + yield _parse_ptr_file(submodule_path, p) + except RuntimeError: + continue + except FileNotFoundError: + raise RuntimeError( + f"Expected to find a pointer file or large file at {p}" + ) from None + else: + # This branch only works for newer versions of git-lfs. 
It only exists because + # I didn't originaly realize that it depends on newer features (we keep it + # because it is more efficient) + + # we delay the import of json to reduce overhead + import json + + json_string = _run( + *["git", "-C", submodule_path, "lfs", "ls-files", "--json"], + log=False, + silent=True, + ).stdout.rstrip() + json_data = json.loads(json_string) + + # some quick sanity checks on the format + if "files" not in json_data: + raise ScriptError( + '"files" key is missing from json output of `git lfs ls-files` for ' + f"submodule @ {submodule_path}" + ) + elif len(json_data) == 0: + raise ScriptError( + "there don't appear to be any files tracked by git-lfs in submodule @ " + f"{submodule_path}" + ) + elif ("name" not in json_data["files"][0]) or not isinstance( + json_data["files"][0].get("checkout"), bool + ): + raise ScriptError( + "Unexpected json format from `git lfs ls-files --json` (did the schema " + "change between git-lfs versions)?" + ) + + # now confirm that each of the files was checked out + for finfo in json_data["files"]: + if not finfo["checkout"]: + yield PointerFileInfo( + full_file_path=os.path.join(submodule_path, finfo["name"]), + relative_to_repo_path=finfo["name"], + oid=f"{finfo['oid_type']}:{finfo['oid']}", + size=finfo["size"], + ) + + +def _progress_bar(tot_bytes, silent=False): + """provides a function for drawing/updating progress bars""" + from math import log10 + + ncols = shutil.get_terminal_size()[0] - 1 + power_div_3 = int(log10(tot_bytes) // 3) if tot_bytes > 0 else 0 + factor, unit = 1000.0**power_div_3, (" B", "KB", "MB", "GB")[power_div_3] + # the output line has the form: '[] / ' + fmt = "\r[{bar:{barlen}.{nfill}}] {size:.2f}" + f"/{tot_bytes / factor:.2f} {unit}" + barlen = ncols - 19 # for context, 15 <= (len(fmt.format(...)) - barlen) <= 19 + suppress = (barlen < 1) or silent or not sys.stdout.isatty() + bar = None if suppress else (barlen * "=") + + def _update(size): + nonlocal bar + if size is None 
and bar is not None: + print(flush=True) + bar = None + elif bar is not None: + nfill = int(barlen * (size / tot_bytes)) + val = fmt.format(bar=bar, barlen=barlen, nfill=nfill, size=size / factor) + print(val, end="", flush=True) + + return _update + + +def _retrieve_url( + url: str, dst: str, *, silent: bool = False, chunksize: int = _CHUNKSIZE +): + """download the file from url to dst""" + # delay the imports of the following modules since they are only used in + # this function, and this function is only invoked as a fallback-plan + import contextlib + import urllib.request + from urllib.error import URLError, HTTPError + + try: + req = urllib.request.Request(url) + with contextlib.ExitStack() as stack: + out_file = stack.enter_context(open(dst, "wb")) + response = stack.enter_context(urllib.request.urlopen(req)) + total_bytes = int(response.headers.get("Content-Length", -1)) + update_progress = _progress_bar(total_bytes, silent=silent) + stack.callback(update_progress, size=None) + + # write downloaded data to a file + downloaded_bytes = 0 + while True: + update_progress(downloaded_bytes) + block = response.read(chunksize) + if not block: + break + downloaded_bytes += len(block) + out_file.write(block) + except HTTPError as e: + raise ScriptError(f"server can't fulfill request to fetch {url}: {e.code}") + except URLError as e: + raise ScriptError(f"server can't be reached to fetch {url}: {e.code}") + + +def calc_checksum(fname, alg_name, *, chunksize=_CHUNKSIZE): + """Calculate the checksum for a given fname""" + hash_calculator = hashlib.new(alg_name) + with open(fname, "rb") as f: + buffer = bytearray(chunksize) + while True: + nbytes = f.readinto(buffer) + if nbytes == chunksize: + hash_calculator.update(buffer) + elif nbytes: # equivalent to: (nbytes is not None) and (nbytes > 0) + hash_calculator.update(buffer[:nbytes]) + else: + break + return ":".join([alg_name.lower(), hash_calculator.hexdigest()]) + + +def _fallback_download(repo_path: Optional[str], 
relative_submodule_path: str): + """ + An EXTREMELY hacky fallback scheme to manually download files if + git-lfs failed + + The module-level docstring at the top of this file provides more context + """ + + # get commit-hash associated with the submodule + with tempfile.TemporaryFile() as tmp_fp: + full_command = ["git", "rev-parse", f"HEAD:{relative_submodule_path}"] + _run(*full_command, log=False, cwd=repo_path, stdout=tmp_fp) + tmp_fp.seek(0) + submodule_commit_hash = tmp_fp.read().decode("utf-8").rstrip() + + # maybe don't hardcode _URL_PREFIX in the future + _URL_PREFIX = "https://github.com/cholla-hydro/cholla-tests-data" + base_url = f"{_URL_PREFIX}/raw/{submodule_commit_hash}" + + # construct an iterator for all of the pointer files that were not checked out + if repo_path is None: + submodule_path = relative_submodule_path + else: + submodule_path = os.path.join(repo_path, relative_submodule_path) + itr = _iterate_over_ptr_file(submodule_path) + logger.info("HACKY WORKAROUND: attempt to manually fetch files tracked by git-lfs") + logger.info(f"-> using : {base_url}") + + with tempfile.TemporaryDirectory() as tmpdirname: + tmp_dst_path = os.path.join(tmpdirname, "downloaded-file") + for ptr_info in itr: + logging_name = f"/{ptr_info.relative_to_repo_path}" + logger.info(f"try downloading {logging_name}") + + # download the file + full_url = f"{base_url}/{ptr_info.relative_to_repo_path}" + _retrieve_url(url=full_url, dst=tmp_dst_path) + + final_dst_path = ptr_info.full_file_path + + # validate checksum + oid = ptr_info.oid.lower() + alg_name, _ = ptr_info.oid.split(":") # alg_name is sha256 (as of now) + cksum = calc_checksum(tmp_dst_path, alg_name) + if cksum != oid: + raise ScriptError(f"downloaded {logging_name} has wrong checksum") + + # finally move the file to the final destination + os.remove(final_dst_path) + # we use shutil.move in case tmp_dst_path is on a different file-system + shutil.move(tmp_dst_path, final_dst_path) + + +def 
_check_submodule_validity(submod_path: str) -> bool: + """Checks whether the submodule is valid""" + message = _run( + "git", "-C", submod_path, "status", "--porcelain=v1", silent=True, log=False + ).stdout + return (len(message) == 0) or message.isspace() + + +def _setup_submodule( + repo_path: Optional[str] = None, + simulate_lfs_fetch_failure: bool = False, + simulate_submodule_error: bool = False, + fallback_manual_lfs_download: bool = False, +): + """ + Encodes the actual logic for setting up the submodule + + For some context: + - both git's submodule feature and the git-lfs are not particularly well + regarded. My impression is that both of these things historically had + major problems. While they both have come a long way and improved a lot, + I think the sentiment remains that they are not very optimal tools + (particularly git-lfs) + - while both git's submodule feature and the git-lfs features do work + together, there is a surprising lack of documentation about dealing with + issues + - things further get complicated while using them on shared file systems + with high latencies. We have run into a bunch of intermittent issues + with git-submodule-update failing in weird ways and with git-lfs failing + to download files + + See the module-level docstring for more details. 
+ """ + + # we currently assume that the repository has already been cloned (it needs to be + # in order to be running this script) + _submod_name = "cholla-tests-data" + if repo_path is None: + logger.info(f"Submodule Setup (assumed repo-path: {os.getcwd()})") + submod_path = f"./{_submod_name}" + else: + logger.info(f"Submodule Setup (repo-path: {repo_path})") + submod_path = os.path.join(repo_path, _submod_name) + + # first, we fetch the submodule data without pulling data for the large files + # tracked by git-lfs + # -> instead we pull the pointer files (that instructs git-lfs where to get the + # data from) + # -> I spent a lot of time trying to see if we could prefetch the git-lfs data, but + # that doesn't seem to be possible for git submodules + # -> It appears that I NEED to use the environment variable to instruct git-lfs to + # not pull the big files. I also tried using + # `git lfs install --local --skip-smudge` + # but that doesn't work + logger.info("Get the submodule data (without full data tracked by git-lfs)") + _run( + *["git", "submodule", "update", "--init"], + cwd=repo_path, + env={"GIT_LFS_SKIP_SMUDGE": "1"}, + ) + + # we may want to simulate a git-submodule-update (for testing purposes) + if simulate_submodule_error: + logger.info("Simulate errors in git-submodule-update") + for path in _scan_lfs_tracked_paths(submod_path, relative_to_repo_path=False): + try: + os.remove(path) + except FileNotFoundError: + pass # this means that git-submodule-update naturally had errors + + # now, check for errors from git-submodule-update & try to recover (if necessary) + # -> silent errors commonly occur on the CRC cluster. + logger.info("Check whether git-submodule-update had any errors") + if _check_submodule_validity(submod_path): + logger.info("git submodule update succeeded.") + else: + logger.info("Error with git-submodule-update. 
Showing result of git-status") + _run("git", "-C", submod_path, "status") + + logger.info("Attempting to recover from git-submodule-update's error") + # we must explicitly disable git-lfs. Otherwise, it will try to run right now + # (and we don't want to deal with any git-lfs errors yet) + _run( + *["git", "-C", submod_path, "restore", "."], + env={"GIT_LFS_SKIP_SMUDGE": "1"}, + ) + if not _check_submodule_validity(submod_path): + raise ScriptError("could not fix up the module") + logger.info("Recovery was succesful") + + # finally, we pull the git-lfs tracked data + logger.info("Pre-fetch then Checkout data tracked by git-lfs") + try: + if simulate_lfs_fetch_failure: # for testing purposes + logger.info("simulating failure of git-lfs") + raise ScriptError("simulated failure") + _run( + *["git", "submodule", "foreach", "--recursive", "git", "lfs", "fetch"], + cwd=repo_path, + ) + _run( + *["git", "submodule", "foreach", "--recursive", "git", "lfs", "checkout"], + cwd=repo_path, + ) + except ScriptError: + if fallback_manual_lfs_download: + logger.info("Attempting to work around git-lfs failure") + _fallback_download( + repo_path=repo_path, relative_submodule_path="cholla-tests-data" + ) + else: + raise + + +def main(args: argparse.Namespace): + _configure_logger(color=args.color) + + if args.detailed_help: + print(__doc__.strip()) + return 0 + + try: + _setup_submodule( + repo_path=args.repo_path, + simulate_lfs_fetch_failure=args.simulate_lfs_fetch_failure, + simulate_submodule_error=args.simulate_submodule_error, + fallback_manual_lfs_download=args.fallback_manual_lfs_download, + ) + + except ScriptError as err: + # in this case, we handle "expected errors" + # - these are things that *should* generally work, but could go wrong + # - in general, these errors have nice error-messages and the standard python + # traceback would simply pollute this script's output + # - an example is that a git command may fail because of a network issue or + # something unrelated 
to the core-logic in the script + logger.error(f"ERROR: {err.args[0]}") + return 70 # https://www.man7.org/linux/man-pages/man3/sysexits.h.3head.html + except BaseException: + # here we handle all other exceptions (e.g. programming errors, + # KeyboardInterrupt). Generally we want a standard traceback in these cases + logger.error("Unexpected error:") + raise + else: + logger.info("success") + return 0 + + +parser = argparse.ArgumentParser( + description=( + "Helps setup the submodule for continuous integration. The --detailed-help " + "flag will display an extended description of this tool's purpose and why " + "it exists" + ), + allow_abbrev=False, +) + +parser.add_argument("--color", action="store_true", help="use color") +parser.add_argument( + "--detailed-help", action="store_true", help="shows the detailed help message" +) +parser.add_argument( # used for testing + "--repo-path", default=None, help="optionally specify path to repository" +) +parser.add_argument( # used for testing + "--simulate-lfs-fetch-failure", + action="store_true", + help=( + "skip the git-lfs commands and act as if it failed. This is primarily intended " + "for testing purposes" + ), +) +parser.add_argument( # used for testing + "--simulate-submodule-error", + action="store_true", + help="simulate a common git submodule update issue", +) +parser.add_argument( + "--fallback-manual-lfs-download", + action="store_true", + default=None, + help=( + "enables the fallback strategy when `git-lfs-fetch` or `git-lfs-checkout`fails" + ), +) + +if __name__ == "__main__": + sys.exit(main(parser.parse_args()))