Skip to content

Commit

Permalink
Limit reading/mounting from shared filesystem JobStore (#5047)
Browse files Browse the repository at this point in the history
* Add --symlinkJobStoreReads option to limit reading/mounting from shared filesystem

* Add missing documentation for --symlinkJobStoreReads

* Fix option indent so RST parses correctly

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
adamnovak and github-actions[bot] authored Aug 15, 2024
1 parent b5fd307 commit 5b1dbdb
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 6 deletions.
7 changes: 7 additions & 0 deletions docs/running/cliOptions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,13 @@ Allows configuring Toil's data storage.
to use a batch system that does not support
cleanup. Set to "true" if caching
is desired.
--symlinkJobStoreReads BOOL
Allow reads and container mounts from a JobStore's
shared filesystem directly via symlink. Can be turned
off if the shared filesystem can't support the IO load
of all the jobs reading from it at once, and you want
to use ``--caching=True`` to make jobs on each node
read from node-local cache storage. (Default=True)

**Autoscaling Options**
Allows the specification of the minimum and maximum number of nodes in an
Expand Down
2 changes: 2 additions & 0 deletions src/toil/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ class Config:
caching: Optional[bool]
symlinkImports: bool
moveOutputs: bool
symlink_job_store_reads: bool

# Autoscaling options
provisioner: Optional[str]
Expand Down Expand Up @@ -338,6 +339,7 @@ def set_option(option_name: str,
set_option("symlinkImports", old_names=["linkImports"])
set_option("moveOutputs", old_names=["moveExports"])
set_option("caching", old_names=["enableCaching"])
set_option("symlink_job_store_reads")

# Autoscaling options
set_option("provisioner")
Expand Down
11 changes: 7 additions & 4 deletions src/toil/jobStores/fileJobStore.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def __init__(self, path: str, fanOut: int = 1000) -> None:

self.linkImports = None
self.moveExports = None
self.symlink_job_store_reads = None

def __repr__(self):
return f'FileJobStore({self.jobStoreDir})'
Expand All @@ -123,6 +124,7 @@ def initialize(self, config):
os.makedirs(self.sharedFilesDir, exist_ok=True)
self.linkImports = config.symlinkImports
self.moveExports = config.moveOutputs
self.symlink_job_store_reads = config.symlink_job_store_reads
super().initialize(config)

def resume(self):
Expand Down Expand Up @@ -314,8 +316,8 @@ def _copy_or_link(self, src_path, dst_path, symlink=False):
atomic_copy(srcPath, dst_path)

def _import_file(self, otherCls, uri, shared_file_name=None, hardlink=False, symlink=True):
# symlink argument says whether the caller can take symlinks or not
# ex: if false, it implies the workflow cannot work with symlinks and thus will hardlink imports
# symlink argument says whether the caller can take symlinks or not.
# ex: if false, it means the workflow cannot work with symlinks and we need to hardlink or copy.
# default is true since symlinking everything is ideal
uri_path = unquote(uri.path)
if issubclass(otherCls, FileJobStore):
Expand Down Expand Up @@ -515,8 +517,9 @@ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> Non
# one over the other will fail.
return

if symlink:
# If the reader will accept a symlink, so always give them one.
if symlink and self.symlink_job_store_reads:
# If the reader will accept a symlink, and we are willing to
# symlink into the jobstore, always give them one.
# There's less that can go wrong.
try:
os.symlink(jobStoreFilePath, local_path)
Expand Down
9 changes: 7 additions & 2 deletions src/toil/options/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,15 +349,15 @@ def is_within(x: Union[int, float]) -> bool:
link_imports_help = ("When using a filesystem based job store, CWL input files are by default symlinked in. "
"Setting this option to True instead copies the files into the job store, which may protect "
"them from being modified externally. When set to False, as long as caching is enabled, "
"Toil will protect the file automatically by changing the permissions to read-only."
"Toil will protect the file automatically by changing the permissions to read-only. "
"default=%(default)s")
link_imports.add_argument("--symlinkImports", dest="symlinkImports", type=strtobool, default=True,
metavar="BOOL", help=link_imports_help)
move_exports = file_store_options.add_mutually_exclusive_group()
move_exports_help = ('When using a filesystem based job store, output files are by default moved to the '
'output directory, and a symlink to the moved exported file is created at the initial '
'location. Setting this option to True instead copies the files into the output directory. '
'Applies to filesystem-based job stores only.'
'Applies to filesystem-based job stores only. '
'default=%(default)s')
move_exports.add_argument("--moveOutputs", dest="moveOutputs", type=strtobool, default=False, metavar="BOOL",
help=move_exports_help)
Expand All @@ -368,6 +368,11 @@ def is_within(x: Union[int, float]) -> bool:
help=caching_help)
# default is None according to PR 4299, seems to be generated at runtime

file_store_options.add_argument("--symlinkJobStoreReads", dest="symlink_job_store_reads", type=strtobool, default=True,
metavar="BOOL",
help="Allow reads and container mounts from a JobStore's shared filesystem directly "
"via symlink. default=%(default)s")

# Auto scaling options
autoscaling_options = parser.add_argument_group(
title="Toil options for autoscaling the cluster of worker nodes.",
Expand Down
26 changes: 26 additions & 0 deletions src/toil/test/jobStores/jobStoreTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1226,6 +1226,32 @@ def test_file_link_imports(self):
os.remove(filename)
os.remove(srcUrl[7:])

def test_symlink_read_control(self):
"""
Test that files are read by symlink when expected
"""

for should_link in (False, True):
# Configure a jobstore to symlink out reads or not, as appropriate
config = self._createConfig()
config.symlink_job_store_reads = should_link
store = FileJobStore(self.namePrefix + ("-link" if should_link else "-nolink"))
store.initialize(config)

# Put something in the job store
src_url, _ = self._prepareTestFile(self._externalStore(), 1)
file_id = store.import_file(src_url, symlink=False)

# Read it out, accepting a symlink
dest_dir = self._createTempDir()
dest_path = os.path.join(dest_dir, "file.dat")
store.read_file(file_id, dest_path, symlink = True)

# Make sure we get a symlink exactly when configured to
assert os.path.exists(dest_path)
assert os.path.islink(dest_path) == should_link



@needs_google_project
@needs_google_storage
Expand Down

0 comments on commit 5b1dbdb

Please sign in to comment.