Skip to content

Commit

Permalink
Avoid copying files with same name prefix when copying folder (#576)
Browse files Browse the repository at this point in the history
* fix: avoid copying files with same name prefix

* feat: avoid calling isdir before fetching objects
  • Loading branch information
john-jam authored Aug 31, 2023
1 parent e64ede3 commit 5b97087
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 23 deletions.
34 changes: 14 additions & 20 deletions gcsfs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,18 +989,6 @@ async def _info(self, path, generation=None, **kwargs):
else:
raise FileNotFoundError(path)

async def _glob(self, path, prefix="", **kwargs):
    """Glob ``path``, deriving a ``prefix`` when the caller supplies none.

    If ``prefix`` is empty, it is set to the last "/"-separated segment of
    the text preceding the first wildcard character (``*``, ``?`` or ``[``)
    in ``path``; when ``path`` holds no wildcard, the last segment of the
    whole path is used. The call is then forwarded to the parent class's
    ``_glob`` with that prefix and any extra keyword arguments.
    """
    if prefix:
        # Caller already chose a prefix: forward it untouched.
        return await super()._glob(path, prefix=prefix, **kwargs)

    # Wildcard characters follow fsspec.spec.AbstractFileSystem.glob and
    # the glob.has_magic patterns.
    wildcard_hits = [pos for pos in map(path.find, "*?[") if pos >= 0]
    literal_end = min(wildcard_hits, default=len(path))

    # Keep only what follows the final "/" of the wildcard-free head.
    derived = path[:literal_end].rpartition("/")[2]
    return await super()._glob(path, prefix=derived, **kwargs)

async def _ls(
self, path, detail=False, prefix="", versions=False, refresh=False, **kwargs
):
Expand Down Expand Up @@ -1399,21 +1387,27 @@ async def _find(
**kwargs,
):
path = self._strip_protocol(path)
bucket, key, generation = self.split_path(path)

if maxdepth is not None and maxdepth < 1:
raise ValueError("maxdepth must be at least 1")

if prefix:
_path = "" if not key else key.rstrip("/") + "/"
_prefix = f"{_path}{prefix}"
else:
_prefix = key

# Fetch objects as if the path is a directory
objects, _ = await self._do_list_objects(
bucket, delimiter="", prefix=_prefix, versions=versions
path, delimiter="", prefix=prefix, versions=versions
)

if not objects:
# Fetch objects as if the path is a file
bucket, key, _ = self.split_path(path)
if prefix:
_path = "" if not key else key.rstrip("/") + "/"
_prefix = f"{_path}{prefix}"
else:
_prefix = key
objects, _ = await self._do_list_objects(
bucket, delimiter="", prefix=_prefix, versions=versions
)

dirs = {}
cache_entries = {}

Expand Down
7 changes: 4 additions & 3 deletions gcsfs/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,10 +292,11 @@ def test_gcs_glob(gcs):
for f in gcs.glob(TEST_BUCKET + "/nested/*")
if gcs.isfile(f)
)
# the following is no longer true since the glob method lists the root path
# Ensure the glob only fetches prefixed folders
gcs.dircache.clear()
gcs.glob(TEST_BUCKET + "/nested**1")
assert all(d.startswith(TEST_BUCKET + "/nested") for d in gcs.dircache)
# gcs.dircache.clear()
# gcs.glob(TEST_BUCKET + "/nested**1")
# assert all(d.startswith(TEST_BUCKET + "/nested") for d in gcs.dircache)
# the following is no longer true as of #437
# gcs.glob(TEST_BUCKET + "/test*")
# assert TEST_BUCKET + "/test" in gcs.dircache
Expand Down

0 comments on commit 5b97087

Please sign in to comment.