diff --git a/metaflow/plugins/datatools/s3/s3.py b/metaflow/plugins/datatools/s3/s3.py index 2ce11b64763..a758d863ead 100644 --- a/metaflow/plugins/datatools/s3/s3.py +++ b/metaflow/plugins/datatools/s3/s3.py @@ -49,6 +49,7 @@ get_timestamp, TRANSIENT_RETRY_START_LINE, TRANSIENT_RETRY_LINE_CONTENT, + CONSECUTIVE_SLASHES_REGEX, ) if TYPE_CHECKING: @@ -550,7 +551,9 @@ def __init__( else: prefix = os.path.join(prefix, run.parent.id, run.id) - self._s3root = "s3://%s" % os.path.join(bucket, prefix.strip("/")) + s3root_raw = "s3://%s" % os.path.join(bucket, prefix.strip("/")) + # Normalize the path by collapsing consecutive slashes + self._s3root = CONSECUTIVE_SLASHES_REGEX.sub("/", s3root_raw).rstrip("/") elif s3root: # 2. use an explicit S3 prefix parsed = urlparse(to_unicode(s3root)) @@ -558,7 +561,9 @@ def __init__( raise MetaflowS3URLException( "s3root needs to be an S3 URL prefixed with s3://." ) - self._s3root = s3root.rstrip("/") + # Normalize the path by collapsing consecutive slashes + normalized = CONSECUTIVE_SLASHES_REGEX.sub("/", s3root) + self._s3root = normalized.rstrip("/") else: # 3. use the client only with full URLs self._s3root = None diff --git a/metaflow/plugins/datatools/s3/s3op.py b/metaflow/plugins/datatools/s3/s3op.py index 9b0ee45fac1..68a192f8af6 100644 --- a/metaflow/plugins/datatools/s3/s3op.py +++ b/metaflow/plugins/datatools/s3/s3op.py @@ -45,6 +45,7 @@ get_timestamp, TRANSIENT_RETRY_LINE_CONTENT, TRANSIENT_RETRY_START_LINE, + CONSECUTIVE_SLASHES_REGEX, ) import metaflow.tracing as tracing from metaflow.metaflow_config import ( @@ -623,7 +624,7 @@ def list_prefix(self, prefix_url, delimiter=""): and len(key_path) > len(normalized_prefix) ): continue - url = url_base + key_path + url = CONSECUTIVE_SLASHES_REGEX.sub("/", url_base + key_path) urlobj = S3Url( url=url, bucket=prefix_url.bucket, @@ -635,7 +636,9 @@ def list_prefix(self, prefix_url, delimiter=""): if "CommonPrefixes" in page: # we get CommonPrefixes if Delimiter is a non-empty string for key in page.get("CommonPrefixes", []): - url = url_base + key["Prefix"] + url = CONSECUTIVE_SLASHES_REGEX.sub( + "/", url_base + key["Prefix"] + ) urlobj = S3Url( url=url, bucket=prefix_url.bucket, diff --git a/metaflow/plugins/datatools/s3/s3util.py b/metaflow/plugins/datatools/s3/s3util.py index 51a79787653..45304d936f2 100644 --- a/metaflow/plugins/datatools/s3/s3util.py +++ b/metaflow/plugins/datatools/s3/s3util.py @@ -1,6 +1,7 @@ from __future__ import print_function from datetime import datetime import random +import re import time import sys import os @@ -19,6 +20,10 @@ TRANSIENT_RETRY_LINE_CONTENT = "" TRANSIENT_RETRY_START_LINE = "### RETRY INPUTS ###" +# Compiled regex for normalizing consecutive slashes in S3 paths +# Matches two or more consecutive slashes not preceded by a colon (to preserve s3://) +CONSECUTIVE_SLASHES_REGEX = re.compile(r"(?