Skip to content

Make conversion of file URLs more consistent #13501

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/13501.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make conversion of file URLs more consistent across Python versions.
47 changes: 24 additions & 23 deletions src/pip/_internal/models/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,43 +121,47 @@ def _clean_url_path_part(part: str) -> str:
return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_file_url_path(part: str) -> str:
def _clean_file_url(url: str) -> str:
"""
Clean the first part of a URL path that corresponds to a local
Clean a URL that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# Replace "@" characters to protect them from percent-encoding.
at_symbol_token = "---PIP_AT_SYMBOL---"
assert at_symbol_token not in url
url = url.replace("@", at_symbol_token)
parts = urllib.parse.urlsplit(url)

# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
ret = urllib.request.pathname2url(urllib.request.url2pathname(part))
if ret.startswith("///"):
# Remove any URL authority section, leaving only the URL path.
ret = ret.removeprefix("//")
return ret
# exist, the colon should be quoted.
tidy_url = path_to_url(url_to_path(url), normalize_path=False)
tidy_parts = urllib.parse.urlsplit(tidy_url)

# Restore the original scheme, query and fragment components.
url = urllib.parse.urlunsplit(tidy_parts[:3] + parts[3:])
url = url.replace(tidy_parts.scheme, parts.scheme, 1)

# Restore "@" characters that were replaced earlier.
return url.replace(at_symbol_token, "@")


# percent-encoded: /
_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)


def _clean_url_path(path: str, is_local_path: bool) -> str:
def _clean_url_path(path: str) -> str:
"""
Clean the path portion of a URL.
"""
if is_local_path:
clean_func = _clean_file_url_path
else:
clean_func = _clean_url_path_part

# Split on the reserved characters prior to cleaning so that
# revision strings in VCS URLs are properly preserved.
parts = _reserved_chars_re.split(path)

cleaned_parts = []
for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
cleaned_parts.append(clean_func(to_clean))
cleaned_parts.append(_clean_url_path_part(to_clean))
# Normalize %xx escapes (e.g. %2f -> %2F)
cleaned_parts.append(reserved.upper())

Expand All @@ -174,13 +178,10 @@ def _ensure_quoted_url(url: str) -> str:
# `scheme://netloc/path?query#fragment`.
result = urllib.parse.urlsplit(url)
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
# Temporarily replace scheme with file to ensure the URL generated by
# urlunsplit() contains an empty netloc (file://) as per RFC 1738.
ret = urllib.parse.urlunsplit(result._replace(scheme="file", path=path))
ret = result.scheme + ret[4:] # Restore original scheme.
return ret
if not result.netloc:
return _clean_file_url(url)
path = _clean_url_path(result.path)
return urllib.parse.urlunsplit(result._replace(path=path))


def _absolute_link_url(base_url: str, url: str) -> str:
Expand Down
83 changes: 49 additions & 34 deletions src/pip/_internal/utils/urls.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,70 @@
import os
import string
import sys
import urllib.parse
import urllib.request

from .compat import WINDOWS


def path_to_url(path: str) -> str:
def path_to_url(path: str, normalize_path: bool = True) -> str:
"""
Convert a path to a file: URL. The path will be made absolute and have
quoted path parts.
Convert a path to a file: URL with quoted path parts. The path will be
normalized and made absolute if *normalize_path* is true (the default.)
"""
path = os.path.normpath(os.path.abspath(path))
url = urllib.parse.urljoin("file://", urllib.request.pathname2url(path))
return url
if normalize_path:
path = os.path.abspath(path)
if WINDOWS:
path = path.replace("\\", "/")

drive, tail = os.path.splitdrive(path)
if drive:
if drive[:4] == "//?/":
drive = drive[4:]
if drive[:4].upper() == "UNC/":
drive = "//" + drive[4:]
if drive[1:] == ":":
drive = "///" + drive
elif tail.startswith("/"):
tail = "//" + tail

encoding = sys.getfilesystemencoding()
errors = sys.getfilesystemencodeerrors()
drive = urllib.parse.quote(drive, "/:", encoding, errors)
tail = urllib.parse.quote(tail, "/", encoding, errors)
return "file:" + drive + tail


def url_to_path(url: str) -> str:
"""
Convert a file: URL to a path.
"""
assert url.startswith(
"file:"
scheme, netloc, path = urllib.parse.urlsplit(url)[:3]
assert scheme == "file" or scheme.endswith(
"+file"
), f"You can only turn file: urls into filenames (not {url!r})"

_, netloc, path, _, _ = urllib.parse.urlsplit(url)
if WINDOWS:
# e.g. file://c:/foo
if netloc[1:2] == ":":
path = netloc + path

# e.g. file://server/share/foo
elif netloc and netloc != "localhost":
path = "//" + netloc + path

# e.g. file://///server/share/foo
elif path[:3] == "///":
path = path[1:]

# e.g. file:///c:/foo
elif path[:1] == "/" and path[2:3] == ":":
path = path[1:]

if not netloc or netloc == "localhost":
# According to RFC 8089, same as empty authority.
netloc = ""
elif WINDOWS:
# If we have a UNC path, prepend UNC share notation.
netloc = "\\\\" + netloc
else:
path = path.replace("/", "\\")
elif netloc and netloc != "localhost":
raise ValueError(
f"non-local file URIs are not supported on this platform: {url!r}"
)

path = urllib.request.url2pathname(netloc + path)

# On Windows, urlsplit parses the path as something like "/C:/Users/foo".
# This creates issues for path-related functions like io.open(), so we try
# to detect and strip the leading slash.
if (
WINDOWS
and not netloc # Not UNC.
and len(path) >= 3
and path[0] == "/" # Leading slash to strip.
and path[1] in string.ascii_letters # Drive letter.
and path[2:4] in (":", ":/") # Colon + end of string, or colon + absolute path.
):
path = path[1:]

return path
encoding = sys.getfilesystemencoding()
errors = sys.getfilesystemencodeerrors()
return urllib.parse.unquote(path, encoding, errors)
17 changes: 0 additions & 17 deletions tests/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from io import BytesIO, StringIO
from textwrap import dedent
from typing import Any, AnyStr, Callable, Literal, Protocol, Union, cast
from urllib.request import pathname2url
from zipfile import ZipFile

import pytest
Expand Down Expand Up @@ -1364,19 +1363,3 @@ def __call__(


CertFactory = Callable[[], str]

# -------------------------------------------------------------------------
# Accommodations for Windows path and URL changes in recent Python releases
# -------------------------------------------------------------------------

# Trailing slashes are now preserved on Windows, matching POSIX behaviour.
# BPO: https://github.com/python/cpython/issues/126212
does_pathname2url_preserve_trailing_slash = pathname2url("C:\\foo\\").endswith("/")
skip_needs_new_pathname2url_trailing_slash_behavior_win = pytest.mark.skipif(
sys.platform != "win32" or not does_pathname2url_preserve_trailing_slash,
reason="testing windows (pathname2url) behavior for newer CPython",
)
skip_needs_old_pathname2url_trailing_slash_behavior_win = pytest.mark.skipif(
sys.platform != "win32" or does_pathname2url_preserve_trailing_slash,
reason="testing windows (pathname2url) behavior for older CPython",
)
36 changes: 13 additions & 23 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,13 @@
Link,
LinkHash,
MetadataFile,
_clean_file_url,
_clean_url_path,
_ensure_quoted_url,
)
from pip._internal.network.session import PipSession

from tests.lib import (
TestData,
make_test_link_collector,
skip_needs_new_pathname2url_trailing_slash_behavior_win,
skip_needs_old_pathname2url_trailing_slash_behavior_win,
)
from tests.lib import TestData, make_test_link_collector

ACCEPT = ", ".join(
[
Expand Down Expand Up @@ -296,31 +292,30 @@ def test_get_simple_response_dont_log_clear_text_password(
("a %2f b", "a%20%2F%20b"),
],
)
@pytest.mark.parametrize("is_local_path", [True, False])
def test_clean_url_path(path: str, expected: str, is_local_path: bool) -> None:
assert _clean_url_path(path, is_local_path=is_local_path) == expected
def test_clean_url_path(path: str, expected: str) -> None:
assert _clean_url_path(path) == expected


@pytest.mark.parametrize(
"path, expected",
"url, expected",
[
# Test a VCS path with a Windows drive letter and revision.
pytest.param(
"/T:/with space/[email protected]",
"/T:/with%20space/[email protected]",
"file:/T:/with space/[email protected]",
"file:///T:/with%20space/[email protected]",
marks=pytest.mark.skipif("sys.platform != 'win32'"),
),
# Test a VCS path with a Windows drive letter and revision,
# running on non-windows platform.
pytest.param(
"/T:/with space/[email protected]",
"/T%3A/with%20space/[email protected]",
"file:/T:/with space/[email protected]",
"file:///T%3A/with%20space/[email protected]",
marks=pytest.mark.skipif("sys.platform == 'win32'"),
),
],
)
def test_clean_url_path_with_local_path(path: str, expected: str) -> None:
actual = _clean_url_path(path, is_local_path=True)
def test_clean_file_url(url: str, expected: str) -> None:
actual = _clean_file_url(url)
assert actual == expected


Expand Down Expand Up @@ -387,16 +382,11 @@ def test_clean_url_path_with_local_path(path: str, expected: str) -> None:
),
# URL with Windows drive letter. The `:` after the drive
# letter should not be quoted. The trailing `/` should be
# removed.
pytest.param(
"file:///T:/path/with spaces/",
"file:///T:/path/with%20spaces",
marks=skip_needs_old_pathname2url_trailing_slash_behavior_win,
),
# retained.
pytest.param(
"file:///T:/path/with spaces/",
"file:///T:/path/with%20spaces/",
marks=skip_needs_new_pathname2url_trailing_slash_behavior_win,
marks=pytest.mark.skipif("sys.platform != 'win32'"),
),
# URL with Windows drive letter, running on non-windows
# platform. The `:` after the drive should be quoted.
Expand Down
Loading