From d1d1954d19b38589baabd5c256dc24747592c08e Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 18 Mar 2025 19:56:25 +0000 Subject: [PATCH 1/3] GH-126367: `url2pathname()`: handle NTFS alternate data streams Adjust `url2pathname()` to decode embedded colon characters in Windows URIs, rather than bailing out with an `OSError`. --- Doc/library/urllib.request.rst | 4 +++- Lib/nturl2path.py | 23 ++++++++----------- Lib/test/test_urllib.py | 6 +++-- ...-03-18-19-52-49.gh-issue-126367.PRxnuu.rst | 3 +++ 4 files changed, 19 insertions(+), 17 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-03-18-19-52-49.gh-issue-126367.PRxnuu.rst diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index b3efde3f189566..1df1c3221bf438 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -182,7 +182,9 @@ The :mod:`urllib.request` module defines the following functions: 'C:\\Program Files' .. versionchanged:: 3.14 - Windows drive letters are no longer converted to uppercase. + Windows drive letters are no longer converted to uppercase, and ``:`` + characters not following a drive letter no longer cause an + :exc:`OSError` exception to be raised on Windows. .. function:: getproxies() diff --git a/Lib/nturl2path.py b/Lib/nturl2path.py index 7e13ae3128333d..29dc5d7a0522ba 100644 --- a/Lib/nturl2path.py +++ b/Lib/nturl2path.py @@ -14,7 +14,7 @@ def url2pathname(url): # ///C:/foo/bar/spam.foo # become # C:\foo\bar\spam.foo - import string, urllib.parse + import urllib.parse if url[:3] == '///': # URL has an empty authority section, so the path begins on the third # character. @@ -25,19 +25,14 @@ def url2pathname(url): if url[:3] == '///': # Skip past extra slash before UNC drive in URL path. url = url[1:] - # Windows itself uses ":" even in URLs. - url = url.replace(':', '|') - if not '|' in url: - # No drive specifier, just convert slashes - # make sure not to convert quoted slashes :-) - return urllib.parse.unquote(url.replace('/', '\\')) - comp = url.split('|') - if len(comp) != 2 or comp[0][-1] not in string.ascii_letters: - error = 'Bad URL: ' + url - raise OSError(error) - drive = comp[0][-1] - tail = urllib.parse.unquote(comp[1].replace('/', '\\')) - return drive + ':' + tail + else: + if url[:1] == '/' and url[2:3] in ':|': + # Skip past extra slash before DOS drive in URL path. + url = url[1:] + if url[1:2] == '|': + # Older URLs use a pipe after a drive letter + url = url.replace('|', ':', 1) + return urllib.parse.unquote(url.replace('/', '\\')) def pathname2url(p): """OS-specific conversion from a file system path to a relative URL diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 4842428d6fd103..69f69c420495a5 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1502,8 +1502,10 @@ def test_url2pathname_win(self): self.assertEqual(fn('/C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') - # Non-ASCII drive letter - self.assertRaises(IOError, fn, "///\u00e8|/") + # Colons in URI + self.assertEqual(fn('///\u00e8|/'), '\u00e8:\\') + self.assertEqual(fn('//host/share/spam.txt:eggs'), '\\\\host\\share\\spam.txt:eggs') + self.assertEqual(fn('///c:/spam.txt:eggs'), 'c:\\spam.txt:eggs') # UNC paths self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') diff --git a/Misc/NEWS.d/next/Library/2025-03-18-19-52-49.gh-issue-126367.PRxnuu.rst b/Misc/NEWS.d/next/Library/2025-03-18-19-52-49.gh-issue-126367.PRxnuu.rst new file mode 100644 index 00000000000000..cebfefbda486f7 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-03-18-19-52-49.gh-issue-126367.PRxnuu.rst @@ -0,0 +1,3 @@ +Fix issue where :func:`urllib.request.url2pathname` raised :exc:`OSError` +when given a Windows URI containing a colon character not following a drive +letter, such as before an NTFS alternate data stream. From 7a8586ebe914e9836db25bf59371a9aa5bf08680 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 18 Mar 2025 22:51:50 +0000 Subject: [PATCH 2/3] Fix matching empty string --- Lib/nturl2path.py | 2 +- Lib/test/test_urllib.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/nturl2path.py b/Lib/nturl2path.py index 29dc5d7a0522ba..90dcba317c5ef6 100644 --- a/Lib/nturl2path.py +++ b/Lib/nturl2path.py @@ -26,7 +26,7 @@ def url2pathname(url): # Skip past extra slash before UNC drive in URL path. url = url[1:] else: - if url[:1] == '/' and url[2:3] in ':|': + if url[:1] == '/' and url[2:3] in (':', '|'): # Skip past extra slash before DOS drive in URL path. url = url[1:] if url[1:2] == '|': diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 69f69c420495a5..ed23215c4d0ab7 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1484,6 +1484,7 @@ def test_pathname2url_nonascii(self): 'test specific to Windows pathnames.') def test_url2pathname_win(self): fn = urllib.request.url2pathname + self.assertEqual(fn('/'), '\\') self.assertEqual(fn('/C:/'), 'C:\\') self.assertEqual(fn("///C|"), 'C:') self.assertEqual(fn("///C:"), 'C:') From c50f547045e1c7585d8e8ac15212fedc78bb9e2b Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 18 Mar 2025 23:02:42 +0000 Subject: [PATCH 3/3] Handle pipe replacement more explicitly --- Lib/nturl2path.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/nturl2path.py b/Lib/nturl2path.py index 90dcba317c5ef6..7b5b82068e989f 100644 --- a/Lib/nturl2path.py +++ b/Lib/nturl2path.py @@ -31,7 +31,7 @@ def url2pathname(url): url = url[1:] if url[1:2] == '|': # Older URLs use a pipe after a drive letter - url = url.replace('|', ':', 1) + url = url[:1] + ':' + url[2:] return urllib.parse.unquote(url.replace('/', '\\')) def pathname2url(p):