Mailing List Archive

GH-114847: Speed up `posixpath.realpath()` (#114848)
https://github.com/python/cpython/commit/abfa16b44bb9426312613893b6e193b02ee0304f
commit: abfa16b44bb9426312613893b6e193b02ee0304f
branch: main
author: Barney Gale <barney.gale@gmail.com>
committer: barneygale <barney.gale@gmail.com>
date: 2024-04-05T12:35:01Z
summary:

GH-114847: Speed up `posixpath.realpath()` (#114848)

Apply the following optimizations to `posixpath.realpath()`:

- Remove use of recursion
- Construct child paths directly rather than using `join()`
- Use `os.getcwd()` / `os.getcwdb()` rather than `abspath()`
- Use `startswith(sep)` rather than `isabs()`
- Use slicing rather than `posixpath.split()` to find the parent directory

Co-authored-by: Petr Viktorin <encukou@gmail.com>

files:
A Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst
M Lib/posixpath.py
M Lib/test/test_posixpath.py

diff --git a/Lib/posixpath.py b/Lib/posixpath.py
index 76ee721bfb5e33..0e8bb5ab10d916 100644
--- a/Lib/posixpath.py
+++ b/Lib/posixpath.py
@@ -403,55 +403,66 @@ def realpath(filename, *, strict=False):
"""Return the canonical path of the specified filename, eliminating any
symbolic links encountered in the path."""
filename = os.fspath(filename)
- path, ok = _joinrealpath(filename[:0], filename, strict, {})
- return abspath(path)
-
-# Join two paths, normalizing and eliminating any symbolic links
-# encountered in the second path.
-# Two leading slashes are replaced by a single slash.
-def _joinrealpath(path, rest, strict, seen):
- if isinstance(path, bytes):
+ if isinstance(filename, bytes):
sep = b'/'
curdir = b'.'
pardir = b'..'
+ getcwd = os.getcwdb
else:
sep = '/'
curdir = '.'
pardir = '..'
+ getcwd = os.getcwd
+
+ # The stack of unresolved path parts. When popped, a special value of None
+ # indicates that a symlink target has been resolved, and that the original
+ # symlink path can be retrieved by popping again. The [::-1] slice is a
+ # very fast way of spelling list(reversed(...)).
+ rest = filename.split(sep)[::-1]
+
+ # The resolved path, which is absolute throughout this function.
+ # Note: getcwd() returns a normalized and symlink-free path.
+ path = sep if filename.startswith(sep) else getcwd()

- if rest.startswith(sep):
- rest = rest[1:]
- path = sep
+ # Mapping from symlink paths to *fully resolved* symlink targets. If a
+ # symlink is encountered but not yet resolved, the value is None. This is
+ # used both to detect symlink loops and to speed up repeated traversals of
+ # the same links.
+ seen = {}
+
+ # Whether we're calling lstat() and readlink() to resolve symlinks. If we
+ # encounter an OSError for a symlink loop in non-strict mode, this is
+ # switched off.
+ querying = True

while rest:
- name, _, rest = rest.partition(sep)
+ name = rest.pop()
+ if name is None:
+ # resolved symlink target
+ seen[rest.pop()] = path
+ continue
if not name or name == curdir:
# current dir
continue
if name == pardir:
# parent dir
- if path:
- parent, name = split(path)
- if name == pardir:
- # ../..
- path = join(path, pardir)
- else:
- # foo/bar/.. -> foo
- path = parent
- else:
- # ..
- path = pardir
+ path = path[:path.rindex(sep)] or sep
+ continue
+ if path == sep:
+ newpath = path + name
+ else:
+ newpath = path + sep + name
+ if not querying:
+ path = newpath
continue
- newpath = join(path, name)
try:
st = os.lstat(newpath)
+ if not stat.S_ISLNK(st.st_mode):
+ path = newpath
+ continue
except OSError:
if strict:
raise
- is_link = False
- else:
- is_link = stat.S_ISLNK(st.st_mode)
- if not is_link:
path = newpath
continue
# Resolve the symbolic link
@@ -467,14 +478,23 @@ def _joinrealpath(path, rest, strict, seen):
os.stat(newpath)
else:
# Return already resolved part + rest of the path unchanged.
- return join(newpath, rest), False
+ path = newpath
+ querying = False
+ continue
seen[newpath] = None # not resolved symlink
- path, ok = _joinrealpath(path, os.readlink(newpath), strict, seen)
- if not ok:
- return join(path, rest), False
- seen[newpath] = path # resolved symlink
+ target = os.readlink(newpath)
+ if target.startswith(sep):
+ # Symlink target is absolute; reset resolved path.
+ path = sep
+ # Push the symlink path onto the stack, and signal its specialness by
+ # also pushing None. When these entries are popped, we'll record the
+ # fully-resolved symlink target in the 'seen' mapping.
+ rest.append(newpath)
+ rest.append(None)
+ # Push the unresolved symlink target parts onto the stack.
+ rest.extend(target.split(sep)[::-1])

- return path, True
+ return path


supports_unicode_filenames = (sys.platform == 'darwin')
diff --git a/Lib/test/test_posixpath.py b/Lib/test/test_posixpath.py
index cbb7c4c52d9697..807f985f7f4df7 100644
--- a/Lib/test/test_posixpath.py
+++ b/Lib/test/test_posixpath.py
@@ -456,6 +456,15 @@ def test_realpath_relative(self):
finally:
os_helper.unlink(ABSTFN)

+ @os_helper.skip_unless_symlink
+ @skip_if_ABSTFN_contains_backslash
+ def test_realpath_missing_pardir(self):
+ try:
+ os.symlink(os_helper.TESTFN + "1", os_helper.TESTFN)
+ self.assertEqual(realpath("nonexistent/../" + os_helper.TESTFN), ABSTFN + "1")
+ finally:
+ os_helper.unlink(os_helper.TESTFN)
+
@os_helper.skip_unless_symlink
@skip_if_ABSTFN_contains_backslash
def test_realpath_symlink_loops(self):
diff --git a/Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst b/Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst
new file mode 100644
index 00000000000000..bf011fed3efdbc
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-02-01-08-09-20.gh-issue-114847.-JrWrR.rst
@@ -0,0 +1 @@
+Speed up :func:`os.path.realpath` on non-Windows platforms.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-leave@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: list-python-checkins@lists.gossamer-threads.com