Mailing List Archive: GH-117586: Speed up `pathlib.Path.glob()` by working with strings (#117589)

https://github.com/python/cpython/commit/6258844c27e3b5a43816e7c559089a5fe0a47123
commit: 6258844c27e3b5a43816e7c559089a5fe0a47123
branch: main
author: Barney Gale <barney.gale@gmail.com>
committer: barneygale <barney.gale@gmail.com>
date: 2024-04-10T20:43:07+01:00
summary:

GH-117586: Speed up `pathlib.Path.glob()` by working with strings (#117589)

Move pathlib globbing implementation into a new private class: `glob._Globber`. This class implements fast string-based globbing. It's called by `pathlib.Path.glob()`, which then converts strings back to path objects.

In the private pathlib ABCs, add a `pathlib._abc.Globber` subclass that works with `PathBase` objects rather than strings, and calls user-defined path methods like `PathBase.lstat()` rather than `os.lstat()`.

This sets the stage for two more improvements:

- GH-115060: Query non-wildcard segments with `lstat()`
- GH-116380: Unify `pathlib` and `glob` implementations of globbing.

No change to the implementations of `glob.glob()` and `glob.iglob()`.

files:
A Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst
M Lib/glob.py
M Lib/pathlib/__init__.py
M Lib/pathlib/_abc.py

diff --git a/Lib/glob.py b/Lib/glob.py
index a915cf0bdf4502..62cf0394e921d7 100644
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -4,7 +4,9 @@
import os
import re
import fnmatch
+import functools
import itertools
+import operator
import stat
import sys

@@ -256,7 +258,9 @@ def escape(pathname):
return drive + pathname

+_special_parts = ('', '.', '..')
_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
+_no_recurse_symlinks = object()

def translate(pat, *, recursive=False, include_hidden=False, seps=None):
@@ -312,3 +316,185 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
results.append(any_sep)
res = ''.join(results)
return fr'(?s:{res})\Z'
+
+
+@functools.lru_cache(maxsize=512)
+def _compile_pattern(pat, sep, case_sensitive, recursive=True):
+ """Compile given glob pattern to a re.Pattern object (observing case
+ sensitivity)."""
+ flags = re.NOFLAG if case_sensitive else re.IGNORECASE
+ regex = translate(pat, recursive=recursive, include_hidden=True, seps=sep)
+ return re.compile(regex, flags=flags).match
+
+
+class _Globber:
+ """Class providing shell-style pattern matching and globbing.
+ """
+
+ def __init__(self, sep, case_sensitive, recursive=False):
+ self.sep = sep
+ self.case_sensitive = case_sensitive
+ self.recursive = recursive
+
+ # Low-level methods
+
+ lstat = staticmethod(os.lstat)
+ scandir = staticmethod(os.scandir)
+ parse_entry = operator.attrgetter('path')
+ concat_path = operator.add
+
+ if os.name == 'nt':
+ @staticmethod
+ def add_slash(pathname):
+ tail = os.path.splitroot(pathname)[2]
+ if not tail or tail[-1] in '\\/':
+ return pathname
+ return f'{pathname}\\'
+ else:
+ @staticmethod
+ def add_slash(pathname):
+ if not pathname or pathname[-1] == '/':
+ return pathname
+ return f'{pathname}/'
+
+ # High-level methods
+
+ def compile(self, pat):
+ return _compile_pattern(pat, self.sep, self.case_sensitive, self.recursive)
+
+ def selector(self, parts):
+ """Returns a function that selects from a given path, walking and
+ filtering according to the glob-style pattern parts in *parts*.
+ """
+ if not parts:
+ return self.select_exists
+ part = parts.pop()
+ if self.recursive and part == '**':
+ selector = self.recursive_selector
+ elif part in _special_parts:
+ selector = self.special_selector
+ else:
+ selector = self.wildcard_selector
+ return selector(part, parts)
+
+ def special_selector(self, part, parts):
+ """Returns a function that selects special children of the given path.
+ """
+ select_next = self.selector(parts)
+
+ def select_special(path, exists=False):
+ path = self.concat_path(self.add_slash(path), part)
+ return select_next(path, exists)
+ return select_special
+
+ def wildcard_selector(self, part, parts):
+ """Returns a function that selects direct children of a given path,
+ filtering by pattern.
+ """
+
+ match = None if part == '*' else self.compile(part)
+ dir_only = bool(parts)
+ if dir_only:
+ select_next = self.selector(parts)
+
+ def select_wildcard(path, exists=False):
+ try:
+ # We must close the scandir() object before proceeding to
+ # avoid exhausting file descriptors when globbing deep trees.
+ with self.scandir(path) as scandir_it:
+ entries = list(scandir_it)
+ except OSError:
+ pass
+ else:
+ for entry in entries:
+ if match is None or match(entry.name):
+ if dir_only:
+ try:
+ if not entry.is_dir():
+ continue
+ except OSError:
+ continue
+ entry_path = self.parse_entry(entry)
+ if dir_only:
+ yield from select_next(entry_path, exists=True)
+ else:
+ yield entry_path
+ return select_wildcard
+
+ def recursive_selector(self, part, parts):
+ """Returns a function that selects a given path and all its children,
+ recursively, filtering by pattern.
+ """
+ # Optimization: consume following '**' parts, which have no effect.
+ while parts and parts[-1] == '**':
+ parts.pop()
+
+ # Optimization: consume and join any following non-special parts here,
+ # rather than leaving them for the next selector. They're used to
+ # build a regular expression, which we use to filter the results of
+ # the recursive walk. As a result, non-special pattern segments
+ # following a '**' wildcard don't require additional filesystem access
+ # to expand.
+ follow_symlinks = self.recursive is not _no_recurse_symlinks
+ if follow_symlinks:
+ while parts and parts[-1] not in _special_parts:
+ part += self.sep + parts.pop()
+
+ match = None if part == '**' else self.compile(part)
+ dir_only = bool(parts)
+ select_next = self.selector(parts)
+
+ def select_recursive(path, exists=False):
+ path = self.add_slash(path)
+ match_pos = len(str(path))
+ if match is None or match(str(path), match_pos):
+ yield from select_next(path, exists)
+ stack = [path]
+ while stack:
+ yield from select_recursive_step(stack, match_pos)
+
+ def select_recursive_step(stack, match_pos):
+ path = stack.pop()
+ try:
+ # We must close the scandir() object before proceeding to
+ # avoid exhausting file descriptors when globbing deep trees.
+ with self.scandir(path) as scandir_it:
+ entries = list(scandir_it)
+ except OSError:
+ pass
+ else:
+ for entry in entries:
+ is_dir = False
+ try:
+ if entry.is_dir(follow_symlinks=follow_symlinks):
+ is_dir = True
+ except OSError:
+ pass
+
+ if is_dir or not dir_only:
+ entry_path = self.parse_entry(entry)
+ if match is None or match(str(entry_path), match_pos):
+ if dir_only:
+ yield from select_next(entry_path, exists=True)
+ else:
+ # Optimization: directly yield the path if this is
+ # last pattern part.
+ yield entry_path
+ if is_dir:
+ stack.append(entry_path)
+
+ return select_recursive
+
+ def select_exists(self, path, exists=False):
+ """Yields the given path, if it exists.
+ """
+ if exists:
+ # Optimization: this path is already known to exist, e.g. because
+ # it was returned from os.scandir(), so we skip calling lstat().
+ yield path
+ else:
+ try:
+ self.lstat(path)
+ yield path
+ except OSError:
+ pass
diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py
index 747000f1a43475..88e3286d9b08dc 100644
--- a/Lib/pathlib/__init__.py
+++ b/Lib/pathlib/__init__.py
@@ -5,8 +5,10 @@
operating systems.
"""

+import glob
import io
import ntpath
+import operator
import os
import posixpath
import sys
@@ -111,6 +113,7 @@ class PurePath(_abc.PurePathBase):
'_hash',
)
parser = os.path
+ _globber = glob._Globber

def __new__(cls, *args, **kwargs):
"""Construct a PurePath from one or several strings and or existing
@@ -253,14 +256,17 @@ def _format_parsed_parts(cls, drv, root, tail):
return cls.parser.sep.join(tail)

def _from_parsed_parts(self, drv, root, tail):
- path_str = self._format_parsed_parts(drv, root, tail)
- path = self.with_segments(path_str)
- path._str = path_str or '.'
+ path = self._from_parsed_string(self._format_parsed_parts(drv, root, tail))
path._drv = drv
path._root = root
path._tail_cached = tail
return path

+ def _from_parsed_string(self, path_str):
+ path = self.with_segments(path_str)
+ path._str = path_str or '.'
+ return path
+
@classmethod
def _parse_path(cls, path):
if not path:
@@ -453,21 +459,6 @@ def as_uri(self):
from urllib.parse import quote_from_bytes
return prefix + quote_from_bytes(os.fsencode(path))

- @property
- def _pattern_stack(self):
- """Stack of path components, to be used with patterns in glob()."""
- parts = self._tail.copy()
- pattern = self._raw_path
- if self.anchor:
- raise NotImplementedError("Non-relative patterns are unsupported")
- elif not parts:
- raise ValueError("Unacceptable pattern: {!r}".format(pattern))
- elif pattern[-1] in (self.parser.sep, self.parser.altsep):
- # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
- parts.append('')
- parts.reverse()
- return parts
-
@property
def _pattern_str(self):
"""The path expressed as a string, for use in pattern-matching."""
@@ -576,6 +567,17 @@ def write_text(self, data, encoding=None, errors=None, newline=None):
encoding = io.text_encoding(encoding)
return _abc.PathBase.write_text(self, data, encoding, errors, newline)

+ _remove_leading_dot = operator.itemgetter(slice(2, None))
+ _remove_trailing_slash = operator.itemgetter(slice(-1))
+
+ def _filter_trailing_slash(self, paths):
+ sep = self.parser.sep
+ anchor_len = len(self.anchor)
+ for path_str in paths:
+ if len(path_str) > anchor_len and path_str[-1] == sep:
+ path_str = path_str[:-1]
+ yield path_str
+
def iterdir(self):
"""Yield path objects of the directory contents.

@@ -587,13 +589,9 @@ def iterdir(self):
def _scandir(self):
return os.scandir(self)

- def _direntry_str(self, entry):
- # Transform an entry yielded from _scandir() into a path string.
- return entry.name if str(self) == '.' else entry.path
-
def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
- path_str = self._direntry_str(entry)
+ path_str = entry.name if str(self) == '.' else entry.path
path = self.with_segments(path_str)
path._str = path_str
path._drv = self.drive
@@ -626,8 +624,30 @@ def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
sys.audit("pathlib.Path.glob", self, pattern)
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
- return _abc.PathBase.glob(
- self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
+ if pattern.anchor:
+ raise NotImplementedError("Non-relative patterns are unsupported")
+ parts = pattern._tail.copy()
+ if not parts:
+ raise ValueError("Unacceptable pattern: {!r}".format(pattern))
+ raw = pattern._raw_path
+ if raw[-1] in (self.parser.sep, self.parser.altsep):
+ # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
+ parts.append('')
+ if not self.is_dir():
+ return iter([])
+ select = self._glob_selector(parts[::-1], case_sensitive, recurse_symlinks)
+ root = str(self)
+ paths = select(root, exists=True)
+
+ # Normalize results
+ if root == '.':
+ paths = map(self._remove_leading_dot, paths)
+ if parts[-1] == '':
+ paths = map(self._remove_trailing_slash, paths)
+ elif parts[-1] == '**':
+ paths = self._filter_trailing_slash(paths)
+ paths = map(self._from_parsed_string, paths)
+ return paths

def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
"""Recursively yield all existing files (of any kind, including
@@ -638,8 +658,7 @@ def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=False):
if not isinstance(pattern, PurePath):
pattern = self.with_segments(pattern)
pattern = '**' / pattern
- return _abc.PathBase.glob(
- self, pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)
+ return self.glob(pattern, case_sensitive=case_sensitive, recurse_symlinks=recurse_symlinks)

def walk(self, top_down=True, on_error=None, follow_symlinks=False):
"""Walk the directory tree from this directory, similar to os.walk()."""
@@ -669,9 +688,7 @@ def absolute(self):
# of joining, and we exploit the fact that getcwd() returns a
# fully-normalized string by storing it in _str. This is used to
# implement Path.cwd().
- result = self.with_segments(cwd)
- result._str = cwd
- return result
+ return self._from_parsed_string(cwd)
drive, root, rel = os.path.splitroot(cwd)
if not rel:
return self._from_parsed_parts(drive, root, self._tail)
diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py
index ca38a51d072cfb..553f797d75e793 100644
--- a/Lib/pathlib/_abc.py
+++ b/Lib/pathlib/_abc.py
@@ -12,6 +12,8 @@
"""

import functools
+import glob
+import operator
from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL
from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO

@@ -40,109 +42,23 @@ def _ignore_error(exception):
def _is_case_sensitive(parser):
return parser.normcase('Aa') == 'Aa'

-#
-# Globbing helpers
-#
-
-re = glob = None
-
-
-@functools.lru_cache(maxsize=512)
-def _compile_pattern(pat, sep, case_sensitive, recursive=True):
- """Compile given glob pattern to a re.Pattern object (observing case
- sensitivity)."""
- global re, glob
- if re is None:
- import re, glob
-
- flags = re.NOFLAG if case_sensitive else re.IGNORECASE
- regex = glob.translate(pat, recursive=recursive, include_hidden=True, seps=sep)
- return re.compile(regex, flags=flags).match

+class Globber(glob._Globber):
+ lstat = operator.methodcaller('lstat')
+ scandir = operator.methodcaller('_scandir')
+ add_slash = operator.methodcaller('joinpath', '')

-def _select_special(paths, part):
- """Yield special literal children of the given paths."""
- for path in paths:
- yield path._make_child_relpath(part)
-
-
-def _select_children(parent_paths, dir_only, match):
- """Yield direct children of given paths, filtering by name and type."""
- for parent_path in parent_paths:
- try:
- # We must close the scandir() object before proceeding to
- # avoid exhausting file descriptors when globbing deep trees.
- with parent_path._scandir() as scandir_it:
- entries = list(scandir_it)
- except OSError:
- pass
- else:
- for entry in entries:
- if dir_only:
- try:
- if not entry.is_dir():
- continue
- except OSError:
- continue
- # Avoid cost of making a path object for non-matching paths by
- # matching against the os.DirEntry.name string.
- if match is None or match(entry.name):
- yield parent_path._make_child_direntry(entry)
-
+ @staticmethod
+ def concat_path(path, text):
+ """Appends text to the given path.
+ """
+ return path.with_segments(path._raw_path + text)

-def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
- """Yield given paths and all their children, recursively, filtering by
- string and type.
- """
- for parent_path in parent_paths:
- if match is not None:
- # If we're filtering paths through a regex, record the length of
- # the parent path. We'll pass it to match(path, pos=...) later.
- parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
- paths = [parent_path._make_child_relpath('')]
- while paths:
- path = paths.pop()
- if match is None or match(str(path), parent_len):
- # Yield *directory* path that matches pattern (if any).
- yield path
- try:
- # We must close the scandir() object before proceeding to
- # avoid exhausting file descriptors when globbing deep trees.
- with path._scandir() as scandir_it:
- entries = list(scandir_it)
- except OSError:
- pass
- else:
- for entry in entries:
- # Handle directory entry.
- try:
- if entry.is_dir(follow_symlinks=follow_symlinks):
- # Recurse into this directory.
- paths.append(path._make_child_direntry(entry))
- continue
- except OSError:
- pass
-
- # Handle file entry.
- if not dir_only:
- # Avoid cost of making a path object for non-matching
- # files by matching against the os.DirEntry object.
- if match is None or match(path._direntry_str(entry), parent_len):
- # Yield *file* path that matches pattern (if any).
- yield path._make_child_direntry(entry)
-
-
-def _select_unique(paths):
- """Yields the given paths, filtering out duplicates."""
- yielded = set()
- try:
- for path in paths:
- path_str = str(path)
- if path_str not in yielded:
- yield path
- yielded.add(path_str)
- finally:
- yielded.clear()
+ @staticmethod
+ def parse_entry(entry):
+ """Returns the path of an entry yielded from scandir().
+ """
+ return entry

class UnsupportedOperation(NotImplementedError):
@@ -218,6 +134,7 @@ class PurePathBase:
'_resolving',
)
parser = ParserBase()
+ _globber = Globber

def __init__(self, path, *paths):
self._raw_path = self.parser.join(path, *paths) if paths else path
@@ -454,14 +371,6 @@ def is_absolute(self):
a drive)."""
return self.parser.isabs(self._raw_path)

- @property
- def _pattern_stack(self):
- """Stack of path components, to be used with patterns in glob()."""
- anchor, parts = self._stack
- if anchor:
- raise NotImplementedError("Non-relative patterns are unsupported")
- return parts
-
@property
def _pattern_str(self):
"""The path expressed as a string, for use in pattern-matching."""
@@ -487,8 +396,9 @@ def match(self, path_pattern, *, case_sensitive=None):
return False
if len(path_parts) > len(pattern_parts) and path_pattern.anchor:
return False
+ globber = self._globber(sep, case_sensitive)
for path_part, pattern_part in zip(path_parts, pattern_parts):
- match = _compile_pattern(pattern_part, sep, case_sensitive, recursive=False)
+ match = globber.compile(pattern_part)
if match(path_part) is None:
return False
return True
@@ -502,7 +412,8 @@ def full_match(self, pattern, *, case_sensitive=None):
pattern = self.with_segments(pattern)
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self.parser)
- match = _compile_pattern(pattern._pattern_str, pattern.parser.sep, case_sensitive)
+ globber = self._globber(pattern.parser.sep, case_sensitive, recursive=True)
+ match = globber.compile(pattern._pattern_str)
return match(self._pattern_str) is not None

@@ -772,11 +683,6 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())

- def _direntry_str(self, entry):
- # Transform an entry yielded from _scandir() into a path string.
- # PathBase._scandir() yields PathBase objects, so use str().
- return str(entry)
-
def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
# PathBase._scandir() yields PathBase objects, so this is a no-op.
@@ -785,62 +691,26 @@ def _make_child_direntry(self, entry):
def _make_child_relpath(self, name):
return self.joinpath(name)

+ def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
+ if case_sensitive is None:
+ case_sensitive = _is_case_sensitive(self.parser)
+ recursive = True if recurse_symlinks else glob._no_recurse_symlinks
+ globber = self._globber(self.parser.sep, case_sensitive, recursive)
+ return globber.selector(parts)
+
def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
"""Iterate over this subtree and yield all existing files (of any
kind, including directories) matching the given relative pattern.
"""
if not isinstance(pattern, PurePathBase):
pattern = self.with_segments(pattern)
- if case_sensitive is None:
- # TODO: evaluate case-sensitivity of each directory in _select_children().
- case_sensitive = _is_case_sensitive(self.parser)
-
- stack = pattern._pattern_stack
- specials = ('', '.', '..')
- deduplicate_paths = False
- sep = self.parser.sep
- paths = iter([self] if self.is_dir() else [])
- while stack:
- part = stack.pop()
- if part in specials:
- # Join special component (e.g. '..') onto paths.
- paths = _select_special(paths, part)
-
- elif part == '**':
- # Consume following '**' components, which have no effect.
- while stack and stack[-1] == '**':
- stack.pop()
-
- # Consume following non-special components, provided we're
- # treating symlinks consistently. Each component is joined
- # onto 'part', which is used to generate an re.Pattern object.
- if recurse_symlinks:
- while stack and stack[-1] not in specials:
- part += sep + stack.pop()
-
- # If the previous loop consumed pattern components, compile an
- # re.Pattern object based on those components.
- match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None
-
- # Recursively walk directories, filtering by type and regex.
- paths = _select_recursive(paths, bool(stack), recurse_symlinks, match)
-
- # De-duplicate if we've already seen a '**' component.
- if deduplicate_paths:
- paths = _select_unique(paths)
- deduplicate_paths = True
-
- elif '**' in part:
- raise ValueError("Invalid pattern: '**' can only be an entire path component")
-
- else:
- # If the pattern component isn't '*', compile an re.Pattern
- # object based on the component.
- match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None
-
- # Iterate over directories' children filtering by type and regex.
- paths = _select_children(paths, bool(stack), match)
- return paths
+ anchor, parts = pattern._stack
+ if anchor:
+ raise NotImplementedError("Non-relative patterns are unsupported")
+ if not self.is_dir():
+ return iter([])
+ select = self._glob_selector(parts, case_sensitive, recurse_symlinks)
+ return select(self, exists=True)

def rglob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
"""Recursively yield all existing files (of any kind, including
diff --git a/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst b/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst
new file mode 100644
index 00000000000000..65c699977bd807
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-04-06-20-31-09.gh-issue-117586.UgWdRK.rst
@@ -0,0 +1 @@
+Speed up :meth:`pathlib.Path.glob` by working with strings internally.

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-leave@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: list-python-checkins@lists.gossamer-threads.com