Mailing List Archive

gh-102511: Speed up os.path.splitroot() with native helpers (GH-118089)
https://github.com/python/cpython/commit/10bb90ed49a81a525b126ce8e4d8564c1616d0b3
commit: 10bb90ed49a81a525b126ce8e4d8564c1616d0b3
branch: main
author: Nice Zombies <nineteendo19d0@gmail.com>
committer: zooba <steve.dower@microsoft.com>
date: 2024-04-25T10:07:38+01:00
summary:

gh-102511: Speed up os.path.splitroot() with native helpers (GH-118089)

files:
A Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst
M Include/internal/pycore_fileutils.h
M Lib/ntpath.py
M Lib/posixpath.py
M Lib/test/test_ntpath.py
M Modules/clinic/posixmodule.c.h
M Modules/posixmodule.c
M Python/fileutils.c

diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h
index 5c55282fa39e6f..bc8100b58e8ea3 100644
--- a/Include/internal/pycore_fileutils.h
+++ b/Include/internal/pycore_fileutils.h
@@ -290,6 +290,8 @@ extern wchar_t *_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t
extern HRESULT PathCchSkipRoot(const wchar_t *pszPath, const wchar_t **ppszRootEnd);
#endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */

+extern void _Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize, Py_ssize_t *rootsize);
+
// Macros to protect CRT calls against instant termination when passed an
// invalid parameter (bpo-23524). IPH stands for Invalid Parameter Handler.
// Usage:
diff --git a/Lib/ntpath.py b/Lib/ntpath.py
index aba18bfe407abf..e810b655e5ac85 100644
--- a/Lib/ntpath.py
+++ b/Lib/ntpath.py
@@ -167,56 +167,76 @@ def splitdrive(p):
return drive, root + tail


-def splitroot(p):
- """Split a pathname into drive, root and tail. The drive is defined
- exactly as in splitdrive(). On Windows, the root may be a single path
- separator or an empty string. The tail contains anything after the root.
- For example:
-
- splitroot('//server/share/') == ('//server/share', '/', '')
- splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
- splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
- splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
- """
- p = os.fspath(p)
- if isinstance(p, bytes):
- sep = b'\\'
- altsep = b'/'
- colon = b':'
- unc_prefix = b'\\\\?\\UNC\\'
- empty = b''
- else:
- sep = '\\'
- altsep = '/'
- colon = ':'
- unc_prefix = '\\\\?\\UNC\\'
- empty = ''
- normp = p.replace(altsep, sep)
- if normp[:1] == sep:
- if normp[1:2] == sep:
- # UNC drives, e.g. \\server\share or \\?\UNC\server\share
- # Device drives, e.g. \\.\device or \\?\device
- start = 8 if normp[:8].upper() == unc_prefix else 2
- index = normp.find(sep, start)
- if index == -1:
- return p, empty, empty
- index2 = normp.find(sep, index + 1)
- if index2 == -1:
- return p, empty, empty
- return p[:index2], p[index2:index2 + 1], p[index2 + 1:]
+try:
+ from nt import _path_splitroot_ex
+except ImportError:
+ def splitroot(p):
+ """Split a pathname into drive, root and tail. The drive is defined
+ exactly as in splitdrive(). On Windows, the root may be a single path
+ separator or an empty string. The tail contains anything after the root.
+ For example:
+
+ splitroot('//server/share/') == ('//server/share', '/', '')
+ splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
+ splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
+ splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
+ """
+ p = os.fspath(p)
+ if isinstance(p, bytes):
+ sep = b'\\'
+ altsep = b'/'
+ colon = b':'
+ unc_prefix = b'\\\\?\\UNC\\'
+ empty = b''
else:
- # Relative path with root, e.g. \Windows
- return empty, p[:1], p[1:]
- elif normp[1:2] == colon:
- if normp[2:3] == sep:
- # Absolute drive-letter path, e.g. X:\Windows
- return p[:2], p[2:3], p[3:]
+ sep = '\\'
+ altsep = '/'
+ colon = ':'
+ unc_prefix = '\\\\?\\UNC\\'
+ empty = ''
+ normp = p.replace(altsep, sep)
+ if normp[:1] == sep:
+ if normp[1:2] == sep:
+ # UNC drives, e.g. \\server\share or \\?\UNC\server\share
+ # Device drives, e.g. \\.\device or \\?\device
+ start = 8 if normp[:8].upper() == unc_prefix else 2
+ index = normp.find(sep, start)
+ if index == -1:
+ return p, empty, empty
+ index2 = normp.find(sep, index + 1)
+ if index2 == -1:
+ return p, empty, empty
+ return p[:index2], p[index2:index2 + 1], p[index2 + 1:]
+ else:
+ # Relative path with root, e.g. \Windows
+ return empty, p[:1], p[1:]
+ elif normp[1:2] == colon:
+ if normp[2:3] == sep:
+ # Absolute drive-letter path, e.g. X:\Windows
+ return p[:2], p[2:3], p[3:]
+ else:
+ # Relative path with drive, e.g. X:Windows
+ return p[:2], empty, p[2:]
else:
- # Relative path with drive, e.g. X:Windows
- return p[:2], empty, p[2:]
- else:
- # Relative path, e.g. Windows
- return empty, empty, p
+ # Relative path, e.g. Windows
+ return empty, empty, p
+else:
+ def splitroot(p):
+ """Split a pathname into drive, root and tail. The drive is defined
+ exactly as in splitdrive(). On Windows, the root may be a single path
+ separator or an empty string. The tail contains anything after the root.
+ For example:
+
+ splitroot('//server/share/') == ('//server/share', '/', '')
+ splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
+ splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
+ splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
+ """
+ p = os.fspath(p)
+ if isinstance(p, bytes):
+ drive, root, tail = _path_splitroot_ex(os.fsdecode(p))
+ return os.fsencode(drive), os.fsencode(root), os.fsencode(tail)
+ return _path_splitroot_ex(p)


# Split a path in head (everything up to the last '/') and tail (the
diff --git a/Lib/posixpath.py b/Lib/posixpath.py
index f1960ddb88e590..56b7915826daf4 100644
--- a/Lib/posixpath.py
+++ b/Lib/posixpath.py
@@ -134,33 +134,53 @@ def splitdrive(p):
return p[:0], p


-def splitroot(p):
- """Split a pathname into drive, root and tail. On Posix, drive is always
- empty; the root may be empty, a single slash, or two slashes. The tail
- contains anything after the root. For example:
-
- splitroot('foo/bar') == ('', '', 'foo/bar')
- splitroot('/foo/bar') == ('', '/', 'foo/bar')
- splitroot('//foo/bar') == ('', '//', 'foo/bar')
- splitroot('///foo/bar') == ('', '/', '//foo/bar')
- """
- p = os.fspath(p)
- if isinstance(p, bytes):
- sep = b'/'
- empty = b''
- else:
- sep = '/'
- empty = ''
- if p[:1] != sep:
- # Relative path, e.g.: 'foo'
- return empty, empty, p
- elif p[1:2] != sep or p[2:3] == sep:
- # Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
- return empty, sep, p[1:]
- else:
- # Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
- # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
- return empty, p[:2], p[2:]
+try:
+ from posix import _path_splitroot_ex
+except ImportError:
+ def splitroot(p):
+ """Split a pathname into drive, root and tail. On Posix, drive is always
+ empty; the root may be empty, a single slash, or two slashes. The tail
+ contains anything after the root. For example:
+
+ splitroot('foo/bar') == ('', '', 'foo/bar')
+ splitroot('/foo/bar') == ('', '/', 'foo/bar')
+ splitroot('//foo/bar') == ('', '//', 'foo/bar')
+ splitroot('///foo/bar') == ('', '/', '//foo/bar')
+ """
+ p = os.fspath(p)
+ if isinstance(p, bytes):
+ sep = b'/'
+ empty = b''
+ else:
+ sep = '/'
+ empty = ''
+ if p[:1] != sep:
+ # Relative path, e.g.: 'foo'
+ return empty, empty, p
+ elif p[1:2] != sep or p[2:3] == sep:
+ # Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
+ return empty, sep, p[1:]
+ else:
+ # Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
+ # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
+ return empty, p[:2], p[2:]
+else:
+ def splitroot(p):
+ """Split a pathname into drive, root and tail. On Posix, drive is always
+ empty; the root may be empty, a single slash, or two slashes. The tail
+ contains anything after the root. For example:
+
+ splitroot('foo/bar') == ('', '', 'foo/bar')
+ splitroot('/foo/bar') == ('', '/', 'foo/bar')
+ splitroot('//foo/bar') == ('', '//', 'foo/bar')
+ splitroot('///foo/bar') == ('', '/', '//foo/bar')
+ """
+ p = os.fspath(p)
+ if isinstance(p, bytes):
+ # Optimisation: the drive is always empty
+ _, root, tail = _path_splitroot_ex(os.fsdecode(p))
+ return b'', os.fsencode(root), os.fsencode(tail)
+ return _path_splitroot_ex(p)


# Return the tail (basename) part of a path, same as split(path)[1].
diff --git a/Lib/test/test_ntpath.py b/Lib/test/test_ntpath.py
index 31156130fcc747..7f91bf1c2b837a 100644
--- a/Lib/test/test_ntpath.py
+++ b/Lib/test/test_ntpath.py
@@ -374,6 +374,7 @@ def test_normpath(self):
tester("ntpath.normpath('\\\\foo\\')", '\\\\foo\\')
tester("ntpath.normpath('\\\\foo')", '\\\\foo')
tester("ntpath.normpath('\\\\')", '\\\\')
+ tester("ntpath.normpath('//?/UNC/server/share/..')", '\\\\?\\UNC\\server\\share\\')

def test_realpath_curdir(self):
expected = ntpath.normpath(os.getcwd())
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst
new file mode 100644
index 00000000000000..dfdf250710778e
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst
@@ -0,0 +1 @@
+Speed up :func:`os.path.splitroot` with a native implementation.
diff --git a/Modules/clinic/posixmodule.c.h b/Modules/clinic/posixmodule.c.h
index 0398629e3c10ce..a0d1f3238a6733 100644
--- a/Modules/clinic/posixmodule.c.h
+++ b/Modules/clinic/posixmodule.c.h
@@ -2248,6 +2248,64 @@ os__path_islink(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj

#endif /* defined(MS_WINDOWS) */

+PyDoc_STRVAR(os__path_splitroot_ex__doc__,
+"_path_splitroot_ex($module, /, path)\n"
+"--\n"
+"\n");
+
+#define OS__PATH_SPLITROOT_EX_METHODDEF \
+ {"_path_splitroot_ex", _PyCFunction_CAST(os__path_splitroot_ex), METH_FASTCALL|METH_KEYWORDS, os__path_splitroot_ex__doc__},
+
+static PyObject *
+os__path_splitroot_ex_impl(PyObject *module, PyObject *path);
+
+static PyObject *
+os__path_splitroot_ex(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+ PyObject *return_value = NULL;
+ #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+ #define NUM_KEYWORDS 1
+ static struct {
+ PyGC_Head _this_is_not_used;
+ PyObject_VAR_HEAD
+ PyObject *ob_item[NUM_KEYWORDS];
+ } _kwtuple = {
+ .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+ .ob_item = { &_Py_ID(path), },
+ };
+ #undef NUM_KEYWORDS
+ #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+ #else // !Py_BUILD_CORE
+ # define KWTUPLE NULL
+ #endif // !Py_BUILD_CORE
+
+ static const char * const _keywords[] = {"path", NULL};
+ static _PyArg_Parser _parser = {
+ .keywords = _keywords,
+ .fname = "_path_splitroot_ex",
+ .kwtuple = KWTUPLE,
+ };
+ #undef KWTUPLE
+ PyObject *argsbuf[1];
+ PyObject *path;
+
+ args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf);
+ if (!args) {
+ goto exit;
+ }
+ if (!PyUnicode_Check(args[0])) {
+ _PyArg_BadArgument("_path_splitroot_ex", "argument 'path'", "str", args[0]);
+ goto exit;
+ }
+ path = args[0];
+ return_value = os__path_splitroot_ex_impl(module, path);
+
+exit:
+ return return_value;
+}
+
PyDoc_STRVAR(os__path_normpath__doc__,
"_path_normpath($module, /, path)\n"
"--\n"
@@ -12602,4 +12660,4 @@ os__supports_virtual_terminal(PyObject *module, PyObject *Py_UNUSED(ignored))
#ifndef OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF
#define OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF
#endif /* !defined(OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF) */
-/*[clinic end generated code: output=511f0788a6b90db0 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=c4698b47007cd6eb input=a9049054013a1b77]*/
diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c
index 5e54cf64cd563e..c9d67ccbb8c908 100644
--- a/Modules/posixmodule.c
+++ b/Modules/posixmodule.c
@@ -5467,6 +5467,49 @@ os__path_islink_impl(PyObject *module, PyObject *path)
#endif /* MS_WINDOWS */


+/*[clinic input]
+os._path_splitroot_ex
+
+ path: unicode
+
+[clinic start generated code]*/
+
+static PyObject *
+os__path_splitroot_ex_impl(PyObject *module, PyObject *path)
+/*[clinic end generated code: output=de97403d3dfebc40 input=f1470e12d899f9ac]*/
+{
+ Py_ssize_t len, drvsize, rootsize;
+ PyObject *drv = NULL, *root = NULL, *tail = NULL, *result = NULL;
+
+ wchar_t *buffer = PyUnicode_AsWideCharString(path, &len);
+ if (!buffer) {
+ goto exit;
+ }
+
+ _Py_skiproot(buffer, len, &drvsize, &rootsize);
+ drv = PyUnicode_FromWideChar(buffer, drvsize);
+ if (drv == NULL) {
+ goto exit;
+ }
+ root = PyUnicode_FromWideChar(&buffer[drvsize], rootsize);
+ if (root == NULL) {
+ goto exit;
+ }
+ tail = PyUnicode_FromWideChar(&buffer[drvsize + rootsize],
+ len - drvsize - rootsize);
+ if (tail == NULL) {
+ goto exit;
+ }
+ result = Py_BuildValue("(OOO)", drv, root, tail);
+exit:
+ PyMem_Free(buffer);
+ Py_XDECREF(drv);
+ Py_XDECREF(root);
+ Py_XDECREF(tail);
+ return result;
+}
+
+
/*[clinic input]
os._path_normpath

@@ -16799,6 +16842,7 @@ static PyMethodDef posix_methods[] = {
OS__FINDFIRSTFILE_METHODDEF
OS__GETVOLUMEPATHNAME_METHODDEF
OS__PATH_SPLITROOT_METHODDEF
+ OS__PATH_SPLITROOT_EX_METHODDEF
OS__PATH_NORMPATH_METHODDEF
OS_GETLOADAVG_METHODDEF
OS_URANDOM_METHODDEF
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 882d3299575cf3..54853ba2f75d9d 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -2295,6 +2295,99 @@ PathCchCombineEx(wchar_t *buffer, size_t bufsize, const wchar_t *dirname,

#endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */

+void
+_Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize,
+ Py_ssize_t *rootsize)
+{
+ assert(drvsize);
+ assert(rootsize);
+#ifndef MS_WINDOWS
+#define IS_SEP(x) (*(x) == SEP)
+ *drvsize = 0;
+ if (!IS_SEP(&path[0])) {
+ // Relative path, e.g.: 'foo'
+ *rootsize = 0;
+ }
+ else if (!IS_SEP(&path[1]) || IS_SEP(&path[2])) {
+ // Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
+ *rootsize = 1;
+ }
+ else {
+ // Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
+ // https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
+ *rootsize = 2;
+ }
+#undef IS_SEP
+#else
+ const wchar_t *pEnd = size >= 0 ? &path[size] : NULL;
+#define IS_END(x) (pEnd ? (x) == pEnd : !*(x))
+#define IS_SEP(x) (*(x) == SEP || *(x) == ALTSEP)
+#define SEP_OR_END(x) (IS_SEP(x) || IS_END(x))
+ if (IS_SEP(&path[0])) {
+ if (IS_SEP(&path[1])) {
+ // Device drives, e.g. \\.\device or \\?\device
+ // UNC drives, e.g. \\server\share or \\?\UNC\server\share
+ Py_ssize_t idx;
+ if (path[2] == L'?' && IS_SEP(&path[3]) &&
+ (path[4] == L'U' || path[4] == L'u') &&
+ (path[5] == L'N' || path[5] == L'n') &&
+ (path[6] == L'C' || path[6] == L'c') &&
+ IS_SEP(&path[7]))
+ {
+ idx = 8;
+ }
+ else {
+ idx = 2;
+ }
+ while (!SEP_OR_END(&path[idx])) {
+ idx++;
+ }
+ if (IS_END(&path[idx])) {
+ *drvsize = idx;
+ *rootsize = 0;
+ }
+ else {
+ idx++;
+ while (!SEP_OR_END(&path[idx])) {
+ idx++;
+ }
+ *drvsize = idx;
+ if (IS_END(&path[idx])) {
+ *rootsize = 0;
+ }
+ else {
+ *rootsize = 1;
+ }
+ }
+ }
+ else {
+ // Relative path with root, e.g. \Windows
+ *drvsize = 0;
+ *rootsize = 1;
+ }
+ }
+ else if (!IS_END(&path[0]) && path[1] == L':') {
+ *drvsize = 2;
+ if (IS_SEP(&path[2])) {
+ // Absolute drive-letter path, e.g. X:\Windows
+ *rootsize = 1;
+ }
+ else {
+ // Relative path with drive, e.g. X:Windows
+ *rootsize = 0;
+ }
+ }
+ else {
+ // Relative path, e.g. Windows
+ *drvsize = 0;
+ *rootsize = 0;
+ }
+#undef SEP_OR_END
+#undef IS_SEP
+#undef IS_END
+#endif
+}
+
// The caller must ensure "buffer" is big enough.
static int
join_relfile(wchar_t *buffer, size_t bufsize,
@@ -2411,49 +2504,39 @@ _Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *normsize)
#endif
#define SEP_OR_END(x) (IS_SEP(x) || IS_END(x))

- // Skip leading '.\'
if (p1[0] == L'.' && IS_SEP(&p1[1])) {
+ // Skip leading '.\'
path = &path[2];
- while (IS_SEP(path) && !IS_END(path)) {
+ while (IS_SEP(path)) {
path++;
}
p1 = p2 = minP2 = path;
lastC = SEP;
}
+ else {
+ Py_ssize_t drvsize, rootsize;
+ _Py_skiproot(path, size, &drvsize, &rootsize);
+ if (drvsize || rootsize) {
+ // Skip past root and update minP2
+ p1 = &path[drvsize + rootsize];
+#ifndef ALTSEP
+ p2 = p1;
+#else
+ for (; p2 < p1; ++p2) {
+ if (*p2 == ALTSEP) {
+ *p2 = SEP;
+ }
+ }
+#endif
+ minP2 = p2 - 1;
+ lastC = *minP2;
#ifdef MS_WINDOWS
- // Skip past drive segment and update minP2
- else if (p1[0] && p1[1] == L':') {
- *p2++ = *p1++;
- *p2++ = *p1++;
- minP2 = p2;
- lastC = L':';
- }
- // Skip past all \\-prefixed paths, including \\?\, \\.\,
- // and network paths, including the first segment.
- else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1])) {
- int sepCount = 2;
- *p2++ = SEP;
- *p2++ = SEP;
- p1 += 2;
- for (; !IS_END(p1) && sepCount; ++p1) {
- if (IS_SEP(p1)) {
- --sepCount;
- *p2++ = lastC = SEP;
- } else {
- *p2++ = lastC = *p1;
+ if (lastC != SEP) {
+ minP2++;
}
+#endif
}
- minP2 = p2 - 1;
- }
-#else
- // Skip past two leading SEPs
- else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1]) && !IS_SEP(&p1[2])) {
- *p2++ = *p1++;
- *p2++ = *p1++;
- minP2 = p2 - 1; // Absolute path has SEP at minP2
- lastC = SEP;
}
-#endif /* MS_WINDOWS */

/* if pEnd is specified, check that. Else, check for null terminator */
for (; !IS_END(p1); ++p1) {

_______________________________________________
Python-checkins mailing list -- python-checkins@python.org
To unsubscribe send an email to python-checkins-leave@python.org
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: list-python-checkins@lists.gossamer-threads.com