diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 84fee05cb4ce20..de6829eb92c5dc 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -307,6 +307,66 @@ These APIs can be used for fast direct character conversions: possible. This function does not raise exceptions. +.. c:function:: Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to lower case, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be lower cased, and + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be lowercased into a buffer of *size* ``2``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to upper case, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be upper cased, and + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be uppercased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. c:function:: Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Convert *ch* to title case, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be title cased, and + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be titlecased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + + +.. 
c:function:: Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *buffer, Py_ssize_t size) + + Foldcase *ch*, store result in *buffer*, which should be + able to hold as many characters as needed for *ch* to be foldcased, and + return the number of characters stored. If at some point a buffer overflow + is detected, a :exc:`ValueError` is raised and ``-1`` is returned. + + In Unicode 16.0, any character can be foldcased into a buffer of *size* ``3``. + See also :c:macro:`PyUCS4_CASE_CONVERSION_BUFFER_SIZE`. + + .. versionadded:: next + +.. c:macro:: PyUCS4_CASE_CONVERSION_BUFFER_SIZE + + The minimum buffer size needed for any call to :c:func:`PyUCS4_ToLower`, + :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle`, or + :c:func:`PyUCS4_ToFolded`. That is, ``3`` for Unicode 16.0. + + .. versionadded:: next + + These APIs can be used to work with surrogates: .. c:function:: int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index 73e3bc44d6c9ca..662e3f5ab06dcf 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -733,6 +733,31 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha( Py_UCS4 ch /* Unicode character */ ); +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToLower( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToUpper( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToTitle( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + +PyAPI_FUNC(Py_ssize_t) PyUCS4_ToFolded( + Py_UCS4 ch, /* Unicode character */ + Py_UCS4 *res, /* Output buffer */ + Py_ssize_t size /* Buffer size */ + ); + + // Helper array used by Py_UNICODE_ISSPACE(). 
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[]; @@ -767,6 +792,8 @@ static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) { #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch) +#define PyUCS4_CASE_CONVERSION_BUFFER_SIZE 3 + static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) { return (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 3791b913c17546..c83a221bb6a3a1 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -15,10 +15,6 @@ extern "C" { extern int _PyUnicode_IsXidStart(Py_UCS4 ch); extern int _PyUnicode_IsXidContinue(Py_UCS4 ch); -extern int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res); -extern int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res); extern int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch); extern int _PyUnicode_IsCased(Py_UCS4 ch); diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py index 6a9c60f3a6d75e..3a5d1a0053f351 100644 --- a/Lib/test/test_capi/test_unicode.py +++ b/Lib/test/test_capi/test_unicode.py @@ -1753,6 +1753,65 @@ def test_GET_CACHED_HASH(self): # impl detail: ASCII string hashes are equal to bytes ones self.assertEqual(unicode_GET_CACHED_HASH(obj), hash(content_bytes)) + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tolower(self): + import string + from _testcapi import unicode_tolower + + for i, c in enumerate(string.ascii_uppercase): + with self.subTest(c): + self.assertEqual(unicode_tolower(c), string.ascii_lowercase[i]) + + # Test unicode character + self.assertEqual(unicode_tolower("Č"), "č") + self.assertEqual(unicode_tolower("Σ"), "σ") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_toupper(self): + import string + 
from _testcapi import unicode_toupper, unicode_toupper_buffer_too_small + + for i, c in enumerate(string.ascii_lowercase): + with self.subTest(c): + self.assertEqual(unicode_toupper(c), string.ascii_uppercase[i]) + + # Test unicode character + self.assertEqual(unicode_toupper("č"), "Č") + self.assertEqual(unicode_toupper("ß"), "SS") + self.assertEqual(unicode_toupper("ΐ"), "Ϊ́") + + # Test unicode character with smaller buffer + with self.assertRaisesRegex(ValueError, "output buffer is too small"): + unicode_toupper_buffer_too_small("ß") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_totitle(self): + from _testcapi import unicode_totitle + + self.assertEqual(unicode_totitle("t"), "T") + + # Test unicode character + self.assertEqual(unicode_totitle("ł"), "Ł") + self.assertEqual(unicode_totitle("ß"), "Ss") + self.assertEqual(unicode_totitle("ΐ"), "Ϊ́") + + @support.cpython_only + @unittest.skipIf(_testcapi is None, 'need _testcapi module') + def test_tofolded(self): + from _testcapi import unicode_tofolded + + self.assertEqual(unicode_tofolded("T"), "t") + + # Test unicode character + self.assertEqual(unicode_tofolded("Ł"), "ł") + self.assertEqual(unicode_tofolded("Σ"), "σ") + + # Test case-ignorable character + self.assertEqual(unicode_tofolded("👍"), "👍") + class PyUnicodeWriterTest(unittest.TestCase): def create_writer(self, size): diff --git a/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst new file mode 100644 index 00000000000000..37d251b6e35d8f --- /dev/null +++ b/Misc/NEWS.d/next/C_API/2025-07-01-14-56-41.gh-issue-76535.9cwObj.rst @@ -0,0 +1 @@ +Make :c:func:`PyUCS4_ToLower`, :c:func:`PyUCS4_ToUpper`, :c:func:`PyUCS4_ToTitle` and :c:func:`PyUCS4_ToFolded` public. 
diff --git a/Modules/_testcapi/unicode.c b/Modules/_testcapi/unicode.c index 203282dd53dd0a..c3106f0fcb8543 100644 --- a/Modules/_testcapi/unicode.c +++ b/Modules/_testcapi/unicode.c @@ -220,6 +220,72 @@ unicode_copycharacters(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", to_copy, copied); } +static PyObject * +unicode_case_operation(PyObject *str, Py_ssize_t (*function)(Py_UCS4, Py_UCS4 *, Py_ssize_t), + Py_UCS4 *buf, Py_ssize_t size) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str type, got %T", str); + return NULL; + } + + if (PyUnicode_GET_LENGTH(str) != 1) { + PyErr_SetString(PyExc_ValueError, "expecting 1-character strings only"); + return NULL; + } + + Py_UCS4 c = PyUnicode_READ_CHAR(str, 0); + + Py_ssize_t chars = function(c, buf, size); + if (chars < 0) { + return NULL; + } + + return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, buf, chars); +} + +/* Test PyUCS4_ToLower() */ +static PyObject * +unicode_tolower(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToLower, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + + +/* Test PyUCS4_ToUpper() */ +static PyObject * +unicode_toupper(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToUpper, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + +/* Test PyUCS4_ToUpper() with a small buffer */ +static PyObject * +unicode_toupper_buffer_too_small(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf; + return unicode_case_operation(arg, PyUCS4_ToUpper, &buf, 1); +} + +/* Test PyUCS4_ToTitle() */ +static PyObject * +unicode_totitle(PyObject *self, PyObject *arg) +{ + Py_UCS4 buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToTitle, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + +/* Test PyUCS4_ToFolded() */ +static PyObject * +unicode_tofolded(PyObject *self, PyObject *arg) +{ + Py_UCS4 
buf[PyUCS4_CASE_CONVERSION_BUFFER_SIZE]; + return unicode_case_operation(arg, PyUCS4_ToFolded, buf, PyUCS4_CASE_CONVERSION_BUFFER_SIZE); +} + + static PyObject* unicode_GET_CACHED_HASH(PyObject *self, PyObject *arg) { @@ -577,6 +643,11 @@ static PyMethodDef TestMethods[] = { {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_GET_CACHED_HASH", unicode_GET_CACHED_HASH, METH_O}, + {"unicode_tolower", unicode_tolower, METH_O}, + {"unicode_toupper", unicode_toupper, METH_O}, + {"unicode_toupper_buffer_too_small", unicode_toupper_buffer_too_small, METH_O}, + {"unicode_totitle", unicode_totitle, METH_O}, + {"unicode_tofolded", unicode_tofolded, METH_O}, {NULL}, }; diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c index 7cd0dca3d13545..aacfc316e2b960 100644 --- a/Objects/unicodectype.c +++ b/Objects/unicodectype.c @@ -198,67 +198,103 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) return ch + ctype->lower; } -int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToLower(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->lower & 0xFFFF; int n = ctype->lower >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->lower; return 1; } -int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToTitle(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->title & 0xFFFF; int n = ctype->title >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return 
-1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->title; return 1; } -int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToUpper(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK) { int index = ctype->upper & 0xFFFF; int n = ctype->upper >> 24; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } + + if (size < 1) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } res[0] = ch + ctype->upper; return 1; } -int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res) +Py_ssize_t PyUCS4_ToFolded(Py_UCS4 ch, Py_UCS4 *res, Py_ssize_t size) { const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) { int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24); int n = (ctype->lower >> 20) & 7; + if (n > size) { + PyErr_SetString(PyExc_ValueError, "output buffer is too small"); + return -1; + } + int i; for (i = 0; i < n; i++) res[i] = _PyUnicode_ExtendedCase[index + i]; return n; } - return _PyUnicode_ToLowerFull(ch, res); + + return PyUCS4_ToLower(ch, res, size); } int _PyUnicode_IsCased(Py_UCS4 ch) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8df7a48284dccd..c54d95cf9226fa 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10036,34 +10036,35 @@ handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i return (final_sigma) ? 
0x3C2 : 0x3C3; } -static int +static Py_ssize_t lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i, - Py_UCS4 c, Py_UCS4 *mapped) + Py_UCS4 c, Py_UCS4 *mapped, Py_ssize_t mapped_size) { /* Obscure special case. */ if (c == 0x3A3) { mapped[0] = handle_capital_sigma(kind, data, length, i); return 1; } - return _PyUnicode_ToLowerFull(c, mapped); + return PyUCS4_ToLower(c, mapped, mapped_size); } static Py_ssize_t do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - Py_ssize_t i, k = 0; - int n_res, j; + Py_ssize_t i, k = 0, n_res, j; Py_UCS4 c, mapped[3]; c = PyUnicode_READ(kind, data, 0); - n_res = _PyUnicode_ToTitleFull(c, mapped); + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10078,17 +10079,18 @@ do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (Py_UNICODE_ISUPPER(c)) { - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); } else if (Py_UNICODE_ISLOWER(c)) { - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); } else { n_res = 1; mapped[0] = c; } + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10105,11 +10107,12 @@ do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, for 
(i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (lower) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToUpperFull(c, mapped); + n_res = PyUCS4_ToUpper(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10138,7 +10141,8 @@ do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 for (i = 0; i < length; i++) { Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); + Py_ssize_t j, n_res = PyUCS4_ToFolded(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; @@ -10157,13 +10161,13 @@ do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *m for (i = 0; i < length; i++) { const Py_UCS4 c = PyUnicode_READ(kind, data, i); Py_UCS4 mapped[3]; - int n_res, j; + Py_ssize_t n_res, j; if (previous_is_cased) - n_res = lower_ucs4(kind, data, length, i, c, mapped); + n_res = lower_ucs4(kind, data, length, i, c, mapped, Py_ARRAY_LENGTH(mapped)); else - n_res = _PyUnicode_ToTitleFull(c, mapped); - + n_res = PyUCS4_ToTitle(c, mapped, Py_ARRAY_LENGTH(mapped)); + assert(n_res >= 1); for (j = 0; j < n_res; j++) { *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index d4cca68c3e3e71..40ef4379419008 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -44,6 +44,7 @@ # * Doc/library/stdtypes.rst, and # * Doc/library/unicodedata.rst # * Doc/reference/lexical_analysis.rst (three occurrences) +# * Doc/c-api/unicode.rst (in case conversion APIs) UNIDATA_VERSION = "16.0.0" 
UNICODE_DATA = "UnicodeData%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"