From 7f1f197b286a6e1b2a87dcd1cb0322069e1e07c9 Mon Sep 17 00:00:00 2001 From: Evgeny Vashkevich Date: Wed, 16 Apr 2025 17:18:34 +0200 Subject: [PATCH 1/3] [mypyc] Add primitive for bytes decode() method --- mypyc/lib-rt/CPy.h | 1 + mypyc/lib-rt/bytes_ops.c | 13 +++++++++++++ mypyc/primitives/bytes_ops.py | 8 ++++++++ mypyc/test-data/irbuild-bytes.test | 12 ++++++++++++ mypyc/test-data/run-bytes.test | 26 ++++++++++++++++++++++++++ 5 files changed, 60 insertions(+) diff --git a/mypyc/lib-rt/CPy.h b/mypyc/lib-rt/CPy.h index 1f0cf4dd63d6..6e0938c651dd 100644 --- a/mypyc/lib-rt/CPy.h +++ b/mypyc/lib-rt/CPy.h @@ -764,6 +764,7 @@ CPyTagged CPyBytes_GetItem(PyObject *o, CPyTagged index); PyObject *CPyBytes_Concat(PyObject *a, PyObject *b); PyObject *CPyBytes_Join(PyObject *sep, PyObject *iter); CPyTagged CPyBytes_Ord(PyObject *obj); +PyObject *CPy_DecodeUtf8(PyObject *bytes_obj, const char *errors); int CPyBytes_Compare(PyObject *left, PyObject *right); diff --git a/mypyc/lib-rt/bytes_ops.c b/mypyc/lib-rt/bytes_ops.c index 6ff34b021a9a..ff4a3cc7886b 100644 --- a/mypyc/lib-rt/bytes_ops.c +++ b/mypyc/lib-rt/bytes_ops.c @@ -162,3 +162,16 @@ CPyTagged CPyBytes_Ord(PyObject *obj) { PyErr_SetString(PyExc_TypeError, "ord() expects a character"); return CPY_INT_TAG; } + + +PyObject *CPy_DecodeUtf8(PyObject *bytes_obj, const char *errors) { + if (!PyBytes_Check(bytes_obj)) { + PyErr_SetString(PyExc_TypeError, "expected bytes object"); + return NULL; + } + + char *data = PyBytes_AS_STRING(bytes_obj); + Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj); + + return PyUnicode_DecodeUTF8(data, size, errors); +} diff --git a/mypyc/primitives/bytes_ops.py b/mypyc/primitives/bytes_ops.py index 1afd196cff84..10f952afd16e 100644 --- a/mypyc/primitives/bytes_ops.py +++ b/mypyc/primitives/bytes_ops.py @@ -107,3 +107,11 @@ c_function_name="CPyBytes_Ord", error_kind=ERR_MAGIC, ) + +method_op( + name="decode", + arg_types=[bytes_rprimitive, bytes_rprimitive], + return_type=str_rprimitive, + c_function_name="CPy_DecodeUtf8", + error_kind=ERR_MAGIC, +) diff --git a/mypyc/test-data/irbuild-bytes.test b/mypyc/test-data/irbuild-bytes.test index 476c5ac59f48..81da031bcaee 100644 --- a/mypyc/test-data/irbuild-bytes.test +++ b/mypyc/test-data/irbuild-bytes.test @@ -185,3 +185,15 @@ L0: r10 = CPyBytes_Build(2, var, r9) b4 = r10 return 1 + +[case testDecodeUtf8] +def f(b: bytes) -> str: + return b.decode("utf-8") +[out] +def f(b): + b :: bytes + r0, r1 :: str +L0: + r0 = 'utf-8' + r1 = CPy_Decode(b, r0, 0) + return r1 diff --git a/mypyc/test-data/run-bytes.test b/mypyc/test-data/run-bytes.test index fa63c46a6798..5442919bdb90 100644 --- a/mypyc/test-data/run-bytes.test +++ b/mypyc/test-data/run-bytes.test @@ -323,3 +323,29 @@ class A: def test_bytes_dunder() -> None: assert b'%b' % A() == b'aaa' assert b'%s' % A() == b'aaa' + +[case testDecodeUtf8] +from typing import Any +from testutil import assertRaises +from a import bytes_subclass + +def test_decode_utf8() -> None: + assert b'hello'.decode('utf-8') == 'hello' + assert b''.decode('utf-8') == '' + + x: bytes = bytearray(b'hello') + assert x.decode('utf-8') == 'hello' + assert type(x.decode('utf-8')) == str + + y: Any = bytes_subclass() + assert y.decode('utf-8') == 'spook' + + n: Any = 123 + with assertRaises(AttributeError): + n.decode('utf-8') + + +[file a.py] +class bytes_subclass(bytes): + def decode(self, encoding='utf-8'): + return 'spook' From c086137815a13a5bdc795929e50aa935588696bc Mon Sep 17 00:00:00 2001 From: Evgeny Vashkevich Date: Tue, 22 Apr 2025 16:31:52 +0200 Subject: [PATCH 2/3] add latin1 and ascii decode functions --- mypyc/irbuild/specialize.py | 48 ++++++++++++++++++++++++++++++ mypyc/lib-rt/CPy.h | 2 ++ mypyc/lib-rt/bytes_ops.c | 26 ++++++++++++++++ mypyc/primitives/bytes_ops.py | 21 +++++++++++-- mypyc/test-data/irbuild-bytes.test | 38 ++++++++++++++++++----- mypyc/test-data/run-bytes.test | 26 ---------------- 6 files changed, 126 insertions(+), 35 deletions(-) diff --git a/mypyc/irbuild/specialize.py b/mypyc/irbuild/specialize.py index f652449f5289..50f3e9b14fb6 100644 --- a/mypyc/irbuild/specialize.py +++ b/mypyc/irbuild/specialize.py @@ -49,6 +49,7 @@ RTuple, RType, bool_rprimitive, + bytes_rprimitive, c_int_rprimitive, dict_rprimitive, int16_rprimitive, @@ -89,6 +90,7 @@ dict_setdefault_spec_init_op, dict_values_op, ) +from mypyc.primitives.bytes_ops import bytes_decode_utf8_strict, bytes_decode_latin1_strict, bytes_decode_ascii_strict from mypyc.primitives.list_ops import new_list_set_item_op from mypyc.primitives.str_ops import ( str_encode_ascii_strict, @@ -740,6 +742,52 @@ def str_encode_fast_path(builder: IRBuilder, expr: CallExpr, callee: RefExpr) -> return None +@specialize_function("decode", bytes_rprimitive) +def bytes_decode_fast_path(builder: IRBuilder, expr: CallExpr, callee: RefExpr) -> Value | None: + if not isinstance(callee, MemberExpr): + return None + + encoding = "utf8" + errors = "strict" + + # Handle up to 2 arguments: decode([encoding], [errors]) + if len(expr.arg_kinds) > 0 and isinstance(expr.args[0], StrExpr): + if expr.arg_kinds[0] == ARG_NAMED: + if expr.arg_names[0] == "encoding": + encoding = expr.args[0].value + elif expr.arg_names[0] == "errors": + errors = expr.args[0].value + elif expr.arg_kinds[0] == ARG_POS: + encoding = expr.args[0].value + else: + return None + + if len(expr.arg_kinds) > 1 and isinstance(expr.args[1], StrExpr): + if expr.arg_kinds[1] == ARG_NAMED: + if expr.arg_names[1] == "encoding": + encoding = expr.args[1].value + elif expr.arg_names[1] == "errors": + errors = expr.args[1].value + elif expr.arg_kinds[1] == ARG_POS: + errors = expr.args[1].value + else: + return None + + if errors != "strict": + return None + + normalized = encoding.lower().replace("-", "").replace("_", "") + + if normalized in ("utf8", "utf", "u8", "cp65001"): + return builder.primitive_op(bytes_decode_utf8_strict, [builder.accept(callee.expr)], expr.line) + elif normalized in ("ascii", "usascii", "646"): + return builder.primitive_op(bytes_decode_ascii_strict, [builder.accept(callee.expr)], expr.line) + elif normalized in ("latin1", "latin", "iso88591", "cp819", "8859", "l1"): + return builder.primitive_op(bytes_decode_latin1_strict, [builder.accept(callee.expr)], expr.line) + + return None + + @specialize_function("mypy_extensions.i64") def translate_i64(builder: IRBuilder, expr: CallExpr, callee: RefExpr) -> Value | None: if len(expr.args) != 1 or expr.arg_kinds[0] != ARG_POS: diff --git a/mypyc/lib-rt/CPy.h b/mypyc/lib-rt/CPy.h index 6e0938c651dd..aca7a6b23e6a 100644 --- a/mypyc/lib-rt/CPy.h +++ b/mypyc/lib-rt/CPy.h @@ -765,6 +765,8 @@ PyObject *CPyBytes_Concat(PyObject *a, PyObject *b); PyObject *CPyBytes_Join(PyObject *sep, PyObject *iter); CPyTagged CPyBytes_Ord(PyObject *obj); PyObject *CPy_DecodeUtf8(PyObject *bytes_obj, const char *errors); +PyObject *CPy_DecodeLatin1(PyObject *bytes_obj, const char *errors); +PyObject *CPy_DecodeAscii(PyObject *bytes_obj, const char *errors); int CPyBytes_Compare(PyObject *left, PyObject *right); diff --git a/mypyc/lib-rt/bytes_ops.c b/mypyc/lib-rt/bytes_ops.c index ff4a3cc7886b..4f7652e1cd1c 100644 --- a/mypyc/lib-rt/bytes_ops.c +++ b/mypyc/lib-rt/bytes_ops.c @@ -175,3 +175,29 @@ PyObject *CPy_DecodeUtf8(PyObject *bytes_obj, const char *errors) { return PyUnicode_DecodeUTF8(data, size, errors); } + + +PyObject *CPy_DecodeLatin1(PyObject *bytes_obj, const char *errors) { + if (!PyBytes_Check(bytes_obj)) { + PyErr_SetString(PyExc_TypeError, "expected bytes object"); + return NULL; + } + + char *data = PyBytes_AS_STRING(bytes_obj); + Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj); + + return PyUnicode_DecodeLatin1(data, size, errors); +} + + +PyObject *CPy_DecodeAscii(PyObject *bytes_obj, const char *errors) { + if (!PyBytes_Check(bytes_obj)) { + PyErr_SetString(PyExc_TypeError, "expected bytes object"); + return NULL; + } + + char *data = PyBytes_AS_STRING(bytes_obj); + Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj); + + return PyUnicode_DecodeASCII(data, size, errors); +} diff --git a/mypyc/primitives/bytes_ops.py b/mypyc/primitives/bytes_ops.py index 10f952afd16e..3ad920bc9480 100644 --- a/mypyc/primitives/bytes_ops.py +++ b/mypyc/primitives/bytes_ops.py @@ -18,6 +18,7 @@ ERR_NEG_INT, binary_op, custom_op, + custom_primitive_op, function_op, load_address_op, method_op, @@ -108,10 +109,26 @@ error_kind=ERR_MAGIC, ) -method_op( +bytes_decode_utf8_strict = custom_primitive_op( name="decode", - arg_types=[bytes_rprimitive, bytes_rprimitive], + arg_types=[bytes_rprimitive, str_rprimitive], return_type=str_rprimitive, c_function_name="CPy_DecodeUtf8", error_kind=ERR_MAGIC, ) + +bytes_decode_latin1_strict = custom_primitive_op( + name="decode_latin1", + arg_types=[bytes_rprimitive, str_rprimitive], + return_type=str_rprimitive, + c_function_name="CPy_DecodeLatin1", + error_kind=ERR_MAGIC, +) + +bytes_decode_ascii_strict = custom_primitive_op( + name="decode_ascii", + arg_types=[bytes_rprimitive, str_rprimitive], + return_type=str_rprimitive, + c_function_name="CPy_DecodeAscii", + error_kind=ERR_MAGIC, +) diff --git a/mypyc/test-data/irbuild-bytes.test b/mypyc/test-data/irbuild-bytes.test index 81da031bcaee..b7cfae7a72c3 100644 --- a/mypyc/test-data/irbuild-bytes.test +++ b/mypyc/test-data/irbuild-bytes.test @@ -186,14 +186,38 @@ L0: b4 = r10 return 1 -[case testDecodeUtf8] -def f(b: bytes) -> str: - return b.decode("utf-8") +[case testDecodeBytes] +def f(b: bytes) -> None: + b.decode() + b.decode('utf8') + b.decode('utf-8', 'strict') + b.decode('utf-8', 'strict') + b.decode('latin1', 'strict') + b.decode('ascii') + b.decode('latin-1') + b.decode('utf-8', 'ignore') + b.decode('ascii', 'replace') + b.decode('latin1', 'ignore') [out] def f(b): b :: bytes - r0, r1 :: str + r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 :: str L0: - r0 = 'utf-8' - r1 = CPy_Decode(b, r0, 0) - return r1 + r0 = CPy_DecodeUtf8(b) + r1 = CPy_DecodeUtf8(b) + r2 = CPy_DecodeUtf8(b) + r3 = CPy_DecodeUtf8(b) + r4 = CPy_DecodeLatin1(b) + r5 = CPy_DecodeAscii(b) + r6 = CPy_DecodeLatin1(b) + r7 = 'utf-8' + r8 = 'ignore' + r9 = CPy_Decode(b, r7, r8) + r10 = 'ascii' + r11 = 'replace' + r12 = CPy_Decode(b, r10, r11) + r13 = 'latin1' + r14 = 'ignore' + r15 = CPy_Decode(b, r13, r14) + return 1 + diff --git a/mypyc/test-data/run-bytes.test b/mypyc/test-data/run-bytes.test index 5442919bdb90..fa63c46a6798 100644 --- a/mypyc/test-data/run-bytes.test +++ b/mypyc/test-data/run-bytes.test @@ -323,29 +323,3 @@ class A: def test_bytes_dunder() -> None: assert b'%b' % A() == b'aaa' assert b'%s' % A() == b'aaa' - -[case testDecodeUtf8] -from typing import Any -from testutil import assertRaises -from a import bytes_subclass - -def test_decode_utf8() -> None: - assert b'hello'.decode('utf-8') == 'hello' - assert b''.decode('utf-8') == '' - - x: bytes = bytearray(b'hello') - assert x.decode('utf-8') == 'hello' - assert type(x.decode('utf-8')) == str - - y: Any = bytes_subclass() - assert y.decode('utf-8') == 'spook' - - n: Any = 123 - with assertRaises(AttributeError): - n.decode('utf-8') - - -[file a.py] -class bytes_subclass(bytes): - def decode(self, encoding='utf-8'): - return 'spook' From 4d3b7c84c17179a8c677e973fc14f39caf2e5c3f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Apr 2025 14:35:16 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mypyc/irbuild/specialize.py | 18 ++++++++++++++---- mypyc/test-data/irbuild-bytes.test | 3 +-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/mypyc/irbuild/specialize.py b/mypyc/irbuild/specialize.py index 50f3e9b14fb6..35b58649f173 100644 --- a/mypyc/irbuild/specialize.py +++ b/mypyc/irbuild/specialize.py @@ -84,13 +84,17 @@ join_formatted_strings, tokenizer_format_call, ) +from mypyc.primitives.bytes_ops import ( + bytes_decode_ascii_strict, + bytes_decode_latin1_strict, + bytes_decode_utf8_strict, +) from mypyc.primitives.dict_ops import ( dict_items_op, dict_keys_op, dict_setdefault_spec_init_op, dict_values_op, ) -from mypyc.primitives.bytes_ops import bytes_decode_utf8_strict, bytes_decode_latin1_strict, bytes_decode_ascii_strict from mypyc.primitives.list_ops import new_list_set_item_op from mypyc.primitives.str_ops import ( str_encode_ascii_strict, @@ -779,11 +783,17 @@ def bytes_decode_fast_path(builder: IRBuilder, expr: CallExpr, callee: RefExpr) normalized = encoding.lower().replace("-", "").replace("_", "") if normalized in ("utf8", "utf", "u8", "cp65001"): - return builder.primitive_op(bytes_decode_utf8_strict, [builder.accept(callee.expr)], expr.line) + return builder.primitive_op( + bytes_decode_utf8_strict, [builder.accept(callee.expr)], expr.line + ) elif normalized in ("ascii", "usascii", "646"): - return builder.primitive_op(bytes_decode_ascii_strict, [builder.accept(callee.expr)], expr.line) + return builder.primitive_op( + bytes_decode_ascii_strict, [builder.accept(callee.expr)], expr.line + ) elif normalized in ("latin1", "latin", "iso88591", "cp819", "8859", "l1"): - return builder.primitive_op(bytes_decode_latin1_strict, [builder.accept(callee.expr)], expr.line) + return builder.primitive_op( + bytes_decode_latin1_strict, [builder.accept(callee.expr)], expr.line + ) return None diff --git a/mypyc/test-data/irbuild-bytes.test b/mypyc/test-data/irbuild-bytes.test index b7cfae7a72c3..1a11442ac91a 100644 --- a/mypyc/test-data/irbuild-bytes.test +++ b/mypyc/test-data/irbuild-bytes.test @@ -195,7 +195,7 @@ def f(b: bytes) -> None: b.decode('latin1', 'strict') b.decode('ascii') b.decode('latin-1') - b.decode('utf-8', 'ignore') + b.decode('utf-8', 'ignore') b.decode('ascii', 'replace') b.decode('latin1', 'ignore') [out] @@ -220,4 +220,3 @@ L0: r14 = 'ignore' r15 = CPy_Decode(b, r13, r14) return 1 -