diff --git a/include/pocketpy/common/str.h b/include/pocketpy/common/str.h
index 6a708f6c..d2d09d11 100644
--- a/include/pocketpy/common/str.h
+++ b/include/pocketpy/common/str.h
@@ -66,6 +66,7 @@ int c11__byte_index_to_unicode(const char* data, int n);
 bool c11__is_unicode_Lo_char(int c);
 int c11__u8_header(unsigned char c, bool suppress);
 int c11__u8_value(int u8bytes, const char* data);
+int c11__u32_to_u8(uint32_t utf32_char, char utf8_output[4]);
 
 typedef enum IntParsingResult {
     IntParsing_SUCCESS,
diff --git a/src/common/str.c b/src/common/str.c
index ca8fbbf7..9b5a7aa1 100644
--- a/src/common/str.c
+++ b/src/common/str.c
@@ -316,6 +316,45 @@ int c11__u8_value(int u8bytes, const char* data) {
     return (int)value;
 }
 
+// Encode one Unicode code point as UTF-8.
+// Writes 1 to 4 bytes to `utf8_output` (no NUL terminator is appended) and
+// returns the number of bytes written. Returns -1 without writing anything
+// when `utf32_char` is a surrogate (U+D800..U+DFFF) or is above U+10FFFF,
+// because neither can appear in well-formed UTF-8.
+int c11__u32_to_u8(uint32_t utf32_char, char utf8_output[4]) {
+    // Surrogates are reserved for UTF-16 and must never be encoded.
+    if(utf32_char >= 0xD800 && utf32_char <= 0xDFFF) return -1;
+
+    if(utf32_char <= 0x7F) {
+        // 1-byte UTF-8: 0xxxxxxx
+        utf8_output[0] = (char)utf32_char;
+        return 1;
+    }
+    if(utf32_char <= 0x7FF) {
+        // 2-byte UTF-8: 110xxxxx 10xxxxxx
+        utf8_output[0] = (char)(0xC0 | ((utf32_char >> 6) & 0x1F));
+        utf8_output[1] = (char)(0x80 | (utf32_char & 0x3F));
+        return 2;
+    }
+    if(utf32_char <= 0xFFFF) {
+        // 3-byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx
+        utf8_output[0] = (char)(0xE0 | ((utf32_char >> 12) & 0x0F));
+        utf8_output[1] = (char)(0x80 | ((utf32_char >> 6) & 0x3F));
+        utf8_output[2] = (char)(0x80 | (utf32_char & 0x3F));
+        return 3;
+    }
+    if(utf32_char <= 0x10FFFF) {
+        // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        utf8_output[0] = (char)(0xF0 | ((utf32_char >> 18) & 0x07));
+        utf8_output[1] = (char)(0x80 | ((utf32_char >> 12) & 0x3F));
+        utf8_output[2] = (char)(0x80 | ((utf32_char >> 6) & 0x3F));
+        utf8_output[3] = (char)(0x80 | (utf32_char & 0x3F));
+        return 4;
+    }
+    // Beyond the Unicode code space
+    return -1;
+}
+
 IntParsingResult c11__parse_uint(c11_sv text, int64_t* out, int base) {
     *out = 0;
 
diff --git a/src/interpreter/vm.c b/src/interpreter/vm.c
index 9c657ad2..e3f84d0b 100644
--- a/src/interpreter/vm.c
+++ b/src/interpreter/vm.c
@@ -74,7 +74,7 @@ void VM__ctor(VM* self) {
 
     self->recursion_depth = 0;
     self->max_recursion_depth = 1000;
-    
+
     self->is_curr_exc_handled = false;
 
     self->ctx = NULL;
@@ -92,7 +92,7 @@ void VM__ctor(VM* self) {
         char* p = py_newstrn(&self->ascii_literals[i], 1);
         *p = i;
     }
-    py_newstrn(&self->ascii_literals[128], 0);
+    py_newstrn(&self->ascii_literals[128], 0);  // empty string
 
     // 0: unused
     void* placeholder = TypeList__emplace(&self->types);
diff --git a/src/public/modules.c b/src/public/modules.c
index 9c65134b..51c5da68 100644
--- a/src/public/modules.c
+++ b/src/public/modules.c
@@ -449,9 +449,22 @@ static bool builtins_delattr(int argc, py_Ref argv) {
 static bool builtins_chr(int argc, py_Ref argv) {
     PY_CHECK_ARGC(1);
     PY_CHECK_ARG_TYPE(0, tp_int);
     py_i64 val = py_toint(py_arg(0));
-    if(val < 0 || val > 128) { return ValueError("chr() arg not in range(128)"); }
-    py_assign(py_retval(), &pk_current_vm->ascii_literals[val]);
+    // Validate on the full 64-bit value: narrowing to uint32_t first would
+    // wrap e.g. 0x100000061 around to 'a' instead of rejecting it.
+    if(val < 0 || val > 0x10FFFF) {
+        return ValueError("chr() arg not in range(0x110000)");
+    }
+    if(val < 128) {
+        // ASCII: reuse the single-character string created in VM__ctor
+        py_assign(py_retval(), &pk_current_vm->ascii_literals[val]);
+    } else {
+        // convert to utf-8
+        char utf8[4];
+        int len = c11__u32_to_u8((uint32_t)val, utf8);
+        if(len < 0) return ValueError("invalid unicode code point: %d", (int)val);
+        py_newstrv(py_retval(), (c11_sv){utf8, len});
+    }
     return true;
 }
 
diff --git a/tests/04_str.py b/tests/04_str.py
index f6437b6a..44ce377a 100644
--- a/tests/04_str.py
+++ b/tests/04_str.py
@@ -191,6 +191,13 @@ assert (1 == '1') is False
 assert 1 == 1.0
 
 assert chr(97) is 'a'
+assert ord('a') == 97
+
+assert ord('🥕') == 0x1f955
+assert chr(0x1f955) == '🥕'
+
+assert ord('测') == 27979
+assert chr(27979) == '测'
 
 
 exit()