From 3879903d73c88da35a43469a9f0382eb91864dbf Mon Sep 17 00:00:00 2001 From: blueloveTH Date: Tue, 6 May 2025 19:23:24 +0800 Subject: [PATCH] add `unicodedata` module --- docs/modules/unicodedata.md | 15 + include/pocketpy/common/str.h | 7 + include/pocketpy/interpreter/modules.h | 1 + scripts/gen_unicodedata.py | 46 ++ src/common/str.c | 550 ++++++++++++- src/interpreter/vm.c | 1 + src/modules/unicodedata.c | 1051 ++++++++++++++++++++++++ src/public/modules.c | 2 +- tests/83_unicodedata.py | 17 + 9 files changed, 1670 insertions(+), 20 deletions(-) create mode 100644 docs/modules/unicodedata.md create mode 100644 scripts/gen_unicodedata.py create mode 100644 src/modules/unicodedata.c create mode 100644 tests/83_unicodedata.py diff --git a/docs/modules/unicodedata.md b/docs/modules/unicodedata.md new file mode 100644 index 00000000..772b96ac --- /dev/null +++ b/docs/modules/unicodedata.md @@ -0,0 +1,15 @@ +--- +icon: package +label: unicodedata +--- + +### `unicodedata.east_asian_width(char: str) -> str` + +Returns the East Asian width of a Unicode character. 
The width is one of the following values: + +- `F`: Fullwidth +- `H`: Halfwidth +- `N`: Neutral +- `Na`: Narrow +- `W`: Wide +- `A`: Ambiguous diff --git a/include/pocketpy/common/str.h b/include/pocketpy/common/str.h index 3aebfad4..92ec1071 100644 --- a/include/pocketpy/common/str.h +++ b/include/pocketpy/common/str.h @@ -21,6 +21,12 @@ typedef struct c11_bytes { unsigned char data[]; // flexible array member } c11_bytes; +typedef struct { + int start; + int end; + char data[4]; +} c11_u32_range; + bool c11_bytes__eq(c11_bytes* self, c11_bytes* other); int c11_sv__cmp(c11_sv self, c11_sv other); @@ -66,6 +72,7 @@ int c11__unicode_index_to_byte(const char* data, int i); int c11__byte_index_to_unicode(const char* data, int n); bool c11__is_unicode_Lo_char(int c); +const char* c11__search_u32_ranges(int c, const c11_u32_range* p, int n_ranges); int c11__u8_header(unsigned char c, bool suppress); int c11__u8_value(int u8bytes, const char* data); int c11__u32_to_u8(uint32_t utf32_char, char utf8_output[4]); diff --git a/include/pocketpy/interpreter/modules.h b/include/pocketpy/interpreter/modules.h index c9f982b5..76d1f993 100644 --- a/include/pocketpy/interpreter/modules.h +++ b/include/pocketpy/interpreter/modules.h @@ -17,6 +17,7 @@ void pk__add_module_inspect(); void pk__add_module_pickle(); void pk__add_module_base64(); void pk__add_module_importlib(); +void pk__add_module_unicodedata(); void pk__add_module_vmath(); void pk__add_module_array2d(); diff --git a/scripts/gen_unicodedata.py b/scripts/gen_unicodedata.py new file mode 100644 index 00000000..6d3777e9 --- /dev/null +++ b/scripts/gen_unicodedata.py @@ -0,0 +1,46 @@ +import unicodedata +from tqdm import trange +from typing import Literal + +info = [] + +for i in trange(0x110000): + char = chr(i) + category = unicodedata.category(char) + east_asian_width = unicodedata.east_asian_width(char) + info.append((i, category, east_asian_width)) + +def merge(index: Literal[1, 2], filter): + # index = 1, category + # 
index = 2, east_asian_width + result: list[tuple[int, int, str]] = [] + last_value = None + last_start = None + for i in range(len(info)): + value = info[i][index] + if value != last_value: + if last_value is not None: + result.append((last_start, i - 1, last_value)) + last_value = value + last_start = i + if last_value is not None: + result.append((last_start, len(info) - 1, last_value)) + return [x for x in result if filter(x[2])] + +df_category = merge(1, lambda x: x == 'Lo') +df_east_asian_width = merge(2, lambda x: x != 'N') + +def to_c11(ranges, name, with_value=True): + with open(f'{name}.c', 'wt', encoding='utf-8', newline='\n') as f: + f.write(f'const static c11_u32_range {name}[] = {{\n') + for start, end, value in ranges: + if with_value: + f.write(f' {{ {start}, {end}, "{value}\\0" }},\n') + else: + f.write(f' {{ {start}, {end} }},\n') + f.write(f'}};\n') + +to_c11(df_category, 'kLoRanges', with_value=False) +to_c11(df_east_asian_width, 'kEastAsianWidthRanges', with_value=True) + + diff --git a/src/common/str.c b/src/common/str.c index 9186f835..876c2c20 100644 --- a/src/common/str.c +++ b/src/common/str.c @@ -268,23 +268,6 @@ bool c11__sveq2(c11_sv a, const char* b) { return memcmp(a.data, b, size) == 0; } -// clang-format off -static const int kLoRangeA[] = 
{170,186,443,448,660,1488,1519,1568,1601,1646,1649,1749,1774,1786,1791,1808,1810,1869,1969,1994,2048,2112,2144,2208,2230,2308,2365,2384,2392,2418,2437,2447,2451,2474,2482,2486,2493,2510,2524,2527,2544,2556,2565,2575,2579,2602,2610,2613,2616,2649,2654,2674,2693,2703,2707,2730,2738,2741,2749,2768,2784,2809,2821,2831,2835,2858,2866,2869,2877,2908,2911,2929,2947,2949,2958,2962,2969,2972,2974,2979,2984,2990,3024,3077,3086,3090,3114,3133,3160,3168,3200,3205,3214,3218,3242,3253,3261,3294,3296,3313,3333,3342,3346,3389,3406,3412,3423,3450,3461,3482,3507,3517,3520,3585,3634,3648,3713,3716,3718,3724,3749,3751,3762,3773,3776,3804,3840,3904,3913,3976,4096,4159,4176,4186,4193,4197,4206,4213,4238,4352,4682,4688,4696,4698,4704,4746,4752,4786,4792,4800,4802,4808,4824,4882,4888,4992,5121,5743,5761,5792,5873,5888,5902,5920,5952,5984,5998,6016,6108,6176,6212,6272,6279,6314,6320,6400,6480,6512,6528,6576,6656,6688,6917,6981,7043,7086,7098,7168,7245,7258,7401,7406,7413,7418,8501,11568,11648,11680,11688,11696,11704,11712,11720,11728,11736,12294,12348,12353,12447,12449,12543,12549,12593,12704,12784,13312,19968,40960,40982,42192,42240,42512,42538,42606,42656,42895,42999,43003,43011,43015,43020,43072,43138,43250,43259,43261,43274,43312,43360,43396,43488,43495,43514,43520,43584,43588,43616,43633,43642,43646,43697,43701,43705,43712,43714,43739,43744,43762,43777,43785,43793,43808,43816,43968,44032,55216,55243,63744,64112,64285,64287,64298,64312,64318,64320,64323,64326,64467,64848,64914,65008,65136,65142,65382,65393,65440,65474,65482,65490,65498,65536,65549,65576,65596,65599,65616,65664,66176,66208,66304,66349,66370,66384,66432,66464,66504,66640,66816,66864,67072,67392,67424,67584,67592,67594,67639,67644,67647,67680,67712,67808,67828,67840,67872,67968,68030,68096,68112,68117,68121,68192,68224,68288,68297,68352,68416,68448,68480,68608,68864,69376,69415,69424,69600,69635,69763,69840,69891,69956,69968,70006,70019,70081,70106,70108,70144,70163,70272,70280,70282,70287,70303,70320,70405,70415,70419,704
42,70450,70453,70461,70480,70493,70656,70727,70751,70784,70852,70855,71040,71128,71168,71236,71296,71352,71424,71680,71935,72096,72106,72161,72163,72192,72203,72250,72272,72284,72349,72384,72704,72714,72768,72818,72960,72968,72971,73030,73056,73063,73066,73112,73440,73728,74880,77824,82944,92160,92736,92880,92928,93027,93053,93952,94032,94208,100352,110592,110928,110948,110960,113664,113776,113792,113808,123136,123214,123584,124928,126464,126469,126497,126500,126503,126505,126516,126521,126523,126530,126535,126537,126539,126541,126545,126548,126551,126553,126555,126557,126559,126561,126564,126567,126572,126580,126585,126590,126592,126603,126625,126629,126635,131072,173824,177984,178208,183984,194560}; -static const int kLoRangeB[] = {170,186,443,451,660,1514,1522,1599,1610,1647,1747,1749,1775,1788,1791,1808,1839,1957,1969,2026,2069,2136,2154,2228,2237,2361,2365,2384,2401,2432,2444,2448,2472,2480,2482,2489,2493,2510,2525,2529,2545,2556,2570,2576,2600,2608,2611,2614,2617,2652,2654,2676,2701,2705,2728,2736,2739,2745,2749,2768,2785,2809,2828,2832,2856,2864,2867,2873,2877,2909,2913,2929,2947,2954,2960,2965,2970,2972,2975,2980,2986,3001,3024,3084,3088,3112,3129,3133,3162,3169,3200,3212,3216,3240,3251,3257,3261,3294,3297,3314,3340,3344,3386,3389,3406,3414,3425,3455,3478,3505,3515,3517,3526,3632,3635,3653,3714,3716,3722,3747,3749,3760,3763,3773,3780,3807,3840,3911,3948,3980,4138,4159,4181,4189,4193,4198,4208,4225,4238,4680,4685,4694,4696,4701,4744,4749,4784,4789,4798,4800,4805,4822,4880,4885,4954,5007,5740,5759,5786,5866,5880,5900,5905,5937,5969,5996,6000,6067,6108,6210,6264,6276,6312,6314,6389,6430,6509,6516,6571,6601,6678,6740,6963,6987,7072,7087,7141,7203,7247,7287,7404,7411,7414,7418,8504,11623,11670,11686,11694,11702,11710,11718,11726,11734,11742,12294,12348,12438,12447,12538,12543,12591,12686,12730,12799,19893,40943,40980,42124,42231,42507,42527,42539,42606,42725,42895,42999,43009,43013,43018,43042,43123,43187,43255,43259,43262,43301,43334,43388,43442,43492,43503,4351
8,43560,43586,43595,43631,43638,43642,43695,43697,43702,43709,43712,43714,43740,43754,43762,43782,43790,43798,43814,43822,44002,55203,55238,55291,64109,64217,64285,64296,64310,64316,64318,64321,64324,64433,64829,64911,64967,65019,65140,65276,65391,65437,65470,65479,65487,65495,65500,65547,65574,65594,65597,65613,65629,65786,66204,66256,66335,66368,66377,66421,66461,66499,66511,66717,66855,66915,67382,67413,67431,67589,67592,67637,67640,67644,67669,67702,67742,67826,67829,67861,67897,68023,68031,68096,68115,68119,68149,68220,68252,68295,68324,68405,68437,68466,68497,68680,68899,69404,69415,69445,69622,69687,69807,69864,69926,69956,70002,70006,70066,70084,70106,70108,70161,70187,70278,70280,70285,70301,70312,70366,70412,70416,70440,70448,70451,70457,70461,70480,70497,70708,70730,70751,70831,70853,70855,71086,71131,71215,71236,71338,71352,71450,71723,71935,72103,72144,72161,72163,72192,72242,72250,72272,72329,72349,72440,72712,72750,72768,72847,72966,72969,73008,73030,73061,73064,73097,73112,73458,74649,75075,78894,83526,92728,92766,92909,92975,93047,93071,94026,94032,100343,101106,110878,110930,110951,111355,113770,113788,113800,113817,123180,123214,123627,125124,126467,126495,126498,126500,126503,126514,126519,126521,126523,126530,126535,126537,126539,126543,126546,126548,126551,126553,126555,126557,126559,126562,126564,126570,126578,126583,126588,126590,126601,126619,126627,126633,126651,173782,177972,178205,183969,191456,195101}; - -// clang-format on - -bool c11__is_unicode_Lo_char(int c) { - if(c == 0x1f955) return true; - int index; - c11__lower_bound(const int, kLoRangeA, 476, c, c11__less, &index); - if(index == 476) return false; - if(c == kLoRangeA[index]) return true; - index -= 1; - if(index < 0) return false; - return c >= kLoRangeA[index] && c <= kLoRangeB[index]; -} - int c11__u8_header(unsigned char c, bool suppress) { if((c & 0b10000000) == 0) return 1; if((c & 0b11100000) == 0b11000000) return 2; @@ -348,7 +331,7 @@ int c11__u32_to_u8(uint32_t 
utf32_char, char utf8_output[4]) { return length; } -char* c11_strdup(const char* str){ +char* c11_strdup(const char* str) { int len = strlen(str); char* dst = PK_MALLOC(len + 1); memcpy(dst, str, len); @@ -451,4 +434,533 @@ IntParsingResult c11__parse_uint(c11_sv text, int64_t* out, int base) { return IntParsing_SUCCESS; } return IntParsing_FAILURE; -} \ No newline at end of file +} + +const char* c11__search_u32_ranges(int c, const c11_u32_range* p, int n_ranges) { + int lbound = 0; + int ubound = n_ranges - 1; + + if(c < p[0].start || c > p[ubound].end) return NULL; + while(ubound >= lbound) { + int mid = (lbound + ubound) / 2; + if(c > p[mid].end) { + lbound = mid + 1; + } else if(c < p[mid].start) { + ubound = mid - 1; + } else { + return p[mid].data; + } + } + return NULL; +} + +const static c11_u32_range kLoRanges[] = { + {170, 170 }, + {186, 186 }, + {443, 443 }, + {448, 451 }, + {660, 660 }, + {1488, 1514 }, + {1519, 1522 }, + {1568, 1599 }, + {1601, 1610 }, + {1646, 1647 }, + {1649, 1747 }, + {1749, 1749 }, + {1774, 1775 }, + {1786, 1788 }, + {1791, 1791 }, + {1808, 1808 }, + {1810, 1839 }, + {1869, 1957 }, + {1969, 1969 }, + {1994, 2026 }, + {2048, 2069 }, + {2112, 2136 }, + {2144, 2154 }, + {2160, 2183 }, + {2185, 2190 }, + {2208, 2248 }, + {2308, 2361 }, + {2365, 2365 }, + {2384, 2384 }, + {2392, 2401 }, + {2418, 2432 }, + {2437, 2444 }, + {2447, 2448 }, + {2451, 2472 }, + {2474, 2480 }, + {2482, 2482 }, + {2486, 2489 }, + {2493, 2493 }, + {2510, 2510 }, + {2524, 2525 }, + {2527, 2529 }, + {2544, 2545 }, + {2556, 2556 }, + {2565, 2570 }, + {2575, 2576 }, + {2579, 2600 }, + {2602, 2608 }, + {2610, 2611 }, + {2613, 2614 }, + {2616, 2617 }, + {2649, 2652 }, + {2654, 2654 }, + {2674, 2676 }, + {2693, 2701 }, + {2703, 2705 }, + {2707, 2728 }, + {2730, 2736 }, + {2738, 2739 }, + {2741, 2745 }, + {2749, 2749 }, + {2768, 2768 }, + {2784, 2785 }, + {2809, 2809 }, + {2821, 2828 }, + {2831, 2832 }, + {2835, 2856 }, + {2858, 2864 }, + {2866, 2867 }, + {2869, 2873 
}, + {2877, 2877 }, + {2908, 2909 }, + {2911, 2913 }, + {2929, 2929 }, + {2947, 2947 }, + {2949, 2954 }, + {2958, 2960 }, + {2962, 2965 }, + {2969, 2970 }, + {2972, 2972 }, + {2974, 2975 }, + {2979, 2980 }, + {2984, 2986 }, + {2990, 3001 }, + {3024, 3024 }, + {3077, 3084 }, + {3086, 3088 }, + {3090, 3112 }, + {3114, 3129 }, + {3133, 3133 }, + {3160, 3162 }, + {3165, 3165 }, + {3168, 3169 }, + {3200, 3200 }, + {3205, 3212 }, + {3214, 3216 }, + {3218, 3240 }, + {3242, 3251 }, + {3253, 3257 }, + {3261, 3261 }, + {3293, 3294 }, + {3296, 3297 }, + {3313, 3314 }, + {3332, 3340 }, + {3342, 3344 }, + {3346, 3386 }, + {3389, 3389 }, + {3406, 3406 }, + {3412, 3414 }, + {3423, 3425 }, + {3450, 3455 }, + {3461, 3478 }, + {3482, 3505 }, + {3507, 3515 }, + {3517, 3517 }, + {3520, 3526 }, + {3585, 3632 }, + {3634, 3635 }, + {3648, 3653 }, + {3713, 3714 }, + {3716, 3716 }, + {3718, 3722 }, + {3724, 3747 }, + {3749, 3749 }, + {3751, 3760 }, + {3762, 3763 }, + {3773, 3773 }, + {3776, 3780 }, + {3804, 3807 }, + {3840, 3840 }, + {3904, 3911 }, + {3913, 3948 }, + {3976, 3980 }, + {4096, 4138 }, + {4159, 4159 }, + {4176, 4181 }, + {4186, 4189 }, + {4193, 4193 }, + {4197, 4198 }, + {4206, 4208 }, + {4213, 4225 }, + {4238, 4238 }, + {4352, 4680 }, + {4682, 4685 }, + {4688, 4694 }, + {4696, 4696 }, + {4698, 4701 }, + {4704, 4744 }, + {4746, 4749 }, + {4752, 4784 }, + {4786, 4789 }, + {4792, 4798 }, + {4800, 4800 }, + {4802, 4805 }, + {4808, 4822 }, + {4824, 4880 }, + {4882, 4885 }, + {4888, 4954 }, + {4992, 5007 }, + {5121, 5740 }, + {5743, 5759 }, + {5761, 5786 }, + {5792, 5866 }, + {5873, 5880 }, + {5888, 5905 }, + {5919, 5937 }, + {5952, 5969 }, + {5984, 5996 }, + {5998, 6000 }, + {6016, 6067 }, + {6108, 6108 }, + {6176, 6210 }, + {6212, 6264 }, + {6272, 6276 }, + {6279, 6312 }, + {6314, 6314 }, + {6320, 6389 }, + {6400, 6430 }, + {6480, 6509 }, + {6512, 6516 }, + {6528, 6571 }, + {6576, 6601 }, + {6656, 6678 }, + {6688, 6740 }, + {6917, 6963 }, + {6981, 6988 }, + {7043, 7072 }, + 
{7086, 7087 }, + {7098, 7141 }, + {7168, 7203 }, + {7245, 7247 }, + {7258, 7287 }, + {7401, 7404 }, + {7406, 7411 }, + {7413, 7414 }, + {7418, 7418 }, + {8501, 8504 }, + {11568, 11623 }, + {11648, 11670 }, + {11680, 11686 }, + {11688, 11694 }, + {11696, 11702 }, + {11704, 11710 }, + {11712, 11718 }, + {11720, 11726 }, + {11728, 11734 }, + {11736, 11742 }, + {12294, 12294 }, + {12348, 12348 }, + {12353, 12438 }, + {12447, 12447 }, + {12449, 12538 }, + {12543, 12543 }, + {12549, 12591 }, + {12593, 12686 }, + {12704, 12735 }, + {12784, 12799 }, + {13312, 19903 }, + {19968, 40980 }, + {40982, 42124 }, + {42192, 42231 }, + {42240, 42507 }, + {42512, 42527 }, + {42538, 42539 }, + {42606, 42606 }, + {42656, 42725 }, + {42895, 42895 }, + {42999, 42999 }, + {43003, 43009 }, + {43011, 43013 }, + {43015, 43018 }, + {43020, 43042 }, + {43072, 43123 }, + {43138, 43187 }, + {43250, 43255 }, + {43259, 43259 }, + {43261, 43262 }, + {43274, 43301 }, + {43312, 43334 }, + {43360, 43388 }, + {43396, 43442 }, + {43488, 43492 }, + {43495, 43503 }, + {43514, 43518 }, + {43520, 43560 }, + {43584, 43586 }, + {43588, 43595 }, + {43616, 43631 }, + {43633, 43638 }, + {43642, 43642 }, + {43646, 43695 }, + {43697, 43697 }, + {43701, 43702 }, + {43705, 43709 }, + {43712, 43712 }, + {43714, 43714 }, + {43739, 43740 }, + {43744, 43754 }, + {43762, 43762 }, + {43777, 43782 }, + {43785, 43790 }, + {43793, 43798 }, + {43808, 43814 }, + {43816, 43822 }, + {43968, 44002 }, + {44032, 55203 }, + {55216, 55238 }, + {55243, 55291 }, + {63744, 64109 }, + {64112, 64217 }, + {64285, 64285 }, + {64287, 64296 }, + {64298, 64310 }, + {64312, 64316 }, + {64318, 64318 }, + {64320, 64321 }, + {64323, 64324 }, + {64326, 64433 }, + {64467, 64829 }, + {64848, 64911 }, + {64914, 64967 }, + {65008, 65019 }, + {65136, 65140 }, + {65142, 65276 }, + {65382, 65391 }, + {65393, 65437 }, + {65440, 65470 }, + {65474, 65479 }, + {65482, 65487 }, + {65490, 65495 }, + {65498, 65500 }, + {65536, 65547 }, + {65549, 65574 }, + 
{65576, 65594 }, + {65596, 65597 }, + {65599, 65613 }, + {65616, 65629 }, + {65664, 65786 }, + {66176, 66204 }, + {66208, 66256 }, + {66304, 66335 }, + {66349, 66368 }, + {66370, 66377 }, + {66384, 66421 }, + {66432, 66461 }, + {66464, 66499 }, + {66504, 66511 }, + {66640, 66717 }, + {66816, 66855 }, + {66864, 66915 }, + {67072, 67382 }, + {67392, 67413 }, + {67424, 67431 }, + {67584, 67589 }, + {67592, 67592 }, + {67594, 67637 }, + {67639, 67640 }, + {67644, 67644 }, + {67647, 67669 }, + {67680, 67702 }, + {67712, 67742 }, + {67808, 67826 }, + {67828, 67829 }, + {67840, 67861 }, + {67872, 67897 }, + {67968, 68023 }, + {68030, 68031 }, + {68096, 68096 }, + {68112, 68115 }, + {68117, 68119 }, + {68121, 68149 }, + {68192, 68220 }, + {68224, 68252 }, + {68288, 68295 }, + {68297, 68324 }, + {68352, 68405 }, + {68416, 68437 }, + {68448, 68466 }, + {68480, 68497 }, + {68608, 68680 }, + {68864, 68899 }, + {69248, 69289 }, + {69296, 69297 }, + {69376, 69404 }, + {69415, 69415 }, + {69424, 69445 }, + {69488, 69505 }, + {69552, 69572 }, + {69600, 69622 }, + {69635, 69687 }, + {69745, 69746 }, + {69749, 69749 }, + {69763, 69807 }, + {69840, 69864 }, + {69891, 69926 }, + {69956, 69956 }, + {69959, 69959 }, + {69968, 70002 }, + {70006, 70006 }, + {70019, 70066 }, + {70081, 70084 }, + {70106, 70106 }, + {70108, 70108 }, + {70144, 70161 }, + {70163, 70187 }, + {70272, 70278 }, + {70280, 70280 }, + {70282, 70285 }, + {70287, 70301 }, + {70303, 70312 }, + {70320, 70366 }, + {70405, 70412 }, + {70415, 70416 }, + {70419, 70440 }, + {70442, 70448 }, + {70450, 70451 }, + {70453, 70457 }, + {70461, 70461 }, + {70480, 70480 }, + {70493, 70497 }, + {70656, 70708 }, + {70727, 70730 }, + {70751, 70753 }, + {70784, 70831 }, + {70852, 70853 }, + {70855, 70855 }, + {71040, 71086 }, + {71128, 71131 }, + {71168, 71215 }, + {71236, 71236 }, + {71296, 71338 }, + {71352, 71352 }, + {71424, 71450 }, + {71488, 71494 }, + {71680, 71723 }, + {71935, 71942 }, + {71945, 71945 }, + {71948, 71955 }, + 
{71957, 71958 }, + {71960, 71983 }, + {71999, 71999 }, + {72001, 72001 }, + {72096, 72103 }, + {72106, 72144 }, + {72161, 72161 }, + {72163, 72163 }, + {72192, 72192 }, + {72203, 72242 }, + {72250, 72250 }, + {72272, 72272 }, + {72284, 72329 }, + {72349, 72349 }, + {72368, 72440 }, + {72704, 72712 }, + {72714, 72750 }, + {72768, 72768 }, + {72818, 72847 }, + {72960, 72966 }, + {72968, 72969 }, + {72971, 73008 }, + {73030, 73030 }, + {73056, 73061 }, + {73063, 73064 }, + {73066, 73097 }, + {73112, 73112 }, + {73440, 73458 }, + {73648, 73648 }, + {73728, 74649 }, + {74880, 75075 }, + {77712, 77808 }, + {77824, 78894 }, + {82944, 83526 }, + {92160, 92728 }, + {92736, 92766 }, + {92784, 92862 }, + {92880, 92909 }, + {92928, 92975 }, + {93027, 93047 }, + {93053, 93071 }, + {93952, 94026 }, + {94032, 94032 }, + {94208, 100343}, + {100352, 101589}, + {101632, 101640}, + {110592, 110882}, + {110928, 110930}, + {110948, 110951}, + {110960, 111355}, + {113664, 113770}, + {113776, 113788}, + {113792, 113800}, + {113808, 113817}, + {122634, 122634}, + {123136, 123180}, + {123214, 123214}, + {123536, 123565}, + {123584, 123627}, + {124896, 124902}, + {124904, 124907}, + {124909, 124910}, + {124912, 124926}, + {124928, 125124}, + {126464, 126467}, + {126469, 126495}, + {126497, 126498}, + {126500, 126500}, + {126503, 126503}, + {126505, 126514}, + {126516, 126519}, + {126521, 126521}, + {126523, 126523}, + {126530, 126530}, + {126535, 126535}, + {126537, 126537}, + {126539, 126539}, + {126541, 126543}, + {126545, 126546}, + {126548, 126548}, + {126551, 126551}, + {126553, 126553}, + {126555, 126555}, + {126557, 126557}, + {126559, 126559}, + {126561, 126562}, + {126564, 126564}, + {126567, 126570}, + {126572, 126578}, + {126580, 126583}, + {126585, 126588}, + {126590, 126590}, + {126592, 126601}, + {126603, 126619}, + {126625, 126627}, + {126629, 126633}, + {126635, 126651}, + {131072, 173791}, + {173824, 177976}, + {177984, 178205}, + {178208, 183969}, + {183984, 191456}, + 
{194560, 195101}, + {196608, 201546}, +}; + +bool c11__is_unicode_Lo_char(int c) { + if(c == 0x1f955) return true; + const char* data = + c11__search_u32_ranges(c, kLoRanges, sizeof(kLoRanges) / sizeof(c11_u32_range)); + return data != NULL; +} diff --git a/src/interpreter/vm.c b/src/interpreter/vm.c index e3acff1a..cbd94ed9 100644 --- a/src/interpreter/vm.c +++ b/src/interpreter/vm.c @@ -231,6 +231,7 @@ void VM__ctor(VM* self) { pk__add_module_pickle(); pk__add_module_base64(); pk__add_module_importlib(); + pk__add_module_unicodedata(); pk__add_module_conio(); pk__add_module_lz4(); // optional diff --git a/src/modules/unicodedata.c b/src/modules/unicodedata.c new file mode 100644 index 00000000..b371eb26 --- /dev/null +++ b/src/modules/unicodedata.c @@ -0,0 +1,1051 @@ +#include "pocketpy/common/str.h" +#include "pocketpy/pocketpy.h" + +const static c11_u32_range kEastAsianWidthRanges[] = { + {32, 126, "Na\0"}, + {161, 161, "A\0" }, + {162, 163, "Na\0"}, + {164, 164, "A\0" }, + {165, 166, "Na\0"}, + {167, 168, "A\0" }, + {170, 170, "A\0" }, + {172, 172, "Na\0"}, + {173, 174, "A\0" }, + {175, 175, "Na\0"}, + {176, 180, "A\0" }, + {182, 186, "A\0" }, + {188, 191, "A\0" }, + {198, 198, "A\0" }, + {208, 208, "A\0" }, + {215, 216, "A\0" }, + {222, 225, "A\0" }, + {230, 230, "A\0" }, + {232, 234, "A\0" }, + {236, 237, "A\0" }, + {240, 240, "A\0" }, + {242, 243, "A\0" }, + {247, 250, "A\0" }, + {252, 252, "A\0" }, + {254, 254, "A\0" }, + {257, 257, "A\0" }, + {273, 273, "A\0" }, + {275, 275, "A\0" }, + {283, 283, "A\0" }, + {294, 295, "A\0" }, + {299, 299, "A\0" }, + {305, 307, "A\0" }, + {312, 312, "A\0" }, + {319, 322, "A\0" }, + {324, 324, "A\0" }, + {328, 331, "A\0" }, + {333, 333, "A\0" }, + {338, 339, "A\0" }, + {358, 359, "A\0" }, + {363, 363, "A\0" }, + {462, 462, "A\0" }, + {464, 464, "A\0" }, + {466, 466, "A\0" }, + {468, 468, "A\0" }, + {470, 470, "A\0" }, + {472, 472, "A\0" }, + {474, 474, "A\0" }, + {476, 476, "A\0" }, + {593, 593, "A\0" }, + {609, 609, "A\0" 
}, + {708, 708, "A\0" }, + {711, 711, "A\0" }, + {713, 715, "A\0" }, + {717, 717, "A\0" }, + {720, 720, "A\0" }, + {728, 731, "A\0" }, + {733, 733, "A\0" }, + {735, 735, "A\0" }, + {768, 879, "A\0" }, + {888, 889, "F\0" }, + {896, 899, "F\0" }, + {907, 907, "F\0" }, + {909, 909, "F\0" }, + {913, 929, "A\0" }, + {930, 930, "F\0" }, + {931, 937, "A\0" }, + {945, 961, "A\0" }, + {963, 969, "A\0" }, + {1025, 1025, "A\0" }, + {1040, 1103, "A\0" }, + {1105, 1105, "A\0" }, + {1328, 1328, "F\0" }, + {1367, 1368, "F\0" }, + {1419, 1420, "F\0" }, + {1424, 1424, "F\0" }, + {1480, 1487, "F\0" }, + {1515, 1518, "F\0" }, + {1525, 1535, "F\0" }, + {1806, 1806, "F\0" }, + {1867, 1868, "F\0" }, + {1970, 1983, "F\0" }, + {2043, 2044, "F\0" }, + {2094, 2095, "F\0" }, + {2111, 2111, "F\0" }, + {2140, 2141, "F\0" }, + {2143, 2143, "F\0" }, + {2155, 2159, "F\0" }, + {2191, 2191, "F\0" }, + {2194, 2199, "F\0" }, + {2436, 2436, "F\0" }, + {2445, 2446, "F\0" }, + {2449, 2450, "F\0" }, + {2473, 2473, "F\0" }, + {2481, 2481, "F\0" }, + {2483, 2485, "F\0" }, + {2490, 2491, "F\0" }, + {2501, 2502, "F\0" }, + {2505, 2506, "F\0" }, + {2511, 2518, "F\0" }, + {2520, 2523, "F\0" }, + {2526, 2526, "F\0" }, + {2532, 2533, "F\0" }, + {2559, 2560, "F\0" }, + {2564, 2564, "F\0" }, + {2571, 2574, "F\0" }, + {2577, 2578, "F\0" }, + {2601, 2601, "F\0" }, + {2609, 2609, "F\0" }, + {2612, 2612, "F\0" }, + {2615, 2615, "F\0" }, + {2618, 2619, "F\0" }, + {2621, 2621, "F\0" }, + {2627, 2630, "F\0" }, + {2633, 2634, "F\0" }, + {2638, 2640, "F\0" }, + {2642, 2648, "F\0" }, + {2653, 2653, "F\0" }, + {2655, 2661, "F\0" }, + {2679, 2688, "F\0" }, + {2692, 2692, "F\0" }, + {2702, 2702, "F\0" }, + {2706, 2706, "F\0" }, + {2729, 2729, "F\0" }, + {2737, 2737, "F\0" }, + {2740, 2740, "F\0" }, + {2746, 2747, "F\0" }, + {2758, 2758, "F\0" }, + {2762, 2762, "F\0" }, + {2766, 2767, "F\0" }, + {2769, 2783, "F\0" }, + {2788, 2789, "F\0" }, + {2802, 2808, "F\0" }, + {2816, 2816, "F\0" }, + {2820, 2820, "F\0" }, + {2829, 2830, 
"F\0" }, + {2833, 2834, "F\0" }, + {2857, 2857, "F\0" }, + {2865, 2865, "F\0" }, + {2868, 2868, "F\0" }, + {2874, 2875, "F\0" }, + {2885, 2886, "F\0" }, + {2889, 2890, "F\0" }, + {2894, 2900, "F\0" }, + {2904, 2907, "F\0" }, + {2910, 2910, "F\0" }, + {2916, 2917, "F\0" }, + {2936, 2945, "F\0" }, + {2948, 2948, "F\0" }, + {2955, 2957, "F\0" }, + {2961, 2961, "F\0" }, + {2966, 2968, "F\0" }, + {2971, 2971, "F\0" }, + {2973, 2973, "F\0" }, + {2976, 2978, "F\0" }, + {2981, 2983, "F\0" }, + {2987, 2989, "F\0" }, + {3002, 3005, "F\0" }, + {3011, 3013, "F\0" }, + {3017, 3017, "F\0" }, + {3022, 3023, "F\0" }, + {3025, 3030, "F\0" }, + {3032, 3045, "F\0" }, + {3067, 3071, "F\0" }, + {3085, 3085, "F\0" }, + {3089, 3089, "F\0" }, + {3113, 3113, "F\0" }, + {3130, 3131, "F\0" }, + {3141, 3141, "F\0" }, + {3145, 3145, "F\0" }, + {3150, 3156, "F\0" }, + {3159, 3159, "F\0" }, + {3163, 3164, "F\0" }, + {3166, 3167, "F\0" }, + {3172, 3173, "F\0" }, + {3184, 3190, "F\0" }, + {3213, 3213, "F\0" }, + {3217, 3217, "F\0" }, + {3241, 3241, "F\0" }, + {3252, 3252, "F\0" }, + {3258, 3259, "F\0" }, + {3269, 3269, "F\0" }, + {3273, 3273, "F\0" }, + {3278, 3284, "F\0" }, + {3287, 3292, "F\0" }, + {3295, 3295, "F\0" }, + {3300, 3301, "F\0" }, + {3312, 3312, "F\0" }, + {3315, 3327, "F\0" }, + {3341, 3341, "F\0" }, + {3345, 3345, "F\0" }, + {3397, 3397, "F\0" }, + {3401, 3401, "F\0" }, + {3408, 3411, "F\0" }, + {3428, 3429, "F\0" }, + {3456, 3456, "F\0" }, + {3460, 3460, "F\0" }, + {3479, 3481, "F\0" }, + {3506, 3506, "F\0" }, + {3516, 3516, "F\0" }, + {3518, 3519, "F\0" }, + {3527, 3529, "F\0" }, + {3531, 3534, "F\0" }, + {3541, 3541, "F\0" }, + {3543, 3543, "F\0" }, + {3552, 3557, "F\0" }, + {3568, 3569, "F\0" }, + {3573, 3584, "F\0" }, + {3643, 3646, "F\0" }, + {3676, 3712, "F\0" }, + {3715, 3715, "F\0" }, + {3717, 3717, "F\0" }, + {3723, 3723, "F\0" }, + {3748, 3748, "F\0" }, + {3750, 3750, "F\0" }, + {3774, 3775, "F\0" }, + {3781, 3781, "F\0" }, + {3783, 3783, "F\0" }, + {3790, 3791, "F\0" 
}, + {3802, 3803, "F\0" }, + {3808, 3839, "F\0" }, + {3912, 3912, "F\0" }, + {3949, 3952, "F\0" }, + {3992, 3992, "F\0" }, + {4029, 4029, "F\0" }, + {4045, 4045, "F\0" }, + {4059, 4095, "F\0" }, + {4294, 4294, "F\0" }, + {4296, 4300, "F\0" }, + {4302, 4303, "F\0" }, + {4352, 4447, "W\0" }, + {4681, 4681, "F\0" }, + {4686, 4687, "F\0" }, + {4695, 4695, "F\0" }, + {4697, 4697, "F\0" }, + {4702, 4703, "F\0" }, + {4745, 4745, "F\0" }, + {4750, 4751, "F\0" }, + {4785, 4785, "F\0" }, + {4790, 4791, "F\0" }, + {4799, 4799, "F\0" }, + {4801, 4801, "F\0" }, + {4806, 4807, "F\0" }, + {4823, 4823, "F\0" }, + {4881, 4881, "F\0" }, + {4886, 4887, "F\0" }, + {4955, 4956, "F\0" }, + {4989, 4991, "F\0" }, + {5018, 5023, "F\0" }, + {5110, 5111, "F\0" }, + {5118, 5119, "F\0" }, + {5789, 5791, "F\0" }, + {5881, 5887, "F\0" }, + {5910, 5918, "F\0" }, + {5943, 5951, "F\0" }, + {5972, 5983, "F\0" }, + {5997, 5997, "F\0" }, + {6001, 6001, "F\0" }, + {6004, 6015, "F\0" }, + {6110, 6111, "F\0" }, + {6122, 6127, "F\0" }, + {6138, 6143, "F\0" }, + {6170, 6175, "F\0" }, + {6265, 6271, "F\0" }, + {6315, 6319, "F\0" }, + {6390, 6399, "F\0" }, + {6431, 6431, "F\0" }, + {6444, 6447, "F\0" }, + {6460, 6463, "F\0" }, + {6465, 6467, "F\0" }, + {6510, 6511, "F\0" }, + {6517, 6527, "F\0" }, + {6572, 6575, "F\0" }, + {6602, 6607, "F\0" }, + {6619, 6621, "F\0" }, + {6684, 6685, "F\0" }, + {6751, 6751, "F\0" }, + {6781, 6782, "F\0" }, + {6794, 6799, "F\0" }, + {6810, 6815, "F\0" }, + {6830, 6831, "F\0" }, + {6863, 6911, "F\0" }, + {6989, 6991, "F\0" }, + {7039, 7039, "F\0" }, + {7156, 7163, "F\0" }, + {7224, 7226, "F\0" }, + {7242, 7244, "F\0" }, + {7305, 7311, "F\0" }, + {7355, 7356, "F\0" }, + {7368, 7375, "F\0" }, + {7419, 7423, "F\0" }, + {7958, 7959, "F\0" }, + {7966, 7967, "F\0" }, + {8006, 8007, "F\0" }, + {8014, 8015, "F\0" }, + {8024, 8024, "F\0" }, + {8026, 8026, "F\0" }, + {8028, 8028, "F\0" }, + {8030, 8030, "F\0" }, + {8062, 8063, "F\0" }, + {8117, 8117, "F\0" }, + {8133, 8133, "F\0" }, + 
{8148, 8149, "F\0" }, + {8156, 8156, "F\0" }, + {8176, 8177, "F\0" }, + {8181, 8181, "F\0" }, + {8191, 8191, "F\0" }, + {8208, 8208, "A\0" }, + {8211, 8214, "A\0" }, + {8216, 8217, "A\0" }, + {8220, 8221, "A\0" }, + {8224, 8226, "A\0" }, + {8228, 8231, "A\0" }, + {8240, 8240, "A\0" }, + {8242, 8243, "A\0" }, + {8245, 8245, "A\0" }, + {8251, 8251, "A\0" }, + {8254, 8254, "A\0" }, + {8293, 8293, "F\0" }, + {8306, 8307, "F\0" }, + {8308, 8308, "A\0" }, + {8319, 8319, "A\0" }, + {8321, 8324, "A\0" }, + {8335, 8335, "F\0" }, + {8349, 8351, "F\0" }, + {8361, 8361, "H\0" }, + {8364, 8364, "A\0" }, + {8385, 8399, "F\0" }, + {8433, 8447, "F\0" }, + {8451, 8451, "A\0" }, + {8453, 8453, "A\0" }, + {8457, 8457, "A\0" }, + {8467, 8467, "A\0" }, + {8470, 8470, "A\0" }, + {8481, 8482, "A\0" }, + {8486, 8486, "A\0" }, + {8491, 8491, "A\0" }, + {8531, 8532, "A\0" }, + {8539, 8542, "A\0" }, + {8544, 8555, "A\0" }, + {8560, 8569, "A\0" }, + {8585, 8585, "A\0" }, + {8588, 8591, "F\0" }, + {8592, 8601, "A\0" }, + {8632, 8633, "A\0" }, + {8658, 8658, "A\0" }, + {8660, 8660, "A\0" }, + {8679, 8679, "A\0" }, + {8704, 8704, "A\0" }, + {8706, 8707, "A\0" }, + {8711, 8712, "A\0" }, + {8715, 8715, "A\0" }, + {8719, 8719, "A\0" }, + {8721, 8721, "A\0" }, + {8725, 8725, "A\0" }, + {8730, 8730, "A\0" }, + {8733, 8736, "A\0" }, + {8739, 8739, "A\0" }, + {8741, 8741, "A\0" }, + {8743, 8748, "A\0" }, + {8750, 8750, "A\0" }, + {8756, 8759, "A\0" }, + {8764, 8765, "A\0" }, + {8776, 8776, "A\0" }, + {8780, 8780, "A\0" }, + {8786, 8786, "A\0" }, + {8800, 8801, "A\0" }, + {8804, 8807, "A\0" }, + {8810, 8811, "A\0" }, + {8814, 8815, "A\0" }, + {8834, 8835, "A\0" }, + {8838, 8839, "A\0" }, + {8853, 8853, "A\0" }, + {8857, 8857, "A\0" }, + {8869, 8869, "A\0" }, + {8895, 8895, "A\0" }, + {8978, 8978, "A\0" }, + {8986, 8987, "W\0" }, + {9001, 9002, "W\0" }, + {9193, 9196, "W\0" }, + {9200, 9200, "W\0" }, + {9203, 9203, "W\0" }, + {9255, 9279, "F\0" }, + {9291, 9311, "F\0" }, + {9312, 9449, "A\0" }, + {9451, 
9547, "A\0" }, + {9552, 9587, "A\0" }, + {9600, 9615, "A\0" }, + {9618, 9621, "A\0" }, + {9632, 9633, "A\0" }, + {9635, 9641, "A\0" }, + {9650, 9651, "A\0" }, + {9654, 9655, "A\0" }, + {9660, 9661, "A\0" }, + {9664, 9665, "A\0" }, + {9670, 9672, "A\0" }, + {9675, 9675, "A\0" }, + {9678, 9681, "A\0" }, + {9698, 9701, "A\0" }, + {9711, 9711, "A\0" }, + {9725, 9726, "W\0" }, + {9733, 9734, "A\0" }, + {9737, 9737, "A\0" }, + {9742, 9743, "A\0" }, + {9748, 9749, "W\0" }, + {9756, 9756, "A\0" }, + {9758, 9758, "A\0" }, + {9792, 9792, "A\0" }, + {9794, 9794, "A\0" }, + {9800, 9811, "W\0" }, + {9824, 9825, "A\0" }, + {9827, 9829, "A\0" }, + {9831, 9834, "A\0" }, + {9836, 9837, "A\0" }, + {9839, 9839, "A\0" }, + {9855, 9855, "W\0" }, + {9875, 9875, "W\0" }, + {9886, 9887, "A\0" }, + {9889, 9889, "W\0" }, + {9898, 9899, "W\0" }, + {9917, 9918, "W\0" }, + {9919, 9919, "A\0" }, + {9924, 9925, "W\0" }, + {9926, 9933, "A\0" }, + {9934, 9934, "W\0" }, + {9935, 9939, "A\0" }, + {9940, 9940, "W\0" }, + {9941, 9953, "A\0" }, + {9955, 9955, "A\0" }, + {9960, 9961, "A\0" }, + {9962, 9962, "W\0" }, + {9963, 9969, "A\0" }, + {9970, 9971, "W\0" }, + {9972, 9972, "A\0" }, + {9973, 9973, "W\0" }, + {9974, 9977, "A\0" }, + {9978, 9978, "W\0" }, + {9979, 9980, "A\0" }, + {9981, 9981, "W\0" }, + {9982, 9983, "A\0" }, + {9989, 9989, "W\0" }, + {9994, 9995, "W\0" }, + {10024, 10024, "W\0" }, + {10045, 10045, "A\0" }, + {10060, 10060, "W\0" }, + {10062, 10062, "W\0" }, + {10067, 10069, "W\0" }, + {10071, 10071, "W\0" }, + {10102, 10111, "A\0" }, + {10133, 10135, "W\0" }, + {10160, 10160, "W\0" }, + {10175, 10175, "W\0" }, + {10214, 10221, "Na\0"}, + {10629, 10630, "Na\0"}, + {11035, 11036, "W\0" }, + {11088, 11088, "W\0" }, + {11093, 11093, "W\0" }, + {11094, 11097, "A\0" }, + {11124, 11125, "F\0" }, + {11158, 11158, "F\0" }, + {11508, 11512, "F\0" }, + {11558, 11558, "F\0" }, + {11560, 11564, "F\0" }, + {11566, 11567, "F\0" }, + {11624, 11630, "F\0" }, + {11633, 11646, "F\0" }, + {11671, 11679, 
"F\0" }, + {11687, 11687, "F\0" }, + {11695, 11695, "F\0" }, + {11703, 11703, "F\0" }, + {11711, 11711, "F\0" }, + {11719, 11719, "F\0" }, + {11727, 11727, "F\0" }, + {11735, 11735, "F\0" }, + {11743, 11743, "F\0" }, + {11870, 11903, "F\0" }, + {11904, 11929, "W\0" }, + {11930, 11930, "F\0" }, + {11931, 12019, "W\0" }, + {12020, 12031, "F\0" }, + {12032, 12245, "W\0" }, + {12246, 12271, "F\0" }, + {12272, 12283, "W\0" }, + {12284, 12288, "F\0" }, + {12289, 12350, "W\0" }, + {12352, 12352, "F\0" }, + {12353, 12438, "W\0" }, + {12439, 12440, "F\0" }, + {12441, 12543, "W\0" }, + {12544, 12548, "F\0" }, + {12549, 12591, "W\0" }, + {12592, 12592, "F\0" }, + {12593, 12686, "W\0" }, + {12687, 12687, "F\0" }, + {12688, 12771, "W\0" }, + {12772, 12783, "F\0" }, + {12784, 12830, "W\0" }, + {12831, 12831, "F\0" }, + {12832, 12871, "W\0" }, + {12872, 12879, "A\0" }, + {12880, 19903, "W\0" }, + {19968, 42124, "W\0" }, + {42125, 42127, "F\0" }, + {42128, 42182, "W\0" }, + {42183, 42191, "F\0" }, + {42540, 42559, "F\0" }, + {42744, 42751, "F\0" }, + {42955, 42959, "F\0" }, + {42962, 42962, "F\0" }, + {42964, 42964, "F\0" }, + {42970, 42993, "F\0" }, + {43053, 43055, "F\0" }, + {43066, 43071, "F\0" }, + {43128, 43135, "F\0" }, + {43206, 43213, "F\0" }, + {43226, 43231, "F\0" }, + {43348, 43358, "F\0" }, + {43360, 43388, "W\0" }, + {43389, 43391, "F\0" }, + {43470, 43470, "F\0" }, + {43482, 43485, "F\0" }, + {43519, 43519, "F\0" }, + {43575, 43583, "F\0" }, + {43598, 43599, "F\0" }, + {43610, 43611, "F\0" }, + {43715, 43738, "F\0" }, + {43767, 43776, "F\0" }, + {43783, 43784, "F\0" }, + {43791, 43792, "F\0" }, + {43799, 43807, "F\0" }, + {43815, 43815, "F\0" }, + {43823, 43823, "F\0" }, + {43884, 43887, "F\0" }, + {44014, 44015, "F\0" }, + {44026, 44031, "F\0" }, + {44032, 55203, "W\0" }, + {55204, 55215, "F\0" }, + {55239, 55242, "F\0" }, + {55292, 55295, "F\0" }, + {57344, 63743, "A\0" }, + {63744, 64109, "W\0" }, + {64110, 64111, "F\0" }, + {64112, 64217, "W\0" }, + {64218, 
64255, "F\0" }, + {64263, 64274, "F\0" }, + {64280, 64284, "F\0" }, + {64311, 64311, "F\0" }, + {64317, 64317, "F\0" }, + {64319, 64319, "F\0" }, + {64322, 64322, "F\0" }, + {64325, 64325, "F\0" }, + {64451, 64466, "F\0" }, + {64912, 64913, "F\0" }, + {64968, 64974, "F\0" }, + {64976, 65007, "F\0" }, + {65024, 65039, "A\0" }, + {65040, 65049, "W\0" }, + {65050, 65055, "F\0" }, + {65072, 65106, "W\0" }, + {65107, 65107, "F\0" }, + {65108, 65126, "W\0" }, + {65127, 65127, "F\0" }, + {65128, 65131, "W\0" }, + {65132, 65135, "F\0" }, + {65141, 65141, "F\0" }, + {65277, 65278, "F\0" }, + {65280, 65376, "F\0" }, + {65377, 65470, "H\0" }, + {65471, 65473, "F\0" }, + {65474, 65479, "H\0" }, + {65480, 65481, "F\0" }, + {65482, 65487, "H\0" }, + {65488, 65489, "F\0" }, + {65490, 65495, "H\0" }, + {65496, 65497, "F\0" }, + {65498, 65500, "H\0" }, + {65501, 65511, "F\0" }, + {65512, 65518, "H\0" }, + {65519, 65528, "F\0" }, + {65533, 65533, "A\0" }, + {65534, 65535, "F\0" }, + {65548, 65548, "F\0" }, + {65575, 65575, "F\0" }, + {65595, 65595, "F\0" }, + {65598, 65598, "F\0" }, + {65614, 65615, "F\0" }, + {65630, 65663, "F\0" }, + {65787, 65791, "F\0" }, + {65795, 65798, "F\0" }, + {65844, 65846, "F\0" }, + {65935, 65935, "F\0" }, + {65949, 65951, "F\0" }, + {65953, 65999, "F\0" }, + {66046, 66175, "F\0" }, + {66205, 66207, "F\0" }, + {66257, 66271, "F\0" }, + {66300, 66303, "F\0" }, + {66340, 66348, "F\0" }, + {66379, 66383, "F\0" }, + {66427, 66431, "F\0" }, + {66462, 66462, "F\0" }, + {66500, 66503, "F\0" }, + {66518, 66559, "F\0" }, + {66718, 66719, "F\0" }, + {66730, 66735, "F\0" }, + {66772, 66775, "F\0" }, + {66812, 66815, "F\0" }, + {66856, 66863, "F\0" }, + {66916, 66926, "F\0" }, + {66939, 66939, "F\0" }, + {66955, 66955, "F\0" }, + {66963, 66963, "F\0" }, + {66966, 66966, "F\0" }, + {66978, 66978, "F\0" }, + {66994, 66994, "F\0" }, + {67002, 67002, "F\0" }, + {67005, 67071, "F\0" }, + {67383, 67391, "F\0" }, + {67414, 67423, "F\0" }, + {67432, 67455, "F\0" }, + 
{67462, 67462, "F\0" }, + {67505, 67505, "F\0" }, + {67515, 67583, "F\0" }, + {67590, 67591, "F\0" }, + {67593, 67593, "F\0" }, + {67638, 67638, "F\0" }, + {67641, 67643, "F\0" }, + {67645, 67646, "F\0" }, + {67670, 67670, "F\0" }, + {67743, 67750, "F\0" }, + {67760, 67807, "F\0" }, + {67827, 67827, "F\0" }, + {67830, 67834, "F\0" }, + {67868, 67870, "F\0" }, + {67898, 67902, "F\0" }, + {67904, 67967, "F\0" }, + {68024, 68027, "F\0" }, + {68048, 68049, "F\0" }, + {68100, 68100, "F\0" }, + {68103, 68107, "F\0" }, + {68116, 68116, "F\0" }, + {68120, 68120, "F\0" }, + {68150, 68151, "F\0" }, + {68155, 68158, "F\0" }, + {68169, 68175, "F\0" }, + {68185, 68191, "F\0" }, + {68256, 68287, "F\0" }, + {68327, 68330, "F\0" }, + {68343, 68351, "F\0" }, + {68406, 68408, "F\0" }, + {68438, 68439, "F\0" }, + {68467, 68471, "F\0" }, + {68498, 68504, "F\0" }, + {68509, 68520, "F\0" }, + {68528, 68607, "F\0" }, + {68681, 68735, "F\0" }, + {68787, 68799, "F\0" }, + {68851, 68857, "F\0" }, + {68904, 68911, "F\0" }, + {68922, 69215, "F\0" }, + {69247, 69247, "F\0" }, + {69290, 69290, "F\0" }, + {69294, 69295, "F\0" }, + {69298, 69375, "F\0" }, + {69416, 69423, "F\0" }, + {69466, 69487, "F\0" }, + {69514, 69551, "F\0" }, + {69580, 69599, "F\0" }, + {69623, 69631, "F\0" }, + {69710, 69713, "F\0" }, + {69750, 69758, "F\0" }, + {69827, 69836, "F\0" }, + {69838, 69839, "F\0" }, + {69865, 69871, "F\0" }, + {69882, 69887, "F\0" }, + {69941, 69941, "F\0" }, + {69960, 69967, "F\0" }, + {70007, 70015, "F\0" }, + {70112, 70112, "F\0" }, + {70133, 70143, "F\0" }, + {70162, 70162, "F\0" }, + {70207, 70271, "F\0" }, + {70279, 70279, "F\0" }, + {70281, 70281, "F\0" }, + {70286, 70286, "F\0" }, + {70302, 70302, "F\0" }, + {70314, 70319, "F\0" }, + {70379, 70383, "F\0" }, + {70394, 70399, "F\0" }, + {70404, 70404, "F\0" }, + {70413, 70414, "F\0" }, + {70417, 70418, "F\0" }, + {70441, 70441, "F\0" }, + {70449, 70449, "F\0" }, + {70452, 70452, "F\0" }, + {70458, 70458, "F\0" }, + {70469, 70470, "F\0" }, 
+ {70473, 70474, "F\0" }, + {70478, 70479, "F\0" }, + {70481, 70486, "F\0" }, + {70488, 70492, "F\0" }, + {70500, 70501, "F\0" }, + {70509, 70511, "F\0" }, + {70517, 70655, "F\0" }, + {70748, 70748, "F\0" }, + {70754, 70783, "F\0" }, + {70856, 70863, "F\0" }, + {70874, 71039, "F\0" }, + {71094, 71095, "F\0" }, + {71134, 71167, "F\0" }, + {71237, 71247, "F\0" }, + {71258, 71263, "F\0" }, + {71277, 71295, "F\0" }, + {71354, 71359, "F\0" }, + {71370, 71423, "F\0" }, + {71451, 71452, "F\0" }, + {71468, 71471, "F\0" }, + {71495, 71679, "F\0" }, + {71740, 71839, "F\0" }, + {71923, 71934, "F\0" }, + {71943, 71944, "F\0" }, + {71946, 71947, "F\0" }, + {71956, 71956, "F\0" }, + {71959, 71959, "F\0" }, + {71990, 71990, "F\0" }, + {71993, 71994, "F\0" }, + {72007, 72015, "F\0" }, + {72026, 72095, "F\0" }, + {72104, 72105, "F\0" }, + {72152, 72153, "F\0" }, + {72165, 72191, "F\0" }, + {72264, 72271, "F\0" }, + {72355, 72367, "F\0" }, + {72441, 72703, "F\0" }, + {72713, 72713, "F\0" }, + {72759, 72759, "F\0" }, + {72774, 72783, "F\0" }, + {72813, 72815, "F\0" }, + {72848, 72849, "F\0" }, + {72872, 72872, "F\0" }, + {72887, 72959, "F\0" }, + {72967, 72967, "F\0" }, + {72970, 72970, "F\0" }, + {73015, 73017, "F\0" }, + {73019, 73019, "F\0" }, + {73022, 73022, "F\0" }, + {73032, 73039, "F\0" }, + {73050, 73055, "F\0" }, + {73062, 73062, "F\0" }, + {73065, 73065, "F\0" }, + {73103, 73103, "F\0" }, + {73106, 73106, "F\0" }, + {73113, 73119, "F\0" }, + {73130, 73439, "F\0" }, + {73465, 73647, "F\0" }, + {73649, 73663, "F\0" }, + {73714, 73726, "F\0" }, + {74650, 74751, "F\0" }, + {74863, 74863, "F\0" }, + {74869, 74879, "F\0" }, + {75076, 77711, "F\0" }, + {77811, 77823, "F\0" }, + {78895, 78895, "F\0" }, + {78905, 82943, "F\0" }, + {83527, 92159, "F\0" }, + {92729, 92735, "F\0" }, + {92767, 92767, "F\0" }, + {92778, 92781, "F\0" }, + {92863, 92863, "F\0" }, + {92874, 92879, "F\0" }, + {92910, 92911, "F\0" }, + {92918, 92927, "F\0" }, + {92998, 93007, "F\0" }, + {93018, 93018, "F\0" 
}, + {93026, 93026, "F\0" }, + {93048, 93052, "F\0" }, + {93072, 93759, "F\0" }, + {93851, 93951, "F\0" }, + {94027, 94030, "F\0" }, + {94088, 94094, "F\0" }, + {94112, 94175, "F\0" }, + {94176, 94180, "W\0" }, + {94181, 94191, "F\0" }, + {94192, 94193, "W\0" }, + {94194, 94207, "F\0" }, + {94208, 100343, "W\0" }, + {100344, 100351, "F\0" }, + {100352, 101589, "W\0" }, + {101590, 101631, "F\0" }, + {101632, 101640, "W\0" }, + {101641, 110575, "F\0" }, + {110576, 110579, "W\0" }, + {110580, 110580, "F\0" }, + {110581, 110587, "W\0" }, + {110588, 110588, "F\0" }, + {110589, 110590, "W\0" }, + {110591, 110591, "F\0" }, + {110592, 110882, "W\0" }, + {110883, 110927, "F\0" }, + {110928, 110930, "W\0" }, + {110931, 110947, "F\0" }, + {110948, 110951, "W\0" }, + {110952, 110959, "F\0" }, + {110960, 111355, "W\0" }, + {111356, 113663, "F\0" }, + {113771, 113775, "F\0" }, + {113789, 113791, "F\0" }, + {113801, 113807, "F\0" }, + {113818, 113819, "F\0" }, + {113828, 118527, "F\0" }, + {118574, 118575, "F\0" }, + {118599, 118607, "F\0" }, + {118724, 118783, "F\0" }, + {119030, 119039, "F\0" }, + {119079, 119080, "F\0" }, + {119275, 119295, "F\0" }, + {119366, 119519, "F\0" }, + {119540, 119551, "F\0" }, + {119639, 119647, "F\0" }, + {119673, 119807, "F\0" }, + {119893, 119893, "F\0" }, + {119965, 119965, "F\0" }, + {119968, 119969, "F\0" }, + {119971, 119972, "F\0" }, + {119975, 119976, "F\0" }, + {119981, 119981, "F\0" }, + {119994, 119994, "F\0" }, + {119996, 119996, "F\0" }, + {120004, 120004, "F\0" }, + {120070, 120070, "F\0" }, + {120075, 120076, "F\0" }, + {120085, 120085, "F\0" }, + {120093, 120093, "F\0" }, + {120122, 120122, "F\0" }, + {120127, 120127, "F\0" }, + {120133, 120133, "F\0" }, + {120135, 120137, "F\0" }, + {120145, 120145, "F\0" }, + {120486, 120487, "F\0" }, + {120780, 120781, "F\0" }, + {121484, 121498, "F\0" }, + {121504, 121504, "F\0" }, + {121520, 122623, "F\0" }, + {122655, 122879, "F\0" }, + {122887, 122887, "F\0" }, + {122905, 122906, "F\0" }, + 
{122914, 122914, "F\0" }, + {122917, 122917, "F\0" }, + {122923, 123135, "F\0" }, + {123181, 123183, "F\0" }, + {123198, 123199, "F\0" }, + {123210, 123213, "F\0" }, + {123216, 123535, "F\0" }, + {123567, 123583, "F\0" }, + {123642, 123646, "F\0" }, + {123648, 124895, "F\0" }, + {124903, 124903, "F\0" }, + {124908, 124908, "F\0" }, + {124911, 124911, "F\0" }, + {124927, 124927, "F\0" }, + {125125, 125126, "F\0" }, + {125143, 125183, "F\0" }, + {125260, 125263, "F\0" }, + {125274, 125277, "F\0" }, + {125280, 126064, "F\0" }, + {126133, 126208, "F\0" }, + {126270, 126463, "F\0" }, + {126468, 126468, "F\0" }, + {126496, 126496, "F\0" }, + {126499, 126499, "F\0" }, + {126501, 126502, "F\0" }, + {126504, 126504, "F\0" }, + {126515, 126515, "F\0" }, + {126520, 126520, "F\0" }, + {126522, 126522, "F\0" }, + {126524, 126529, "F\0" }, + {126531, 126534, "F\0" }, + {126536, 126536, "F\0" }, + {126538, 126538, "F\0" }, + {126540, 126540, "F\0" }, + {126544, 126544, "F\0" }, + {126547, 126547, "F\0" }, + {126549, 126550, "F\0" }, + {126552, 126552, "F\0" }, + {126554, 126554, "F\0" }, + {126556, 126556, "F\0" }, + {126558, 126558, "F\0" }, + {126560, 126560, "F\0" }, + {126563, 126563, "F\0" }, + {126565, 126566, "F\0" }, + {126571, 126571, "F\0" }, + {126579, 126579, "F\0" }, + {126584, 126584, "F\0" }, + {126589, 126589, "F\0" }, + {126591, 126591, "F\0" }, + {126602, 126602, "F\0" }, + {126620, 126624, "F\0" }, + {126628, 126628, "F\0" }, + {126634, 126634, "F\0" }, + {126652, 126703, "F\0" }, + {126706, 126975, "F\0" }, + {126980, 126980, "W\0" }, + {127020, 127023, "F\0" }, + {127124, 127135, "F\0" }, + {127151, 127152, "F\0" }, + {127168, 127168, "F\0" }, + {127183, 127183, "W\0" }, + {127184, 127184, "F\0" }, + {127222, 127231, "F\0" }, + {127232, 127242, "A\0" }, + {127248, 127277, "A\0" }, + {127280, 127337, "A\0" }, + {127344, 127373, "A\0" }, + {127374, 127374, "W\0" }, + {127375, 127376, "A\0" }, + {127377, 127386, "W\0" }, + {127387, 127404, "A\0" }, + {127406, 
127461, "F\0" }, + {127488, 127490, "W\0" }, + {127491, 127503, "F\0" }, + {127504, 127547, "W\0" }, + {127548, 127551, "F\0" }, + {127552, 127560, "W\0" }, + {127561, 127567, "F\0" }, + {127568, 127569, "W\0" }, + {127570, 127583, "F\0" }, + {127584, 127589, "W\0" }, + {127590, 127743, "F\0" }, + {127744, 127776, "W\0" }, + {127789, 127797, "W\0" }, + {127799, 127868, "W\0" }, + {127870, 127891, "W\0" }, + {127904, 127946, "W\0" }, + {127951, 127955, "W\0" }, + {127968, 127984, "W\0" }, + {127988, 127988, "W\0" }, + {127992, 128062, "W\0" }, + {128064, 128064, "W\0" }, + {128066, 128252, "W\0" }, + {128255, 128317, "W\0" }, + {128331, 128334, "W\0" }, + {128336, 128359, "W\0" }, + {128378, 128378, "W\0" }, + {128405, 128406, "W\0" }, + {128420, 128420, "W\0" }, + {128507, 128591, "W\0" }, + {128640, 128709, "W\0" }, + {128716, 128716, "W\0" }, + {128720, 128722, "W\0" }, + {128725, 128727, "W\0" }, + {128728, 128732, "F\0" }, + {128733, 128735, "W\0" }, + {128747, 128748, "W\0" }, + {128749, 128751, "F\0" }, + {128756, 128764, "W\0" }, + {128765, 128767, "F\0" }, + {128884, 128895, "F\0" }, + {128985, 128991, "F\0" }, + {128992, 129003, "W\0" }, + {129004, 129007, "F\0" }, + {129008, 129008, "W\0" }, + {129009, 129023, "F\0" }, + {129036, 129039, "F\0" }, + {129096, 129103, "F\0" }, + {129114, 129119, "F\0" }, + {129160, 129167, "F\0" }, + {129198, 129199, "F\0" }, + {129202, 129279, "F\0" }, + {129292, 129338, "W\0" }, + {129340, 129349, "W\0" }, + {129351, 129535, "W\0" }, + {129620, 129631, "F\0" }, + {129646, 129647, "F\0" }, + {129648, 129652, "W\0" }, + {129653, 129655, "F\0" }, + {129656, 129660, "W\0" }, + {129661, 129663, "F\0" }, + {129664, 129670, "W\0" }, + {129671, 129679, "F\0" }, + {129680, 129708, "W\0" }, + {129709, 129711, "F\0" }, + {129712, 129722, "W\0" }, + {129723, 129727, "F\0" }, + {129728, 129733, "W\0" }, + {129734, 129743, "F\0" }, + {129744, 129753, "W\0" }, + {129754, 129759, "F\0" }, + {129760, 129767, "W\0" }, + {129768, 129775, 
"F\0" }, + {129776, 129782, "W\0" }, + {129783, 129791, "F\0" }, + {129939, 129939, "F\0" }, + {129995, 130031, "F\0" }, + {130042, 131071, "F\0" }, + {131072, 173791, "W\0" }, + {173792, 173823, "F\0" }, + {173824, 177976, "W\0" }, + {177977, 177983, "F\0" }, + {177984, 178205, "W\0" }, + {178206, 178207, "F\0" }, + {178208, 183969, "W\0" }, + {183970, 183983, "F\0" }, + {183984, 191456, "W\0" }, + {191457, 194559, "F\0" }, + {194560, 195101, "W\0" }, + {195102, 196607, "F\0" }, + {196608, 201546, "W\0" }, + {201547, 917504, "F\0" }, + {917506, 917535, "F\0" }, + {917632, 917759, "F\0" }, + {917760, 917999, "A\0" }, + {918000, 983039, "F\0" }, + {983040, 1048573, "A\0" }, + {1048574, 1048575, "F\0" }, + {1048576, 1114109, "A\0" }, + {1114110, 1114111, "F\0" }, +}; +/* Return the East Asian Width name for code point `c`: the string stored in the matching kEastAsianWidthRanges entry, or "N" when c11__search_u32_ranges() returns NULL. NOTE(review): the table tags ranges such as 1114110..1114111 as "F"; these do not look like Fullwidth characters -- confirm the generator output against EastAsianWidth.txt. */ +static const char* c11__u32_east_asian_width(int c) { + const char* data = + c11__search_u32_ranges(c, + kEastAsianWidthRanges, + sizeof(kEastAsianWidthRanges) / sizeof(c11_u32_range)); + if(data == NULL) return "N"; + return data; +} +/* unicodedata.east_asian_width(char): expects exactly one str argument containing a single character. Fails with TypeError when the character count is not 1 and with ValueError when c11__u8_header() reports 0 for the first byte; otherwise stores the width name in py_retval() as a new str and returns true. */ +static bool unicodedata_east_asian_width(int argc, py_Ref argv) { + PY_CHECK_ARGC(1); + PY_CHECK_ARG_TYPE(0, tp_str); + c11_sv sv = py_tosv(py_arg(0)); + if(c11_sv__u8_length(sv) != 1) { + return TypeError("east_asian_width() expected a character, but string of length %d found", + c11_sv__u8_length(sv)); + } + int u8bytes = c11__u8_header(sv.data[0], true); + if(u8bytes == 0) return ValueError("invalid utf-8 char: %c", sv.data[0]); + int value = c11__u8_value(u8bytes, sv.data); + const char* width = c11__u32_east_asian_width(value); + py_newstr(py_retval(), width); + return true; +} +/* Create the "unicodedata" module and bind east_asian_width() on it. */ +void pk__add_module_unicodedata() { + py_Ref mod = py_newmodule("unicodedata"); + + py_bindfunc(mod, "east_asian_width", unicodedata_east_asian_width); +} \ No newline at end of file diff --git a/src/public/modules.c b/src/public/modules.c index b6c5c694..a379d41b 100644 --- a/src/public/modules.c +++ b/src/public/modules.c @@ -472,7 +472,7 @@ static bool builtins_ord(int argc, py_Ref 
argv) { c11_sv__u8_length(sv)); } int u8bytes = c11__u8_header(sv.data[0], true); - if(u8bytes == 0) { return ValueError("invalid char: %c", sv.data[0]); } + if(u8bytes == 0) return ValueError("invalid utf-8 char: %c", sv.data[0]); int value = c11__u8_value(u8bytes, sv.data); py_newint(py_retval(), value); return true; diff --git a/tests/83_unicodedata.py b/tests/83_unicodedata.py new file mode 100644 index 00000000..4df52a41 --- /dev/null +++ b/tests/83_unicodedata.py @@ -0,0 +1,17 @@ +from unicodedata import east_asian_width + +# full width: U+FF21 FULLWIDTH LATIN CAPITAL LETTER A (not ASCII "A", which is narrow) +assert east_asian_width("Ａ") == "F" +# half width: U+FF7B HALFWIDTH KATAKANA LETTER SA (not U+30B5, which is wide) +assert east_asian_width("ｻ") == "H" +# narrow +assert east_asian_width("a") == "Na" +# wide +assert east_asian_width("测") == "W" +assert east_asian_width("🥕") == "W" +assert east_asian_width("。") == "W" +# ambiguous +assert east_asian_width("°") == "A" +# neutral +assert east_asian_width("\n") == "N" +