From 0d9bf2e7de6eb93e6b5456158d7b41fce5fcf645 Mon Sep 17 00:00:00 2001 From: blueloveTH Date: Sat, 26 Apr 2025 21:43:57 +0800 Subject: [PATCH] optimize small string --- include/pocketpy/common/str.h | 2 + include/pocketpy/interpreter/vm.h | 2 - include/pocketpy/objects/base.h | 1 + include/pocketpy/objects/namedict.h | 2 +- src/common/sstream.c | 2 +- src/compiler/compiler.c | 20 ++---- src/interpreter/heap.c | 2 +- src/interpreter/vm.c | 12 +--- src/objects/namedict.c | 26 +++++--- src/public/modules.c | 14 ++--- src/public/py_ops.c | 7 +++ src/public/py_str.c | 97 ++++++++++++++--------------- tests/04_str.py | 2 +- 13 files changed, 88 insertions(+), 101 deletions(-) diff --git a/include/pocketpy/common/str.h b/include/pocketpy/common/str.h index e9757c25..3aebfad4 100644 --- a/include/pocketpy/common/str.h +++ b/include/pocketpy/common/str.h @@ -13,6 +13,8 @@ typedef struct c11_string { char data[]; // flexible array member } c11_string; +c11_string* pk_tostr(py_Ref self); + /* bytes */ typedef struct c11_bytes { int size; diff --git a/include/pocketpy/interpreter/vm.h b/include/pocketpy/interpreter/vm.h index a63caced..8a13dcb7 100644 --- a/include/pocketpy/interpreter/vm.h +++ b/include/pocketpy/interpreter/vm.h @@ -34,8 +34,6 @@ typedef struct VM { py_Callbacks callbacks; - py_TValue ascii_literals[128 + 1]; - py_TValue last_retval; py_TValue curr_exception; diff --git a/include/pocketpy/objects/base.h b/include/pocketpy/objects/base.h index f3432aef..bc656f5e 100644 --- a/include/pocketpy/objects/base.h +++ b/include/pocketpy/objects/base.h @@ -21,5 +21,6 @@ typedef struct py_TValue { c11_vec2i _vec2i; c11_color32 _color32; void* _ptr; + char _chars[8]; }; } py_TValue; diff --git a/include/pocketpy/objects/namedict.h b/include/pocketpy/objects/namedict.h index 3b232cf2..5a168f82 100644 --- a/include/pocketpy/objects/namedict.h +++ b/include/pocketpy/objects/namedict.h @@ -14,7 +14,7 @@ /* A simple binary tree for storing modules. */ typedef struct ModuleDict { - const char* path; + char path[PK_MAX_MODULE_PATH_LEN + 1]; py_TValue module; struct ModuleDict* left; struct ModuleDict* right; diff --git a/src/common/sstream.c b/src/common/sstream.c index 0fadec3b..87a0afc3 100644 --- a/src/common/sstream.c +++ b/src/common/sstream.c @@ -11,7 +11,7 @@ void c11_sbuf__ctor(c11_sbuf* self) { c11_vector__ctor(&self->data, sizeof(char)); - c11_vector__reserve(&self->data, sizeof(c11_string) + 100); + c11_vector__reserve(&self->data, sizeof(c11_string) + 64); self->data.length = sizeof(c11_string); } diff --git a/src/compiler/compiler.c b/src/compiler/compiler.c index a4690dbe..5a7ab101 100644 --- a/src/compiler/compiler.c +++ b/src/compiler/compiler.c @@ -63,7 +63,6 @@ typedef struct Ctx { bool is_compiling_class; c11_vector /*T=Expr* */ s_expr; c11_smallmap_n2i global_names; - c11_smallmap_s2n co_consts_string_dedup_map; } Ctx; typedef struct Expr Expr; @@ -1081,7 +1080,6 @@ static void Ctx__ctor(Ctx* self, CodeObject* co, FuncDecl* func, int level) { self->is_compiling_class = false; c11_vector__ctor(&self->s_expr, sizeof(Expr*)); c11_smallmap_n2i__ctor(&self->global_names); - c11_smallmap_s2n__ctor(&self->co_consts_string_dedup_map); } static void Ctx__dtor(Ctx* self) { @@ -1091,7 +1089,6 @@ static void Ctx__dtor(Ctx* self) { } c11_vector__dtor(&self->s_expr); c11_smallmap_n2i__dtor(&self->global_names); - c11_smallmap_s2n__dtor(&self->co_consts_string_dedup_map); } static int Ctx__prepare_loop_divert(Ctx* self, int line, bool is_break) { @@ -1201,19 +1198,10 @@ static int Ctx__add_varname(Ctx* self, py_Name name) { } static int Ctx__add_const_string(Ctx* self, c11_sv key) { - uint16_t* val = c11_smallmap_s2n__try_get(&self->co_consts_string_dedup_map, key); - if(val) { - return *val; - } else { - py_TValue tmp; - py_newstrv(&tmp, key); - c11_vector__push(py_TValue, &self->co->consts, tmp); - int index = self->co->consts.length - 1; - c11_smallmap_s2n__set(&self->co_consts_string_dedup_map, - c11_string__sv(PyObject__userdata(tmp._obj)), - index); - return index; - } + py_Ref p = c11_vector__emplace(&self->co->consts); + py_newstrv(p, key); + int index = self->co->consts.length - 1; + return index; } static int Ctx__add_const(Ctx* self, py_Ref v) { diff --git a/src/interpreter/heap.c b/src/interpreter/heap.c index 37b1d2ff..e3bd1631 100644 --- a/src/interpreter/heap.c +++ b/src/interpreter/heap.c @@ -44,7 +44,7 @@ void ManagedHeap__collect_if_needed(ManagedHeap* self) { const int lower = PK_GC_MIN_THRESHOLD / 2; float free_ratio = (float)avg_freed / self->gc_threshold; int new_threshold = self->gc_threshold * (1 / free_ratio); - // printf("gc_threshold=%d, avg_freed=%d, new_threshold=%d\n", self->gc_threshold, avg_freed, new_threshold); + printf("gc_threshold=%d, avg_freed=%d, new_threshold=%d\n", self->gc_threshold, avg_freed, new_threshold); self->gc_threshold = c11__min(c11__max(new_threshold, lower), upper); } diff --git a/src/interpreter/vm.c b/src/interpreter/vm.c index 544c74f5..a14e2816 100644 --- a/src/interpreter/vm.c +++ b/src/interpreter/vm.c @@ -59,7 +59,7 @@ void VM__ctor(VM* self) { self->top_frame = NULL; InternedNames__ctor(&self->names); - ModuleDict__ctor(&self->modules, NULL, *py_NIL()); + ModuleDict__ctor(&self->modules, "", *py_NIL()); TypeList__ctor(&self->types); self->builtins = *py_NIL(); @@ -88,12 +88,6 @@ void VM__ctor(VM* self) { ValueStack__ctor(&self->stack); /* Init Builtin Types */ - for(int i = 0; i < 128; i++) { - char* p = py_newstrn(&self->ascii_literals[i], 1); - *p = i; - } - py_newstrn(&self->ascii_literals[128], 0); // empty string - // 0: unused void* placeholder = TypeList__emplace(&self->types); memset(placeholder, 0, sizeof(py_TypeInfo)); @@ -634,10 +628,6 @@ void ManagedHeap__mark(ManagedHeap* self) { for(py_TValue* p = vm->stack.begin; p != vm->stack.end; p++) { pk__mark_value(p); } - // mark ascii literals - for(int i = 0; i < c11__count_array(vm->ascii_literals); i++) { - pk__mark_value(&vm->ascii_literals[i]); - } // mark modules ModuleDict__apply_mark(&vm->modules, p_stack); // mark types diff --git a/src/objects/namedict.c b/src/objects/namedict.c index 03ae3b1c..a7e6a4ae 100644 --- a/src/objects/namedict.c +++ b/src/objects/namedict.c @@ -9,7 +9,11 @@ #undef SMALLMAP_T__SOURCE void ModuleDict__ctor(ModuleDict* self, const char* path, py_TValue module) { - self->path = path; + assert(path != NULL); + int length = strlen(path); + assert(length <= PK_MAX_MODULE_PATH_LEN); + memcpy(self->path, path, length); + self->path[length] = '\0'; self->module = module; self->left = NULL; self->right = NULL; @@ -27,10 +31,7 @@ void ModuleDict__dtor(ModuleDict* self) { } void ModuleDict__set(ModuleDict* self, const char* key, py_TValue val) { - if(self->path == NULL) { - self->path = key; - self->module = val; - } + assert(key != NULL); int cmp = strcmp(key, self->path); if(cmp < 0) { if(self->left) { @@ -52,7 +53,7 @@ void ModuleDict__set(ModuleDict* self, const char* key, py_TValue val) { } py_TValue* ModuleDict__try_get(ModuleDict* self, const char* path) { - if(self->path == NULL) return NULL; + assert(path != NULL); int cmp = strcmp(path, self->path); if(cmp < 0) { if(self->left) { @@ -72,14 +73,19 @@ py_TValue* ModuleDict__try_get(ModuleDict* self, const char* path) { } bool ModuleDict__contains(ModuleDict* self, const char* path) { + assert(path != NULL); return ModuleDict__try_get(self, path) != NULL; } void ModuleDict__apply_mark(ModuleDict* self, c11_vector* p_stack) { - PyObject* obj = self->module._obj; - if(!obj->gc_marked) { - obj->gc_marked = true; - c11_vector__push(PyObject*, p_stack, obj); + if(!py_isnil(&self->module)) { + // root node is dummy + PyObject* obj = self->module._obj; + assert(obj != NULL); + if(!obj->gc_marked) { + obj->gc_marked = true; + c11_vector__push(PyObject*, p_stack, obj); + } } if(self->left) ModuleDict__apply_mark(self->left, p_stack); if(self->right) ModuleDict__apply_mark(self->right, p_stack); diff --git a/src/public/modules.c b/src/public/modules.c index 762ef229..b4af2af9 100644 --- a/src/public/modules.c +++ b/src/public/modules.c @@ -450,15 +450,11 @@ static bool builtins_chr(int argc, py_Ref argv) { PY_CHECK_ARGC(1); PY_CHECK_ARG_TYPE(0, tp_int); uint32_t val = py_toint(py_arg(0)); - if(val >= 0 && val < 128) { - py_assign(py_retval(), &pk_current_vm->ascii_literals[val]); - } else { - // convert to utf-8 - char utf8[4]; - int len = c11__u32_to_u8(val, utf8); - if(len == -1) return ValueError("invalid unicode code point: %d", val); - py_newstrv(py_retval(), (c11_sv){utf8, len}); - } + // convert to utf-8 + char utf8[4]; + int len = c11__u32_to_u8(val, utf8); + if(len == -1) return ValueError("invalid unicode code point: %d", val); + py_newstrv(py_retval(), (c11_sv){utf8, len}); return true; } diff --git a/src/public/py_ops.c b/src/public/py_ops.c index 9495ff4a..71d13bb7 100644 --- a/src/public/py_ops.c +++ b/src/public/py_ops.c @@ -9,6 +9,13 @@ bool py_isidentical(py_Ref lhs, py_Ref rhs) { case tp_int: return lhs->_i64 == rhs->_i64; case tp_float: return lhs->_f64 == rhs->_f64; case tp_bool: return lhs->_bool == rhs->_bool; + case tp_str: { + if(lhs->is_ptr && rhs->is_ptr) { + return lhs->_obj == rhs->_obj; + } else { + return strcmp(lhs->_chars, rhs->_chars) == 0; + } + } case tp_nativefunc: return lhs->_cfunc == rhs->_cfunc; case tp_NoneType: return true; case tp_NotImplementedType: return true; diff --git a/src/public/py_str.c b/src/public/py_str.c index 5b9ef97e..03d217da 100644 --- a/src/public/py_str.c +++ b/src/public/py_str.c @@ -9,6 +9,13 @@ void py_newstr(py_Ref out, const char* data) { py_newstrv(out, (c11_sv){data, strlen(data)}); } char* py_newstrn(py_Ref out, int size) { + if(size < 8) { + out->type = tp_str; + out->is_ptr = false; + c11_string* ud = (c11_string*)(&out->extra); + c11_string__ctor3(ud, size); + return ud->data; + } ManagedHeap* heap = &pk_current_vm->heap; int total_size = sizeof(c11_string) + size + 1; PyObject* obj = ManagedHeap__gcnew(heap, tp_str, 0, total_size); @@ -21,17 +28,6 @@ char* py_newstrn(py_Ref out, int size) { } void py_newstrv(py_OutRef out, c11_sv sv) { - if(sv.size == 0) { - *out = pk_current_vm->ascii_literals[128]; - return; - } - if(sv.size == 1) { - int c = sv.data[0]; - if(c >= 0 && c < 128) { - *out = pk_current_vm->ascii_literals[c]; - return; - } - } char* data = py_newstrn(out, sv.size); memcpy(data, sv.data, sv.size); } @@ -58,22 +54,25 @@ unsigned char* py_newbytes(py_Ref out, int size) { return ud->data; } -const char* py_tostr(py_Ref self) { +c11_string* pk_tostr(py_Ref self) { assert(self->type == tp_str); - c11_string* ud = PyObject__userdata(self->_obj); - return ud->data; + if(!self->is_ptr) { + return (c11_string*)(&self->extra); + } else { + return PyObject__userdata(self->_obj); + } } +const char* py_tostr(py_Ref self) { return pk_tostr(self)->data; } + const char* py_tostrn(py_Ref self, int* size) { - assert(self->type == tp_str); - c11_string* ud = PyObject__userdata(self->_obj); + c11_string* ud = pk_tostr(self); *size = ud->size; return ud->data; } c11_sv py_tosv(py_Ref self) { - assert(self->type == tp_str); - c11_string* ud = PyObject__userdata(self->_obj); + c11_string* ud = pk_tostr(self); return c11_string__sv(ud); } @@ -116,18 +115,18 @@ static bool str__hash__(int argc, py_Ref argv) { static bool str__len__(int argc, py_Ref argv) { PY_CHECK_ARGC(1); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); py_newint(py_retval(), c11_sv__u8_length((c11_sv){self->data, self->size})); return true; } static bool str__add__(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); if(py_arg(1)->type != tp_str) { py_newnotimplemented(py_retval()); } else { - c11_string* other = py_touserdata(&argv[1]); + c11_string* other = pk_tostr(&argv[1]); char* p = py_newstrn(py_retval(), self->size + other->size); memcpy(p, self->data, self->size); memcpy(p + self->size, other->data, other->size); @@ -137,7 +136,7 @@ static bool str__add__(int argc, py_Ref argv) { static bool str__mul__(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); if(py_arg(1)->type != tp_int) { py_newnotimplemented(py_retval()); } else { @@ -158,11 +157,11 @@ static bool str__rmul__(int argc, py_Ref argv) { return str__mul__(argc, argv); static bool str__contains__(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); if(py_arg(1)->type != tp_str) { py_newnotimplemented(py_retval()); } else { - c11_string* other = py_touserdata(&argv[1]); + c11_string* other = pk_tostr(&argv[1]); const char* p = strstr(self->data, other->data); py_newbool(py_retval(), p != NULL); } @@ -194,7 +193,7 @@ static bool str__iter__(int argc, py_Ref argv) { static bool str__getitem__(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_sv self = c11_string__sv(py_touserdata(&argv[0])); + c11_sv self = c11_string__sv(pk_tostr(&argv[0])); py_Ref _1 = py_arg(1); if(_1->type == tp_int) { int index = py_toint(py_arg(1)); @@ -218,11 +217,11 @@ static bool str__getitem__(int argc, py_Ref argv) { #define DEF_STR_CMP_OP(op, __f, __cond) \ static bool str##op(int argc, py_Ref argv) { \ PY_CHECK_ARGC(2); \ - c11_string* self = py_touserdata(&argv[0]); \ + c11_string* self = pk_tostr(&argv[0]); \ if(py_arg(1)->type != tp_str) { \ py_newnotimplemented(py_retval()); \ } else { \ - c11_string* other = py_touserdata(&argv[1]); \ + c11_string* other = pk_tostr(&argv[1]); \ int res = __f(c11_string__sv(self), c11_string__sv(other)); \ py_newbool(py_retval(), __cond); \ } \ @@ -240,7 +239,7 @@ DEF_STR_CMP_OP(__ge__, c11_sv__cmp, res >= 0) static bool str_lower(int argc, py_Ref argv) { PY_CHECK_ARGC(1); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); char* p = py_newstrn(py_retval(), self->size); for(int i = 0; i < self->size; i++) { char c = self->data[i]; @@ -251,7 +250,7 @@ static bool str_lower(int argc, py_Ref argv) { static bool str_upper(int argc, py_Ref argv) { PY_CHECK_ARGC(1); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); char* p = py_newstrn(py_retval(), self->size); for(int i = 0; i < self->size; i++) { char c = self->data[i]; @@ -262,25 +261,25 @@ static bool str_upper(int argc, py_Ref argv) { static bool str_startswith(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); PY_CHECK_ARG_TYPE(1, tp_str); - c11_string* other = py_touserdata(&argv[1]); + c11_string* other = pk_tostr(&argv[1]); py_newbool(py_retval(), c11_sv__startswith(c11_string__sv(self), c11_string__sv(other))); return true; } static bool str_endswith(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); PY_CHECK_ARG_TYPE(1, tp_str); - c11_string* other = py_touserdata(&argv[1]); + c11_string* other = pk_tostr(&argv[1]); py_newbool(py_retval(), c11_sv__endswith(c11_string__sv(self), c11_string__sv(other))); return true; } static bool str_join(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_sv self = c11_string__sv(py_touserdata(argv)); + c11_sv self = c11_string__sv(pk_tostr(argv)); if(!py_iter(py_arg(1))) return false; py_push(py_retval()); // iter @@ -302,7 +301,7 @@ static bool str_join(int argc, py_Ref argv) { c11_sbuf__dtor(&buf); return false; } - c11_string* item = py_touserdata(py_retval()); + c11_string* item = pk_tostr(py_retval()); c11_sbuf__write_cstrn(&buf, item->data, item->size); first = false; } @@ -314,11 +313,11 @@ static bool str_join(int argc, py_Ref argv) { static bool str_replace(int argc, py_Ref argv) { PY_CHECK_ARGC(3); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); PY_CHECK_ARG_TYPE(1, tp_str); PY_CHECK_ARG_TYPE(2, tp_str); - c11_string* old = py_touserdata(&argv[1]); - c11_string* new_ = py_touserdata(&argv[2]); + c11_string* old = pk_tostr(&argv[1]); + c11_string* new_ = pk_tostr(&argv[2]); c11_string* res = c11_sv__replace2(c11_string__sv(self), c11_string__sv(old), c11_string__sv(new_)); py_newstrv(py_retval(), (c11_sv){res->data, res->size}); @@ -327,7 +326,7 @@ static bool str_replace(int argc, py_Ref argv) { } static bool str_split(int argc, py_Ref argv) { - c11_sv self = c11_string__sv(py_touserdata(&argv[0])); + c11_sv self = c11_string__sv(pk_tostr(&argv[0])); c11_vector res; bool discard_empty = false; if(argc > 2) return TypeError("split() takes at most 2 arguments"); @@ -339,7 +338,7 @@ static bool str_split(int argc, py_Ref argv) { if(argc == 2) { // sep = argv[1] if(!py_checkstr(&argv[1])) return false; - c11_sv sep = c11_string__sv(py_touserdata(&argv[1])); + c11_sv sep = c11_string__sv(pk_tostr(&argv[1])); if(sep.size == 0) return ValueError("empty separator"); res = c11_sv__split2(self, sep); } @@ -355,22 +354,22 @@ static bool str_split(int argc, py_Ref argv) { static bool str_count(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); PY_CHECK_ARG_TYPE(1, tp_str); - c11_string* sub = py_touserdata(&argv[1]); + c11_string* sub = pk_tostr(&argv[1]); int res = c11_sv__count(c11_string__sv(self), c11_string__sv(sub)); py_newint(py_retval(), res); return true; } static bool str__strip_impl(bool left, bool right, int argc, py_Ref argv) { - c11_sv self = c11_string__sv(py_touserdata(&argv[0])); + c11_sv self = c11_string__sv(pk_tostr(&argv[0])); c11_sv chars; if(argc == 1) { chars = (c11_sv){" \t\n\r", 4}; } else if(argc == 2) { if(!py_checkstr(&argv[1])) return false; - chars = c11_string__sv(py_touserdata(&argv[1])); + chars = c11_string__sv(pk_tostr(&argv[1])); } else { return TypeError("strip() takes at most 2 arguments"); } @@ -387,7 +386,7 @@ static bool str_rstrip(int argc, py_Ref argv) { return str__strip_impl(false, tr static bool str_zfill(int argc, py_Ref argv) { PY_CHECK_ARGC(2); - c11_sv self = c11_string__sv(py_touserdata(&argv[0])); + c11_sv self = c11_string__sv(pk_tostr(&argv[0])); PY_CHECK_ARG_TYPE(1, tp_int); int width = py_toint(py_arg(1)); int delta = width - c11_sv__u8_length(self); @@ -412,12 +411,12 @@ static bool str__widthjust_impl(bool left, int argc, py_Ref argv) { pad = ' '; } else { if(!py_checkstr(&argv[2])) return false; - c11_string* padstr = py_touserdata(&argv[2]); + c11_string* padstr = pk_tostr(&argv[2]); if(padstr->size != 1) return TypeError("The fill character must be exactly one character long"); pad = padstr->data[0]; } - c11_sv self = c11_string__sv(py_touserdata(&argv[0])); + c11_sv self = c11_string__sv(pk_tostr(&argv[0])); PY_CHECK_ARG_TYPE(1, tp_int); int width = py_toint(py_arg(1)); if(width <= self.size) { @@ -452,9 +451,9 @@ static bool str_find(int argc, py_Ref argv) { PY_CHECK_ARG_TYPE(2, tp_int); start = py_toint(py_arg(2)); } - c11_string* self = py_touserdata(&argv[0]); + c11_string* self = pk_tostr(&argv[0]); PY_CHECK_ARG_TYPE(1, tp_str); - c11_string* sub = py_touserdata(&argv[1]); + c11_string* sub = pk_tostr(&argv[1]); int res = c11_sv__index2(c11_string__sv(self), c11_string__sv(sub), start); py_newint(py_retval(), res); return true; diff --git a/tests/04_str.py b/tests/04_str.py index 9b855798..dfe8a984 100644 --- a/tests/04_str.py +++ b/tests/04_str.py @@ -190,7 +190,7 @@ assert (1 != '1') is True assert (1 == '1') is False assert 1 == 1.0 -assert chr(97) is 'a' +assert chr(97) == 'a' assert ord('a') == 97 assert ord('🥕') == 0x1f955