optimize small string

This commit is contained in:
blueloveTH 2025-04-26 21:43:57 +08:00
parent 2b44f11ed7
commit 0d9bf2e7de
13 changed files with 88 additions and 101 deletions

View File

@ -13,6 +13,8 @@ typedef struct c11_string {
char data[]; // flexible array member
} c11_string;
c11_string* pk_tostr(py_Ref self);
/* bytes */
typedef struct c11_bytes {
int size;

View File

@ -34,8 +34,6 @@ typedef struct VM {
py_Callbacks callbacks;
py_TValue ascii_literals[128 + 1];
py_TValue last_retval;
py_TValue curr_exception;

View File

@ -21,5 +21,6 @@ typedef struct py_TValue {
c11_vec2i _vec2i;
c11_color32 _color32;
void* _ptr;
char _chars[8];
};
} py_TValue;

View File

@ -14,7 +14,7 @@
/* A simple binary tree for storing modules. */
typedef struct ModuleDict {
const char* path;
char path[PK_MAX_MODULE_PATH_LEN + 1];
py_TValue module;
struct ModuleDict* left;
struct ModuleDict* right;

View File

@ -11,7 +11,7 @@
// Initializes a string buffer: constructs the backing vector with one-byte
// elements, reserves capacity for a c11_string header plus some payload, and
// sets the vector length to sizeof(c11_string).
void c11_sbuf__ctor(c11_sbuf* self) {
c11_vector__ctor(&self->data, sizeof(char));
// NOTE(review): two reserve calls (payload 100, then 64) appear back to back;
// this looks like the old and new line of a diff shown together — confirm
// which one is meant to be live.
c11_vector__reserve(&self->data, sizeof(c11_string) + 100);
c11_vector__reserve(&self->data, sizeof(c11_string) + 64);
// Length starts past the header-sized prefix rather than at 0.
self->data.length = sizeof(c11_string);
}

View File

@ -63,7 +63,6 @@ typedef struct Ctx {
bool is_compiling_class;
c11_vector /*T=Expr* */ s_expr;
c11_smallmap_n2i global_names;
c11_smallmap_s2n co_consts_string_dedup_map;
} Ctx;
typedef struct Expr Expr;
@ -1081,7 +1080,6 @@ static void Ctx__ctor(Ctx* self, CodeObject* co, FuncDecl* func, int level) {
self->is_compiling_class = false;
c11_vector__ctor(&self->s_expr, sizeof(Expr*));
c11_smallmap_n2i__ctor(&self->global_names);
c11_smallmap_s2n__ctor(&self->co_consts_string_dedup_map);
}
static void Ctx__dtor(Ctx* self) {
@ -1091,7 +1089,6 @@ static void Ctx__dtor(Ctx* self) {
}
c11_vector__dtor(&self->s_expr);
c11_smallmap_n2i__dtor(&self->global_names);
c11_smallmap_s2n__dtor(&self->co_consts_string_dedup_map);
}
static int Ctx__prepare_loop_divert(Ctx* self, int line, bool is_break) {
@ -1201,20 +1198,11 @@ static int Ctx__add_varname(Ctx* self, py_Name name) {
}
// Adds the string `key` to the code object's constant table (self->co->consts)
// and returns the index of the constant.
// NOTE(review): this text mixes two variants: one that consults
// co_consts_string_dedup_map and pushes a temporary, and one that emplaces
// directly into the constant table. It looks like a diff with old and new
// lines shown together — confirm which statements are live.
static int Ctx__add_const_string(Ctx* self, c11_sv key) {
// Look for an index already recorded for this string.
uint16_t* val = c11_smallmap_s2n__try_get(&self->co_consts_string_dedup_map, key);
if(val) {
return *val;
} else {
py_TValue tmp;
py_newstrv(&tmp, key);
c11_vector__push(py_TValue, &self->co->consts, tmp);
py_Ref p = c11_vector__emplace(&self->co->consts);
py_newstrv(p, key);
// The constant just appended is the last element.
int index = self->co->consts.length - 1;
// The map key is a view of the stored string object's data, not of `key`.
c11_smallmap_s2n__set(&self->co_consts_string_dedup_map,
c11_string__sv(PyObject__userdata(tmp._obj)),
index);
return index;
}
}
static int Ctx__add_const(Ctx* self, py_Ref v) {
assert(v->type != tp_str);

View File

@ -44,7 +44,7 @@ void ManagedHeap__collect_if_needed(ManagedHeap* self) {
const int lower = PK_GC_MIN_THRESHOLD / 2;
float free_ratio = (float)avg_freed / self->gc_threshold;
int new_threshold = self->gc_threshold * (1 / free_ratio);
// printf("gc_threshold=%d, avg_freed=%d, new_threshold=%d\n", self->gc_threshold, avg_freed, new_threshold);
printf("gc_threshold=%d, avg_freed=%d, new_threshold=%d\n", self->gc_threshold, avg_freed, new_threshold);
self->gc_threshold = c11__min(c11__max(new_threshold, lower), upper);
}

View File

@ -59,7 +59,7 @@ void VM__ctor(VM* self) {
self->top_frame = NULL;
InternedNames__ctor(&self->names);
ModuleDict__ctor(&self->modules, NULL, *py_NIL());
ModuleDict__ctor(&self->modules, "", *py_NIL());
TypeList__ctor(&self->types);
self->builtins = *py_NIL();
@ -88,12 +88,6 @@ void VM__ctor(VM* self) {
ValueStack__ctor(&self->stack);
/* Init Builtin Types */
for(int i = 0; i < 128; i++) {
char* p = py_newstrn(&self->ascii_literals[i], 1);
*p = i;
}
py_newstrn(&self->ascii_literals[128], 0); // empty string
// 0: unused
void* placeholder = TypeList__emplace(&self->types);
memset(placeholder, 0, sizeof(py_TypeInfo));
@ -634,10 +628,6 @@ void ManagedHeap__mark(ManagedHeap* self) {
for(py_TValue* p = vm->stack.begin; p != vm->stack.end; p++) {
pk__mark_value(p);
}
// mark ascii literals
for(int i = 0; i < c11__count_array(vm->ascii_literals); i++) {
pk__mark_value(&vm->ascii_literals[i]);
}
// mark modules
ModuleDict__apply_mark(&vm->modules, p_stack);
// mark types

View File

@ -9,7 +9,11 @@
#undef SMALLMAP_T__SOURCE
void ModuleDict__ctor(ModuleDict* self, const char* path, py_TValue module) {
self->path = path;
assert(path != NULL);
int length = strlen(path);
assert(length <= PK_MAX_MODULE_PATH_LEN);
memcpy(self->path, path, length);
self->path[length] = '\0';
self->module = module;
self->left = NULL;
self->right = NULL;
@ -27,10 +31,7 @@ void ModuleDict__dtor(ModuleDict* self) {
}
void ModuleDict__set(ModuleDict* self, const char* key, py_TValue val) {
if(self->path == NULL) {
self->path = key;
self->module = val;
}
assert(key != NULL);
int cmp = strcmp(key, self->path);
if(cmp < 0) {
if(self->left) {
@ -52,7 +53,7 @@ void ModuleDict__set(ModuleDict* self, const char* key, py_TValue val) {
}
py_TValue* ModuleDict__try_get(ModuleDict* self, const char* path) {
if(self->path == NULL) return NULL;
assert(path != NULL);
int cmp = strcmp(path, self->path);
if(cmp < 0) {
if(self->left) {
@ -72,15 +73,20 @@ py_TValue* ModuleDict__try_get(ModuleDict* self, const char* path) {
}
// Returns true when ModuleDict__try_get finds an entry for `path` in the tree
// rooted at `self`. `path` must be non-NULL (checked by assert).
bool ModuleDict__contains(ModuleDict* self, const char* path) {
    assert(path != NULL);
    py_TValue* found = ModuleDict__try_get(self, path);
    return found != NULL;
}
// Marks the module object held by every node of the tree rooted at `self`,
// pushing each newly marked object onto `p_stack`. Nodes whose module slot is
// nil (the dummy root) hold no object and are skipped. Visit order is: the
// node itself, then its left subtree, then its right subtree.
void ModuleDict__apply_mark(ModuleDict* self, c11_vector* p_stack) {
    ModuleDict* node = self;
    do {
        if(!py_isnil(&node->module)) {
            PyObject* target = node->module._obj;
            assert(target != NULL);
            if(!target->gc_marked) {
                target->gc_marked = true;
                c11_vector__push(PyObject*, p_stack, target);
            }
        }
        // Left subtree is handled recursively; the right child is followed by
        // the loop, which yields the same order as recursing on it last.
        if(node->left != NULL) ModuleDict__apply_mark(node->left, p_stack);
        node = node->right;
    } while(node != NULL);
}

View File

// Builtin chr(): takes exactly one int argument and stores the corresponding
// string in py_retval(). Returns true on success; on an invalid code point it
// returns the result of ValueError(...).
static bool builtins_chr(int argc, py_Ref argv) {
PY_CHECK_ARGC(1);
PY_CHECK_ARG_TYPE(0, tp_int);
// NOTE(review): the argument is narrowed to uint32_t here, so the `val >= 0`
// test below is always true; how negative or very large ints behave after
// this conversion should be confirmed.
uint32_t val = py_toint(py_arg(0));
if(val >= 0 && val < 128) {
// Below 128: reuse the VM's ascii_literals entry for this value.
py_assign(py_retval(), &pk_current_vm->ascii_literals[val]);
} else {
// convert to utf-8
char utf8[4];
int len = c11__u32_to_u8(val, utf8);
// A length of -1 is reported as an invalid code point.
if(len == -1) return ValueError("invalid unicode code point: %d", val);
py_newstrv(py_retval(), (c11_sv){utf8, len});
}
return true;
}

View File

@ -9,6 +9,13 @@ bool py_isidentical(py_Ref lhs, py_Ref rhs) {
case tp_int: return lhs->_i64 == rhs->_i64;
case tp_float: return lhs->_f64 == rhs->_f64;
case tp_bool: return lhs->_bool == rhs->_bool;
case tp_str: {
if(lhs->is_ptr && rhs->is_ptr) {
return lhs->_obj == rhs->_obj;
} else {
return strcmp(lhs->_chars, rhs->_chars) == 0;
}
}
case tp_nativefunc: return lhs->_cfunc == rhs->_cfunc;
case tp_NoneType: return true;
case tp_NotImplementedType: return true;

View File

@ -9,6 +9,13 @@
// Creates a new str in `out` from the NUL-terminated C string `data`; the
// length is measured with strlen and the work is delegated to py_newstrv.
void py_newstr(py_Ref out, const char* data) {
    c11_sv view = {data, strlen(data)};
    py_newstrv(out, view);
}
char* py_newstrn(py_Ref out, int size) {
if(size < 8) {
out->type = tp_str;
out->is_ptr = false;
c11_string* ud = (c11_string*)(&out->extra);
c11_string__ctor3(ud, size);
return ud->data;
}
ManagedHeap* heap = &pk_current_vm->heap;
int total_size = sizeof(c11_string) + size + 1;
PyObject* obj = ManagedHeap__gcnew(heap, tp_str, 0, total_size);
@ -21,17 +28,6 @@ char* py_newstrn(py_Ref out, int size) {
}
// Creates a str in `out` holding the bytes described by `sv`.
void py_newstrv(py_OutRef out, c11_sv sv) {
// Empty string: copy the VM's literal stored at ascii_literals[128].
if(sv.size == 0) {
*out = pk_current_vm->ascii_literals[128];
return;
}
// One byte in the range [0, 128): copy the matching ascii_literals entry.
if(sv.size == 1) {
int c = sv.data[0];
if(c >= 0 && c < 128) {
*out = pk_current_vm->ascii_literals[c];
return;
}
}
// General case: obtain storage for sv.size bytes and copy the data into it.
char* data = py_newstrn(out, sv.size);
memcpy(data, sv.data, sv.size);
}
@ -58,22 +54,25 @@ unsigned char* py_newbytes(py_Ref out, int size) {
return ud->data;
}
const char* py_tostr(py_Ref self) {
c11_string* pk_tostr(py_Ref self) {
assert(self->type == tp_str);
c11_string* ud = PyObject__userdata(self->_obj);
return ud->data;
if(!self->is_ptr) {
return (c11_string*)(&self->extra);
} else {
return PyObject__userdata(self->_obj);
}
}
const char* py_tostr(py_Ref self) { return pk_tostr(self)->data; }
// Returns the character data of the str `self` and writes its stored size to
// `*size`. `self` must have type tp_str (checked by assert).
const char* py_tostrn(py_Ref self, int* size) {
assert(self->type == tp_str);
// NOTE(review): `ud` is declared twice (object userdata vs. pk_tostr); this
// looks like the old and new line of a diff shown together — confirm which
// one is live.
c11_string* ud = PyObject__userdata(self->_obj);
c11_string* ud = pk_tostr(self);
*size = ud->size;
return ud->data;
}
// Returns a c11_sv view of the str `self`, built with c11_string__sv.
// `self` must have type tp_str (checked by assert).
c11_sv py_tosv(py_Ref self) {
assert(self->type == tp_str);
// NOTE(review): `ud` is declared twice (object userdata vs. pk_tostr); this
// looks like the old and new line of a diff shown together — confirm which
// one is live.
c11_string* ud = PyObject__userdata(self->_obj);
c11_string* ud = pk_tostr(self);
return c11_string__sv(ud);
}
@ -116,18 +115,18 @@ static bool str__hash__(int argc, py_Ref argv) {
// str.__len__: takes exactly one argument and stores in py_retval() the value
// of c11_sv__u8_length over the string's data and size.
static bool str__len__(int argc, py_Ref argv) {
PY_CHECK_ARGC(1);
// NOTE(review): `self` is declared twice (py_touserdata vs. pk_tostr); this
// looks like the old and new line of a diff shown together — confirm which
// one is live.
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
py_newint(py_retval(), c11_sv__u8_length((c11_sv){self->data, self->size}));
return true;
}
static bool str__add__(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
if(py_arg(1)->type != tp_str) {
py_newnotimplemented(py_retval());
} else {
c11_string* other = py_touserdata(&argv[1]);
c11_string* other = pk_tostr(&argv[1]);
char* p = py_newstrn(py_retval(), self->size + other->size);
memcpy(p, self->data, self->size);
memcpy(p + self->size, other->data, other->size);
@ -137,7 +136,7 @@ static bool str__add__(int argc, py_Ref argv) {
static bool str__mul__(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
if(py_arg(1)->type != tp_int) {
py_newnotimplemented(py_retval());
} else {
@ -158,11 +157,11 @@ static bool str__rmul__(int argc, py_Ref argv) { return str__mul__(argc, argv);
static bool str__contains__(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
if(py_arg(1)->type != tp_str) {
py_newnotimplemented(py_retval());
} else {
c11_string* other = py_touserdata(&argv[1]);
c11_string* other = pk_tostr(&argv[1]);
const char* p = strstr(self->data, other->data);
py_newbool(py_retval(), p != NULL);
}
@ -194,7 +193,7 @@ static bool str__iter__(int argc, py_Ref argv) {
static bool str__getitem__(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
c11_sv self = c11_string__sv(pk_tostr(&argv[0]));
py_Ref _1 = py_arg(1);
if(_1->type == tp_int) {
int index = py_toint(py_arg(1));
@ -218,11 +217,11 @@ static bool str__getitem__(int argc, py_Ref argv) {
#define DEF_STR_CMP_OP(op, __f, __cond) \
static bool str##op(int argc, py_Ref argv) { \
PY_CHECK_ARGC(2); \
c11_string* self = py_touserdata(&argv[0]); \
c11_string* self = pk_tostr(&argv[0]); \
if(py_arg(1)->type != tp_str) { \
py_newnotimplemented(py_retval()); \
} else { \
c11_string* other = py_touserdata(&argv[1]); \
c11_string* other = pk_tostr(&argv[1]); \
int res = __f(c11_string__sv(self), c11_string__sv(other)); \
py_newbool(py_retval(), __cond); \
} \
@ -240,7 +239,7 @@ DEF_STR_CMP_OP(__ge__, c11_sv__cmp, res >= 0)
static bool str_lower(int argc, py_Ref argv) {
PY_CHECK_ARGC(1);
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
char* p = py_newstrn(py_retval(), self->size);
for(int i = 0; i < self->size; i++) {
char c = self->data[i];
@ -251,7 +250,7 @@ static bool str_lower(int argc, py_Ref argv) {
static bool str_upper(int argc, py_Ref argv) {
PY_CHECK_ARGC(1);
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
char* p = py_newstrn(py_retval(), self->size);
for(int i = 0; i < self->size; i++) {
char c = self->data[i];
@ -262,25 +261,25 @@ static bool str_upper(int argc, py_Ref argv) {
// str.startswith(other): takes exactly two arguments, the second of which must
// be a str; stores the boolean result of c11_sv__startswith in py_retval().
static bool str_startswith(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
// NOTE(review): `self` and `other` are each declared twice (py_touserdata vs.
// pk_tostr); this looks like old and new diff lines shown together — confirm
// which ones are live.
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
PY_CHECK_ARG_TYPE(1, tp_str);
c11_string* other = py_touserdata(&argv[1]);
c11_string* other = pk_tostr(&argv[1]);
py_newbool(py_retval(), c11_sv__startswith(c11_string__sv(self), c11_string__sv(other)));
return true;
}
// str.endswith(other): takes exactly two arguments, the second of which must
// be a str; stores the boolean result of c11_sv__endswith in py_retval().
static bool str_endswith(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
// NOTE(review): `self` and `other` are each declared twice (py_touserdata vs.
// pk_tostr); this looks like old and new diff lines shown together — confirm
// which ones are live.
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
PY_CHECK_ARG_TYPE(1, tp_str);
c11_string* other = py_touserdata(&argv[1]);
c11_string* other = pk_tostr(&argv[1]);
py_newbool(py_retval(), c11_sv__endswith(c11_string__sv(self), c11_string__sv(other)));
return true;
}
static bool str_join(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
c11_sv self = c11_string__sv(py_touserdata(argv));
c11_sv self = c11_string__sv(pk_tostr(argv));
if(!py_iter(py_arg(1))) return false;
py_push(py_retval()); // iter
@ -302,7 +301,7 @@ static bool str_join(int argc, py_Ref argv) {
c11_sbuf__dtor(&buf);
return false;
}
c11_string* item = py_touserdata(py_retval());
c11_string* item = pk_tostr(py_retval());
c11_sbuf__write_cstrn(&buf, item->data, item->size);
first = false;
}
@ -314,11 +313,11 @@ static bool str_join(int argc, py_Ref argv) {
static bool str_replace(int argc, py_Ref argv) {
PY_CHECK_ARGC(3);
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
PY_CHECK_ARG_TYPE(1, tp_str);
PY_CHECK_ARG_TYPE(2, tp_str);
c11_string* old = py_touserdata(&argv[1]);
c11_string* new_ = py_touserdata(&argv[2]);
c11_string* old = pk_tostr(&argv[1]);
c11_string* new_ = pk_tostr(&argv[2]);
c11_string* res =
c11_sv__replace2(c11_string__sv(self), c11_string__sv(old), c11_string__sv(new_));
py_newstrv(py_retval(), (c11_sv){res->data, res->size});
@ -327,7 +326,7 @@ static bool str_replace(int argc, py_Ref argv) {
}
static bool str_split(int argc, py_Ref argv) {
c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
c11_sv self = c11_string__sv(pk_tostr(&argv[0]));
c11_vector res;
bool discard_empty = false;
if(argc > 2) return TypeError("split() takes at most 2 arguments");
@ -339,7 +338,7 @@ static bool str_split(int argc, py_Ref argv) {
if(argc == 2) {
// sep = argv[1]
if(!py_checkstr(&argv[1])) return false;
c11_sv sep = c11_string__sv(py_touserdata(&argv[1]));
c11_sv sep = c11_string__sv(pk_tostr(&argv[1]));
if(sep.size == 0) return ValueError("empty separator");
res = c11_sv__split2(self, sep);
}
@ -355,22 +354,22 @@ static bool str_split(int argc, py_Ref argv) {
// str.count(sub): takes exactly two arguments, the second of which must be a
// str; stores the integer returned by c11_sv__count in py_retval().
static bool str_count(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
// NOTE(review): `self` and `sub` are each declared twice (py_touserdata vs.
// pk_tostr); this looks like old and new diff lines shown together — confirm
// which ones are live.
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
PY_CHECK_ARG_TYPE(1, tp_str);
c11_string* sub = py_touserdata(&argv[1]);
c11_string* sub = pk_tostr(&argv[1]);
int res = c11_sv__count(c11_string__sv(self), c11_string__sv(sub));
py_newint(py_retval(), res);
return true;
}
static bool str__strip_impl(bool left, bool right, int argc, py_Ref argv) {
c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
c11_sv self = c11_string__sv(pk_tostr(&argv[0]));
c11_sv chars;
if(argc == 1) {
chars = (c11_sv){" \t\n\r", 4};
} else if(argc == 2) {
if(!py_checkstr(&argv[1])) return false;
chars = c11_string__sv(py_touserdata(&argv[1]));
chars = c11_string__sv(pk_tostr(&argv[1]));
} else {
return TypeError("strip() takes at most 2 arguments");
}
@ -387,7 +386,7 @@ static bool str_rstrip(int argc, py_Ref argv) { return str__strip_impl(false, tr
static bool str_zfill(int argc, py_Ref argv) {
PY_CHECK_ARGC(2);
c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
c11_sv self = c11_string__sv(pk_tostr(&argv[0]));
PY_CHECK_ARG_TYPE(1, tp_int);
int width = py_toint(py_arg(1));
int delta = width - c11_sv__u8_length(self);
@ -412,12 +411,12 @@ static bool str__widthjust_impl(bool left, int argc, py_Ref argv) {
pad = ' ';
} else {
if(!py_checkstr(&argv[2])) return false;
c11_string* padstr = py_touserdata(&argv[2]);
c11_string* padstr = pk_tostr(&argv[2]);
if(padstr->size != 1)
return TypeError("The fill character must be exactly one character long");
pad = padstr->data[0];
}
c11_sv self = c11_string__sv(py_touserdata(&argv[0]));
c11_sv self = c11_string__sv(pk_tostr(&argv[0]));
PY_CHECK_ARG_TYPE(1, tp_int);
int width = py_toint(py_arg(1));
if(width <= self.size) {
@ -452,9 +451,9 @@ static bool str_find(int argc, py_Ref argv) {
PY_CHECK_ARG_TYPE(2, tp_int);
start = py_toint(py_arg(2));
}
c11_string* self = py_touserdata(&argv[0]);
c11_string* self = pk_tostr(&argv[0]);
PY_CHECK_ARG_TYPE(1, tp_str);
c11_string* sub = py_touserdata(&argv[1]);
c11_string* sub = pk_tostr(&argv[1]);
int res = c11_sv__index2(c11_string__sv(self), c11_string__sv(sub), start);
py_newint(py_retval(), res);
return true;

View File

@ -190,7 +190,7 @@ assert (1 != '1') is True
assert (1 == '1') is False
assert 1 == 1.0
assert chr(97) is 'a'
assert chr(97) == 'a'
assert ord('a') == 97
assert ord('🥕') == 0x1f955