diff --git a/amalgamate.py b/amalgamate.py index 99ef93fe..9775e061 100644 --- a/amalgamate.py +++ b/amalgamate.py @@ -6,7 +6,7 @@ with open("src/opcodes.h", "rt", encoding='utf-8') as f: OPCODES_TEXT = f.read() pipeline = [ - ["common.h", "vector.h", "memory.h", "str.h", "tuplelist.h", "namedict.h", "error.h", "lexer.h"], + ["common.h", "memory.h", "vector.h", "str.h", "tuplelist.h", "namedict.h", "error.h", "lexer.h"], ["obj.h", "codeobject.h", "frame.h"], ["gc.h", "vm.h", "ceval.h", "expr.h", "compiler.h", "repl.h"], ["iter.h", "cffi.h", "io.h", "_generated.h", "pocketpy.h"] diff --git a/src/iter.h b/src/iter.h index e7cf6f04..bd048c05 100644 --- a/src/iter.h +++ b/src/iter.h @@ -49,6 +49,8 @@ public: StringIter(VM* vm, PyObject* ref) : BaseIter(vm), ref(ref), index(0) {} PyObject* next() override{ + // TODO: optimize this to use iterator + // operator[] is O(n) complexity Str* str = &OBJ_GET(Str, ref); if(index == str->u8_length()) return nullptr; return VAR(str->u8_getitem(index++)); diff --git a/src/lexer.h b/src/lexer.h index cf96cd8e..6ed245ea 100644 --- a/src/lexer.h +++ b/src/lexer.h @@ -2,7 +2,6 @@ #include "common.h" #include "error.h" -#include "new_str.h" #include "str.h" namespace pkpy{ diff --git a/src/memory.h b/src/memory.h index 5c10a8ad..fe84cbbe 100644 --- a/src/memory.h +++ b/src/memory.h @@ -73,7 +73,6 @@ struct LinkedListNode{ LinkedListNode* next; }; - template struct DoubleLinkedList{ static_assert(std::is_base_of_v); diff --git a/src/new_str.h b/src/new_str.h deleted file mode 100644 index 031c8433..00000000 --- a/src/new_str.h +++ /dev/null @@ -1,177 +0,0 @@ -#pragma once - -#include "common.h" -#include "memory.h" - -namespace pkpy{ - -inline int utf8len(unsigned char c){ - if((c & 0b10000000) == 0) return 1; - if((c & 0b11100000) == 0b11000000) return 2; - if((c & 0b11110000) == 0b11100000) return 3; - if((c & 0b11111000) == 0b11110000) return 4; - if((c & 0b11111100) == 0b11111000) return 5; - if((c & 0b11111110) == 0b11111100) return 6; - return 0; -} - -struct String{ - int size; - bool is_ascii; - char* data; - - String(): size(0), is_ascii(true), data((char*)pool64.alloc(0)) {} - - String(int size, bool is_ascii): size(size), is_ascii(is_ascii) { - data = (char*)pool64.alloc(size); - } - - String(const char* str): size(strlen(str)), is_ascii(true) { - data = (char*)pool64.alloc(size); - for(int i=0; i(const String& other) const { - int ret = strncmp(data, other.data, std::min(size, other.size)); - if(ret != 0) return ret > 0; - return size > other.size; - } - - bool operator<=(const String& other) const { - int ret = strncmp(data, other.data, std::min(size, other.size)); - if(ret != 0) return ret < 0; - return size <= other.size; - } - - bool operator>=(const String& other) const { - int ret = strncmp(data, other.data, std::min(size, other.size)); - if(ret != 0) return ret > 0; - return size >= other.size; - } - - String substr(int start, int len) const { - String ret(len, is_ascii); - memcpy(ret.data, data + start, len); - return ret; - } - - char* dup_c_str() const { - char* p = (char*)malloc(size + 1); - memcpy(p, data, size); - p[size] = 0; - return p; - } - - std::string_view view() const { - return std::string_view(data, size); - } - - std::string str() const { - return std::string(data, size); - } - - String lstrip() const { - std::string copy = str(); - copy.erase(copy.begin(), std::find_if(copy.begin(), copy.end(), [](char c) { - // std::isspace(c) does not working on windows (Debug) - return c != ' ' && c != '\t' && c != '\r' && c != '\n'; - })); - return String(copy.c_str()); - } - - /*************unicode*************/ - - int _u8_index(int i) const{ - if(is_ascii) return i; - int j = 0; - while(i > 0){ - j += utf8len(data[j]); - i--; - } - return j; - } - - String u8_getitem(int i) const { - i = _u8_index(i); - return substr(i, utf8len(data[i])); - } - - String u8_slice(int start, int end) const{ - start = _u8_index(start); - end = _u8_index(end); - return substr(start, end - start); - } -}; - -} // namespace pkpy \ No newline at end of file diff --git a/src/pocketpy.h b/src/pocketpy.h index 4ce389d1..3459c8e3 100644 --- a/src/pocketpy.h +++ b/src/pocketpy.h @@ -131,8 +131,8 @@ inline void init_builtins(VM* _vm) { _vm->bind_builtin_func<1>("ord", [](VM* vm, Args& args) { const Str& s = CAST(Str&, args[0]); - if (s.size() != 1) vm->TypeError("ord() expected an ASCII character"); - return VAR((i64)(s.c_str()[0])); + if (s.length()!=1) vm->TypeError("ord() expected an ASCII character"); + return VAR((i64)(s[0])); }); _vm->bind_builtin_func<2>("hasattr", [](VM* vm, Args& args) { @@ -237,8 +237,8 @@ inline void init_builtins(VM* _vm) { const Str& s = CAST(Str&, args[0]); try{ size_t parsed = 0; - i64 val = S_TO_INT(s, &parsed, 10); - if(parsed != s.size()) throw std::invalid_argument(""); + i64 val = S_TO_INT(s.str(), &parsed, 10); + if(parsed != s.length()) throw std::invalid_argument(""); return VAR(val); }catch(std::invalid_argument&){ vm->ValueError("invalid literal for int(): " + s.escape(true)); @@ -284,7 +284,7 @@ inline void init_builtins(VM* _vm) { if(s == "inf") return VAR(INFINITY); if(s == "-inf") return VAR(-INFINITY); try{ - f64 val = S_TO_FLOAT(s); + f64 val = S_TO_FLOAT(s.str()); return VAR(val); }catch(std::invalid_argument&){ vm->ValueError("invalid literal for float(): '" + s + "'"); @@ -327,7 +327,7 @@ inline void init_builtins(VM* _vm) { _vm->bind_method<1>("str", "__contains__", [](VM* vm, Args& args) { const Str& self = CAST(Str&, args[0]); const Str& other = CAST(Str&, args[1]); - return VAR(self.find(other) != Str::npos); + return VAR(self.index(other) != -1); }); _vm->bind_method<0>("str", "__str__", CPP_LAMBDA(args[0])); @@ -361,7 +361,7 @@ inline void init_builtins(VM* _vm) { if(is_type(args[1], vm->tp_slice)){ Slice s = _CAST(Slice, args[1]); s.normalize(self.u8_length()); - return VAR(self.u8_substr(s.start, s.stop)); + return VAR(self.u8_slice(s.start, s.stop)); } int index = CAST(int, args[1]); @@ -382,28 +382,25 @@ inline void init_builtins(VM* _vm) { }); _vm->bind_method<2>("str", "replace", [](VM* vm, Args& args) { - const Str& _self = CAST(Str&, args[0]); - const Str& _old = CAST(Str&, args[1]); - const Str& _new = CAST(Str&, args[2]); - Str _copy = _self; - size_t pos = 0; - while ((pos = _copy.find(_old, pos)) != std::string::npos) { - _copy.replace(pos, _old.length(), _new); - pos += _new.length(); - } - return VAR(_copy); + const Str& self = CAST(Str&, args[0]); + const Str& old = CAST(Str&, args[1]); + const Str& new_ = CAST(Str&, args[2]); + return VAR(self.replace(old, new_)); }); _vm->bind_method<1>("str", "startswith", [](VM* vm, Args& args) { const Str& self = CAST(Str&, args[0]); const Str& prefix = CAST(Str&, args[1]); - return VAR(self.find(prefix) == 0); + return VAR(self.index(prefix) == 0); }); _vm->bind_method<1>("str", "endswith", [](VM* vm, Args& args) { const Str& self = CAST(Str&, args[0]); const Str& suffix = CAST(Str&, args[1]); - return VAR(self.rfind(suffix) == self.length() - suffix.length()); + int offset = self.length() - suffix.length(); + if(offset < 0) return vm->False; + bool ok = memcmp(self.data+offset, suffix.data, suffix.length()) == 0; + return VAR(ok); }); _vm->bind_method<1>("str", "join", [](VM* vm, Args& args) { @@ -664,13 +661,15 @@ struct ReMatch { } }; -inline PyObject* _regex_search(const Str& pattern, const Str& string, bool fromStart, VM* vm){ +inline PyObject* _regex_search(const Str& _pattern, const Str& _string, bool fromStart, VM* vm){ + std::string pattern = _pattern.str(); + std::string string = _string.str(); std::regex re(pattern); std::smatch m; if(std::regex_search(string, m, re)){ if(fromStart && m.position() != 0) return vm->None; - i64 start = string._to_u8_index(m.position()); - i64 end = string._to_u8_index(m.position() + m.length()); + i64 start = _string._u8_index(m.position()); + i64 end = _string._u8_index(m.position() + m.length()); return VAR_T(ReMatch, start, end, m); } return vm->None; @@ -695,14 +694,15 @@ inline void add_module_re(VM* vm){ vm->bind_func<3>(mod, "sub", [](VM* vm, Args& args) { const Str& pattern = CAST(Str&, args[0]); const Str& repl = CAST(Str&, args[1]); - const Str& string = CAST(Str&, args[2]); - std::regex re(pattern); + const Str& _string = CAST(Str&, args[2]); + std::regex re(pattern.str()); + std::string string = _string.str(); return VAR(std::regex_replace(string, re, repl)); }); vm->bind_func<2>(mod, "split", [](VM* vm, Args& args) { - const Str& pattern = CAST(Str&, args[0]); - const Str& string = CAST(Str&, args[1]); + std::string pattern = CAST(Str&, args[0]).str(); + std::string string = CAST(Str&, args[1]).str(); std::regex re(pattern); std::sregex_token_iterator it(string.begin(), string.end(), re, -1); std::sregex_token_iterator end; diff --git a/src/str.h b/src/str.h index 53d71297..af0e1acb 100644 --- a/src/str.h +++ b/src/str.h @@ -1,67 +1,182 @@ #pragma once #include "common.h" +#include "memory.h" namespace pkpy { typedef std::stringstream StrStream; -class Str : public std::string { - mutable std::vector* _u8_index = nullptr; +inline int utf8len(unsigned char c){ + if((c & 0b10000000) == 0) return 1; + if((c & 0b11100000) == 0b11000000) return 2; + if((c & 0b11110000) == 0b11100000) return 3; + if((c & 0b11111000) == 0b11110000) return 4; + if((c & 0b11111100) == 0b11111000) return 5; + if((c & 0b11111110) == 0b11111100) return 6; + return 0; +} - void utf8_lazy_init() const{ - if(_u8_index != nullptr) return; - _u8_index = new std::vector(); - _u8_index->reserve(size()); - if(size() > 65535) throw std::runtime_error("str has more than 65535 bytes."); - for(uint16_t i = 0; i < size(); i++){ - // https://stackoverflow.com/questions/3911536/utf-8-unicode-whats-with-0xc0-and-0x80 - if((at(i) & 0xC0) != 0x80) _u8_index->push_back(i); - } - } -public: +struct Str{ + int size; + bool is_ascii; + char* data; uint16_t _cached_sn_index = 0; - Str() : std::string() {} - Str(const char* s) : std::string(s) {} - Str(const char* s, size_t n) : std::string(s, n) {} - Str(const std::string& s) : std::string(s) {} - Str(const Str& s) : std::string(s) { - if(s._u8_index != nullptr){ - _u8_index = new std::vector(*s._u8_index); + Str(): size(0), is_ascii(true), data((char*)pool64.alloc(0)) {} + + Str(int size, bool is_ascii): size(size), is_ascii(is_ascii) { + data = (char*)pool64.alloc(size); + } + +#define STR_INIT() \ + data = (char*)pool64.alloc(size); \ + for(int i=0; ibegin(), _u8_index->end(), index); - if(p != _u8_index->end() && *p != index) UNREACHABLE(); - return p - _u8_index->begin(); + Str(std::string_view s): size(s.size()), is_ascii(true) { + STR_INIT() } - int u8_length() const { - utf8_lazy_init(); - return _u8_index->size(); + Str(const char* s): size(strlen(s)), is_ascii(true) { + STR_INIT() } - Str u8_getitem(int i) const{ - return u8_substr(i, i+1); + Str(const char* s, int len): size(len), is_ascii(true) { + STR_INIT() } - Str u8_substr(int start, int end) const{ - utf8_lazy_init(); - if(start >= end) return Str(); - int c_end = end >= _u8_index->size() ? size() : _u8_index->at(end); - return substr(_u8_index->at(start), c_end - _u8_index->at(start)); +#undef STR_INIT + + Str(const Str& other): size(other.size), is_ascii(other.is_ascii) { + data = (char*)pool64.alloc(size); + memcpy(data, other.data, size); + } + + Str(Str&& other): size(other.size), is_ascii(other.is_ascii), data(other.data) { + other.data = nullptr; + other.size = 0; + } + + Str& operator=(const Str& other){ + if(data!=nullptr) pool64.dealloc(data); + size = other.size; + is_ascii = other.is_ascii; + data = (char*)pool64.alloc(size); + memcpy(data, other.data, size); + return *this; + } + + Str& operator=(Str&& other) noexcept{ + if(data!=nullptr) pool64.dealloc(data); + size = other.size; + is_ascii = other.is_ascii; + data = other.data; + other.data = nullptr; + return *this; + } + + ~Str(){ + if(data!=nullptr) pool64.dealloc(data); + } + + char operator[](int idx) const { + return data[idx]; + } + + int length() const { + return size; + } + + size_t hash() const{ + return std::hash()(sv()); + } + + Str operator+(const Str& other) const { + Str ret(size + other.size, is_ascii && other.is_ascii); + memcpy(ret.data, data, size); + memcpy(ret.data + size, other.data, other.size); + return ret; + } + + Str operator+(const char* p) const { + Str other(p); + return *this + other; + } + + friend Str operator+(const char* p, const Str& str){ + Str other(p); + return other + str; + } + + friend std::ostream& operator<<(std::ostream& os, const Str& str){ + os.write(str.data, str.size); + return os; + } + + bool operator==(const Str& other) const { + if(size != other.size) return false; + return memcmp(data, other.data, size) == 0; + } + + bool operator!=(const Str& other) const { + if(size != other.size) return true; + return memcmp(data, other.data, size) != 0; + } + + bool operator<(const Str& other) const { + int ret = strncmp(data, other.data, std::min(size, other.size)); + if(ret != 0) return ret < 0; + return size < other.size; + } + + bool operator>(const Str& other) const { + int ret = strncmp(data, other.data, std::min(size, other.size)); + if(ret != 0) return ret > 0; + return size > other.size; + } + + bool operator<=(const Str& other) const { + int ret = strncmp(data, other.data, std::min(size, other.size)); + if(ret != 0) return ret < 0; + return size <= other.size; + } + + bool operator>=(const Str& other) const { + int ret = strncmp(data, other.data, std::min(size, other.size)); + if(ret != 0) return ret > 0; + return size >= other.size; + } + + Str substr(int start, int len) const { + Str ret(len, is_ascii); + memcpy(ret.data, data + start, len); + return ret; + } + + char* c_str_dup() const { + char* p = (char*)malloc(size + 1); + memcpy(p, data, size); + p[size] = 0; + return p; + } + + std::string_view sv() const { + return std::string_view(data, size); + } + + std::string str() const { + return std::string(data, size); } Str lstrip() const { - Str copy(*this); + std::string copy = str(); copy.erase(copy.begin(), std::find_if(copy.begin(), copy.end(), [](char c) { // std::isspace(c) does not working on windows (Debug) return c != ' ' && c != '\t' && c != '\r' && c != '\n'; @@ -69,10 +184,6 @@ public: return Str(copy); } - size_t hash() const { - return std::hash()(*this); - } - Str escape(bool single_quote) const { StrStream ss; ss << (single_quote ? '\'' : '"'); @@ -104,24 +215,60 @@ public: return ss.str(); } - Str& operator=(const Str& s){ - this->std::string::operator=(s); - delete _u8_index; - if(s._u8_index != nullptr){ - _u8_index = new std::vector(*s._u8_index); + int index(const Str& sub) const { + auto p = std::search(data, data + size, sub.data, sub.data + sub.size); + if(p == data + size) return -1; + return p - data; + } + + Str replace(const Str& old, const Str& new_) const { + StrStream ss; + int i = 0; + while(i < size){ + int j = index(old); + if(j == -1){ + ss << substr(i, size - i); + break; + } + ss << substr(i, j - i); + ss << new_; + i = j + old.size; } - return *this; + return ss.str(); } - Str& operator=(Str&& s){ - this->std::string::operator=(std::move(s)); - delete _u8_index; - this->_u8_index = s._u8_index; - s._u8_index = nullptr; - return *this; + /*************unicode*************/ + + int _u8_index(int i) const{ + if(is_ascii) return i; + int j = 0; + while(i > 0){ + j += utf8len(data[j]); + i--; + } + return j; } - ~Str(){ delete _u8_index;} + Str u8_getitem(int i) const{ + i = _u8_index(i); + return substr(i, utf8len(data[i])); + } + + Str u8_slice(int start, int end) const{ + // TODO: optimize this + start = _u8_index(start); + end = _u8_index(end); + return substr(start, end - start); + } + + int u8_length() const { + if(is_ascii) return size; + int ret = 0; + for(int i=0; i> _interned; static std::vector _r_interned; - static StrName get(const Str& s){ - return get(s.c_str()); - } - - static StrName get(const char* s){ + static StrName get(std::string_view s){ auto it = _interned.find(s); if(it != _interned.end()) return StrName(it->second); uint16_t index = (uint16_t)(_r_interned.size() + 1); diff --git a/src/vm.h b/src/vm.h index 7b8439b9..5d2ce1c4 100644 --- a/src/vm.h +++ b/src/vm.h @@ -561,8 +561,8 @@ inline PyObject* VM::new_module(StrName name) { inline Str VM::disassemble(CodeObject_ co){ auto pad = [](const Str& s, const int n){ - if(s.size() >= n) return s.substr(0, n); - return s + std::string(n - s.size(), ' '); + if(s.length() >= n) return s.substr(0, n); + return s + std::string(n - s.length(), ' '); }; std::vector jumpTargets; @@ -591,7 +591,7 @@ inline Str VM::disassemble(CodeObject_ co){ ss << pad(line, 8) << pointer << pad(std::to_string(i), 3); ss << " " << pad(OP_NAMES[byte.op], 20) << " "; // ss << pad(byte.arg == -1 ? "" : std::to_string(byte.arg), 5); - Str argStr = byte.arg == -1 ? "" : std::to_string(byte.arg); + std::string argStr = byte.arg == -1 ? "" : std::to_string(byte.arg); switch(byte.op){ case OP_LOAD_CONST: argStr += " (" + CAST(Str, asRepr(co->consts[byte.arg])) + ")";