From 3787a1da1da485153972d27d5ea9d0e62b079cad Mon Sep 17 00:00:00 2001 From: blueloveTH Date: Mon, 10 Jun 2024 22:38:49 +0800 Subject: [PATCH] init --- CMakeLists.txt | 9 +- build.sh | 4 +- build_g.sh | 7 +- build_web.sh | 5 +- include/pocketpy/common/str.h | 52 +++++ include/pocketpy/common/str.hpp | 229 +++++++++++++++------ include/pocketpy/common/traits.hpp | 7 + include/pocketpy/common/utils.h | 29 +++ include/pocketpy/common/utils.hpp | 36 ---- include/pocketpy/common/vector.h | 55 ++++++ include/pocketpy/interpreter/gc.hpp | 2 +- include/pocketpy/interpreter/vm.hpp | 4 +- include/pocketpy/objects/namedict.hpp | 2 +- include/pocketpy/objects/sourcedata.hpp | 2 +- src/common/any.cpp | 2 +- src/common/str.c | 250 +++++++++++++++++++++++ src/common/str.cpp | 253 +----------------------- src/common/utils.c | 9 + src/common/vector.c | 61 ++++++ src/compiler/lexer.cpp | 3 +- src/interpreter/ceval.cpp | 2 +- src/interpreter/iter.cpp | 2 +- src/modules/io.cpp | 2 +- src/pocketpy.cpp | 11 +- tests/04_str.py | 4 +- 25 files changed, 681 insertions(+), 361 deletions(-) create mode 100644 include/pocketpy/common/str.h create mode 100644 include/pocketpy/common/utils.h delete mode 100644 include/pocketpy/common/utils.hpp create mode 100644 include/pocketpy/common/vector.h create mode 100644 src/common/str.c create mode 100644 src/common/utils.c create mode 100644 src/common/vector.c diff --git a/CMakeLists.txt b/CMakeLists.txt index c0d719cf..db995efa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,23 +2,30 @@ cmake_minimum_required(VERSION 3.10) project(pocketpy) +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /utf-8 /Ox /jumptablerdata /GS-") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8 /Ox /jumptablerdata /GS-") add_compile_options(/wd4267 /wd4244) else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -frtti -O2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2") # disable -Wshorten-64-to-32 for apple if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-shorten-64-to-32") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-shorten-64-to-32") endif() endif() include_directories(${CMAKE_CURRENT_LIST_DIR}/include) -file(GLOB_RECURSE POCKETPY_SRC ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp) +file(GLOB_RECURSE POCKETPY_SRC_CPP ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp) +file(GLOB_RECURSE POCKETPY_SRC_C ${CMAKE_CURRENT_LIST_DIR}/src/*.c) +set(POCKETPY_SRC ${POCKETPY_SRC_CPP} ${POCKETPY_SRC_C}) option(PK_USE_CJSON "" OFF) if(PK_USE_CJSON) diff --git a/build.sh b/build.sh index a474d17f..d1760346 100644 --- a/build.sh +++ b/build.sh @@ -18,7 +18,9 @@ if [ $? -ne 0 ]; then exit 1 fi -SRC=$(find src/ -name "*.cpp") +SRC_C=$(find src/ -name "*.c") +SRC_CPP=$(find src/ -name "*.cpp") +SRC="$SRC_C $SRC_CPP" echo "> Compiling and linking source files... " diff --git a/build_g.sh b/build_g.sh index a634da62..340e471c 100644 --- a/build_g.sh +++ b/build_g.sh @@ -1,7 +1,10 @@ python prebuild.py -SRC=$(find src/ -name "*.cpp") +SRC_C=$(find src/ -name "*.c") +SRC_CPP=$(find src/ -name "*.cpp") +SRC="$SRC_C $SRC_CPP" -FLAGS="-std=c++17 -O0 -stdlib=libc++ -Iinclude -frtti -Wfatal-errors -g -DDEBUG" +FLAGS="-std=c++17 -O0 -stdlib=libc++ -Iinclude -frtti -Wfatal-errors -g -DDEBUG -DPK_ENABLE_OS=1" clang++ $FLAGS -o main src2/main.cpp $SRC + diff --git a/build_web.sh b/build_web.sh index 8695f700..f3006c45 100644 --- a/build_web.sh +++ b/build_web.sh @@ -3,5 +3,8 @@ python prebuild.py rm -rf web/lib mkdir web/lib -SRC=$(find src/ -name "*.cpp") +SRC_C=$(find src/ -name "*.c") +SRC_CPP=$(find src/ -name "*.cpp") +SRC="$SRC_C $SRC_CPP" + em++ $SRC -Iinclude/ -fexceptions -frtti -s -Os -sEXPORTED_FUNCTIONS=_pkpy_new_repl,_pkpy_repl_input,_pkpy_new_vm -sEXPORTED_RUNTIME_METHODS=ccall -o web/lib/pocketpy.js diff --git a/include/pocketpy/common/str.h b/include/pocketpy/common/str.h new file mode 100644 index 00000000..585073c9 --- /dev/null +++ b/include/pocketpy/common/str.h @@ -0,0 +1,52 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct pkpy_Str{ + int size; + bool is_ascii; + bool is_sso; + union{ + char* _ptr; + char _inlined[16]; + }; +} pkpy_Str; + +inline const char* pkpy_Str__data(const pkpy_Str* self){ + return self->is_sso ? self->_inlined : self->_ptr; +} + +inline int pkpy_Str__size(const pkpy_Str* self){ + return self->size; +} + +int pkpy_utils__u8len(unsigned char c, bool suppress); +void pkpy_Str__ctor(pkpy_Str* self, const char* data); +void pkpy_Str__ctor2(pkpy_Str* self, const char* data, int size); +void pkpy_Str__dtor(pkpy_Str* self); +pkpy_Str pkpy_Str__copy(const pkpy_Str* self); +pkpy_Str pkpy_Str__concat(const pkpy_Str* self, const pkpy_Str* other); +pkpy_Str pkpy_Str__concat2(const pkpy_Str* self, const char* other, int size); +pkpy_Str pkpy_Str__substr(const pkpy_Str* self, int start); +pkpy_Str pkpy_Str__substr2(const pkpy_Str* self, int start, int size); +pkpy_Str pkpy_Str__lower(const pkpy_Str* self); +pkpy_Str pkpy_Str__upper(const pkpy_Str* self); +pkpy_Str pkpy_Str__replace(const pkpy_Str* self, char old, char new_); +pkpy_Str pkpy_Str__replace2(const pkpy_Str* self, const pkpy_Str* old, const pkpy_Str* new_); +pkpy_Str pkpy_Str__u8_getitem(const pkpy_Str* self, int i); +pkpy_Str pkpy_Str__u8_slice(const pkpy_Str* self, int start, int stop, int step); +int pkpy_Str__u8_length(const pkpy_Str* self); +int pkpy_Str__cmp(const pkpy_Str* self, const pkpy_Str* other); +int pkpy_Str__cmp2(const pkpy_Str* self, const char* other, int size); +int pkpy_Str__unicode_index_to_byte(const pkpy_Str* self, int i); +int pkpy_Str__byte_index_to_unicode(const pkpy_Str* self, int n); +int pkpy_Str__index(const pkpy_Str* self, const pkpy_Str* sub, int start); +int pkpy_Str__count(const pkpy_Str* self, const pkpy_Str* sub); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/include/pocketpy/common/str.hpp b/include/pocketpy/common/str.hpp index e4788596..4d31ddc9 100644 --- a/include/pocketpy/common/str.hpp +++ b/include/pocketpy/common/str.hpp @@ -1,80 +1,162 @@ #pragma once -#include "pocketpy/common/utils.hpp" +#include "pocketpy/common/utils.h" #include "pocketpy/common/memorypool.hpp" #include "pocketpy/common/vector.hpp" +#include "pocketpy/common/str.h" #include +#include namespace pkpy { -int utf8len(unsigned char c, bool suppress = false); struct SStream; -struct Str { - int size; - bool is_ascii; - char* data; - char _inlined[16]; +struct Str: pkpy_Str { + bool is_inlined() const { return is_sso; } - bool is_inlined() const { return data == _inlined; } + Str(){ + pkpy_Str__ctor2(this, "", 0); + } + + Str(pkpy_Str&& s){ + std::memcpy(this, &s, sizeof(pkpy_Str)); + } + + Str(const std::string& s){ + pkpy_Str__ctor2(this, s.data(), s.size()); + } + + Str(std::string_view s){ + pkpy_Str__ctor2(this, s.data(), s.size()); + } + + Str(const char* s){ + pkpy_Str__ctor2(this, s, strlen(s)); + } + + Str(const char* s, int len){ + pkpy_Str__ctor2(this, s, len); + } - Str(); - Str(int size, bool is_ascii); - Str(const std::string& s); - Str(std::string_view s); - Str(const char* s); - Str(const char* s, int len); Str(pair); // take ownership - Str(const Str& other); - Str(Str&& other); + + Str(const Str& other){ + pkpy_Str__ctor2(this, pkpy_Str__data(&other), other.size); + } + + Str(Str&& other){ + std::memcpy(this, &other, sizeof(pkpy_Str)); + other.size = 0; + other.is_sso = true; + } operator std::string_view () const { return sv(); } - - const char* begin() const { return data; } - - const char* end() const { return data + size; } - - char operator[] (int idx) const { return data[idx]; } - + const char* begin() const { return pkpy_Str__data(this); } + const char* end() const { return pkpy_Str__data(this) + size; } int length() const { return size; } - + char operator[] (int idx) const { return pkpy_Str__data(this)[idx]; } bool empty() const { return size == 0; } - size_t hash() const { return std::hash()(sv()); } - Str& operator= (const Str&); - Str operator+ (const Str&) const; - Str operator+ (const char*) const; - friend Str operator+ (const char*, const Str&); + Str& operator= (const Str& other){ + pkpy_Str__dtor(this); + pkpy_Str__ctor2(this, pkpy_Str__data(&other), other.size); + return *this; + } - bool operator== (const std::string_view other) const; - bool operator!= (const std::string_view other) const; - bool operator< (const std::string_view other) const; - friend bool operator< (const std::string_view other, const Str& str); + Str operator+ (const Str& other) const{ + return pkpy_Str__concat(this, &other); + } - bool operator== (const char* p) const; - bool operator!= (const char* p) const; + Str operator+ (const char* other) const{ + return pkpy_Str__concat2(this, other, strlen(other)); + } - bool operator== (const Str& other) const; - bool operator!= (const Str& other) const; - bool operator< (const Str& other) const; - bool operator> (const Str& other) const; - bool operator<= (const Str& other) const; - bool operator>= (const Str& other) const; + friend Str operator+ (const char* self, const Str& other){ + pkpy_Str tmp; + pkpy_Str__ctor2(&tmp, self, strlen(self)); + pkpy_Str retval = pkpy_Str__concat(&tmp, &other); + pkpy_Str__dtor(&tmp); + return retval; + } - ~Str(); + bool operator== (const std::string_view other) const{ + int res = pkpy_Str__cmp2(this, other.data(), other.size()); + return res == 0; + } - friend std::ostream& operator<< (std::ostream& os, const Str& str); + bool operator!= (const std::string_view other) const{ + int res = pkpy_Str__cmp2(this, other.data(), other.size()); + return res != 0; + } - const char* c_str() const { return data; } + bool operator< (const std::string_view other) const{ + int res = pkpy_Str__cmp2(this, other.data(), other.size()); + return res < 0; + } - std::string_view sv() const { return std::string_view(data, size); } + friend bool operator< (const std::string_view other, const Str& str){ + int res = pkpy_Str__cmp2(&str, other.data(), other.size()); + return res > 0; + } - std::string str() const { return std::string(data, size); } + bool operator== (const char* p) const{ + int res = pkpy_Str__cmp2(this, p, strlen(p)); + return res == 0; + } + + bool operator!= (const char* p) const{ + int res = pkpy_Str__cmp2(this, p, strlen(p)); + return res != 0; + } + + bool operator== (const Str& other) const{ + return pkpy_Str__cmp(this, &other) == 0; + } + bool operator!= (const Str& other) const{ + return pkpy_Str__cmp(this, &other) != 0; + } + bool operator< (const Str& other) const{ + return pkpy_Str__cmp(this, &other) < 0; + } + bool operator> (const Str& other) const{ + return pkpy_Str__cmp(this, &other) > 0; + } + bool operator<= (const Str& other) const{ + return pkpy_Str__cmp(this, &other) <= 0; + } + bool operator>= (const Str& other) const{ + return pkpy_Str__cmp(this, &other) >= 0; + } + + ~Str(){ + pkpy_Str__dtor(this); + } + + friend std::ostream& operator<< (std::ostream& os, const Str& self){ + os.write(pkpy_Str__data(&self), self.size); + return os; + } + + const char* c_str() const { return pkpy_Str__data(this); } + + std::string_view sv() const { + return std::string_view(pkpy_Str__data(this), size); + } + + std::string str() const { + return std::string(pkpy_Str__data(this), size); + } + + Str substr(int start, int size) const{ + return pkpy_Str__substr2(this, start, size); + } + + Str substr(int start) const{ + return pkpy_Str__substr(this, start); + } - Str substr(int start, int len) const; - Str substr(int start) const; Str strip(bool left, bool right, const Str& chars) const; Str strip(bool left = true, bool right = true) const; @@ -82,23 +164,52 @@ struct Str { Str rstrip() const { return strip(false, true); } - Str lower() const; - Str upper() const; + Str lower() const{ + return pkpy_Str__lower(this); + } + Str upper() const{ + return pkpy_Str__upper(this); + } + Str replace(char old, char new_) const{ + return pkpy_Str__replace(this, old, new_); + } + Str replace(const Str& old, const Str& new_) const{ + return pkpy_Str__replace2(this, &old, &new_); + } + Str escape(bool single_quote = true) const; void escape_(SStream& ss, bool single_quote = true) const; - int index(const Str& sub, int start = 0) const; - Str replace(char old, char new_) const; - Str replace(const Str& old, const Str& new_, int count = -1) const; vector split(const Str& sep) const; vector split(char sep) const; - int count(const Str& sub) const; + + int index(const Str& sub, int start = 0) const{ + return pkpy_Str__index(this, &sub, start); + } + + int count(const Str& sub) const{ + return pkpy_Str__count(this, &sub); + } /*************unicode*************/ - int _unicode_index_to_byte(int i) const; - int _byte_index_to_unicode(int n) const; - Str u8_getitem(int i) const; - Str u8_slice(int start, int stop, int step) const; - int u8_length() const; + int _unicode_index_to_byte(int i) const{ + return pkpy_Str__unicode_index_to_byte(this, i); + } + + int _byte_index_to_unicode(int n) const{ + return pkpy_Str__byte_index_to_unicode(this, n); + } + + Str u8_getitem(int i) const{ + return pkpy_Str__u8_getitem(this, i); + } + + Str u8_slice(int start, int stop, int step) const{ + return pkpy_Str__u8_slice(this, start, stop, step); + } + + int u8_length() const{ + return pkpy_Str__u8_length(this); + } }; struct StrName { diff --git a/include/pocketpy/common/traits.hpp b/include/pocketpy/common/traits.hpp index 35e90220..4aa1c789 100644 --- a/include/pocketpy/common/traits.hpp +++ b/include/pocketpy/common/traits.hpp @@ -37,4 +37,11 @@ struct has_gc_marker> : std::true_type {} template constexpr inline int py_sizeof = 16 + sizeof(T); + +#define PK_ALWAYS_PASS_BY_POINTER(T) \ + T(const T&) = delete; \ + T& operator= (const T&) = delete; \ + T(T&&) = delete; \ + T& operator= (T&&) = delete; + } // namespace pkpy diff --git a/include/pocketpy/common/utils.h b/include/pocketpy/common/utils.h new file mode 100644 index 00000000..52d4bfc2 --- /dev/null +++ b/include/pocketpy/common/utils.h @@ -0,0 +1,29 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#define PK_REGION(name) 1 + +#define PK_SLICE_LOOP(i, start, stop, step) for(int i = start; step > 0 ? i < stop : i > stop; i += step) + +// global constants +#define PK_HEX_TABLE "0123456789abcdef" + +extern const char* kPlatformStrings[]; + +#ifdef _MSC_VER +#define PK_UNREACHABLE() __assume(0); +#else +#define PK_UNREACHABLE() __builtin_unreachable(); +#endif + +#define PK_FATAL_ERROR(...) { fprintf(stderr, __VA_ARGS__); abort(); } + +#define PK_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define PK_MAX(a, b) ((a) > (b) ? (a) : (b)) + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/include/pocketpy/common/utils.hpp b/include/pocketpy/common/utils.hpp deleted file mode 100644 index abf53c0e..00000000 --- a/include/pocketpy/common/utils.hpp +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#define PK_REGION(name) 1 - -#define PK_ALWAYS_PASS_BY_POINTER(T) \ - T(const T&) = delete; \ - T& operator= (const T&) = delete; \ - T(T&&) = delete; \ - T& operator= (T&&) = delete; - -#define PK_SLICE_LOOP(i, start, stop, step) for(int i = start; step > 0 ? i < stop : i > stop; i += step) - -namespace pkpy { - -// global constants -const inline char* PK_HEX_TABLE = "0123456789abcdef"; - -const inline char* kPlatformStrings[] = { - "win32", // 0 - "emscripten", // 1 - "ios", // 2 - "darwin", // 3 - "android", // 4 - "linux", // 5 - "unknown" // 6 -}; - -#ifdef _MSC_VER -#define PK_UNREACHABLE() __assume(0); -#else -#define PK_UNREACHABLE() __builtin_unreachable(); -#endif - -#define PK_FATAL_ERROR(...) { fprintf(stderr, __VA_ARGS__); std::abort(); } - -} // namespace pkpy diff --git a/include/pocketpy/common/vector.h b/include/pocketpy/common/vector.h new file mode 100644 index 00000000..6c9c33d1 --- /dev/null +++ b/include/pocketpy/common/vector.h @@ -0,0 +1,55 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct c11_array{ + void* data; + int count; + int elem_size; +} c11_array; + +void c11_array__ctor(c11_array* self, int elem_size, int count); +void c11_array__dtor(c11_array* self); +c11_array c11_array__copy(const c11_array* self); +void* c11_array__at(c11_array* self, int index); + +typedef struct c11_vector{ + void* data; + int count; + int capacity; + int elem_size; +} c11_vector; + +void c11_vector__ctor(c11_vector* self, int elem_size); +void c11_vector__dtor(c11_vector* self); +c11_vector c11_vector__copy(const c11_vector* self); +void* c11_vector__at(c11_vector* self, int index); +void c11_vector__reserve(c11_vector* self, int capacity); + +#define c11__getitem(T, self, index) ((T*)(self)->data)[index] +#define c11__setitem(T, self, index, value) ((T*)(self)->data)[index] = value; + +#define c11_vector__push_back(T, self, elem) \ + do{ \ + if((self)->count == (self)->capacity) c11_vector__reserve((self), (self)->capacity*2); \ + ((T*)(self)->data)[(self)->count] = (elem); \ + (self)->count++; \ + }while(0) + +#define c11_vector__pop_back(T, self) \ + do{ \ + (self)->count--; \ + }while(0) + +#define c11_vector__extend(T, self, p, size) \ + do{ \ + c11_vector__reserve((self), (self)->count + (size)); \ + memcpy((T*)(self)->data + (self)->count, (p), (size) * sizeof(T)); \ + (self)->count += (size); \ + }while(0) + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/include/pocketpy/interpreter/gc.hpp b/include/pocketpy/interpreter/gc.hpp index 5c1060db..7cf64b2b 100644 --- a/include/pocketpy/interpreter/gc.hpp +++ b/include/pocketpy/interpreter/gc.hpp @@ -2,7 +2,7 @@ #include "pocketpy/common/config.h" #include "pocketpy/common/vector.hpp" -#include "pocketpy/common/utils.hpp" +#include "pocketpy/common/utils.h" #include "pocketpy/objects/object.hpp" #include "pocketpy/objects/namedict.hpp" diff --git a/include/pocketpy/interpreter/vm.hpp b/include/pocketpy/interpreter/vm.hpp index 48ed5f48..1c2ec125 100644 --- a/include/pocketpy/interpreter/vm.hpp +++ b/include/pocketpy/interpreter/vm.hpp @@ -319,8 +319,8 @@ public: #endif #if PK_REGION("Logging Methods") - virtual void stdout_write(const Str& s){ _stdout(s.data, s.size); } - virtual void stderr_write(const Str& s){ _stderr(s.data, s.size); } + virtual void stdout_write(const Str& s){ _stdout(s.c_str(), s.size); } + virtual void stderr_write(const Str& s){ _stderr(s.c_str(), s.size); } #endif #if PK_REGION("Magic Bindings") diff --git a/include/pocketpy/objects/namedict.hpp b/include/pocketpy/objects/namedict.hpp index 25922687..d396e35b 100644 --- a/include/pocketpy/objects/namedict.hpp +++ b/include/pocketpy/objects/namedict.hpp @@ -2,7 +2,7 @@ #include "pocketpy/common/config.h" #include "pocketpy/common/str.hpp" -#include "pocketpy/common/utils.hpp" +#include "pocketpy/common/utils.h" #include "pocketpy/objects/object.hpp" namespace pkpy { diff --git a/include/pocketpy/objects/sourcedata.hpp b/include/pocketpy/objects/sourcedata.hpp index 4fb96d98..07ff384e 100644 --- a/include/pocketpy/objects/sourcedata.hpp +++ b/include/pocketpy/objects/sourcedata.hpp @@ -1,6 +1,6 @@ #pragma once -#include "pocketpy/common/utils.hpp" +#include "pocketpy/common/utils.h" #include "pocketpy/common/str.hpp" namespace pkpy { diff --git a/src/common/any.cpp b/src/common/any.cpp index b2274afa..6ec3636d 100644 --- a/src/common/any.cpp +++ b/src/common/any.cpp @@ -1,5 +1,5 @@ #include "pocketpy/common/any.hpp" -#include "pocketpy/common/utils.hpp" +#include "pocketpy/common/utils.h" #include diff --git a/src/common/str.c b/src/common/str.c new file mode 100644 index 00000000..1e5bbade --- /dev/null +++ b/src/common/str.c @@ -0,0 +1,250 @@ +#include "pocketpy/common/str.h" +#include "pocketpy/common/vector.h" +#include "pocketpy/common/utils.h" + +#include +#include +#include +#include +#include + +int pkpy_utils__u8len(unsigned char c, bool suppress) { + if((c & 0b10000000) == 0) return 1; + if((c & 0b11100000) == 0b11000000) return 2; + if((c & 0b11110000) == 0b11100000) return 3; + if((c & 0b11111000) == 0b11110000) return 4; + if((c & 0b11111100) == 0b11111000) return 5; + if((c & 0b11111110) == 0b11111100) return 6; + if(!suppress) PK_FATAL_ERROR("invalid utf8 char\n") + return 0; +} + +void pkpy_Str__ctor(pkpy_Str *self, const char *data){ + pkpy_Str__ctor2(self, data, strlen(data)); +} + +void pkpy_Str__ctor2(pkpy_Str *self, const char *data, int size){ + self->size = size; + self->is_ascii = true; + self->is_sso = size < sizeof(self->_inlined); + char* p; + if(self->is_sso){ + p = self->_inlined; + }else{ + self->_ptr = (char*)malloc(size + 1); + p = self->_ptr; + } + memcpy(p, data, size); + p[size] = '\0'; + // check is_ascii + for(int i = 0; i < size; i++){ + if(!isascii(p[i])){ + self->is_ascii = false; + break; + } + } +} + +void pkpy_Str__dtor(pkpy_Str *self){ + if(!self->is_sso){ + free(self->_ptr); + self->is_sso = true; + self->size = 0; + } +} + +pkpy_Str pkpy_Str__copy(const pkpy_Str *self){ + pkpy_Str retval = *self; + if(!self->is_sso){ + retval._ptr = (char*)malloc(self->size + 1); + memcpy(retval._ptr, self->_ptr, self->size + 1); + retval._ptr[retval.size] = '\0'; + } + return retval; +} + +pkpy_Str pkpy_Str__concat(const pkpy_Str *self, const pkpy_Str *other){ + pkpy_Str retval = { + .size = self->size + other->size, + .is_ascii = self->is_ascii && other->is_ascii, + .is_sso = self->size + other->size < sizeof(retval._inlined), + }; + char* p; + if(retval.is_sso){ + p = retval._inlined; + }else{ + retval._ptr = (char*)malloc(retval.size + 1); + p = retval._ptr; + } + memcpy(p, pkpy_Str__data(self), self->size); + memcpy(p + self->size, pkpy_Str__data(other), other->size); + p[retval.size] = '\0'; + return retval; +} + +pkpy_Str pkpy_Str__concat2(const pkpy_Str *self, const char *other, int size){ + pkpy_Str tmp; + pkpy_Str__ctor2(&tmp, other, size); + pkpy_Str retval = pkpy_Str__concat(self, &tmp); + pkpy_Str__dtor(&tmp); + return retval; +} + +pkpy_Str pkpy_Str__substr(const pkpy_Str *self, int start){ + return pkpy_Str__substr2(self, start, self->size - start); +} + +pkpy_Str pkpy_Str__substr2(const pkpy_Str *self, int start, int size){ + pkpy_Str retval; + pkpy_Str__ctor2(&retval, pkpy_Str__data(self) + start, size); + return retval; +} + +pkpy_Str pkpy_Str__lower(const pkpy_Str *self){ + pkpy_Str retval = pkpy_Str__copy(self); + char* p = (char*)pkpy_Str__data(&retval); + for(int i = 0; i < retval.size; i++){ + if('A' <= p[i] && p[i] <= 'Z') p[i] += 32; + } + return retval; +} + +pkpy_Str pkpy_Str__upper(const pkpy_Str *self){ + pkpy_Str retval = pkpy_Str__copy(self); + char* p = (char*)pkpy_Str__data(&retval); + for(int i = 0; i < retval.size; i++){ + if('a' <= p[i] && p[i] <= 'z') p[i] -= 32; + } + return retval; +} + +pkpy_Str pkpy_Str__replace(const pkpy_Str *self, char old, char new_){ + pkpy_Str retval = pkpy_Str__copy(self); + char* p = (char*)pkpy_Str__data(&retval); + for(int i = 0; i < retval.size; i++){ + if(p[i] == old) p[i] = new_; + } + return retval; +} + +pkpy_Str pkpy_Str__replace2(const pkpy_Str *self, const pkpy_Str *old, const pkpy_Str *new_){ + c11_vector buffer; + c11_vector__ctor(&buffer, sizeof(char)); + int start = 0; + while(true) { + int i = pkpy_Str__index(self, old, start); + if(i == -1) break; + pkpy_Str tmp = pkpy_Str__substr2(self, start, i - start); + c11_vector__extend(char, &buffer, pkpy_Str__data(&tmp), tmp.size); + pkpy_Str__dtor(&tmp); + c11_vector__extend(char, &buffer, pkpy_Str__data(new_), new_->size); + start = i + old->size; + } + pkpy_Str tmp = pkpy_Str__substr2(self, start, self->size - start); + c11_vector__extend(char, &buffer, pkpy_Str__data(&tmp), tmp.size); + pkpy_Str__dtor(&tmp); + pkpy_Str retval = { + .size = buffer.count, + .is_ascii = self->is_ascii && old->is_ascii && new_->is_ascii, + .is_sso = false, + ._ptr = (char*)buffer.data, + }; + return retval; +} + +int pkpy_Str__cmp(const pkpy_Str *self, const pkpy_Str *other){ + return pkpy_Str__cmp2(self, pkpy_Str__data(other), other->size); +} + +int pkpy_Str__cmp2(const pkpy_Str *self, const char *other, int size){ + int res = strncmp(pkpy_Str__data(self), other, PK_MIN(self->size, size)); + if(res != 0) return res; + return self->size - size; +} + +pkpy_Str pkpy_Str__u8_getitem(const pkpy_Str *self, int i){ + i = pkpy_Str__unicode_index_to_byte(self, i); + return pkpy_Str__substr2( + self, i, + pkpy_utils__u8len(pkpy_Str__data(self)[i], false) + ); +} + +pkpy_Str pkpy_Str__u8_slice(const pkpy_Str *self, int start, int stop, int step){ + c11_vector buffer; + c11_vector__ctor(&buffer, sizeof(char)); + assert(step != 0); + if(self->is_ascii){ + const char* p = pkpy_Str__data(self); + for (int i=start; step>0 ? istop; i+=step) { + c11_vector__push_back(char, &buffer, p[i]); + } + }else{ + for (int i=start; step>0 ? istop; i+=step) { + pkpy_Str unicode = pkpy_Str__u8_getitem(self, i); + const char* p = pkpy_Str__data(&unicode); + for(int j = 0; j < unicode.size; j++){ + c11_vector__push_back(char, &buffer, p[j]); + } + pkpy_Str__dtor(&unicode); + } + } + pkpy_Str retval = { + .size = buffer.count, + .is_ascii = self->is_ascii, + .is_sso = false, + ._ptr = (char*)buffer.data, + }; + return retval; +} + +int pkpy_Str__u8_length(const pkpy_Str *self){ + return pkpy_Str__byte_index_to_unicode(self, self->size); +} + +int pkpy_Str__unicode_index_to_byte(const pkpy_Str* self, int i) { + if(self->is_ascii) return i; + const char* p = pkpy_Str__data(self); + int j = 0; + while(i > 0) { + j += pkpy_utils__u8len(p[j], false); + i--; + } + return j; +} + +int pkpy_Str__byte_index_to_unicode(const pkpy_Str* self, int n) { + if(self->is_ascii) return n; + const char* p = pkpy_Str__data(self); + int cnt = 0; + for(int i = 0; i < n; i++) { + if((p[i] & 0xC0) != 0x80) cnt++; + } + return cnt; +} + +int pkpy_Str__index(const pkpy_Str *self, const pkpy_Str *sub, int start){ + if(sub->size == 0) return start; + int max_end = self->size - sub->size; + const char* self_data = pkpy_Str__data(self); + const char* sub_data = pkpy_Str__data(sub); + for(int i=start; i<=max_end; i++){ + int res = memcmp(self_data + i, sub_data, sub->size); + if(res == 0) return i; + } + return -1; +} + +int pkpy_Str__count(const pkpy_Str *self, const pkpy_Str *sub){ + if(sub->size == 0) return self->size + 1; + int cnt = 0; + int start = 0; + while(true) { + int i = pkpy_Str__index(self, sub, start); + if(i == -1) break; + cnt++; + start = i + sub->size; + } + return cnt; +} + diff --git a/src/common/str.cpp b/src/common/str.cpp index fed8827a..564e5efb 100644 --- a/src/common/str.cpp +++ b/src/common/str.cpp @@ -9,159 +9,20 @@ namespace pkpy { -int utf8len(unsigned char c, bool suppress) { - if((c & 0b10000000) == 0) return 1; - if((c & 0b11100000) == 0b11000000) return 2; - if((c & 0b11110000) == 0b11100000) return 3; - if((c & 0b11111000) == 0b11110000) return 4; - if((c & 0b11111100) == 0b11111000) return 5; - if((c & 0b11111110) == 0b11111100) return 6; - if(!suppress) PK_FATAL_ERROR("invalid utf8 char\n") - return 0; -} - -#define PK_STR_ALLOCATE() \ - if(this->size < (int)sizeof(this->_inlined)) { \ - this->data = this->_inlined; \ - } else { \ - this->data = (char*)std::malloc(this->size + 1); \ - } - -#define PK_STR_COPY_INIT(__s) \ - for(int i = 0; i < this->size; i++) { \ - this->data[i] = __s[i]; \ - if(!isascii(__s[i])) is_ascii = false; \ - } \ - this->data[this->size] = '\0'; - -Str::Str() : size(0), is_ascii(true), data(_inlined) { _inlined[0] = '\0'; } - -Str::Str(int size, bool is_ascii) : - size(size), is_ascii(is_ascii){PK_STR_ALLOCATE()} - - Str::Str(const std::string& s) : - size(s.size()), is_ascii(true){PK_STR_ALLOCATE() PK_STR_COPY_INIT(s)} - - Str::Str(std::string_view s) : - size(s.size()), is_ascii(true){PK_STR_ALLOCATE() PK_STR_COPY_INIT(s)} - - Str::Str(const char* s) : - size(strlen(s)), is_ascii(true){PK_STR_ALLOCATE() PK_STR_COPY_INIT(s)} - - Str::Str(const char* s, int len) : - size(len), is_ascii(true){PK_STR_ALLOCATE() PK_STR_COPY_INIT(s)} - - Str::Str(pair detached) : size(detached.second), is_ascii(true) { - this->data = detached.first; +Str::Str(pair detached) { + this->size = detached.second; + this->is_ascii = true; + this->is_sso = false; + this->_ptr = detached.first; for(int i = 0; i < size; i++) { - if(!isascii(data[i])) { + if(!isascii(_ptr[i])) { is_ascii = false; break; } } - assert(data[size] == '\0'); + assert(_ptr[size] == '\0'); } -Str::Str(const Str& other) : size(other.size), is_ascii(other.is_ascii) { - PK_STR_ALLOCATE() - std::memcpy(data, other.data, size); - data[size] = '\0'; -} - -Str::Str(Str&& other) : size(other.size), is_ascii(other.is_ascii) { - if(other.is_inlined()) { - data = _inlined; - for(int i = 0; i < size; i++) - _inlined[i] = other._inlined[i]; - data[size] = '\0'; - } else { - data = other.data; - // zero out `other` - other.data = other._inlined; - other.data[0] = '\0'; - other.size = 0; - } -} - -Str operator+ (const char* p, const Str& str) { - Str other(p); - return other + str; -} - -std::ostream& operator<< (std::ostream& os, const Str& str) { return os << str.sv(); } - -bool operator< (const std::string_view other, const Str& str) { return other < str.sv(); } - -Str& Str::operator= (const Str& other) { - if(!is_inlined()) std::free(data); - size = other.size; - is_ascii = other.is_ascii; - PK_STR_ALLOCATE() - std::memcpy(data, other.data, size); - data[size] = '\0'; - return *this; -} - -Str Str::operator+ (const Str& other) const { - Str ret(size + other.size, is_ascii && other.is_ascii); - std::memcpy(ret.data, data, size); - std::memcpy(ret.data + size, other.data, other.size); - ret.data[ret.size] = '\0'; - return ret; -} - -Str Str::operator+ (const char* p) const { - Str other(p); - return *this + other; -} - -bool Str::operator== (const Str& other) const { - if(size != other.size) return false; - return memcmp(data, other.data, size) == 0; -} - -bool Str::operator!= (const Str& other) const { - if(size != other.size) return true; - return memcmp(data, other.data, size) != 0; -} - -bool Str::operator== (const std::string_view other) const { - if(size != (int)other.size()) return false; - return memcmp(data, other.data(), size) == 0; -} - -bool Str::operator!= (const std::string_view other) const { - if(size != (int)other.size()) return true; - return memcmp(data, other.data(), size) != 0; -} - -bool Str::operator== (const char* p) const { return *this == std::string_view(p); } - -bool Str::operator!= (const char* p) const { return *this != std::string_view(p); } - -bool Str::operator< (const Str& other) const { return this->sv() < other.sv(); } - -bool Str::operator< (const std::string_view other) const { return this->sv() < other; } - -bool Str::operator> (const Str& other) const { return this->sv() > other.sv(); } - -bool Str::operator<= (const Str& other) const { return this->sv() <= other.sv(); } - -bool Str::operator>= (const Str& other) const { return this->sv() >= other.sv(); } - -Str::~Str() { - if(!is_inlined()) std::free(data); -} - -Str Str::substr(int start, int len) const { - Str ret(len, is_ascii); - std::memcpy(ret.data, data + start, len); - ret.data[len] = '\0'; - return ret; -} - -Str Str::substr(int start) const { return substr(start, size - start); } - Str Str::strip(bool left, bool right, const Str& chars) const { int L = 0; int R = u8_length(); @@ -177,6 +38,7 @@ Str Str::strip(bool left, bool right, const Str& chars) const { } Str Str::strip(bool left, bool right) const { + const char* data = pkpy_Str__data(this); if(is_ascii) { int L = 0; int R = size; @@ -194,24 +56,6 @@ Str Str::strip(bool left, bool right) const { } } -Str Str::lower() const { - std::string copy(data, size); - std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c) { - if('A' <= c && c <= 'Z') return c + ('a' - 'A'); - return (int)c; - }); - return Str(copy); -} - -Str Str::upper() const { - std::string copy(data, size); - std::transform(copy.begin(), copy.end(), copy.begin(), [](unsigned char c) { - if('a' <= c && c <= 'z') return c - ('a' - 'A'); - return (int)c; - }); - return Str(copy); -} - Str Str::escape(bool single_quote) const { SStream ss; escape_(ss, single_quote); @@ -220,7 +64,7 @@ Str Str::escape(bool single_quote) const { void Str::escape_(SStream& ss, bool single_quote) const { ss << (single_quote ? '\'' : '"'); - for(int i = 0; i < length(); i++) { + for(int i = 0; i < size; i++) { char c = this->operator[] (i); switch(c) { case '"': @@ -249,71 +93,6 @@ void Str::escape_(SStream& ss, bool single_quote) const { ss << (single_quote ? '\'' : '"'); } -int Str::index(const Str& sub, int start) const { - auto p = std::search(data + start, data + size, sub.data, sub.data + sub.size); - if(p == data + size) return -1; - return p - data; -} - -Str Str::replace(char old, char new_) const { - Str copied = *this; - for(int i = 0; i < copied.size; i++) { - if(copied.data[i] == old) copied.data[i] = new_; - } - return copied; -} - -Str Str::replace(const Str& old, const Str& new_, int count) const { - SStream ss; - int start = 0; - while(true) { - int i = index(old, start); - if(i == -1) break; - ss << substr(start, i - start); - ss << new_; - start = i + old.size; - if(count != -1 && --count == 0) break; - } - ss << substr(start, size - start); - return ss.str(); -} - -int Str::_unicode_index_to_byte(int i) const { - if(is_ascii) return i; - int j = 0; - while(i > 0) { - j += utf8len(data[j]); - i--; - } - return j; -} - -int Str::_byte_index_to_unicode(int n) const { - if(is_ascii) return n; - int cnt = 0; - for(int i = 0; i < n; i++) { - if((data[i] & 0xC0) != 0x80) cnt++; - } - return cnt; -} - -Str Str::u8_getitem(int i) const { - i = _unicode_index_to_byte(i); - return substr(i, utf8len(data[i])); -} - -Str Str::u8_slice(int start, int stop, int step) const { - SStream ss; - if(is_ascii) { - PK_SLICE_LOOP(i, start, stop, step) ss << data[i]; - } else { - PK_SLICE_LOOP(i, start, stop, step) ss << u8_getitem(i); - } - return ss.str(); -} - -int Str::u8_length() const { return _byte_index_to_unicode(size); } - vector Str::split(const Str& sep) const { vector result; std::string_view tmp; @@ -332,6 +111,7 @@ vector Str::split(const Str& sep) const { vector Str::split(char sep) const { vector result; + const char* data = pkpy_Str__data(this); int i = 0; for(int j = 0; j < size; j++) { if(data[j] == sep) { @@ -344,19 +124,6 @@ vector Str::split(char sep) const { return result; } -int Str::count(const Str& sub) const { - if(sub.empty()) return size + 1; - int cnt = 0; - int start = 0; - while(true) { - int i = index(sub, start); - if(i == -1) break; - cnt++; - start = i + sub.size; - } - return cnt; -} - static std::map& _interned() { static std::map interned; return interned; diff --git a/src/common/utils.c b/src/common/utils.c new file mode 100644 index 00000000..d3032366 --- /dev/null +++ b/src/common/utils.c @@ -0,0 +1,9 @@ +const char* kPlatformStrings[] = { + "win32", // 0 + "emscripten", // 1 + "ios", // 2 + "darwin", // 3 + "android", // 4 + "linux", // 5 + "unknown" // 6 +}; \ No newline at end of file diff --git a/src/common/vector.c b/src/common/vector.c new file mode 100644 index 00000000..46d83ea5 --- /dev/null +++ b/src/common/vector.c @@ -0,0 +1,61 @@ +#include "pocketpy/common/vector.h" + +#include +#include + +void c11_array__ctor(c11_array* self, int elem_size, int count){ + self->data = malloc(elem_size * count); + self->count = count; + self->elem_size = elem_size; +} + +void c11_array__dtor(c11_array* self){ + free(self->data); + self->data = NULL; + self->count = 0; +} + +c11_array c11_array__copy(const c11_array* self){ + c11_array retval; + c11_array__ctor(&retval, self->elem_size, self->count); + memcpy(retval.data, self->data, self->elem_size * self->count); + return retval; +} + +void* c11_array__at(c11_array* self, int index){ + return (char*)self->data + self->elem_size * index; +} + +void c11_vector__ctor(c11_vector* self, int elem_size){ + self->data = NULL; + self->count = 0; + self->capacity = 0; + self->elem_size = elem_size; +} + +void c11_vector__dtor(c11_vector* self){ + if(self->data) free(self->data); + self->data = NULL; + self->count = 0; + self->capacity = 0; +} + +c11_vector c11_vector__copy(const c11_vector* self){ + c11_vector retval; + c11_vector__ctor(&retval, self->elem_size); + c11_vector__reserve(&retval, self->capacity); + memcpy(retval.data, self->data, self->elem_size * self->count); + retval.count = self->count; + return retval; +} + +void* c11_vector__at(c11_vector* self, int index){ + return (char*)self->data + self->elem_size * index; +} + +void c11_vector__reserve(c11_vector* self, int capacity){ + if(capacity < 4) capacity = 4; + if(capacity <= self->capacity) return; + self->capacity = capacity; + self->data = realloc(self->data, self->elem_size * self->capacity); +} diff --git a/src/compiler/lexer.cpp b/src/compiler/lexer.cpp index f303a7ba..13226239 100644 --- a/src/compiler/lexer.cpp +++ b/src/compiler/lexer.cpp @@ -1,6 +1,7 @@ #include "pocketpy/compiler/lexer.hpp" #include "pocketpy/common/gil.hpp" #include "pocketpy/common/version.h" +#include "pocketpy/common/str.h" #include @@ -107,7 +108,7 @@ Error* Lexer::eat_name() noexcept{ curr_char--; while(true) { unsigned char c = peekchar(); - int u8bytes = utf8len(c, true); + int u8bytes = pkpy_utils__u8len(c, true); if(u8bytes == 0) return SyntaxError("invalid char: %c", c); if(u8bytes == 1) { if(isalpha(c) || c == '_' || isdigit(c)) { diff --git a/src/interpreter/ceval.cpp b/src/interpreter/ceval.cpp index 01e77832..978d296e 100644 --- a/src/interpreter/ceval.cpp +++ b/src/interpreter/ceval.cpp @@ -446,7 +446,7 @@ PyVar VM::__run_top_frame() { case OP_BUILD_BYTES: { const Str& s = CAST(Str&, TOP()); unsigned char* p = (unsigned char*)std::malloc(s.size); - std::memcpy(p, s.data, s.size); + std::memcpy(p, s.c_str(), s.size); TOP() = VAR(Bytes(p, s.size)); } DISPATCH() diff --git a/src/interpreter/iter.cpp b/src/interpreter/iter.cpp index 15d058d5..390df813 100644 --- a/src/interpreter/iter.cpp +++ b/src/interpreter/iter.cpp @@ -49,7 +49,7 @@ void StringIter::_register(VM* vm, PyObject* mod, PyObject* type) { Str& s = PK_OBJ_GET(Str, self.ref); if(self.i == s.size) return 0; int start = self.i; - int len = utf8len(s.data[self.i]); + int len = pkpy_utils__u8len(s[self.i], false); self.i += len; vm->s_data.push(VAR(s.substr(start, len))); return 1; diff --git a/src/modules/io.cpp b/src/modules/io.cpp index 6bc551b3..30847e6b 100644 --- a/src/modules/io.cpp +++ b/src/modules/io.cpp @@ -85,7 +85,7 @@ void FileIO::_register(VM* vm, PyObject* mod, PyObject* type) { FileIO& io = PK_OBJ_GET(FileIO, args[0]); if(io.is_text) { Str& s = CAST(Str&, args[1]); - fwrite(s.data, 1, s.length(), io.fp); + fwrite(s.c_str(), 1, s.length(), io.fp); } else { Bytes& buffer = CAST(Bytes&, args[1]); fwrite(buffer.data(), 1, buffer.size(), io.fp); diff --git a/src/pocketpy.cpp b/src/pocketpy.cpp index 4e7452eb..4b523275 100644 --- a/src/pocketpy.cpp +++ b/src/pocketpy.cpp @@ -539,7 +539,7 @@ void __init_builtins(VM* _vm) { double float_out; char* p_end; try { - float_out = std::strtod(s.data, &p_end); + float_out = std::strtod(s.c_str(), &p_end); if(p_end != s.end()) throw 1; } catch(...) { vm->ValueError("invalid literal for float(): " + s.escape()); } return VAR(float_out); @@ -636,13 +636,12 @@ void __init_builtins(VM* _vm) { return VAR(self.u8_getitem(i)); }); - _vm->bind(_vm->_t(VM::tp_str), "replace(self, old, new, count=-1)", [](VM* vm, ArgsView args) { + _vm->bind(_vm->_t(VM::tp_str), "replace(self, old, new)", [](VM* vm, ArgsView args) { const Str& self = _CAST(Str&, args[0]); const Str& old = CAST(Str&, args[1]); if(old.empty()) vm->ValueError("empty substring"); const Str& new_ = CAST(Str&, args[2]); - int count = CAST(int, args[3]); - return VAR(self.replace(old, new_, count)); + return VAR(self.replace(old, new_)); }); _vm->bind(_vm->_t(VM::tp_str), "split(self, sep=' ')", [](VM* vm, ArgsView args) { @@ -705,14 +704,14 @@ void __init_builtins(VM* _vm) { const Str& suffix = CAST(Str&, args[1]); int offset = self.length() - suffix.length(); if(offset < 0) return vm->False; - bool ok = memcmp(self.data + offset, suffix.data, suffix.length()) == 0; + bool ok = memcmp(self.c_str() + offset, suffix.c_str(), suffix.length()) == 0; return VAR(ok); }); _vm->bind_func(VM::tp_str, "encode", 1, [](VM* vm, ArgsView args) { const Str& self = _CAST(Str&, args[0]); Bytes retval(self.length()); - std::memcpy(retval.data(), self.data, self.length()); + std::memcpy(retval.data(), self.c_str(), self.length()); return VAR(std::move(retval)); }); diff --git a/tests/04_str.py b/tests/04_str.py index 79778a77..79ad9a8f 100644 --- a/tests/04_str.py +++ b/tests/04_str.py @@ -39,8 +39,8 @@ assert t[-5:] == 'ow!!!' assert t[3:-3] == 's is string example....wow' assert s > q;assert s < r assert s.replace("o","") == "ftball" -assert s.replace("o","O",1) == "fOotball" -assert s.replace("foo","ball",1) == "balltball" +assert s.replace("o","O") == "fOOtball" +assert s.replace("foo","ball") == "balltball" assert s.startswith('f') == True;assert s.endswith('o') == False assert t.startswith('this') == True;