diff --git a/include/pocketpy/compiler/lexer.h b/include/pocketpy/compiler/lexer.h
index 1044c39f..f2c3e5ec 100644
--- a/include/pocketpy/compiler/lexer.h
+++ b/include/pocketpy/compiler/lexer.h
@@ -9,11 +9,6 @@ extern "C" {
 
 extern const char* pk_TokenSymbols[];
 
-typedef struct pk_TokenDeserializer {
-    const char* curr;
-    const char* source;
-} pk_TokenDeserializer;
-
 typedef enum TokenIndex{
     TK_EOF, TK_EOL, TK_SOF,
     TK_ID, TK_NUM, TK_STR, TK_FSTR, TK_LONG, TK_BYTES, TK_IMAG,
@@ -39,6 +34,64 @@ typedef enum TokenIndex{
     TK__COUNT__
 } TokenIndex;
 
+typedef struct TokenValue {
+    int index;
+    union {
+        int64_t _i64;   // 0
+        double _f64;    // 1
+        py_Str _str;    // 2
+    };
+} TokenValue;
+
+typedef struct Token {
+    TokenIndex type;
+    const char* start;
+    int length;
+    int line;
+    int brackets_level;
+    TokenValue value;
+} Token;
+
+// https://docs.python.org/3/reference/expressions.html#operator-precedence
+enum Precedence {
+    PREC_LOWEST,
+    PREC_LAMBDA,       // lambda
+    PREC_TERNARY,      // ?:
+    PREC_LOGICAL_OR,   // or
+    PREC_LOGICAL_AND,  // and
+    PREC_LOGICAL_NOT,  // not
+    /* https://docs.python.org/3/reference/expressions.html#comparisons
+     * Unlike C, all comparison operations in Python have the same priority,
+     * which is lower than that of any arithmetic, shifting or bitwise operation.
+     * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
+     */
+    PREC_COMPARISION,    // < > <= >= != ==, in / is / is not / not in
+    PREC_BITWISE_OR,     // |
+    PREC_BITWISE_XOR,    // ^
+    PREC_BITWISE_AND,    // &
+    PREC_BITWISE_SHIFT,  // << >>
+    PREC_TERM,           // + -
+    PREC_FACTOR,         // * / % // @
+    PREC_UNARY,          // - not ~
+    PREC_EXPONENT,       // **
+    PREC_PRIMARY,        // f() x[] a.b 1:2
+    PREC_HIGHEST,
+};
+
+enum StringType {
+    NORMAL_STRING,
+    RAW_STRING,
+    F_STRING,
+    NORMAL_BYTES
+};
+
+#define is_raw_string_used(t) ((t) == TK_ID || (t) == TK_LONG)
+
+typedef struct pk_TokenDeserializer {
+    const char* curr;
+    const char* source;
+} pk_TokenDeserializer;
+
 void pk_TokenDeserializer__ctor(pk_TokenDeserializer* self, const char* source);
 bool pk_TokenDeserializer__match_char(pk_TokenDeserializer* self, char c);
 c11_string pk_TokenDeserializer__read_string(pk_TokenDeserializer* self, char c);
@@ -47,6 +100,7 @@ int pk_TokenDeserializer__read_count(pk_TokenDeserializer* self);
 int64_t pk_TokenDeserializer__read_uint(pk_TokenDeserializer* self, char c);
 double pk_TokenDeserializer__read_float(pk_TokenDeserializer* self, char c);
 
+
 #ifdef __cplusplus
 }
 #endif
\ No newline at end of file
diff --git a/include/pocketpy/compiler/lexer.hpp b/include/pocketpy/compiler/lexer.hpp
index c6ffb841..68381eb7 100644
--- a/include/pocketpy/compiler/lexer.hpp
+++ b/include/pocketpy/compiler/lexer.hpp
@@ -8,50 +8,6 @@
 
 namespace pkpy {
 
-using TokenValue = std::variant;
-
-constexpr inline bool is_raw_string_used(TokenIndex t) noexcept{ return t == TK_ID || t == TK_LONG; }
-
-struct Token {
-    TokenIndex type;
-    const char* start;
-    int length;
-    int line;
-    int brackets_level;
-    TokenValue value;
-
-    Str str() const noexcept{ return Str(start, length); }
-
-    std::string_view sv() const noexcept{ return std::string_view(start, length); }
-};
-
-// https://docs.python.org/3/reference/expressions.html#operator-precedence
-enum Precedence {
-    PREC_LOWEST,
-    PREC_LAMBDA,       // lambda
-    PREC_TERNARY,      // ?:
-    PREC_LOGICAL_OR,   // or
-    PREC_LOGICAL_AND,  // and
-    PREC_LOGICAL_NOT,  // not
-    /* https://docs.python.org/3/reference/expressions.html#comparisons
-     * Unlike C, all comparison operations in Python have the same priority,
-     * which is lower than that of any arithmetic, shifting or bitwise operation.
-     * Also unlike C, expressions like a < b < c have the interpretation that is conventional in mathematics.
-     */
-    PREC_COMPARISION,    // < > <= >= != ==, in / is / is not / not in
-    PREC_BITWISE_OR,     // |
-    PREC_BITWISE_XOR,    // ^
-    PREC_BITWISE_AND,    // &
-    PREC_BITWISE_SHIFT,  // << >>
-    PREC_TERM,           // + -
-    PREC_FACTOR,         // * / % // @
-    PREC_UNARY,          // - not ~
-    PREC_EXPONENT,       // **
-    PREC_PRIMARY,        // f() x[] a.b 1:2
-    PREC_HIGHEST,
-};
-
-enum class StringType { NORMAL_STRING, RAW_STRING, F_STRING, NORMAL_BYTES };
 
 struct Lexer {
     PK_ALWAYS_PASS_BY_POINTER(Lexer)
diff --git a/include/pocketpy/objects/codeobject.h b/include/pocketpy/objects/codeobject.h
index b11f554b..cba87014 100644
--- a/include/pocketpy/objects/codeobject.h
+++ b/include/pocketpy/objects/codeobject.h
@@ -68,7 +68,7 @@ typedef struct BytecodeEx {
 } BytecodeEx;
 
 typedef struct CodeObject {
-    pkpy_SourceData_ src;
+    pk_SourceData_ src;
     py_Str name;
 
     c11_vector/*T=Bytecode*/ codes;
@@ -88,7 +88,7 @@ typedef struct CodeObject {
     int end_line;
 } CodeObject;
 
-CodeObject* CodeObject__new(pkpy_SourceData_ src, c11_string name);
+CodeObject* CodeObject__new(pk_SourceData_ src, c11_string name);
 void CodeObject__delete(CodeObject* self);
 void CodeObject__gc_mark(const CodeObject* self);
 
@@ -117,7 +117,7 @@ typedef struct FuncDecl {
 
 typedef FuncDecl* FuncDecl_;
 
-FuncDecl_ FuncDecl__rcnew(pkpy_SourceData_ src, c11_string name);
+FuncDecl_ FuncDecl__rcnew(pk_SourceData_ src, c11_string name);
 void FuncDecl__dtor(FuncDecl* self);
 void FuncDecl__add_kwarg(FuncDecl* self, int index, uint16_t key, const PyVar* value);
 void FuncDecl__gc_mark(const FuncDecl* self);
diff --git a/include/pocketpy/objects/error.h b/include/pocketpy/objects/error.h
index 7cada98c..dec2d402 100644
--- a/include/pocketpy/objects/error.h
+++ b/include/pocketpy/objects/error.h
@@ -10,7 +10,7 @@ extern "C" {
 #endif
 
 typedef struct pkpy_ExceptionFrame {
-    pkpy_SourceData_ src;
+    pk_SourceData_ src;
     int lineno;
     const char* cursor;
     py_Str name;
@@ -31,12 +31,12 @@ typedef struct pkpy_Exception {
 void pkpy_Exception__ctor(pkpy_Exception* self, StrName type);
 void pkpy_Exception__dtor(pkpy_Exception* self);
 
-void pkpy_Exception__stpush(pkpy_Exception* self, pkpy_SourceData_ src, int lineno, const char* cursor, const char* name);
+void pkpy_Exception__stpush(pkpy_Exception* self, pk_SourceData_ src, int lineno, const char* cursor, const char* name);
 py_Str pkpy_Exception__summary(pkpy_Exception* self);
 
 struct Error{
     const char* type;
-    pkpy_SourceData_ src;
+    pk_SourceData_ src;
     int lineno;
     const char* cursor;
     char msg[100];
diff --git a/include/pocketpy/objects/sourcedata.h b/include/pocketpy/objects/sourcedata.h
index a7f16e65..92da49f8 100644
--- a/include/pocketpy/objects/sourcedata.h
+++ b/include/pocketpy/objects/sourcedata.h
@@ -11,7 +11,7 @@ extern "C" {
 
 enum CompileMode { EXEC_MODE, EVAL_MODE, REPL_MODE, JSON_MODE, CELL_MODE };
 
-struct pkpy_SourceData {
+struct pk_SourceData {
     RefCounted rc;
     enum CompileMode mode;
     bool is_precompiled;
@@ -23,14 +23,11 @@ struct pkpy_SourceData {
     c11_vector/*T=py_Str*/ _precompiled_tokens;
 };
 
-typedef struct pkpy_SourceData* pkpy_SourceData_;
+typedef struct pk_SourceData* pk_SourceData_;
 
-pkpy_SourceData_ pkpy_SourceData__rcnew(c11_string source, const py_Str *filename, enum CompileMode mode);
-void pkpy_SourceData__ctor(struct pkpy_SourceData *self, c11_string source, const py_Str *filename, enum CompileMode mode);
-void pkpy_SourceData__dtor(struct pkpy_SourceData* self);
-
-bool pkpy_SourceData__get_line(const struct pkpy_SourceData* self, int lineno, const char** st, const char** ed);
-py_Str pkpy_SourceData__snapshot(const struct pkpy_SourceData *self, int lineno, const char *cursor, const char *name);
+pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode);
+bool pk_SourceData__get_line(const struct pk_SourceData* self, int lineno, const char** st, const char** ed);
+py_Str pk_SourceData__snapshot(const struct pk_SourceData *self, int lineno, const char *cursor, const char *name);
 
 #ifdef __cplusplus
 }
diff --git a/src/common/sourcedata.c b/src/common/sourcedata.c
index b71ce70d..1cde6133 100644
--- a/src/common/sourcedata.c
+++ b/src/common/sourcedata.c
@@ -4,40 +4,33 @@
 #include
 #include
 
-pkpy_SourceData_ pkpy_SourceData__rcnew(c11_string source, const py_Str* filename, enum CompileMode mode) {
-    pkpy_SourceData_ self = malloc(sizeof(struct pkpy_SourceData));
-    pkpy_SourceData__ctor(self, source, filename, mode);
-    self->rc.count = 1;
-    self->rc.dtor = (void(*)(void*))pkpy_SourceData__dtor;
-    return self;
-}
-
-void pkpy_SourceData__ctor(struct pkpy_SourceData* self,
-                           c11_string source,  // may not be null-terminated
-                           const py_Str* filename,
+void pk_SourceData__ctor(struct pk_SourceData* self,
+                           const char* source,
+                           const char* filename,
                            enum CompileMode mode) {
-    self->filename = py_Str__copy(filename);  // OPTIMIZEME?
+    py_Str__ctor(&self->filename, filename);
     self->mode = mode;
     c11_vector__ctor(&self->line_starts, sizeof(const char*));
     c11_vector__ctor(&self->_precompiled_tokens, sizeof(py_Str));
 
-    int index = 0;
     // Skip utf8 BOM if there is any.
-    if (source.size >= 3 && strncmp(source.data, "\xEF\xBB\xBF", 3) == 0) index += 3;
+    if(strncmp(source, "\xEF\xBB\xBF", 3) == 0) source += 3;
     // Drop all '\r'
     pk_SStream ss;
-    pk_SStream__ctor2(&ss, source.size + 1);
-    while(index < source.size){
-        char c = source.data[index];
+    pk_SStream__ctor(&ss);
+    while(true){
+        char c = *source;
+        if(c == '\0') break;
         if(c != '\r') pk_SStream__write_char(&ss, c);
-        index++;
+        source++;
     }
     self->source = pk_SStream__submit(&ss);
-    self->is_precompiled = (strncmp(py_Str__data(&self->source), "pkpy:", 5) == 0);
-    c11_vector__push(const char*, &self->line_starts, py_Str__data(&self->source));
+    source = py_Str__data(&self->source);
+    self->is_precompiled = (strncmp(source, "pkpy:", 5) == 0);
+    c11_vector__push(const char*, &self->line_starts, source);
 }
 
-void pkpy_SourceData__dtor(struct pkpy_SourceData* self) {
+void pk_SourceData__dtor(struct pk_SourceData* self) {
     py_Str__dtor(&self->filename);
     py_Str__dtor(&self->source);
     c11_vector__dtor(&self->line_starts);
@@ -48,7 +41,15 @@ void pkpy_SourceData__dtor(struct pkpy_SourceData* self) {
     c11_vector__dtor(&self->_precompiled_tokens);
 }
 
-bool pkpy_SourceData__get_line(const struct pkpy_SourceData* self, int lineno, const char** st, const char** ed) {
+pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode) {
+    pk_SourceData_ self = malloc(sizeof(struct pk_SourceData));
+    pk_SourceData__ctor(self, source, filename, mode);
+    self->rc.count = 1;
+    self->rc.dtor = (void(*)(void*))pk_SourceData__dtor;
+    return self;
+}
+
+bool pk_SourceData__get_line(const struct pk_SourceData* self, int lineno, const char** st, const char** ed) {
     if(self->is_precompiled || lineno == -1) { return false; }
     lineno -= 1;
     if(lineno < 0) lineno = 0;
@@ -62,7 +63,7 @@ bool pkpy_SourceData__get_line(const struct pkpy_SourceData* self, int lineno, c
     return true;
 }
 
-py_Str pkpy_SourceData__snapshot(const struct pkpy_SourceData* self, int lineno, const char* cursor, const char* name) {
+py_Str pk_SourceData__snapshot(const struct pk_SourceData* self, int lineno, const char* cursor, const char* name) {
     pk_SStream ss;
     pk_SStream__ctor(&ss);
 
@@ -85,7 +86,7 @@ py_Str pkpy_SourceData__snapshot(const struct pkpy_SourceData* self, int lineno,
     if(!self->is_precompiled) {
         pk_SStream__write_char(&ss, '\n');
         const char *st = NULL, *ed;
-        if(pkpy_SourceData__get_line(self, lineno, &st, &ed)) {
+        if(pk_SourceData__get_line(self, lineno, &st, &ed)) {
             while(st < ed && isblank(*st))
                 ++st;
             if(st < ed) {
diff --git a/src/common/sstream.c b/src/common/sstream.c
index 0e9127cb..40e74caf 100644
--- a/src/common/sstream.c
+++ b/src/common/sstream.c
@@ -152,6 +152,7 @@ void pk_SStream__write_any(pk_SStream* self, const char* fmt, const pk_AnyStr* a
 
 py_Str pk_SStream__submit(pk_SStream* self) {
     c11_vector__push(char, &self->data, '\0');
+    // TODO: optimize c11__isascii
     py_Str retval = {
         .size = self->data.count - 1,
         .is_ascii = c11__isascii((char*)self->data.data, self->data.count),
diff --git a/src/compiler/lexer.c b/src/compiler/lexer.c
index 9fed4882..2926a154 100644
--- a/src/compiler/lexer.c
+++ b/src/compiler/lexer.c
@@ -2,6 +2,7 @@
 #include "pocketpy/common/str.h"
 #include "pocketpy/common/smallmap.h"
 #include "pocketpy/compiler/lexer.h"
+#include "pocketpy/objects/sourcedata.h"
 
 const char* pk_TokenSymbols[] = {
     "@eof", "@eol", "@sof",
diff --git a/src/compiler/lexer2.c b/src/compiler/lexer2.c
new file mode 100644
index 00000000..dea39abf
--- /dev/null
+++ b/src/compiler/lexer2.c
@@ -0,0 +1,222 @@
+#include "pocketpy/compiler/lexer.h"
+#include "pocketpy/objects/sourcedata.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+// Working state of one tokenization pass over a single source.
+typedef struct pk_Lexer{
+    pk_SourceData_ src;         // source being lexed (a reference is held from ctor to dtor)
+    const char* token_start;    // start of the token currently being scanned
+    const char* curr_char;      // next character to be consumed
+    int current_line;           // 1-based line number at `curr_char`
+    int brackets_level;         // count of currently unclosed ( [ {
+
+    c11_vector/*T=Token*/ nexts;    // tokens emitted so far
+    c11_vector/*T=int*/ indents;    // stack of active indentation widths
+} pk_Lexer;
+
+// Zero-initialized payload for tokens that carry no value.
+static const TokenValue EmptyTokenValue;
+
+// Prepares `self` to lex `src` from its first character and takes a reference on `src`.
+void pk_Lexer__ctor(pk_Lexer* self, pk_SourceData_ src){
+    PK_INCREF(src);
+    self->src = src;
+    self->curr_char = self->token_start = py_Str__data(&src->source);
+    self->current_line = 1;
+    self->brackets_level = 0;
+    c11_vector__ctor(&self->nexts, sizeof(Token));
+    c11_vector__ctor(&self->indents, sizeof(int));
+}
+
+// Releases the reference on `src` and destroys both vectors.
+void pk_Lexer__dtor(pk_Lexer* self){
+    PK_DECREF(self->src);
+    c11_vector__dtor(&self->nexts);
+    c11_vector__dtor(&self->indents);
+}
+
+// Tokenizes `src`. Returns NULL on success, or the error returned by
+// lex_one_token(); precompiled sources are delegated to from_precompiled().
+// NOTE(review): work in progress. `out_tokens` is never written, and
+// from_precompiled() / lex_one_token() are not declared in this file;
+// lex_one_token() is also not passed `lexer`. Confirm the intended
+// signatures when those functions are ported.
+void* pk_Lexer__run(pk_SourceData_ src, void** out_tokens){
+    pk_Lexer lexer;
+    pk_Lexer__ctor(&lexer, src);
+
+    if(src->is_precompiled) {
+        pk_Lexer__dtor(&lexer);
+        return from_precompiled();
+    }
+    // push initial tokens
+    Token sof = {TK_SOF, lexer.token_start, 0, lexer.current_line, lexer.brackets_level, EmptyTokenValue};
+    c11_vector__push(Token, &lexer.nexts, sof);
+    c11_vector__push(int, &lexer.indents, 0);
+
+    bool eof = false;
+    while(!eof) {
+        void* err = lex_one_token(&eof);
+        if(err){
+            pk_Lexer__dtor(&lexer);
+            return err;
+        }
+    }
+    pk_Lexer__dtor(&lexer);
+    return NULL;
+}
+
+// Consumes and returns the current character, which must not be '\n'.
+char eatchar(pk_Lexer* self){
+    char c = *self->curr_char;
+    assert(c != '\n');  // eatchar() cannot consume a newline
+    self->curr_char++;
+    return c;
+}
+
+// Consumes and returns the current character. On '\n' it also advances
+// `current_line` and records the start of the next line in `src->line_starts`.
+char eatchar_include_newline(pk_Lexer* self){
+    char c = *self->curr_char;
+    self->curr_char++;
+    if(c == '\n') {
+        self->current_line++;
+        c11_vector__push(const char*, &self->src->line_starts, self->curr_char);
+    }
+    return c;
+}
+
+// Consumes leading blanks and returns their width: a space counts 1, a tab counts 4.
+int eat_spaces(pk_Lexer* self){
+    int count = 0;
+    while(true) {
+        switch(*self->curr_char) {
+            case ' ': count += 1; break;
+            case '\t': count += 4; break;
+            default: return count;
+        }
+        eatchar(self);
+    }
+}
+
+// Consumes the current character only if it equals `c`; returns whether it did.
+bool matchchar(pk_Lexer* self, char c){
+    if(*self->curr_char != c) return false;
+    eatchar_include_newline(self);
+    return true;
+}
+
+// Consumes `n` consecutive `c0` characters if the input starts with them.
+// Returns false and consumes nothing otherwise.
+bool match_n_chars(pk_Lexer* self, int n, char c0){
+    const char* c = self->curr_char;
+    for(int i = 0; i < n; i++) {
+        if(*c == '\0') return false;
+        if(*c != c0) return false;
+        c++;
+    }
+    for(int i = 0; i < n; i++)
+        eatchar_include_newline(self);
+    return true;
+}
+
+// Consumes `s` if the input starts with it; returns whether it matched.
+bool match_string(pk_Lexer* self, const char* s){
+    int s_len = strlen(s);
+    bool ok = strncmp(self->curr_char, s, s_len) == 0;
+    if(ok){
+        for(int i = 0; i < s_len; i++)
+            eatchar_include_newline(self);
+    }
+    return ok;
+}
+
+// Advances to the end of the current line; the '\n' itself is not consumed.
+void skip_line_comment(pk_Lexer* self){
+    while(*self->curr_char) {
+        if(*self->curr_char == '\n') return;
+        eatchar(self);
+    }
+}
+
+// Appends a token of `type` covering [token_start, curr_char) and updates
+// `brackets_level` for bracket tokens. A token that completes "not in",
+// "is not" or "yield from" is merged into the previous token instead.
+// A TK_EOL token is recorded with the previous line number.
+void add_token(pk_Lexer* self, TokenIndex type, TokenValue value){
+    switch(type) {
+        case TK_LBRACE:
+        case TK_LBRACKET:
+        case TK_LPAREN: self->brackets_level++; break;
+        case TK_RPAREN:
+        case TK_RBRACKET:
+        case TK_RBRACE: self->brackets_level--; break;
+        default: break;
+    }
+    Token token = {type,
+                   self->token_start,
+                   (int)(self->curr_char - self->token_start),
+                   self->current_line - ((type == TK_EOL) ? 1 : 0),
+                   self->brackets_level,
+                   value};
+    // handle "not in", "is not", "yield from"
+    if(self->nexts.count > 0) {
+        Token* back = &c11_vector__back(Token, &self->nexts);
+        if(back->type == TK_NOT_KW && type == TK_IN) {
+            back->type = TK_NOT_IN;
+            return;
+        }
+        if(back->type == TK_IS && type == TK_NOT_KW) {
+            back->type = TK_IS_NOT;
+            return;
+        }
+        if(back->type == TK_YIELD && type == TK_FROM) {
+            back->type = TK_YIELD_FROM;
+            return;
+        }
+    }
+    // Always append, so a token is never silently dropped when `nexts` is empty.
+    c11_vector__push(Token, &self->nexts, token);
+}
+
+
+// Emits `two` if the next character is `c` (consuming it), otherwise emits `one`.
+void add_token_2(pk_Lexer* self, char c, TokenIndex one, TokenIndex two){
+    if(matchchar(self, c))
+        add_token(self, two, EmptyTokenValue);
+    else
+        add_token(self, one, EmptyTokenValue);
+}
+
+// Measures the indentation at `curr_char` and pushes TK_INDENT / TK_DEDENT
+// tokens according to the `indents` stack. Returns true without consuming
+// anything inside brackets, and emits nothing for a line holding only blanks
+// and/or a comment. Returns false when a dedent does not match any enclosing
+// indentation level.
+bool eat_indentation(pk_Lexer* self){
+    if(self->brackets_level > 0) return true;
+    int spaces = eat_spaces(self);
+    if(*self->curr_char == '#') skip_line_comment(self);
+    if(*self->curr_char == '\0' || *self->curr_char == '\n'){
+        return true;
+    }
+    // https://docs.python.org/3/reference/lexical_analysis.html#indentation
+    int indents_back = c11_vector__back(int, &self->indents);
+    if(spaces > indents_back) {
+        c11_vector__push(int, &self->indents, spaces);
+        Token t = {TK_INDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
+        c11_vector__push(Token, &self->nexts, t);
+    } else if(spaces < indents_back) {
+        do {
+            c11_vector__pop(int, &self->indents);
+            Token t = {TK_DEDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
+            c11_vector__push(Token, &self->nexts, t);
+            indents_back = c11_vector__back(int, &self->indents);
+        } while(spaces < indents_back);
+        if(spaces != indents_back) { return false; }
+    }
+    return true;
+}
diff --git a/src/error.c b/src/error.c
index 371e0012..39155d57 100644
--- a/src/error.c
+++ b/src/error.c
@@ -23,7 +23,7 @@ void pkpy_Exception__dtor(pkpy_Exception* self){
     c11_vector__dtor(&self->stacktrace);
 }
 
-void pkpy_Exception__stpush(pkpy_Exception* self, pkpy_SourceData_ src, int lineno, const char* cursor, const char* name){
+void pkpy_Exception__stpush(pkpy_Exception* self, pk_SourceData_ src, int lineno, const char* cursor, const char* name){
     if(self->stacktrace.count >= 7) return;
     PK_INCREF(src);
     pkpy_ExceptionFrame* frame = c11_vector__emplace(&self->stacktrace);
@@ -42,7 +42,7 @@ py_Str pkpy_Exception__summary(pkpy_Exception* self){
     }
     for(int i=self->stacktrace.count-1; i >= 0; i--) {
         pkpy_ExceptionFrame* frame = c11__at(pkpy_ExceptionFrame, &self->stacktrace, i);
-        py_Str s = pkpy_SourceData__snapshot(frame->src, frame->lineno, frame->cursor, py_Str__data(&frame->name));
+        py_Str s = pk_SourceData__snapshot(frame->src, frame->lineno, frame->cursor, py_Str__data(&frame->name));
         pk_SStream__write_Str(&ss, &s);
         py_Str__dtor(&s);
         pk_SStream__write_cstr(&ss, "\n");
diff --git a/src/objects/codeobject.c b/src/objects/codeobject.c
index d69e968d..2053908d 100644
--- a/src/objects/codeobject.c
+++ b/src/objects/codeobject.c
@@ -13,7 +13,7 @@ bool Bytecode__is_forward_jump(const Bytecode* self) {
     return self->op >= OP_JUMP_FORWARD && self->op <= OP_LOOP_BREAK;
 }
 
-FuncDecl_ FuncDecl__rcnew(pkpy_SourceData_ src, c11_string name){
+FuncDecl_ FuncDecl__rcnew(pk_SourceData_ src, c11_string name){
     FuncDecl* self = malloc(sizeof(FuncDecl));
     self->rc.count = 1;
     self->rc.dtor = (void (*)(void*))FuncDecl__dtor;
@@ -46,7 +46,7 @@ void FuncDecl__add_kwarg(FuncDecl* self, int index, uint16_t key, const PyVar* v
     c11_vector__push(FuncDeclKwArg, &self->kwargs, item);
 }
 
-CodeObject* CodeObject__new(pkpy_SourceData_ src, c11_string name){
+CodeObject* CodeObject__new(pk_SourceData_ src, c11_string name){
     CodeObject* self = malloc(sizeof(CodeObject));
     self->src = src; PK_INCREF(src);
     py_Str__ctor2(&self->name, name.data, name.size);