This commit is contained in:
blueloveTH 2024-08-14 12:37:43 +08:00
parent e51f86c599
commit b94b535de8
4 changed files with 40 additions and 229 deletions

View File

@ -87,7 +87,6 @@ enum Precedence {
typedef c11_array TokenArray; typedef c11_array TokenArray;
Error* Lexer__process(SourceData_ src, TokenArray* out_tokens); Error* Lexer__process(SourceData_ src, TokenArray* out_tokens);
Error* Lexer__process_and_dump(SourceData_ src, c11_string** out_string);
void TokenArray__dtor(TokenArray* self); void TokenArray__dtor(TokenArray* self);
#define Token__sv(self) (c11_sv){(self)->start, (self)->length} #define Token__sv(self) (c11_sv){(self)->start, (self)->length}

View File

@ -9,14 +9,12 @@
struct SourceData { struct SourceData {
RefCounted rc; RefCounted rc;
enum py_CompileMode mode; enum py_CompileMode mode;
bool is_precompiled;
bool is_dynamic; // for exec() and eval() bool is_dynamic; // for exec() and eval()
c11_string* filename; c11_string* filename;
c11_string* source; c11_string* source;
c11_vector /*T=const char* */ line_starts; c11_vector /*T=const char* */ line_starts;
c11_vector /*T=c11_string* */ _precompiled_tokens;
}; };
typedef struct SourceData* SourceData_; typedef struct SourceData* SourceData_;

View File

@ -5,14 +5,13 @@
#include <string.h> #include <string.h>
static void SourceData__ctor(struct SourceData* self, static void SourceData__ctor(struct SourceData* self,
const char* source, const char* source,
const char* filename, const char* filename,
enum py_CompileMode mode, enum py_CompileMode mode,
bool is_dynamic) { bool is_dynamic) {
self->filename = c11_string__new(filename); self->filename = c11_string__new(filename);
self->mode = mode; self->mode = mode;
c11_vector__ctor(&self->line_starts, sizeof(const char*)); c11_vector__ctor(&self->line_starts, sizeof(const char*));
c11_vector__ctor(&self->_precompiled_tokens, sizeof(c11_string*));
// Skip utf8 BOM if there is any. // Skip utf8 BOM if there is any.
if(strncmp(source, "\xEF\xBB\xBF", 3) == 0) source += 3; if(strncmp(source, "\xEF\xBB\xBF", 3) == 0) source += 3;
@ -26,7 +25,6 @@ static void SourceData__ctor(struct SourceData* self,
source++; source++;
} }
self->source = c11_sbuf__submit(&ss); self->source = c11_sbuf__submit(&ss);
self->is_precompiled = (strncmp(source, "pkpy:", 5) == 0);
self->is_dynamic = is_dynamic; self->is_dynamic = is_dynamic;
c11_vector__push(const char*, &self->line_starts, self->source->data); c11_vector__push(const char*, &self->line_starts, self->source->data);
} }
@ -34,19 +32,13 @@ static void SourceData__ctor(struct SourceData* self,
static void SourceData__dtor(struct SourceData* self) { static void SourceData__dtor(struct SourceData* self) {
c11_string__delete(self->filename); c11_string__delete(self->filename);
c11_string__delete(self->source); c11_string__delete(self->source);
c11_vector__dtor(&self->line_starts); c11_vector__dtor(&self->line_starts);
for(int i = 0; i < self->_precompiled_tokens.count; i++) {
c11_string__delete(c11__getitem(c11_string*, &self->_precompiled_tokens, i));
}
c11_vector__dtor(&self->_precompiled_tokens);
} }
SourceData_ SourceData__rcnew(const char* source, SourceData_ SourceData__rcnew(const char* source,
const char* filename, const char* filename,
enum py_CompileMode mode, enum py_CompileMode mode,
bool is_dynamic) { bool is_dynamic) {
SourceData_ self = malloc(sizeof(struct SourceData)); SourceData_ self = malloc(sizeof(struct SourceData));
SourceData__ctor(self, source, filename, mode, is_dynamic); SourceData__ctor(self, source, filename, mode, is_dynamic);
self->rc.count = 1; self->rc.count = 1;
@ -55,10 +47,10 @@ SourceData_ SourceData__rcnew(const char* source,
} }
bool SourceData__get_line(const struct SourceData* self, bool SourceData__get_line(const struct SourceData* self,
int lineno, int lineno,
const char** st, const char** st,
const char** ed) { const char** ed) {
if(self->is_precompiled || lineno == -1) { return false; } if(lineno < 0) return false;
lineno -= 1; lineno -= 1;
if(lineno < 0) lineno = 0; if(lineno < 0) lineno = 0;
const char* _start = c11__getitem(const char*, &self->line_starts, lineno); const char* _start = c11__getitem(const char*, &self->line_starts, lineno);
@ -72,10 +64,10 @@ bool SourceData__get_line(const struct SourceData* self,
} }
void SourceData__snapshot(const struct SourceData* self, void SourceData__snapshot(const struct SourceData* self,
c11_sbuf* ss, c11_sbuf* ss,
int lineno, int lineno,
const char* cursor, const char* cursor,
const char* name) { const char* name) {
pk_sprintf(ss, " File \"%s\", line %d", self->filename->data, lineno); pk_sprintf(ss, " File \"%s\", line %d", self->filename->data, lineno);
if(name && *name) { if(name && *name) {
@ -83,26 +75,24 @@ void SourceData__snapshot(const struct SourceData* self,
c11_sbuf__write_cstr(ss, name); c11_sbuf__write_cstr(ss, name);
} }
if(!self->is_precompiled) { c11_sbuf__write_char(ss, '\n');
c11_sbuf__write_char(ss, '\n'); const char *st = NULL, *ed;
const char *st = NULL, *ed; if(SourceData__get_line(self, lineno, &st, &ed)) {
if(SourceData__get_line(self, lineno, &st, &ed)) { while(st < ed && isblank(*st))
while(st < ed && isblank(*st)) ++st;
++st; if(st < ed) {
if(st < ed) { c11_sbuf__write_cstr(ss, " ");
c11_sbuf__write_cstr(ss, " "); c11_sbuf__write_cstrn(ss, st, ed - st);
c11_sbuf__write_cstrn(ss, st, ed - st); if(cursor && st <= cursor && cursor <= ed) {
if(cursor && st <= cursor && cursor <= ed) { c11_sbuf__write_cstr(ss, "\n ");
c11_sbuf__write_cstr(ss, "\n "); for(int i = 0; i < (cursor - st); ++i)
for(int i = 0; i < (cursor - st); ++i) c11_sbuf__write_char(ss, ' ');
c11_sbuf__write_char(ss, ' '); c11_sbuf__write_cstr(ss, "^");
c11_sbuf__write_cstr(ss, "^");
}
} else {
st = NULL;
} }
} else {
st = NULL;
} }
if(!st) { c11_sbuf__write_cstr(ss, " <?>"); }
} }
if(!st) { c11_sbuf__write_cstr(ss, " <?>"); }
} }

View File

@ -267,7 +267,11 @@ static Error* eat_name(Lexer* self) {
return NULL; return NULL;
} }
static Error* eat_string_until(Lexer* self, char quote, bool raw, c11_string** out) { enum StringType { NORMAL_STRING, RAW_STRING, F_STRING, NORMAL_BYTES };
static Error* eat_string(Lexer* self, char quote, enum StringType type) {
bool raw = type == RAW_STRING;
// previous char is quote // previous char is quote
bool quote3 = match_n_chars(self, 2, quote); bool quote3 = match_n_chars(self, 2, quote);
c11_sbuf buff; c11_sbuf buff;
@ -313,17 +317,9 @@ static Error* eat_string_until(Lexer* self, char quote, bool raw, c11_string** o
c11_sbuf__write_char(&buff, c); c11_sbuf__write_char(&buff, c);
} }
} }
*out = c11_sbuf__submit(&buff);
return NULL;
}
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING, NORMAL_BYTES }; c11_string* res = c11_sbuf__submit(&buff);
TokenValue value = {TokenValue_STR, ._str = res};
static Error* eat_string(Lexer* self, char quote, enum StringType type) {
c11_string* s;
Error* err = eat_string_until(self, quote, type == RAW_STRING, &s);
if(err) return err;
TokenValue value = {TokenValue_STR, ._str = s};
if(type == F_STRING) { if(type == F_STRING) {
add_token_with_value(self, TK_FSTR, value); add_token_with_value(self, TK_FSTR, value);
} else if(type == NORMAL_BYTES) { } else if(type == NORMAL_BYTES) {
@ -468,8 +464,7 @@ static Error* lex_one_token(Lexer* self, bool* eof) {
if(matchchar(self, '=')) { if(matchchar(self, '=')) {
add_token(self, TK_NE); add_token(self, TK_NE);
} else { } else {
Error* err = SyntaxError(self, "expected '=' after '!'"); return SyntaxError(self, "expected '=' after '!'");
if(err) return err;
} }
break; break;
case '*': case '*':
@ -523,85 +518,10 @@ static Error* lex_one_token(Lexer* self, bool* eof) {
return NULL; return NULL;
} }
/**
 * Rebuilds the lexer's pending token list (`self->nexts`) from a precompiled
 * source blob instead of tokenizing text.
 *
 * Layout consumed, in order:
 *   - the 5-byte "pkpy:" prefix followed by a version string up to '\n',
 *     which must equal PK_VERSION;
 *   - the compile mode as an unsigned integer up to '\n', which must equal
 *     `self->src->mode`;
 *   - a count, then that many '\n'-terminated raw strings; each is copied
 *     into `self->src->_precompiled_tokens` so tokens can point at it;
 *   - a count, then that many token records:
 *     `type,[raw-string index,]line,brackets_level,value\n`
 *     where an empty `line` / `brackets_level` field means "same as the
 *     previous token" and `value` is 'I'<int>, 'F'<float>, 'S'<hex> or empty.
 *
 * @param self  Lexer whose `src` holds the precompiled blob.
 * @return NULL on success, or a SyntaxError on version/mode mismatch or when
 *         a record refers to data that does not exist.
 */
static Error* from_precompiled(Lexer* self) {
    TokenDeserializer deserializer;
    TokenDeserializer__ctor(&deserializer, self->src->source->data);

    deserializer.curr += 5;  // skip "pkpy:"
    c11_sv version = TokenDeserializer__read_string(&deserializer, '\n');

    if(c11_sv__cmp2(version, PK_VERSION) != 0) {
        return SyntaxError(self, "precompiled version mismatch");
    }
    if(TokenDeserializer__read_uint(&deserializer, '\n') != (int64_t)self->src->mode) {
        return SyntaxError(self, "precompiled mode mismatch");
    }

    // Raw-string table: copied so that Token.start stays valid independently
    // of the deserializer's cursor.
    int count = TokenDeserializer__read_count(&deserializer);
    c11_vector* precompiled_tokens = &self->src->_precompiled_tokens;
    for(int i = 0; i < count; i++) {
        c11_sv item = TokenDeserializer__read_string(&deserializer, '\n');
        c11_string* copied_item = c11_string__new2(item.data, item.size);
        c11_vector__push(c11_string*, precompiled_tokens, copied_item);
    }

    count = TokenDeserializer__read_count(&deserializer);
    for(int i = 0; i < count; i++) {
        Token t;
        t.type = (TokenIndex)TokenDeserializer__read_uint(&deserializer, ',');
        if(is_raw_string_used(t.type)) {
            int64_t index = TokenDeserializer__read_uint(&deserializer, ',');
            // The index comes straight from the blob; never trust it.
            if(index < 0 || index >= precompiled_tokens->count) {
                return SyntaxError(self, "precompiled raw string index out of range");
            }
            c11_string* p = c11__getitem(c11_string*, precompiled_tokens, index);
            t.start = p->data;
            t.length = p->size;
        } else {
            t.start = NULL;
            t.length = 0;
        }

        // An empty field means "inherit from the previous token", which is
        // only meaningful once at least one token has been pushed.
        if(TokenDeserializer__match_char(&deserializer, ',')) {
            if(self->nexts.count == 0) {
                return SyntaxError(self, "precompiled token has no predecessor");
            }
            t.line = c11_vector__back(Token, &self->nexts).line;
        } else {
            t.line = (int)TokenDeserializer__read_uint(&deserializer, ',');
        }

        if(TokenDeserializer__match_char(&deserializer, ',')) {
            if(self->nexts.count == 0) {
                return SyntaxError(self, "precompiled token has no predecessor");
            }
            t.brackets_level = c11_vector__back(Token, &self->nexts).brackets_level;
        } else {
            t.brackets_level = (int)TokenDeserializer__read_uint(&deserializer, ',');
        }

        // One tag character selects the payload; for a value-less token this
        // character is whatever follows the brackets_level field.
        char type = (*deserializer.curr++);  // read_char
        switch(type) {
            case 'I': {
                int64_t res = TokenDeserializer__read_uint(&deserializer, '\n');
                t.value = (TokenValue){TokenValue_I64, ._i64 = res};
            } break;
            case 'F': {
                double res = TokenDeserializer__read_float(&deserializer, '\n');
                t.value = (TokenValue){TokenValue_F64, ._f64 = res};
            } break;
            case 'S': {
                c11_string* res = TokenDeserializer__read_string_from_hex(&deserializer, '\n');
                t.value = (TokenValue){TokenValue_STR, ._str = res};
            } break;
            default: t.value = EmptyTokenValue; break;
        }
        c11_vector__push(Token, &self->nexts, t);
    }
    return NULL;
}
Error* Lexer__process(SourceData_ src, TokenArray* out_tokens) { Error* Lexer__process(SourceData_ src, TokenArray* out_tokens) {
Lexer lexer; Lexer lexer;
Lexer__ctor(&lexer, src); Lexer__ctor(&lexer, src);
if(src->is_precompiled) {
Error* err = from_precompiled(&lexer);
// TODO: set out tokens
Lexer__dtor(&lexer);
return err;
}
// push initial tokens // push initial tokens
Token sof = Token sof =
{TK_SOF, lexer.token_start, 0, lexer.current_line, lexer.brackets_level, EmptyTokenValue}; {TK_SOF, lexer.token_start, 0, lexer.current_line, lexer.brackets_level, EmptyTokenValue};
@ -623,102 +543,6 @@ Error* Lexer__process(SourceData_ src, TokenArray* out_tokens) {
return NULL; return NULL;
} }
/**
 * Tokenizes `src` and serializes the resulting tokens into the textual
 * "pkpy:" precompiled form.
 *
 * Layout written:
 *   L1  "pkpy:" PK_VERSION
 *   L2  the compile mode as an integer
 *   L3  '=' followed by the number of distinct raw strings
 *   L4  the raw strings, one per line
 *   L5  '=' followed by the number of tokens
 *   then one line per token:
 *       `type,[raw-string index,]line,brackets_level,value`
 *   where `line` / `brackets_level` are left empty when equal to the previous
 *   token's, and `value` is 'I'<int>, 'F'<float>, 'S'<hex bytes> or empty.
 *
 * @param src  Source to tokenize; must not itself be precompiled.
 * @param out  Receives the newly built string on success.
 * @return NULL on success, otherwise the error produced by Lexer__process
 *         (in which case `*out` is not written).
 */
Error* Lexer__process_and_dump(SourceData_ src, c11_string** out) {
    assert(!src->is_precompiled);
    TokenArray nexts;  // output tokens
    Error* err = Lexer__process(src, &nexts);
    if(err) return err;

    c11_sbuf ss;
    c11_sbuf__ctor(&ss);

    // L1: version string
    c11_sbuf__write_cstr(&ss, "pkpy:" PK_VERSION "\n");
    // L2: mode
    c11_sbuf__write_int(&ss, (int)src->mode);
    c11_sbuf__write_char(&ss, '\n');

    // Collect each distinct raw string once; the value is filled in below.
    c11_smallmap_s2n token_indices;
    c11_smallmap_s2n__ctor(&token_indices);

    c11__foreach(Token, &nexts, token) {
        if(is_raw_string_used(token->type)) {
            c11_sv token_sv = {token->start, token->length};
            if(!c11_smallmap_s2n__contains(&token_indices, token_sv)) {
                c11_smallmap_s2n__set(&token_indices, token_sv, 0);
            }
        }
    }
    // L3: raw string count
    c11_sbuf__write_char(&ss, '=');
    c11_sbuf__write_int(&ss, token_indices.count);
    c11_sbuf__write_char(&ss, '\n');

    // NOTE(review): indices are 16-bit, so more than 65535 distinct raw
    // strings would wrap -- confirm this limit is acceptable.
    uint16_t index = 0;
    for(int i = 0; i < token_indices.count; i++) {
        c11_smallmap_s2n_KV* kv = c11__at(c11_smallmap_s2n_KV, &token_indices, i);
        // L4: raw strings, each terminated by '\n' because the reader splits
        // this table on newlines.
        c11_sbuf__write_cstrn(&ss, kv->key.data, kv->key.size);
        c11_sbuf__write_char(&ss, '\n');
        kv->value = index++;
    }

    // L5: token count
    c11_sbuf__write_char(&ss, '=');
    c11_sbuf__write_int(&ss, nexts.count);
    c11_sbuf__write_char(&ss, '\n');

    for(int i = 0; i < nexts.count; i++) {
        const Token* token = c11__at(Token, &nexts, i);
        c11_sbuf__write_int(&ss, (int)token->type);
        c11_sbuf__write_char(&ss, ',');

        if(is_raw_string_used(token->type)) {
            uint16_t* p =
                c11_smallmap_s2n__try_get(&token_indices, (c11_sv){token->start, token->length});
            assert(p != NULL);
            c11_sbuf__write_int(&ss, (int)*p);
            c11_sbuf__write_char(&ss, ',');
        }

        // An empty field tells the reader to reuse the previous token's value.
        if(i > 0 && c11__getitem(Token, &nexts, i - 1).line == token->line) {
            c11_sbuf__write_char(&ss, ',');
        } else {
            c11_sbuf__write_int(&ss, token->line);
            c11_sbuf__write_char(&ss, ',');
        }

        if(i > 0 && c11__getitem(Token, &nexts, i - 1).brackets_level == token->brackets_level) {
            c11_sbuf__write_char(&ss, ',');
        } else {
            c11_sbuf__write_int(&ss, token->brackets_level);
            c11_sbuf__write_char(&ss, ',');
        }
        // visit token value
        switch(token->value.index) {
            case TokenValue_EMPTY: break;
            case TokenValue_I64:
                c11_sbuf__write_char(&ss, 'I');
                // NOTE(review): the parameter width of c11_sbuf__write_int is
                // not visible here -- confirm 64-bit values are not truncated.
                c11_sbuf__write_int(&ss, token->value._i64);
                break;
            case TokenValue_F64:
                c11_sbuf__write_char(&ss, 'F');
                c11_sbuf__write_f64(&ss, token->value._f64, -1);
                break;
            case TokenValue_STR: {
                c11_sbuf__write_char(&ss, 'S');
                c11_sv sv = c11_string__sv(token->value._str);
                // Hex-encode so the payload can never contain the '\n' delimiter.
                for(int k = 0; k < sv.size; k++) {
                    c11_sbuf__write_hex(&ss, sv.data[k], false);
                }
                break;
            }
        }
        c11_sbuf__write_char(&ss, '\n');
    }
    *out = c11_sbuf__submit(&ss);
    c11_smallmap_s2n__dtor(&token_indices);
    // The token array was only needed to build the dump; release it here so
    // the caller, who never sees it, does not leak it.
    TokenArray__dtor(&nexts);
    return NULL;
}
void TokenArray__dtor(TokenArray* self) { void TokenArray__dtor(TokenArray* self) {
Token* data = self->data; Token* data = self->data;
for(int i = 0; i < self->count; i++) { for(int i = 0; i < self->count; i++) {