mirror of
https://github.com/pocketpy/pocketpy
synced 2025-10-20 03:20:18 +00:00
Compare commits
8 Commits
fa31f4c5df
...
6f4617b83d
Author | SHA1 | Date | |
---|---|---|---|
|
6f4617b83d | ||
|
4c332b7d16 | ||
|
7748d2bf03 | ||
|
881e94e8b0 | ||
|
72723d24f5 | ||
|
1ac08cfc2b | ||
|
79b9df3392 | ||
|
104c266bc0 |
@ -11,8 +11,8 @@ extern "C" {
|
||||
#define kPoolObjectArenaSize (256*1024)
|
||||
#define kPoolObjectMaxBlocks (kPoolObjectArenaSize / kPoolObjectBlockSize)
|
||||
|
||||
void Pools_initialize();
|
||||
void Pools_finalize();
|
||||
void pk_MemoryPools__initialize();
|
||||
void pk_MemoryPools__finalize();
|
||||
|
||||
void* PoolExpr_alloc();
|
||||
void PoolExpr_dealloc(void*);
|
||||
|
@ -34,6 +34,7 @@ c11_vector c11_vector__copy(const c11_vector* self);
|
||||
void c11_vector__reserve(c11_vector* self, int capacity);
|
||||
void c11_vector__clear(c11_vector* self);
|
||||
void* c11_vector__emplace(c11_vector* self);
|
||||
c11_array c11_vector__submit(c11_vector* self);
|
||||
|
||||
#define c11__getitem(T, self, index) (((T*)(self)->data)[index])
|
||||
#define c11__setitem(T, self, index, value) ((T*)(self)->data)[index] = value;
|
||||
|
18
include/pocketpy/compiler/compiler.h
Normal file
18
include/pocketpy/compiler/compiler.h
Normal file
@ -0,0 +1,18 @@
|
||||
#pragma once
|
||||
|
||||
#include "pocketpy/common/vector.h"
|
||||
#include "pocketpy/compiler/lexer.h"
|
||||
#include "pocketpy/objects/sourcedata.h"
|
||||
#include "pocketpy/objects/codeobject.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
Error* pk_compile(pk_SourceData_ src, CodeObject* out);
|
||||
void pk_Compiler__initialize();
|
||||
#define pk_Compiler__finalize() // do nothing
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -1,52 +1,63 @@
|
||||
// #pragma once
|
||||
#pragma once
|
||||
|
||||
// #include <stdbool.h>
|
||||
// #include "pocketpy/common/memorypool.h"
|
||||
// #include "pocketpy/compiler/lexer.h"
|
||||
#include <stdbool.h>
|
||||
#include "pocketpy/common/memorypool.h"
|
||||
#include "pocketpy/compiler/lexer.h"
|
||||
#include "pocketpy/objects/codeobject.h"
|
||||
|
||||
// #ifdef __cplusplus
|
||||
// extern "C" {
|
||||
// #endif
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// struct pk_Expr;
|
||||
// struct pk_CodeEmitContext;
|
||||
typedef struct pk_Expr pk_Expr;
|
||||
typedef struct pk_CodeEmitContext pk_CodeEmitContext;
|
||||
|
||||
// struct pk_ExprVt{
|
||||
// void (*dtor)(pk_Expr*);
|
||||
// /* reflections */
|
||||
// bool (*is_literal)(const pk_Expr*);
|
||||
// bool (*is_json_object)(const pk_Expr*);
|
||||
// bool (*is_attrib)(const pk_Expr*);
|
||||
// bool (*is_subscr)(const pk_Expr*);
|
||||
// bool (*is_compare)(const pk_Expr*);
|
||||
// int (*star_level)(const pk_Expr*);
|
||||
// bool (*is_tuple)(const pk_Expr*);
|
||||
// bool (*is_name)(const pk_Expr*);
|
||||
// /* emit */
|
||||
// void (*emit_)(pk_Expr*, pk_CodeEmitContext*);
|
||||
// bool (*emit_del)(pk_Expr*, pk_CodeEmitContext*);
|
||||
// bool (*emit_store)(pk_Expr*, pk_CodeEmitContext*);
|
||||
// void (*emit_inplace)(pk_Expr*, pk_CodeEmitContext*);
|
||||
// bool (*emit_store_inplace)(pk_Expr*, pk_CodeEmitContext*);
|
||||
// };
|
||||
typedef struct pk_ExprVt{
|
||||
void (*dtor)(pk_Expr*);
|
||||
/* reflections */
|
||||
bool (*is_literal)(const pk_Expr*);
|
||||
bool (*is_json_object)(const pk_Expr*);
|
||||
bool (*is_attrib)(const pk_Expr*);
|
||||
bool (*is_subscr)(const pk_Expr*);
|
||||
bool (*is_compare)(const pk_Expr*);
|
||||
int (*star_level)(const pk_Expr*);
|
||||
bool (*is_tuple)(const pk_Expr*);
|
||||
bool (*is_name)(const pk_Expr*);
|
||||
/* emit */
|
||||
void (*emit_)(pk_Expr*, pk_CodeEmitContext*);
|
||||
bool (*emit_del)(pk_Expr*, pk_CodeEmitContext*);
|
||||
bool (*emit_store)(pk_Expr*, pk_CodeEmitContext*);
|
||||
void (*emit_inplace)(pk_Expr*, pk_CodeEmitContext*);
|
||||
bool (*emit_store_inplace)(pk_Expr*, pk_CodeEmitContext*);
|
||||
} pk_ExprVt;
|
||||
|
||||
// typedef struct pk_Expr{
|
||||
// pk_ExprVt* vt;
|
||||
// int line;
|
||||
// } pk_Expr;
|
||||
typedef struct pk_Expr{
|
||||
pk_ExprVt* vt;
|
||||
int line;
|
||||
} pk_Expr;
|
||||
|
||||
// void pk_ExprVt__ctor(pk_ExprVt* vt);
|
||||
// void pk_Expr__emit_(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
// bool pk_Expr__emit_del(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
// bool pk_Expr__emit_store(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
// void pk_Expr__emit_inplace(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
// bool pk_Expr__emit_store_inplace(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
// void pk_Expr__delete(pk_Expr* self);
|
||||
void pk_ExprVt__ctor(pk_ExprVt* vt);
|
||||
void pk_Expr__emit_(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
bool pk_Expr__emit_del(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
bool pk_Expr__emit_store(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
void pk_Expr__emit_inplace(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
bool pk_Expr__emit_store_inplace(pk_Expr* self, pk_CodeEmitContext* ctx);
|
||||
void pk_Expr__delete(pk_Expr* self);
|
||||
|
||||
// typedef struct pk_CodeEmitContext{
|
||||
typedef struct pk_CodeEmitContext{
|
||||
CodeObject* co; // 1 CodeEmitContext <=> 1 CodeObject*
|
||||
FuncDecl* func; // optional, weakref
|
||||
int level;
|
||||
int curr_iblock;
|
||||
bool is_compiling_class;
|
||||
c11_vector/*T=Expr* */ s_expr;
|
||||
c11_vector/*T=StrName*/ global_names;
|
||||
c11_smallmap_s2n co_consts_string_dedup_map;
|
||||
} pk_CodeEmitContext;
|
||||
|
||||
// } pk_CodeEmitContext;
|
||||
void pk_CodeEmitContext__ctor(pk_CodeEmitContext* self, CodeObject* co, FuncDecl* func, int level);
|
||||
void pk_CodeEmitContext__dtor(pk_CodeEmitContext* self);
|
||||
|
||||
// #ifdef __cplusplus
|
||||
// }
|
||||
// #endif
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -53,16 +53,16 @@ struct CodeEmitContext{
|
||||
int level;
|
||||
vector<StrName> global_names;
|
||||
|
||||
CodeEmitContext(VM* vm, CodeObject* co, int level) : vm(vm), co(co), level(level) {
|
||||
func = NULL;
|
||||
c11_smallmap_s2n__ctor(&_co_consts_string_dedup_map);
|
||||
}
|
||||
|
||||
int curr_iblock = 0;
|
||||
bool is_compiling_class = false;
|
||||
|
||||
c11_smallmap_s2n _co_consts_string_dedup_map;
|
||||
|
||||
CodeEmitContext(VM* vm, CodeObject* co, int level) : vm(vm), co(co), level(level) {
|
||||
func = NULL;
|
||||
c11_smallmap_s2n__ctor(&_co_consts_string_dedup_map);
|
||||
}
|
||||
|
||||
int get_loop() const noexcept;
|
||||
CodeBlock* enter_block(CodeBlockType type) noexcept;
|
||||
void exit_block() noexcept;
|
||||
|
@ -1,6 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include "pocketpy/common/str.h"
|
||||
#include "pocketpy/common/vector.h"
|
||||
#include "pocketpy/objects/sourcedata.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -34,12 +36,19 @@ typedef enum TokenIndex{
|
||||
TK__COUNT__
|
||||
} TokenIndex;
|
||||
|
||||
enum TokenValueIndex{
|
||||
TokenValue_EMPTY = 0,
|
||||
TokenValue_I64 = 1,
|
||||
TokenValue_F64 = 2,
|
||||
TokenValue_STR = 3,
|
||||
};
|
||||
|
||||
typedef struct TokenValue {
|
||||
int index;
|
||||
enum TokenValueIndex index; // 0: empty
|
||||
union {
|
||||
int64_t _i64; // 0
|
||||
double _f64; // 1
|
||||
py_Str _str; // 2
|
||||
int64_t _i64; // 1
|
||||
double _f64; // 2
|
||||
py_Str _str; // 3
|
||||
};
|
||||
} TokenValue;
|
||||
|
||||
@ -78,28 +87,21 @@ enum Precedence {
|
||||
PREC_HIGHEST,
|
||||
};
|
||||
|
||||
enum StringType {
|
||||
NORMAL_STRING,
|
||||
RAW_STRING,
|
||||
F_STRING,
|
||||
NORMAL_BYTES
|
||||
};
|
||||
typedef enum IntParsingResult{
|
||||
IntParsing_SUCCESS,
|
||||
IntParsing_FAILURE,
|
||||
IntParsing_OVERFLOW,
|
||||
} IntParsingResult;
|
||||
|
||||
#define is_raw_string_used(t) ((t) == TK_ID || (t) == TK_LONG)
|
||||
IntParsingResult parse_uint(c11_string text, int64_t* out, int base);
|
||||
|
||||
typedef struct pk_TokenDeserializer {
|
||||
const char* curr;
|
||||
const char* source;
|
||||
} pk_TokenDeserializer;
|
||||
typedef struct Error Error;
|
||||
|
||||
void pk_TokenDeserializer__ctor(pk_TokenDeserializer* self, const char* source);
|
||||
bool pk_TokenDeserializer__match_char(pk_TokenDeserializer* self, char c);
|
||||
c11_string pk_TokenDeserializer__read_string(pk_TokenDeserializer* self, char c);
|
||||
py_Str pk_TokenDeserializer__read_string_from_hex(pk_TokenDeserializer* self, char c);
|
||||
int pk_TokenDeserializer__read_count(pk_TokenDeserializer* self);
|
||||
int64_t pk_TokenDeserializer__read_uint(pk_TokenDeserializer* self, char c);
|
||||
double pk_TokenDeserializer__read_float(pk_TokenDeserializer* self, char c);
|
||||
typedef c11_array pk_TokenArray;
|
||||
|
||||
Error* pk_Lexer__process(pk_SourceData_ src, pk_TokenArray* out_tokens);
|
||||
Error* pk_Lexer__process_and_dump(pk_SourceData_ src, py_Str* out_string);
|
||||
void pk_TokenArray__dtor(pk_TokenArray* self);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -1,74 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "pocketpy/objects/error.hpp"
|
||||
#include "pocketpy/objects/sourcedata.h"
|
||||
#include "pocketpy/compiler/lexer.h"
|
||||
|
||||
#include <variant>
|
||||
|
||||
namespace pkpy {
|
||||
|
||||
|
||||
struct Lexer {
|
||||
PK_ALWAYS_PASS_BY_POINTER(Lexer)
|
||||
|
||||
VM* vm;
|
||||
pkpy_SourceData_ src;
|
||||
const char* token_start;
|
||||
const char* curr_char;
|
||||
int current_line = 1;
|
||||
vector<Token> nexts;
|
||||
small_vector_2<int, 8> indents;
|
||||
int brackets_level = 0;
|
||||
bool used = false;
|
||||
|
||||
char peekchar() const noexcept { return *curr_char; }
|
||||
|
||||
bool match_n_chars(int n, char c0) noexcept;
|
||||
bool match_string(const char* s) noexcept;
|
||||
int eat_spaces() noexcept;
|
||||
|
||||
bool eat_indentation() noexcept;
|
||||
char eatchar() noexcept;
|
||||
char eatchar_include_newline() noexcept;
|
||||
void skip_line_comment() noexcept;
|
||||
bool matchchar(char c) noexcept;
|
||||
void add_token(TokenIndex type, TokenValue value = {}) noexcept;
|
||||
void add_token_2(char c, TokenIndex one, TokenIndex two) noexcept;
|
||||
|
||||
[[nodiscard]] Error* eat_name() noexcept;
|
||||
[[nodiscard]] Error* eat_string_until(char quote, bool raw, Str* out) noexcept;
|
||||
[[nodiscard]] Error* eat_string(char quote, StringType type) noexcept;
|
||||
[[nodiscard]] Error* eat_number() noexcept;
|
||||
[[nodiscard]] Error* lex_one_token(bool* eof) noexcept;
|
||||
|
||||
/***** Error Reporter *****/
|
||||
[[nodiscard]] Error* _error(bool lexer_err, const char* type, const char* msg, va_list* args, i64 userdata=0) noexcept;
|
||||
[[nodiscard]] Error* SyntaxError(const char* fmt, ...) noexcept;
|
||||
[[nodiscard]] Error* IndentationError(const char* msg) noexcept { return _error(true, "IndentationError", msg, NULL); }
|
||||
[[nodiscard]] Error* NeedMoreLines() noexcept { return _error(true, "NeedMoreLines", "", NULL, 0); }
|
||||
|
||||
[[nodiscard]] Error* run() noexcept;
|
||||
[[nodiscard]] Error* from_precompiled() noexcept;
|
||||
[[nodiscard]] Error* precompile(Str* out) noexcept;
|
||||
|
||||
Lexer(VM* vm, std::string_view source, const Str& filename, CompileMode mode) noexcept{
|
||||
src = pkpy_SourceData__rcnew({source.data(), (int)source.size()}, &filename, mode);
|
||||
this->token_start = py_Str__data(&src->source);
|
||||
this->curr_char = py_Str__data(&src->source);
|
||||
}
|
||||
|
||||
~Lexer(){
|
||||
PK_DECREF(src);
|
||||
}
|
||||
};
|
||||
|
||||
enum class IntParsingResult {
|
||||
Success,
|
||||
Failure,
|
||||
Overflow,
|
||||
};
|
||||
|
||||
IntParsingResult parse_uint(std::string_view text, i64* out, int base) noexcept;
|
||||
|
||||
} // namespace pkpy
|
@ -28,7 +28,7 @@ typedef enum FuncType {
|
||||
typedef enum NameScope {
|
||||
NAME_LOCAL,
|
||||
NAME_GLOBAL,
|
||||
NAME_GLOBAL_UNKNOWN
|
||||
NAME_GLOBAL_UNKNOWN,
|
||||
} NameScope;
|
||||
|
||||
typedef enum CodeBlockType {
|
||||
@ -88,8 +88,8 @@ typedef struct CodeObject {
|
||||
int end_line;
|
||||
} CodeObject;
|
||||
|
||||
CodeObject* CodeObject__new(pk_SourceData_ src, c11_string name);
|
||||
void CodeObject__delete(CodeObject* self);
|
||||
void CodeObject__ctor(CodeObject* self, pk_SourceData_ src, c11_string name);
|
||||
void CodeObject__dtor(CodeObject* self);
|
||||
void CodeObject__gc_mark(const CodeObject* self);
|
||||
|
||||
typedef struct FuncDeclKwArg{
|
||||
@ -100,7 +100,7 @@ typedef struct FuncDeclKwArg{
|
||||
|
||||
typedef struct FuncDecl {
|
||||
RefCounted rc;
|
||||
CodeObject* code; // strong ref
|
||||
CodeObject code; // strong ref
|
||||
|
||||
c11_vector/*T=int*/ args; // indices in co->varnames
|
||||
c11_vector/*T=KwArg*/ kwargs; // indices in co->varnames
|
||||
|
@ -15,6 +15,7 @@ struct pk_SourceData {
|
||||
RefCounted rc;
|
||||
enum CompileMode mode;
|
||||
bool is_precompiled;
|
||||
bool is_dynamic; // for exec() and eval()
|
||||
|
||||
py_Str filename;
|
||||
py_Str source;
|
||||
@ -25,7 +26,7 @@ struct pk_SourceData {
|
||||
|
||||
typedef struct pk_SourceData* pk_SourceData_;
|
||||
|
||||
pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode);
|
||||
pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode, bool is_dynamic);
|
||||
bool pk_SourceData__get_line(const struct pk_SourceData* self, int lineno, const char** st, const char** ed);
|
||||
py_Str pk_SourceData__snapshot(const struct pk_SourceData *self, int lineno, const char *cursor, const char *name);
|
||||
|
||||
|
@ -4,10 +4,11 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
void pk_SourceData__ctor(struct pk_SourceData* self,
|
||||
static void pk_SourceData__ctor(struct pk_SourceData* self,
|
||||
const char* source,
|
||||
const char* filename,
|
||||
enum CompileMode mode) {
|
||||
enum CompileMode mode,
|
||||
bool is_dynamic) {
|
||||
py_Str__ctor(&self->filename, filename);
|
||||
self->mode = mode;
|
||||
c11_vector__ctor(&self->line_starts, sizeof(const char*));
|
||||
@ -30,7 +31,7 @@ void pk_SourceData__ctor(struct pk_SourceData* self,
|
||||
c11_vector__push(const char*, &self->line_starts, source);
|
||||
}
|
||||
|
||||
void pk_SourceData__dtor(struct pk_SourceData* self) {
|
||||
static void pk_SourceData__dtor(struct pk_SourceData* self) {
|
||||
py_Str__dtor(&self->filename);
|
||||
py_Str__dtor(&self->source);
|
||||
c11_vector__dtor(&self->line_starts);
|
||||
@ -41,9 +42,9 @@ void pk_SourceData__dtor(struct pk_SourceData* self) {
|
||||
c11_vector__dtor(&self->_precompiled_tokens);
|
||||
}
|
||||
|
||||
pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode) {
|
||||
pk_SourceData_ pk_SourceData__rcnew(const char* source, const char* filename, enum CompileMode mode, bool is_dynamic) {
|
||||
pk_SourceData_ self = malloc(sizeof(struct pk_SourceData));
|
||||
pk_SourceData__ctor(self, source, filename, mode);
|
||||
pk_SourceData__ctor(self, source, filename, mode, is_dynamic);
|
||||
self->rc.count = 1;
|
||||
self->rc.dtor = (void(*)(void*))pk_SourceData__dtor;
|
||||
return self;
|
||||
|
@ -152,12 +152,13 @@ void pk_SStream__write_any(pk_SStream* self, const char* fmt, const pk_AnyStr* a
|
||||
|
||||
py_Str pk_SStream__submit(pk_SStream* self) {
|
||||
c11_vector__push(char, &self->data, '\0');
|
||||
c11_array a = c11_vector__submit(&self->data);
|
||||
// TODO: optimize c11__isascii
|
||||
py_Str retval = {
|
||||
.size = self->data.count - 1,
|
||||
.is_ascii = c11__isascii((char*)self->data.data, self->data.count),
|
||||
.size = a.count - 1,
|
||||
.is_ascii = c11__isascii((char*)a.data, a.count),
|
||||
.is_sso = false,
|
||||
._ptr = (char*)self->data.data
|
||||
._ptr = (char*)a.data
|
||||
};
|
||||
return retval;
|
||||
}
|
||||
|
@ -62,3 +62,15 @@ void* c11_vector__emplace(c11_vector* self){
|
||||
self->count++;
|
||||
return p;
|
||||
}
|
||||
|
||||
c11_array c11_vector__submit(c11_vector* self){
|
||||
c11_array retval = {
|
||||
.data = self->data,
|
||||
.count = self->count,
|
||||
.elem_size = self->elem_size
|
||||
};
|
||||
self->data = NULL;
|
||||
self->count = 0;
|
||||
self->capacity = 0;
|
||||
return retval;
|
||||
}
|
||||
|
307
src/compiler/compiler.c
Normal file
307
src/compiler/compiler.c
Normal file
@ -0,0 +1,307 @@
|
||||
#include "pocketpy/compiler/compiler.h"
|
||||
#include "pocketpy/compiler/expr.h"
|
||||
#include "pocketpy/compiler/lexer.h"
|
||||
|
||||
typedef struct pk_Compiler pk_Compiler;
|
||||
typedef Error* (*PrattCallback)(pk_Compiler* self);
|
||||
|
||||
typedef struct PrattRule {
|
||||
PrattCallback prefix;
|
||||
PrattCallback infix;
|
||||
enum Precedence precedence;
|
||||
} PrattRule;
|
||||
|
||||
static PrattRule rules[TK__COUNT__];
|
||||
|
||||
typedef struct pk_Compiler {
|
||||
pk_SourceData_ src; // weakref
|
||||
pk_TokenArray tokens;
|
||||
int i;
|
||||
c11_vector/*T=CodeEmitContext*/ contexts;
|
||||
} pk_Compiler;
|
||||
|
||||
static void pk_Compiler__ctor(pk_Compiler *self, pk_SourceData_ src, pk_TokenArray tokens){
|
||||
self->src = src;
|
||||
self->tokens = tokens;
|
||||
self->i = 0;
|
||||
c11_vector__ctor(&self->contexts, sizeof(pk_CodeEmitContext));
|
||||
}
|
||||
|
||||
static void pk_Compiler__dtor(pk_Compiler *self){
|
||||
pk_TokenArray__dtor(&self->tokens);
|
||||
c11_vector__dtor(&self->contexts);
|
||||
}
|
||||
|
||||
/**************************************/
|
||||
#define tk(i) c11__getitem(Token, &self->tokens, i)
|
||||
#define prev() tk(self->i - 1)
|
||||
#define curr() tk(self->i)
|
||||
#define next() tk(self->i + 1)
|
||||
#define err() (self->i == self->tokens.count ? prev() : curr())
|
||||
|
||||
#define advance() self->i++
|
||||
#define mode() self->src->mode
|
||||
#define ctx() c11_vector__back(pk_CodeEmitContext, &self->contexts)
|
||||
|
||||
#define match_newlines() match_newlines_repl(self, NULL)
|
||||
|
||||
#define consume(expected) if(!match(expected)) return SyntaxError("expected '%s', got '%s'", pk_TokenSymbols[expected], pk_TokenSymbols[curr().type]);
|
||||
#define consume_end_stmt() if(!match_end_stmt()) return SyntaxError("expected statement end")
|
||||
#define check_newlines_repl() { bool __nml; match_newlines_repl(self, &__nml); if(__nml) return NeedMoreLines(); }
|
||||
#define check(B) if((err = B)) return err
|
||||
|
||||
static NameScope name_scope(pk_Compiler* self) {
|
||||
NameScope s = self->contexts.count > 1 ? NAME_LOCAL : NAME_GLOBAL;
|
||||
if(self->src->is_dynamic && s == NAME_GLOBAL) s = NAME_GLOBAL_UNKNOWN;
|
||||
return s;
|
||||
}
|
||||
|
||||
static Error* SyntaxError(const char* fmt, ...){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static Error* NeedMoreLines(){
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Matchers */
|
||||
static bool is_expression(pk_Compiler* self, bool allow_slice){
|
||||
PrattCallback prefix = rules[curr().type].prefix;
|
||||
return prefix && (allow_slice || curr().type != TK_COLON);
|
||||
}
|
||||
|
||||
#define match(expected) (curr().type == expected ? (++self->i) : 0)
|
||||
|
||||
static bool match_newlines_repl(pk_Compiler* self, bool* need_more_lines){
|
||||
bool consumed = false;
|
||||
if(curr().type == TK_EOL) {
|
||||
while(curr().type == TK_EOL) advance();
|
||||
consumed = true;
|
||||
}
|
||||
if(need_more_lines) {
|
||||
*need_more_lines = (mode() == REPL_MODE && curr().type == TK_EOF);
|
||||
}
|
||||
return consumed;
|
||||
}
|
||||
|
||||
static bool match_end_stmt(pk_Compiler* self) {
|
||||
if(match(TK_SEMICOLON)) {
|
||||
match_newlines();
|
||||
return true;
|
||||
}
|
||||
if(match_newlines() || curr().type == TK_EOF) return true;
|
||||
if(curr().type == TK_DEDENT) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Expression Callbacks */
|
||||
static Error* exprLiteral(pk_Compiler* self);
|
||||
static Error* exprLong(pk_Compiler* self);
|
||||
static Error* exprImag(pk_Compiler* self);
|
||||
static Error* exprBytes(pk_Compiler* self);
|
||||
static Error* exprFString(pk_Compiler* self);
|
||||
static Error* exprLambda(pk_Compiler* self);
|
||||
static Error* exprOr(pk_Compiler* self);
|
||||
static Error* exprAnd(pk_Compiler* self);
|
||||
static Error* exprTernary(pk_Compiler* self);
|
||||
static Error* exprBinaryOp(pk_Compiler* self);
|
||||
static Error* exprNot(pk_Compiler* self);
|
||||
static Error* exprUnaryOp(pk_Compiler* self);
|
||||
static Error* exprGroup(pk_Compiler* self);
|
||||
static Error* exprList(pk_Compiler* self);
|
||||
static Error* exprMap(pk_Compiler* self);
|
||||
static Error* exprCall(pk_Compiler* self);
|
||||
static Error* exprName(pk_Compiler* self);
|
||||
static Error* exprAttrib(pk_Compiler* self);
|
||||
static Error* exprSlice0(pk_Compiler* self);
|
||||
static Error* exprSlice1(pk_Compiler* self);
|
||||
static Error* exprSubscr(pk_Compiler* self);
|
||||
static Error* exprLiteral0(pk_Compiler* self);
|
||||
|
||||
/* Expression */
|
||||
static Error* parse_expression(pk_Compiler* self, int precedence, bool allow_slice){
|
||||
PrattCallback prefix = rules[curr().type].prefix;
|
||||
if(!prefix || (curr().type == TK_COLON && !allow_slice)) {
|
||||
return SyntaxError("expected an expression, got %s", pk_TokenSymbols[curr().type]);
|
||||
}
|
||||
advance();
|
||||
Error* err;
|
||||
check(prefix(self));
|
||||
while(rules[curr().type].precedence >= precedence && (allow_slice || curr().type != TK_COLON)) {
|
||||
TokenIndex op = curr().type;
|
||||
advance();
|
||||
PrattCallback infix = rules[op].infix;
|
||||
assert(infix != NULL);
|
||||
check(infix(self));
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static Error* EXPR(pk_Compiler* self) {
|
||||
return parse_expression(self, PREC_LOWEST + 1, false);
|
||||
}
|
||||
|
||||
static Error* EXPR_TUPLE(pk_Compiler* self, bool allow_slice){
|
||||
Error* err;
|
||||
check(parse_expression(self, PREC_LOWEST + 1, allow_slice));
|
||||
if(!match(TK_COMMA)) return NULL;
|
||||
// tuple expression
|
||||
int count = 1;
|
||||
do {
|
||||
if(curr().brackets_level) check_newlines_repl()
|
||||
if(!is_expression(self, allow_slice)) break;
|
||||
check(parse_expression(self, PREC_LOWEST + 1, allow_slice));
|
||||
count += 1;
|
||||
if(curr().brackets_level) check_newlines_repl();
|
||||
} while(match(TK_COMMA));
|
||||
// TupleExpr* e = make_expr<TupleExpr>(count);
|
||||
// for(int i=count-1; i>=0; i--)
|
||||
// e->items[i] = ctx()->s_popx();
|
||||
// ctx()->s_push(e);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// special case for `for loop` and `comp`
|
||||
static Error* EXPR_VARS(pk_Compiler* self){
|
||||
int count = 0;
|
||||
do {
|
||||
consume(TK_ID);
|
||||
ctx()->s_push(make_expr<NameExpr>(prev().str(), name_scope()));
|
||||
count += 1;
|
||||
} while(match(TK_COMMA));
|
||||
if(count > 1){
|
||||
TupleExpr* e = make_expr<TupleExpr>(count);
|
||||
for(int i=count-1; i>=0; i--)
|
||||
e->items[i] = ctx()->s_popx();
|
||||
ctx()->s_push(e);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void setup_global_context(pk_Compiler* self, CodeObject* co){
|
||||
co->start_line = self->i == 0 ? 1 : prev().line;
|
||||
pk_CodeEmitContext* ctx = c11_vector__emplace(&self->contexts);
|
||||
pk_CodeEmitContext__ctor(ctx, co, NULL, self->contexts.count);
|
||||
}
|
||||
|
||||
Error* pk_Compiler__compile(pk_Compiler* self, CodeObject* out){
|
||||
// make sure it is the first time to compile
|
||||
assert(self->i == 0);
|
||||
// make sure the first token is @sof
|
||||
assert(tk(0).type == TK_SOF);
|
||||
|
||||
setup_global_context(self, out);
|
||||
|
||||
advance(); // skip @sof, so prev() is always valid
|
||||
match_newlines(); // skip possible leading '\n'
|
||||
|
||||
Error* err;
|
||||
// if(mode() == EVAL_MODE) {
|
||||
// check(EXPR_TUPLE());
|
||||
// ctx()->s_emit_top();
|
||||
// consume(TK_EOF);
|
||||
// ctx()->emit_(OP_RETURN_VALUE, BC_NOARG, BC_KEEPLINE);
|
||||
// check(pop_context());
|
||||
// return NULL;
|
||||
// } else if(mode() == JSON_MODE) {
|
||||
// check(EXPR());
|
||||
// Expr* e = ctx()->s_popx();
|
||||
// if(!e->is_json_object()){
|
||||
// return SyntaxError("expect a JSON object, literal or array");
|
||||
// }
|
||||
// consume(TK_EOF);
|
||||
// e->emit_(ctx());
|
||||
// ctx()->emit_(OP_RETURN_VALUE, BC_NOARG, BC_KEEPLINE);
|
||||
// check(pop_context());
|
||||
// return NULL;
|
||||
// }
|
||||
|
||||
// while(!match(TK_EOF)) {
|
||||
// check(compile_stmt());
|
||||
// match_newlines();
|
||||
// }
|
||||
// check(pop_context());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
Error* pk_compile(pk_SourceData_ src, CodeObject* out){
|
||||
pk_TokenArray tokens;
|
||||
Error* err = pk_Lexer__process(src, &tokens);
|
||||
if(err) return err;
|
||||
|
||||
// Token* data = (Token*)tokens.data;
|
||||
// printf("%s\n", py_Str__data(&src->filename));
|
||||
// for(int i = 0; i < tokens.count; i++) {
|
||||
// Token* t = data + i;
|
||||
// py_Str tmp;
|
||||
// py_Str__ctor2(&tmp, t->start, t->length);
|
||||
// printf("[%d] %s: %s\n", t->line, pk_TokenSymbols[t->type], py_Str__data(&tmp));
|
||||
// py_Str__dtor(&tmp);
|
||||
// }
|
||||
|
||||
pk_Compiler compiler;
|
||||
pk_Compiler__ctor(&compiler, src, tokens);
|
||||
CodeObject__ctor(out, src, py_Str__sv(&src->filename));
|
||||
err = pk_Compiler__compile(&compiler, out);
|
||||
CodeObject__dtor(out);
|
||||
pk_Compiler__dtor(&compiler);
|
||||
return err;
|
||||
}
|
||||
|
||||
void pk_Compiler__initialize(){
|
||||
// clang-format off
|
||||
// http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/
|
||||
#define PK_NO_INFIX NULL, PREC_LOWEST
|
||||
for(int i = 0; i < TK__COUNT__; i++) rules[i] = { NULL, PK_NO_INFIX };
|
||||
rules[TK_DOT] = { NULL, exprAttrib, PREC_PRIMARY };
|
||||
rules[TK_LPAREN] = { exprGroup, exprCall, PREC_PRIMARY };
|
||||
rules[TK_LBRACKET] = { exprList, exprSubscr, PREC_PRIMARY };
|
||||
rules[TK_LBRACE] = { exprMap, PK_NO_INFIX };
|
||||
rules[TK_MOD] = { NULL, exprBinaryOp, PREC_FACTOR };
|
||||
rules[TK_ADD] = { NULL, exprBinaryOp, PREC_TERM };
|
||||
rules[TK_SUB] = { exprUnaryOp, exprBinaryOp, PREC_TERM };
|
||||
rules[TK_MUL] = { exprUnaryOp, exprBinaryOp, PREC_FACTOR };
|
||||
rules[TK_INVERT] = { exprUnaryOp, NULL, PREC_UNARY };
|
||||
rules[TK_DIV] = { NULL, exprBinaryOp, PREC_FACTOR };
|
||||
rules[TK_FLOORDIV] = { NULL, exprBinaryOp, PREC_FACTOR };
|
||||
rules[TK_POW] = { exprUnaryOp, exprBinaryOp, PREC_EXPONENT };
|
||||
rules[TK_GT] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_LT] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_EQ] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_NE] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_GE] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_LE] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_IN] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_IS] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_LSHIFT] = { NULL, exprBinaryOp, PREC_BITWISE_SHIFT };
|
||||
rules[TK_RSHIFT] = { NULL, exprBinaryOp, PREC_BITWISE_SHIFT };
|
||||
rules[TK_AND] = { NULL, exprBinaryOp, PREC_BITWISE_AND };
|
||||
rules[TK_OR] = { NULL, exprBinaryOp, PREC_BITWISE_OR };
|
||||
rules[TK_XOR] = { NULL, exprBinaryOp, PREC_BITWISE_XOR };
|
||||
rules[TK_DECORATOR] = { NULL, exprBinaryOp, PREC_FACTOR };
|
||||
rules[TK_IF] = { NULL, exprTernary, PREC_TERNARY };
|
||||
rules[TK_NOT_IN] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_IS_NOT] = { NULL, exprBinaryOp, PREC_COMPARISION };
|
||||
rules[TK_AND_KW ] = { NULL, exprAnd, PREC_LOGICAL_AND };
|
||||
rules[TK_OR_KW] = { NULL, exprOr, PREC_LOGICAL_OR };
|
||||
rules[TK_NOT_KW] = { exprNot, NULL, PREC_LOGICAL_NOT };
|
||||
rules[TK_TRUE] = { exprLiteral0, PK_NO_INFIX };
|
||||
rules[TK_FALSE] = { exprLiteral0, PK_NO_INFIX };
|
||||
rules[TK_NONE] = { exprLiteral0, PK_NO_INFIX };
|
||||
rules[TK_DOTDOTDOT] = { exprLiteral0, PK_NO_INFIX };
|
||||
rules[TK_LAMBDA] = { exprLambda, PK_NO_INFIX };
|
||||
rules[TK_ID] = { exprName, PK_NO_INFIX };
|
||||
rules[TK_NUM] = { exprLiteral, PK_NO_INFIX };
|
||||
rules[TK_STR] = { exprLiteral, PK_NO_INFIX };
|
||||
rules[TK_FSTR] = { exprFString, PK_NO_INFIX };
|
||||
rules[TK_LONG] = { exprLong, PK_NO_INFIX };
|
||||
rules[TK_IMAG] = { exprImag, PK_NO_INFIX };
|
||||
rules[TK_BYTES] = { exprBytes, PK_NO_INFIX };
|
||||
rules[TK_COLON] = { exprSlice0, exprSlice1, PREC_PRIMARY };
|
||||
|
||||
#undef PK_METHOD
|
||||
#undef PK_NO_INFIX
|
||||
// clang-format on
|
||||
}
|
@ -17,7 +17,7 @@ PrattRule Compiler::rules[TK__COUNT__];
|
||||
|
||||
NameScope Compiler::name_scope() const noexcept{
|
||||
auto s = contexts.size() > 1 ? NAME_LOCAL : NAME_GLOBAL;
|
||||
if(unknown_global_scope && s == NAME_GLOBAL) s = NAME_GLOBAL_UNKNOWN;
|
||||
if(unknown_global_scope && s == NAME_GLOBAL) s = NAME_UNKNOWN;
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -120,61 +120,6 @@ void Compiler::init_pratt_rules() noexcept{
|
||||
static bool initialized = false;
|
||||
if(initialized) return;
|
||||
initialized = true;
|
||||
|
||||
// clang-format off
|
||||
// http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/
|
||||
#define PK_METHOD(name) &Compiler::name
|
||||
#define PK_NO_INFIX nullptr, PREC_LOWEST
|
||||
for(int i = 0; i < TK__COUNT__; i++) rules[i] = { nullptr, PK_NO_INFIX };
|
||||
rules[TK_DOT] = { nullptr, PK_METHOD(exprAttrib), PREC_PRIMARY };
|
||||
rules[TK_LPAREN] = { PK_METHOD(exprGroup), PK_METHOD(exprCall), PREC_PRIMARY };
|
||||
rules[TK_LBRACKET] = { PK_METHOD(exprList), PK_METHOD(exprSubscr), PREC_PRIMARY };
|
||||
rules[TK_LBRACE] = { PK_METHOD(exprMap), PK_NO_INFIX };
|
||||
rules[TK_MOD] = { nullptr, PK_METHOD(exprBinaryOp), PREC_FACTOR };
|
||||
rules[TK_ADD] = { nullptr, PK_METHOD(exprBinaryOp), PREC_TERM };
|
||||
rules[TK_SUB] = { PK_METHOD(exprUnaryOp), PK_METHOD(exprBinaryOp), PREC_TERM };
|
||||
rules[TK_MUL] = { PK_METHOD(exprUnaryOp), PK_METHOD(exprBinaryOp), PREC_FACTOR };
|
||||
rules[TK_INVERT] = { PK_METHOD(exprUnaryOp), nullptr, PREC_UNARY };
|
||||
rules[TK_DIV] = { nullptr, PK_METHOD(exprBinaryOp), PREC_FACTOR };
|
||||
rules[TK_FLOORDIV] = { nullptr, PK_METHOD(exprBinaryOp), PREC_FACTOR };
|
||||
rules[TK_POW] = { PK_METHOD(exprUnaryOp), PK_METHOD(exprBinaryOp), PREC_EXPONENT };
|
||||
rules[TK_GT] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_LT] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_EQ] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_NE] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_GE] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_LE] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_IN] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_IS] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_LSHIFT] = { nullptr, PK_METHOD(exprBinaryOp), PREC_BITWISE_SHIFT };
|
||||
rules[TK_RSHIFT] = { nullptr, PK_METHOD(exprBinaryOp), PREC_BITWISE_SHIFT };
|
||||
rules[TK_AND] = { nullptr, PK_METHOD(exprBinaryOp), PREC_BITWISE_AND };
|
||||
rules[TK_OR] = { nullptr, PK_METHOD(exprBinaryOp), PREC_BITWISE_OR };
|
||||
rules[TK_XOR] = { nullptr, PK_METHOD(exprBinaryOp), PREC_BITWISE_XOR };
|
||||
rules[TK_DECORATOR] = { nullptr, PK_METHOD(exprBinaryOp), PREC_FACTOR };
|
||||
rules[TK_IF] = { nullptr, PK_METHOD(exprTernary), PREC_TERNARY };
|
||||
rules[TK_NOT_IN] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_IS_NOT] = { nullptr, PK_METHOD(exprBinaryOp), PREC_COMPARISION };
|
||||
rules[TK_AND_KW ] = { nullptr, PK_METHOD(exprAnd), PREC_LOGICAL_AND };
|
||||
rules[TK_OR_KW] = { nullptr, PK_METHOD(exprOr), PREC_LOGICAL_OR };
|
||||
rules[TK_NOT_KW] = { PK_METHOD(exprNot), nullptr, PREC_LOGICAL_NOT };
|
||||
rules[TK_TRUE] = { PK_METHOD(exprLiteral0), PK_NO_INFIX };
|
||||
rules[TK_FALSE] = { PK_METHOD(exprLiteral0), PK_NO_INFIX };
|
||||
rules[TK_NONE] = { PK_METHOD(exprLiteral0), PK_NO_INFIX };
|
||||
rules[TK_DOTDOTDOT] = { PK_METHOD(exprLiteral0), PK_NO_INFIX };
|
||||
rules[TK_LAMBDA] = { PK_METHOD(exprLambda), PK_NO_INFIX };
|
||||
rules[TK_ID] = { PK_METHOD(exprName), PK_NO_INFIX };
|
||||
rules[TK_NUM] = { PK_METHOD(exprLiteral), PK_NO_INFIX };
|
||||
rules[TK_STR] = { PK_METHOD(exprLiteral), PK_NO_INFIX };
|
||||
rules[TK_FSTR] = { PK_METHOD(exprFString), PK_NO_INFIX };
|
||||
rules[TK_LONG] = { PK_METHOD(exprLong), PK_NO_INFIX };
|
||||
rules[TK_IMAG] = { PK_METHOD(exprImag), PK_NO_INFIX };
|
||||
rules[TK_BYTES] = { PK_METHOD(exprBytes), PK_NO_INFIX };
|
||||
rules[TK_COLON] = { PK_METHOD(exprSlice0), PK_METHOD(exprSlice1), PREC_PRIMARY };
|
||||
|
||||
#undef PK_METHOD
|
||||
#undef PK_NO_INFIX
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
bool Compiler::match(TokenIndex expected) noexcept{
|
||||
|
@ -1,59 +1,79 @@
|
||||
// #include "pocketpy/compiler/expr.h"
|
||||
// #include "pocketpy/common/memorypool.h"
|
||||
#include "pocketpy/compiler/expr.h"
|
||||
#include "pocketpy/common/memorypool.h"
|
||||
#include "pocketpy/common/strname.h"
|
||||
|
||||
// static bool default_false(const pk_Expr*) { return false; }
|
||||
// static int default_zero(const pk_Expr*) { return 0; }
|
||||
// static void default_dtor(pk_Expr*) {}
|
||||
/* Fallback slots for pk_ExprVt: safe no-op defaults installed by
 * pk_ExprVt__ctor so every predicate/hook is always callable. */
static bool default_false(const pk_Expr* expr) { (void)expr; return false; }

static int default_zero(const pk_Expr* expr) { (void)expr; return 0; }

static void default_dtor(pk_Expr* expr) { (void)expr; }
|
||||
|
||||
// void pk_ExprVt__ctor(pk_ExprVt* vt){
|
||||
// vt->dtor = default_dtor;
|
||||
// vt->is_literal = default_false;
|
||||
// vt->is_json_object = default_false;
|
||||
// vt->is_attrib = default_false;
|
||||
// vt->is_subscr = default_false;
|
||||
// vt->is_compare = default_false;
|
||||
// vt->star_level = default_zero;
|
||||
// vt->is_tuple = default_false;
|
||||
// vt->is_name = default_false;
|
||||
// vt->emit_ = NULL; // must be set
|
||||
// vt->emit_del = NULL;
|
||||
// vt->emit_store = NULL;
|
||||
// vt->emit_inplace = NULL;
|
||||
// vt->emit_store_inplace = NULL;
|
||||
// }
|
||||
/* Initialize an expression virtual table with safe defaults.
 * All predicates answer false, star_level is 0 and the dtor is a no-op.
 * emit_ is deliberately left NULL and MUST be set by the concrete type;
 * the optional emit hooks stay NULL, which the pk_Expr__emit_* wrappers
 * treat as "unsupported / fall back". */
void pk_ExprVt__ctor(pk_ExprVt* self){
    /* lifetime */
    self->dtor = default_dtor;
    /* classification predicates */
    self->is_literal = default_false;
    self->is_json_object = default_false;
    self->is_attrib = default_false;
    self->is_subscr = default_false;
    self->is_compare = default_false;
    self->is_tuple = default_false;
    self->is_name = default_false;
    self->star_level = default_zero;
    /* code emission hooks */
    self->emit_ = NULL; // must be set by the concrete expression type
    self->emit_del = NULL;
    self->emit_store = NULL;
    self->emit_inplace = NULL;
    self->emit_store_inplace = NULL;
}
|
||||
|
||||
// void pk_Expr__emit_(pk_Expr* self, pk_CodeEmitContext* ctx){
|
||||
// assert(self->vt->emit_);
|
||||
// self->vt->emit_(self, ctx);
|
||||
// }
|
||||
/* Emit bytecode for this expression. emit_ is mandatory on every vt. */
void pk_Expr__emit_(pk_Expr* self, pk_CodeEmitContext* ctx){
    assert(self->vt->emit_ != NULL);
    self->vt->emit_(self, ctx);
}
|
||||
|
||||
// bool pk_Expr__emit_del(pk_Expr* self, pk_CodeEmitContext* ctx){
|
||||
// if(!self->vt->emit_del) return false;
|
||||
// return self->vt->emit_del(self, ctx);
|
||||
// }
|
||||
/* Emit "del"-style bytecode. Returns false when the expression kind does
 * not support deletion (emit_del hook is NULL). */
bool pk_Expr__emit_del(pk_Expr* self, pk_CodeEmitContext* ctx){
    return self->vt->emit_del ? self->vt->emit_del(self, ctx) : false;
}
|
||||
|
||||
// bool pk_Expr__emit_store(pk_Expr* self, pk_CodeEmitContext* ctx){
|
||||
// if(!self->vt->emit_store) return false;
|
||||
// return self->vt->emit_store(self, ctx);
|
||||
// }
|
||||
/* Emit store bytecode. Returns false when the expression is not a valid
 * assignment target (emit_store hook is NULL). */
bool pk_Expr__emit_store(pk_Expr* self, pk_CodeEmitContext* ctx){
    return self->vt->emit_store ? self->vt->emit_store(self, ctx) : false;
}
|
||||
|
||||
// void pk_Expr__emit_inplace(pk_Expr* self, pk_CodeEmitContext* ctx){
|
||||
// if(!self->vt->emit_inplace){
|
||||
// pk_Expr__emit_(self, ctx);
|
||||
// return;
|
||||
// }
|
||||
// self->vt->emit_inplace(self, ctx);
|
||||
// }
|
||||
/* Emit in-place (augmented-assignment LHS) bytecode, falling back to the
 * plain emit_ when no in-place specialization exists. */
void pk_Expr__emit_inplace(pk_Expr* self, pk_CodeEmitContext* ctx){
    if(self->vt->emit_inplace){
        self->vt->emit_inplace(self, ctx);
    }else{
        pk_Expr__emit_(self, ctx); // no specialization: plain emit
    }
}
|
||||
|
||||
// bool pk_Expr__emit_store_inplace(pk_Expr* self, pk_CodeEmitContext* ctx){
|
||||
// if(!self->vt->emit_store_inplace){
|
||||
// return pk_Expr__emit_store(self, ctx);
|
||||
// }
|
||||
// return self->vt->emit_store_inplace(self, ctx);
|
||||
// }
|
||||
/* Emit in-place store bytecode, falling back to the regular store when no
 * specialization exists. Returns false if the target is not storable. */
bool pk_Expr__emit_store_inplace(pk_Expr* self, pk_CodeEmitContext* ctx){
    return self->vt->emit_store_inplace
               ? self->vt->emit_store_inplace(self, ctx)
               : pk_Expr__emit_store(self, ctx);
}
|
||||
|
||||
// void pk_Expr__delete(pk_Expr* self){
|
||||
// if(!self) return;
|
||||
// self->vt->dtor(self);
|
||||
// PoolExpr_dealloc(self);
|
||||
// }
|
||||
/* Destroy an expression node and return its memory to the expr pool.
 * NULL-safe, like free(). */
void pk_Expr__delete(pk_Expr* self){
    if(self == NULL) return;
    self->vt->dtor(self);      // type-specific cleanup first
    PoolExpr_dealloc(self);    // then release the pooled block
}
|
||||
|
||||
/* CodeEmitContext */
|
||||
|
||||
/* Prepare an emit context for compiling one code object.
 * co/func are borrowed (not owned); level is the lexical nesting depth. */
void pk_CodeEmitContext__ctor(pk_CodeEmitContext* self, CodeObject* co, FuncDecl* func, int level){
    self->level = level;
    self->func = func;
    self->co = co;
    self->is_compiling_class = false;
    self->curr_iblock = 0;
    /* scratch containers owned by this context */
    c11_vector__ctor(&self->s_expr, sizeof(pk_Expr*));         // expression stack
    c11_vector__ctor(&self->global_names, sizeof(StrName));    // names declared `global`
    c11_smallmap_s2n__ctor(&self->co_consts_string_dedup_map); // string-constant dedup
}
|
||||
|
||||
/* Release the containers owned by the emit context (reverse ctor order). */
void pk_CodeEmitContext__dtor(pk_CodeEmitContext* self){
    c11_smallmap_s2n__dtor(&self->co_consts_string_dedup_map);
    c11_vector__dtor(&self->global_names);
    c11_vector__dtor(&self->s_expr);
}
|
@ -1,8 +1,855 @@
|
||||
#include "pocketpy/common/config.h"
|
||||
#include "pocketpy/common/str.h"
|
||||
#include "pocketpy/common/smallmap.h"
|
||||
#include "pocketpy/common/config.h"
|
||||
#include "pocketpy/common/sstream.h"
|
||||
#include "pocketpy/common/vector.h"
|
||||
#include "pocketpy/compiler/lexer.h"
|
||||
#include "pocketpy/objects/sourcedata.h"
|
||||
#include <ctype.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define is_raw_string_used(t) ((t) == TK_ID || (t) == TK_LONG)
|
||||
|
||||
/* Incremental tokenizer state for one compilation unit. */
typedef struct pk_Lexer{
    pk_SourceData_ src;        // ref-counted source; retained in ctor, released in dtor
    const char* token_start;   // first char of the token currently being scanned
    const char* curr_char;     // scan cursor into src->source
    int current_line;          // 1-based line number at curr_char
    int brackets_level;        // nesting depth of ()/[]/{}; indentation is ignored inside

    c11_vector/*T=Token*/ nexts;    // tokens produced so far
    c11_vector/*T=int*/ indents;    // indentation-width stack (Python INDENT/DEDENT)
} pk_Lexer;
|
||||
|
||||
/* Read cursor over a precompiled token dump
 * (the format written by pk_Lexer__process_and_dump). */
typedef struct pk_TokenDeserializer {
    const char* curr;    // current read position
    const char* source;  // start of the dump buffer
} pk_TokenDeserializer;
|
||||
|
||||
/* Primitive readers for the "pkpy:<version>" precompiled token format. */
void pk_TokenDeserializer__ctor(pk_TokenDeserializer* self, const char* source);
bool pk_TokenDeserializer__match_char(pk_TokenDeserializer* self, char c);              // consume c if it is next
c11_string pk_TokenDeserializer__read_string(pk_TokenDeserializer* self, char c);       // view up to delimiter c
py_Str pk_TokenDeserializer__read_string_from_hex(pk_TokenDeserializer* self, char c);  // decode hex-encoded payload
int pk_TokenDeserializer__read_count(pk_TokenDeserializer* self);                       // "=N" count line
int64_t pk_TokenDeserializer__read_uint(pk_TokenDeserializer* self, char c);            // unsigned decimal up to c
double pk_TokenDeserializer__read_float(pk_TokenDeserializer* self, char c);            // float up to c

// Zero-initialized payload shared by tokens that carry no value.
const static TokenValue EmptyTokenValue;
|
||||
|
||||
/* Bind a lexer to a source buffer. Takes a reference on src that is
 * released by pk_Lexer__dtor. */
static void pk_Lexer__ctor(pk_Lexer* self, pk_SourceData_ src){
    PK_INCREF(src); // the lexer keeps src alive for its whole lifetime
    self->src = src;
    const char* begin = py_Str__data(&src->source);
    self->token_start = begin;
    self->curr_char = begin;
    self->current_line = 1;
    self->brackets_level = 0;
    c11_vector__ctor(&self->nexts, sizeof(Token));
    c11_vector__ctor(&self->indents, sizeof(int));
}
|
||||
|
||||
/* Release lexer-owned containers and drop the source reference. */
static void pk_Lexer__dtor(pk_Lexer* self){
    c11_vector__dtor(&self->indents);
    c11_vector__dtor(&self->nexts);
    PK_DECREF(self->src);
}
|
||||
|
||||
/* Consume and return the current char. Must not be used on '\n' because it
 * skips the line-number bookkeeping (see eatchar_include_newline). */
static char eatchar(pk_Lexer* self){
    char ch = *self->curr_char;
    assert(ch != '\n'); // newline requires eatchar_include_newline
    self->curr_char++;
    return ch;
}
|
||||
|
||||
/* Consume and return the current char; when it is '\n', also advance the
 * line counter and record the new line start for error reporting. */
static char eatchar_include_newline(pk_Lexer* self){
    char ch = *self->curr_char;
    self->curr_char++;
    if(ch != '\n') return ch;
    self->current_line++;
    c11_vector__push(const char*, &self->src->line_starts, self->curr_char);
    return ch;
}
|
||||
|
||||
/* Consume a run of spaces/tabs and return its indentation width.
 * A tab counts as 4 columns. */
static int eat_spaces(pk_Lexer* self){
    int width = 0;
    for(;;){
        char ch = *self->curr_char;
        if(ch == ' '){
            width += 1;
        }else if(ch == '\t'){
            width += 4; // tab == 4 columns
        }else{
            return width;
        }
        eatchar(self);
    }
}
|
||||
|
||||
/* Consume the current char iff it equals c; report whether it matched. */
static bool matchchar(pk_Lexer* self, char c){
    bool hit = (*self->curr_char == c);
    if(hit) eatchar_include_newline(self);
    return hit;
}
|
||||
|
||||
/* Consume exactly n consecutive occurrences of c0, or nothing at all.
 * Returns true only when all n were present (and consumed). */
static bool match_n_chars(pk_Lexer* self, int n, char c0){
    const char* p = self->curr_char;
    // first pass: look ahead without consuming
    for(int k = 0; k < n; k++, p++){
        if(*p == '\0' || *p != c0) return false;
    }
    // second pass: actually consume (keeps line bookkeeping correct)
    for(int k = 0; k < n; k++){
        eatchar_include_newline(self);
    }
    return true;
}
|
||||
|
||||
/* Advance the cursor to the next '\n' or end of input, without consuming
 * the newline itself. */
static void skip_line_comment(pk_Lexer* self){
    for(;;){
        char ch = *self->curr_char;
        if(ch == '\0' || ch == '\n') return;
        eatchar(self);
    }
}
|
||||
|
||||
/* Append a token (with payload) spanning [token_start, curr_char).
 * Maintains brackets_level, and fuses the two-word operators
 * "not in", "is not" and "yield from" into single tokens by rewriting the
 * previous token instead of appending a new one. */
static void add_token_with_value(pk_Lexer* self, TokenIndex type, TokenValue value){
    switch(type) {
        case TK_LBRACE:
        case TK_LBRACKET:
        case TK_LPAREN: self->brackets_level++; break;
        case TK_RPAREN:
        case TK_RBRACKET:
        case TK_RBRACE: self->brackets_level--; break;
        default: break;
    }
    Token token = {type,
                   self->token_start,
                   (int)(self->curr_char - self->token_start),
                   // an EOL token belongs to the line it terminates
                   self->current_line - ((type == TK_EOL) ? 1 : 0),
                   self->brackets_level,
                   value};
    // fuse multi-word operators by patching the previous token
    if(self->nexts.count > 0) {
        Token* back = &c11_vector__back(Token, &self->nexts);
        if(back->type == TK_NOT_KW && type == TK_IN) {
            back->type = TK_NOT_IN;
            return;
        }
        if(back->type == TK_IS && type == TK_NOT_KW) {
            back->type = TK_IS_NOT;
            return;
        }
        if(back->type == TK_YIELD && type == TK_FROM) {
            back->type = TK_YIELD_FROM;
            return;
        }
    }
    // BUG FIX: the push was inside the `count > 0` branch, so a token added
    // while `nexts` was empty was silently dropped. Push unconditionally.
    c11_vector__push(Token, &self->nexts, token);
}
|
||||
|
||||
/* Append a token of the given type that carries no payload. */
static void add_token(pk_Lexer* self, TokenIndex type){
    TokenValue none = EmptyTokenValue;
    add_token_with_value(self, type, none);
}
|
||||
|
||||
/* Append `two` when the next char is c (consuming it), otherwise `one`.
 * Used for operator pairs like '+' / '+='. */
static void add_token_2(pk_Lexer* self, char c, TokenIndex one, TokenIndex two){
    TokenIndex picked = matchchar(self, c) ? two : one;
    add_token(self, picked);
}
|
||||
|
||||
/* Handle indentation at the start of a logical line, emitting INDENT/DEDENT
 * tokens per https://docs.python.org/3/reference/lexical_analysis.html#indentation.
 * Returns false on an inconsistent dedent (no matching outer level).
 * Inside brackets (and on blank/comment-only lines) indentation is ignored. */
static bool eat_indentation(pk_Lexer* self){
    if(self->brackets_level > 0) return true; // implicit line joining
    int spaces = eat_spaces(self);
    if(*self->curr_char == '#') skip_line_comment(self);
    // blank or comment-only line: generates no INDENT/DEDENT
    if(*self->curr_char == '\0' || *self->curr_char == '\n'){
        return true;
    }
    // https://docs.python.org/3/reference/lexical_analysis.html#indentation
    int indents_back = c11_vector__back(int, &self->indents);
    if(spaces > indents_back) {
        // deeper: push the new level and emit one INDENT
        c11_vector__push(int, &self->indents, spaces);
        Token t = {TK_INDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
        c11_vector__push(Token, &self->nexts, t);
    } else if(spaces < indents_back) {
        // shallower: pop levels (one DEDENT each) until we are back at or
        // below the new width
        do {
            c11_vector__pop(int, &self->indents);
            Token t = {TK_DEDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
            c11_vector__push(Token, &self->nexts, t);
            indents_back = c11_vector__back(int, &self->indents);
        } while(spaces < indents_back);
        // the new width must exactly match an enclosing level
        if(spaces != indents_back) { return false; }
    }
    return true;
}
|
||||
|
||||
/* Rough filter for chars that may appear anywhere inside a numeric literal:
 * decimal digits, hex digits (either case, which also covers the exponent
 * marker 'e'), '.', the long suffix 'L', base prefixes 'x'/'o', and the
 * imaginary suffix 'j'. Exact validation happens later in eat_number. */
static bool is_possible_number_char(char c){
    if(c >= '0' && c <= '9') return true;
    if(c >= 'a' && c <= 'f') return true;
    if(c >= 'A' && c <= 'F') return true;
    return c == '.' || c == 'L' || c == 'x' || c == 'o' || c == 'j';
}
|
||||
|
||||
/******************************/
|
||||
/* Build a "SyntaxError" Error from a printf-style message.
 * NOTE(review): currently a stub — the real implementation is commented out
 * below, so every caller receives NULL ("no error") and lexical errors are
 * silently swallowed until `_error` is wired up. */
static Error* SyntaxError(const char* fmt, ...){
    // va_list args;
    // va_start(args, fmt);
    // Error* err = _error(true, "SyntaxError", fmt, &args);
    // va_end(args);
    // return err;
    return NULL;
}
|
||||
|
||||
/* Signal that REPL input is incomplete and more lines are needed.
 * NOTE(review): stub — returns NULL like "no error"; callers cannot yet
 * distinguish "need more input" from success. */
static Error* NeedMoreLines(){
    return NULL;
}
|
||||
|
||||
/* Scan an identifier (or keyword) starting at token_start. ASCII
 * alphanumerics/'_' are accepted directly; multibyte UTF-8 sequences are
 * decoded and accepted when the code point is a unicode Lo character.
 * In JSON mode only true/false/null are legal names. Otherwise the name is
 * binary-searched against the keyword table; keywords get their own token
 * type, everything else becomes TK_ID. */
static Error* eat_name(pk_Lexer* self){
    self->curr_char--; // the first char was already consumed by the caller
    while(true) {
        unsigned char c = *self->curr_char;
        int u8bytes = c11__u8_header(c, true);
        if(u8bytes == 0) return SyntaxError("invalid char: %c", c);
        if(u8bytes == 1) {
            if(isalnum(c) || c == '_') {
                self->curr_char++;
                continue;
            } else {
                break;
            }
        }
        // handle multibyte char
        py_Str u8str;
        py_Str__ctor2(&u8str, self->curr_char, u8bytes);
        if(u8str.size != u8bytes){
            // BUG FIX: build the error message BEFORE destroying u8str
            // (previously py_Str__data was read after py_Str__dtor).
            Error* err = SyntaxError("invalid utf8 sequence: %s", py_Str__data(&u8str));
            py_Str__dtor(&u8str);
            return err;
        }
        // decode the UTF-8 sequence into a code point
        uint32_t value = 0;
        for(int k = 0; k < u8bytes; k++) {
            uint8_t b = py_Str__data(&u8str)[k];
            if(k == 0) {
                if(u8bytes == 2)
                    value = (b & 0b00011111) << 6;
                else if(u8bytes == 3)
                    value = (b & 0b00001111) << 12;
                else if(u8bytes == 4)
                    value = (b & 0b00000111) << 18;
            } else {
                value |= (b & 0b00111111) << (6 * (u8bytes - k - 1));
            }
        }
        // BUG FIX: u8str was leaked on this path; release it now that the
        // code point has been extracted.
        py_Str__dtor(&u8str);
        if(c11__is_unicode_Lo_char(value))
            self->curr_char += u8bytes;
        else
            break;
    }

    int length = (int)(self->curr_char - self->token_start);
    if(length == 0) return SyntaxError("@id contains invalid char");
    c11_string name = {self->token_start, length};

    if(self->src->mode == JSON_MODE) {
        if(c11_string__cmp3(name, "true") == 0) {
            add_token(self, TK_TRUE);
        } else if(c11_string__cmp3(name, "false") == 0) {
            add_token(self, TK_FALSE);
        } else if(c11_string__cmp3(name, "null") == 0) {
            add_token(self, TK_NONE);
        } else {
            return SyntaxError("invalid JSON token");
        }
        return NULL;
    }

    // keyword lookup: the symbol table is sorted from TK_FALSE onward
    const char** KW_BEGIN = pk_TokenSymbols + TK_FALSE;
    int KW_COUNT = TK__COUNT__ - TK_FALSE;
#define less(a, b) (c11_string__cmp3(b, a) > 0)
    int out;
    c11__lower_bound(const char*, KW_BEGIN, KW_COUNT, name, less, &out);
#undef less

    if(out != KW_COUNT && c11_string__cmp3(name, KW_BEGIN[out]) == 0) {
        add_token(self, (TokenIndex)(out + TK_FALSE));
    } else {
        add_token(self, TK_ID);
    }
    return NULL;
}
|
||||
|
||||
/* Consume a (possibly triple-quoted) string literal body, assuming the
 * opening quote was already eaten. On success writes the decoded payload to
 * *out. When `raw` is true, backslash escapes are kept verbatim.
 * NOTE(review): the error-return paths leave `buff` un-released — if
 * pk_SStream owns heap memory this leaks; confirm whether a pk_SStream dtor
 * exists and should be called before returning. */
static Error* eat_string_until(pk_Lexer* self, char quote, bool raw, py_Str* out) {
    // previous char is quote
    bool quote3 = match_n_chars(self, 2, quote); // triple-quoted?
    pk_SStream buff;
    pk_SStream__ctor(&buff);
    while(true) {
        char c = eatchar_include_newline(self);
        if(c == quote) {
            // for triple quotes, a lone quote char is part of the payload
            if(quote3 && !match_n_chars(self, 2, quote)) {
                pk_SStream__write_char(&buff, c);
                continue;
            }
            break; // closing quote(s) consumed
        }
        if(c == '\0') {
            // unterminated: in the REPL a triple-quoted string may continue
            // on the next input line
            if(quote3 && self->src->mode == REPL_MODE){
                return NeedMoreLines();
            }
            return SyntaxError("EOL while scanning string literal");
        }
        if(c == '\n') {
            // raw newline is only legal inside triple quotes
            if(!quote3)
                return SyntaxError("EOL while scanning string literal");
            else {
                pk_SStream__write_char(&buff, c);
                continue;
            }
        }
        if(!raw && c == '\\') {
            // escape sequence
            switch(eatchar_include_newline(self)) {
                case '"': pk_SStream__write_char(&buff, '"'); break;
                case '\'': pk_SStream__write_char(&buff, '\''); break;
                case '\\': pk_SStream__write_char(&buff, '\\'); break;
                case 'n': pk_SStream__write_char(&buff, '\n'); break;
                case 'r': pk_SStream__write_char(&buff, '\r'); break;
                case 't': pk_SStream__write_char(&buff, '\t'); break;
                case 'b': pk_SStream__write_char(&buff, '\b'); break;
                case 'x': {
                    // \xHH — exactly two hex digits
                    char hex[3] = {eatchar(self), eatchar(self), '\0'};
                    int code;
                    if(sscanf(hex, "%x", &code) != 1) {
                        return SyntaxError("invalid hex char");
                    }
                    pk_SStream__write_char(&buff, (char)code);
                } break;
                default: return SyntaxError("invalid escape char");
            }
        } else {
            pk_SStream__write_char(&buff, c);
        }
    }
    // transfer ownership of the accumulated bytes to the caller
    *out = pk_SStream__submit(&buff);
    return NULL;
}
|
||||
|
||||
/* Flavor of a string literal, selected by its prefix ('r', 'f', 'b'). */
enum StringType {
    NORMAL_STRING,  // 'x' / "x"
    RAW_STRING,     // r'x' — escapes kept verbatim
    F_STRING,       // f'x' — interpolated at compile time
    NORMAL_BYTES    // b'x'
};
|
||||
|
||||
/* Consume a string literal body and emit the token matching its flavor
 * (TK_FSTR / TK_BYTES / TK_STR). The payload is owned by the token. */
static Error* eat_string(pk_Lexer* self, char quote, enum StringType type){
    py_Str payload;
    Error* err = eat_string_until(self, quote, type == RAW_STRING, &payload);
    if(err) return err;
    TokenValue value = {TokenValue_STR, ._str = payload};
    TokenIndex tk;
    switch(type){
        case F_STRING: tk = TK_FSTR; break;
        case NORMAL_BYTES: tk = TK_BYTES; break;
        default: tk = TK_STR; break; // NORMAL_STRING and RAW_STRING
    }
    add_token_with_value(self, tk, value);
    return NULL;
}
|
||||
|
||||
/* Scan a numeric literal starting at token_start. Emits TK_LONG for an 'L'
 * suffix, TK_NUM for ints/floats, TK_IMAG for a 'j' suffix, or a
 * SyntaxError for malformed input. */
static Error* eat_number(pk_Lexer* self){
    const char* i = self->token_start;
    while(is_possible_number_char(*i)) i++;

    // the coarse scan above stops at '+'/'-', so patch up "1e-3"-style
    // exponents by continuing past the sign
    bool is_scientific_notation = false;
    if(*(i - 1) == 'e' && (*i == '+' || *i == '-')) {
        i++;
        while(isdigit(*i) || *i == 'j') i++;
        is_scientific_notation = true;
    }

    c11_string text = {self->token_start, i - self->token_start};
    self->curr_char = i;

    if(text.data[0] != '.' && !is_scientific_notation) {
        // try long
        if(i[-1] == 'L') {
            add_token(self, TK_LONG);
            return NULL;
        }
        // try integer
        // BUG FIX: the value was tagged TokenValue_EMPTY even though _i64
        // was filled in; downstream code (e.g. the precompile serializer)
        // dispatches on .index and would treat int constants as empty.
        TokenValue value = {.index = TokenValue_I64};
        switch(parse_uint(text, &value._i64, -1)) {
            case IntParsing_SUCCESS:
                add_token_with_value(self, TK_NUM, value);
                return NULL;
            case IntParsing_OVERFLOW:
                return SyntaxError("int literal is too large");
            case IntParsing_FAILURE:
                break; // fall through to float parsing
        }
    }

    // try float
    double float_out;
    char* p_end;
    float_out = strtod(text.data, &p_end);

    if(p_end == text.data + text.size){
        TokenValue value = {.index = TokenValue_F64, ._f64 = float_out};
        add_token_with_value(self, TK_NUM, value);
        return NULL;
    }

    // imaginary literal: everything but the trailing 'j' must parse as float
    if(i[-1] == 'j' && p_end == text.data + text.size - 1) {
        TokenValue value = {.index = TokenValue_F64, ._f64 = float_out};
        add_token_with_value(self, TK_IMAG, value);
        return NULL;
    }

    return SyntaxError("invalid number literal");
}
|
||||
|
||||
/* Scan forward and emit at least one token (some paths emit several, e.g.
 * EOL followed by INDENT/DEDENT). Sets *eof once the final TK_EOF has been
 * emitted; the caller loops until then. Returns an Error* on a lexical
 * error, NULL otherwise. */
static Error* lex_one_token(pk_Lexer* self, bool* eof){
    *eof = false;
    while(*self->curr_char) {
        self->token_start = self->curr_char;
        char c = eatchar_include_newline(self);
        switch(c) {
            // string literals
            case '\'':
            case '"': {
                Error* err = eat_string(self, c, NORMAL_STRING);
                if(err) return err;
                return NULL;
            }
            // comment: skip and keep scanning for a real token
            case '#': skip_line_comment(self); break;
            // single-char tokens
            case '~': add_token(self, TK_INVERT); return NULL;
            case '{': add_token(self, TK_LBRACE); return NULL;
            case '}': add_token(self, TK_RBRACE); return NULL;
            case ',': add_token(self, TK_COMMA); return NULL;
            case ':': add_token(self, TK_COLON); return NULL;
            case ';': add_token(self, TK_SEMICOLON); return NULL;
            case '(': add_token(self, TK_LPAREN); return NULL;
            case ')': add_token(self, TK_RPAREN); return NULL;
            case '[': add_token(self, TK_LBRACKET); return NULL;
            case ']': add_token(self, TK_RBRACKET); return NULL;
            case '@': add_token(self, TK_DECORATOR); return NULL;
            case '\\': {
                // line continuation character
                char c = eatchar_include_newline(self);
                if(c != '\n') {
                    // in the REPL, a trailing backslash means "give me more input"
                    if(self->src->mode == REPL_MODE && c == '\0') return NeedMoreLines();
                    return SyntaxError("expected newline after line continuation character");
                }
                eat_spaces(self);
                return NULL;
            }
            // one- or two-char operators (op / op=)
            case '%': add_token_2(self, '=', TK_MOD, TK_IMOD); return NULL;
            case '&': add_token_2(self, '=', TK_AND, TK_IAND); return NULL;
            case '|': add_token_2(self, '=', TK_OR, TK_IOR); return NULL;
            case '^': add_token_2(self, '=', TK_XOR, TK_IXOR); return NULL;
            case '.': {
                // '.', '..', '...', or a float like '.5'
                if(matchchar(self, '.')) {
                    if(matchchar(self, '.')) {
                        add_token(self, TK_DOTDOTDOT);
                    } else {
                        add_token(self, TK_DOTDOT);
                    }
                } else {
                    char next_char = *self->curr_char;
                    if(next_char >= '0' && next_char <= '9') {
                        Error* err = eat_number(self);
                        if(err) return err;
                    } else {
                        add_token(self, TK_DOT);
                    }
                }
                return NULL;
            }
            case '=': add_token_2(self, '=', TK_ASSIGN, TK_EQ); return NULL;
            case '+': add_token_2(self, '=', TK_ADD, TK_IADD); return NULL;
            case '>': {
                // '>', '>=', '>>', '>>='
                if(matchchar(self, '='))
                    add_token(self, TK_GE);
                else if(matchchar(self, '>'))
                    add_token_2(self, '=', TK_RSHIFT, TK_IRSHIFT);
                else
                    add_token(self, TK_GT);
                return NULL;
            }
            case '<': {
                // '<', '<=', '<<', '<<='
                if(matchchar(self, '='))
                    add_token(self, TK_LE);
                else if(matchchar(self, '<'))
                    add_token_2(self, '=', TK_LSHIFT, TK_ILSHIFT);
                else
                    add_token(self, TK_LT);
                return NULL;
            }
            case '-': {
                // '-', '-=', '->'
                if(matchchar(self, '='))
                    add_token(self, TK_ISUB);
                else if(matchchar(self, '>'))
                    add_token(self, TK_ARROW);
                else
                    add_token(self, TK_SUB);
                return NULL;
            }
            case '!':
                if(matchchar(self, '=')){
                    add_token(self, TK_NE);
                }else{
                    // NOTE(review): SyntaxError is currently a NULL-returning
                    // stub, so this falls through to `break` and a lone '!'
                    // is silently skipped — revisit once errors are wired up.
                    Error* err = SyntaxError("expected '=' after '!'");
                    if(err) return err;
                }
                break;
            case '*':
                if(matchchar(self, '*')) {
                    add_token(self, TK_POW); // '**'
                } else {
                    add_token_2(self, '=', TK_MUL, TK_IMUL);
                }
                return NULL;
            case '/':
                if(matchchar(self, '/')) {
                    add_token_2(self, '=', TK_FLOORDIV, TK_IFLOORDIV); // '//' or '//='
                } else {
                    add_token_2(self, '=', TK_DIV, TK_IDIV);
                }
                return NULL;
            // inter-token whitespace: keep scanning
            case ' ':
            case '\t': eat_spaces(self); break;
            case '\n': {
                add_token(self, TK_EOL);
                if(!eat_indentation(self)){
                    return SyntaxError("unindent does not match any outer indentation level");
                }
                return NULL;
            }
            default: {
                // string prefixes (f/r/b) followed by a quote
                if(c == 'f') {
                    if(matchchar(self, '\'')) return eat_string(self, '\'', F_STRING);
                    if(matchchar(self, '"')) return eat_string(self, '"', F_STRING);
                } else if(c == 'r') {
                    if(matchchar(self, '\'')) return eat_string(self, '\'', RAW_STRING);
                    if(matchchar(self, '"')) return eat_string(self, '"', RAW_STRING);
                } else if(c == 'b') {
                    if(matchchar(self, '\'')) return eat_string(self, '\'', NORMAL_BYTES);
                    if(matchchar(self, '"')) return eat_string(self, '"', NORMAL_BYTES);
                }
                if(c >= '0' && c <= '9') return eat_number(self);
                return eat_name(self); // identifier or keyword
            }
        }
    }

    // end of input: flush pending DEDENTs one per call, then emit EOF
    self->token_start = self->curr_char;
    while(self->indents.count > 1) {
        c11_vector__pop(int, &self->indents);
        add_token(self, TK_DEDENT);
        return NULL; // one DEDENT per call; the caller keeps calling until *eof
    }
    add_token(self, TK_EOF);
    *eof = true;
    return NULL;
}
|
||||
|
||||
/* Rebuild the token stream from a precompiled dump (the format written by
 * pk_Lexer__process_and_dump): a "pkpy:<version>" header, the compile mode,
 * a deduplicated raw-string table, then one serialized token per line.
 * Tokens are appended to self->nexts; raw-string-backed tokens point into
 * self->src->_precompiled_tokens, which must outlive them. */
static Error* from_precompiled(pk_Lexer* self) {
    pk_TokenDeserializer deserializer;
    pk_TokenDeserializer__ctor(&deserializer, py_Str__data(&self->src->source));

    deserializer.curr += 5; // skip "pkpy:"
    c11_string version = pk_TokenDeserializer__read_string(&deserializer, '\n');

    // L1: the dump must come from this exact interpreter version
    if(c11_string__cmp3(version, PK_VERSION) != 0) {
        return SyntaxError("precompiled version mismatch");
    }
    // L2: compile mode must match
    if(pk_TokenDeserializer__read_uint(&deserializer, '\n') != (int64_t)self->src->mode){
        return SyntaxError("precompiled mode mismatch");
    }

    // L3/L4: raw-string table — copied into the source object so that token
    // start pointers stay valid
    int count = pk_TokenDeserializer__read_count(&deserializer);
    c11_vector* precompiled_tokens = &self->src->_precompiled_tokens;
    for(int i = 0; i < count; i++) {
        c11_string item = pk_TokenDeserializer__read_string(&deserializer, '\n');
        py_Str copied_item;
        py_Str__ctor2(&copied_item, item.data, item.size);
        c11_vector__push(py_Str, precompiled_tokens, copied_item);
    }

    // L5+: the tokens themselves
    count = pk_TokenDeserializer__read_count(&deserializer);
    for(int i = 0; i < count; i++) {
        Token t;
        t.type = (TokenIndex)pk_TokenDeserializer__read_uint(&deserializer, ',');
        if(is_raw_string_used(t.type)) {
            // token text is an index into the raw-string table
            int64_t index = pk_TokenDeserializer__read_uint(&deserializer, ',');
            py_Str* p = c11__at(py_Str, precompiled_tokens, index);
            t.start = py_Str__data(p);
            t.length = c11__getitem(py_Str, precompiled_tokens, index).size;
        } else {
            t.start = NULL;
            t.length = 0;
        }

        // line is delta-encoded: an empty field repeats the previous token's line
        if(pk_TokenDeserializer__match_char(&deserializer, ',')) {
            t.line = c11_vector__back(Token, &self->nexts).line;
        } else {
            t.line = (int)pk_TokenDeserializer__read_uint(&deserializer, ',');
        }

        // brackets_level uses the same delta encoding
        if(pk_TokenDeserializer__match_char(&deserializer, ',')) {
            t.brackets_level = c11_vector__back(Token, &self->nexts).brackets_level;
        } else {
            t.brackets_level = (int)pk_TokenDeserializer__read_uint(&deserializer, ',');
        }

        // payload tag: 'I' int, 'F' float, 'S' hex-encoded string, else empty
        char type = (*deserializer.curr++); // read_char
        switch(type) {
            case 'I': {
                int64_t res = pk_TokenDeserializer__read_uint(&deserializer, '\n');
                t.value = (TokenValue){TokenValue_I64, ._i64 = res};
            } break;
            case 'F': {
                double res = pk_TokenDeserializer__read_float(&deserializer, '\n');
                t.value = (TokenValue){TokenValue_F64, ._f64 = res};
            } break;
            case 'S': {
                py_Str res = pk_TokenDeserializer__read_string_from_hex(&deserializer, '\n');
                t.value = (TokenValue){TokenValue_STR, ._str = res};
            } break;
            default:
                t.value = EmptyTokenValue;
                break;
        }
        c11_vector__push(Token, &self->nexts, t);
    }
    return NULL;
}
|
||||
|
||||
/* Parse an unsigned integer literal in base 2/8/10/16. base == -1 infers
 * the base from a "0b"/"0o"/"0x" prefix (default 10). On SUCCESS, *out
 * holds the value; on OVERFLOW/FAILURE, *out is meaningless. The overflow
 * check is length-based and deliberately checked AFTER digit validation so
 * that an invalid digit reports FAILURE, not OVERFLOW (same precedence as
 * before).
 * BUG FIX: accumulation now runs on an unsigned value — the previous code
 * multiplied/shifted a signed int64 past INT64_MAX before the length check,
 * which is undefined behavior; unsigned arithmetic wraps instead. */
IntParsingResult parse_uint(c11_string text, int64_t* out, int base) {
    *out = 0;

    c11_string prefix = {.data = text.data, .size = PK_MIN(2, text.size)};
    if(base == -1) {
        // infer base from the prefix
        if(c11_string__cmp3(prefix, "0b") == 0)
            base = 2;
        else if(c11_string__cmp3(prefix, "0o") == 0)
            base = 8;
        else if(c11_string__cmp3(prefix, "0x") == 0)
            base = 16;
        else
            base = 10;
    }

    uint64_t acc = 0; // unsigned accumulator: overflow wraps, no UB

    if(base == 10) {
        // 10-base 12334
        if(text.size == 0) return IntParsing_FAILURE;
        for(int i = 0; i < text.size; i++) {
            char c = text.data[i];
            if(c >= '0' && c <= '9') {
                acc = acc * 10 + (uint64_t)(c - '0');
            } else {
                return IntParsing_FAILURE;
            }
        }
        // "9223372036854775807".__len__() == 19
        if(text.size > 19) return IntParsing_OVERFLOW;
        *out = (int64_t)acc;
        return IntParsing_SUCCESS;
    } else if(base == 2) {
        // 2-base 0b101010
        if(c11_string__cmp3(prefix, "0b") == 0) {
            // text.remove_prefix(2);
            text = (c11_string){text.data + 2, text.size - 2};
        }
        if(text.size == 0) return IntParsing_FAILURE;
        for(int i = 0; i < text.size; i++) {
            char c = text.data[i];
            if(c == '0' || c == '1') {
                acc = (acc << 1) | (uint64_t)(c - '0');
            } else {
                return IntParsing_FAILURE;
            }
        }
        // "111111111111111111111111111111111111111111111111111111111111111".__len__() == 63
        if(text.size > 63) return IntParsing_OVERFLOW;
        *out = (int64_t)acc;
        return IntParsing_SUCCESS;
    } else if(base == 8) {
        // 8-base 0o123
        if(c11_string__cmp3(prefix, "0o") == 0) {
            // text.remove_prefix(2);
            text = (c11_string){text.data + 2, text.size - 2};
        }
        if(text.size == 0) return IntParsing_FAILURE;
        for(int i = 0; i < text.size; i++) {
            char c = text.data[i];
            if(c >= '0' && c <= '7') {
                acc = (acc << 3) | (uint64_t)(c - '0');
            } else {
                return IntParsing_FAILURE;
            }
        }
        // "777777777777777777777".__len__() == 21
        if(text.size > 21) return IntParsing_OVERFLOW;
        *out = (int64_t)acc;
        return IntParsing_SUCCESS;
    } else if(base == 16) {
        // 16-base 0x123
        if(c11_string__cmp3(prefix, "0x") == 0) {
            // text.remove_prefix(2);
            text = (c11_string){text.data + 2, text.size - 2};
        }
        if(text.size == 0) return IntParsing_FAILURE;
        for(int i = 0; i < text.size; i++) {
            char c = text.data[i];
            if(c >= '0' && c <= '9') {
                acc = (acc << 4) | (uint64_t)(c - '0');
            } else if(c >= 'a' && c <= 'f') {
                acc = (acc << 4) | (uint64_t)(c - 'a' + 10);
            } else if(c >= 'A' && c <= 'F') {
                acc = (acc << 4) | (uint64_t)(c - 'A' + 10);
            } else {
                return IntParsing_FAILURE;
            }
        }
        // "7fffffffffffffff".__len__() == 16
        if(text.size > 16) return IntParsing_OVERFLOW;
        *out = (int64_t)acc;
        return IntParsing_SUCCESS;
    }
    return IntParsing_FAILURE;
}
|
||||
|
||||
/* Tokenize `src` and hand the resulting token array to the caller via
 * *out_tokens (which takes ownership; release with pk_TokenArray__dtor).
 * Handles both plain source and precompiled token dumps. Returns an Error*
 * on failure, in which case *out_tokens is not written. */
Error* pk_Lexer__process(pk_SourceData_ src, pk_TokenArray* out_tokens){
    pk_Lexer lexer;
    pk_Lexer__ctor(&lexer, src);

    if(src->is_precompiled) {
        Error* err = from_precompiled(&lexer);
        // BUG FIX: this path left *out_tokens uninitialized (marked TODO);
        // hand the deserialized tokens to the caller on success, mirroring
        // the normal path below.
        if(err == NULL) { *out_tokens = c11_vector__submit(&lexer.nexts); }
        pk_Lexer__dtor(&lexer);
        return err;
    }
    // push initial tokens
    Token sof = {TK_SOF, lexer.token_start, 0, lexer.current_line, lexer.brackets_level, EmptyTokenValue};
    c11_vector__push(Token, &lexer.nexts, sof);
    c11_vector__push(int, &lexer.indents, 0);

    // pump the lexer until EOF or the first error
    bool eof = false;
    while(!eof) {
        void* err = lex_one_token(&lexer, &eof);
        if(err){
            pk_Lexer__dtor(&lexer);
            return err;
        }
    }
    // set out_tokens (transfers ownership of the token buffer)
    *out_tokens = c11_vector__submit(&lexer.nexts);

    pk_Lexer__dtor(&lexer);
    return NULL;
}
|
||||
|
||||
/* Tokenize `src` and serialize the token stream into the precompiled text
 * format consumed by from_precompiled:
 *   L1: "pkpy:<version>"      L2: mode
 *   L3: "=<raw string count>" L4: one raw string per line
 *   L5: "=<token count>"      then one token per line
 * (type, optional raw-string index, delta-encoded line and brackets_level,
 * then an optional 'I'/'F'/'S'-tagged payload). */
Error* pk_Lexer__process_and_dump(pk_SourceData_ src, py_Str* out) {
    assert(!src->is_precompiled);
    pk_TokenArray nexts; // output tokens
    Error* err = pk_Lexer__process(src, &nexts);
    if(err) return err;

    pk_SStream ss;
    pk_SStream__ctor(&ss);

    // L1: version string
    pk_SStream__write_cstr(&ss, "pkpy:" PK_VERSION "\n");
    // L2: mode
    pk_SStream__write_int(&ss, (int)src->mode);
    pk_SStream__write_char(&ss, '\n');

    // collect the set of token texts that must be kept verbatim
    c11_smallmap_s2n token_indices;
    c11_smallmap_s2n__ctor(&token_indices);

    c11_vector__foreach(Token, &nexts, token) {
        if(is_raw_string_used(token->type)) {
            c11_string token_sv = {token->start, token->length};
            if(!c11_smallmap_s2n__contains(&token_indices, token_sv)) {
                c11_smallmap_s2n__set(&token_indices, token_sv, 0);
            }
        }
    }
    // L3: raw string count
    pk_SStream__write_char(&ss, '=');
    pk_SStream__write_int(&ss, token_indices.count);
    pk_SStream__write_char(&ss, '\n');

    uint16_t index = 0;
    for(int i=0; i<token_indices.count; i++){
        c11_smallmap_s2n_KV* kv = c11__at(c11_smallmap_s2n_KV, &token_indices, i);
        // L4: raw strings
        pk_SStream__write_cstrn(&ss, kv->key.data, kv->key.size);
        // BUG FIX: terminate each raw string with '\n' — from_precompiled
        // splits this section with read_string(..., '\n'), so without the
        // separator all raw strings fused into one and round-trip broke.
        pk_SStream__write_char(&ss, '\n');
        kv->value = index++; // remember each string's table index
    }

    // L5: token count
    pk_SStream__write_char(&ss, '=');
    pk_SStream__write_int(&ss, nexts.count);
    pk_SStream__write_char(&ss, '\n');

    for(int i = 0; i < nexts.count; i++) {
        const Token* token = c11__at(Token, &nexts, i);
        pk_SStream__write_int(&ss, (int)token->type);
        pk_SStream__write_char(&ss, ',');

        if(is_raw_string_used(token->type)) {
            // reference the raw-string table instead of inlining the text
            uint16_t *p = c11_smallmap_s2n__try_get(
                &token_indices, (c11_string){token->start, token->length});
            assert(p != NULL);
            pk_SStream__write_int(&ss, (int)*p);
            pk_SStream__write_char(&ss, ',');
        }
        // line: empty field == "same as previous token"
        if(i > 0 && c11__getitem(Token, &nexts, i-1).line == token->line){
            pk_SStream__write_char(&ss, ',');
        }else{
            pk_SStream__write_int(&ss, token->line);
            pk_SStream__write_char(&ss, ',');
        }

        // brackets_level: same delta encoding
        if(i > 0 && c11__getitem(Token, &nexts, i-1).brackets_level == token->brackets_level){
            pk_SStream__write_char(&ss, ',');
        }else{
            pk_SStream__write_int(&ss, token->brackets_level);
            pk_SStream__write_char(&ss, ',');
        }
        // visit token value
        switch(token->value.index){
            case TokenValue_EMPTY: break;
            case TokenValue_I64:
                pk_SStream__write_char(&ss, 'I');
                pk_SStream__write_int(&ss, token->value._i64);
                break;
            case TokenValue_F64:
                pk_SStream__write_char(&ss, 'F');
                pk_SStream__write_float(&ss, token->value._f64, -1);
                break;
            case TokenValue_STR: {
                // hex-encode so the payload can contain newlines/commas
                pk_SStream__write_char(&ss, 'S');
                c11_string sv = py_Str__sv(&token->value._str);
                for(int j=0; j<sv.size; j++){
                    pk_SStream__write_hex(&ss, sv.data[j], false);
                }
                break;
            }
        }
        pk_SStream__write_char(&ss, '\n');
    }
    *out = pk_SStream__submit(&ss);
    c11_smallmap_s2n__dtor(&token_indices);
    // BUG FIX: release the token array — STR token payloads own heap memory
    // and were leaked on every dump.
    pk_TokenArray__dtor(&nexts);
    return NULL;
}
|
||||
|
||||
void pk_TokenArray__dtor(pk_TokenArray *self){
|
||||
Token* data = self->data;
|
||||
for(int i=0; i<self->count; i++){
|
||||
if(data[i].value.index == TokenValue_STR){
|
||||
py_Str__dtor(&data[i].value._str);
|
||||
}
|
||||
}
|
||||
c11_array__dtor(self);
|
||||
}
|
||||
|
||||
const char* pk_TokenSymbols[] = {
|
||||
"@eof", "@eol", "@sof",
|
||||
|
@ -1,751 +0,0 @@
|
||||
#include "pocketpy/compiler/lexer.hpp"
|
||||
#include "pocketpy/common/config.h"
|
||||
#include "pocketpy/common/str.h"
|
||||
#include "pocketpy/common/smallmap.h"
|
||||
#include "pocketpy/compiler/lexer.h"
|
||||
|
||||
#include <cstdarg>
|
||||
|
||||
namespace pkpy {
|
||||
|
||||
static bool is_possible_number_char(char c) noexcept{
|
||||
switch(c) {
|
||||
// clang-format off
|
||||
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
||||
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
|
||||
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
|
||||
case '.': case 'L': case 'x': case 'o': case 'j':
|
||||
return true;
|
||||
default: return false;
|
||||
// clang-format on
|
||||
}
|
||||
}
|
||||
|
||||
bool Lexer::match_n_chars(int n, char c0) noexcept{
|
||||
const char* c = curr_char;
|
||||
for(int i = 0; i < n; i++) {
|
||||
if(*c == '\0') return false;
|
||||
if(*c != c0) return false;
|
||||
c++;
|
||||
}
|
||||
for(int i = 0; i < n; i++)
|
||||
eatchar_include_newline();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Lexer::match_string(const char* s) noexcept{
|
||||
int s_len = strlen(s);
|
||||
bool ok = strncmp(curr_char, s, s_len) == 0;
|
||||
if(ok)
|
||||
for(int i = 0; i < s_len; i++)
|
||||
eatchar_include_newline();
|
||||
return ok;
|
||||
}
|
||||
|
||||
int Lexer::eat_spaces() noexcept{
|
||||
int count = 0;
|
||||
while(true) {
|
||||
switch(peekchar()) {
|
||||
case ' ': count += 1; break;
|
||||
case '\t': count += 4; break;
|
||||
default: return count;
|
||||
}
|
||||
eatchar();
|
||||
}
|
||||
}
|
||||
|
||||
bool Lexer::eat_indentation() noexcept{
|
||||
if(brackets_level > 0) return true;
|
||||
int spaces = eat_spaces();
|
||||
if(peekchar() == '#') skip_line_comment();
|
||||
if(peekchar() == '\0' || peekchar() == '\n') return true;
|
||||
// https://docs.python.org/3/reference/lexical_analysis.html#indentation
|
||||
if(spaces > indents.back()) {
|
||||
indents.push_back(spaces);
|
||||
nexts.push_back(Token{TK_INDENT, token_start, 0, current_line, brackets_level, {}});
|
||||
} else if(spaces < indents.back()) {
|
||||
while(spaces < indents.back()) {
|
||||
indents.pop_back();
|
||||
nexts.push_back(Token{TK_DEDENT, token_start, 0, current_line, brackets_level, {}});
|
||||
}
|
||||
if(spaces != indents.back()) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
char Lexer::eatchar() noexcept{
|
||||
char c = peekchar();
|
||||
assert(c != '\n'); // eatchar() cannot consume a newline
|
||||
curr_char++;
|
||||
return c;
|
||||
}
|
||||
|
||||
char Lexer::eatchar_include_newline() noexcept{
|
||||
char c = peekchar();
|
||||
curr_char++;
|
||||
if(c == '\n') {
|
||||
current_line++;
|
||||
c11_vector__push(const char*, &src->line_starts, curr_char);
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
Error* Lexer::eat_name() noexcept{
|
||||
curr_char--;
|
||||
while(true) {
|
||||
unsigned char c = peekchar();
|
||||
int u8bytes = c11__u8_header(c, true);
|
||||
if(u8bytes == 0) return SyntaxError("invalid char: %c", c);
|
||||
if(u8bytes == 1) {
|
||||
if(isalpha(c) || c == '_' || isdigit(c)) {
|
||||
curr_char++;
|
||||
continue;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// handle multibyte char
|
||||
Str u8str(curr_char, u8bytes);
|
||||
if(u8str.size != u8bytes) return SyntaxError("invalid utf8 sequence: %s", u8str.c_str());
|
||||
uint32_t value = 0;
|
||||
for(int k = 0; k < u8bytes; k++) {
|
||||
uint8_t b = u8str[k];
|
||||
if(k == 0) {
|
||||
if(u8bytes == 2)
|
||||
value = (b & 0b00011111) << 6;
|
||||
else if(u8bytes == 3)
|
||||
value = (b & 0b00001111) << 12;
|
||||
else if(u8bytes == 4)
|
||||
value = (b & 0b00000111) << 18;
|
||||
} else {
|
||||
value |= (b & 0b00111111) << (6 * (u8bytes - k - 1));
|
||||
}
|
||||
}
|
||||
if(c11__is_unicode_Lo_char(value))
|
||||
curr_char += u8bytes;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
int length = (int)(curr_char - token_start);
|
||||
if(length == 0) return SyntaxError("@id contains invalid char");
|
||||
c11_string name = {token_start, length};
|
||||
|
||||
if(src->mode == JSON_MODE) {
|
||||
if(c11_string__cmp3(name, "true") == 0) {
|
||||
add_token(TK_TRUE);
|
||||
} else if(c11_string__cmp3(name, "false") == 0) {
|
||||
add_token(TK_FALSE);
|
||||
} else if(c11_string__cmp3(name, "null") == 0) {
|
||||
add_token(TK_NONE);
|
||||
} else {
|
||||
return SyntaxError("invalid JSON token");
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char** KW_BEGIN = pk_TokenSymbols + TK_FALSE;
|
||||
int KW_COUNT = TK__COUNT__ - TK_FALSE;
|
||||
#define less(a, b) (c11_string__cmp3(b, a) > 0)
|
||||
int out;
|
||||
c11__lower_bound(const char*, KW_BEGIN, KW_COUNT, name, less, &out);
|
||||
#undef less
|
||||
|
||||
if(out != KW_COUNT && c11_string__cmp3(name, KW_BEGIN[out]) == 0) {
|
||||
add_token((TokenIndex)(out + TK_FALSE));
|
||||
} else {
|
||||
add_token(TK_ID);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void Lexer::skip_line_comment() noexcept{
|
||||
char c;
|
||||
while((c = peekchar()) != '\0') {
|
||||
if(c == '\n') return;
|
||||
eatchar();
|
||||
}
|
||||
}
|
||||
|
||||
bool Lexer::matchchar(char c) noexcept{
|
||||
if(peekchar() != c) return false;
|
||||
eatchar_include_newline();
|
||||
return true;
|
||||
}
|
||||
|
||||
void Lexer::add_token(TokenIndex type, TokenValue value) noexcept{
|
||||
switch(type) {
|
||||
case TK_LBRACE:
|
||||
case TK_LBRACKET:
|
||||
case TK_LPAREN: brackets_level++; break;
|
||||
case TK_RPAREN:
|
||||
case TK_RBRACKET:
|
||||
case TK_RBRACE: brackets_level--; break;
|
||||
default: break;
|
||||
}
|
||||
auto token = Token{type,
|
||||
token_start,
|
||||
(int)(curr_char - token_start),
|
||||
current_line - ((type == TK_EOL) ? 1 : 0),
|
||||
brackets_level,
|
||||
value};
|
||||
// handle "not in", "is not", "yield from"
|
||||
if(!nexts.empty()) {
|
||||
auto& back = nexts.back();
|
||||
if(back.type == TK_NOT_KW && type == TK_IN) {
|
||||
back.type = TK_NOT_IN;
|
||||
return;
|
||||
}
|
||||
if(back.type == TK_IS && type == TK_NOT_KW) {
|
||||
back.type = TK_IS_NOT;
|
||||
return;
|
||||
}
|
||||
if(back.type == TK_YIELD && type == TK_FROM) {
|
||||
back.type = TK_YIELD_FROM;
|
||||
return;
|
||||
}
|
||||
nexts.push_back(token);
|
||||
}
|
||||
}
|
||||
|
||||
void Lexer::add_token_2(char c, TokenIndex one, TokenIndex two) noexcept{
|
||||
if(matchchar(c))
|
||||
add_token(two);
|
||||
else
|
||||
add_token(one);
|
||||
}
|
||||
|
||||
Error* Lexer::eat_string_until(char quote, bool raw, Str* out) noexcept{
|
||||
bool quote3 = match_n_chars(2, quote);
|
||||
small_vector_2<char, 32> buff;
|
||||
while(true) {
|
||||
char c = eatchar_include_newline();
|
||||
if(c == quote) {
|
||||
if(quote3 && !match_n_chars(2, quote)) {
|
||||
buff.push_back(c);
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if(c == '\0') {
|
||||
if(quote3 && src->mode == REPL_MODE) return NeedMoreLines();
|
||||
return SyntaxError("EOL while scanning string literal");
|
||||
}
|
||||
if(c == '\n') {
|
||||
if(!quote3)
|
||||
return SyntaxError("EOL while scanning string literal");
|
||||
else {
|
||||
buff.push_back(c);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(!raw && c == '\\') {
|
||||
switch(eatchar_include_newline()) {
|
||||
case '"': buff.push_back('"'); break;
|
||||
case '\'': buff.push_back('\''); break;
|
||||
case '\\': buff.push_back('\\'); break;
|
||||
case 'n': buff.push_back('\n'); break;
|
||||
case 'r': buff.push_back('\r'); break;
|
||||
case 't': buff.push_back('\t'); break;
|
||||
case 'b': buff.push_back('\b'); break;
|
||||
case 'x': {
|
||||
char hex[3] = {eatchar(), eatchar(), '\0'};
|
||||
size_t parsed;
|
||||
char code;
|
||||
try {
|
||||
code = (char)std::stoi(hex, &parsed, 16);
|
||||
} catch(...) {
|
||||
return SyntaxError("invalid hex char");
|
||||
}
|
||||
if(parsed != 2) return SyntaxError("invalid hex char");
|
||||
buff.push_back(code);
|
||||
} break;
|
||||
default: return SyntaxError("invalid escape char");
|
||||
}
|
||||
} else {
|
||||
buff.push_back(c);
|
||||
}
|
||||
}
|
||||
*out = Str(buff.data(), buff.size());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
Error* Lexer::eat_string(char quote, StringType type) noexcept{
|
||||
Str s;
|
||||
Error* err = eat_string_until(quote, type == StringType::RAW_STRING, &s);
|
||||
if(err) return err;
|
||||
if(type == StringType::F_STRING) {
|
||||
add_token(TK_FSTR, s);
|
||||
}else if(type == StringType::NORMAL_BYTES) {
|
||||
add_token(TK_BYTES, s);
|
||||
}else{
|
||||
add_token(TK_STR, s);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Error* Lexer::eat_number() noexcept{
|
||||
const char* i = token_start;
|
||||
while(is_possible_number_char(*i))
|
||||
i++;
|
||||
|
||||
bool is_scientific_notation = false;
|
||||
if(*(i - 1) == 'e' && (*i == '+' || *i == '-')) {
|
||||
i++;
|
||||
while(isdigit(*i) || *i == 'j')
|
||||
i++;
|
||||
is_scientific_notation = true;
|
||||
}
|
||||
|
||||
std::string_view text(token_start, i - token_start);
|
||||
this->curr_char = i;
|
||||
|
||||
if(text[0] != '.' && !is_scientific_notation) {
|
||||
// try long
|
||||
if(i[-1] == 'L') {
|
||||
add_token(TK_LONG);
|
||||
return NULL;
|
||||
}
|
||||
// try integer
|
||||
i64 int_out;
|
||||
switch(parse_uint(text, &int_out, -1)) {
|
||||
case IntParsingResult::Success: add_token(TK_NUM, int_out); return NULL;
|
||||
case IntParsingResult::Overflow: return SyntaxError("int literal is too large");
|
||||
case IntParsingResult::Failure: break; // do nothing
|
||||
}
|
||||
}
|
||||
|
||||
// try float
|
||||
double float_out;
|
||||
char* p_end;
|
||||
try {
|
||||
float_out = std::strtod(text.data(), &p_end);
|
||||
} catch(...) {
|
||||
return SyntaxError("invalid number literal");
|
||||
}
|
||||
|
||||
if(p_end == text.data() + text.size()) {
|
||||
add_token(TK_NUM, (f64)float_out);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(i[-1] == 'j' && p_end == text.data() + text.size() - 1) {
|
||||
add_token(TK_IMAG, (f64)float_out);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return SyntaxError("invalid number literal");
|
||||
}
|
||||
|
||||
Error* Lexer::lex_one_token(bool* eof) noexcept{
|
||||
*eof = false;
|
||||
while(peekchar() != '\0') {
|
||||
token_start = curr_char;
|
||||
char c = eatchar_include_newline();
|
||||
switch(c) {
|
||||
case '\'':
|
||||
case '"': {
|
||||
Error* err = eat_string(c, StringType::NORMAL_STRING);
|
||||
if(err) return err;
|
||||
return NULL;
|
||||
}
|
||||
case '#': skip_line_comment(); break;
|
||||
case '~': add_token(TK_INVERT); return NULL;
|
||||
case '{': add_token(TK_LBRACE); return NULL;
|
||||
case '}': add_token(TK_RBRACE); return NULL;
|
||||
case ',': add_token(TK_COMMA); return NULL;
|
||||
case ':': add_token(TK_COLON); return NULL;
|
||||
case ';': add_token(TK_SEMICOLON); return NULL;
|
||||
case '(': add_token(TK_LPAREN); return NULL;
|
||||
case ')': add_token(TK_RPAREN); return NULL;
|
||||
case '[': add_token(TK_LBRACKET); return NULL;
|
||||
case ']': add_token(TK_RBRACKET); return NULL;
|
||||
case '@': add_token(TK_DECORATOR); return NULL;
|
||||
case '\\': {
|
||||
// line continuation character
|
||||
char c = eatchar_include_newline();
|
||||
if(c != '\n') {
|
||||
if(src->mode == REPL_MODE && c == '\0') return NeedMoreLines();
|
||||
return SyntaxError("expected newline after line continuation character");
|
||||
}
|
||||
eat_spaces();
|
||||
return NULL;
|
||||
}
|
||||
case '%': add_token_2('=', TK_MOD, TK_IMOD); return NULL;
|
||||
case '&': add_token_2('=', TK_AND, TK_IAND); return NULL;
|
||||
case '|': add_token_2('=', TK_OR, TK_IOR); return NULL;
|
||||
case '^': add_token_2('=', TK_XOR, TK_IXOR); return NULL;
|
||||
case '.': {
|
||||
if(matchchar('.')) {
|
||||
if(matchchar('.')) {
|
||||
add_token(TK_DOTDOTDOT);
|
||||
} else {
|
||||
add_token(TK_DOTDOT);
|
||||
}
|
||||
} else {
|
||||
char next_char = peekchar();
|
||||
if(next_char >= '0' && next_char <= '9') {
|
||||
Error* err = eat_number();
|
||||
if(err) return err;
|
||||
} else {
|
||||
add_token(TK_DOT);
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case '=': add_token_2('=', TK_ASSIGN, TK_EQ); return NULL;
|
||||
case '+': add_token_2('=', TK_ADD, TK_IADD); return NULL;
|
||||
case '>': {
|
||||
if(matchchar('='))
|
||||
add_token(TK_GE);
|
||||
else if(matchchar('>'))
|
||||
add_token_2('=', TK_RSHIFT, TK_IRSHIFT);
|
||||
else
|
||||
add_token(TK_GT);
|
||||
return NULL;
|
||||
}
|
||||
case '<': {
|
||||
if(matchchar('='))
|
||||
add_token(TK_LE);
|
||||
else if(matchchar('<'))
|
||||
add_token_2('=', TK_LSHIFT, TK_ILSHIFT);
|
||||
else
|
||||
add_token(TK_LT);
|
||||
return NULL;
|
||||
}
|
||||
case '-': {
|
||||
if(matchchar('='))
|
||||
add_token(TK_ISUB);
|
||||
else if(matchchar('>'))
|
||||
add_token(TK_ARROW);
|
||||
else
|
||||
add_token(TK_SUB);
|
||||
return NULL;
|
||||
}
|
||||
case '!':
|
||||
if(matchchar('=')){
|
||||
add_token(TK_NE);
|
||||
}else{
|
||||
Error* err = SyntaxError("expected '=' after '!'");
|
||||
if(err) return err;
|
||||
}
|
||||
break;
|
||||
case '*':
|
||||
if(matchchar('*')) {
|
||||
add_token(TK_POW); // '**'
|
||||
} else {
|
||||
add_token_2('=', TK_MUL, TK_IMUL);
|
||||
}
|
||||
return NULL;
|
||||
case '/':
|
||||
if(matchchar('/')) {
|
||||
add_token_2('=', TK_FLOORDIV, TK_IFLOORDIV);
|
||||
} else {
|
||||
add_token_2('=', TK_DIV, TK_IDIV);
|
||||
}
|
||||
return NULL;
|
||||
case ' ':
|
||||
case '\t': eat_spaces(); break;
|
||||
case '\n': {
|
||||
add_token(TK_EOL);
|
||||
if(!eat_indentation()){
|
||||
return IndentationError("unindent does not match any outer indentation level");
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
default: {
|
||||
if(c == 'f') {
|
||||
if(matchchar('\'')) return eat_string('\'', StringType::F_STRING);
|
||||
if(matchchar('"')) return eat_string('"', StringType::F_STRING);
|
||||
} else if(c == 'r') {
|
||||
if(matchchar('\'')) return eat_string('\'', StringType::RAW_STRING);
|
||||
if(matchchar('"')) return eat_string('"', StringType::RAW_STRING);
|
||||
} else if(c == 'b') {
|
||||
if(matchchar('\'')) return eat_string('\'', StringType::NORMAL_BYTES);
|
||||
if(matchchar('"')) return eat_string('"', StringType::NORMAL_BYTES);
|
||||
}
|
||||
if(c >= '0' && c <= '9') return eat_number();
|
||||
return eat_name();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
token_start = curr_char;
|
||||
while(indents.size() > 1) {
|
||||
indents.pop_back();
|
||||
add_token(TK_DEDENT);
|
||||
return NULL;
|
||||
}
|
||||
add_token(TK_EOF);
|
||||
*eof = true;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Error* Lexer::_error(bool lexer_err, const char* type, const char* msg, va_list* args, i64 userdata) noexcept{
|
||||
Error* err = (Error*)malloc(sizeof(Error));
|
||||
err->type = type;
|
||||
err->src = src;
|
||||
PK_INCREF(src);
|
||||
if(lexer_err){
|
||||
err->lineno = current_line;
|
||||
err->cursor = curr_char;
|
||||
if(*curr_char == '\n') {
|
||||
err->lineno--;
|
||||
err->cursor--;
|
||||
}
|
||||
}else{
|
||||
err->lineno = -1;
|
||||
err->cursor = NULL;
|
||||
}
|
||||
if(args){
|
||||
vsnprintf(err->msg, sizeof(err->msg), msg, *args);
|
||||
}else{
|
||||
strncpy(err->msg, msg, sizeof(err->msg));
|
||||
}
|
||||
err->userdata = userdata;
|
||||
return err;
|
||||
}
|
||||
|
||||
Error* Lexer::SyntaxError(const char* fmt, ...) noexcept{
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
Error* err = _error(true, "SyntaxError", fmt, &args);
|
||||
va_end(args);
|
||||
return err;
|
||||
}
|
||||
|
||||
Error* Lexer::run() noexcept{
|
||||
assert(!this->used);
|
||||
this->used = true;
|
||||
if(src->is_precompiled) {
|
||||
return from_precompiled();
|
||||
}
|
||||
// push initial tokens
|
||||
this->nexts.push_back(Token{TK_SOF, token_start, 0, current_line, brackets_level, {}});
|
||||
this->indents.push_back(0);
|
||||
|
||||
bool eof = false;
|
||||
while(!eof) {
|
||||
Error* err = lex_one_token(&eof);
|
||||
if(err) return err;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Error* Lexer::from_precompiled() noexcept{
|
||||
pk_TokenDeserializer deserializer;
|
||||
pk_TokenDeserializer__ctor(&deserializer, py_Str__data(&src->source));
|
||||
|
||||
deserializer.curr += 5; // skip "pkpy:"
|
||||
c11_string version = pk_TokenDeserializer__read_string(&deserializer, '\n');
|
||||
|
||||
if(c11_string__cmp3(version, PK_VERSION) != 0) {
|
||||
return SyntaxError("precompiled version mismatch");
|
||||
}
|
||||
if(pk_TokenDeserializer__read_uint(&deserializer, '\n') != (i64)src->mode){
|
||||
return SyntaxError("precompiled mode mismatch");
|
||||
}
|
||||
|
||||
int count = pk_TokenDeserializer__read_count(&deserializer);
|
||||
c11_vector* precompiled_tokens = &src->_precompiled_tokens;
|
||||
for(int i = 0; i < count; i++) {
|
||||
c11_string item = pk_TokenDeserializer__read_string(&deserializer, '\n');
|
||||
py_Str copied_item;
|
||||
py_Str__ctor2(&copied_item, item.data, item.size);
|
||||
c11_vector__push(py_Str, precompiled_tokens, copied_item);
|
||||
}
|
||||
|
||||
count = pk_TokenDeserializer__read_count(&deserializer);
|
||||
for(int i = 0; i < count; i++) {
|
||||
Token t;
|
||||
t.type = (TokenIndex)pk_TokenDeserializer__read_uint(&deserializer, ',');
|
||||
if(is_raw_string_used(t.type)) {
|
||||
i64 index = pk_TokenDeserializer__read_uint(&deserializer, ',');
|
||||
py_Str* p = c11__at(py_Str, precompiled_tokens, index);
|
||||
t.start = py_Str__data(p);
|
||||
t.length = c11__getitem(py_Str, precompiled_tokens, index).size;
|
||||
} else {
|
||||
t.start = NULL;
|
||||
t.length = 0;
|
||||
}
|
||||
|
||||
if(pk_TokenDeserializer__match_char(&deserializer, ',')) {
|
||||
t.line = nexts.back().line;
|
||||
} else {
|
||||
t.line = (int)pk_TokenDeserializer__read_uint(&deserializer, ',');
|
||||
}
|
||||
|
||||
if(pk_TokenDeserializer__match_char(&deserializer, ',')) {
|
||||
t.brackets_level = nexts.back().brackets_level;
|
||||
} else {
|
||||
t.brackets_level = (int)pk_TokenDeserializer__read_uint(&deserializer, ',');
|
||||
}
|
||||
|
||||
char type = (*deserializer.curr++); // read_char
|
||||
switch(type) {
|
||||
case 'I':
|
||||
t.value = pk_TokenDeserializer__read_uint(&deserializer, '\n');
|
||||
break;
|
||||
case 'F':
|
||||
t.value = pk_TokenDeserializer__read_float(&deserializer, '\n');
|
||||
break;
|
||||
case 'S': {
|
||||
py_Str res = pk_TokenDeserializer__read_string_from_hex(&deserializer, '\n');
|
||||
t.value = Str(std::move(res));
|
||||
} break;
|
||||
default:
|
||||
t.value = {};
|
||||
break;
|
||||
}
|
||||
nexts.push_back(t);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Error* Lexer::precompile(Str* out) noexcept{
|
||||
assert(!src->is_precompiled);
|
||||
Error* err = run();
|
||||
if(err) return err;
|
||||
SStream ss;
|
||||
ss << "pkpy:" PK_VERSION << '\n'; // L1: version string
|
||||
ss << (int)src->mode << '\n'; // L2: mode
|
||||
|
||||
c11_smallmap_s2n token_indices;
|
||||
c11_smallmap_s2n__ctor(&token_indices);
|
||||
|
||||
for(auto token: nexts) {
|
||||
if(is_raw_string_used(token.type)) {
|
||||
c11_string token_sv = {token.start, token.length};
|
||||
if(!c11_smallmap_s2n__contains(&token_indices, token_sv)) {
|
||||
c11_smallmap_s2n__set(&token_indices, token_sv, 0);
|
||||
// assert no '\n' in token.sv()
|
||||
for(char c: token.sv())
|
||||
assert(c != '\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
ss << "=" << (int)token_indices.count << '\n'; // L3: raw string count
|
||||
uint16_t index = 0;
|
||||
for(int i=0; i<token_indices.count; i++){
|
||||
auto kv = c11__at(c11_smallmap_s2n_KV, &token_indices, i);
|
||||
ss << kv->key << '\n'; // L4: raw strings
|
||||
kv->value = index++;
|
||||
}
|
||||
|
||||
ss << "=" << (int)nexts.size() << '\n'; // L5: token count
|
||||
for(int i = 0; i < nexts.size(); i++) {
|
||||
const Token& token = nexts[i];
|
||||
ss << (int)token.type << ',';
|
||||
if(is_raw_string_used(token.type)) {
|
||||
uint16_t *p = c11_smallmap_s2n__try_get(&token_indices, {token.start, token.length});
|
||||
assert(p != NULL);
|
||||
ss << (int)*p << ',';
|
||||
}
|
||||
if(i > 0 && nexts[i - 1].line == token.line)
|
||||
ss << ',';
|
||||
else
|
||||
ss << token.line << ',';
|
||||
if(i > 0 && nexts[i - 1].brackets_level == token.brackets_level)
|
||||
ss << ',';
|
||||
else
|
||||
ss << token.brackets_level << ',';
|
||||
// visit token value
|
||||
std::visit(
|
||||
[&ss](auto&& arg) {
|
||||
using T = std::decay_t<decltype(arg)>;
|
||||
if constexpr(std::is_same_v<T, i64>) {
|
||||
ss << 'I' << arg;
|
||||
} else if constexpr(std::is_same_v<T, f64>) {
|
||||
ss << 'F' << arg;
|
||||
} else if constexpr(std::is_same_v<T, Str>) {
|
||||
ss << 'S';
|
||||
for(char c: arg)
|
||||
ss.write_hex((unsigned char)c);
|
||||
}
|
||||
ss << '\n';
|
||||
},
|
||||
token.value);
|
||||
}
|
||||
*out = ss.str();
|
||||
c11_smallmap_s2n__dtor(&token_indices);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
IntParsingResult parse_uint(std::string_view text, i64* out, int base) noexcept{
|
||||
*out = 0;
|
||||
|
||||
if(base == -1) {
|
||||
if(text.substr(0, 2) == "0b")
|
||||
base = 2;
|
||||
else if(text.substr(0, 2) == "0o")
|
||||
base = 8;
|
||||
else if(text.substr(0, 2) == "0x")
|
||||
base = 16;
|
||||
else
|
||||
base = 10;
|
||||
}
|
||||
|
||||
if(base == 10) {
|
||||
// 10-base 12334
|
||||
if(text.length() == 0) return IntParsingResult::Failure;
|
||||
for(char c: text) {
|
||||
if(c >= '0' && c <= '9') {
|
||||
*out = (*out * 10) + (c - '0');
|
||||
} else {
|
||||
return IntParsingResult::Failure;
|
||||
}
|
||||
}
|
||||
const std::string_view INT64_MAX_S = "9223372036854775807";
|
||||
if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
|
||||
return IntParsingResult::Success;
|
||||
} else if(base == 2) {
|
||||
// 2-base 0b101010
|
||||
if(text.substr(0, 2) == "0b") text.remove_prefix(2);
|
||||
if(text.length() == 0) return IntParsingResult::Failure;
|
||||
for(char c: text) {
|
||||
if(c == '0' || c == '1') {
|
||||
*out = (*out << 1) | (c - '0');
|
||||
} else {
|
||||
return IntParsingResult::Failure;
|
||||
}
|
||||
}
|
||||
const std::string_view INT64_MAX_S = "111111111111111111111111111111111111111111111111111111111111111";
|
||||
if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
|
||||
return IntParsingResult::Success;
|
||||
} else if(base == 8) {
|
||||
// 8-base 0o123
|
||||
if(text.substr(0, 2) == "0o") text.remove_prefix(2);
|
||||
if(text.length() == 0) return IntParsingResult::Failure;
|
||||
for(char c: text) {
|
||||
if(c >= '0' && c <= '7') {
|
||||
*out = (*out << 3) | (c - '0');
|
||||
} else {
|
||||
return IntParsingResult::Failure;
|
||||
}
|
||||
}
|
||||
const std::string_view INT64_MAX_S = "777777777777777777777";
|
||||
if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
|
||||
return IntParsingResult::Success;
|
||||
} else if(base == 16) {
|
||||
// 16-base 0x123
|
||||
if(text.substr(0, 2) == "0x") text.remove_prefix(2);
|
||||
if(text.length() == 0) return IntParsingResult::Failure;
|
||||
for(char c: text) {
|
||||
if(c >= '0' && c <= '9') {
|
||||
*out = (*out << 4) | (c - '0');
|
||||
} else if(c >= 'a' && c <= 'f') {
|
||||
*out = (*out << 4) | (c - 'a' + 10);
|
||||
} else if(c >= 'A' && c <= 'F') {
|
||||
*out = (*out << 4) | (c - 'A' + 10);
|
||||
} else {
|
||||
return IntParsingResult::Failure;
|
||||
}
|
||||
}
|
||||
const std::string_view INT64_MAX_S = "7fffffffffffffff";
|
||||
if(text.length() > INT64_MAX_S.length()) return IntParsingResult::Overflow;
|
||||
return IntParsingResult::Success;
|
||||
}
|
||||
return IntParsingResult::Failure;
|
||||
}
|
||||
|
||||
} // namespace pkpy
|
@ -1,187 +0,0 @@
|
||||
#include "pocketpy/compiler/lexer.h"
|
||||
#include "pocketpy/objects/sourcedata.h"
|
||||
|
||||
typedef struct pk_Lexer{
|
||||
pk_SourceData_ src;
|
||||
const char* token_start;
|
||||
const char* curr_char;
|
||||
int current_line;
|
||||
int brackets_level;
|
||||
|
||||
c11_vector/*T=Token*/ nexts;
|
||||
c11_vector/*T=int*/ indents;
|
||||
} pk_Lexer;
|
||||
|
||||
const static TokenValue EmptyTokenValue;
|
||||
|
||||
void pk_Lexer__ctor(pk_Lexer* self, pk_SourceData_ src){
|
||||
PK_INCREF(src);
|
||||
self->src = src;
|
||||
self->curr_char = self->token_start = py_Str__data(&src->source);
|
||||
self->current_line = 1;
|
||||
self->brackets_level = 0;
|
||||
c11_vector__ctor(&self->nexts, sizeof(Token));
|
||||
c11_vector__ctor(&self->indents, sizeof(int));
|
||||
}
|
||||
|
||||
void pk_Lexer__dtor(pk_Lexer* self){
|
||||
PK_DECREF(self->src);
|
||||
c11_vector__dtor(&self->nexts);
|
||||
c11_vector__dtor(&self->indents);
|
||||
}
|
||||
|
||||
void* pk_Lexer__run(pk_SourceData_ src, void** out_tokens){
|
||||
pk_Lexer lexer;
|
||||
pk_Lexer__ctor(&lexer, src);
|
||||
|
||||
if(src->is_precompiled) {
|
||||
pk_Lexer__dtor(&lexer);
|
||||
return from_precompiled();
|
||||
}
|
||||
// push initial tokens
|
||||
Token sof = {TK_SOF, lexer.token_start, 0, lexer.current_line, lexer.brackets_level, EmptyTokenValue};
|
||||
c11_vector__push(Token, &lexer.nexts, sof);
|
||||
c11_vector__push(int, &lexer.indents, 0);
|
||||
|
||||
bool eof = false;
|
||||
while(!eof) {
|
||||
void* err = lex_one_token(&eof);
|
||||
if(err){
|
||||
pk_Lexer__dtor(&lexer);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
pk_Lexer__dtor(&lexer);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char eatchar(pk_Lexer* self){
|
||||
char c = *self->curr_char;
|
||||
assert(c != '\n'); // eatchar() cannot consume a newline
|
||||
self->curr_char++;
|
||||
return c;
|
||||
}
|
||||
|
||||
char eatchar_include_newline(pk_Lexer* self){
|
||||
char c = *self->curr_char;
|
||||
self->curr_char++;
|
||||
if(c == '\n') {
|
||||
self->current_line++;
|
||||
c11_vector__push(const char*, &self->src->line_starts, self->curr_char);
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
int eat_spaces(pk_Lexer* self){
|
||||
int count = 0;
|
||||
while(true) {
|
||||
switch(*self->curr_char) {
|
||||
case ' ': count += 1; break;
|
||||
case '\t': count += 4; break;
|
||||
default: return count;
|
||||
}
|
||||
eatchar(self);
|
||||
}
|
||||
}
|
||||
|
||||
bool matchchar(pk_Lexer* self, char c){
|
||||
if(*self->curr_char != c) return false;
|
||||
eatchar_include_newline(self);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool match_n_chars(pk_Lexer* self, int n, char c0){
|
||||
const char* c = self->curr_char;
|
||||
for(int i = 0; i < n; i++) {
|
||||
if(*c == '\0') return false;
|
||||
if(*c != c0) return false;
|
||||
c++;
|
||||
}
|
||||
for(int i = 0; i < n; i++)
|
||||
eatchar_include_newline(self);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool match_string(pk_Lexer* self, const char* s){
|
||||
int s_len = strlen(s);
|
||||
if(strncmp(self->curr_char, s, s_len) == 0){
|
||||
for(int i = 0; i < s_len; i++)
|
||||
eatchar_include_newline(self);
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
void skip_line_comment(pk_Lexer* self){
|
||||
while(*self->curr_char) {
|
||||
if(*self->curr_char == '\n') return;
|
||||
eatchar(self);
|
||||
}
|
||||
}
|
||||
|
||||
void add_token(pk_Lexer* self, TokenIndex type, TokenValue value){
|
||||
switch(type) {
|
||||
case TK_LBRACE:
|
||||
case TK_LBRACKET:
|
||||
case TK_LPAREN: self->brackets_level++; break;
|
||||
case TK_RPAREN:
|
||||
case TK_RBRACKET:
|
||||
case TK_RBRACE: self->brackets_level--; break;
|
||||
default: break;
|
||||
}
|
||||
Token token = {type,
|
||||
self->token_start,
|
||||
(int)(self->curr_char - self->token_start),
|
||||
self->current_line - ((type == TK_EOL) ? 1 : 0),
|
||||
self->brackets_level,
|
||||
value};
|
||||
// handle "not in", "is not", "yield from"
|
||||
if(self->nexts.count > 0) {
|
||||
Token* back = &c11_vector__back(Token, &self->nexts);
|
||||
if(back->type == TK_NOT_KW && type == TK_IN) {
|
||||
back->type = TK_NOT_IN;
|
||||
return;
|
||||
}
|
||||
if(back->type == TK_IS && type == TK_NOT_KW) {
|
||||
back->type = TK_IS_NOT;
|
||||
return;
|
||||
}
|
||||
if(back->type == TK_YIELD && type == TK_FROM) {
|
||||
back->type = TK_YIELD_FROM;
|
||||
return;
|
||||
}
|
||||
c11_vector__push(Token, &self->nexts, token);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void add_token_2(pk_Lexer* self, char c, TokenIndex one, TokenIndex two){
|
||||
if(matchchar(self, c))
|
||||
add_token(self, two, EmptyTokenValue);
|
||||
else
|
||||
add_token(self, one, EmptyTokenValue);
|
||||
}
|
||||
|
||||
bool eat_indentation(pk_Lexer* self){
|
||||
if(self->brackets_level > 0) return true;
|
||||
int spaces = eat_spaces(self);
|
||||
if(*self->curr_char == '#') skip_line_comment();
|
||||
if(*self->curr_char == '\0' || *self->curr_char == '\n'){
|
||||
return true;
|
||||
}
|
||||
// https://docs.python.org/3/reference/lexical_analysis.html#indentation
|
||||
int indents_back = c11_vector__back(int, &self->indents);
|
||||
if(spaces > indents_back) {
|
||||
c11_vector__push(int, &self->indents, spaces);
|
||||
Token t = {TK_INDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
|
||||
c11_vector__push(Token, &self->nexts, t);
|
||||
} else if(spaces < indents_back) {
|
||||
do {
|
||||
c11_vector__pop(int, &self->indents);
|
||||
Token t = {TK_DEDENT, self->token_start, 0, self->current_line, self->brackets_level, EmptyTokenValue};
|
||||
c11_vector__push(Token, &self->nexts, t);
|
||||
indents_back = c11_vector__back(int, &self->indents);
|
||||
} while(spaces < indents_back);
|
||||
if(spaces != indents_back) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
@ -17,7 +17,7 @@ FuncDecl_ FuncDecl__rcnew(pk_SourceData_ src, c11_string name){
|
||||
FuncDecl* self = malloc(sizeof(FuncDecl));
|
||||
self->rc.count = 1;
|
||||
self->rc.dtor = (void (*)(void*))FuncDecl__dtor;
|
||||
self->code = CodeObject__new(src, name);
|
||||
CodeObject__ctor(&self->code, src, name);
|
||||
|
||||
c11_vector__ctor(&self->args, sizeof(int));
|
||||
c11_vector__ctor(&self->kwargs, sizeof(FuncDeclKwArg));
|
||||
@ -34,7 +34,7 @@ FuncDecl_ FuncDecl__rcnew(pk_SourceData_ src, c11_string name){
|
||||
}
|
||||
|
||||
void FuncDecl__dtor(FuncDecl* self){
|
||||
CodeObject__delete(self->code);
|
||||
CodeObject__dtor(&self->code);
|
||||
c11_vector__dtor(&self->args);
|
||||
c11_vector__dtor(&self->kwargs);
|
||||
c11_smallmap_n2i__dtor(&self->kw_to_index);
|
||||
@ -46,8 +46,7 @@ void FuncDecl__add_kwarg(FuncDecl* self, int index, uint16_t key, const PyVar* v
|
||||
c11_vector__push(FuncDeclKwArg, &self->kwargs, item);
|
||||
}
|
||||
|
||||
CodeObject* CodeObject__new(pk_SourceData_ src, c11_string name){
|
||||
CodeObject* self = malloc(sizeof(CodeObject));
|
||||
void CodeObject__ctor(CodeObject* self, pk_SourceData_ src, c11_string name){
|
||||
self->src = src; PK_INCREF(src);
|
||||
py_Str__ctor2(&self->name, name.data, name.size);
|
||||
|
||||
@ -69,10 +68,9 @@ CodeObject* CodeObject__new(pk_SourceData_ src, c11_string name){
|
||||
|
||||
CodeBlock root_block = {CodeBlockType_NO_BLOCK, -1, 0, -1, -1};
|
||||
c11_vector__push(CodeBlock, &self->blocks, root_block);
|
||||
return self;
|
||||
}
|
||||
|
||||
void CodeObject__delete(CodeObject* self){
|
||||
void CodeObject__dtor(CodeObject* self){
|
||||
PK_DECREF(self->src);
|
||||
py_Str__dtor(&self->name);
|
||||
|
||||
@ -92,6 +90,4 @@ void CodeObject__delete(CodeObject* self){
|
||||
PK_DECREF(decl);
|
||||
}
|
||||
c11_vector__dtor(&self->func_decls);
|
||||
|
||||
free(self);
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
#include "pocketpy/pocketpy.h"
|
||||
#include "pocketpy/common/utils.h"
|
||||
#include "pocketpy/objects/object.h"
|
||||
#include "pocketpy/interpreter/vm.h"
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -1,15 +1,18 @@
|
||||
#include "pocketpy/objects/sourcedata.h"
|
||||
#include "pocketpy/pocketpy.h"
|
||||
|
||||
#include "pocketpy/common/utils.h"
|
||||
#include "pocketpy/objects/object.h"
|
||||
#include "pocketpy/interpreter/vm.h"
|
||||
#include "pocketpy/compiler/compiler.h"
|
||||
|
||||
pk_VM* pk_current_vm;
|
||||
static pk_VM pk_default_vm;
|
||||
|
||||
void py_initialize() {
|
||||
Pools_initialize();
|
||||
pk_MemoryPools__initialize();
|
||||
pk_StrName__initialize();
|
||||
pk_Compiler__initialize();
|
||||
pk_current_vm = &pk_default_vm;
|
||||
pk_VM__ctor(&pk_default_vm);
|
||||
}
|
||||
@ -17,14 +20,20 @@ void py_initialize() {
|
||||
void py_finalize() {
|
||||
pk_VM__dtor(&pk_default_vm);
|
||||
pk_current_vm = NULL;
|
||||
pk_Compiler__finalize();
|
||||
pk_StrName__finalize();
|
||||
Pools_finalize();
|
||||
pk_MemoryPools__finalize();
|
||||
}
|
||||
|
||||
int py_exec(const char* source) {
|
||||
CodeObject* co = NULL;
|
||||
pk_SourceData_ src = pk_SourceData__rcnew(source, "main.py", EXEC_MODE, false);
|
||||
CodeObject co;
|
||||
Error* err = pk_compile(src, &co);
|
||||
PK_DECREF(src);
|
||||
if(err) abort();
|
||||
|
||||
pk_VM* vm = pk_current_vm;
|
||||
Frame* frame = Frame__new(co, &vm->main, NULL, vm->stack.sp, vm->stack.sp, co);
|
||||
Frame* frame = Frame__new(&co, &vm->main, NULL, vm->stack.sp, vm->stack.sp, &co);
|
||||
pk_VM__push_frame(vm, frame);
|
||||
pk_FrameResult res = pk_VM__run_top_frame(vm);
|
||||
if(res == RES_ERROR) return vm->last_error->type;
|
||||
|
116
src2/main.cpp
116
src2/main.cpp
@ -1,116 +0,0 @@
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#if __has_include("pocketpy_c.h")
|
||||
#include "pocketpy_c.h"
|
||||
#else
|
||||
// for amalgamated build
|
||||
#include "pocketpy.h"
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
#include <windows.h>
|
||||
|
||||
std::string pkpy_platform_getline(bool* eof) {
|
||||
HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
|
||||
std::wstringstream wss;
|
||||
WCHAR buf;
|
||||
DWORD read;
|
||||
while(ReadConsoleW(hStdin, &buf, 1, &read, NULL) && buf != L'\n') {
|
||||
if(eof && buf == L'\x1A') *eof = true; // Ctrl+Z
|
||||
wss << buf;
|
||||
}
|
||||
std::wstring wideInput = wss.str();
|
||||
int length = WideCharToMultiByte(CP_UTF8, 0, wideInput.c_str(), (int)wideInput.length(), NULL, 0, NULL, NULL);
|
||||
std::string output;
|
||||
output.resize(length);
|
||||
WideCharToMultiByte(CP_UTF8, 0, wideInput.c_str(), (int)wideInput.length(), &output[0], length, NULL, NULL);
|
||||
if(!output.empty() && output.back() == '\r') output.pop_back();
|
||||
return output;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
std::string pkpy_platform_getline(bool* eof) {
|
||||
std::string output;
|
||||
if(!std::getline(std::cin, output)) {
|
||||
if(eof) *eof = true;
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
using namespace pkpy;
|
||||
|
||||
static int f_input(pkpy_vm* vm) {
|
||||
if(!pkpy_is_none(vm, -1)) {
|
||||
pkpy_CString prompt;
|
||||
bool ok = pkpy_to_string(vm, -1, &prompt);
|
||||
if(!ok) return 0;
|
||||
std::cout << prompt << std::flush;
|
||||
}
|
||||
bool eof;
|
||||
std::string output = pkpy_platform_getline(&eof);
|
||||
pkpy_push_string(vm, pkpy_string(output.c_str()));
|
||||
return 1;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
#if _WIN32
|
||||
SetConsoleCP(CP_UTF8);
|
||||
SetConsoleOutputCP(CP_UTF8);
|
||||
#endif
|
||||
pkpy_vm* vm = pkpy_new_vm(true);
|
||||
|
||||
pkpy_push_function(vm, "input(prompt=None) -> str", f_input);
|
||||
pkpy_py_import(vm, "builtins");
|
||||
pkpy_setattr(vm, pkpy_name("input"));
|
||||
|
||||
if(argc == 1) {
|
||||
void* repl = pkpy_new_repl(vm);
|
||||
bool need_more_lines = false;
|
||||
while(true) {
|
||||
std::cout << (need_more_lines ? "... " : ">>> ");
|
||||
bool eof = false;
|
||||
std::string line = pkpy_platform_getline(&eof);
|
||||
if(eof) break;
|
||||
need_more_lines = pkpy_repl_input(repl, line.c_str());
|
||||
}
|
||||
pkpy_delete_vm(vm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(argc == 2) {
|
||||
std::string argv_1 = argv[1];
|
||||
if(argv_1 == "-h" || argv_1 == "--help") goto __HELP;
|
||||
|
||||
std::filesystem::path filepath(argv[1]);
|
||||
filepath = std::filesystem::absolute(filepath);
|
||||
if(!std::filesystem::exists(filepath)) {
|
||||
std::cerr << "File not found: " << argv_1 << std::endl;
|
||||
return 2;
|
||||
}
|
||||
std::ifstream file(filepath);
|
||||
if(!file.is_open()) {
|
||||
std::cerr << "Failed to open file: " << argv_1 << std::endl;
|
||||
return 3;
|
||||
}
|
||||
std::string src((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
|
||||
file.close();
|
||||
|
||||
pkpy_set_main_argv(vm, argc, argv);
|
||||
|
||||
bool ok = pkpy_exec_2(vm, src.c_str(), filepath.filename().string().c_str(), 0, NULL);
|
||||
if(!ok) pkpy_clear_error(vm, NULL);
|
||||
pkpy_delete_vm(vm);
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
__HELP:
|
||||
std::cout << "Usage: pocketpy [filename]" << std::endl;
|
||||
return 0;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user