update lexer

This commit is contained in:
blueloveTH 2023-03-29 23:33:45 +08:00
parent 120773891a
commit e78aa44895
11 changed files with 730 additions and 630 deletions

View File

@ -6,8 +6,8 @@ with open("src/opcodes.h", "rt", encoding='utf-8') as f:
OPCODES_TEXT = f.read() OPCODES_TEXT = f.read()
pipeline = [ pipeline = [
["common.h", "memory.h", "str.h", "tuplelist.h", "namedict.h", "error.h"], ["common.h", "memory.h", "str.h", "tuplelist.h", "namedict.h", "error.h", "lexer.h"],
["obj.h", "parser.h", "codeobject.h", "frame.h"], ["obj.h", "codeobject.h", "frame.h"],
["gc.h", "vm.h", "ref.h", "ceval.h", "compiler.h", "repl.h"], ["gc.h", "vm.h", "ref.h", "ceval.h", "compiler.h", "repl.h"],
["iter.h", "cffi.h", "io.h", "_generated.h", "pocketpy.h"] ["iter.h", "cffi.h", "io.h", "_generated.h", "pocketpy.h"]
] ]

View File

@ -7,7 +7,7 @@ namespace pkpy{
inline PyObject* VM::run_frame(Frame* frame){ inline PyObject* VM::run_frame(Frame* frame){
while(frame->has_next_bytecode()){ while(frame->has_next_bytecode()){
// heap._auto_collect(this); heap._auto_collect(this);
const Bytecode& byte = frame->next_bytecode(); const Bytecode& byte = frame->next_bytecode();
switch (byte.op) switch (byte.op)
@ -325,7 +325,7 @@ inline PyObject* VM::run_frame(Frame* frame){
if(frame->_data.size() != 1) throw std::runtime_error("_data.size() != 1 in EVAL/JSON_MODE"); if(frame->_data.size() != 1) throw std::runtime_error("_data.size() != 1 in EVAL/JSON_MODE");
return frame->pop_value(this); return frame->pop_value(this);
} }
#if PK_EXTRA_CHECK #if DEBUG_EXTRA_CHECK
if(!frame->_data.empty()) throw std::runtime_error("_data.size() != 0 in EXEC_MODE"); if(!frame->_data.empty()) throw std::runtime_error("_data.size() != 0 in EXEC_MODE");
#endif #endif
return None; return None;

View File

@ -10,7 +10,6 @@
#include <sstream> #include <sstream>
#include <regex> #include <regex>
#include <cmath> #include <cmath>
#include <cstdlib>
#include <stdexcept> #include <stdexcept>
#include <vector> #include <vector>
#include <string> #include <string>
@ -26,10 +25,13 @@
#include <algorithm> #include <algorithm>
#include <random> #include <random>
#include <initializer_list> #include <initializer_list>
#include <list> #include <variant>
#define PK_VERSION "0.9.5" #define PK_VERSION "0.9.6"
#define PK_EXTRA_CHECK 0
// debug macros
#define DEBUG_NO_BUILTIN_MODULES 0
#define DEBUG_EXTRA_CHECK 1
#if (defined(__ANDROID__) && __ANDROID_API__ <= 22) || defined(__EMSCRIPTEN__) #if (defined(__ANDROID__) && __ANDROID_API__ <= 22) || defined(__EMSCRIPTEN__)
#define PK_ENABLE_FILEIO 0 #define PK_ENABLE_FILEIO 0
@ -40,13 +42,13 @@
#if defined(__EMSCRIPTEN__) || defined(__arm__) || defined(__i386__) #if defined(__EMSCRIPTEN__) || defined(__arm__) || defined(__i386__)
typedef int32_t i64; typedef int32_t i64;
typedef float f64; typedef float f64;
#define S_TO_INT std::stoi #define S_TO_INT(...) static_cast<i64>(std::stoi(__VA_ARGS__))
#define S_TO_FLOAT std::stof #define S_TO_FLOAT(...) static_cast<f64>(std::stof(__VA_ARGS__))
#else #else
typedef int64_t i64; typedef int64_t i64;
typedef double f64; typedef double f64;
#define S_TO_INT std::stoll #define S_TO_INT(...) static_cast<i64>(std::stoll(__VA_ARGS__))
#define S_TO_FLOAT std::stod #define S_TO_FLOAT(...) static_cast<f64>(std::stod(__VA_ARGS__))
#endif #endif
namespace pkpy{ namespace pkpy{
@ -100,22 +102,6 @@ inline bool is_both_int(PyObject* a, PyObject* b) noexcept {
return is_int(a) && is_int(b); return is_int(a) && is_int(b);
} }
template <typename T>
class queue{
std::list<T> list;
public:
void push(const T& t){ list.push_back(t); }
void push(T&& t){ list.push_back(std::move(t)); }
void pop(){ list.pop_front(); }
void clear(){ list.clear(); }
bool empty() const { return list.empty(); }
size_t size() const { return list.size(); }
T& front(){ return list.front(); }
const T& front() const { return list.front(); }
const std::list<T>& data() const { return list; }
};
template <typename T> template <typename T>
class stack{ class stack{
std::vector<T> vec; std::vector<T> vec;

View File

@ -2,7 +2,7 @@
#include "codeobject.h" #include "codeobject.h"
#include "common.h" #include "common.h"
#include "parser.h" #include "lexer.h"
#include "error.h" #include "error.h"
#include "ceval.h" #include "ceval.h"
@ -18,24 +18,21 @@ struct GrammarRule{
Precedence precedence; Precedence precedence;
}; };
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
class Compiler { class Compiler {
std::unique_ptr<Parser> parser; std::unique_ptr<Lexer> lexer;
stack<CodeObject_> codes; stack<CodeObject_> codes;
int lexing_count = 0;
bool used = false; bool used = false;
VM* vm; VM* vm;
std::map<TokenIndex, GrammarRule> rules; std::map<TokenIndex, GrammarRule> rules;
CodeObject_ co() const{ return codes.top(); } CodeObject_ co() const{ return codes.top(); }
CompileMode mode() const{ return parser->src->mode; } CompileMode mode() const{ return lexer->src->mode; }
NameScope name_scope() const { return codes.size()>1 ? NAME_LOCAL : NAME_GLOBAL; } NameScope name_scope() const { return codes.size()>1 ? NAME_LOCAL : NAME_GLOBAL; }
public: public:
Compiler(VM* vm, const char* source, Str filename, CompileMode mode){ Compiler(VM* vm, const char* source, Str filename, CompileMode mode){
this->vm = vm; this->vm = vm;
this->parser = std::make_unique<Parser>( this->lexer = std::make_unique<Lexer>(
make_sp<SourceData>(source, filename, mode) make_sp<SourceData>(source, filename, mode)
); );
@ -104,239 +101,36 @@ public:
} }
private: private:
Str eat_string_until(char quote, bool raw) { int i = 0;
bool quote3 = parser->match_n_chars(2, quote); std::vector<Token> tokens;
std::vector<char> buff;
while (true) {
char c = parser->eatchar_include_newline();
if (c == quote){
if(quote3 && !parser->match_n_chars(2, quote)){
buff.push_back(c);
continue;
}
break;
}
if (c == '\0'){
if(quote3 && parser->src->mode == REPL_MODE){
throw NeedMoreLines(false);
}
SyntaxError("EOL while scanning string literal");
}
if (c == '\n'){
if(!quote3) SyntaxError("EOL while scanning string literal");
else{
buff.push_back(c);
continue;
}
}
if (!raw && c == '\\') {
switch (parser->eatchar_include_newline()) {
case '"': buff.push_back('"'); break;
case '\'': buff.push_back('\''); break;
case '\\': buff.push_back('\\'); break;
case 'n': buff.push_back('\n'); break;
case 'r': buff.push_back('\r'); break;
case 't': buff.push_back('\t'); break;
default: SyntaxError("invalid escape char");
}
} else {
buff.push_back(c);
}
}
return Str(buff.data(), buff.size());
}
void eat_string(char quote, StringType type) { const Token& prev() { return tokens.at(i-1); }
Str s = eat_string_until(quote, type == RAW_STRING); const Token& curr() { return tokens.at(i); }
if(type == F_STRING){ const Token& next() { return tokens.at(i+1); }
parser->set_next_token(TK("@fstr"), VAR(s)); const Token& peek(int offset=0) { return tokens.at(i+offset); }
}else{ void advance() { i++; }
parser->set_next_token(TK("@str"), VAR(s));
}
}
void eat_number() {
static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?");
std::smatch m;
const char* i = parser->token_start;
while(*i != '\n' && *i != '\0') i++;
std::string s = std::string(parser->token_start, i);
try{
if (std::regex_search(s, m, pattern)) {
// here is m.length()-1, since the first char was eaten by lex_token()
for(int j=0; j<m.length()-1; j++) parser->eatchar();
int base = 10;
size_t size;
if (m[1].matched) base = 16;
if (m[2].matched) {
if(base == 16) SyntaxError("hex literal should not contain a dot");
parser->set_next_token(TK("@num"), VAR(S_TO_FLOAT(m[0], &size)));
} else {
parser->set_next_token(TK("@num"), VAR(S_TO_INT(m[0], &size, base)));
}
if (size != m.length()) UNREACHABLE();
}
}catch(std::exception& _){
SyntaxError("invalid number literal");
}
}
void lex_token(){
lexing_count++;
_lex_token();
lexing_count--;
}
// Lex the next token and set it as the next token.
void _lex_token() {
parser->prev = parser->curr;
parser->curr = parser->next_token();
//std::cout << parser->curr.info() << std::endl;
while (parser->peekchar() != '\0') {
parser->token_start = parser->curr_char;
char c = parser->eatchar_include_newline();
switch (c) {
case '\'': case '"': eat_string(c, NORMAL_STRING); return;
case '#': parser->skip_line_comment(); break;
case '{': parser->set_next_token(TK("{")); return;
case '}': parser->set_next_token(TK("}")); return;
case ',': parser->set_next_token(TK(",")); return;
case ':': parser->set_next_token_2(':', TK(":"), TK("::")); return;
case ';': parser->set_next_token(TK(";")); return;
case '(': parser->set_next_token(TK("(")); return;
case ')': parser->set_next_token(TK(")")); return;
case '[': parser->set_next_token(TK("[")); return;
case ']': parser->set_next_token(TK("]")); return;
case '@': parser->set_next_token(TK("@")); return;
case '%': parser->set_next_token_2('=', TK("%"), TK("%=")); return;
case '&': parser->set_next_token_2('=', TK("&"), TK("&=")); return;
case '|': parser->set_next_token_2('=', TK("|"), TK("|=")); return;
case '^': parser->set_next_token_2('=', TK("^"), TK("^=")); return;
case '?': parser->set_next_token(TK("?")); return;
case '.': {
if(parser->matchchar('.')) {
if(parser->matchchar('.')) {
parser->set_next_token(TK("..."));
} else {
SyntaxError("invalid token '..'");
}
} else {
parser->set_next_token(TK("."));
}
return;
}
case '=': parser->set_next_token_2('=', TK("="), TK("==")); return;
case '+': parser->set_next_token_2('=', TK("+"), TK("+=")); return;
case '>': {
if(parser->matchchar('=')) parser->set_next_token(TK(">="));
else if(parser->matchchar('>')) parser->set_next_token_2('=', TK(">>"), TK(">>="));
else parser->set_next_token(TK(">"));
return;
}
case '<': {
if(parser->matchchar('=')) parser->set_next_token(TK("<="));
else if(parser->matchchar('<')) parser->set_next_token_2('=', TK("<<"), TK("<<="));
else parser->set_next_token(TK("<"));
return;
}
case '-': {
if(parser->matchchar('=')) parser->set_next_token(TK("-="));
else if(parser->matchchar('>')) parser->set_next_token(TK("->"));
else parser->set_next_token(TK("-"));
return;
}
case '!':
if(parser->matchchar('=')) parser->set_next_token(TK("!="));
else SyntaxError("expected '=' after '!'");
break;
case '*':
if (parser->matchchar('*')) {
parser->set_next_token(TK("**")); // '**'
} else {
parser->set_next_token_2('=', TK("*"), TK("*="));
}
return;
case '/':
if(parser->matchchar('/')) {
parser->set_next_token_2('=', TK("//"), TK("//="));
} else {
parser->set_next_token_2('=', TK("/"), TK("/="));
}
return;
case '\r': break; // just ignore '\r'
case ' ': case '\t': parser->eat_spaces(); break;
case '\n': {
parser->set_next_token(TK("@eol"));
if(!parser->eat_indentation()) IndentationError("unindent does not match any outer indentation level");
return;
}
default: {
if(c == 'f'){
if(parser->matchchar('\'')) {eat_string('\'', F_STRING); return;}
if(parser->matchchar('"')) {eat_string('"', F_STRING); return;}
}else if(c == 'r'){
if(parser->matchchar('\'')) {eat_string('\'', RAW_STRING); return;}
if(parser->matchchar('"')) {eat_string('"', RAW_STRING); return;}
}
if (c >= '0' && c <= '9') {
eat_number();
return;
}
switch (parser->eat_name())
{
case 0: break;
case 1: SyntaxError("invalid char: " + std::string(1, c));
case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c));
case 3: SyntaxError("@id contains invalid char"); break;
case 4: SyntaxError("invalid JSON token"); break;
default: UNREACHABLE();
}
return;
}
}
}
parser->token_start = parser->curr_char;
parser->set_next_token(TK("@eof"));
}
TokenIndex peek() {
return parser->curr.type;
}
// not sure this will work
TokenIndex peek_next() {
if(parser->nexts.empty()) return TK("@eof");
return parser->nexts.front().type;
}
bool match(TokenIndex expected) { bool match(TokenIndex expected) {
if (peek() != expected) return false; if (curr().type != expected) return false;
lex_token(); advance();
return true; return true;
} }
void consume(TokenIndex expected) { void consume(TokenIndex expected) {
if (!match(expected)){ if (!match(expected)){
StrStream ss; StrStream ss;
ss << "expected '" << TK_STR(expected) << "', but got '" << TK_STR(peek()) << "'"; ss << "expected '" << TK_STR(expected) << "', but got '" << TK_STR(curr().type) << "'";
SyntaxError(ss.str()); SyntaxError(ss.str());
} }
} }
bool match_newlines(bool repl_throw=false) { bool match_newlines(bool repl_throw=false) {
bool consumed = false; bool consumed = false;
if (peek() == TK("@eol")) { if (curr().type == TK("@eol")) {
while (peek() == TK("@eol")) lex_token(); while (curr().type == TK("@eol")) advance();
consumed = true; consumed = true;
} }
if (repl_throw && peek() == TK("@eof")){ if (repl_throw && curr().type == TK("@eof")){
throw NeedMoreLines(co()->_is_compiling_class); throw NeedMoreLines(co()->_is_compiling_class);
} }
return consumed; return consumed;
@ -344,8 +138,8 @@ private:
bool match_end_stmt() { bool match_end_stmt() {
if (match(TK(";"))) { match_newlines(); return true; } if (match(TK(";"))) { match_newlines(); return true; }
if (match_newlines() || peek()==TK("@eof")) return true; if (match_newlines() || curr().type == TK("@eof")) return true;
if (peek() == TK("@dedent")) return true; if (curr().type == TK("@dedent")) return true;
return false; return false;
} }
@ -353,15 +147,27 @@ private:
if (!match_end_stmt()) SyntaxError("expected statement end"); if (!match_end_stmt()) SyntaxError("expected statement end");
} }
// Convert a literal token's payload into a runtime PyObject*.
// Only @num, @str and @fstr tokens carry a value (a std::variant); any
// other token type is a caller bug and raises std::runtime_error.
PyObject* get_value(const Token& token) {
switch (token.type) {
case TK("@num"):
// numeric literals store either an i64 or an f64 alternative
if(std::holds_alternative<i64>(token.value)) return VAR(std::get<i64>(token.value));
if(std::holds_alternative<f64>(token.value)) return VAR(std::get<f64>(token.value));
UNREACHABLE();
case TK("@str"): case TK("@fstr"):
// string / f-string literals store a Str alternative
return VAR(std::get<Str>(token.value));
default: throw std::runtime_error(Str("invalid token type: ") + TK_STR(token.type));
}
}
void exprLiteral() { void exprLiteral() {
PyObject* value = parser->prev.value; PyObject* value = get_value(prev());
int index = co()->add_const(value); int index = co()->add_const(value);
emit(OP_LOAD_CONST, index); emit(OP_LOAD_CONST, index);
} }
void exprFString() { void exprFString() {
static const std::regex pattern(R"(\{(.*?)\})"); static const std::regex pattern(R"(\{(.*?)\})");
PyObject* value = parser->prev.value; PyObject* value = get_value(prev());
Str s = CAST(Str, value); Str s = CAST(Str, value);
std::sregex_iterator begin(s.begin(), s.end(), pattern); std::sregex_iterator begin(s.begin(), s.end(), pattern);
std::sregex_iterator end; std::sregex_iterator end;
@ -395,7 +201,7 @@ private:
_compile_f_args(func, false); _compile_f_args(func, false);
consume(TK(":")); consume(TK(":"));
} }
func.code = make_sp<CodeObject>(parser->src, func.name.str()); func.code = make_sp<CodeObject>(lexer->src, func.name.str());
this->codes.push(func.code); this->codes.push(func.code);
co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1; co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1;
emit(OP_RETURN_VALUE); emit(OP_RETURN_VALUE);
@ -414,7 +220,7 @@ private:
if(is_load_name_ref) co()->codes.pop_back(); if(is_load_name_ref) co()->codes.pop_back();
co()->_rvalue += 1; co()->_rvalue += 1;
TokenIndex op = parser->prev.type; TokenIndex op = prev().type;
if(op == TK("=")) { // a = (expr) if(op == TK("=")) { // a = (expr)
EXPR_TUPLE(); EXPR_TUPLE();
if(is_load_name_ref){ if(is_load_name_ref){
@ -487,7 +293,7 @@ private:
} }
void exprBinaryOp() { void exprBinaryOp() {
TokenIndex op = parser->prev.type; TokenIndex op = prev().type;
parse_expression((Precedence)(rules[op].precedence + 1)); parse_expression((Precedence)(rules[op].precedence + 1));
switch (op) { switch (op) {
@ -525,7 +331,7 @@ private:
} }
void exprUnaryOp() { void exprUnaryOp() {
TokenIndex op = parser->prev.type; TokenIndex op = prev().type;
parse_expression((Precedence)(PREC_UNARY + 1)); parse_expression((Precedence)(PREC_UNARY + 1));
switch (op) { switch (op) {
case TK("-"): emit(OP_UNARY_NEGATIVE); break; case TK("-"): emit(OP_UNARY_NEGATIVE); break;
@ -588,7 +394,7 @@ private:
int ARGC = 0; int ARGC = 0;
do { do {
match_newlines(mode()==REPL_MODE); match_newlines(mode()==REPL_MODE);
if (peek() == TK("]")) break; if (curr().type == TK("]")) break;
EXPR(); ARGC++; EXPR(); ARGC++;
match_newlines(mode()==REPL_MODE); match_newlines(mode()==REPL_MODE);
if(ARGC == 1 && match(TK("for"))){ if(ARGC == 1 && match(TK("for"))){
@ -609,9 +415,9 @@ private:
int ARGC = 0; int ARGC = 0;
do { do {
match_newlines(mode()==REPL_MODE); match_newlines(mode()==REPL_MODE);
if (peek() == TK("}")) break; if (curr().type == TK("}")) break;
EXPR(); EXPR();
if(peek() == TK(":")) parsing_dict = true; if(curr().type == TK(":")) parsing_dict = true;
if(parsing_dict){ if(parsing_dict){
consume(TK(":")); consume(TK(":"));
EXPR(); EXPR();
@ -637,10 +443,10 @@ private:
bool need_unpack = false; bool need_unpack = false;
do { do {
match_newlines(mode()==REPL_MODE); match_newlines(mode()==REPL_MODE);
if (peek() == TK(")")) break; if (curr().type == TK(")")) break;
if(peek() == TK("@id") && peek_next() == TK("=")) { if(curr().type == TK("@id") && next().type == TK("=")) {
consume(TK("@id")); consume(TK("@id"));
const Str& key = parser->prev.str(); const Str& key = prev().str();
emit(OP_LOAD_CONST, co()->add_const(VAR(key))); emit(OP_LOAD_CONST, co()->add_const(VAR(key)));
consume(TK("=")); consume(TK("="));
co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1; co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1;
@ -666,7 +472,7 @@ private:
void exprName(){ _exprName(false); } void exprName(){ _exprName(false); }
void _exprName(bool force_lvalue) { void _exprName(bool force_lvalue) {
Token tkname = parser->prev; const Token& tkname = prev();
int index = co()->add_name(tkname.str(), name_scope()); int index = co()->add_name(tkname.str(), name_scope());
bool fast_load = !force_lvalue && co()->_rvalue>0; bool fast_load = !force_lvalue && co()->_rvalue>0;
emit(fast_load ? OP_LOAD_NAME : OP_LOAD_NAME_REF, index); emit(fast_load ? OP_LOAD_NAME : OP_LOAD_NAME_REF, index);
@ -674,7 +480,7 @@ private:
void exprAttrib() { void exprAttrib() {
consume(TK("@id")); consume(TK("@id"));
const Str& name = parser->prev.str(); const Str& name = prev().str();
int index = co()->add_name(name, NAME_ATTR); int index = co()->add_name(name, NAME_ATTR);
emit(co()->_rvalue ? OP_BUILD_ATTR : OP_BUILD_ATTR_REF, index); emit(co()->_rvalue ? OP_BUILD_ATTR : OP_BUILD_ATTR_REF, index);
} }
@ -710,7 +516,7 @@ private:
} }
void exprValue() { void exprValue() {
TokenIndex op = parser->prev.type; TokenIndex op = prev().type;
switch (op) { switch (op) {
case TK("None"): emit(OP_LOAD_NONE); break; case TK("None"): emit(OP_LOAD_NONE); break;
case TK("True"): emit(OP_LOAD_TRUE); break; case TK("True"): emit(OP_LOAD_TRUE); break;
@ -721,7 +527,7 @@ private:
} }
int emit(Opcode opcode, int arg=-1, bool keepline=false) { int emit(Opcode opcode, int arg=-1, bool keepline=false) {
int line = parser->prev.line; int line = prev().line;
co()->codes.push_back( co()->codes.push_back(
Bytecode{(uint8_t)opcode, (uint16_t)co()->_curr_block_i, arg, line} Bytecode{(uint8_t)opcode, (uint16_t)co()->_curr_block_i, arg, line}
); );
@ -738,7 +544,7 @@ private:
void compile_block_body(CompilerAction action=nullptr) { void compile_block_body(CompilerAction action=nullptr) {
if(action == nullptr) action = &Compiler::compile_stmt; if(action == nullptr) action = &Compiler::compile_stmt;
consume(TK(":")); consume(TK(":"));
if(peek()!=TK("@eol") && peek()!=TK("@eof")){ if(curr().type!=TK("@eol") && curr().type!=TK("@eof")){
(this->*action)(); // inline block (this->*action)(); // inline block
return; return;
} }
@ -746,7 +552,7 @@ private:
SyntaxError("expected a new line after ':'"); SyntaxError("expected a new line after ':'");
} }
consume(TK("@indent")); consume(TK("@indent"));
while (peek() != TK("@dedent")) { while (curr().type != TK("@dedent")) {
match_newlines(); match_newlines();
(this->*action)(); (this->*action)();
match_newlines(); match_newlines();
@ -756,7 +562,7 @@ private:
Token _compile_import() { Token _compile_import() {
consume(TK("@id")); consume(TK("@id"));
Token tkmodule = parser->prev; Token tkmodule = prev();
int index = co()->add_name(tkmodule.str(), NAME_SPECIAL); int index = co()->add_name(tkmodule.str(), NAME_SPECIAL);
emit(OP_IMPORT_NAME, index); emit(OP_IMPORT_NAME, index);
return tkmodule; return tkmodule;
@ -768,7 +574,7 @@ private:
Token tkmodule = _compile_import(); Token tkmodule = _compile_import();
if (match(TK("as"))) { if (match(TK("as"))) {
consume(TK("@id")); consume(TK("@id"));
tkmodule = parser->prev; tkmodule = prev();
} }
int index = co()->add_name(tkmodule.str(), name_scope()); int index = co()->add_name(tkmodule.str(), name_scope());
emit(OP_STORE_NAME, index); emit(OP_STORE_NAME, index);
@ -789,12 +595,12 @@ private:
do { do {
emit(OP_DUP_TOP_VALUE); emit(OP_DUP_TOP_VALUE);
consume(TK("@id")); consume(TK("@id"));
Token tkname = parser->prev; Token tkname = prev();
int index = co()->add_name(tkname.str(), NAME_ATTR); int index = co()->add_name(tkname.str(), NAME_ATTR);
emit(OP_BUILD_ATTR, index); emit(OP_BUILD_ATTR, index);
if (match(TK("as"))) { if (match(TK("as"))) {
consume(TK("@id")); consume(TK("@id"));
tkname = parser->prev; tkname = prev();
} }
index = co()->add_name(tkname.str(), name_scope()); index = co()->add_name(tkname.str(), name_scope());
emit(OP_STORE_NAME, index); emit(OP_STORE_NAME, index);
@ -807,14 +613,14 @@ private:
// ['a', '1', '2', '+', '='] // ['a', '1', '2', '+', '=']
// //
void parse_expression(Precedence precedence) { void parse_expression(Precedence precedence) {
lex_token(); advance();
GrammarFn prefix = rules[parser->prev.type].prefix; GrammarFn prefix = rules[prev().type].prefix;
if (prefix == nullptr) SyntaxError(Str("expected an expression, but got ") + TK_STR(parser->prev.type)); if (prefix == nullptr) SyntaxError(Str("expected an expression, but got ") + TK_STR(prev().type));
(this->*prefix)(); (this->*prefix)();
bool meet_assign_token = false; bool meet_assign_token = false;
while (rules[peek()].precedence >= precedence) { while (rules[curr().type].precedence >= precedence) {
lex_token(); advance();
TokenIndex op = parser->prev.type; TokenIndex op = prev().type;
if (op == TK("=")){ if (op == TK("=")){
if(meet_assign_token) SyntaxError(); if(meet_assign_token) SyntaxError();
meet_assign_token = true; meet_assign_token = true;
@ -891,7 +697,7 @@ private:
do { do {
consume(TK("except")); consume(TK("except"));
if(match(TK("@id"))){ if(match(TK("@id"))){
int name_idx = co()->add_name(parser->prev.str(), NAME_SPECIAL); int name_idx = co()->add_name(prev().str(), NAME_SPECIAL);
emit(OP_EXCEPTION_MATCH, name_idx); emit(OP_EXCEPTION_MATCH, name_idx);
}else{ }else{
emit(OP_LOAD_TRUE); emit(OP_LOAD_TRUE);
@ -901,7 +707,7 @@ private:
compile_block_body(); compile_block_body();
patches.push_back(emit(OP_JUMP_ABSOLUTE)); patches.push_back(emit(OP_JUMP_ABSOLUTE));
patch_jump(patch); patch_jump(patch);
}while(peek() == TK("except")); }while(curr().type == TK("except"));
emit(OP_RE_RAISE); // no match, re-raise emit(OP_RE_RAISE); // no match, re-raise
for (int patch : patches) patch_jump(patch); for (int patch : patches) patch_jump(patch);
} }
@ -968,7 +774,7 @@ private:
EXPR(); EXPR();
consume(TK("as")); consume(TK("as"));
consume(TK("@id")); consume(TK("@id"));
Token tkname = parser->prev; Token tkname = prev();
int index = co()->add_name(tkname.str(), name_scope()); int index = co()->add_name(tkname.str(), name_scope());
emit(OP_STORE_NAME, index); emit(OP_STORE_NAME, index);
emit(OP_LOAD_NAME_REF, index); emit(OP_LOAD_NAME_REF, index);
@ -979,18 +785,18 @@ private:
} else if(match(TK("label"))){ } else if(match(TK("label"))){
if(mode() != EXEC_MODE) SyntaxError("'label' is only available in EXEC_MODE"); if(mode() != EXEC_MODE) SyntaxError("'label' is only available in EXEC_MODE");
consume(TK(".")); consume(TK("@id")); consume(TK(".")); consume(TK("@id"));
Str label = parser->prev.str(); Str label = prev().str();
bool ok = co()->add_label(label); bool ok = co()->add_label(label);
if(!ok) SyntaxError("label '" + label + "' already exists"); if(!ok) SyntaxError("label '" + label + "' already exists");
consume_end_stmt(); consume_end_stmt();
} else if(match(TK("goto"))){ // https://entrian.com/goto/ } else if(match(TK("goto"))){ // https://entrian.com/goto/
if(mode() != EXEC_MODE) SyntaxError("'goto' is only available in EXEC_MODE"); if(mode() != EXEC_MODE) SyntaxError("'goto' is only available in EXEC_MODE");
consume(TK(".")); consume(TK("@id")); consume(TK(".")); consume(TK("@id"));
emit(OP_GOTO, co()->add_name(parser->prev.str(), NAME_SPECIAL)); emit(OP_GOTO, co()->add_name(prev().str(), NAME_SPECIAL));
consume_end_stmt(); consume_end_stmt();
} else if(match(TK("raise"))){ } else if(match(TK("raise"))){
consume(TK("@id")); consume(TK("@id"));
int dummy_t = co()->add_name(parser->prev.str(), NAME_SPECIAL); int dummy_t = co()->add_name(prev().str(), NAME_SPECIAL);
if(match(TK("(")) && !match(TK(")"))){ if(match(TK("(")) && !match(TK(")"))){
EXPR(); consume(TK(")")); EXPR(); consume(TK(")"));
}else{ }else{
@ -1005,7 +811,7 @@ private:
} else if(match(TK("global"))){ } else if(match(TK("global"))){
do { do {
consume(TK("@id")); consume(TK("@id"));
co()->global_names[parser->prev.str()] = 1; co()->global_names[prev().str()] = 1;
} while (match(TK(","))); } while (match(TK(",")));
consume_end_stmt(); consume_end_stmt();
} else if(match(TK("pass"))){ } else if(match(TK("pass"))){
@ -1030,10 +836,10 @@ private:
void compile_class(){ void compile_class(){
consume(TK("@id")); consume(TK("@id"));
int cls_name_idx = co()->add_name(parser->prev.str(), NAME_GLOBAL); int cls_name_idx = co()->add_name(prev().str(), NAME_GLOBAL);
int super_cls_name_idx = -1; int super_cls_name_idx = -1;
if(match(TK("(")) && match(TK("@id"))){ if(match(TK("(")) && match(TK("@id"))){
super_cls_name_idx = co()->add_name(parser->prev.str(), NAME_GLOBAL); super_cls_name_idx = co()->add_name(prev().str(), NAME_GLOBAL);
consume(TK(")")); consume(TK(")"));
} }
if(super_cls_name_idx == -1) emit(OP_LOAD_NONE); if(super_cls_name_idx == -1) emit(OP_LOAD_NONE);
@ -1059,13 +865,13 @@ private:
} }
consume(TK("@id")); consume(TK("@id"));
const Str& name = parser->prev.str(); const Str& name = prev().str();
if(func.has_name(name)) SyntaxError("duplicate argument name"); if(func.has_name(name)) SyntaxError("duplicate argument name");
// eat type hints // eat type hints
if(enable_type_hints && match(TK(":"))) consume(TK("@id")); if(enable_type_hints && match(TK(":"))) consume(TK("@id"));
if(state == 0 && peek() == TK("=")) state = 2; if(state == 0 && curr().type == TK("=")) state = 2;
switch (state) switch (state)
{ {
@ -1075,7 +881,7 @@ private:
consume(TK("=")); consume(TK("="));
PyObject* value = read_literal(); PyObject* value = read_literal();
if(value == nullptr){ if(value == nullptr){
SyntaxError(Str("expect a literal, not ") + TK_STR(parser->curr.type)); SyntaxError(Str("expect a literal, not ") + TK_STR(curr().type));
} }
func.kwargs.set(name, value); func.kwargs.set(name, value);
func.kwargs_order.push_back(name); func.kwargs_order.push_back(name);
@ -1090,11 +896,11 @@ private:
Function func; Function func;
StrName obj_name; StrName obj_name;
consume(TK("@id")); consume(TK("@id"));
func.name = parser->prev.str(); func.name = prev().str();
if(!co()->_is_compiling_class && match(TK("::"))){ if(!co()->_is_compiling_class && match(TK("::"))){
consume(TK("@id")); consume(TK("@id"));
obj_name = func.name; obj_name = func.name;
func.name = parser->prev.str(); func.name = prev().str();
} }
consume(TK("(")); consume(TK("("));
if (!match(TK(")"))) { if (!match(TK(")"))) {
@ -1104,7 +910,7 @@ private:
if(match(TK("->"))){ if(match(TK("->"))){
if(!match(TK("None"))) consume(TK("@id")); if(!match(TK("None"))) consume(TK("@id"));
} }
func.code = make_sp<CodeObject>(parser->src, func.name.str()); func.code = make_sp<CodeObject>(lexer->src, func.name.str());
this->codes.push(func.code); this->codes.push(func.code);
compile_block_body(); compile_block_body();
func.code->optimize(vm); func.code->optimize(vm);
@ -1132,11 +938,11 @@ private:
PyObject* read_literal(){ PyObject* read_literal(){
if(match(TK("-"))){ if(match(TK("-"))){
consume(TK("@num")); consume(TK("@num"));
PyObject* val = parser->prev.value; PyObject* val = get_value(prev());
return vm->num_negated(val); return vm->num_negated(val);
} }
if(match(TK("@num"))) return parser->prev.value; if(match(TK("@num"))) return get_value(prev());
if(match(TK("@str"))) return parser->prev.value; if(match(TK("@str"))) return get_value(prev());
if(match(TK("True"))) return VAR(true); if(match(TK("True"))) return VAR(true);
if(match(TK("False"))) return VAR(false); if(match(TK("False"))) return VAR(false);
if(match(TK("None"))) return vm->None; if(match(TK("None"))) return vm->None;
@ -1144,23 +950,8 @@ private:
return nullptr; return nullptr;
} }
/***** Error Reporter *****/ void SyntaxError(Str msg){ lexer->throw_err("SyntaxError", msg, curr().line, curr().start); }
void throw_err(Str type, Str msg){ void SyntaxError(){ lexer->throw_err("SyntaxError", "invalid syntax", curr().line, curr().start); }
int lineno = parser->curr.line;
const char* cursor = parser->curr.start;
// if error occurs in lexing, lineno should be `parser->current_line`
if(lexing_count > 0){
lineno = parser->current_line;
cursor = parser->curr_char;
}
if(parser->peekchar() == '\n') lineno--;
auto e = Exception("SyntaxError", msg);
e.st_push(parser->src->snapshot(lineno, cursor));
throw e;
}
void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
void IndentationError(Str msg){ throw_err("IndentationError", msg); }
public: public:
CodeObject_ compile(){ CodeObject_ compile(){
@ -1168,11 +959,16 @@ public:
if(used) UNREACHABLE(); if(used) UNREACHABLE();
used = true; used = true;
CodeObject_ code = make_sp<CodeObject>(parser->src, Str("<module>")); tokens = lexer->run();
// if(lexer->src->filename == "tests/01_int.py"){
// for(auto& t: tokens) std::cout << t.info() << std::endl;
// }
CodeObject_ code = make_sp<CodeObject>(lexer->src, lexer->src->filename);
codes.push(code); codes.push(code);
lex_token(); lex_token(); advance(); // skip @sof, so prev() is always valid
match_newlines(); match_newlines(); // skip leading '\n'
if(mode()==EVAL_MODE) { if(mode()==EVAL_MODE) {
EXPR_TUPLE(); EXPR_TUPLE();

108
src/expr.h Normal file
View File

@ -0,0 +1,108 @@
#pragma once

#include "codeobject.h"
#include "common.h"
// NOTE(review): this commit renames parser.h -> lexer.h everywhere else
// (compiler.h include, amalgamation pipeline); use lexer.h here too so the
// single-header build keeps working. Confirm Token/TokenIndex live in lexer.h.
#include "lexer.h"
#include "error.h"
#include "ceval.h"

#include <memory>
#include <vector>   // std::vector is used below; do not rely on transitive includes

namespace pkpy{

struct Expression;
typedef std::unique_ptr<Expression> Expression_;

// Base class of all AST expression nodes.
// Nodes are owned via Expression_ (unique_ptr<Expression>) and deleted
// through the base pointer, so the destructor MUST be virtual.
struct Expression{
    std::vector<Expression_> children;
    virtual ~Expression() = default;
    // Short human-readable tag for debugging/dumping the AST.
    virtual Str to_string() const = 0;
};

// A bare identifier together with the scope it resolves in.
struct NameExpr: Expression{
    Str name;
    NameScope scope;
    NameExpr(Str name, NameScope scope): name(name), scope(scope) {}
    Str to_string() const override { return name; }
};

// A parenthesized sub-expression.
struct GroupExpr: Expression{
    Expression_ expr;
    GroupExpr(Expression_ expr): expr(std::move(expr)) {}
    Str to_string() const override { return "()"; }
};

// A unary operator applied to its single child.
struct UnaryExpr: Expression{
    TokenIndex op;
    UnaryExpr(TokenIndex op): op(op) {}
    Str to_string() const override { return TK_STR(op); }
};

struct NotExpr: Expression{
    Str to_string() const override { return "not"; }
};

struct AndExpr: Expression{
    Str to_string() const override { return "and"; }
};

struct OrExpr: Expression{
    Str to_string() const override { return "or"; }
};

// None, True, False, ...
struct SpecialValueExpr: Expression{
    TokenIndex token;
    SpecialValueExpr(TokenIndex token): token(token) {}
    Str to_string() const override { return TK_STR(token); }
};

// @num, @str which needs to invoke OP_LOAD_CONST
struct LiteralExpr: Expression{
    PyObject* value;
    LiteralExpr(PyObject* value): value(value) {}
    Str to_string() const override { return "literal"; }
};

struct ListExpr: Expression{
    Str to_string() const override { return "[]"; }
};

struct DictExpr: Expression{
    Str to_string() const override { return "{}"; }
};

struct LambdaExpr: Expression{
    Str to_string() const override { return "lambda"; }
};

struct FStringExpr: Expression{
    Str to_string() const override { return "@fstr"; }
};

// Attribute access: `obj.name`.
struct AttribExpr: Expression{
    Str to_string() const override { return "."; }
};

// A call expression: `f(...)`.
struct CallExpr: Expression{
    Str to_string() const override { return "()"; }
};

// A binary operator applied to its two children.
struct BinaryExpr: Expression{
    TokenIndex op;
    BinaryExpr(TokenIndex op): op(op) {}
    Str to_string() const override { return TK_STR(op); }
};

struct TernaryExpr: Expression{
    Str to_string() const override { return "?"; }
};

struct AssignExpr: Expression{
    Str to_string() const override { return "="; }
};

struct CommaExpr: Expression{
    Str to_string() const override { return ","; }
};

} // namespace pkpy

View File

@ -58,7 +58,7 @@ struct Frame {
} }
PyObject* pop(){ PyObject* pop(){
#if PK_EXTRA_CHECK #if DEBUG_EXTRA_CHECK
if(_data.empty()) throw std::runtime_error("_data.empty() is true"); if(_data.empty()) throw std::runtime_error("_data.empty() is true");
#endif #endif
PyObject* v = _data.back(); PyObject* v = _data.back();
@ -67,7 +67,7 @@ struct Frame {
} }
void _pop(){ void _pop(){
#if PK_EXTRA_CHECK #if DEBUG_EXTRA_CHECK
if(_data.empty()) throw std::runtime_error("_data.empty() is true"); if(_data.empty()) throw std::runtime_error("_data.empty() is true");
#endif #endif
_data.pop_back(); _data.pop_back();
@ -88,14 +88,14 @@ struct Frame {
} }
PyObject*& top(){ PyObject*& top(){
#if PK_EXTRA_CHECK #if DEBUG_EXTRA_CHECK
if(_data.empty()) throw std::runtime_error("_data.empty() is true"); if(_data.empty()) throw std::runtime_error("_data.empty() is true");
#endif #endif
return _data.back(); return _data.back();
} }
PyObject*& top_1(){ PyObject*& top_1(){
#if PK_EXTRA_CHECK #if DEBUG_EXTRA_CHECK
if(_data.size() < 2) throw std::runtime_error("_data.size() < 2"); if(_data.size() < 2) throw std::runtime_error("_data.size() < 2");
#endif #endif
return _data[_data.size()-2]; return _data[_data.size()-2];

View File

@ -67,9 +67,9 @@ struct ManagedHeap{
~ManagedHeap(){ ~ManagedHeap(){
for(PyObject* obj: _no_gc) delete obj; for(PyObject* obj: _no_gc) delete obj;
for(auto& [type, count]: deleted){ // for(auto& [type, count]: deleted){
std::cout << "GC: " << type << "=" << count << std::endl; // std::cout << "GC: " << type << "=" << count << std::endl;
} // }
} }
int sweep(VM* vm){ int sweep(VM* vm){

510
src/lexer.h Normal file
View File

@ -0,0 +1,510 @@
#pragma once
#include "common.h"
#include "error.h"
#include "str.h"
namespace pkpy{
// Index into kTokens; uint8_t is sufficient because kTokenCount < 256.
typedef uint8_t TokenIndex;

// Master token-spelling table. ORDER MATTERS: a token's TokenIndex is its
// position in this array, and the keyword range is delimited by the
// KW_BEGIN/KW_END markers below (resolved via kTokenKwBegin/kTokenKwEnd).
constexpr const char* kTokens[] = {
    "@eof", "@eol", "@sof",
    ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%", "::",
    "+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
    "<<", ">>", "&", "|", "^", "?", "@",
    "==", "!=", ">=", "<=",
    "+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=", ">>=", "<<=",
    /** KW_BEGIN **/
    "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
    "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
    "goto", "label",      // extended keywords, not available in cpython
    "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
    /** KW_END **/
    "is not", "not in",
    "@id", "@num", "@str", "@fstr",
    "@indent", "@dedent"
};

// Payload carried by a token: nothing, an integer, a float, or a string.
using TokenValue = std::variant<std::monostate, i64, f64, Str>;
const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);

// Compile-time lookup of a token spelling -> its TokenIndex.
// An unknown spelling reaches UNREACHABLE() (a hard error when evaluated
// in a constexpr context).
constexpr TokenIndex TK(const char token[]) {
    for(int k=0; k<kTokenCount; k++){
        const char* i = kTokens[k];
        const char* j = token;
        while(*i && *j && *i == *j) { i++; j++;}
        if(*i == *j) return k;
    }
    UNREACHABLE();
}

// Spelling of a TokenIndex (reverse of TK).
#define TK_STR(t) kTokens[t]

// Keyword sub-range of kTokens, plus a spelling -> index map the lexer uses
// to recognize keywords in eat_name().
const TokenIndex kTokenKwBegin = TK("class");
const TokenIndex kTokenKwEnd = TK("raise");
const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
    std::map<std::string_view, TokenIndex> map;
    for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
    return map;
}();
// One lexical token: its kind, a view into the source buffer, the line it
// started on, and an optional literal payload.
struct Token{
    TokenIndex type;
    const char* start;      // points into SourceData's buffer (not owned)
    int length;             // byte length of the lexeme
    int line;               // 1-based source line
    TokenValue value;       // literal payload for @num/@str/@fstr, else monostate

    // The raw lexeme as a string.
    Str str() const { return Str(start, length);}

    // Debug description: "<line>: <token-kind> '<lexeme>'".
    Str info() const {
        StrStream ss;
        Str raw = str();
        if (raw == Str("\n")) raw = "\\n";  // keep the dump on one line
        ss << line << ": " << TK_STR(type) << " '" << raw << "'";
        return ss.str();
    }
};
// Operator precedence levels, lowest binding first.
// https://docs.python.org/3/reference/expressions.html
enum Precedence {
  PREC_NONE,
  PREC_ASSIGNMENT,    // =
  PREC_COMMA,         // ,
  PREC_TERNARY,       // ?:
  PREC_LOGICAL_OR,    // or
  PREC_LOGICAL_AND,   // and
  PREC_LOGICAL_NOT,   // not
  PREC_EQUALITY,      // == !=
  PREC_TEST,          // in / is / is not / not in
  PREC_COMPARISION,   // < > <= >=
  PREC_BITWISE_OR,    // |
  PREC_BITWISE_XOR,   // ^
  PREC_BITWISE_AND,   // &
  PREC_BITWISE_SHIFT, // << >>
  PREC_TERM,          // + -
  PREC_FACTOR,        // * / % //
  PREC_UNARY,         // - not
  PREC_EXPONENT,      // **
  PREC_CALL,          // ()
  PREC_SUBSCRIPT,     // []
  PREC_ATTRIB,        // .index
  PREC_PRIMARY,
};

// How a string literal should be scanned: plain, raw (no escapes), or f-string.
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
// Hand-written single-pass lexer: turns a SourceData buffer into a flat
// std::vector<Token>. An instance is single-use; call run() exactly once.
struct Lexer {
    shared_ptr<SourceData> src;
    const char* token_start;    // first char of the token currently being scanned
    const char* curr_char;      // scan cursor into src->source
    int current_line = 1;
    std::vector<Token> nexts;   // tokens emitted so far (starts with @sof)
    stack<int> indents;         // indentation column stack; bottom is always 0
    int brackets_level = 0;     // >0 inside (), [], {}: indentation is ignored
    bool used = false;          // guards against calling run() twice

    char peekchar() const{ return *curr_char; }

    // Consume exactly n consecutive occurrences of c0.
    // On failure nothing is consumed and false is returned.
    bool match_n_chars(int n, char c0){
        const char* c = curr_char;
        for(int i=0; i<n; i++){
            if(*c == '\0') return false;
            if(*c != c0) return false;
            c++;
        }
        for(int i=0; i<n; i++) eatchar_include_newline();
        return true;
    }

    // Consume spaces/tabs and return the column width (a tab counts as 4).
    int eat_spaces(){
        int count = 0;
        while (true) {
            switch (peekchar()) {
                case ' ' : count+=1; break;
                case '\t': count+=4; break;
                default: return count;
            }
            eatchar();
        }
    }

    // Handle leading whitespace at the start of a logical line, emitting
    // @indent/@dedent tokens. Returns false on an inconsistent dedent.
    // https://docs.python.org/3/reference/lexical_analysis.html#indentation
    bool eat_indentation(){
        if(brackets_level > 0) return true;     // no indentation inside brackets
        int spaces = eat_spaces();
        if(peekchar() == '#') skip_line_comment();
        // blank/comment-only lines do not affect indentation
        if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
        if(spaces > indents.top()){
            indents.push(spaces);
            nexts.push_back(Token{TK("@indent"), token_start, 0, current_line});
        } else if(spaces < indents.top()){
            while(spaces < indents.top()){
                indents.pop();
                nexts.push_back(Token{TK("@dedent"), token_start, 0, current_line});
            }
            // dedent must land exactly on an enclosing indentation level
            if(spaces != indents.top()){
                return false;
            }
        }
        return true;
    }

    // Consume one char; newlines must go through eatchar_include_newline()
    // so line bookkeeping stays correct.
    char eatchar() {
        char c = peekchar();
        if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
        curr_char++;
        return c;
    }

    // Consume one char, updating line number and line-start table on '\n'.
    char eatchar_include_newline() {
        char c = peekchar();
        curr_char++;
        if (c == '\n'){
            current_line++;
            src->line_starts.push_back(curr_char);
        }
        return c;
    }

    // Scan an identifier/keyword starting at token_start (the first char was
    // already consumed by lex_one_token, hence the initial curr_char--).
    // Accepts ASCII [A-Za-z0-9_] plus unicode Lo characters.
    // Returns 0 on success, or a nonzero error code mapped to a SyntaxError
    // by the caller (1: invalid char, 2: bad utf8, 3: empty, 4: bad JSON token).
    int eat_name() {
        curr_char--;
        while(true){
            uint8_t c = peekchar();
            int u8bytes = 0;
            // decode the utf-8 sequence length from the lead byte
            if((c & 0b10000000) == 0b00000000) u8bytes = 1;
            else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
            else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
            else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
            else return 1;
            if(u8bytes == 1){
                if(isalpha(c) || c=='_' || isdigit(c)) {
                    curr_char++;
                    continue;
                }else{
                    break;
                }
            }
            // handle multibyte char: decode the code point and accept it only
            // if it is a unicode "Letter, other" character
            std::string u8str(curr_char, u8bytes);
            if(u8str.size() != u8bytes) return 2;
            uint32_t value = 0;
            for(int k=0; k < u8bytes; k++){
                uint8_t b = u8str[k];
                if(k==0){
                    if(u8bytes == 2) value = (b & 0b00011111) << 6;
                    else if(u8bytes == 3) value = (b & 0b00001111) << 12;
                    else if(u8bytes == 4) value = (b & 0b00000111) << 18;
                }else{
                    value |= (b & 0b00111111) << (6*(u8bytes-k-1));
                }
            }
            if(is_unicode_Lo_char(value)) curr_char += u8bytes;
            else break;
        }
        int length = (int)(curr_char - token_start);
        if(length == 0) return 3;
        std::string_view name(token_start, length);
        if(src->mode == JSON_MODE){
            // JSON only allows these three bare words
            if(name == "true"){
                add_token(TK("True"));
            } else if(name == "false"){
                add_token(TK("False"));
            } else if(name == "null"){
                add_token(TK("None"));
            } else {
                return 4;
            }
            return 0;
        }
        if(kTokenKwMap.count(name)){
            // fuse the two-word operators "not in" / "is not" into one token
            if(name == "not"){
                if(strncmp(curr_char, " in", 3) == 0){
                    curr_char += 3;
                    add_token(TK("not in"));
                    return 0;
                }
            }else if(name == "is"){
                if(strncmp(curr_char, " not", 4) == 0){
                    curr_char += 4;
                    add_token(TK("is not"));
                    return 0;
                }
            }
            add_token(kTokenKwMap.at(name));
        } else {
            add_token(TK("@id"));
        }
        return 0;
    }

    // Consume up to (but not including) the next newline or EOF.
    void skip_line_comment() {
        char c;
        while ((c = peekchar()) != '\0') {
            if (c == '\n') return;
            eatchar();
        }
    }

    // Consume c if it is next; return whether it was consumed.
    bool matchchar(char c) {
        if (peekchar() != c) return false;
        eatchar_include_newline();
        return true;
    }

    // Emit a token spanning [token_start, curr_char), tracking bracket depth.
    void add_token(TokenIndex type, TokenValue value={}) {
        switch(type){
            case TK("{"): case TK("["): case TK("("): brackets_level++; break;
            case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
        }
        nexts.push_back( Token{
            type,
            token_start,
            (int)(curr_char - token_start),
            // @eol is attributed to the line it terminates, not the next one
            current_line - ((type == TK("@eol")) ? 1 : 0),
            value
        });
    }

    // Emit `two` if the next char is c, otherwise `one` (e.g. '=' : "+" / "+=").
    void add_token_2(char c, TokenIndex one, TokenIndex two) {
        if (matchchar(c)) add_token(two);
        else add_token(one);
    }

    // Scan a string body after its opening quote, handling triple quotes and
    // (unless raw) backslash escapes. Returns the decoded contents.
    Str eat_string_until(char quote, bool raw) {
        bool quote3 = match_n_chars(2, quote);
        std::vector<char> buff;
        while (true) {
            char c = eatchar_include_newline();
            if (c == quote){
                // inside a triple-quoted string a single quote char is literal
                if(quote3 && !match_n_chars(2, quote)){
                    buff.push_back(c);
                    continue;
                }
                break;
            }
            if (c == '\0'){
                // in the REPL an unterminated triple quote means "keep typing"
                if(quote3 && src->mode == REPL_MODE){
                    throw NeedMoreLines(false);
                }
                SyntaxError("EOL while scanning string literal");
            }
            if (c == '\n'){
                if(!quote3) SyntaxError("EOL while scanning string literal");
                else{
                    buff.push_back(c);
                    continue;
                }
            }
            if (!raw && c == '\\') {
                switch (eatchar_include_newline()) {
                    case '"':  buff.push_back('"');  break;
                    case '\'': buff.push_back('\''); break;
                    case '\\': buff.push_back('\\'); break;
                    case 'n':  buff.push_back('\n'); break;
                    case 'r':  buff.push_back('\r'); break;
                    case 't':  buff.push_back('\t'); break;
                    default: SyntaxError("invalid escape char");
                }
            } else {
                buff.push_back(c);
            }
        }
        return Str(buff.data(), buff.size());
    }

    // Scan a string literal and emit @str or @fstr with its decoded value.
    void eat_string(char quote, StringType type) {
        Str s = eat_string_until(quote, type == RAW_STRING);
        if(type == F_STRING){
            add_token(TK("@fstr"), s);
        }else{
            add_token(TK("@str"), s);
        }
    }

    // Scan a numeric literal (decimal int/float or 0x hex int) and emit @num.
    void eat_number() {
        static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?");
        std::smatch m;
        const char* i = token_start;
        while(*i != '\n' && *i != '\0') i++;
        std::string s = std::string(token_start, i);
        try{
            if (std::regex_search(s, m, pattern)) {
                // here is m.length()-1, since the first char was eaten by lex_token()
                for(int j=0; j<m.length()-1; j++) eatchar();
                int base = 10;
                size_t size;
                if (m[1].matched) base = 16;
                if (m[2].matched) {
                    if(base == 16) SyntaxError("hex literal should not contain a dot");
                    add_token(TK("@num"), S_TO_FLOAT(m[0], &size));
                } else {
                    add_token(TK("@num"), S_TO_INT(m[0], &size, base));
                }
                // the whole match must have been consumed by the conversion
                if (size != m.length()) UNREACHABLE();
            }
        }catch(std::exception& _){
            SyntaxError("invalid number literal");
        }
    }

    // Lex one token (plus any pending @dedent at EOF) into `nexts`.
    // Returns false exactly once, after @eof has been emitted.
    bool lex_one_token() {
        while (peekchar() != '\0') {
            token_start = curr_char;
            char c = eatchar_include_newline();
            switch (c) {
                case '\'': case '"': eat_string(c, NORMAL_STRING); return true;
                case '#': skip_line_comment(); break;
                case '{': add_token(TK("{")); return true;
                case '}': add_token(TK("}")); return true;
                case ',': add_token(TK(",")); return true;
                case ':': add_token_2(':', TK(":"), TK("::")); return true;
                case ';': add_token(TK(";")); return true;
                case '(': add_token(TK("(")); return true;
                case ')': add_token(TK(")")); return true;
                case '[': add_token(TK("[")); return true;
                case ']': add_token(TK("]")); return true;
                case '@': add_token(TK("@")); return true;
                case '%': add_token_2('=', TK("%"), TK("%=")); return true;
                case '&': add_token_2('=', TK("&"), TK("&=")); return true;
                case '|': add_token_2('=', TK("|"), TK("|=")); return true;
                case '^': add_token_2('=', TK("^"), TK("^=")); return true;
                case '?': add_token(TK("?")); return true;
                case '.': {
                    if(matchchar('.')) {
                        if(matchchar('.')) {
                            add_token(TK("..."));
                        } else {
                            SyntaxError("invalid token '..'");
                        }
                    } else {
                        add_token(TK("."));
                    }
                    return true;
                }
                case '=': add_token_2('=', TK("="), TK("==")); return true;
                case '+': add_token_2('=', TK("+"), TK("+=")); return true;
                case '>': {
                    if(matchchar('=')) add_token(TK(">="));
                    else if(matchchar('>')) add_token_2('=', TK(">>"), TK(">>="));
                    else add_token(TK(">"));
                    return true;
                }
                case '<': {
                    if(matchchar('=')) add_token(TK("<="));
                    else if(matchchar('<')) add_token_2('=', TK("<<"), TK("<<="));
                    else add_token(TK("<"));
                    return true;
                }
                case '-': {
                    if(matchchar('=')) add_token(TK("-="));
                    else if(matchchar('>')) add_token(TK("->"));
                    else add_token(TK("-"));
                    return true;
                }
                case '!':
                    if(matchchar('=')) add_token(TK("!="));
                    else SyntaxError("expected '=' after '!'");
                    break;
                case '*':
                    if (matchchar('*')) {
                        add_token(TK("**"));  // '**'
                    } else {
                        add_token_2('=', TK("*"), TK("*="));
                    }
                    return true;
                case '/':
                    if(matchchar('/')) {
                        add_token_2('=', TK("//"), TK("//="));
                    } else {
                        add_token_2('=', TK("/"), TK("/="));
                    }
                    return true;
                case '\r': break;       // just ignore '\r'
                case ' ': case '\t': eat_spaces(); break;
                case '\n': {
                    add_token(TK("@eol"));
                    if(!eat_indentation()) IndentationError("unindent does not match any outer indentation level");
                    return true;
                }
                default: {
                    // string prefixes: f'' / f"" and r'' / r""
                    if(c == 'f'){
                        if(matchchar('\'')) {eat_string('\'', F_STRING); return true;}
                        if(matchchar('"')) {eat_string('"', F_STRING); return true;}
                    }else if(c == 'r'){
                        if(matchchar('\'')) {eat_string('\'', RAW_STRING); return true;}
                        if(matchchar('"')) {eat_string('"', RAW_STRING); return true;}
                    }
                    if (c >= '0' && c <= '9') {
                        eat_number();
                        return true;
                    }
                    switch (eat_name())     // error codes map to diagnostics; 1/2/4 throw
                    {
                        case 0: break;
                        case 1: SyntaxError("invalid char: " + std::string(1, c));
                        case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c));
                        case 3: SyntaxError("@id contains invalid char"); break;
                        case 4: SyntaxError("invalid JSON token"); break;
                        default: UNREACHABLE();
                    }
                    return true;
                }
            }
        }
        // at EOF: drain the indentation stack one @dedent per call, then @eof
        token_start = curr_char;
        while(indents.size() > 1){
            indents.pop();
            add_token(TK("@dedent"));
            return true;
        }
        add_token(TK("@eof"));
        return false;
    }

    /***** Error Reporter *****/
    // Report at the current position; if the cursor just passed a newline,
    // attribute the error to the line that just ended.
    void throw_err(Str type, Str msg){
        int lineno = current_line;
        const char* cursor = curr_char;
        if(peekchar() == '\n'){
            lineno--;
            cursor--;
        }
        throw_err(type, msg, lineno, cursor);
    }

    void throw_err(Str type, Str msg, int lineno, const char* cursor){
        // bugfix: use the requested exception type; previously this was
        // hard-coded to "SyntaxError", so IndentationError was mislabeled.
        auto e = Exception(type, msg);
        e.st_push(src->snapshot(lineno, cursor));
        throw e;
    }
    void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
    void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
    void IndentationError(Str msg){ throw_err("IndentationError", msg); }

    Lexer(shared_ptr<SourceData> src) {
        this->src = src;
        this->token_start = src->source;
        this->curr_char = src->source;
        this->nexts.push_back(Token{TK("@sof"), token_start, 0, current_line});
        this->indents.push(0);
    }

    // Tokenize the whole source. Single-use: a second call is a logic error.
    std::vector<Token> run() {
        if(used) UNREACHABLE();
        used = true;
        while (lex_one_token());
        return std::move(nexts);
    }
};
} // namespace pkpy

View File

@ -1,302 +0,0 @@
#pragma once
#include "error.h"
#include "obj.h"
namespace pkpy{
typedef uint8_t TokenIndex;
constexpr const char* kTokens[] = {
"@error", "@eof", "@eol", "@sof",
".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%", "::",
"+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
"<<", ">>", "&", "|", "^", "?", "@",
"==", "!=", ">=", "<=",
"+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=", ">>=", "<<=",
/** KW_BEGIN **/
"class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
"None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
"goto", "label", // extended keywords, not available in cpython
"while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
/** KW_END **/
"is not", "not in",
"@id", "@num", "@str", "@fstr",
"@indent", "@dedent"
};
const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
constexpr TokenIndex TK(const char token[]) {
for(int k=0; k<kTokenCount; k++){
const char* i = kTokens[k];
const char* j = token;
while(*i && *j && *i == *j) { i++; j++;}
if(*i == *j) return k;
}
UNREACHABLE();
}
#define TK_STR(t) kTokens[t]
const TokenIndex kTokenKwBegin = TK("class");
const TokenIndex kTokenKwEnd = TK("raise");
const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
std::map<std::string_view, TokenIndex> map;
for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
return map;
}();
struct Token{
TokenIndex type;
const char* start;
int length;
int line;
PyObject* value;
Str str() const { return Str(start, length);}
Str info() const {
StrStream ss;
Str raw = str();
if (raw == Str("\n")) raw = "\\n";
ss << line << ": " << TK_STR(type) << " '" << raw << "'";
return ss.str();
}
};
// https://docs.python.org/3/reference/expressions.html
enum Precedence {
PREC_NONE,
PREC_ASSIGNMENT, // =
PREC_COMMA, // ,
PREC_TERNARY, // ?:
PREC_LOGICAL_OR, // or
PREC_LOGICAL_AND, // and
PREC_LOGICAL_NOT, // not
PREC_EQUALITY, // == !=
PREC_TEST, // in / is / is not / not in
PREC_COMPARISION, // < > <= >=
PREC_BITWISE_OR, // |
PREC_BITWISE_XOR, // ^
PREC_BITWISE_AND, // &
PREC_BITWISE_SHIFT, // << >>
PREC_TERM, // + -
PREC_FACTOR, // * / % //
PREC_UNARY, // - not
PREC_EXPONENT, // **
PREC_CALL, // ()
PREC_SUBSCRIPT, // []
PREC_ATTRIB, // .index
PREC_PRIMARY,
};
// The context of the parsing phase for the compiler.
struct Parser {
shared_ptr<SourceData> src;
const char* token_start;
const char* curr_char;
int current_line = 1;
Token prev, curr;
queue<Token> nexts;
stack<int> indents;
int brackets_level = 0;
Token next_token(){
if(nexts.empty()){
return Token{TK("@error"), token_start, (int)(curr_char - token_start), current_line};
}
Token t = nexts.front();
if(t.type == TK("@eof") && indents.size()>1){
nexts.pop();
indents.pop();
return Token{TK("@dedent"), token_start, 0, current_line};
}
nexts.pop();
return t;
}
char peekchar() const{ return *curr_char; }
bool match_n_chars(int n, char c0){
const char* c = curr_char;
for(int i=0; i<n; i++){
if(*c == '\0') return false;
if(*c != c0) return false;
c++;
}
for(int i=0; i<n; i++) eatchar_include_newline();
return true;
}
int eat_spaces(){
int count = 0;
while (true) {
switch (peekchar()) {
case ' ' : count+=1; break;
case '\t': count+=4; break;
default: return count;
}
eatchar();
}
}
bool eat_indentation(){
if(brackets_level > 0) return true;
int spaces = eat_spaces();
if(peekchar() == '#') skip_line_comment();
if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
// https://docs.python.org/3/reference/lexical_analysis.html#indentation
if(spaces > indents.top()){
indents.push(spaces);
nexts.push(Token{TK("@indent"), token_start, 0, current_line});
} else if(spaces < indents.top()){
while(spaces < indents.top()){
indents.pop();
nexts.push(Token{TK("@dedent"), token_start, 0, current_line});
}
if(spaces != indents.top()){
return false;
}
}
return true;
}
char eatchar() {
char c = peekchar();
if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
curr_char++;
return c;
}
char eatchar_include_newline() {
char c = peekchar();
curr_char++;
if (c == '\n'){
current_line++;
src->line_starts.push_back(curr_char);
}
return c;
}
int eat_name() {
curr_char--;
while(true){
uint8_t c = peekchar();
int u8bytes = 0;
if((c & 0b10000000) == 0b00000000) u8bytes = 1;
else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
else return 1;
if(u8bytes == 1){
if(isalpha(c) || c=='_' || isdigit(c)) {
curr_char++;
continue;
}else{
break;
}
}
// handle multibyte char
std::string u8str(curr_char, u8bytes);
if(u8str.size() != u8bytes) return 2;
uint32_t value = 0;
for(int k=0; k < u8bytes; k++){
uint8_t b = u8str[k];
if(k==0){
if(u8bytes == 2) value = (b & 0b00011111) << 6;
else if(u8bytes == 3) value = (b & 0b00001111) << 12;
else if(u8bytes == 4) value = (b & 0b00000111) << 18;
}else{
value |= (b & 0b00111111) << (6*(u8bytes-k-1));
}
}
if(is_unicode_Lo_char(value)) curr_char += u8bytes;
else break;
}
int length = (int)(curr_char - token_start);
if(length == 0) return 3;
std::string_view name(token_start, length);
if(src->mode == JSON_MODE){
if(name == "true"){
set_next_token(TK("True"));
} else if(name == "false"){
set_next_token(TK("False"));
} else if(name == "null"){
set_next_token(TK("None"));
} else {
return 4;
}
return 0;
}
if(kTokenKwMap.count(name)){
if(name == "not"){
if(strncmp(curr_char, " in", 3) == 0){
curr_char += 3;
set_next_token(TK("not in"));
return 0;
}
}else if(name == "is"){
if(strncmp(curr_char, " not", 4) == 0){
curr_char += 4;
set_next_token(TK("is not"));
return 0;
}
}
set_next_token(kTokenKwMap.at(name));
} else {
set_next_token(TK("@id"));
}
return 0;
}
void skip_line_comment() {
char c;
while ((c = peekchar()) != '\0') {
if (c == '\n') return;
eatchar();
}
}
bool matchchar(char c) {
if (peekchar() != c) return false;
eatchar_include_newline();
return true;
}
void set_next_token(TokenIndex type, PyObject* value=nullptr) {
switch(type){
case TK("{"): case TK("["): case TK("("): brackets_level++; break;
case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
}
nexts.push( Token{
type,
token_start,
(int)(curr_char - token_start),
current_line - ((type == TK("@eol")) ? 1 : 0),
value
});
}
void set_next_token_2(char c, TokenIndex one, TokenIndex two) {
if (matchchar(c)) set_next_token(two);
else set_next_token(one);
}
Parser(shared_ptr<SourceData> src) {
this->src = src;
this->token_start = src->source;
this->curr_char = src->source;
this->nexts.push(Token{TK("@sof"), token_start, 0, current_line});
this->indents.push(0);
}
};
} // namespace pkpy

View File

@ -760,6 +760,7 @@ inline void add_module_gc(VM* vm){
inline void VM::post_init(){ inline void VM::post_init(){
init_builtins(this); init_builtins(this);
#if !DEBUG_NO_BUILTIN_MODULES
add_module_sys(this); add_module_sys(this);
add_module_time(this); add_module_time(this);
add_module_json(this); add_module_json(this);
@ -793,6 +794,7 @@ inline void VM::post_init(){
const PyTypeInfo& info = vm->_all_types[OBJ_GET(Type, args[0])]; const PyTypeInfo& info = vm->_all_types[OBJ_GET(Type, args[0])];
return VAR(info.name); return VAR(info.name);
})); }));
#endif
} }
} // namespace pkpy } // namespace pkpy

View File

@ -93,7 +93,7 @@ public:
} }
Frame* top_frame() const { Frame* top_frame() const {
#if PK_EXTRA_CHECK #if DEBUG_EXTRA_CHECK
if(callstack.empty()) UNREACHABLE(); if(callstack.empty()) UNREACHABLE();
#endif #endif
return callstack.top().get(); return callstack.top().get();
@ -166,7 +166,7 @@ public:
if(_module == nullptr) _module = _main; if(_module == nullptr) _module = _main;
try { try {
CodeObject_ code = compile(source, filename, mode); CodeObject_ code = compile(source, filename, mode);
if(_module == _main) std::cout << disassemble(code) << '\n'; // if(_module == _main) std::cout << disassemble(code) << '\n';
return _exec(code, _module); return _exec(code, _module);
}catch (const Exception& e){ }catch (const Exception& e){
*_stderr << e.summary() << '\n'; *_stderr << e.summary() << '\n';