update lexer

This commit is contained in:
blueloveTH 2023-03-29 23:33:45 +08:00
parent 120773891a
commit e78aa44895
11 changed files with 730 additions and 630 deletions

View File

@ -6,8 +6,8 @@ with open("src/opcodes.h", "rt", encoding='utf-8') as f:
OPCODES_TEXT = f.read()
pipeline = [
["common.h", "memory.h", "str.h", "tuplelist.h", "namedict.h", "error.h"],
["obj.h", "parser.h", "codeobject.h", "frame.h"],
["common.h", "memory.h", "str.h", "tuplelist.h", "namedict.h", "error.h", "lexer.h"],
["obj.h", "codeobject.h", "frame.h"],
["gc.h", "vm.h", "ref.h", "ceval.h", "compiler.h", "repl.h"],
["iter.h", "cffi.h", "io.h", "_generated.h", "pocketpy.h"]
]

View File

@ -7,7 +7,7 @@ namespace pkpy{
inline PyObject* VM::run_frame(Frame* frame){
while(frame->has_next_bytecode()){
// heap._auto_collect(this);
heap._auto_collect(this);
const Bytecode& byte = frame->next_bytecode();
switch (byte.op)
@ -325,7 +325,7 @@ inline PyObject* VM::run_frame(Frame* frame){
if(frame->_data.size() != 1) throw std::runtime_error("_data.size() != 1 in EVAL/JSON_MODE");
return frame->pop_value(this);
}
#if PK_EXTRA_CHECK
#if DEBUG_EXTRA_CHECK
if(!frame->_data.empty()) throw std::runtime_error("_data.size() != 0 in EXEC_MODE");
#endif
return None;

View File

@ -10,7 +10,6 @@
#include <sstream>
#include <regex>
#include <cmath>
#include <cstdlib>
#include <stdexcept>
#include <vector>
#include <string>
@ -26,10 +25,13 @@
#include <algorithm>
#include <random>
#include <initializer_list>
#include <list>
#include <variant>
#define PK_VERSION "0.9.5"
#define PK_EXTRA_CHECK 0
#define PK_VERSION "0.9.6"
// debug macros
#define DEBUG_NO_BUILTIN_MODULES 0
#define DEBUG_EXTRA_CHECK 1
#if (defined(__ANDROID__) && __ANDROID_API__ <= 22) || defined(__EMSCRIPTEN__)
#define PK_ENABLE_FILEIO 0
@ -40,13 +42,13 @@
#if defined(__EMSCRIPTEN__) || defined(__arm__) || defined(__i386__)
typedef int32_t i64;
typedef float f64;
#define S_TO_INT std::stoi
#define S_TO_FLOAT std::stof
#define S_TO_INT(...) static_cast<i64>(std::stoi(__VA_ARGS__))
#define S_TO_FLOAT(...) static_cast<f64>(std::stof(__VA_ARGS__))
#else
typedef int64_t i64;
typedef double f64;
#define S_TO_INT std::stoll
#define S_TO_FLOAT std::stod
#define S_TO_INT(...) static_cast<i64>(std::stoll(__VA_ARGS__))
#define S_TO_FLOAT(...) static_cast<f64>(std::stod(__VA_ARGS__))
#endif
namespace pkpy{
@ -100,22 +102,6 @@ inline bool is_both_int(PyObject* a, PyObject* b) noexcept {
return is_int(a) && is_int(b);
}
template <typename T>
class queue{
std::list<T> list;
public:
void push(const T& t){ list.push_back(t); }
void push(T&& t){ list.push_back(std::move(t)); }
void pop(){ list.pop_front(); }
void clear(){ list.clear(); }
bool empty() const { return list.empty(); }
size_t size() const { return list.size(); }
T& front(){ return list.front(); }
const T& front() const { return list.front(); }
const std::list<T>& data() const { return list; }
};
template <typename T>
class stack{
std::vector<T> vec;

View File

@ -2,7 +2,7 @@
#include "codeobject.h"
#include "common.h"
#include "parser.h"
#include "lexer.h"
#include "error.h"
#include "ceval.h"
@ -18,24 +18,21 @@ struct GrammarRule{
Precedence precedence;
};
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
class Compiler {
std::unique_ptr<Parser> parser;
std::unique_ptr<Lexer> lexer;
stack<CodeObject_> codes;
int lexing_count = 0;
bool used = false;
VM* vm;
std::map<TokenIndex, GrammarRule> rules;
CodeObject_ co() const{ return codes.top(); }
CompileMode mode() const{ return parser->src->mode; }
CompileMode mode() const{ return lexer->src->mode; }
NameScope name_scope() const { return codes.size()>1 ? NAME_LOCAL : NAME_GLOBAL; }
public:
Compiler(VM* vm, const char* source, Str filename, CompileMode mode){
this->vm = vm;
this->parser = std::make_unique<Parser>(
this->lexer = std::make_unique<Lexer>(
make_sp<SourceData>(source, filename, mode)
);
@ -104,239 +101,36 @@ public:
}
private:
Str eat_string_until(char quote, bool raw) {
bool quote3 = parser->match_n_chars(2, quote);
std::vector<char> buff;
while (true) {
char c = parser->eatchar_include_newline();
if (c == quote){
if(quote3 && !parser->match_n_chars(2, quote)){
buff.push_back(c);
continue;
}
break;
}
if (c == '\0'){
if(quote3 && parser->src->mode == REPL_MODE){
throw NeedMoreLines(false);
}
SyntaxError("EOL while scanning string literal");
}
if (c == '\n'){
if(!quote3) SyntaxError("EOL while scanning string literal");
else{
buff.push_back(c);
continue;
}
}
if (!raw && c == '\\') {
switch (parser->eatchar_include_newline()) {
case '"': buff.push_back('"'); break;
case '\'': buff.push_back('\''); break;
case '\\': buff.push_back('\\'); break;
case 'n': buff.push_back('\n'); break;
case 'r': buff.push_back('\r'); break;
case 't': buff.push_back('\t'); break;
default: SyntaxError("invalid escape char");
}
} else {
buff.push_back(c);
}
}
return Str(buff.data(), buff.size());
}
int i = 0;
std::vector<Token> tokens;
void eat_string(char quote, StringType type) {
Str s = eat_string_until(quote, type == RAW_STRING);
if(type == F_STRING){
parser->set_next_token(TK("@fstr"), VAR(s));
}else{
parser->set_next_token(TK("@str"), VAR(s));
}
}
void eat_number() {
static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?");
std::smatch m;
const char* i = parser->token_start;
while(*i != '\n' && *i != '\0') i++;
std::string s = std::string(parser->token_start, i);
try{
if (std::regex_search(s, m, pattern)) {
// here is m.length()-1, since the first char was eaten by lex_token()
for(int j=0; j<m.length()-1; j++) parser->eatchar();
int base = 10;
size_t size;
if (m[1].matched) base = 16;
if (m[2].matched) {
if(base == 16) SyntaxError("hex literal should not contain a dot");
parser->set_next_token(TK("@num"), VAR(S_TO_FLOAT(m[0], &size)));
} else {
parser->set_next_token(TK("@num"), VAR(S_TO_INT(m[0], &size, base)));
}
if (size != m.length()) UNREACHABLE();
}
}catch(std::exception& _){
SyntaxError("invalid number literal");
}
}
void lex_token(){
lexing_count++;
_lex_token();
lexing_count--;
}
// Lex the next token and set it as the next token.
void _lex_token() {
parser->prev = parser->curr;
parser->curr = parser->next_token();
//std::cout << parser->curr.info() << std::endl;
while (parser->peekchar() != '\0') {
parser->token_start = parser->curr_char;
char c = parser->eatchar_include_newline();
switch (c) {
case '\'': case '"': eat_string(c, NORMAL_STRING); return;
case '#': parser->skip_line_comment(); break;
case '{': parser->set_next_token(TK("{")); return;
case '}': parser->set_next_token(TK("}")); return;
case ',': parser->set_next_token(TK(",")); return;
case ':': parser->set_next_token_2(':', TK(":"), TK("::")); return;
case ';': parser->set_next_token(TK(";")); return;
case '(': parser->set_next_token(TK("(")); return;
case ')': parser->set_next_token(TK(")")); return;
case '[': parser->set_next_token(TK("[")); return;
case ']': parser->set_next_token(TK("]")); return;
case '@': parser->set_next_token(TK("@")); return;
case '%': parser->set_next_token_2('=', TK("%"), TK("%=")); return;
case '&': parser->set_next_token_2('=', TK("&"), TK("&=")); return;
case '|': parser->set_next_token_2('=', TK("|"), TK("|=")); return;
case '^': parser->set_next_token_2('=', TK("^"), TK("^=")); return;
case '?': parser->set_next_token(TK("?")); return;
case '.': {
if(parser->matchchar('.')) {
if(parser->matchchar('.')) {
parser->set_next_token(TK("..."));
} else {
SyntaxError("invalid token '..'");
}
} else {
parser->set_next_token(TK("."));
}
return;
}
case '=': parser->set_next_token_2('=', TK("="), TK("==")); return;
case '+': parser->set_next_token_2('=', TK("+"), TK("+=")); return;
case '>': {
if(parser->matchchar('=')) parser->set_next_token(TK(">="));
else if(parser->matchchar('>')) parser->set_next_token_2('=', TK(">>"), TK(">>="));
else parser->set_next_token(TK(">"));
return;
}
case '<': {
if(parser->matchchar('=')) parser->set_next_token(TK("<="));
else if(parser->matchchar('<')) parser->set_next_token_2('=', TK("<<"), TK("<<="));
else parser->set_next_token(TK("<"));
return;
}
case '-': {
if(parser->matchchar('=')) parser->set_next_token(TK("-="));
else if(parser->matchchar('>')) parser->set_next_token(TK("->"));
else parser->set_next_token(TK("-"));
return;
}
case '!':
if(parser->matchchar('=')) parser->set_next_token(TK("!="));
else SyntaxError("expected '=' after '!'");
break;
case '*':
if (parser->matchchar('*')) {
parser->set_next_token(TK("**")); // '**'
} else {
parser->set_next_token_2('=', TK("*"), TK("*="));
}
return;
case '/':
if(parser->matchchar('/')) {
parser->set_next_token_2('=', TK("//"), TK("//="));
} else {
parser->set_next_token_2('=', TK("/"), TK("/="));
}
return;
case '\r': break; // just ignore '\r'
case ' ': case '\t': parser->eat_spaces(); break;
case '\n': {
parser->set_next_token(TK("@eol"));
if(!parser->eat_indentation()) IndentationError("unindent does not match any outer indentation level");
return;
}
default: {
if(c == 'f'){
if(parser->matchchar('\'')) {eat_string('\'', F_STRING); return;}
if(parser->matchchar('"')) {eat_string('"', F_STRING); return;}
}else if(c == 'r'){
if(parser->matchchar('\'')) {eat_string('\'', RAW_STRING); return;}
if(parser->matchchar('"')) {eat_string('"', RAW_STRING); return;}
}
if (c >= '0' && c <= '9') {
eat_number();
return;
}
switch (parser->eat_name())
{
case 0: break;
case 1: SyntaxError("invalid char: " + std::string(1, c));
case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c));
case 3: SyntaxError("@id contains invalid char"); break;
case 4: SyntaxError("invalid JSON token"); break;
default: UNREACHABLE();
}
return;
}
}
}
parser->token_start = parser->curr_char;
parser->set_next_token(TK("@eof"));
}
TokenIndex peek() {
return parser->curr.type;
}
// not sure this will work
TokenIndex peek_next() {
if(parser->nexts.empty()) return TK("@eof");
return parser->nexts.front().type;
}
const Token& prev() { return tokens.at(i-1); }
const Token& curr() { return tokens.at(i); }
const Token& next() { return tokens.at(i+1); }
const Token& peek(int offset=0) { return tokens.at(i+offset); }
void advance() { i++; }
bool match(TokenIndex expected) {
if (peek() != expected) return false;
lex_token();
if (curr().type != expected) return false;
advance();
return true;
}
void consume(TokenIndex expected) {
if (!match(expected)){
StrStream ss;
ss << "expected '" << TK_STR(expected) << "', but got '" << TK_STR(peek()) << "'";
ss << "expected '" << TK_STR(expected) << "', but got '" << TK_STR(curr().type) << "'";
SyntaxError(ss.str());
}
}
bool match_newlines(bool repl_throw=false) {
bool consumed = false;
if (peek() == TK("@eol")) {
while (peek() == TK("@eol")) lex_token();
if (curr().type == TK("@eol")) {
while (curr().type == TK("@eol")) advance();
consumed = true;
}
if (repl_throw && peek() == TK("@eof")){
if (repl_throw && curr().type == TK("@eof")){
throw NeedMoreLines(co()->_is_compiling_class);
}
return consumed;
@ -344,8 +138,8 @@ private:
bool match_end_stmt() {
if (match(TK(";"))) { match_newlines(); return true; }
if (match_newlines() || peek()==TK("@eof")) return true;
if (peek() == TK("@dedent")) return true;
if (match_newlines() || curr().type == TK("@eof")) return true;
if (curr().type == TK("@dedent")) return true;
return false;
}
@ -353,15 +147,27 @@ private:
if (!match_end_stmt()) SyntaxError("expected statement end");
}
PyObject* get_value(const Token& token) {
switch (token.type) {
case TK("@num"):
if(std::holds_alternative<i64>(token.value)) return VAR(std::get<i64>(token.value));
if(std::holds_alternative<f64>(token.value)) return VAR(std::get<f64>(token.value));
UNREACHABLE();
case TK("@str"): case TK("@fstr"):
return VAR(std::get<Str>(token.value));
default: throw std::runtime_error(Str("invalid token type: ") + TK_STR(token.type));
}
}
void exprLiteral() {
PyObject* value = parser->prev.value;
PyObject* value = get_value(prev());
int index = co()->add_const(value);
emit(OP_LOAD_CONST, index);
}
void exprFString() {
static const std::regex pattern(R"(\{(.*?)\})");
PyObject* value = parser->prev.value;
PyObject* value = get_value(prev());
Str s = CAST(Str, value);
std::sregex_iterator begin(s.begin(), s.end(), pattern);
std::sregex_iterator end;
@ -395,7 +201,7 @@ private:
_compile_f_args(func, false);
consume(TK(":"));
}
func.code = make_sp<CodeObject>(parser->src, func.name.str());
func.code = make_sp<CodeObject>(lexer->src, func.name.str());
this->codes.push(func.code);
co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1;
emit(OP_RETURN_VALUE);
@ -414,7 +220,7 @@ private:
if(is_load_name_ref) co()->codes.pop_back();
co()->_rvalue += 1;
TokenIndex op = parser->prev.type;
TokenIndex op = prev().type;
if(op == TK("=")) { // a = (expr)
EXPR_TUPLE();
if(is_load_name_ref){
@ -487,7 +293,7 @@ private:
}
void exprBinaryOp() {
TokenIndex op = parser->prev.type;
TokenIndex op = prev().type;
parse_expression((Precedence)(rules[op].precedence + 1));
switch (op) {
@ -525,7 +331,7 @@ private:
}
void exprUnaryOp() {
TokenIndex op = parser->prev.type;
TokenIndex op = prev().type;
parse_expression((Precedence)(PREC_UNARY + 1));
switch (op) {
case TK("-"): emit(OP_UNARY_NEGATIVE); break;
@ -588,7 +394,7 @@ private:
int ARGC = 0;
do {
match_newlines(mode()==REPL_MODE);
if (peek() == TK("]")) break;
if (curr().type == TK("]")) break;
EXPR(); ARGC++;
match_newlines(mode()==REPL_MODE);
if(ARGC == 1 && match(TK("for"))){
@ -609,9 +415,9 @@ private:
int ARGC = 0;
do {
match_newlines(mode()==REPL_MODE);
if (peek() == TK("}")) break;
if (curr().type == TK("}")) break;
EXPR();
if(peek() == TK(":")) parsing_dict = true;
if(curr().type == TK(":")) parsing_dict = true;
if(parsing_dict){
consume(TK(":"));
EXPR();
@ -637,10 +443,10 @@ private:
bool need_unpack = false;
do {
match_newlines(mode()==REPL_MODE);
if (peek() == TK(")")) break;
if(peek() == TK("@id") && peek_next() == TK("=")) {
if (curr().type == TK(")")) break;
if(curr().type == TK("@id") && next().type == TK("=")) {
consume(TK("@id"));
const Str& key = parser->prev.str();
const Str& key = prev().str();
emit(OP_LOAD_CONST, co()->add_const(VAR(key)));
consume(TK("="));
co()->_rvalue += 1; EXPR(); co()->_rvalue -= 1;
@ -666,7 +472,7 @@ private:
void exprName(){ _exprName(false); }
void _exprName(bool force_lvalue) {
Token tkname = parser->prev;
const Token& tkname = prev();
int index = co()->add_name(tkname.str(), name_scope());
bool fast_load = !force_lvalue && co()->_rvalue>0;
emit(fast_load ? OP_LOAD_NAME : OP_LOAD_NAME_REF, index);
@ -674,7 +480,7 @@ private:
void exprAttrib() {
consume(TK("@id"));
const Str& name = parser->prev.str();
const Str& name = prev().str();
int index = co()->add_name(name, NAME_ATTR);
emit(co()->_rvalue ? OP_BUILD_ATTR : OP_BUILD_ATTR_REF, index);
}
@ -710,7 +516,7 @@ private:
}
void exprValue() {
TokenIndex op = parser->prev.type;
TokenIndex op = prev().type;
switch (op) {
case TK("None"): emit(OP_LOAD_NONE); break;
case TK("True"): emit(OP_LOAD_TRUE); break;
@ -721,7 +527,7 @@ private:
}
int emit(Opcode opcode, int arg=-1, bool keepline=false) {
int line = parser->prev.line;
int line = prev().line;
co()->codes.push_back(
Bytecode{(uint8_t)opcode, (uint16_t)co()->_curr_block_i, arg, line}
);
@ -738,7 +544,7 @@ private:
void compile_block_body(CompilerAction action=nullptr) {
if(action == nullptr) action = &Compiler::compile_stmt;
consume(TK(":"));
if(peek()!=TK("@eol") && peek()!=TK("@eof")){
if(curr().type!=TK("@eol") && curr().type!=TK("@eof")){
(this->*action)(); // inline block
return;
}
@ -746,7 +552,7 @@ private:
SyntaxError("expected a new line after ':'");
}
consume(TK("@indent"));
while (peek() != TK("@dedent")) {
while (curr().type != TK("@dedent")) {
match_newlines();
(this->*action)();
match_newlines();
@ -756,7 +562,7 @@ private:
Token _compile_import() {
consume(TK("@id"));
Token tkmodule = parser->prev;
Token tkmodule = prev();
int index = co()->add_name(tkmodule.str(), NAME_SPECIAL);
emit(OP_IMPORT_NAME, index);
return tkmodule;
@ -768,7 +574,7 @@ private:
Token tkmodule = _compile_import();
if (match(TK("as"))) {
consume(TK("@id"));
tkmodule = parser->prev;
tkmodule = prev();
}
int index = co()->add_name(tkmodule.str(), name_scope());
emit(OP_STORE_NAME, index);
@ -789,12 +595,12 @@ private:
do {
emit(OP_DUP_TOP_VALUE);
consume(TK("@id"));
Token tkname = parser->prev;
Token tkname = prev();
int index = co()->add_name(tkname.str(), NAME_ATTR);
emit(OP_BUILD_ATTR, index);
if (match(TK("as"))) {
consume(TK("@id"));
tkname = parser->prev;
tkname = prev();
}
index = co()->add_name(tkname.str(), name_scope());
emit(OP_STORE_NAME, index);
@ -807,14 +613,14 @@ private:
// ['a', '1', '2', '+', '=']
//
void parse_expression(Precedence precedence) {
lex_token();
GrammarFn prefix = rules[parser->prev.type].prefix;
if (prefix == nullptr) SyntaxError(Str("expected an expression, but got ") + TK_STR(parser->prev.type));
advance();
GrammarFn prefix = rules[prev().type].prefix;
if (prefix == nullptr) SyntaxError(Str("expected an expression, but got ") + TK_STR(prev().type));
(this->*prefix)();
bool meet_assign_token = false;
while (rules[peek()].precedence >= precedence) {
lex_token();
TokenIndex op = parser->prev.type;
while (rules[curr().type].precedence >= precedence) {
advance();
TokenIndex op = prev().type;
if (op == TK("=")){
if(meet_assign_token) SyntaxError();
meet_assign_token = true;
@ -891,7 +697,7 @@ private:
do {
consume(TK("except"));
if(match(TK("@id"))){
int name_idx = co()->add_name(parser->prev.str(), NAME_SPECIAL);
int name_idx = co()->add_name(prev().str(), NAME_SPECIAL);
emit(OP_EXCEPTION_MATCH, name_idx);
}else{
emit(OP_LOAD_TRUE);
@ -901,7 +707,7 @@ private:
compile_block_body();
patches.push_back(emit(OP_JUMP_ABSOLUTE));
patch_jump(patch);
}while(peek() == TK("except"));
}while(curr().type == TK("except"));
emit(OP_RE_RAISE); // no match, re-raise
for (int patch : patches) patch_jump(patch);
}
@ -968,7 +774,7 @@ private:
EXPR();
consume(TK("as"));
consume(TK("@id"));
Token tkname = parser->prev;
Token tkname = prev();
int index = co()->add_name(tkname.str(), name_scope());
emit(OP_STORE_NAME, index);
emit(OP_LOAD_NAME_REF, index);
@ -979,18 +785,18 @@ private:
} else if(match(TK("label"))){
if(mode() != EXEC_MODE) SyntaxError("'label' is only available in EXEC_MODE");
consume(TK(".")); consume(TK("@id"));
Str label = parser->prev.str();
Str label = prev().str();
bool ok = co()->add_label(label);
if(!ok) SyntaxError("label '" + label + "' already exists");
consume_end_stmt();
} else if(match(TK("goto"))){ // https://entrian.com/goto/
if(mode() != EXEC_MODE) SyntaxError("'goto' is only available in EXEC_MODE");
consume(TK(".")); consume(TK("@id"));
emit(OP_GOTO, co()->add_name(parser->prev.str(), NAME_SPECIAL));
emit(OP_GOTO, co()->add_name(prev().str(), NAME_SPECIAL));
consume_end_stmt();
} else if(match(TK("raise"))){
consume(TK("@id"));
int dummy_t = co()->add_name(parser->prev.str(), NAME_SPECIAL);
int dummy_t = co()->add_name(prev().str(), NAME_SPECIAL);
if(match(TK("(")) && !match(TK(")"))){
EXPR(); consume(TK(")"));
}else{
@ -1005,7 +811,7 @@ private:
} else if(match(TK("global"))){
do {
consume(TK("@id"));
co()->global_names[parser->prev.str()] = 1;
co()->global_names[prev().str()] = 1;
} while (match(TK(",")));
consume_end_stmt();
} else if(match(TK("pass"))){
@ -1030,10 +836,10 @@ private:
void compile_class(){
consume(TK("@id"));
int cls_name_idx = co()->add_name(parser->prev.str(), NAME_GLOBAL);
int cls_name_idx = co()->add_name(prev().str(), NAME_GLOBAL);
int super_cls_name_idx = -1;
if(match(TK("(")) && match(TK("@id"))){
super_cls_name_idx = co()->add_name(parser->prev.str(), NAME_GLOBAL);
super_cls_name_idx = co()->add_name(prev().str(), NAME_GLOBAL);
consume(TK(")"));
}
if(super_cls_name_idx == -1) emit(OP_LOAD_NONE);
@ -1059,13 +865,13 @@ private:
}
consume(TK("@id"));
const Str& name = parser->prev.str();
const Str& name = prev().str();
if(func.has_name(name)) SyntaxError("duplicate argument name");
// eat type hints
if(enable_type_hints && match(TK(":"))) consume(TK("@id"));
if(state == 0 && peek() == TK("=")) state = 2;
if(state == 0 && curr().type == TK("=")) state = 2;
switch (state)
{
@ -1075,7 +881,7 @@ private:
consume(TK("="));
PyObject* value = read_literal();
if(value == nullptr){
SyntaxError(Str("expect a literal, not ") + TK_STR(parser->curr.type));
SyntaxError(Str("expect a literal, not ") + TK_STR(curr().type));
}
func.kwargs.set(name, value);
func.kwargs_order.push_back(name);
@ -1090,11 +896,11 @@ private:
Function func;
StrName obj_name;
consume(TK("@id"));
func.name = parser->prev.str();
func.name = prev().str();
if(!co()->_is_compiling_class && match(TK("::"))){
consume(TK("@id"));
obj_name = func.name;
func.name = parser->prev.str();
func.name = prev().str();
}
consume(TK("("));
if (!match(TK(")"))) {
@ -1104,7 +910,7 @@ private:
if(match(TK("->"))){
if(!match(TK("None"))) consume(TK("@id"));
}
func.code = make_sp<CodeObject>(parser->src, func.name.str());
func.code = make_sp<CodeObject>(lexer->src, func.name.str());
this->codes.push(func.code);
compile_block_body();
func.code->optimize(vm);
@ -1132,11 +938,11 @@ private:
PyObject* read_literal(){
if(match(TK("-"))){
consume(TK("@num"));
PyObject* val = parser->prev.value;
PyObject* val = get_value(prev());
return vm->num_negated(val);
}
if(match(TK("@num"))) return parser->prev.value;
if(match(TK("@str"))) return parser->prev.value;
if(match(TK("@num"))) return get_value(prev());
if(match(TK("@str"))) return get_value(prev());
if(match(TK("True"))) return VAR(true);
if(match(TK("False"))) return VAR(false);
if(match(TK("None"))) return vm->None;
@ -1144,23 +950,8 @@ private:
return nullptr;
}
/***** Error Reporter *****/
void throw_err(Str type, Str msg){
int lineno = parser->curr.line;
const char* cursor = parser->curr.start;
// if error occurs in lexing, lineno should be `parser->current_line`
if(lexing_count > 0){
lineno = parser->current_line;
cursor = parser->curr_char;
}
if(parser->peekchar() == '\n') lineno--;
auto e = Exception("SyntaxError", msg);
e.st_push(parser->src->snapshot(lineno, cursor));
throw e;
}
void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
void IndentationError(Str msg){ throw_err("IndentationError", msg); }
void SyntaxError(Str msg){ lexer->throw_err("SyntaxError", msg, curr().line, curr().start); }
void SyntaxError(){ lexer->throw_err("SyntaxError", "invalid syntax", curr().line, curr().start); }
public:
CodeObject_ compile(){
@ -1168,11 +959,16 @@ public:
if(used) UNREACHABLE();
used = true;
CodeObject_ code = make_sp<CodeObject>(parser->src, Str("<module>"));
tokens = lexer->run();
// if(lexer->src->filename == "tests/01_int.py"){
// for(auto& t: tokens) std::cout << t.info() << std::endl;
// }
CodeObject_ code = make_sp<CodeObject>(lexer->src, lexer->src->filename);
codes.push(code);
lex_token(); lex_token();
match_newlines();
advance(); // skip @sof, so prev() is always valid
match_newlines(); // skip leading '\n'
if(mode()==EVAL_MODE) {
EXPR_TUPLE();

108
src/expr.h Normal file
View File

@ -0,0 +1,108 @@
#pragma once
#include "codeobject.h"
#include "common.h"
#include "parser.h"
#include "error.h"
#include "ceval.h"
#include <memory>
namespace pkpy{
struct Expression;
typedef std::unique_ptr<Expression> Expression_;

// Base class of every AST expression node.
// Nodes are owned through Expression_ (std::unique_ptr<Expression>) and are
// therefore destroyed through a base-class pointer: the destructor MUST be
// virtual, otherwise that delete is undefined behavior and derived members
// (including the nested `children` of sub-nodes) are never destroyed.
struct Expression{
    std::vector<Expression_> children;   // owned sub-expressions
    virtual ~Expression() = default;     // required: deleted via base pointer
    virtual Str to_string() const = 0;   // debug/diagnostic rendering of the node
};
// An identifier reference together with the scope it resolves in.
struct NameExpr: Expression{
Str name;        // identifier text
NameScope scope; // resolution scope (e.g. local vs global)
NameExpr(Str name, NameScope scope): name(name), scope(scope) {}
Str to_string() const override { return name; }
};
// A parenthesized sub-expression; owns the wrapped expression directly.
struct GroupExpr: Expression{
Expression_ expr;
GroupExpr(Expression_ expr): expr(std::move(expr)) {}
Str to_string() const override { return "()"; }
};
// A prefix unary operator, identified by its token index.
struct UnaryExpr: Expression{
TokenIndex op;
UnaryExpr(TokenIndex op): op(op) {}
Str to_string() const override { return TK_STR(op); }
};
// Logical negation: `not x`.
struct NotExpr: Expression{
Str to_string() const override { return "not"; }
};
// Short-circuit conjunction: `a and b`.
struct AndExpr: Expression{
Str to_string() const override { return "and"; }
};
// Short-circuit disjunction: `a or b`.
struct OrExpr: Expression{
Str to_string() const override { return "or"; }
};
// None, True, False, ...
struct SpecialValueExpr: Expression{
TokenIndex token;  // which special-value token this node stands for
SpecialValueExpr(TokenIndex token): token(token) {}
Str to_string() const override { return TK_STR(token); }
};
// @num, @str which needs to invoke OP_LOAD_CONST
struct LiteralExpr: Expression{
PyObject* value;  // the constant object to load
LiteralExpr(PyObject* value): value(value) {}
Str to_string() const override { return "literal"; }
};
// A list display, e.g. `[a, b, c]`.
struct ListExpr: Expression{
Str to_string() const override { return "[]"; }
};
// A dict display, e.g. `{k: v}`.
struct DictExpr: Expression{
Str to_string() const override { return "{}"; }
};
// A lambda expression.
struct LambdaExpr: Expression{
Str to_string() const override { return "lambda"; }
};
// An f-string literal.
struct FStringExpr: Expression{
Str to_string() const override { return "@fstr"; }
};
// An attribute access, e.g. `obj.attr`.
struct AttribExpr: Expression{
Str to_string() const override { return "."; }
};
// A call expression, e.g. `f(x)`.
struct CallExpr: Expression{
Str to_string() const override { return "()"; }
};
// An infix binary operator, identified by its token index.
struct BinaryExpr: Expression{
TokenIndex op;
BinaryExpr(TokenIndex op): op(op) {}
Str to_string() const override { return TK_STR(op); }
};
// A conditional (ternary) expression.
struct TernaryExpr: Expression{
Str to_string() const override { return "?"; }
};
// An assignment expression.
struct AssignExpr: Expression{
Str to_string() const override { return "="; }
};
// A comma-separated (tuple-like) expression.
struct CommaExpr: Expression{
Str to_string() const override { return ","; }
};
} // namespace pkpy

View File

@ -58,7 +58,7 @@ struct Frame {
}
PyObject* pop(){
#if PK_EXTRA_CHECK
#if DEBUG_EXTRA_CHECK
if(_data.empty()) throw std::runtime_error("_data.empty() is true");
#endif
PyObject* v = _data.back();
@ -67,7 +67,7 @@ struct Frame {
}
void _pop(){
#if PK_EXTRA_CHECK
#if DEBUG_EXTRA_CHECK
if(_data.empty()) throw std::runtime_error("_data.empty() is true");
#endif
_data.pop_back();
@ -88,14 +88,14 @@ struct Frame {
}
PyObject*& top(){
#if PK_EXTRA_CHECK
#if DEBUG_EXTRA_CHECK
if(_data.empty()) throw std::runtime_error("_data.empty() is true");
#endif
return _data.back();
}
PyObject*& top_1(){
#if PK_EXTRA_CHECK
#if DEBUG_EXTRA_CHECK
if(_data.size() < 2) throw std::runtime_error("_data.size() < 2");
#endif
return _data[_data.size()-2];

View File

@ -67,9 +67,9 @@ struct ManagedHeap{
~ManagedHeap(){
for(PyObject* obj: _no_gc) delete obj;
for(auto& [type, count]: deleted){
std::cout << "GC: " << type << "=" << count << std::endl;
}
// for(auto& [type, count]: deleted){
// std::cout << "GC: " << type << "=" << count << std::endl;
// }
}
int sweep(VM* vm){

510
src/lexer.h Normal file
View File

@ -0,0 +1,510 @@
#pragma once
#include "common.h"
#include "error.h"
#include "str.h"
namespace pkpy{
// Index of a token kind; fits in one byte, indexes into kTokens.
typedef uint8_t TokenIndex;
// Spelling table of every token kind. A token's TokenIndex is its position
// in this array, so the order here is part of the lexer's ABI.
constexpr const char* kTokens[] = {
"@eof", "@eol", "@sof",
".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%", "::",
"+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
"<<", ">>", "&", "|", "^", "?", "@",
"==", "!=", ">=", "<=",
"+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=", ">>=", "<<=",
/** KW_BEGIN **/
"class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
"None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
"goto", "label", // extended keywords, not available in cpython
"while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
/** KW_END **/
"is not", "not in",
"@id", "@num", "@str", "@fstr",
"@indent", "@dedent"
};
// Literal payload carried by a token: nothing, integer, float, or string.
using TokenValue = std::variant<std::monostate, i64, f64, Str>;
const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
// Map a token's spelling to its index in kTokens.
// Evaluates at compile time for literal arguments; hits UNREACHABLE() if the
// spelling is not a known token.
constexpr TokenIndex TK(const char token[]) {
    for(int k = 0; k < kTokenCount; k++){
        const char* a = kTokens[k];
        const char* b = token;
        while(*a != '\0' && *b != '\0' && *a == *b){ a++; b++; }
        // both strings ended together => exact match
        if(*a == '\0' && *b == '\0') return k;
    }
    UNREACHABLE();
}
// Spelling of a token kind.
#define TK_STR(t) kTokens[t]
// Keywords occupy a contiguous range of kTokens, delimited by the
// KW_BEGIN/KW_END markers in the table above.
const TokenIndex kTokenKwBegin = TK("class");
const TokenIndex kTokenKwEnd = TK("raise");
// Lookup from keyword spelling to its TokenIndex, built once at startup.
const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
std::map<std::string_view, TokenIndex> map;
for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
return map;
}();
// A single lexed token: its kind, the source span it covers, the line it
// starts on, and an optional literal payload.
struct Token{
    TokenIndex type;    // index into kTokens
    const char* start;  // first byte of the token in the source buffer
    int length;         // byte length of the token's source text
    int line;           // source line the token begins on
    TokenValue value;   // payload for @num/@str/@fstr, monostate otherwise

    // Raw source text of this token.
    Str str() const { return Str(start, length);}

    // Human-readable description for debugging, e.g. `3: @id 'foo'`.
    Str info() const {
        Str text = str();
        if (text == Str("\n")) text = "\\n";  // render newline tokens visibly
        StrStream ss;
        ss << line << ": " << TK_STR(type) << " '" << text << "'";
        return ss.str();
    }
};
// https://docs.python.org/3/reference/expressions.html
// Binding strength used by the Pratt parser; higher values bind tighter.
enum Precedence {
PREC_NONE,
PREC_ASSIGNMENT, // =
PREC_COMMA, // ,
PREC_TERNARY, // ?:
PREC_LOGICAL_OR, // or
PREC_LOGICAL_AND, // and
PREC_LOGICAL_NOT, // not
PREC_EQUALITY, // == !=
PREC_TEST, // in / is / is not / not in
PREC_COMPARISION, // < > <= >=
PREC_BITWISE_OR, // |
PREC_BITWISE_XOR, // ^
PREC_BITWISE_AND, // &
PREC_BITWISE_SHIFT, // << >>
PREC_TERM, // + -
PREC_FACTOR, // * / % //
PREC_UNARY, // - not
PREC_EXPONENT, // **
PREC_CALL, // ()
PREC_SUBSCRIPT, // []
PREC_ATTRIB, // .index
PREC_PRIMARY,
};
// Flavor of a string literal: plain, raw (no escapes), or f-string.
enum StringType { NORMAL_STRING, RAW_STRING, F_STRING };
struct Lexer {
shared_ptr<SourceData> src; // source buffer + metadata being lexed
const char* token_start; // first byte of the token currently being scanned
const char* curr_char; // lexing cursor into the source buffer
int current_line = 1; // 1-based line number of curr_char
std::vector<Token> nexts; // tokens produced so far
stack<int> indents; // stack of active indentation widths
int brackets_level = 0; // nesting depth of (), [], {}; >0 disables indentation handling
bool used = false; // guards against running the lexer twice
// Look at the current character without consuming it.
char peekchar() const{ return *curr_char; }
// If the next n characters all equal c0, consume them and return true;
// otherwise consume nothing and return false.
bool match_n_chars(int n, char c0){
    const char* p = curr_char;
    for(int i = 0; i < n; i++){
        if(p[i] == '\0' || p[i] != c0) return false;
    }
    // consume through the newline-aware path so line accounting stays right
    for(int i = 0; i < n; i++) eatchar_include_newline();
    return true;
}
// Consume a run of spaces/tabs and return its indentation width
// (a tab counts as 4 columns).
int eat_spaces(){
    int width = 0;
    for(;;){
        char c = peekchar();
        if(c == ' ') width += 1;
        else if(c == '\t') width += 4;
        else return width;
        eatchar();
    }
}
// Consume the leading whitespace of a logical line and emit @indent/@dedent
// tokens as the indentation level changes.
// Returns false on an indentation error: a dedent that does not land exactly
// on a previously seen indentation level.
bool eat_indentation(){
// inside (), [] or {} indentation is not significant
if(brackets_level > 0) return true;
int spaces = eat_spaces();
// a comment-only line contributes no indentation of its own
if(peekchar() == '#') skip_line_comment();
// blank (or comment-only) lines never change the indent level
if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
// https://docs.python.org/3/reference/lexical_analysis.html#indentation
if(spaces > indents.top()){
indents.push(spaces);
nexts.push_back(Token{TK("@indent"), token_start, 0, current_line});
} else if(spaces < indents.top()){
// pop levels, emitting one @dedent per pop, until we reach `spaces`
while(spaces < indents.top()){
indents.pop();
nexts.push_back(Token{TK("@dedent"), token_start, 0, current_line});
}
// the dedent must match an enclosing level exactly
if(spaces != indents.top()){
return false;
}
}
return true;
}
// Consume one character and return it.
// Newlines are rejected here so that line accounting cannot be bypassed;
// callers that may legally consume '\n' use eatchar_include_newline().
char eatchar() {
char c = peekchar();
if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
curr_char++;
return c;
}
// Consume one character, updating line bookkeeping when it is a '\n'.
char eatchar_include_newline() {
char c = peekchar();
curr_char++;
if (c == '\n'){
current_line++;
// remember where the new line starts, for error snapshots
src->line_starts.push_back(curr_char);
}
return c;
}
// Lex an identifier or keyword starting at the character just consumed by
// the caller (hence the initial curr_char--). Accepts ASCII [A-Za-z0-9_]
// plus multi-byte UTF-8 characters in the Unicode Lo category.
// Return codes (mapped to SyntaxErrors by the caller):
//   0 = ok (token emitted), 1 = invalid char, 2 = invalid utf8 sequence,
//   3 = empty identifier, 4 = invalid JSON token (JSON_MODE only).
int eat_name() {
curr_char--;
while(true){
uint8_t c = peekchar();
int u8bytes = 0;
// classify the byte by its UTF-8 leading-byte pattern
if((c & 0b10000000) == 0b00000000) u8bytes = 1;
else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
else return 1;
if(u8bytes == 1){
if(isalpha(c) || c=='_' || isdigit(c)) {
curr_char++;
continue;
}else{
break;
}
}
// handle multibyte char
// NOTE(review): this always copies u8bytes bytes, so the size check below
// can never fail; a truncated sequence at end-of-buffer would read past
// the terminator rather than return 2 — TODO confirm intent.
std::string u8str(curr_char, u8bytes);
if(u8str.size() != u8bytes) return 2;
// decode the code point from the leading and continuation bytes
uint32_t value = 0;
for(int k=0; k < u8bytes; k++){
uint8_t b = u8str[k];
if(k==0){
if(u8bytes == 2) value = (b & 0b00011111) << 6;
else if(u8bytes == 3) value = (b & 0b00001111) << 12;
else if(u8bytes == 4) value = (b & 0b00000111) << 18;
}else{
value |= (b & 0b00111111) << (6*(u8bytes-k-1));
}
}
// only accept code points in the Unicode Lo category as identifier chars
if(is_unicode_Lo_char(value)) curr_char += u8bytes;
else break;
}
int length = (int)(curr_char - token_start);
if(length == 0) return 3;
std::string_view name(token_start, length);
if(src->mode == JSON_MODE){
// JSON only knows three bare words; anything else is an error
if(name == "true"){
add_token(TK("True"));
} else if(name == "false"){
add_token(TK("False"));
} else if(name == "null"){
add_token(TK("None"));
} else {
return 4;
}
return 0;
}
if(kTokenKwMap.count(name)){
// fuse the two-word operators "not in" / "is not" into single tokens
if(name == "not"){
if(strncmp(curr_char, " in", 3) == 0){
curr_char += 3;
add_token(TK("not in"));
return 0;
}
}else if(name == "is"){
if(strncmp(curr_char, " not", 4) == 0){
curr_char += 4;
add_token(TK("is not"));
return 0;
}
}
add_token(kTokenKwMap.at(name));
} else {
add_token(TK("@id"));
}
return 0;
}
// Skip characters up to (but not including) the terminating '\n' or EOF.
void skip_line_comment() {
    while (true) {
        char c = peekchar();
        if (c == '\0' || c == '\n') return;
        eatchar();
    }
}
bool matchchar(char c) {
    // Conditionally consume one character: advance only on a match.
    if(peekchar() == c){
        eatchar_include_newline();
        return true;
    }
    return false;
}
void add_token(TokenIndex type, TokenValue value={}) {
    // Maintain the bracket nesting depth so eat_indentation() can ignore
    // line breaks that occur inside (), [] and {}.
    if(type == TK("{") || type == TK("[") || type == TK("(")) brackets_level++;
    else if(type == TK(")") || type == TK("]") || type == TK("}")) brackets_level--;
    // @eol is emitted after its '\n' was consumed, so report the previous line.
    int line = current_line;
    if(type == TK("@eol")) line--;
    nexts.push_back(Token{type, token_start, (int)(curr_char - token_start), line, value});
}
void add_token_2(char c, TokenIndex one, TokenIndex two) {
    // Emit `two` if the next char is `c` (consuming it), otherwise `one`.
    add_token(matchchar(c) ? two : one);
}
Str eat_string_until(char quote, bool raw) {
    // Consume characters until the closing quote. The opening quote was
    // already eaten by the caller; two more of the same quote right here
    // mean a triple-quoted (multi-line) string. `raw` disables escapes.
    bool quote3 = match_n_chars(2, quote);
    std::vector<char> buff;
    while (true) {
        char c = eatchar_include_newline();
        if (c == quote){
            // Inside a triple-quoted string a lone quote char is literal;
            // only a full run of three closes the string.
            if(quote3 && !match_n_chars(2, quote)){
                buff.push_back(c);
                continue;
            }
            break;
        }
        if (c == '\0'){
            // Unterminated triple-quoted string in the REPL: ask the host
            // for more input lines instead of failing outright.
            if(quote3 && src->mode == REPL_MODE){
                throw NeedMoreLines(false);
            }
            SyntaxError("EOL while scanning string literal");
        }
        if (c == '\n'){
            // Bare newlines are only legal inside triple-quoted strings.
            if(!quote3) SyntaxError("EOL while scanning string literal");
            else{
                buff.push_back(c);
                continue;
            }
        }
        if (!raw && c == '\\') {
            // Translate the supported escape sequences.
            switch (eatchar_include_newline()) {
                case '"': buff.push_back('"'); break;
                case '\'': buff.push_back('\''); break;
                case '\\': buff.push_back('\\'); break;
                case 'n': buff.push_back('\n'); break;
                case 'r': buff.push_back('\r'); break;
                case 't': buff.push_back('\t'); break;
                default: SyntaxError("invalid escape char");
            }
        } else {
            buff.push_back(c);
        }
    }
    return Str(buff.data(), buff.size());
}
void eat_string(char quote, StringType type) {
    // Lex a string literal. F-strings keep their raw text and are tagged
    // @fstr so the compiler can interpolate them later; raw strings skip
    // escape processing inside eat_string_until().
    Str s = eat_string_until(quote, type == RAW_STRING);
    add_token(type == F_STRING ? TK("@fstr") : TK("@str"), s);
}
void eat_number() {
    // Lex a numeric literal: decimal int, 0x-prefixed hex int, or a float
    // with a fractional part. The first digit was already consumed by
    // lex_one_token().
    // NOTE(review): the character class matches 'a'-'f' even without a
    // "0x" prefix, so input like "1e5" partially parses and then hits
    // UNREACHABLE(); scientific notation is presumably unsupported by
    // design -- confirm.
    static const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?");
    std::smatch m;
    // Limit the regex to the rest of the current line.
    const char* i = token_start;
    while(*i != '\n' && *i != '\0') i++;
    std::string s = std::string(token_start, i);
    try{
        if (std::regex_search(s, m, pattern)) {
            // here is m.length()-1, since the first char was eaten by lex_token()
            for(int j=0; j<m.length()-1; j++) eatchar();
            int base = 10;
            size_t size;    // set by S_TO_INT/S_TO_FLOAT: number of chars consumed
            if (m[1].matched) base = 16;
            if (m[2].matched) {
                if(base == 16) SyntaxError("hex literal should not contain a dot");
                add_token(TK("@num"), S_TO_FLOAT(m[0], &size));
            } else {
                add_token(TK("@num"), S_TO_INT(m[0], &size, base));
            }
            // The conversion must consume exactly what the regex matched.
            if (size != m.length()) UNREACHABLE();
        }
    }catch(std::exception& _){
        // std::stoi/std::stof throw on overflow or malformed input.
        SyntaxError("invalid number literal");
    }
}
bool lex_one_token() {
    // Scan forward and push at least one token into `nexts`.
    // Returns false only after @eof has been emitted.
    while (peekchar() != '\0') {
        token_start = curr_char;
        char c = eatchar_include_newline();
        switch (c) {
            case '\'': case '"': eat_string(c, NORMAL_STRING); return true;
            case '#': skip_line_comment(); break;
            case '{': add_token(TK("{")); return true;
            case '}': add_token(TK("}")); return true;
            case ',': add_token(TK(",")); return true;
            case ':': add_token_2(':', TK(":"), TK("::")); return true;
            case ';': add_token(TK(";")); return true;
            case '(': add_token(TK("(")); return true;
            case ')': add_token(TK(")")); return true;
            case '[': add_token(TK("[")); return true;
            case ']': add_token(TK("]")); return true;
            case '@': add_token(TK("@")); return true;
            case '%': add_token_2('=', TK("%"), TK("%=")); return true;
            case '&': add_token_2('=', TK("&"), TK("&=")); return true;
            case '|': add_token_2('=', TK("|"), TK("|=")); return true;
            case '^': add_token_2('=', TK("^"), TK("^=")); return true;
            case '?': add_token(TK("?")); return true;
            case '.': {
                // Either "." or "..."; a bare ".." is rejected.
                if(matchchar('.')) {
                    if(matchchar('.')) {
                        add_token(TK("..."));
                    } else {
                        SyntaxError("invalid token '..'");
                    }
                } else {
                    add_token(TK("."));
                }
                return true;
            }
            case '=': add_token_2('=', TK("="), TK("==")); return true;
            case '+': add_token_2('=', TK("+"), TK("+=")); return true;
            case '>': {
                if(matchchar('=')) add_token(TK(">="));
                else if(matchchar('>')) add_token_2('=', TK(">>"), TK(">>="));
                else add_token(TK(">"));
                return true;
            }
            case '<': {
                if(matchchar('=')) add_token(TK("<="));
                else if(matchchar('<')) add_token_2('=', TK("<<"), TK("<<="));
                else add_token(TK("<"));
                return true;
            }
            case '-': {
                if(matchchar('=')) add_token(TK("-="));
                else if(matchchar('>')) add_token(TK("->"));
                else add_token(TK("-"));
                return true;
            }
            case '!':
                // '!' is only valid as part of '!='; SyntaxError throws.
                if(matchchar('=')) add_token(TK("!="));
                else SyntaxError("expected '=' after '!'");
                break;
            case '*':
                if (matchchar('*')) {
                    add_token(TK("**")); // '**'
                } else {
                    add_token_2('=', TK("*"), TK("*="));
                }
                return true;
            case '/':
                if(matchchar('/')) {
                    add_token_2('=', TK("//"), TK("//="));
                } else {
                    add_token_2('=', TK("/"), TK("/="));
                }
                return true;
            case '\r': break; // just ignore '\r'
            case ' ': case '\t': eat_spaces(); break;
            case '\n': {
                // Emit @eol, then resolve the next line's indentation into
                // @indent/@dedent tokens.
                add_token(TK("@eol"));
                if(!eat_indentation()) IndentationError("unindent does not match any outer indentation level");
                return true;
            }
            default: {
                // f"" / r"" string prefixes.
                if(c == 'f'){
                    if(matchchar('\'')) {eat_string('\'', F_STRING); return true;}
                    if(matchchar('"')) {eat_string('"', F_STRING); return true;}
                }else if(c == 'r'){
                    if(matchchar('\'')) {eat_string('\'', RAW_STRING); return true;}
                    if(matchchar('"')) {eat_string('"', RAW_STRING); return true;}
                }
                if (c >= '0' && c <= '9') {
                    eat_number();
                    return true;
                }
                // Fall through to identifier/keyword scanning; map its
                // error codes to diagnostics.
                switch (eat_name())
                {
                    case 0: break;
                    case 1: SyntaxError("invalid char: " + std::string(1, c));
                    case 2: SyntaxError("invalid utf8 sequence: " + std::string(1, c));
                    case 3: SyntaxError("@id contains invalid char"); break;
                    case 4: SyntaxError("invalid JSON token"); break;
                    default: UNREACHABLE();
                }
                return true;
            }
        }
    }
    // End of input: close open indentation blocks one @dedent per call
    // (the `while` always returns on its first iteration), then @eof.
    token_start = curr_char;
    while(indents.size() > 1){
        indents.pop();
        add_token(TK("@dedent"));
        return true;
    }
    add_token(TK("@eof"));
    return false;
}
/***** Error Reporter *****/
void throw_err(Str type, Str msg){
    // If the cursor sits on a newline we have already advanced past the
    // offending token's line, so step both line and cursor back by one
    // before reporting.
    bool at_newline = (peekchar() == '\n');
    int lineno = at_newline ? current_line - 1 : current_line;
    const char* cursor = at_newline ? curr_char - 1 : curr_char;
    throw_err(type, msg, lineno, cursor);
}
void throw_err(Str type, Str msg, int lineno, const char* cursor){
    // Build and throw a lexer error of the given type, attaching a source
    // snapshot (line + caret position) for the traceback.
    // Fix: honor the `type` parameter instead of hard-coding "SyntaxError",
    // so IndentationError() actually reports as an IndentationError.
    auto e = Exception(type, msg);
    e.st_push(src->snapshot(lineno, cursor));
    throw e;
}
// Convenience wrappers: raise a lexer error with the proper type/message.
void SyntaxError(Str msg){ throw_err("SyntaxError", msg); }
void SyntaxError(){ throw_err("SyntaxError", "invalid syntax"); }
void IndentationError(Str msg){ throw_err("IndentationError", msg); }
Lexer(shared_ptr<SourceData> src) {
    // Point both cursors at the start of the source buffer and seed the
    // token stream with @sof plus the base (zero) indentation level.
    this->src = src;
    token_start = curr_char = src->source;
    nexts.push_back(Token{TK("@sof"), token_start, 0, current_line});
    indents.push(0);
}
std::vector<Token> run() {
    // One-shot entry point: lexing consumes the stream state, so calling
    // run() twice on the same Lexer is a programming error.
    if(used) UNREACHABLE();
    used = true;
    bool more = true;
    while(more) more = lex_one_token();
    return std::move(nexts);
}
};
} // namespace pkpy

View File

@ -1,302 +0,0 @@
#pragma once
#include "error.h"
#include "obj.h"
namespace pkpy{
typedef uint8_t TokenIndex;
// Master token table. ORDER MATTERS: TK() resolves a spelling to its
// index at compile time, and keyword handling relies on the contiguous
// KW_BEGIN..KW_END range below.
constexpr const char* kTokens[] = {
    "@error", "@eof", "@eol", "@sof",
    ".", ",", ":", ";", "#", "(", ")", "[", "]", "{", "}", "%", "::",
    "+", "-", "*", "/", "//", "**", "=", ">", "<", "...", "->",
    "<<", ">>", "&", "|", "^", "?", "@",
    "==", "!=", ">=", "<=",
    "+=", "-=", "*=", "/=", "//=", "%=", "&=", "|=", "^=", ">>=", "<<=",
    /** KW_BEGIN **/
    "class", "import", "as", "def", "lambda", "pass", "del", "from", "with", "yield",
    "None", "in", "is", "and", "or", "not", "True", "False", "global", "try", "except", "finally",
    "goto", "label", // extended keywords, not available in cpython
    "while", "for", "if", "elif", "else", "break", "continue", "return", "assert", "raise",
    /** KW_END **/
    "is not", "not in",
    "@id", "@num", "@str", "@fstr",
    "@indent", "@dedent"
};

const TokenIndex kTokenCount = sizeof(kTokens) / sizeof(kTokens[0]);
// Resolve a token's spelling to its index in kTokens at compile time.
// Unknown spellings are a programming error (UNREACHABLE).
constexpr TokenIndex TK(const char token[]) {
    for(int k=0; k<kTokenCount; k++){
        const char* a = kTokens[k];
        const char* b = token;
        while(*a && *b && *a == *b) { a++; b++; }
        if(*a == '\0' && *b == '\0') return k;
    }
    UNREACHABLE();
}
#define TK_STR(t) kTokens[t]
// Keyword index range inside kTokens (see the KW_BEGIN/KW_END markers).
const TokenIndex kTokenKwBegin = TK("class");
const TokenIndex kTokenKwEnd = TK("raise");
// Spelling -> token index map for every keyword, built once at startup.
const std::map<std::string_view, TokenIndex> kTokenKwMap = [](){
    std::map<std::string_view, TokenIndex> map;
    for(int k=kTokenKwBegin; k<=kTokenKwEnd; k++) map[kTokens[k]] = k;
    return map;
}();
// A single lexed token. `start` points into the source buffer and is not
// owned by the token.
struct Token{
    TokenIndex type;    // index into kTokens
    const char* start;  // first byte of the token's text in the source
    int length;         // token length in bytes
    int line;           // 1-based source line number
    PyObject* value;    // literal payload if any (e.g. @num/@str); else nullptr
    Str str() const { return Str(start, length);}
    // Human-readable "line: kind 'text'" form for debugging.
    Str info() const {
        StrStream ss;
        Str raw = str();
        if (raw == Str("\n")) raw = "\\n";
        ss << line << ": " << TK_STR(type) << " '" << raw << "'";
        return ss.str();
    }
};
// https://docs.python.org/3/reference/expressions.html
// Operator binding powers for Pratt-style expression parsing, ordered
// from weakest (PREC_NONE) to strongest (PREC_PRIMARY).
enum Precedence {
  PREC_NONE,
  PREC_ASSIGNMENT,    // =
  PREC_COMMA,         // ,
  PREC_TERNARY,       // ?:
  PREC_LOGICAL_OR,    // or
  PREC_LOGICAL_AND,   // and
  PREC_LOGICAL_NOT,   // not
  PREC_EQUALITY,      // == !=
  PREC_TEST,          // in / is / is not / not in
  PREC_COMPARISION,   // < > <= >=
  PREC_BITWISE_OR,    // |
  PREC_BITWISE_XOR,   // ^
  PREC_BITWISE_AND,   // &
  PREC_BITWISE_SHIFT, // << >>
  PREC_TERM,          // + -
  PREC_FACTOR,        // * / % //
  PREC_UNARY,         // - not
  PREC_EXPONENT,      // **
  PREC_CALL,          // ()
  PREC_SUBSCRIPT,     // []
  PREC_ATTRIB,        // .index
  PREC_PRIMARY,
};
// The context of the parsing phase for the compiler.
// NOTE(review): this is the legacy on-demand lexer embedded in the parser;
// the commit replaces it with the standalone Lexer class, which buffers
// all tokens up front but keeps the same scanning logic.
struct Parser {
    shared_ptr<SourceData> src;
    const char* token_start;    // first char of the token in progress
    const char* curr_char;      // read cursor into src->source
    int current_line = 1;
    Token prev, curr;           // parser lookahead state
    queue<Token> nexts;         // tokens scanned but not yet consumed
    stack<int> indents;         // open indentation widths; bottom is always 0
    int brackets_level = 0;     // >0 inside (), [] or {}: indentation is ignored

    // Pop the next token, synthesizing @dedent tokens before @eof so every
    // open indentation level is closed. Empty queue yields @error.
    Token next_token(){
        if(nexts.empty()){
            return Token{TK("@error"), token_start, (int)(curr_char - token_start), current_line};
        }
        Token t = nexts.front();
        if(t.type == TK("@eof") && indents.size()>1){
            nexts.pop();
            indents.pop();
            return Token{TK("@dedent"), token_start, 0, current_line};
        }
        nexts.pop();
        return t;
    }

    char peekchar() const{ return *curr_char; }

    // Consume exactly n consecutive occurrences of c0, or consume nothing.
    bool match_n_chars(int n, char c0){
        const char* c = curr_char;
        for(int i=0; i<n; i++){
            if(*c == '\0') return false;
            if(*c != c0) return false;
            c++;
        }
        for(int i=0; i<n; i++) eatchar_include_newline();
        return true;
    }

    // Measure leading whitespace width (a tab counts as 4 columns).
    int eat_spaces(){
        int count = 0;
        while (true) {
            switch (peekchar()) {
                case ' ' : count+=1; break;
                case '\t': count+=4; break;
                default: return count;
            }
            eatchar();
        }
    }

    // Queue @indent/@dedent tokens after a newline.
    // Returns false on an inconsistent dedent.
    bool eat_indentation(){
        if(brackets_level > 0) return true;     // newlines inside brackets are not logical
        int spaces = eat_spaces();
        if(peekchar() == '#') skip_line_comment();
        // Blank and comment-only lines do not affect indentation.
        if(peekchar() == '\0' || peekchar() == '\n' || peekchar() == '\r') return true;
        // https://docs.python.org/3/reference/lexical_analysis.html#indentation
        if(spaces > indents.top()){
            indents.push(spaces);
            nexts.push(Token{TK("@indent"), token_start, 0, current_line});
        } else if(spaces < indents.top()){
            while(spaces < indents.top()){
                indents.pop();
                nexts.push(Token{TK("@dedent"), token_start, 0, current_line});
            }
            if(spaces != indents.top()){
                return false;
            }
        }
        return true;
    }

    char eatchar() {
        char c = peekchar();
        // Newlines must go through eatchar_include_newline() for bookkeeping.
        if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
        curr_char++;
        return c;
    }

    char eatchar_include_newline() {
        char c = peekchar();
        curr_char++;
        if (c == '\n'){
            current_line++;
            src->line_starts.push_back(curr_char);
        }
        return c;
    }

    // Scan an identifier/keyword starting at token_start.
    // Return codes: 0 ok, 1 invalid leading byte, 2 truncated utf8,
    // 3 empty name, 4 invalid JSON token.
    int eat_name() {
        curr_char--;    // the first char was already eaten by the caller
        while(true){
            uint8_t c = peekchar();
            // Derive the utf8 sequence length from the leading byte.
            int u8bytes = 0;
            if((c & 0b10000000) == 0b00000000) u8bytes = 1;
            else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
            else if((c & 0b11110000) == 0b11100000) u8bytes = 3;
            else if((c & 0b11111000) == 0b11110000) u8bytes = 4;
            else return 1;
            if(u8bytes == 1){
                if(isalpha(c) || c=='_' || isdigit(c)) {
                    curr_char++;
                    continue;
                }else{
                    break;
                }
            }
            // handle multibyte char
            std::string u8str(curr_char, u8bytes);
            if(u8str.size() != u8bytes) return 2;
            // Decode the code point to test unicode identifier validity.
            uint32_t value = 0;
            for(int k=0; k < u8bytes; k++){
                uint8_t b = u8str[k];
                if(k==0){
                    if(u8bytes == 2) value = (b & 0b00011111) << 6;
                    else if(u8bytes == 3) value = (b & 0b00001111) << 12;
                    else if(u8bytes == 4) value = (b & 0b00000111) << 18;
                }else{
                    value |= (b & 0b00111111) << (6*(u8bytes-k-1));
                }
            }
            if(is_unicode_Lo_char(value)) curr_char += u8bytes;
            else break;
        }
        int length = (int)(curr_char - token_start);
        if(length == 0) return 3;
        std::string_view name(token_start, length);
        // JSON mode only admits the three literal names.
        if(src->mode == JSON_MODE){
            if(name == "true"){
                set_next_token(TK("True"));
            } else if(name == "false"){
                set_next_token(TK("False"));
            } else if(name == "null"){
                set_next_token(TK("None"));
            } else {
                return 4;
            }
            return 0;
        }
        if(kTokenKwMap.count(name)){
            // Fuse "not in" / "is not" into single tokens.
            if(name == "not"){
                if(strncmp(curr_char, " in", 3) == 0){
                    curr_char += 3;
                    set_next_token(TK("not in"));
                    return 0;
                }
            }else if(name == "is"){
                if(strncmp(curr_char, " not", 4) == 0){
                    curr_char += 4;
                    set_next_token(TK("is not"));
                    return 0;
                }
            }
            set_next_token(kTokenKwMap.at(name));
        } else {
            set_next_token(TK("@id"));
        }
        return 0;
    }

    // Skip to (but not past) the end of the current line.
    void skip_line_comment() {
        char c;
        while ((c = peekchar()) != '\0') {
            if (c == '\n') return;
            eatchar();
        }
    }

    // Conditionally consume one character: advance only on a match.
    bool matchchar(char c) {
        if (peekchar() != c) return false;
        eatchar_include_newline();
        return true;
    }

    // Queue a token spanning [token_start, curr_char), tracking bracket depth.
    void set_next_token(TokenIndex type, PyObject* value=nullptr) {
        switch(type){
            case TK("{"): case TK("["): case TK("("): brackets_level++; break;
            case TK(")"): case TK("]"): case TK("}"): brackets_level--; break;
        }
        nexts.push( Token{
            type,
            token_start,
            (int)(curr_char - token_start),
            current_line - ((type == TK("@eol")) ? 1 : 0),  // '\n' already consumed
            value
        });
    }

    // Queue `two` if the next char is `c` (consuming it), otherwise `one`.
    void set_next_token_2(char c, TokenIndex one, TokenIndex two) {
        if (matchchar(c)) set_next_token(two);
        else set_next_token(one);
    }

    Parser(shared_ptr<SourceData> src) {
        // Seed the token stream with @sof and the base indentation level.
        this->src = src;
        this->token_start = src->source;
        this->curr_char = src->source;
        this->nexts.push(Token{TK("@sof"), token_start, 0, current_line});
        this->indents.push(0);
    }
};
} // namespace pkpy

View File

@ -760,6 +760,7 @@ inline void add_module_gc(VM* vm){
inline void VM::post_init(){
init_builtins(this);
#if !DEBUG_NO_BUILTIN_MODULES
add_module_sys(this);
add_module_time(this);
add_module_json(this);
@ -793,6 +794,7 @@ inline void VM::post_init(){
const PyTypeInfo& info = vm->_all_types[OBJ_GET(Type, args[0])];
return VAR(info.name);
}));
#endif
}
} // namespace pkpy

View File

@ -93,7 +93,7 @@ public:
}
Frame* top_frame() const {
#if PK_EXTRA_CHECK
#if DEBUG_EXTRA_CHECK
if(callstack.empty()) UNREACHABLE();
#endif
return callstack.top().get();
@ -166,7 +166,7 @@ public:
if(_module == nullptr) _module = _main;
try {
CodeObject_ code = compile(source, filename, mode);
if(_module == _main) std::cout << disassemble(code) << '\n';
// if(_module == _main) std::cout << disassemble(code) << '\n';
return _exec(code, _module);
}catch (const Exception& e){
*_stderr << e.summary() << '\n';