This commit is contained in:
blueloveTH 2023-01-12 14:07:18 +08:00
parent b95417c5d8
commit cdc88afa23
2 changed files with 103 additions and 117 deletions

View File

@ -24,16 +24,10 @@ public:
bool isCompilingClass = false; bool isCompilingClass = false;
int lexingCnt = 0; int lexingCnt = 0;
VM* vm; VM* vm;
emhash8::HashMap<_TokenType, GrammarRule> rules; emhash8::HashMap<_TokenType, GrammarRule> rules;
// Read-only accessors.
//   co()   returns codes.top(), i.e. the entry currently on top of `codes`.
//   mode() returns the CompileMode stored on the parser's source (parser->src->mode).
_Code co() { _Code co() const{ return codes.top(); }
return codes.top(); CompileMode mode() const{ return parser->src->mode;}
}
CompileMode mode() {
return parser->src->mode;
}
Compiler(VM* vm, const char* source, _Str filename, CompileMode mode){ Compiler(VM* vm, const char* source, _Str filename, CompileMode mode){
this->vm = vm; this->vm = vm;
@ -99,7 +93,7 @@ public:
#undef NO_INFIX #undef NO_INFIX
#define EXPR() parsePrecedence(PREC_TERNARY) // no '=' and ',' just a simple expression #define EXPR() parsePrecedence(PREC_TERNARY) // no '=' and ',' just a simple expression
#define EXPR_TUPLE() parsePrecedence(PREC_COMMA) // no '=', but ',' is allowed #define EXPR_TUPLE() parsePrecedence(PREC_COMMA) // no '=', but ',' is allowed
#define EXPR_ANY() parsePrecedence(PREC_ASSIGNMENT) #define EXPR_ANY() parsePrecedence(PREC_ASSIGNMENT)
} }
@ -108,19 +102,19 @@ public:
std::string_view sv = parser->lookahead(2); std::string_view sv = parser->lookahead(2);
if(sv.size() == 2 && sv[0] == quote && sv[1] == quote) { if(sv.size() == 2 && sv[0] == quote && sv[1] == quote) {
quote3 = true; quote3 = true;
parser->eatChar(); parser->eatchar();
parser->eatChar(); parser->eatchar();
} }
std::vector<char> buff; std::vector<char> buff;
while (true) { while (true) {
char c = parser->eatCharIncludeNewLine(); char c = parser->eatchar_include_newLine();
if (c == quote){ if (c == quote){
if(quote3){ if(quote3){
sv = parser->lookahead(2); sv = parser->lookahead(2);
if(sv.size() == 2 && sv[0] == quote && sv[1] == quote) { if(sv.size() == 2 && sv[0] == quote && sv[1] == quote) {
parser->eatChar(); parser->eatchar();
parser->eatChar(); parser->eatchar();
break; break;
} }
buff.push_back(c); buff.push_back(c);
@ -142,7 +136,7 @@ public:
} }
} }
if (!raw && c == '\\') { if (!raw && c == '\\') {
switch (parser->eatCharIncludeNewLine()) { switch (parser->eatchar_include_newLine()) {
case '"': buff.push_back('"'); break; case '"': buff.push_back('"'); break;
case '\'': buff.push_back('\''); break; case '\'': buff.push_back('\''); break;
case '\\': buff.push_back('\\'); break; case '\\': buff.push_back('\\'); break;
@ -161,9 +155,9 @@ public:
// Lexes one string literal whose opening quote character is `quote`.
// The body is read with eatStringUntil(); its second argument is true only
// for RAW_STRING. The resulting text is wrapped with vm->PyStr() and handed
// to the parser as an "@fstr" token when type is F_STRING, and as an "@str"
// token for every other StringType.
void eatString(char quote, StringType type) { void eatString(char quote, StringType type) {
_Str s = eatStringUntil(quote, type == RAW_STRING); _Str s = eatStringUntil(quote, type == RAW_STRING);
if(type == F_STRING){ if(type == F_STRING){
parser->setNextToken(TK("@fstr"), vm->PyStr(s)); parser->set_next_token(TK("@fstr"), vm->PyStr(s));
}else{ }else{
parser->setNextToken(TK("@str"), vm->PyStr(s)); parser->set_next_token(TK("@str"), vm->PyStr(s));
} }
} }
@ -177,17 +171,17 @@ public:
try{ try{
if (std::regex_search(s, m, pattern)) { if (std::regex_search(s, m, pattern)) {
// here is m.length()-1, since the first char is eaten by lexToken() // here is m.length()-1, since the first char was eaten by lexToken()
for(int j=0; j<m.length()-1; j++) parser->eatChar(); for(int j=0; j<m.length()-1; j++) parser->eatchar();
int base = 10; int base = 10;
size_t size; size_t size;
if (m[1].matched) base = 16; if (m[1].matched) base = 16;
if (m[2].matched) { if (m[2].matched) {
if(base == 16) syntaxError("hex literal should not contain a dot"); if(base == 16) syntaxError("hex literal should not contain a dot");
parser->setNextToken(TK("@num"), vm->PyFloat(std::stod(m[0], &size))); parser->set_next_token(TK("@num"), vm->PyFloat(std::stod(m[0], &size)));
} else { } else {
parser->setNextToken(TK("@num"), vm->PyInt(std::stoll(m[0], &size, base))); parser->set_next_token(TK("@num"), vm->PyInt(std::stoll(m[0], &size, base)));
} }
if (size != m.length()) throw std::runtime_error("length mismatch"); if (size != m.length()) throw std::runtime_error("length mismatch");
} }
@ -205,94 +199,94 @@ public:
// Lex the next token and set it as the next token. // Lex the next token and set it as the next token.
void _lexToken() { void _lexToken() {
parser->prev = parser->curr; parser->prev = parser->curr;
parser->curr = parser->nextToken(); parser->curr = parser->next_token();
//_Str _info = parser->curr.info(); std::cout << _info << '[' << parser->current_line << ']' << std::endl; //_Str _info = parser->curr.info(); std::cout << _info << '[' << parser->current_line << ']' << std::endl;
while (parser->peek_char() != '\0') { while (parser->peekchar() != '\0') {
parser->token_start = parser->curr_char; parser->token_start = parser->curr_char;
char c = parser->eatCharIncludeNewLine(); char c = parser->eatchar_include_newLine();
switch (c) { switch (c) {
case '\'': case '"': eatString(c, NORMAL_STRING); return; case '\'': case '"': eatString(c, NORMAL_STRING); return;
case '#': parser->skipLineComment(); break; case '#': parser->skip_line_comment(); break;
case '{': parser->setNextToken(TK("{")); return; case '{': parser->set_next_token(TK("{")); return;
case '}': parser->setNextToken(TK("}")); return; case '}': parser->set_next_token(TK("}")); return;
case ',': parser->setNextToken(TK(",")); return; case ',': parser->set_next_token(TK(",")); return;
case ':': parser->setNextToken(TK(":")); return; case ':': parser->set_next_token(TK(":")); return;
case ';': parser->setNextToken(TK(";")); return; case ';': parser->set_next_token(TK(";")); return;
case '(': parser->setNextToken(TK("(")); return; case '(': parser->set_next_token(TK("(")); return;
case ')': parser->setNextToken(TK(")")); return; case ')': parser->set_next_token(TK(")")); return;
case '[': parser->setNextToken(TK("[")); return; case '[': parser->set_next_token(TK("[")); return;
case ']': parser->setNextToken(TK("]")); return; case ']': parser->set_next_token(TK("]")); return;
case '%': parser->setNextTwoCharToken('=', TK("%"), TK("%=")); return; case '%': parser->set_next_token_2('=', TK("%"), TK("%=")); return;
case '&': parser->setNextTwoCharToken('=', TK("&"), TK("&=")); return; case '&': parser->set_next_token_2('=', TK("&"), TK("&=")); return;
case '|': parser->setNextTwoCharToken('=', TK("|"), TK("|=")); return; case '|': parser->set_next_token_2('=', TK("|"), TK("|=")); return;
case '^': parser->setNextTwoCharToken('=', TK("^"), TK("^=")); return; case '^': parser->set_next_token_2('=', TK("^"), TK("^=")); return;
case '?': parser->setNextToken(TK("?")); return; case '?': parser->set_next_token(TK("?")); return;
case '.': { case '.': {
if(parser->matchChar('.')) { if(parser->matchchar('.')) {
if(parser->matchChar('.')) { if(parser->matchchar('.')) {
parser->setNextToken(TK("...")); parser->set_next_token(TK("..."));
} else { } else {
syntaxError("invalid token '..'"); syntaxError("invalid token '..'");
} }
} else { } else {
parser->setNextToken(TK(".")); parser->set_next_token(TK("."));
} }
return; return;
} }
case '=': parser->setNextTwoCharToken('=', TK("="), TK("==")); return; case '=': parser->set_next_token_2('=', TK("="), TK("==")); return;
case '+': parser->setNextTwoCharToken('=', TK("+"), TK("+=")); return; case '+': parser->set_next_token_2('=', TK("+"), TK("+=")); return;
case '>': { case '>': {
if(parser->matchChar('=')) parser->setNextToken(TK(">=")); if(parser->matchchar('=')) parser->set_next_token(TK(">="));
else if(parser->matchChar('>')) parser->setNextToken(TK(">>")); else if(parser->matchchar('>')) parser->set_next_token(TK(">>"));
else parser->setNextToken(TK(">")); else parser->set_next_token(TK(">"));
return; return;
} }
case '<': { case '<': {
if(parser->matchChar('=')) parser->setNextToken(TK("<=")); if(parser->matchchar('=')) parser->set_next_token(TK("<="));
else if(parser->matchChar('<')) parser->setNextToken(TK("<<")); else if(parser->matchchar('<')) parser->set_next_token(TK("<<"));
else parser->setNextToken(TK("<")); else parser->set_next_token(TK("<"));
return; return;
} }
case '-': { case '-': {
if(parser->matchChar('=')) parser->setNextToken(TK("-=")); if(parser->matchchar('=')) parser->set_next_token(TK("-="));
else if(parser->matchChar('>')) parser->setNextToken(TK("->")); else if(parser->matchchar('>')) parser->set_next_token(TK("->"));
else parser->setNextToken(TK("-")); else parser->set_next_token(TK("-"));
return; return;
} }
case '!': case '!':
if(parser->matchChar('=')) parser->setNextToken(TK("!=")); if(parser->matchchar('=')) parser->set_next_token(TK("!="));
else syntaxError("expected '=' after '!'"); else syntaxError("expected '=' after '!'");
break; break;
case '*': case '*':
if (parser->matchChar('*')) { if (parser->matchchar('*')) {
parser->setNextToken(TK("**")); // '**' parser->set_next_token(TK("**")); // '**'
} else { } else {
parser->setNextTwoCharToken('=', TK("*"), TK("*=")); parser->set_next_token_2('=', TK("*"), TK("*="));
} }
return; return;
case '/': case '/':
if(parser->matchChar('/')) { if(parser->matchchar('/')) {
parser->setNextTwoCharToken('=', TK("//"), TK("//=")); parser->set_next_token_2('=', TK("//"), TK("//="));
} else { } else {
parser->setNextTwoCharToken('=', TK("/"), TK("/=")); parser->set_next_token_2('=', TK("/"), TK("/="));
} }
return; return;
case '\r': break; // just ignore '\r' case '\r': break; // just ignore '\r'
case ' ': case '\t': parser->eatSpaces(); break; case ' ': case '\t': parser->eat_spaces(); break;
case '\n': { case '\n': {
parser->setNextToken(TK("@eol")); parser->set_next_token(TK("@eol"));
if(!parser->eatIndentation()) indentationError("unindent does not match any outer indentation level"); if(!parser->eat_indentation()) indentationError("unindent does not match any outer indentation level");
return; return;
} }
default: { default: {
if(c == 'f'){ if(c == 'f'){
if(parser->matchChar('\'')) {eatString('\'', F_STRING); return;} if(parser->matchchar('\'')) {eatString('\'', F_STRING); return;}
if(parser->matchChar('"')) {eatString('"', F_STRING); return;} if(parser->matchchar('"')) {eatString('"', F_STRING); return;}
}else if(c == 'r'){ }else if(c == 'r'){
if(parser->matchChar('\'')) {eatString('\'', RAW_STRING); return;} if(parser->matchchar('\'')) {eatString('\'', RAW_STRING); return;}
if(parser->matchChar('"')) {eatString('"', RAW_STRING); return;} if(parser->matchchar('"')) {eatString('"', RAW_STRING); return;}
} }
if (c >= '0' && c <= '9') { if (c >= '0' && c <= '9') {
@ -300,7 +294,7 @@ public:
return; return;
} }
switch (parser->eatName()) switch (parser->eat_name())
{ {
case 0: break; case 0: break;
case 1: syntaxError("invalid char: " + std::string(1, c)); case 1: syntaxError("invalid char: " + std::string(1, c));
@ -315,7 +309,7 @@ public:
} }
parser->token_start = parser->curr_char; parser->token_start = parser->curr_char;
parser->setNextToken(TK("@eof")); parser->set_next_token(TK("@eof"));
} }
inline _TokenType peek() { inline _TokenType peek() {
@ -1100,7 +1094,7 @@ __LISTCOMP:
lineno = parser->current_line; lineno = parser->current_line;
cursor = parser->curr_char; cursor = parser->curr_char;
} }
if(parser->peek_char() == '\n') lineno--; if(parser->peekchar() == '\n') lineno--;
return parser->src->snapshot(lineno, cursor); return parser->src->snapshot(lineno, cursor);
} }

View File

@ -37,7 +37,6 @@ constexpr _TokenType TK(const char* const token) {
} }
#define TK_STR(t) __TOKENS[t] #define TK_STR(t) __TOKENS[t]
const _TokenType __KW_BEGIN = TK("class"); const _TokenType __KW_BEGIN = TK("class");
const _TokenType __KW_END = TK("raise"); const _TokenType __KW_END = TK("raise");
@ -56,9 +55,7 @@ struct Token{
int line; //< Line number of the token (1 based). int line; //< Line number of the token (1 based).
PyVar value; //< Literal value of the token. PyVar value; //< Literal value of the token.
// Returns the token's text as a new _Str built from `length` characters
// beginning at `start`.
const _Str str() const { const _Str str() const { return _Str(start, length);}
return _Str(start, length);
}
const _Str info() const { const _Str info() const {
_StrStream ss; _StrStream ss;
@ -108,8 +105,10 @@ struct Parser {
int brackets_level_1 = 0; int brackets_level_1 = 0;
int brackets_level_2 = 0; int brackets_level_2 = 0;
Token nextToken(){ Token next_token(){
if(nexts.empty()) return makeErrToken(); if(nexts.empty()){
return Token{TK("@error"), token_start, (int)(curr_char - token_start), current_line};
}
Token t = nexts.front(); Token t = nexts.front();
if(t.type == TK("@eof") && indents.size()>1){ if(t.type == TK("@eof") && indents.size()>1){
nexts.pop(); nexts.pop();
@ -120,11 +119,9 @@ struct Parser {
return t; return t;
} }
// Returns the character at the cursor (*curr_char) without advancing it.
inline char peek_char() { inline char peekchar() const{ return *curr_char; }
return *curr_char;
}
std::string_view lookahead(int n){ std::string_view lookahead(int n) const{
const char* c = curr_char; const char* c = curr_char;
for(int i=0; i<n; i++){ for(int i=0; i<n; i++){
if(*c == '\0') return std::string_view(curr_char, i); if(*c == '\0') return std::string_view(curr_char, i);
@ -133,23 +130,23 @@ struct Parser {
return std::string_view(curr_char, n); return std::string_view(curr_char, n);
} }
// Consumes a run of spaces and tabs starting at the cursor and returns its
// width: each ' ' adds 1 and each '\t' adds 4. The first character that is
// neither a space nor a tab ends the run and is left unconsumed.
int eatSpaces(){ int eat_spaces(){
int count = 0; int count = 0;
while (true) { while (true) {
switch (peek_char()) { switch (peekchar()) {
case ' ' : count+=1; break; case ' ' : count+=1; break;
case '\t': count+=4; break; case '\t': count+=4; break;
default: return count; default: return count;
} }
eatChar(); eatchar();
} }
} }
bool eatIndentation(){ bool eat_indentation(){
if(brackets_level_0 > 0 || brackets_level_1 > 0 || brackets_level_2 > 0) return true; if(brackets_level_0 > 0 || brackets_level_1 > 0 || brackets_level_2 > 0) return true;
int spaces = eatSpaces(); int spaces = eat_spaces();
if(peek_char() == '#') skipLineComment(); if(peekchar() == '#') skip_line_comment();
if(peek_char() == '\0' || peek_char() == '\n') return true; if(peekchar() == '\0' || peekchar() == '\n') return true;
// https://docs.python.org/3/reference/lexical_analysis.html#indentation // https://docs.python.org/3/reference/lexical_analysis.html#indentation
if(spaces > indents.top()){ if(spaces > indents.top()){
indents.push(spaces); indents.push(spaces);
@ -166,15 +163,15 @@ struct Parser {
return true; return true;
} }
// Consumes the character at the cursor and returns it.
// Throws std::runtime_error if that character is '\n'; this function never
// updates current_line, so it refuses to step over a line break.
// Note: there is no check for the '\0' terminator here, so the cursor is
// advanced unconditionally for any non-newline character.
char eatChar() { char eatchar() {
char c = peek_char(); char c = peekchar();
if(c == '\n') throw std::runtime_error("eatChar() cannot consume a newline"); if(c == '\n') throw std::runtime_error("eatchar() cannot consume a newline");
curr_char++; curr_char++;
return c; return c;
} }
char eatCharIncludeNewLine() { char eatchar_include_newLine() {
char c = peek_char(); char c = peekchar();
curr_char++; curr_char++;
if (c == '\n'){ if (c == '\n'){
current_line++; current_line++;
@ -183,10 +180,10 @@ struct Parser {
return c; return c;
} }
int eatName() { int eat_name() {
curr_char--; curr_char--;
while(true){ while(true){
uint8_t c = peek_char(); uint8_t c = peekchar();
int u8bytes = 0; int u8bytes = 0;
if((c & 0b10000000) == 0b00000000) u8bytes = 1; if((c & 0b10000000) == 0b00000000) u8bytes = 1;
else if((c & 0b11100000) == 0b11000000) u8bytes = 2; else if((c & 0b11100000) == 0b11000000) u8bytes = 2;
@ -225,11 +222,11 @@ struct Parser {
if(src->mode == JSON_MODE){ if(src->mode == JSON_MODE){
if(name == "true"){ if(name == "true"){
setNextToken(TK("True")); set_next_token(TK("True"));
} else if(name == "false"){ } else if(name == "false"){
setNextToken(TK("False")); set_next_token(TK("False"));
} else if(name == "null"){ } else if(name == "null"){
setNextToken(TK("None")); set_next_token(TK("None"));
} else { } else {
return 4; return 4;
} }
@ -240,46 +237,41 @@ struct Parser {
if(name == "not"){ if(name == "not"){
if(strncmp(curr_char, " in", 3) == 0){ if(strncmp(curr_char, " in", 3) == 0){
curr_char += 3; curr_char += 3;
setNextToken(TK("not in")); set_next_token(TK("not in"));
return 0; return 0;
} }
}else if(name == "is"){ }else if(name == "is"){
if(strncmp(curr_char, " not", 4) == 0){ if(strncmp(curr_char, " not", 4) == 0){
curr_char += 4; curr_char += 4;
setNextToken(TK("is not")); set_next_token(TK("is not"));
return 0; return 0;
} }
} }
setNextToken(__KW_MAP.at(name)); set_next_token(__KW_MAP.at(name));
} else { } else {
setNextToken(TK("@id")); set_next_token(TK("@id"));
} }
return 0; return 0;
} }
// Skips the rest of the current line. Characters are consumed one at a time
// until the cursor sits on '\n' (which is left unconsumed) or on the '\0'
// terminator.
void skipLineComment() { void skip_line_comment() {
char c; char c;
while ((c = peek_char()) != '\0') { while ((c = peekchar()) != '\0') {
if (c == '\n') return; if (c == '\n') return;
eatChar(); eatchar();
} }
} }
// If the character at the cursor equals [c], consumes it and returns true;
// otherwise leaves the cursor untouched and returns false.
// The character is consumed through the newline-aware consumer rather than
// the one that throws on '\n'.
bool matchChar(char c) { bool matchchar(char c) {
if (peek_char() != c) return false; if (peekchar() != c) return false;
eatCharIncludeNewLine(); eatchar_include_newLine();
return true; return true;
} }
// Builds an "@error" token for error reporting. The token starts at
// token_start, its length is the distance from token_start to the cursor
// (curr_char), and its line is current_line.
Token makeErrToken() {
return Token{TK("@error"), token_start, (int)(curr_char - token_start), current_line};
}
// Initialize the next token as the type. // Initialize the next token as the type.
void setNextToken(_TokenType type, PyVar value=nullptr) { void set_next_token(_TokenType type, PyVar value=nullptr) {
switch(type){ switch(type){
case TK("("): brackets_level_0++; break; case TK("("): brackets_level_0++; break;
@ -299,9 +291,9 @@ struct Parser {
}); });
} }
// Chooses between a shorter and a longer operator token that share a prefix.
// If the character at the cursor is [c] it is consumed and [two] becomes the
// next token; otherwise nothing is consumed and [one] becomes the next token.
void setNextTwoCharToken(char c, _TokenType one, _TokenType two) { void set_next_token_2(char c, _TokenType one, _TokenType two) {
if (matchChar(c)) setNextToken(two); if (matchchar(c)) set_next_token(two);
else setNextToken(one); else set_next_token(one);
} }
Parser(_Source src) { Parser(_Source src) {