From 4d2b3e59a169a721e24f3ac576cf8bc81ff54699 Mon Sep 17 00:00:00 2001 From: Steve Tautonico Date: Sat, 7 Oct 2023 03:53:34 -0400 Subject: [PATCH 1/4] Added octal literal support --- src/lexer.cpp | 20 ++++++++++++++------ tests/01_int.py | 4 ++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/lexer.cpp b/src/lexer.cpp index f033d504..1487033d 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -260,7 +260,7 @@ static bool is_unicode_Lo_char(uint32_t c) { } void Lexer::eat_number() { - PK_LOCAL_STATIC const std::regex pattern("^(0x)?[0-9a-fA-F]+(\\.[0-9]+)?(L)?"); + PK_LOCAL_STATIC const std::regex pattern("^(0[xo])?[0-9a-fA-F]+(\\.[0-9]+)?(L)?"); std::smatch m; const char* i = token_start; @@ -278,20 +278,28 @@ static bool is_unicode_Lo_char(uint32_t c) { } if(m[1].matched && m[2].matched){ - SyntaxError("hex literal should not contain a dot"); + SyntaxError("hex/octal literal should not contain a dot"); } try{ int base = 10; size_t size; - if (m[1].matched) base = 16; + if (m[1].matched) { + if (m[1].str() == "0o") base=8; + else base = 16; + } if (m[2].matched) { PK_ASSERT(base == 10); add_token(TK("@num"), Number::stof(m[0], &size)); } else { - add_token(TK("@num"), (i64)std::stoll(m[0], &size, base)); + // If we're base 8, chop off the "o" + std::string match = m[0].str(); + if (base == 8) match.erase(1, 1); + add_token(TK("@num"), (i64)std::stoll(match, &size, base)); } - PK_ASSERT((int)size == (int)m.length()); + // HACK: We need to check length-1 for octal since python octals are "0o..." and c/c++ octals are "0..." + if (base == 8) {PK_ASSERT((int)size == (int)m.length()-1);} + else {PK_ASSERT((int)size == (int)m.length());} }catch(...){ SyntaxError("invalid number literal"); } @@ -466,4 +474,4 @@ static bool is_unicode_Lo_char(uint32_t c) { return std::move(nexts); } -} // namespace pkpy \ No newline at end of file +} // namespace pkpy diff --git a/tests/01_int.py b/tests/01_int.py index 07b8d793..f95264a4 100644 --- a/tests/01_int.py +++ b/tests/01_int.py @@ -5,6 +5,10 @@ assert 0x7fffffff == 2147483647 # test 64-bit assert 2**60-1 + 546 - 0xfffffffffffff == 1148417904979477026 +# test oct literals +assert 0o1234 == 668 +assert 0o17777777777 == 2147483647 + # test == != >= <= < > assert -1 == -1 assert -1 != 1 From badc8d44d17cdf64e951553c2de7b5ae24ad77bd Mon Sep 17 00:00:00 2001 From: Steve Tautonico Date: Sun, 8 Oct 2023 00:20:14 -0400 Subject: [PATCH 2/4] Added binary literal support --- src/lexer.cpp | 7 ++++--- tests/01_int.py | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/lexer.cpp b/src/lexer.cpp index 1487033d..596d0f71 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -260,7 +260,7 @@ static bool is_unicode_Lo_char(uint32_t c) { } void Lexer::eat_number() { - PK_LOCAL_STATIC const std::regex pattern("^(0[xo])?[0-9a-fA-F]+(\\.[0-9]+)?(L)?"); + PK_LOCAL_STATIC const std::regex pattern("^(0[xob])?[0-9a-fA-F]+(\\.[0-9]+)?(L)?"); std::smatch m; const char* i = token_start; @@ -278,14 +278,15 @@ static bool is_unicode_Lo_char(uint32_t c) { } if(m[1].matched && m[2].matched){ - SyntaxError("hex/octal literal should not contain a dot"); + SyntaxError("binary/hex/octal literal should not contain a dot"); } try{ int base = 10; size_t size; if (m[1].matched) { - if (m[1].str() == "0o") base=8; + if (m[1].str() == "0b") base = 2; + else if (m[1].str() == "0o") base = 8; else base = 16; } if (m[2].matched) { diff --git a/tests/01_int.py b/tests/01_int.py index f95264a4..3a27aeb4 100644 --- a/tests/01_int.py +++ b/tests/01_int.py @@ -9,6 +9,10 @@ assert 2**60-1 + 546 - 0xfffffffffffff == 1148417904979477026 assert 0o1234 == 668 assert 0o17777777777 == 2147483647 +# test binary literals +assert 0b10010 == 18 +assert 0b11111111111111111111111111111111 == 4294967295 + # test == != >= <= < > assert -1 == -1 assert -1 != 1 From 23f1c6e9414a26b37bb2ddd39b05463fe6d987b8 Mon Sep 17 00:00:00 2001 From: Steve Tautonico Date: Sun, 8 Oct 2023 04:11:03 -0400 Subject: [PATCH 3/4] Fixed issue with binary literal --- src/lexer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lexer.cpp b/src/lexer.cpp index 596d0f71..b44eb602 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -293,13 +293,13 @@ static bool is_unicode_Lo_char(uint32_t c) { PK_ASSERT(base == 10); add_token(TK("@num"), Number::stof(m[0], &size)); } else { - // If we're base 8, chop off the "o" + // If we're base 8/2, chop off the "o" std::string match = m[0].str(); - if (base == 8) match.erase(1, 1); + if (base == 8 || base == 2) match.erase(1, 1); add_token(TK("@num"), (i64)std::stoll(match, &size, base)); } // HACK: We need to check length-1 for octal since python octals are "0o..." and c/c++ octals are "0..." - if (base == 8) {PK_ASSERT((int)size == (int)m.length()-1);} + if (base == 8 || base == 2) {PK_ASSERT((int)size == (int)m.length()-1);} else {PK_ASSERT((int)size == (int)m.length());} }catch(...){ SyntaxError("invalid number literal"); From 57beebdfd90e7b7469a3c4a2d2b4130b4ce672f0 Mon Sep 17 00:00:00 2001 From: blueloveTH Date: Wed, 11 Oct 2023 13:05:07 +0800 Subject: [PATCH 4/4] ... --- build.sh | 5 ++ include/pocketpy/lexer.h | 3 +- prebuild.py | 3 +- src/lexer.cpp | 109 ++++++++++++++++++++++++++++++++------- src/pocketpy.cpp | 9 ++-- 5 files changed, 102 insertions(+), 27 deletions(-) diff --git a/build.sh b/build.sh index 473b1f7d..a545c611 100644 --- a/build.sh +++ b/build.sh @@ -21,6 +21,11 @@ echo "> Running prebuild.py... " python3 prebuild.py +if [ $? -ne 0 ]; then + echo "prebuild.py failed." + exit 1 +fi + SRC=$(find src/ -name "*.cpp") echo "> Compiling and linking source files... " diff --git a/include/pocketpy/lexer.h b/include/pocketpy/lexer.h index b2652f79..68b5f460 100644 --- a/include/pocketpy/lexer.h +++ b/include/pocketpy/lexer.h @@ -48,7 +48,6 @@ const std::map kTokenKwMap = [](){ return map; }(); - struct Token{ TokenIndex type; const char* start; @@ -139,4 +138,6 @@ struct Lexer { std::vector run(); }; +bool parse_int(std::string_view text, i64* out, int base=10); + } // namespace pkpy diff --git a/prebuild.py b/prebuild.py index b19723b4..013b3c6b 100644 --- a/prebuild.py +++ b/prebuild.py @@ -4,7 +4,8 @@ from datetime import datetime def generate_python_sources(): sources = {} for file in os.listdir("python"): - assert file.endswith(".py") + if not file.endswith(".py"): + continue key = file.split(".")[0] with open("python/" + file) as f: value = f.read() diff --git a/src/lexer.cpp b/src/lexer.cpp index b44eb602..8c5eae45 100644 --- a/src/lexer.cpp +++ b/src/lexer.cpp @@ -281,28 +281,35 @@ static bool is_unicode_Lo_char(uint32_t c) { SyntaxError("binary/hex/octal literal should not contain a dot"); } - try{ - int base = 10; + int base = 10; + if (m[1].matched) { + char tag = m[1].first.base()[1]; + switch (tag) { + case 'x': base = 16; break; + case 'o': base = 8; break; + case 'b': base = 2; break; + default: FATAL_ERROR(); + } + } + if (m[2].matched) { + // float point number + f64 out; size_t size; - if (m[1].matched) { - if (m[1].str() == "0b") base = 2; - else if (m[1].str() == "0o") base = 8; - else base = 16; + try{ + out = Number::stof(m[0], &size); + PK_ASSERT((int)size == (int)m[0].length()); + }catch(...){ + SyntaxError("invalid number literal"); } - if (m[2].matched) { - PK_ASSERT(base == 10); - add_token(TK("@num"), Number::stof(m[0], &size)); - } else { - // If we're base 8/2, chop off the "o" - std::string match = m[0].str(); - if (base == 8 || base == 2) match.erase(1, 1); - add_token(TK("@num"), (i64)std::stoll(match, &size, base)); + add_token(TK("@num"), out); + } else { + std::string_view text(m[0].first.base(), m[0].length()); + i64 out; + bool ok = parse_int(text, &out, base); + if(!ok){ + SyntaxError("invalid number literal for base " + std::to_string(base)); } - // HACK: We need to check length-1 for octal since python octals are "0o..." and c/c++ octals are "0..." - if (base == 8 || base == 2) {PK_ASSERT((int)size == (int)m.length()-1);} - else {PK_ASSERT((int)size == (int)m.length());} - }catch(...){ - SyntaxError("invalid number literal"); + add_token(TK("@num"), out); } } @@ -475,4 +482,68 @@ static bool is_unicode_Lo_char(uint32_t c) { return std::move(nexts); } +bool parse_int(std::string_view text, i64* out, int base){ + // TODO: detect overflow + *out = 0; + + const auto f_startswith_2 = [](std::string_view t, const char* prefix) -> bool{ + if(t.length() < 2) return false; + return t[0] == prefix[0] && t[1] == prefix[1]; + }; + + if(base == 10){ + // 10-base 12334 + if(text.length() == 0) return false; + for(char c : text){ + if(c >= '0' && c <= '9'){ + *out = (*out * 10) + (c - '0'); + }else{ + return false; + } + } + return true; + }else if(base == 2){ + // 2-base 0b101010 + if(f_startswith_2(text, "0b")) text.remove_prefix(2); + if(text.length() == 0) return false; + for(char c : text){ + if(c == '0' || c == '1'){ + *out = (*out << 1) | (c - '0'); + }else{ + return false; + } + } + return true; + }else if(base == 8){ + // 8-base 0o123 + if(f_startswith_2(text, "0o")) text.remove_prefix(2); + if(text.length() == 0) return false; + for(char c : text){ + if(c >= '0' && c <= '7'){ + *out = (*out << 3) | (c - '0'); + }else{ + return false; + } + } + return true; + }else if(base == 16){ + // 16-base 0x123 + if(f_startswith_2(text, "0x")) text.remove_prefix(2); + if(text.length() == 0) return false; + for(char c : text){ + if(c >= '0' && c <= '9'){ + *out = (*out << 4) | (c - '0'); + }else if(c >= 'a' && c <= 'f'){ + *out = (*out << 4) | (c - 'a' + 10); + }else if(c >= 'A' && c <= 'F'){ + *out = (*out << 4) | (c - 'A' + 10); + }else{ + return false; + } + } + return true; + } + return false; +} + } // namespace pkpy diff --git a/src/pocketpy.cpp b/src/pocketpy.cpp index c42ead36..5de99ff2 100644 --- a/src/pocketpy.cpp +++ b/src/pocketpy.cpp @@ -434,14 +434,11 @@ void init_builtins(VM* _vm) { int base = 10; if(args.size() == 1+2) base = CAST(i64, args[2]); const Str& s = CAST(Str&, args[1]); - try{ - size_t parsed = 0; - i64 val = std::stoll(s.str(), &parsed, base); - PK_ASSERT(parsed == s.length()); - return VAR(val); - }catch(...){ + i64 val; + if(!parse_int(s.sv(), &val, base)){ vm->ValueError("invalid literal for int(): " + s.escape()); } + return VAR(val); } vm->TypeError("invalid arguments for int()"); return vm->None;