From f064be919210082807417f5b2b87c0c337ece5d7 Mon Sep 17 00:00:00 2001
From: szdytom <szdytom@qq.com>
Date: Sat, 2 Sep 2023 11:18:54 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84unicode=E5=AD=97=E7=AC=A6?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81=20(#8)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/read.h  |   5 ++-
 include/token.h |  42 +++++++++----------
 include/work.h  |   3 +-
 src/main.cpp    |  35 +++++++++++-----
 src/read.cpp    | 104 ++++++++++++++++++++++++++++++++++++++++++++----
 src/scan.cpp    |   2 +-
 src/token.cpp   |  24 ++---------
 src/work.cpp    |  13 +++---
 8 files changed, 158 insertions(+), 70 deletions(-)
diff --git a/include/read.h b/include/read.h
index da41de7..97f3bd0 100644
--- a/include/read.h
+++ b/include/read.h
@@ -1,8 +1,9 @@
 #ifndef ACPA_READ_H
 #define ACPA_READ_H
 
+#include <fstream>
 #include <string>
 
-std::string read();
+std::string read(std::istream& file);
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/token.h b/include/token.h
index 3308f95..d9572db 100644
--- a/include/token.h
+++ b/include/token.h
@@ -5,27 +5,27 @@
 #include <vector>
 
 enum class TokenType {
-	COMMA,	 // ,
-	SEMI,	 // ;
-	LB,	 // {
-	RB,	 // }
-	LP,	 // (
-	RP,	 // )
-	LT,	 // <
-	RT,	 // >
-	ASSIGN,	 // =
-	DOT,	 // .
-	COLON,	 // :
-	SCOPE,	 // ::
-	IMPLY,	 // ->
-	STRUCT,	 // struct
-	FN,	 // Fn
-	RETURN,	 // return
-	TYPEOF,	 // typeof
-	PRIVATE,	 // private
-	ADMIT,	 // admit
-	DELETE,  // delete
-	ID,	 // identifier
+	COMMA,	  // ,
+	SEMI,	  // ;
+	LB,	  // {
+	RB,	  // }
+	LP,	  // (
+	RP,	  // )
+	LT,	  // <
+	RT,	  // >
+	ASSIGN,	  // =
+	DOT,	  // .
+	COLON,	  // :
+	SCOPE,	  // ::
+	IMPLY,	  // ->
+	STRUCT,	  // struct
+	FN,	  // Fn
+	RETURN,	  // return
+	TYPEOF,	  // typeof
+	PRIVATE,  // private
+	ADMIT,	  // admit
+	DELETE,	  // delete
+	ID,	  // identifier
 	EXCEED
 };
 
diff --git a/include/work.h b/include/work.h
index eec1a53..b11296f 100644
--- a/include/work.h
+++ b/include/work.h
@@ -1,10 +1,9 @@
 #ifndef ACPA_WORK_H
 #define ACPA_WORK_H
 
-#include "element.h"
 #include "token.h"
 #include <vector>
 
 void work(std::vector<Token>);
 
-#endif
\ No newline at end of file
+#endif
diff --git a/src/main.cpp b/src/main.cpp
index e8c961d..fd17481 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,12 +1,10 @@
-#include "element.h"
 #include "read.h"
 #include "scan.h"
-#include "token.h"
 #include "work.h"
 #include <argparse/argparse.hpp>
 #include <csignal>
-
-using namespace std;
+#include <fstream>
+#include <iostream>
 
 void signal_handler(int signal) {
 	std::cerr << "runtime error, signal: " << signal << std::endl;
@@ -26,22 +24,41 @@ int main(int argc, char* argv[]) {
 					 argparse::default_arguments::help,
 					 false);
 
-	program.add_argument("input_file")
-		.help("Source proof file")
-		.action([](const std::string& value) { return value; });
+	// Stored as std::string so that get<std::string>() below matches the stored type.
+	program.add_argument("input_file").help("Source proof file").default_value(std::string("-"));
+	program.add_argument("-Ep")
+		.help("Preprocess only")
+		.implicit_value(true)
+		.default_value(false);
 
 	try {
 		program.parse_args(argc, argv);
 	} catch (const std::runtime_error& err) {
-		cerr << err.what() << std::endl;
-		cerr << program;
+		std::cerr << err.what() << std::endl;
+		std::cerr << program;
 		return 1;
 	}
 
 	auto input_file = program.get<std::string>("input_file");
-	freopen(input_file.c_str(), "r", stdin);
+	std::string rd_res;
+	if (input_file != "-") {
+		std::ifstream file(input_file);
+		if (!file.is_open()) {
+			std::cerr << "Unable to open file " << input_file << std::endl;
+			// Non-zero status so callers and scripts can detect the failure.
+			return 1;
+		}
 
-	work(scan(read()));
+		rd_res = read(file);
+	} else {
+		rd_res = read(std::cin);
+	}
 
+	if (program["-Ep"] == true) {
+		// Preprocess only: print the result of read() and skip scan()/work().
+		std::cout << rd_res << std::endl;
+		return 0;
+	}
+	work(scan(rd_res));
 	return 0;
 }
diff --git a/src/read.cpp b/src/read.cpp
index 765a502..9f06900 100644
--- a/src/read.cpp
+++ b/src/read.cpp
@@ -1,11 +1,118 @@
 #include "read.h"
+#include <codecvt>
+#include <cstdint>
+#include <iostream>
+#include <locale>
 
-using namespace std;
+// Alphabet of the 5-bit encoding applied to non-ASCII text.
+const std::string base32_chars = "abcdefghijklmnopqrstuvwxyz012345";
+// Prefix written in front of every encoded non-ASCII run.
+const std::string leader = "_P";
 
-string read() {
-	string s;
-	for (char ch = getchar(); ch != EOF; ch = getchar()) {
-		s += ch;
-	}
-	return s;
-}
\ No newline at end of file
+// Maps a character of base32_chars to its 5-bit value, or -1 if it is not in the alphabet.
+int base32_value(char c) {
+	if ('a' <= c && c <= 'z') {
+		return c - 'a';
+	} else if ('0' <= c && c <= '5') {
+		return c - '0' + 26;
+	} else {
+		return -1;
+	}
+}
+
+// Decodes text produced by base32_encode back into raw bytes.
+// Returns `input` unchanged if any character is outside base32_chars; trailing
+// bits that do not fill a whole byte are discarded.
+std::string base32_decode(const std::string& input) {
+	std::string decoded_data;
+	// Unsigned, and masked after each extracted byte, so the shifts below cannot
+	// overflow no matter how long the input is.
+	uint32_t buffer = 0;
+	int bits = 0;
+
+	for (char c : input) {
+		const int value = base32_value(c);
+		if (value == -1) {
+			return input;
+		}
+
+		buffer = (buffer << 5) | static_cast<uint32_t>(value);
+		bits += 5;
+
+		if (bits >= 8) {
+			bits -= 8;
+			decoded_data.push_back(static_cast<char>((buffer >> bits) & 0xFF));
+			buffer &= (1u << bits) - 1;
+		}
+	}
+
+	return decoded_data;
+}
+
+// Encodes arbitrary bytes with the base32_chars alphabet, five bits per output
+// character; a final partial group is zero-padded on the right.
+std::string base32_encode(const std::string& input) {
+	std::string output;
+	// Holds only the bits not yet emitted (fewer than 5 between bytes), so the
+	// shift cannot overflow for inputs of any length.
+	uint32_t buffer = 0;
+	int bits_left = 0;
+
+	for (unsigned char c : input) {
+		buffer = (buffer << 8) | c;
+		bits_left += 8;
+
+		while (bits_left >= 5) {
+			bits_left -= 5;
+			output += base32_chars[(buffer >> bits_left) & 0x1F];
+		}
+		buffer &= (1u << bits_left) - 1;
+	}
+
+	if (bits_left > 0) {
+		output += base32_chars[(buffer << (5 - bits_left)) & 0x1F];
+	}
+
+	return output;
+}
+
+// Reads `file` line by line and returns its text with every maximal run of
+// non-ASCII bytes within a line replaced by leader + base32_encode(run) + "_".
+// ASCII bytes are copied unchanged and every line is terminated with '\n'.
+//
+// The input is processed as raw bytes: in UTF-8 every byte of a non-ASCII
+// character has its high bit set, so no decoding step is needed. This keeps
+// characters outside the BMP intact and cannot throw on malformed input.
+std::string read(std::istream& file) {
+	std::string res;
+	std::string line;
+	std::string non_ascii_word;
+
+	// Appends the pending non-ASCII run (if any) to res in encoded form.
+	const auto flush_non_ascii = [&res, &non_ascii_word]() {
+		if (non_ascii_word.empty()) {
+			return;
+		}
+		res += leader;
+		res += base32_encode(non_ascii_word);
+		res += "_";
+		non_ascii_word.clear();
+	};
+
+	while (std::getline(file, line)) {
+		for (const char ch : line) {
+			if (static_cast<unsigned char>(ch) > 127) {
+				non_ascii_word += ch;
+			} else {
+				flush_non_ascii();
+				res += ch;
+			}
+		}
+
+		// A run never spans a line break.
+		flush_non_ascii();
+		res += "\n";
+	}
+
+	return res;
+}
diff --git a/src/scan.cpp b/src/scan.cpp
index a667e47..c00030c 100644
--- a/src/scan.cpp
+++ b/src/scan.cpp
@@ -81,7 +81,7 @@ vector<Token> scan(string s) {
 				type = TokenType::DELETE;
 			} else {
 				type = TokenType::ID;
-				if(mp.find(t) == mp.end()) {
+				if (mp.find(t) == mp.end()) {
 					mp[t] = id_mp.size();
 					id_mp.push_back(t);
 				}
diff --git a/src/token.cpp b/src/token.cpp
index bf27f0b..caa6e6d 100644
--- a/src/token.cpp
+++ b/src/token.cpp
@@ -3,26 +3,8 @@
 
 using namespace std;
 
-string token_mp[] = {",",
-			  ";",
-			  "{",
-			  "}",
-			  "(",
-			  ")",
-			  "<",
-			  ">",
-			  "=",
-			  ".",
-			  ":",
-			  "::",
-			  "->",
-			  "struct",
-			  "Fn",
-			  "return",
-			  "typeof",
-			  "private",
-			  "admit",
-			  "delete",
-			  "ID"};
+string token_mp[]
+	= {",",	 ";",  "{",	 "}",  "(",	 ")",	   "<",	      ">",     "=",	 ".", ":",
+	   "::", "->", "struct", "Fn", "return", "typeof", "private", "admit", "delete", "ID"};
 
 vector<string> id_mp{""};
\ No newline at end of file
diff --git a/src/work.cpp b/src/work.cpp
index cbb6da4..3d80d7d 100644
--- a/src/work.cpp
+++ b/src/work.cpp
@@ -1,5 +1,6 @@
 #include "work.h"
-#include <assert.h>
+#include "element.h"
+#include <cassert>
 
 using namespace std;
 
@@ -410,7 +411,9 @@ vector<pair<int, shared_ptr<ValType>>> createPars(map<int, shared_ptr<ValType>>*
 			d->def_var = t;
 			ndefs[s] = static_pointer_cast<Def>(d);
 			pars.push_back({s, t});
-			if (vars != nullptr && pub) (*vars)[s]=t;
+			if (vars != nullptr && pub) {
+				(*vars)[s] = t;
+			}
 		};
 		for (single(); preview(TokenType::COMMA); pt++, single()) {}
 	}
@@ -431,7 +434,7 @@ pair<int, shared_ptr<Struct>> createStruct() {
 		if (preview(TokenType::LT)) {
 			tems = createTems();
 		}
-		if(preview({TokenType::LP,TokenType::DELETE})) {
+		if (preview({TokenType::LP, TokenType::DELETE})) {
 			constructor = 0;
 			pt += 2, jump(TokenType::RP);
 		} else {
@@ -441,7 +444,7 @@ pair<int, shared_ptr<Struct>> createStruct() {
 	for (const auto& pr : tems) {
 		t->c1.push_back(pr.second);
 	}
-	if(constructor) {
+	if (constructor) {
 		for (const auto& pr : pars) {
 			t->c2.push_back(pr.second);
 		}
@@ -525,4 +528,4 @@ pair<int, shared_ptr<ValType>> createVar() {
 void work(vector<Token> _tokens) {
 	tokens = _tokens;
 	createVar();
-}
\ No newline at end of file
+}