Created
September 24, 2025 18:30
-
-
Save timosarkar/434a6b305d66f346ed32391ce767c51c to your computer and use it in GitHub Desktop.
generate a tokenstream and abstract syntax tree for a small mathematical language
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <string> | |
| #include <regex> | |
| #include <vector> | |
| #include <memory> | |
| // ---------------- Token Definitions ---------------- | |
| enum class TokenType { | |
| NUMBER, IDENTIFIER, ASSIGN, PLUS, MINUS, MULT, DIV, LPAREN, RPAREN, DOT, EOF_TOKEN | |
| }; | |
| struct Token { | |
| TokenType type; | |
| std::string value; | |
| }; | |
| // ---------------- AST Node Definitions ---------------- | |
| struct ASTNode { | |
| virtual ~ASTNode() {} | |
| virtual void print(int indent = 0) = 0; | |
| }; | |
| struct NumberNode : public ASTNode { | |
| int value; | |
| NumberNode(int v) : value(v) {} | |
| void print(int indent = 0) override { | |
| std::cout << std::string(indent, ' ') << "Number: " << value << "\n"; | |
| } | |
| }; | |
| struct VariableNode : public ASTNode { | |
| std::string name; | |
| VariableNode(const std::string &n) : name(n) {} | |
| void print(int indent = 0) override { | |
| std::cout << std::string(indent, ' ') << "Variable: " << name << "\n"; | |
| } | |
| }; | |
| struct BinaryOpNode : public ASTNode { | |
| std::string op; | |
| std::unique_ptr<ASTNode> left; | |
| std::unique_ptr<ASTNode> right; | |
| BinaryOpNode(const std::string &o, std::unique_ptr<ASTNode> l, std::unique_ptr<ASTNode> r) | |
| : op(o), left(std::move(l)), right(std::move(r)) {} | |
| void print(int indent = 0) override { | |
| std::cout << std::string(indent, ' ') << "BinaryOp: " << op << "\n"; | |
| left->print(indent + 2); | |
| right->print(indent + 2); | |
| } | |
| }; | |
| struct AssignmentNode : public ASTNode { | |
| std::string variable; | |
| std::unique_ptr<ASTNode> expr; | |
| AssignmentNode(const std::string &var, std::unique_ptr<ASTNode> e) | |
| : variable(var), expr(std::move(e)) {} | |
| void print(int indent = 0) override { | |
| std::cout << std::string(indent, ' ') << "Assignment: " << variable << "\n"; | |
| expr->print(indent + 2); | |
| } | |
| }; | |
| struct ProgramNode : public ASTNode { | |
| std::vector<std::unique_ptr<ASTNode>> statements; | |
| void print(int indent = 0) override { | |
| for (auto &stmt : statements) | |
| stmt->print(indent); | |
| } | |
| }; | |
| // ---------------- Lexer ---------------- | |
| std::vector<Token> lex(const std::string &code) { | |
| std::vector<Token> tokens; | |
| std::regex token_regex( | |
| R"((\d+)|([A-Za-z_]\w*)|(:=)|(\+)|(-)|(\*)|(/)|(\()|(\))|(\.)|(\s+))" | |
| ); | |
| std::smatch match; | |
| std::string::const_iterator start = code.begin(); | |
| std::string::const_iterator end = code.end(); | |
| while (std::regex_search(start, end, match, token_regex)) { | |
| if (match[1].matched) tokens.push_back({TokenType::NUMBER, match[1]}); | |
| else if (match[2].matched) tokens.push_back({TokenType::IDENTIFIER, match[2]}); | |
| else if (match[3].matched) tokens.push_back({TokenType::ASSIGN, match[3]}); | |
| else if (match[4].matched) tokens.push_back({TokenType::PLUS, match[4]}); | |
| else if (match[5].matched) tokens.push_back({TokenType::MINUS, match[5]}); | |
| else if (match[6].matched) tokens.push_back({TokenType::MULT, match[6]}); | |
| else if (match[7].matched) tokens.push_back({TokenType::DIV, match[7]}); | |
| else if (match[8].matched) tokens.push_back({TokenType::LPAREN, match[8]}); | |
| else if (match[9].matched) tokens.push_back({TokenType::RPAREN, match[9]}); | |
| else if (match[10].matched) tokens.push_back({TokenType::DOT, match[10]}); | |
| // Ignore whitespace (match[11]) | |
| start = match.suffix().first; | |
| } | |
| tokens.push_back({TokenType::EOF_TOKEN, ""}); | |
| return tokens; | |
| } | |
| // ---------------- Parser ---------------- | |
| class Parser { | |
| std::vector<Token> tokens; | |
| size_t pos; | |
| Token current() { return pos < tokens.size() ? tokens[pos] : Token{TokenType::EOF_TOKEN, ""}; } | |
| void advance() { if (pos < tokens.size()) pos++; } | |
| std::unique_ptr<ASTNode> parseExpression(); | |
| std::unique_ptr<ASTNode> parseTerm(); | |
| std::unique_ptr<ASTNode> parseFactor(); | |
| public: | |
| Parser(const std::vector<Token> &t) : tokens(t), pos(0) {} | |
| std::unique_ptr<ProgramNode> parseProgram(); | |
| }; | |
| // Factor: NUMBER | VARIABLE | '(' expression ')' | |
| std::unique_ptr<ASTNode> Parser::parseFactor() { | |
| Token tok = current(); | |
| if (tok.type == TokenType::NUMBER) { | |
| advance(); | |
| return std::make_unique<NumberNode>(std::stoi(tok.value)); | |
| } else if (tok.type == TokenType::IDENTIFIER) { | |
| advance(); | |
| return std::make_unique<VariableNode>(tok.value); | |
| } else if (tok.type == TokenType::LPAREN) { | |
| advance(); | |
| auto node = parseExpression(); | |
| if (current().type != TokenType::RPAREN) { | |
| std::cerr << "Expected ')'\n"; | |
| exit(1); | |
| } | |
| advance(); | |
| return node; | |
| } else { | |
| std::cerr << "Unexpected token: " << tok.value << "\n"; | |
| exit(1); | |
| } | |
| } | |
| // Term: factor ((*|/) factor)* | |
| std::unique_ptr<ASTNode> Parser::parseTerm() { | |
| auto node = parseFactor(); | |
| while (current().type == TokenType::MULT || current().type == TokenType::DIV) { | |
| std::string op = current().value; | |
| advance(); | |
| auto right = parseFactor(); | |
| node = std::make_unique<BinaryOpNode>(op, std::move(node), std::move(right)); | |
| } | |
| return node; | |
| } | |
| // Expression: term ((+|-) term)* | |
| std::unique_ptr<ASTNode> Parser::parseExpression() { | |
| auto node = parseTerm(); | |
| while (current().type == TokenType::PLUS || current().type == TokenType::MINUS) { | |
| std::string op = current().value; | |
| advance(); | |
| auto right = parseTerm(); | |
| node = std::make_unique<BinaryOpNode>(op, std::move(node), std::move(right)); | |
| } | |
| return node; | |
| } | |
| // Program: assignment DOT* | |
| std::unique_ptr<ProgramNode> Parser::parseProgram() { | |
| auto program = std::make_unique<ProgramNode>(); | |
| while (current().type != TokenType::EOF_TOKEN) { | |
| if (current().type == TokenType::IDENTIFIER && pos + 1 < tokens.size() && tokens[pos+1].type == TokenType::ASSIGN) { | |
| std::string var = current().value; | |
| advance(); // IDENTIFIER | |
| advance(); // ASSIGN | |
| auto expr = parseExpression(); | |
| if (current().type != TokenType::DOT) { | |
| std::cerr << "Expected '.'\n"; | |
| exit(1); | |
| } | |
| advance(); // DOT | |
| program->statements.push_back(std::make_unique<AssignmentNode>(var, std::move(expr))); | |
| } else { | |
| std::cerr << "Unexpected token in program: " << current().value << "\n"; | |
| exit(1); | |
| } | |
| } | |
| return program; | |
| } | |
| // ---------------- Main ---------------- | |
| int main() { | |
| std::string code = R"( | |
| x := 5 + 3. | |
| y := x * 2. | |
| z := y / (1 + 1). | |
| )"; | |
| auto tokens = lex(code); | |
| std::cout << "Tokens:\n"; | |
| for (auto &t : tokens) { | |
| if (t.type == TokenType::NUMBER) std::cout << "NUMBER(" << t.value << ")\n"; | |
| else if (t.type == TokenType::IDENTIFIER) std::cout << "IDENTIFIER(" << t.value << ")\n"; | |
| else if (t.type == TokenType::ASSIGN) std::cout << "ASSIGN(" << t.value << ")\n"; | |
| else if (t.type == TokenType::PLUS) std::cout << "PLUS(" << t.value << ")\n"; | |
| else if (t.type == TokenType::MINUS) std::cout << "MINUS(" << t.value << ")\n"; | |
| else if (t.type == TokenType::MULT) std::cout << "MULT(" << t.value << ")\n"; | |
| else if (t.type == TokenType::DIV) std::cout << "DIV(" << t.value << ")\n"; | |
| else if (t.type == TokenType::LPAREN) std::cout << "LPAREN(" << t.value << ")\n"; | |
| else if (t.type == TokenType::RPAREN) std::cout << "RPAREN(" << t.value << ")\n"; | |
| else if (t.type == TokenType::DOT) std::cout << "DOT(" << t.value << ")\n"; | |
| } | |
| Parser parser(tokens); | |
| auto ast = parser.parseProgram(); | |
| std::cout << "\nAST:\n"; | |
| ast->print(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment