Skip to content

Instantly share code, notes, and snippets.

@timosarkar
Created September 24, 2025 18:30
Show Gist options
  • Select an option

  • Save timosarkar/434a6b305d66f346ed32391ce767c51c to your computer and use it in GitHub Desktop.

Select an option

Save timosarkar/434a6b305d66f346ed32391ce767c51c to your computer and use it in GitHub Desktop.
generate a tokenstream and abstract syntax tree for a small mathematical language
#include <iostream>
#include <string>
#include <regex>
#include <vector>
#include <memory>
// ---------------- Token Definitions ----------------
enum class TokenType {
NUMBER, IDENTIFIER, ASSIGN, PLUS, MINUS, MULT, DIV, LPAREN, RPAREN, DOT, EOF_TOKEN
};
struct Token {
TokenType type;
std::string value;
};
// ---------------- AST Node Definitions ----------------
struct ASTNode {
virtual ~ASTNode() {}
virtual void print(int indent = 0) = 0;
};
struct NumberNode : public ASTNode {
int value;
NumberNode(int v) : value(v) {}
void print(int indent = 0) override {
std::cout << std::string(indent, ' ') << "Number: " << value << "\n";
}
};
struct VariableNode : public ASTNode {
std::string name;
VariableNode(const std::string &n) : name(n) {}
void print(int indent = 0) override {
std::cout << std::string(indent, ' ') << "Variable: " << name << "\n";
}
};
struct BinaryOpNode : public ASTNode {
std::string op;
std::unique_ptr<ASTNode> left;
std::unique_ptr<ASTNode> right;
BinaryOpNode(const std::string &o, std::unique_ptr<ASTNode> l, std::unique_ptr<ASTNode> r)
: op(o), left(std::move(l)), right(std::move(r)) {}
void print(int indent = 0) override {
std::cout << std::string(indent, ' ') << "BinaryOp: " << op << "\n";
left->print(indent + 2);
right->print(indent + 2);
}
};
struct AssignmentNode : public ASTNode {
std::string variable;
std::unique_ptr<ASTNode> expr;
AssignmentNode(const std::string &var, std::unique_ptr<ASTNode> e)
: variable(var), expr(std::move(e)) {}
void print(int indent = 0) override {
std::cout << std::string(indent, ' ') << "Assignment: " << variable << "\n";
expr->print(indent + 2);
}
};
struct ProgramNode : public ASTNode {
std::vector<std::unique_ptr<ASTNode>> statements;
void print(int indent = 0) override {
for (auto &stmt : statements)
stmt->print(indent);
}
};
// ---------------- Lexer ----------------
std::vector<Token> lex(const std::string &code) {
std::vector<Token> tokens;
std::regex token_regex(
R"((\d+)|([A-Za-z_]\w*)|(:=)|(\+)|(-)|(\*)|(/)|(\()|(\))|(\.)|(\s+))"
);
std::smatch match;
std::string::const_iterator start = code.begin();
std::string::const_iterator end = code.end();
while (std::regex_search(start, end, match, token_regex)) {
if (match[1].matched) tokens.push_back({TokenType::NUMBER, match[1]});
else if (match[2].matched) tokens.push_back({TokenType::IDENTIFIER, match[2]});
else if (match[3].matched) tokens.push_back({TokenType::ASSIGN, match[3]});
else if (match[4].matched) tokens.push_back({TokenType::PLUS, match[4]});
else if (match[5].matched) tokens.push_back({TokenType::MINUS, match[5]});
else if (match[6].matched) tokens.push_back({TokenType::MULT, match[6]});
else if (match[7].matched) tokens.push_back({TokenType::DIV, match[7]});
else if (match[8].matched) tokens.push_back({TokenType::LPAREN, match[8]});
else if (match[9].matched) tokens.push_back({TokenType::RPAREN, match[9]});
else if (match[10].matched) tokens.push_back({TokenType::DOT, match[10]});
// Ignore whitespace (match[11])
start = match.suffix().first;
}
tokens.push_back({TokenType::EOF_TOKEN, ""});
return tokens;
}
// ---------------- Parser ----------------
class Parser {
std::vector<Token> tokens;
size_t pos;
Token current() { return pos < tokens.size() ? tokens[pos] : Token{TokenType::EOF_TOKEN, ""}; }
void advance() { if (pos < tokens.size()) pos++; }
std::unique_ptr<ASTNode> parseExpression();
std::unique_ptr<ASTNode> parseTerm();
std::unique_ptr<ASTNode> parseFactor();
public:
Parser(const std::vector<Token> &t) : tokens(t), pos(0) {}
std::unique_ptr<ProgramNode> parseProgram();
};
// Factor: NUMBER | VARIABLE | '(' expression ')'
std::unique_ptr<ASTNode> Parser::parseFactor() {
Token tok = current();
if (tok.type == TokenType::NUMBER) {
advance();
return std::make_unique<NumberNode>(std::stoi(tok.value));
} else if (tok.type == TokenType::IDENTIFIER) {
advance();
return std::make_unique<VariableNode>(tok.value);
} else if (tok.type == TokenType::LPAREN) {
advance();
auto node = parseExpression();
if (current().type != TokenType::RPAREN) {
std::cerr << "Expected ')'\n";
exit(1);
}
advance();
return node;
} else {
std::cerr << "Unexpected token: " << tok.value << "\n";
exit(1);
}
}
// Term: factor ((*|/) factor)*
std::unique_ptr<ASTNode> Parser::parseTerm() {
auto node = parseFactor();
while (current().type == TokenType::MULT || current().type == TokenType::DIV) {
std::string op = current().value;
advance();
auto right = parseFactor();
node = std::make_unique<BinaryOpNode>(op, std::move(node), std::move(right));
}
return node;
}
// Expression: term ((+|-) term)*
std::unique_ptr<ASTNode> Parser::parseExpression() {
auto node = parseTerm();
while (current().type == TokenType::PLUS || current().type == TokenType::MINUS) {
std::string op = current().value;
advance();
auto right = parseTerm();
node = std::make_unique<BinaryOpNode>(op, std::move(node), std::move(right));
}
return node;
}
// Program: assignment DOT*
std::unique_ptr<ProgramNode> Parser::parseProgram() {
auto program = std::make_unique<ProgramNode>();
while (current().type != TokenType::EOF_TOKEN) {
if (current().type == TokenType::IDENTIFIER && pos + 1 < tokens.size() && tokens[pos+1].type == TokenType::ASSIGN) {
std::string var = current().value;
advance(); // IDENTIFIER
advance(); // ASSIGN
auto expr = parseExpression();
if (current().type != TokenType::DOT) {
std::cerr << "Expected '.'\n";
exit(1);
}
advance(); // DOT
program->statements.push_back(std::make_unique<AssignmentNode>(var, std::move(expr)));
} else {
std::cerr << "Unexpected token in program: " << current().value << "\n";
exit(1);
}
}
return program;
}
// ---------------- Main ----------------
int main() {
std::string code = R"(
x := 5 + 3.
y := x * 2.
z := y / (1 + 1).
)";
auto tokens = lex(code);
std::cout << "Tokens:\n";
for (auto &t : tokens) {
if (t.type == TokenType::NUMBER) std::cout << "NUMBER(" << t.value << ")\n";
else if (t.type == TokenType::IDENTIFIER) std::cout << "IDENTIFIER(" << t.value << ")\n";
else if (t.type == TokenType::ASSIGN) std::cout << "ASSIGN(" << t.value << ")\n";
else if (t.type == TokenType::PLUS) std::cout << "PLUS(" << t.value << ")\n";
else if (t.type == TokenType::MINUS) std::cout << "MINUS(" << t.value << ")\n";
else if (t.type == TokenType::MULT) std::cout << "MULT(" << t.value << ")\n";
else if (t.type == TokenType::DIV) std::cout << "DIV(" << t.value << ")\n";
else if (t.type == TokenType::LPAREN) std::cout << "LPAREN(" << t.value << ")\n";
else if (t.type == TokenType::RPAREN) std::cout << "RPAREN(" << t.value << ")\n";
else if (t.type == TokenType::DOT) std::cout << "DOT(" << t.value << ")\n";
}
Parser parser(tokens);
auto ast = parser.parseProgram();
std::cout << "\nAST:\n";
ast->print();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment