DEV Community

Javad
Javad

Posted on

Compiler Design 101: Falcon - A Minimal Production‑Oriented Language and Compiler

Hey Dev Community!

I’m glad you’re here — buckle up. Below is a complete, runnable, production‑oriented example of a tiny language called Falcon with full source code (C++17) for a lexer, parser, AST, semantic pass, LLVM codegen, and an ORC JIT runner. Paste this into a repo, follow the build steps, and you’ll have a working compiler that compiles a tiny program and runs it. I’ll keep it serious and professional — and yes, I’ll sprinkle a little humor so you don’t fall asleep while we bend LLVM to our will 😅☕.

Scope: This is a minimal but complete end‑to‑end example. Falcon supports:

  • integer literals
  • binary +, -, * and /
  • function declarations fn name() { return <expr>; }
  • a main function returning int

It demonstrates the full pipeline: lexing → parsing → AST → semantic checks → lowering to LLVM IR → ORC JIT execution.


Project layout


falcon/
CMakeLists.txt
src/
main.cpp
token.h
lexer.h lexer.cpp
ast.h ast.cpp
parser.h parser.cpp
sema.h sema.cpp
codegen.h codegen.cpp
examples/
hello.fal
README.md


Build prerequisites

  • Linux or macOS (Windows possible with adjustments)
  • C++17 compiler (clang or gcc)
  • CMake 3.15+
  • LLVM development libraries (LLVM 12+ recommended). Set LLVM_DIR to your LLVM cmake dir (e.g., /usr/lib/llvm-14/cmake or the path from llvm-config --cmakedir).

CMakeLists.txt

`cmake
# Build script for the Falcon compiler driver.
cmake_minimum_required(VERSION 3.15)
project(falcon LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Locate LLVM through its exported CMake package (point LLVM_DIR at it).
find_package(LLVM REQUIRED CONFIG)
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
message(STATUS "LLVM include dirs: ${LLVM_INCLUDE_DIRS}")

include_directories(${LLVM_INCLUDE_DIRS})
# LLVM_DEFINITIONS is a single space-separated string; split it into a list
# before handing it to add_definitions().
separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS})
add_definitions(${LLVM_DEFINITIONS_LIST})

set(SRC
    src/main.cpp
    src/lexer.cpp
    src/parser.cpp
    src/ast.cpp
    src/sema.cpp
    src/codegen.cpp
)

add_executable(falcon ${SRC})

# Translate LLVM component names into the library names to link against.
llvm_map_components_to_libnames(REQ_LLVM_LIBS
    Core
    Support
    ExecutionEngine
    OrcJIT
    Target
    IRReader
    Analysis
    TransformUtils
    InstCombine
    ScalarOpts
    CodeGen
    nativecodegen
)
message(STATUS "LLVM libraries: ${REQ_LLVM_LIBS}")
target_link_libraries(falcon PRIVATE ${REQ_LLVM_LIBS})
`


Source files

Paste each file into src/ as named.


src/token.h

`cpp

#pragma once

#include <cstddef>
#include <string>

// Every kind of token the Falcon lexer can produce.
enum class TokenKind {
    EndOfFile,
    Identifier,
    Number,
    KwFn,
    KwReturn,
    Plus,
    Minus,
    Star,
    Slash,
    LParen,
    RParen,
    LBrace,
    RBrace,
    Semicolon,
    Unknown
};

// Location of a token or AST node in the source text.
struct Span {
    std::size_t start = 0; // offset of the first character
    std::size_t end = 0;   // offset one past the last character
    int line = 1;          // line of the first character (starts at 1)
    int col = 1;           // column of the first character (starts at 1)
};

// A single lexed token: its kind, the exact source text, and where it was found.
struct Token {
    // Initialised so that a default-constructed Token (e.g. Lexer::lastToken
    // before the first scan) never carries an indeterminate kind.
    TokenKind kind = TokenKind::Unknown;
    std::string lexeme;
    Span span;
};
`


src/lexer.h

`cpp

pragma once

include "token.h"

include

include

class Lexer {
public:
Lexer(const std::string &src);
Token next();
Token peek() const;
bool eof() const;

private:
const std::string src;
size_t pos = 0;
int line = 1;
int col = 1;
Token lastToken;

char peekChar() const;
char getChar();
void skipWhitespaceAndComments();
Token makeToken(TokenKind kind, const std::string &lexeme, sizet start, sizet end, int l, int c);
Token identifierOrKeyword(size_t start, int l, int c);
Token numberToken(size_t start, int l, int c);
Enter fullscreen mode Exit fullscreen mode

};
`


src/lexer.cpp

`cpp

include "lexer.h"

include

include

static std::unordered_map keywords = {
{"fn", TokenKind::KwFn},
{"return", TokenKind::KwReturn}
};

Lexer::Lexer(const std::string &s) : src(s) {}

// Looks at the character under the read position without consuming it.
// Yields '\0' once the position is at or past the end of the source.
char Lexer::peekChar() const {
    return pos < src.size() ? src[pos] : '\0';
}

// Consumes one character and keeps the line/column counters in step with it.
// Yields '\0' (and consumes nothing) once the source is exhausted.
char Lexer::getChar() {
    if (pos >= src.size()) {
        return '\0';
    }
    const char consumed = src[pos];
    ++pos;
    if (consumed == '\n') {
        // A newline starts the next line at column 1.
        ++line;
        col = 1;
    } else {
        ++col;
    }
    return consumed;
}

// Advances past any run of whitespace and `//` line comments so that the read
// position rests on the first character of the next token (or at end of input).
void Lexer::skipWhitespaceAndComments() {
    for (;;) {
        const char ch = peekChar();
        if (ch == '\0') {
            return;
        }
        if (isspace(static_cast<unsigned char>(ch))) {
            getChar();
            continue;
        }
        const bool startsLineComment =
            ch == '/' && pos + 1 < src.size() && src[pos + 1] == '/';
        if (!startsLineComment) {
            return;
        }
        // Drop everything up to, but not including, the newline; the newline
        // itself is consumed as whitespace on the next iteration.
        while (peekChar() != '\n' && peekChar() != '\0') {
            getChar();
        }
    }
}

// Builds a token from its parts, remembers it in lastToken, and returns it.
//   kind       - token category
//   lexeme     - exact source text of the token
//   start, end - offsets of the token in the source (end is one past the last char)
//   l, c       - line and column where the token starts
Token Lexer::makeToken(TokenKind kind, const std::string &lexeme, size_t start, size_t end, int l, int c) {
    Token t;
    t.kind = kind;
    t.lexeme = lexeme;
    t.span.start = start;
    t.span.end = end;
    t.span.line = l;
    t.span.col = c;
    lastToken = t;
    return t;
}

// Scans a run of letters, digits and underscores beginning at `start` and
// classifies it as a keyword (when listed in `keywords`) or a plain identifier.
// `l` and `c` are the line and column where the token starts.
Token Lexer::identifierOrKeyword(size_t start, int l, int c) {
    for (char ch = peekChar();
         isalnum(static_cast<unsigned char>(ch)) || ch == '_';
         ch = peekChar()) {
        getChar();
    }
    const size_t stop = pos;
    const std::string text = src.substr(start, stop - start);
    const auto entry = keywords.find(text);
    const TokenKind kind =
        (entry != keywords.end()) ? entry->second : TokenKind::Identifier;
    return makeToken(kind, text, start, stop, l, c);
}

// Scans a run of decimal digits beginning at `start` and returns it as a
// Number token. `l` and `c` are the line and column where the token starts.
Token Lexer::numberToken(size_t start, int l, int c) {
    for (;;) {
        const char ch = peekChar();
        if (!isdigit(static_cast<unsigned char>(ch))) {
            break;
        }
        getChar();
    }
    const size_t stop = pos;
    const std::string digits = src.substr(start, stop - start);
    return makeToken(TokenKind::Number, digits, start, stop, l, c);
}

// Scans and returns the next token.
// Leading whitespace and `//` comments are skipped first. At end of input an
// EndOfFile token with an empty lexeme is returned. A character that starts no
// known token is consumed and returned as an Unknown token.
Token Lexer::next() {
    skipWhitespaceAndComments();
    size_t start = pos;
    int l = line, c = col; // position of the token's first character
    char ch = peekChar();
    if (ch == '\0') return makeToken(TokenKind::EndOfFile, "", pos, pos, l, c);
    if (isalpha((unsigned char)ch) || ch == '_') return identifierOrKeyword(start, l, c);
    if (isdigit((unsigned char)ch)) return numberToken(start, l, c);
    getChar(); // consume the single-character token
    switch (ch) {
    case '+': return makeToken(TokenKind::Plus, "+", start, pos, l, c);
    case '-': return makeToken(TokenKind::Minus, "-", start, pos, l, c);
    case '*': return makeToken(TokenKind::Star, "*", start, pos, l, c);
    case '/': return makeToken(TokenKind::Slash, "/", start, pos, l, c);
    case '(': return makeToken(TokenKind::LParen, "(", start, pos, l, c);
    case ')': return makeToken(TokenKind::RParen, ")", start, pos, l, c);
    case '{': return makeToken(TokenKind::LBrace, "{", start, pos, l, c);
    case '}': return makeToken(TokenKind::RBrace, "}", start, pos, l, c);
    case ';': return makeToken(TokenKind::Semicolon, ";", start, pos, l, c);
    default:
        return makeToken(TokenKind::Unknown, std::string(1, ch), start, pos, l, c);
    }
}

// Returns a copy of the token most recently built by makeToken().
// NOTE(review): despite the name, this does not look ahead at the upcoming
// token; confirm that is the intended contract before relying on it.
Token Lexer::peek() const {
    const Token &mostRecent = lastToken;
    return mostRecent;
}

// Reports whether the read position has reached the end of the source text.
// Trailing whitespace or comments that have not been skipped yet still count
// as remaining input.
bool Lexer::eof() const {
    const bool hasMoreInput = pos < src.size();
    return !hasMoreInput;
}
`


src/ast.h

`cpp

pragma once

include "token.h"

include

include

include

struct Expr {
Span span;
virtual ~Expr() = default;
};

struct NumberExpr : Expr {
int value;
NumberExpr(int v) : value(v) {}
};

struct BinaryExpr : Expr {
char op;
std::unique_ptr lhs, rhs;
BinaryExpr(char o, std::uniqueptr l, std::uniqueptr r) : op(o), lhs(std::move(l)), rhs(std::move(r)) {}
};

struct Stmt {
Span span;
virtual ~Stmt() = default;
};

struct ReturnStmt : Stmt {
std::unique_ptr expr;
ReturnStmt(std::unique_ptr e) : expr(std::move(e)) {}
};

struct FunctionDecl {
std::string name;
std::vectorstd::string params;
std::vectorstd::unique_ptr<Stmt> body;
Span span;
};
`


src/parser.h

`cpp

pragma once

include "lexer.h"

include "ast.h"

include

include

class Parser {
public:
Parser(Lexer &lex);
std::vectorstd::unique_ptr<FunctionDecl> parseModule();

private:
Lexer &lex;
Token cur;

void advance();
bool match(TokenKind k);
void expect(TokenKind k, const std::string &msg);
std::unique_ptr<FunctionDecl> parseFunction();
std::unique_ptr<Stmt> parseStatement();
std::unique_ptr<ReturnStmt> parseReturn();
std::unique_ptr<Expr> parseExpression();
std::unique_ptr<Expr> parsePrimary();
std::uniqueptr<Expr> parseBinaryRHS(int exprPrec, std::uniqueptr<Expr> lhs);
int getTokPrecedence();
Enter fullscreen mode Exit fullscreen mode

};
`


src/parser.cpp

`cpp

include "parser.h"

include

include

Parser::Parser(Lexer &l) : lex(l) {
cur = lex.next();
}

// Replaces the current token with the next one from the lexer.
void Parser::advance() {
    cur = lex.next();
}

// Consumes the current token when it is of kind `k`.
// Returns whether a token was consumed.
bool Parser::match(TokenKind k) {
    const bool matched = (cur.kind == k);
    if (matched) {
        advance();
    }
    return matched;
}

// Requires the current token to be of kind `k` and consumes it.
// Throws std::runtime_error("Parse error: " + msg) when it is not.
void Parser::expect(TokenKind k, const std::string &msg) {
    if (cur.kind == k) {
        advance();
        return;
    }
    throw std::runtime_error("Parse error: " + msg);
}

std::vectorstd::unique_ptr<FunctionDecl> Parser::parseModule() {
std::vectorstd::unique_ptr<FunctionDecl> funcs;
while (cur.kind != TokenKind::EndOfFile) {
funcs.push_back(parseFunction());
}
return funcs;
}

std::unique_ptr Parser::parseFunction() {
expect(TokenKind::KwFn, "expected 'fn'");
if (cur.kind != TokenKind::Identifier) throw std::runtime_error("expected function name");
std::string name = cur.lexeme;
advance();
expect(TokenKind::LParen, "expected '('");
expect(TokenKind::RParen, "expected ')'");
expect(TokenKind::LBrace, "expected '{'");
auto fn = std::make_unique();
fn->name = name;
while (cur.kind != TokenKind::RBrace && cur.kind != TokenKind::EndOfFile) {
fn->body.push_back(parseStatement());
}
expect(TokenKind::RBrace, "expected '}'");
return fn;
}

std::unique_ptr Parser::parseStatement() {
if (cur.kind == TokenKind::KwReturn) return parseReturn();
throw std::runtime_error("unknown statement");
}

std::unique_ptr Parser::parseReturn() {
expect(TokenKind::KwReturn, "expected 'return'");
auto expr = parseExpression();
expect(TokenKind::Semicolon, "expected ';' after return");
return std::make_unique(std::move(expr));
}

std::unique_ptr Parser::parseExpression() {
auto lhs = parsePrimary();
return parseBinaryRHS(0, std::move(lhs));
}

std::unique_ptr Parser::parsePrimary() {
if (cur.kind == TokenKind::Number) {
int v = std::stoi(cur.lexeme);
auto n = std::make_unique(v);
advance();
return n;
}
if (cur.kind == TokenKind::LParen) {
advance();
auto e = parseExpression();
expect(TokenKind::RParen, "expected ')'");
return e;
}
throw std::runtime_error("expected primary expression");
}

// Binding strength of the current token when used as a binary operator:
// 10 for + and -, 20 for * and /, and -1 for any token that is not one.
int Parser::getTokPrecedence() {
    switch (cur.kind) {
    case TokenKind::Plus:
    case TokenKind::Minus:
        return 10;
    case TokenKind::Star:
    case TokenKind::Slash:
        return 20;
    default:
        return -1;
    }
}

std::uniqueptr Parser::parseBinaryRHS(int exprPrec, std::uniqueptr lhs) {
while (true) {
int tokPrec = getTokPrecedence();
if (tokPrec < exprPrec) return lhs;
Token op = cur;
advance();
auto rhs = parsePrimary();
int nextPrec = getTokPrecedence();
if (tokPrec < nextPrec) {
rhs = parseBinaryRHS(tokPrec + 1, std::move(rhs));
}
char opc = '+';
if (op.kind == TokenKind::Plus) opc = '+';
else if (op.kind == TokenKind::Minus) opc = '-';
else if (op.kind == TokenKind::Star) opc = '*';
else if (op.kind == TokenKind::Slash) opc = '/';
lhs = std::make_unique(opc, std::move(lhs), std::move(rhs));
}
}
`


src/sema.h

`cpp

pragma once

include "ast.h"

include

include

class Semantic {
public:
void analyze(const std::vectorstd::unique_ptr<FunctionDecl> &funcs);
private:
void analyzeFunction(const FunctionDecl *fn);
void analyzeStmt(const Stmt *s);
void analyzeExpr(const Expr *e);
};
`


src/sema.cpp

`cpp

include "sema.h"

include

include

void Semantic::analyze(const std::vectorstd::unique_ptr<FunctionDecl> &funcs) {
bool hasMain = false;
for (auto &f : funcs) {
if (f->name == "main") hasMain = true;
analyzeFunction(f.get());
}
if (!hasMain) throw std::runtime_error("no main function found");
}

void Semantic::analyzeFunction(const FunctionDecl fn) {
// For this minimal language, just ensure body has a return
bool hasReturn = false;
for (auto &s : fn->body) {
if (dynamic_cast<ReturnStmt
>(s.get())) hasReturn = true;
}
if (!hasReturn) {
throw std::runtime_error("function '" + fn->name + "' missing return");
}
}

// Placeholder for statement-level checks; performs none at present.
void Semantic::analyzeStmt(const Stmt *s) {
    (void)s; // nothing for now
}

// Placeholder for expression-level checks; performs none at present.
void Semantic::analyzeExpr(const Expr *e) {
    (void)e; // nothing for now
}
`


src/codegen.h

`cpp

pragma once

include "ast.h"

include

include

include

class CodeGen {
public:
CodeGen();
std::uniqueptrllvm::Module generate(const std::vectorstd::uniqueptr<FunctionDecl> &funcs);
private:
llvm::LLVMContext context;
llvm::IRBuilder<> builder{context};
llvm::Module *module = nullptr;

llvm::Value genExpr(const Expr e);
void genFunction(const FunctionDecl *fn);
Enter fullscreen mode Exit fullscreen mode

};
`


src/codegen.cpp

`cpp

include "codegen.h"

include

include

include

include

include

include

include

include

using namespace llvm;
using namespace llvm::orc;

CodeGen::CodeGen() : context(), builder(context) {}

// Emits IR for expression `e` at the builder's current insertion point and
// returns the resulting i32 value.
// Throws std::runtime_error for an unrecognised node type or operator.
Value *CodeGen::genExpr(const Expr *e) {
    if (auto *n = dynamic_cast<const NumberExpr *>(e)) {
        // `value` is a signed int, so build the constant as signed.
        return ConstantInt::get(Type::getInt32Ty(context), n->value, /*isSigned=*/true);
    }
    if (auto *b = dynamic_cast<const BinaryExpr *>(e)) {
        // Operands are emitted left to right.
        Value *L = genExpr(b->lhs.get());
        Value *R = genExpr(b->rhs.get());
        switch (b->op) {
        case '+': return builder.CreateAdd(L, R, "addtmp");
        case '-': return builder.CreateSub(L, R, "subtmp");
        case '*': return builder.CreateMul(L, R, "multmp");
        case '/': return builder.CreateSDiv(L, R, "divtmp");
        default: throw std::runtime_error("unknown binary op");
        }
    }
    throw std::runtime_error("unknown expr node");
}

void CodeGen::genFunction(const FunctionDecl fn) {
// all functions return i32 and take no params in this minimal example
FunctionType *ft = FunctionType::get(Type::getInt32Ty(context), false);
Function *f = Function::Create(ft, Function::ExternalLinkage, fn->name, module);
BasicBlock *bb = BasicBlock::Create(context, "entry", f);
builder.SetInsertPoint(bb);
for (auto &s : fn->body) {
if (auto ret = dynamic_cast<ReturnStmt
>(s.get())) {
Value *v = genExpr(ret->expr.get());
builder.CreateRet(v);
// after return, we could create a new block, but minimal example ends here
}
}
// verify
if (verifyFunction(*f, &errs())) {
throw std::runtime_error("function verification failed");
}
}

std::uniqueptr CodeGen::generate(const std::vectorstd::uniqueptr<FunctionDecl> &funcs) {
module = new Module("falcon_module", context);
for (auto &f : funcs) genFunction(f.get());
return std::unique_ptr(module);
}
`


src/main.cpp

`cpp

include "lexer.h"

include "parser.h"

include "sema.h"

include "codegen.h"

include

include

include

include

include

include

using namespace llvm;
using namespace llvm::orc;

int main(int argc, char argv) {
if (argc < 2) {
std::cerr << "Usage: falcon \n";
return 1;
}
std::ifstream in(argv[1]);
if (!in) { std::cerr << "Cannot open file\n"; return 1; }
std::stringstream ss; ss << in.rdbuf();
std::string src = ss.str();

try {
    Lexer lex(src);
    Parser parser(lex);
    auto funcs = parser.parseModule();

    Semantic sem;
    sem.analyze(funcs);

    CodeGen cg;
    auto module = cg.generate(funcs);

    // Initialize LLVM targets for JIT
    InitializeNativeTarget();
    InitializeNativeTargetAsmPrinter();
    InitializeNativeTargetAsmParser();

    auto J = cantFail(LLJITBuilder().create());
    auto TSM = ThreadSafeModule(std::move(module), std::make_unique<LLVMContext>());
    cantFail(J->addIRModule(std::move(TSM)));

    auto sym = J->lookup("main");
    if (!sym) { std::cerr << "main not found\n"; return 1; }
    using MainFn = int(*)();
    auto mainPtr = (MainFn)sym->getAddress();
    int result = mainPtr();
    std::cout << "Program returned " << result << "\n";
} catch (const std::exception &ex) {
    std::cerr << "Error: " << ex.what() << "\n";
    return 1;
}
return 0;
Enter fullscreen mode Exit fullscreen mode

}
`


Example program

examples/hello.fal:


fn main() {
return 40 + 2;
}


Build & Run

  1. Create the directory structure and files as above.
  2. Configure and build:

```bash
mkdir build
cd build
cmake .. -DLLVM_DIR=/path/to/llvm/cmake
make -j
```

  3. Run:

```bash
./falcon ../examples/hello.fal
```

Expected output: `Program returned 42`


Notes

I will publish it on GitHub under the name Falcon (if that name is available);
otherwise it will be published as Falcon-Lang or Falcon-Prolang.

Have a nice time!

Top comments (1)

Collapse
 
javadinteger profile image
Javad

This post is the first entry in the Compiler Design 101 series 🚀
Stay tuned for the next one