As a best-selling author, I invite you to explore my books on Amazon. Don't forget to follow me on Medium and show your support. Thank you! Your support means the world!
Building a compiler might sound like a task for experts, but with Python, you can start understanding the process step by step. I will show you how to break down this complex idea into smaller, manageable pieces. Think of it as teaching a computer to read a new language, from the basic alphabet to forming complete, runnable sentences.
The first step is teaching the computer to recognize the basic building blocks of your language, like numbers, words, and symbols. This is called lexical analysis, or tokenization. In Python, tools like Sly help us do this by defining patterns. You tell it what a number looks like, what an operator looks like, and it scans the code to produce a list of these tokens.
from sly import Lexer
class CalcLexer(Lexer):
tokens = { NUMBER, ID, PLUS, MINUS, TIMES,
DIVIDE, ASSIGN, LPAREN, RPAREN }
ignore = ' \t'
ignore_comment = r'\#.*'
PLUS = r'\+'
MINUS = r'-'
TIMES = r'\*'
DIVIDE = r'/'
ASSIGN = r'='
LPAREN = r'\('
RPAREN = r'\)'
ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
@_(r'\d+')
def NUMBER(self, t):
t.value = int(t.value)
return t
@_(r'\n+')
def newline(self, t):
self.lineno += len(t.value)
def error(self, t):
print(f"Illegal character '{t.value[0]}'")
self.index += 1
# Tokenize sample code
lexer = CalcLexer()
code = """
x = 42 + (y * 3) # Assignment
result = x / 2
"""
tokens = list(lexer.tokenize(code))
for token in tokens:
print(f"{token.type:10} {token.value!r:10} at line {token.lineno}")
Once you have tokens, the next step is to figure out how they fit together grammatically. This is parsing. A parser takes the flat list of tokens and builds a structured tree that represents the order of operations. Lark is a great Python library for this. You write a grammar that defines rules, like "an expression can be a term or an expression plus a term."
from lark import Lark, Transformer, Visitor
calculator_grammar = """
start: expr
expr: term
| expr "+" term -> add
| expr "-" term -> sub
term: factor
| term "*" factor -> mul
| term "/" factor -> div
factor: NUMBER -> number
| "-" factor -> neg
| "(" expr ")"
NUMBER: /-?\d+(\.\d+)?/
%ignore " "
"""
class CalcTransformer(Transformer):
def add(self, items):
left, right = items
return ('add', left, right)
def sub(self, items):
left, right = items
return ('sub', left, right)
def mul(self, items):
left, right = items
return ('mul', left, right)
def div(self, items):
left, right = items
return ('div', left, right)
def neg(self, items):
value = items[0]
return ('neg', value)
def number(self, items):
return ('number', float(items[0]))
parser = Lark(calculator_grammar, parser='lalr', transformer=CalcTransformer())
# Parse and transform
ast = parser.parse("3 + 4 * (2 - 1)")
print(f"AST: {ast}")
# Evaluate the AST
def evaluate(node):
if node[0] == 'number':
return node[1]
elif node[0] == 'add':
return evaluate(node[1]) + evaluate(node[2])
elif node[0] == 'sub':
return evaluate(node[1]) - evaluate(node[2])
elif node[0] == 'mul':
return evaluate(node[1]) * evaluate(node[2])
elif node[0] == 'div':
return evaluate(node[1]) / evaluate(node[2])
elif node[0] == 'neg':
return -evaluate(node[1])
result = evaluate(ast)
print(f"Result: {result}")
Syntax is about structure, but meaning is about rules. This is semantic analysis. It checks if the program makes sense. For example, is a variable used before it's declared? Are you trying to add a number to a text string? You need a symbol table to keep track of all variables and their types as you analyze the code.
from dataclasses import dataclass
from typing import Dict, Optional, Any
@dataclass
class Symbol:
name: str
type: str
scope: str
defined_at: int
class SemanticAnalyzer:
def __init__(self):
self.symbol_table: Dict[str, Symbol] = {}
self.current_scope = "global"
self.errors = []
def enter_scope(self, scope_name: str):
self.current_scope = scope_name
def exit_scope(self):
self.current_scope = "global"
def declare_variable(self, name: str, var_type: str, line: int):
if name in self.symbol_table:
existing = self.symbol_table[name]
if existing.scope == self.current_scope:
self.errors.append(
f"Line {line}: Variable '{name}' already declared in this scope"
)
return False
self.symbol_table[name] = Symbol(
name=name,
type=var_type,
scope=self.current_scope,
defined_at=line
)
return True
def lookup_variable(self, name: str, line: int) -> Optional[Symbol]:
# Check current scope first
for sym in self.symbol_table.values():
if sym.name == name and sym.scope == self.current_scope:
return sym
# Check global scope
for sym in self.symbol_table.values():
if sym.name == name and sym.scope == "global":
return sym
self.errors.append(f"Line {line}: Undefined variable '{name}'")
return None
def check_assignment(self, target: str, expr_type: str, line: int) -> bool:
sym = self.lookup_variable(target, line)
if not sym:
return False
if sym.type != expr_type:
self.errors.append(
f"Line {line}: Type mismatch. Cannot assign {expr_type} to {sym.type} variable '{target}'"
)
return False
return True
def check_operation(self, op: str, left_type: str, right_type: str, line: int) -> Optional[str]:
"""Check if operation is valid and return result type."""
numeric_ops = {'+', '-', '*', '/'}
comparison_ops = {'<', '>', '<=', '>=', '==', '!='}
# Numeric operations
if op in numeric_ops:
if left_type == 'int' and right_type == 'int':
return 'int'
elif left_type in ['int', 'float'] and right_type in ['int', 'float']:
return 'float'
else:
self.errors.append(
f"Line {line}: Cannot apply '{op}' to {left_type} and {right_type}"
)
return None
# Comparison operations
if op in comparison_ops:
if left_type == right_type:
return 'bool'
elif left_type in ['int', 'float'] and right_type in ['int', 'float']:
return 'bool'
else:
self.errors.append(
f"Line {line}: Cannot compare {left_type} and {right_type}"
)
return None
# Boolean operations
if op in {'and', 'or'}:
if left_type == 'bool' and right_type == 'bool':
return 'bool'
else:
self.errors.append(
f"Line {line}: Cannot apply '{op}' to non-boolean types"
)
return None
self.errors.append(f"Line {line}: Unknown operation '{op}'")
return None
# Example semantic analysis
analyzer = SemanticAnalyzer()
# Simulate parsing a small program
program_lines = [
("declare", "x", "int", 1),
("declare", "y", "float", 2),
("assign", "x", ("number", 10, "int"), 3),
("assign", "y", ("number", 3.5, "float"), 4),
("assign", "x", ("binary", "+", ("var", "x"), ("number", 5, "int")), 5),
("assign", "z", ("number", 1, "int"), 6), # Error: undeclared
("operation", "<", ("var", "x"), ("var", "y"), 7), # Error: type mismatch
]
for stmt in program_lines:
if stmt[0] == "declare":
_, name, var_type, line = stmt
analyzer.declare_variable(name, var_type, line)
elif stmt[0] == "assign":
_, target, expr, line = stmt
if expr[0] == "number":
expr_type = expr[2]
elif expr[0] == "var":
sym = analyzer.lookup_variable(expr[1], line)
expr_type = sym.type if sym else None
elif expr[0] == "binary":
# Simplified: assume we know the type
expr_type = "int"
if expr_type:
analyzer.check_assignment(target, expr_type, line)
print("Semantic analysis results:")
if analyzer.errors:
for error in analyzer.errors:
print(f" Error: {error}")
else:
print(" No errors found")
print(f"Symbol table: {[s.name for s in analyzer.symbol_table.values()]}")
After checking the program's meaning, we often translate it into a simpler, more generic form. This is called an Intermediate Representation. It's like a universal middle language that makes the next steps easier. It often looks like simple three-address code: t1 = x + y.
from dataclasses import dataclass
from typing import List, Optional, Union, Dict
from enum import Enum
class IRType(Enum):
INTEGER = "int"
FLOAT = "float"
BOOLEAN = "bool"
VOID = "void"
@dataclass
class TempVariable:
id: int
type: IRType
@dataclass
class IRInstruction:
opcode: str
operands: List[Union[str, TempVariable, int, float]]
result: Optional[TempVariable] = None
class IRGenerator:
def __init__(self):
self.temp_counter = 0
self.instructions: List[IRInstruction] = []
self.labels: Dict[str, int] = {}
def new_temp(self, ir_type: IRType) -> TempVariable:
self.temp_counter += 1
return TempVariable(id=self.temp_counter, type=ir_type)
def add_instruction(self, opcode: str, operands: List, result: Optional[TempVariable] = None):
self.instructions.append(
IRInstruction(opcode=opcode, operands=operands, result=result)
)
def add_label(self, name: str):
self.labels[name] = len(self.instructions)
self.add_instruction("LABEL", [name])
def generate_from_ast(self, node) -> List[IRInstruction]:
"""Generate IR from AST node."""
if node[0] == 'program':
for stmt in node[1]:
self.generate_from_ast(stmt)
elif node[0] == 'assign':
var_name, expr = node[1], node[2]
expr_result = self.generate_expression(expr)
self.add_instruction("STORE", [expr_result, var_name])
elif node[0] == 'if':
cond, then_block, else_block = node[1], node[2], node[3]
# Generate condition
cond_result = self.generate_expression(cond)
# Create labels
else_label = f"L{len(self.labels)}"
end_label = f"L{len(self.labels) + 1}"
# Conditional jump
self.add_instruction("JUMP_IF_FALSE", [cond_result, else_label])
# Then block
for stmt in then_block:
self.generate_from_ast(stmt)
self.add_instruction("JUMP", [end_label])
# Else block
self.add_label(else_label)
if else_block:
for stmt in else_block:
self.generate_from_ast(stmt)
self.add_label(end_label)
elif node[0] == 'while':
cond, body = node[1], node[2]
start_label = f"L{len(self.labels)}"
end_label = f"L{len(self.labels) + 1}"
self.add_label(start_label)
# Condition
cond_result = self.generate_expression(cond)
self.add_instruction("JUMP_IF_FALSE", [cond_result, end_label])
# Body
for stmt in body:
self.generate_from_ast(stmt)
self.add_instruction("JUMP", [start_label])
self.add_label(end_label)
return self.instructions
def generate_expression(self, node) -> TempVariable:
"""Generate IR for expression, return temp variable holding result."""
if node[0] == 'number':
value = node[1]
result = self.new_temp(IRType.FLOAT if '.' in str(value) else IRType.INTEGER)
self.add_instruction("LOAD_CONST", [value], result)
return result
elif node[0] == 'var':
var_name = node[1]
result = self.new_temp(IRType.INTEGER) # Simplified type inference
self.add_instruction("LOAD", [var_name], result)
return result
elif node[0] in {'add', 'sub', 'mul', 'div'}:
left = self.generate_expression(node[1])
right = self.generate_expression(node[2])
opcode_map = {
'add': 'ADD',
'sub': 'SUB',
'mul': 'MUL',
'div': 'DIV'
}
result = self.new_temp(IRType.FLOAT) # Assume float for arithmetic
self.add_instruction(opcode_map[node[0]], [left, right], result)
return result
elif node[0] == 'compare':
left = self.generate_expression(node[2])
right = self.generate_expression(node[3])
op_map = {
'<': 'LT',
'>': 'GT',
'<=': 'LTE',
'>=': 'GTE',
'==': 'EQ',
'!=': 'NEQ'
}
result = self.new_temp(IRType.BOOLEAN)
self.add_instruction(op_map[node[1]], [left, right], result)
return result
# Generate IR for a simple program
ast = ('program', [
('assign', 'x', ('number', 10)),
('assign', 'y', ('number', 20)),
('assign', 'z', ('add', ('var', 'x'), ('var', 'y'))),
('if',
('compare', '<', ('var', 'z'), ('number', 50)),
[('assign', 'result', ('number', 1))],
[('assign', 'result', ('number', 0))]
)
])
generator = IRGenerator()
ir_code = generator.generate_from_ast(ast)
print("Generated IR Code:")
for i, instr in enumerate(ir_code):
operands_str = ', '.join(str(op) for op in instr.operands)
result_str = f" -> t{instr.result.id}" if instr.result else ""
print(f"{i:3}: {instr.opcode:15} {operands_str}{result_str}")
Now we have a clean, simple representation. The next step is to turn it into actual runnable code. This is code generation. The target could be another high-level language like Python, or low-level assembly code. You walk through the IR instructions and emit equivalent code in the target language.
class CodeGenerator:
def __init__(self, target="python"):
self.target = target
self.indent_level = 0
self.output_lines = []
def indent(self):
self.indent_level += 1
def dedent(self):
self.indent_level -= 1
def add_line(self, line: str):
indent = " " * self.indent_level
self.output_lines.append(f"{indent}{line}")
def generate_python(self, ir_code: List[IRInstruction]) -> str:
"""Generate Python source code from IR."""
# Reset state
self.output_lines = []
self.indent_level = 0
# Track variable mappings
var_map = {}
self.add_line("# Generated Python code")
self.add_line("")
for instr in ir_code:
if instr.opcode == "LOAD_CONST":
# t1 = LOAD_CONST 10
const_val = instr.operands[0]
temp_var = f"t{instr.result.id}"
var_map[temp_var] = const_val
self.add_line(f"{temp_var} = {const_val}")
elif instr.opcode == "LOAD":
# t2 = LOAD x
var_name = instr.operands[0]
temp_var = f"t{instr.result.id}"
self.add_line(f"{temp_var} = {var_name}")
var_map[temp_var] = var_name
elif instr.opcode == "STORE":
# STORE t1, x
src = instr.operands[0]
dest = instr.operands[1]
if isinstance(src, TempVariable):
src_var = f"t{src.id}"
if src_var in var_map and not isinstance(var_map[src_var], str):
# It's a constant value
self.add_line(f"{dest} = {var_map[src_var]}")
else:
self.add_line(f"{dest} = {src_var}")
else:
self.add_line(f"{dest} = {src}")
elif instr.opcode == "ADD":
# t3 = ADD t1, t2
left, right = instr.operands
result = f"t{instr.result.id}"
left_var = f"t{left.id}" if isinstance(left, TempVariable) else left
right_var = f"t{right.id}" if isinstance(right, TempVariable) else right
self.add_line(f"{result} = {left_var} + {right_var}")
elif instr.opcode == "LABEL":
# LABEL L1
label = instr.operands[0]
self.add_line(f"{label}:")
elif instr.opcode == "JUMP_IF_FALSE":
# JUMP_IF_FALSE t4, L2
cond, label = instr.operands
cond_var = f"t{cond.id}" if isinstance(cond, TempVariable) else cond
self.add_line(f"if not {cond_var}:")
self.indent()
self.add_line(f"goto {label}")
self.dedent()
elif instr.opcode == "JUMP":
# JUMP L3
label = instr.operands[0]
self.add_line(f"goto {label}")
elif instr.opcode in {"LT", "GT", "LTE", "GTE", "EQ", "NEQ"}:
# t5 = LT t1, t2
left, right = instr.operands
result = f"t{instr.result.id}"
left_var = f"t{left.id}" if isinstance(left, TempVariable) else left
right_var = f"t{right.id}" if isinstance(right, TempVariable) else right
op_map = {
"LT": "<",
"GT": ">",
"LTE": "<=",
"GTE": ">=",
"EQ": "==",
"NEQ": "!="
}
self.add_line(f"{result} = {left_var} {op_map[instr.opcode]} {right_var}")
# Replace goto with proper control flow (simplified)
final_code = []
for line in self.output_lines:
if line.strip().startswith("goto"):
label = line.strip().split()[1]
final_code.append(f" break # goto {label}")
else:
final_code.append(line)
return "\n".join(final_code)
def generate_assembly_x86(self, ir_code: List[IRInstruction]) -> str:
"""Generate x86 assembly from IR."""
output = []
output.append("section .text")
output.append("global _start")
output.append("")
output.append("_start:")
# Simple mapping for demonstration
for instr in ir_code:
if instr.opcode == "LOAD_CONST":
const_val = instr.operands[0]
temp_reg = f"eax" # Simplified
output.append(f" mov {temp_reg}, {const_val}")
elif instr.opcode == "ADD":
left, right = instr.operands
output.append(f" add eax, ebx") # Simplified
elif instr.opcode == "STORE":
# Store to memory location
var_name = instr.operands[1]
output.append(f" mov [{var_name}], eax")
output.append(" mov eax, 1 ; sys_exit")
output.append(" int 0x80")
output.append("")
output.append("section .bss")
output.append(" x resd 1")
output.append(" y resd 1")
output.append(" z resd 1")
return "\n".join(output)
# Generate code from previous IR example
codegen = CodeGenerator(target="python")
python_code = codegen.generate_python(ir_code)
print("Generated Python Code:")
print(python_code)
print()
# Try assembly generation
print("Generated x86 Assembly (simplified):")
asm_code = codegen.generate_assembly_x86(ir_code[:3]) # Just first few instructions
print(asm_code)
Before generating the final code, we can often make it better. This is optimization. Simple optimizations include constant folding (calculating 2 + 3 at compile time to get 5) and dead code elimination (removing code that never runs). These steps make the final program faster and smaller.
class Optimizer:
def __init__(self):
self.optimizations_applied = []
def constant_folding(self, instructions: List[IRInstruction]) -> List[IRInstruction]:
"""Evaluate constant expressions at compile time."""
optimized = []
constant_values = {}
for instr in instructions:
new_instr = instr
# Track constant values
if instr.opcode == "LOAD_CONST":
const_val = instr.operands[0]
constant_values[instr.result] = const_val
# Fold binary operations with constant operands
elif instr.opcode in {"ADD", "SUB", "MUL", "DIV"}:
left, right = instr.operands
left_const = constant_values.get(left)
right_const = constant_values.get(right)
if left_const is not None and right_const is not None:
# Both operands are constants, compute result
if instr.opcode == "ADD":
result_val = left_const + right_const
elif instr.opcode == "SUB":
result_val = left_const - right_const
elif instr.opcode == "MUL":
result_val = left_const * right_const
elif instr.opcode == "DIV" and right_const != 0:
result_val = left_const / right_const
else:
result_val = None
if result_val is not None:
# Replace operation with constant load
new_instr = IRInstruction(
opcode="LOAD_CONST",
operands=[result_val],
result=instr.result
)
constant_values[instr.result] = result_val
self.optimizations_applied.append("constant_folding")
# Fold comparison operations
elif instr.opcode in {"LT", "GT", "LTE", "GTE", "EQ", "NEQ"}:
left, right = instr.operands
left_const = constant_values.get(left)
right_const = constant_values.get(right)
if left_const is not None and right_const is not None:
# Evaluate comparison
if instr.opcode == "LT":
result_val = left_const < right_const
elif instr.opcode == "GT":
result_val = left_const > right_const
elif instr.opcode == "LTE":
result_val = left_const <= right_const
elif instr.opcode == "GTE":
result_val = left_const >= right_const
elif instr.opcode == "EQ":
result_val = left_const == right_const
elif instr.opcode == "NEQ":
result_val = left_const != right_const
new_instr = IRInstruction(
opcode="LOAD_CONST",
operands=[1 if result_val else 0], # Boolean as integer
result=instr.result
)
constant_values[instr.result] = result_val
self.optimizations_applied.append("constant_folding")
optimized.append(new_instr)
return optimized
def dead_code_elimination(self, instructions: List[IRInstruction]) -> List[IRInstruction]:
"""Remove unused computations."""
# Track which temporaries are used
used_temps = set()
# First pass: find all used temporaries
for instr in instructions:
for operand in instr.operands:
if isinstance(operand, TempVariable):
used_temps.add(operand.id)
# Second pass: remove instructions whose results are unused
optimized = []
for instr in instructions:
keep = True
# Check if instruction produces an unused result
if instr.result and instr.result.id not in used_temps:
# Exception: STORE instructions have side effects
if instr.opcode != "STORE":
keep = False
self.optimizations_applied.append("dead_code_elimination")
if keep:
optimized.append(instr)
return optimized
def common_subexpression_elimination(self, instructions: List[IRInstruction]) -> List[IRInstruction]:
"""Replace repeated computations with single computation."""
expression_cache = {} # Map expression to temporary variable
optimized = []
for instr in instructions:
if instr.opcode in {"ADD", "SUB", "MUL", "DIV", "LT", "GT", "LTE", "GTE", "EQ", "NEQ"}:
# Create expression signature
left, right = instr.operands
expr_sig = (instr.opcode, left, right)
if expr_sig in expression_cache:
# Replace with copy of previous result
cached_temp = expression_cache[expr_sig]
optimized.append(IRInstruction(
opcode="COPY",
operands=[cached_temp],
result=instr.result
))
self.optimizations_applied.append("common_subexpression_elimination")
continue
else:
# Cache this expression
expression_cache[expr_sig] = instr.result
optimized.append(instr)
return optimized
def optimize(self, instructions: List[IRInstruction]) -> List[IRInstruction]:
"""Apply all optimizations."""
self.optimizations_applied = []
# Apply optimizations multiple times until no changes
changed = True
current = instructions
while changed:
changed = False
start_len = len(current)
# Apply optimizations in sequence
current = self.constant_folding(current)
current = self.common_subexpression_elimination(current)
current = self.dead_code_elimination(current)
if len(current) != start_len:
changed = True
return current
# Test optimizations
test_ir = [
IRInstruction("LOAD_CONST", [10], TempVariable(1, IRType.INTEGER)),
IRInstruction("LOAD_CONST", [20], TempVariable(2, IRType.INTEGER)),
IRInstruction("ADD", [TempVariable(1, IRType.INTEGER),
TempVariable(2, IRType.INTEGER)],
TempVariable(3, IRType.INTEGER)),
IRInstruction("LOAD_CONST", [10], TempVariable(4, IRType.INTEGER)),
IRInstruction("LOAD_CONST", [20], TempVariable(5, IRType.INTEGER)),
IRInstruction("ADD", [TempVariable(4, IRType.INTEGER),
TempVariable(5, IRType.INTEGER)],
TempVariable(6, IRType.INTEGER)), # Same as t3
IRInstruction("MUL", [TempVariable(3, IRType.INTEGER),
TempVariable(6, IRType.INTEGER)],
TempVariable(7, IRType.INTEGER)),
IRInstruction("STORE", [TempVariable(7, IRType.INTEGER), "result"]),
]
print("Original IR:")
for i, instr in enumerate(test_ir):
ops = ', '.join(f't{op.id}' if isinstance(op, TempVariable) else str(op)
for op in instr.operands)
result = f' -> t{instr.result.id}' if instr.result else ''
print(f"{i:2}: {instr.opcode:10} {ops}{result}")
optimizer = Optimizer()
optimized_ir = optimizer.optimize(test_ir)
print("\nOptimized IR:")
for i, instr in enumerate(optimized_ir):
ops = ', '.join(f't{op.id}' if isinstance(op, TempVariable) else str(op)
for op in instr.operands)
result = f' -> t{instr.result.id}' if instr.result else ''
print(f"{i:2}: {instr.opcode:10} {ops}{result}")
print(f"\nOptimizations applied: {optimizer.optimizations_applied}")
print(f"Instructions reduced: {len(test_ir)} -> {len(optimized_ir)}")
Sometimes, you don't compile everything ahead of time. Just-in-time compilation generates machine code right when the program runs. This is useful for languages that need flexibility. Python's llvmlite library lets you build and compile LLVM intermediate code on the fly.
import llvmlite.ir as ir
import llvmlite.binding as llvm
class JITCompiler:
def __init__(self):
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
self.module = ir.Module(name="jit_module")
self.builder = None
self.functions = {}
def compile_function(self, name, param_types, return_type, ir_generator):
"""Compile a function to machine code."""
# Create function type
func_type = ir.FunctionType(return_type, param_types)
# Create function
func = ir.Function(self.module, func_type, name=name)
# Create entry block
entry_block = func.append_basic_block(name="entry")
self.builder = ir.IRBuilder(entry_block)
# Generate IR using the provided generator
ir_generator(self.builder, func.args)
# Verify the module
llvm.parse_assembly(str(self.module))
# Create execution engine
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
backing_mod = llvm.parse_assembly(str(self.module))
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
# Get compiled function pointer
func_ptr = engine.get_function_address(name)
self.functions[name] = {
'function': func,
'engine': engine,
'address': func_ptr
}
return func_ptr
def execute_function(self, name, *args):
"""Execute a compiled function."""
if name not in self.functions:
raise ValueError(f"Function {name} not compiled")
# Get function address and create ctypes wrapper
import ctypes
func_ptr = self.functions[name]['address']
# Simple execution for demonstration
# In reality, we'd need to match the exact signature
print(f"Executing {name} at address {hex(func_ptr)}")
print(f"Module IR:\n{self.module}")
return None
# Example: Compile and execute a simple function
jit = JITCompiler()
def generate_add_function(builder, args):
"""IR generator for add function."""
# args[0] and args[1] are the function parameters
result = builder.add(args[0], args[1], name="result")
builder.ret(result)
# Compile add function
add_func_ptr = jit.compile_function(
name="add",
param_types=[ir.IntType(32), ir.IntType(32)],
return_type=ir.IntType(32),
ir_generator=generate_add_function
)
print("Successfully compiled 'add' function")
# Show generated IR
print("\nGenerated LLVM IR:")
print(jit.module)
# Example: Compile a more complex function
def generate_factorial(builder, args):
"""IR generator for factorial function (recursive)."""
# Create basic blocks
entry = builder.block
then_block = builder.append_basic_block("then")
else_block = builder.append_basic_block("else")
merge_block = builder.append_basic_block("merge")
# Compare n <= 1
one = ir.Constant(ir.IntType(32), 1)
cmp = builder.icmp_signed('<=', args[0], one)
builder.cbranch(cmp, then_block, else_block)
# then block: return 1
builder.position_at_start(then_block)
builder.ret(one)
# else block: return n * factorial(n-1)
builder.position_at_start(else_block)
n_minus_one = builder.sub(args[0], one)
# Call factorial recursively
factorial_func = jit.module.get_global("factorial")
if not factorial_func:
# Declare the function first
fact_type = ir.FunctionType(ir.IntType(32), [ir.IntType(32)])
factorial_func = ir.Function(jit.module, fact_type, name="factorial")
recurse = builder.call(factorial_func, [n_minus_one])
result = builder.mul(args[0], recurse)
builder.ret(result)
# Note: This creates a recursive function - compile would need special handling
print("\nFactorial function IR would be generated here")
# Cleanup and show module
print("\nFinal module contains:")
for func in jit.module.functions:
print(f" Function: {func.name}")
Finally, you might not need a full general-purpose language. A Domain-Specific Language is tailored for a specific task, like querying a database or configuring a server. You can build these by embedding them in Python, using its own syntax to define new, specialized rules.
class QueryDSL:
"""A DSL for database queries."""
def __init__(self, table_name):
self.table_name = table_name
self.filters = []
self.projections = []
self.order_by = None
self.limit_value = None
def select(self, *columns):
self.projections.extend(columns)
return self
def where(self, **conditions):
for column, value in conditions.items():
self.filters.append((column, '=', value))
return self
def where_gt(self, column, value):
self.filters.append((column, '>', value))
return self
def where_lt(self, column, value):
self.filters.append((column, '<', value))
return self
def order_by_asc(self, column):
self.order_by = (column, 'ASC')
return self
def order_by_desc(self, column):
self.order_by = (column, 'DESC')
return self
def limit(self, n):
self.limit_value = n
return self
def build(self):
"""Build SQL query from DSL."""
# Build SELECT clause
if self.projections:
select_clause = ", ".join(self.projections)
else:
select_clause = "*"
query = f"SELECT {select_clause} FROM {self.table_name}"
# Build WHERE clause
if self.filters:
where_parts = []
for column, op, value in self.filters:
if isinstance(value, str):
value_str = f"'{value}'"
else:
value_str = str(value)
where_parts.append(f"{column} {op} {value_str}")
query += " WHERE " + " AND ".join(where_parts)
# Build ORDER BY clause
if self.order_by:
column, direction = self.order_by
query += f" ORDER BY {column} {direction}"
# Build LIMIT clause
if self.limit_value:
query += f" LIMIT {self.limit_value}"
return query
def execute(self, connection):
"""Execute query against a database connection."""
query = self.build()
# In reality, use connection.execute(query)
print(f"Executing: {query}")
return f"Results from {query}"
# Example usage
query = (QueryDSL("users")
.select("id", "name", "email")
.where_gt("age", 18)
.where(status="active")
.order_by_desc("created_at")
.limit(10))
print("Generated SQL:")
print(query.build())
print()
# More complex DSL example
class ConfigurationDSL:
"""A DSL for configuration files."""
def __init__(self):
self.config = {}
self.current_section = None
def __getattr__(self, name):
"""Handle section definitions."""
if name.startswith('_'):
raise AttributeError(name)
def section(**kwargs):
self.config[name] = kwargs
self.current_section = name
return self
return section
def set(self, **kwargs):
"""Set values in current section."""
if self.current_section:
self.config[self.current_section].update(kwargs)
return self
def build(self):
"""Build configuration dictionary."""
return self.config
# Use configuration DSL
config = (ConfigurationDSL()
.database(host="localhost", port=5432)
.set(database="mydb", user="admin")
.redis(host="redis.local", port=6379)
.set(db=0, timeout=5)
.api(host="0.0.0.0", port=8000))
print("Generated configuration:")
import json
print(json.dumps(config.build(), indent=2))
print()
# DSL for mathematical expressions
class MathExpression:
def __init__(self, value):
self.value = value
def __add__(self, other):
return MathExpression(('+', self.value, other.value))
def __sub__(self, other):
return MathExpression(('-', self.value, other.value))
def __mul__(self, other):
return MathExpression(('*', self.value, other.value))
def __truediv__(self, other):
return MathExpression(('/', self.value, other.value))
def evaluate(self, vars=None):
"""Evaluate expression with optional variables."""
vars = vars or {}
def eval_node(node):
if isinstance(node, (int, float)):
return node
elif isinstance(node, str):
return vars.get(node, 0)
elif isinstance(node, tuple):
op, left, right = node
left_val = eval_node(left)
right_val = eval_node(right)
if op == '+':
return left_val + right_val
elif op == '-':
return left_val - right_val
elif op == '*':
return left_val * right_val
elif op == '/':
return left_val / right_val
return eval_node(self.value)
def __repr__(self):
def to_str(node):
if isinstance(node, (int, float, str)):
return str(node)
elif isinstance(node, tuple):
op, left, right = node
left_str = to_str(left)
right_str = to_str(right)
return f"({left_str} {op} {right_str})"
return to_str(self.value)
# Create mathematical DSL using operator overloading
x = MathExpression('x')
y = MathExpression('y')
z = MathExpression('z')
expression = (x + 2) * (y - 3) / z
print(f"Mathematical expression: {expression}")
print(f"Evaluation with x=5, y=10, z=2: {expression.evaluate({'x': 5, 'y': 10, 'z': 2})}")
These eight techniques give you a path from a simple idea for a language to a working tool. You can start with a small calculator and gradually add features. The choice of which methods to use depends on what you want your language to do and how fast it needs to be. Python provides a gentle and powerful environment to explore all of these concepts.
📘 Checkout my latest ebook for free on my channel!
Be sure to like, share, comment, and subscribe to the channel!
101 Books
101 Books is an AI-driven publishing company co-founded by author Aarav Joshi. By leveraging advanced AI technology, we keep our publishing costs incredibly low—some books are priced as low as $4—making quality knowledge accessible to everyone.
Check out our book Golang Clean Code available on Amazon.
Stay tuned for updates and exciting news. When shopping for books, search for Aarav Joshi to find more of our titles. Use the provided link to enjoy special discounts!
Our Creations
Be sure to check out our creations:
Investor Central | Investor Central Spanish | Investor Central German | Smart Living | Epochs & Echoes | Puzzling Mysteries | Hindutva | Elite Dev | Java Elite Dev | Golang Elite Dev | Python Elite Dev | JS Elite Dev | JS Schools
We are on Medium
Tech Koala Insights | Epochs & Echoes World | Investor Central Medium | Puzzling Mysteries Medium | Science & Epochs Medium | Modern Hindutva
Top comments (0)