How to make a language (using Python)

After completing this tutorial, you should be able to make a file that looks like this:

set age = 18 + 5;
output "You are " + age;

endproc 0;

This should do the following:

  • Store an age variable with the value 23 (18 + 5)
  • Output "You are 23"
  • End the process with exit code 0
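
Since everything here transpiles straight to Python, it helps to see where we're headed: once the lexer and parser below are in place, the generated main.py should come out roughly like this (the exact spacing depends on how the parser glues tokens back together):

age = 18+5
print(str("You are ")+str(age))
raise SystemExit(0)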

So, to start off, we should first make a main file. This shouldn't take too long:

# import the parser and lexer
import lexer as l
import parser as p

with open('main.lopa', 'r') as f:
    #################
    #     LEXER     #
    #################
    contents = [i for j in f.read().split() for i in (j, ' ')][:-1] # split on whitespace, then re-insert single spaces so the lexer can rebuild strings that contain spaces
    lexer = l.Lexer(contents)
    tokens = lexer.tokenize() # tokenize the contents

    ################
    #    PARSER    #
    ################
    parser = p.Parser(tokens)
    parser.parse() # parse the tokenized contents from the lexer
    parser.generateFile('main.py') # generate a Python file once parsing completes; this is what the user will run
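
That one-liner that builds contents deserves a quick note: it splits the file on whitespace, then puts a single space back between every pair of words so the lexer can later rebuild strings that contain spaces. Here is a quick throwaway check (just the example program pasted in as a literal) showing what the lexer actually receives:

text = 'set age = 18 + 5;\noutput "You are " + age;'
contents = [i for j in text.split() for i in (j, ' ')][:-1]
print(contents)
# ['set', ' ', 'age', ' ', '=', ' ', '18', ' ', '+', ' ', '5;', ' ', 'output', ' ', '"You', ' ', 'are', ' ', '"', ' ', '+', ' ', 'age;']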

Once you have completed that, it is time to move on to the lexer!

import re # import regex

class Lexer:
    def __init__(self, source):
        self.source = source

    # tokenize function
    def tokenize(self):
        # initialize variables
        tokens = [] # the tokens array, this is returned after tokenization
        index = 0 # the index the lexer is on of the inputs

        while index < len(self.source):
            word = self.source[index]

            # this lexer already includes a few things from my own language, but I'll describe what it does anyway and how to extend it
            # to add a new keyword, just check whether the current word equals it and, if so, append a two-element list (token name, literal value) to the tokens list

            if word == 'set':
                tokens.append(['VARIABLE_DECLARATOR', word])

            elif re.match('[a-zA-Z]', word):
                if word[-1] == ';':
                    tokens.append(['IDENTIFIER', word[:-1]])
                else:
                    tokens.append(['IDENTIFIER', word])

            elif word[0] == '"':
                buffer = []

                # keep consuming words until we reach the one that closes the string,
                # either with a bare closing quote or with a closing quote plus semicolon
                while word[-1] != '"' and not word.endswith('";'):
                    buffer.append(word)

                    index += 1
                    word = self.source[index]

                # strip a trailing semicolon (the END check below still sees it on `word`)
                buffer.append(word[:-1] if word.endswith('";') else word)

                tokens.append(['STRING', ''.join(buffer)])

            elif re.match('-?[0-9]', word):
                if word[-1] == ';':
                    tokens.append(['NUMBER', word[:-1]])
                else:
                    tokens.append(['NUMBER', word])
            elif word in '=/*-+{}()':
                tokens.append(['OPERATOR', word])

            if word[-1] == ';':
                tokens.append(['END', ';'])

            # increment the index - this should not be in the if statements
            index += 1

        # finally, return the tokens, on to parsing!
        return tokens
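
If you'd like to sanity-check the lexer before writing the parser, a small throwaway script like this works (it rebuilds contents by hand the same way the main file does):

import lexer as l

text = 'set age = 18 + 5;\noutput "You are " + age;\nendproc 0;'
contents = [i for j in text.split() for i in (j, ' ')][:-1]

for token in l.Lexer(contents).tokenize():
    print(token)

# ['VARIABLE_DECLARATOR', 'set']
# ['IDENTIFIER', 'age']
# ['OPERATOR', '=']
# ['NUMBER', '18']
# ['OPERATOR', '+']
# ['NUMBER', '5']
# ['END', ';']
# ['IDENTIFIER', 'output']
# ['STRING', '"You are "']
# ['OPERATOR', '+']
# ['IDENTIFIER', 'age']
# ['END', ';']
# ['IDENTIFIER', 'endproc']
# ['NUMBER', '0']
# ['END', ';']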

After all of that, it's time to parse!

class Parser:
    def __init__(self, tokens):
        self.tokens = tokens
        self.index = 0
        self.transpiled = '' # everything that gets transpiled is appended to this string (you can seed it with a watermark or header, if you'd like)

    # this function will loop over the tokenized contents and determine what to parse
    def parse(self):
        while self.index < len(self.tokens):
            # stores token types
            t_type  = self.tokens[self.index][0]

            # stores value of token
            t_value = self.tokens[self.index][1]

            if t_type == "VARIABLE_DECLARATOR" and t_value == 'set':
                self.parse_variable_declaration(self.tokens[self.index:len(self.tokens)])

            elif t_type == "IDENTIFIER" and t_value == 'output':
                self.parse_output_statement(self.tokens[self.index:len(self.tokens)])
            elif t_type == "IDENTIFIER" and t_value == 'endproc':
                self.parse_endproc(self.tokens[self.index:len(self.tokens)])

            self.index += 1

    # this function will generate the file
    def generateFile(self, output):
        with open(output, 'w') as f:
            f.write(self.transpiled)

    # the parser functions

    # variables
    def parse_variable_declaration(self, stream):
        check = 0

        # these three variables hold the variable's name, the assignment operator, and the value expression (which can contain +, -, *, /, ...)
        name     = ''
        operator = ''
        value    = ''

        for token in range(0, len(stream)):
            t_type = stream[check][0]
            t_value = stream[check][1]

            if t_type == 'END':
                break

            elif token == 1 and t_type == 'IDENTIFIER':
                name = t_value
            elif token == 1 and t_type != 'IDENTIFIER':
                print(f'ERR -> Failed to parse, invalid variable name \'{t_value}\'')
                quit(-1)

            elif token == 2 and t_type == 'OPERATOR':
                operator = t_value
            elif token == 2 and t_type != 'OPERATOR':
                print('ERR -> Failed to parse, assignment operator is missing or invalid on the declaration of a variable')
                quit(-1)

            elif token > 2 and t_type in ['STRING', 'NUMBER', 'IDENTIFIER', 'OPERATOR']:
                value += t_value
            elif token > 2 and t_type not in ['STRING', 'NUMBER', 'IDENTIFIER', 'OPERATOR']:
                print(f'ERR -> Failed to parse, invalid assignment value, {t_value}')
                quit(-1)

            check += 1

        # finally, push the transpiled code
        self.transpiled += f'{name} {operator} {value}\n'
        self.index += check

    # everything else is basically the same. I recommend separating these into different files to avoid bloating up your file and making it unreadable. happy coding!

    # output statement
    def parse_output_statement(self, stream):
        check = 0
        value = ''

        for token in range(0, len(stream)):
            t_type = stream[check][0]
            t_value = stream[check][1]

            if t_type == 'END':
                break

            elif token > 0 and t_type == 'OPERATOR':
                value += t_value
            elif token > 0 and t_type in ['STRING', 'NUMBER', 'IDENTIFIER']:
                # wrap each operand in str() so '+' can join strings and numbers,
                # otherwise print("You are " + age) would crash on the integer
                value += f'str({t_value})'
            elif token > 0:
                print(f'ERR -> Failed to parse, invalid output value, {t_value}')
                quit(-1)

            check += 1

        self.transpiled += f'print({value})\n'
        self.index += check

    # endproc
    def parse_endproc(self, stream):
        check = 0
        code = 0

        for token in range(0, len(stream)):
            t_type = stream[check][0]
            t_value = stream[check][1]

            if t_type == 'END':
                break

            elif token == 1 and t_type == 'NUMBER':
                if '.' in t_value:
                    print('ERR -> Failed to parse, cannot quit program on a decimal number')
                    quit(-1)

                code = t_value
            elif token == 1 and t_type != 'NUMBER':
                print(f'ERR -> Failed to parse, cannot parse value {t_value} on \'endproc\'')
                quit(-1)

            check += 1

        # raise SystemExit so the generated script exits cleanly with the given code
        # (quit()/exit() come from the site module and aren't guaranteed in every environment)
        self.transpiled += f'raise SystemExit({code})\n'
        self.index += check
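
To check the whole pipeline without touching any files, you can wire the two classes together by hand and print the transpiled string directly (same example program as before):

import lexer as l
import parser as p

source = 'set age = 18 + 5;\noutput "You are " + age;\nendproc 0;'
contents = [i for j in source.split() for i in (j, ' ')][:-1]

pr = p.Parser(l.Lexer(contents).tokenize())
pr.parse()
print(pr.transpiled)

# age = 18+5
# print(str("You are ")+str(age))
# raise SystemExit(0)

Run the real driver against main.lopa and then run the generated main.py, and you should see "You are 23" printed before the process exits with code 0, which is exactly what the example at the top promised.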

Top comments (1)

asciidude:

As a side note, this may not be the... best... way to make a language. I recommend organizing your files and such so you don't get lost in tons of code. Happy coding!