DEV Community

Sarun Tapee
Sarun Tapee

Posted on

Enhance lexical parser functionality

Now we have a lexical parser working, but we need the ability to undo the parsing by moving back in lex_buffer. Imagine that we want to check if the expression is a SQL select syntax or a mathematical expression. First, we need to check the input string with SQL select syntax. If we find out that the input string is not compiled with SQL select syntax, we need to undo all the previous parsing and then check the input string against the mathematical expression syntax. This algorithm is called backtracking.

Using Stack

One solution to implement a backtrack algorithm is to use a stack by pushing data into the stack while parsing the input string. When we cannot go further, we pop the data out of the stack and then process it against a new syntax. We will keep doing this until we find a matching syntax.

Let's begin by defining data to push into the stack

// parser_export.h
typedef struct lex_data {
    math_exp_type token_code;
    int len;
    char *text;
} lex_data;

// math_exp_enum.h
typedef enum math_exp_type {
    MATH_BRACKET_START,
    MATH_BRACKET_END,

    MATH_OPERATOR_START,
    MATH_LESS_THAN_EQ,
    MATH_LESS_THAN,
    MATH_GREATER_THAN,
    MATH_GREATER_THAN_EQ,
    MATH_EQ,
    MATH_NOT_EQ,
    MATH_AND,
    MATH_OR,
    MATH_MUL,
    MATH_PLUS,
    MATH_MINUS,
    MATH_DIV,
    MATH_SQRT,
    MATH_SQR,
    MATH_MAX,
    MATH_MIN,
    MATH_SIN,
    MATH_COS,
    MATH_POW,
    MATH_OPERATOR_END,

    MATH_COMMA,

    MATH_OPERAND_START,
    MATH_INTEGER_VALUE,
    MATH_DOUBLE_VALUE,
    MATH_IDENTIFIER,
    MATH_IDENTIFIER_IDENTIFIER,
    MATH_BOOL,
    MATH_OPERAND_END,

    MATH_WILDCARD_VALUE,
    MATH_STRING_VALUE,
    MATH_SPACE,

    MATH_INVALID,

    PARSER_EOL,
    PARSER_QUIT,

    MATH_EXP_TYPE_MAX,
} math_exp_type;

Enter fullscreen mode Exit fullscreen mode

For stack implementation

// parser_export.h
...
#define STACK_CAP 512

typedef struct stack {
    int top;
    lex_data data[STACK_CAP];
} stack;

...

// parser.l
%{
#include "parser_export.h"
char lex_buffer[MAX_STR_SIZE];
char *curr_ptr = lex_buffer;
stack lex_stack = {.top = -1};


static void stack_push(lex_data v) {
    assert(lex_stack.top+1 < STACK_CAP);
    lex_stack.data[++lex_stack.top] = v;
}

static lex_data stack_pop() {
    assert(lex_stack.top > -1);
    return lex_stack.data[lex_stack.top--];
}

static int stack_size() {
    return lex_stack.top + 1;
}

static lex_data stack_peek() {
    assert(lex_stack.top > -1);
    return lex_stack.data[lex_stack.top];
}

void stack_reset() {
    lex_data d;

    while (stack_size() > 0) {
        d = stack_pop();
        free(d.text);
    }
    yy_scan_string(lex_buffer);
    curr_ptr = lex_buffer;
}

lex_data cyylex() {
    int token_code = yylex();
    lex_data d = {.token_code = (math_exp_type) token_code, .len = yyleng};
    d.text = (char*) malloc(yyleng+1);
    strncpy(d.text, yytext, yyleng+1);
    stack_push(d);
    curr_ptr += yyleng;
    return d;
}

void yyrewind(int n) {
    int i;

    while (n > 0 && lex_stack.top != -1) {
        i = lex_stack.top;
        free(lex_stack.data[i].text);
        lex_stack.data[i].text = NULL;
        if (lex_stack.data[i].token_code != MATH_SPACE) {
            n--;
        }
        curr_ptr -= lex_stack.data[lex_stack.top--].len;
    }
    yy_scan_string(curr_ptr);
}

void restore(int _chkp) {
    assert(_chkp <= lex_stack.top);
    lex_data *p;
    while (lex_stack.top > _chkp) {
        p = &lex_stack.data[lex_stack.top--];
        free(p->text);
        p->text = NULL;
        curr_ptr -= p->len;
    }
    yy_scan_string(curr_ptr);
}

static void process_white_space(int n) {
    lex_data d = {.token_code = MATH_SPACE, .len = n};
    stack_push(d);
    curr_ptr += n;
}

%}

Enter fullscreen mode Exit fullscreen mode

Since we need to undo the parsing, we need to keep track of the head of the string so that when we undo, we can reset the lex by calling yy_scan_string and passing the beginning of the string to the function. That's why we need char *curr_ptr to keep track of the current position of the string.

The method of the stack is straightforward.

cyylex function is a wrapper of yylex.

yyrewind function is used to undo the parsing by providing the number of times to undo the cyylex call.

restore function is used to undo the parsing to the desired point (checkpoint).

process_white_space is used to push white space to the stack. Since we know what white space looks like, we do not need to store a space string.

Then we need to define a regular expression in parser.l file. Note that for white space, we want to ignore it, but we need to push it to the stack to use in the undo process, so that we know the length of the string we need to move backward.

...
%%
"(" {
    return MATH_BRACKET_START;
}

")" {
    return MATH_BRACKET_END;
}

...

[ ] {
    /* Ignore */
    process_white_space(1);
}

...

0|-?[1-9][0-9]* {
    return MATH_INTEGER_VALUE;
}

-?[0-9]*\.[0-9]+ {
    return MATH_DOUBLE_VALUE;
}

[a-zA-Z0-9_]+ {
    return MATH_IDENTIFIER;
}

...
%%

Enter fullscreen mode Exit fullscreen mode

Usage of enhanced lexical parser

Let's test our lexical parser implementation by checking if the input string is in the format of a circular equation (pow(x,2) + pow(y,2) = c).

// parser_export.h
...
typedef enum parse_status {
    PARSE_ERROR,
    PARSE_SUCCESS,
} parse_status;
...

// equation.c
static parse_status circle_eql() {
    lex_data d;

    d = cyylex();
    if (d.token_code != MATH_POW) {
        yyrewind(1);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_BRACKET_START) {
        yyrewind(2);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_IDENTIFIER) {
        yyrewind(3);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_COMMA) {
        yyrewind(4);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_INTEGER_VALUE || strcmp("2", d.text) != 0) {
        yyrewind(5);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_BRACKET_END) {
        yyrewind(6);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_PLUS) {
        yyrewind(7);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_POW) {
        yyrewind(8);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_BRACKET_START) {
        yyrewind(9);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_IDENTIFIER) {
        yyrewind(10);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_COMMA) {
        yyrewind(11);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_INTEGER_VALUE || strcmp("2", d.text) != 0) {
        yyrewind(12);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_BRACKET_END)  {
        yyrewind(13);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_EQ) {
        yyrewind(14);
        return PARSE_ERROR;
    }
    d = cyylex();
    if (d.token_code != MATH_INTEGER_VALUE) {
        yyrewind(15);
        return PARSE_ERROR;
    }
    return PARSE_SUCCESS;
}
Enter fullscreen mode Exit fullscreen mode

As you can see, the usage of our lex parser is not convenient since we need to keep track of how many times we call cyylex and when the parsing fails, we need to undo by calling yyrewind before returning PARSE_ERROR. Let's make use of a macro.

// parser_export.h
...
#define PARSE_INIT \
    lex_data d; \
    int _chkp = lex_stack.top;

#define RETURN_PARSE_ERROR { \
    restore(_chkp); \
    return PARSE_ERROR; \
}

#define RESTORE_CHECK_POINT { \
    restore(_chkp); \
}

Enter fullscreen mode Exit fullscreen mode

Now the usage can be simplified as follows

// equation.c
static parse_status circle_eql() {
    PARSE_INIT
    d = cyylex();
    if (d.token_code != MATH_POW)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_BRACKET_START)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_IDENTIFIER)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_COMMA)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_INTEGER_VALUE || strcmp("2", d.text) != 0)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_BRACKET_END)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_PLUS)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_POW)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_BRACKET_START)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_IDENTIFIER)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_COMMA)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_INTEGER_VALUE || strcmp("2", d.text) != 0)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_BRACKET_END)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_EQ)
        RETURN_PARSE_ERROR
    d = cyylex();
    if (d.token_code != MATH_INTEGER_VALUE)
        RETURN_PARSE_ERROR
    return PARSE_SUCCESS;
}

Enter fullscreen mode Exit fullscreen mode

Then you can test the implementation by writing a main function. ellips_eql is left for your exercise.

// equation.c
void parse_equation() {
    if (circle_eql() == PARSE_SUCCESS)
        printf("equation of circle\n");
    if (ellipse_eql() == PARSE_SUCCESS)
        printf("equation fo ellipse\n");
}

// parser.l
extern void parse_equation();

int main(void) {
    int token_code;

    while (1) {
        printf("input -> ");
        fgets(lex_buffer, sizeof(lex_buffer), stdin);
        stack_reset();
        parse_equation();
    }
    return 0;
}
Enter fullscreen mode Exit fullscreen mode

The complete code can be found on my GitHub.

Top comments (0)