BlitzMax Lexer Module

-=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- (c) WidthPadding Industries 1987 0\|419\|0 -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=- -=+=-
SoCoder -> Snippet Home -> Misc

Cower	Created : 13 March 2010 Language : Blitz Max BlitzMax Lexer Module Module for tokenizing BlitzMax source code
I originally wrote this in Ruby, but there is a rather annoying issue with writing any code in Ruby: using it anywhere else is an immense pain. If you've ever had to work with the C API to embed Ruby in something, you're probably aware of this. You may also be insane if you're going "I did it and I thoroughly enjoyed the experience." I can't help those people, they're clearly lost causes. Anyhow, so I ported the code to C, and overall I think it's an improvement because it's a little less messy. There aren't a lot of comments — there are actually four in total on the C side of things, and only a handful in the BlitzMax code just because BlitzMax sucks at actually working with C code and sometimes I need to make a note about what type something really is. The C API is private in this, mostly because I think most BlitzMax users would find it terrifying even if it's relatively simple. The BlitzMax API is fairly simple, I don't think I need to explain what each method does or what the fields of something are. If it has an _ before it, you don't touch it; fairly simple. If you need to parse BlitzMax code, this is probably a decent starting point so you don't have to concern yourself with the annoying string parsing crap you'd otherwise have to do and just focus on structure and chunks of code. If you want to tweak the lexer to match certain other things, it's probably fairly easy to do and could be a decent starting point for something else (most of what you'd change would likely be covered by the token singles/pairs arrays and changing those to match your own preferences - case sensitivity options are in there, so you could work that in as well). Anyhow, the C side of things... 
lexer.h #ifndef LEXER_H_BICMCZIT #define LEXER_H_BICMCZIT #ifdef __cplusplus extern "C" { #endif typedef enum { TOK_INVALID=0, TOK_ID, TOK_END_KW, TOK_FUNCTION_KW, TOK_ENDFUNCTION_KW, TOK_METHOD_KW, TOK_ENDMETHOD_KW, TOK_TYPE_KW, TOK_EXTENDS_KW, TOK_ABSTRACT_KW, TOK_FINAL_KW, TOK_NODEBUG_KW, TOK_ENDTYPE_KW, TOK_EXTERN_KW, TOK_ENDEXTERN_KW, TOK_REM_KW, TOK_ENDREM_KW, TOK_FLOAT_KW, TOK_DOUBLE_KW, TOK_BYTE_KW, TOK_SHORT_KW, TOK_INT_KW, TOK_STRING_KW, TOK_OBJECT_KW, TOK_LOCAL_KW, TOK_GLOBAL_KW, TOK_CONST_KW, TOK_VARPTR_KW, TOK_PTR_KW, TOK_VAR_KW, TOK_NULL_KW, TOK_STRICT_KW, TOK_SUPERSTRICT_KW, TOK_FRAMEWORK_KW, TOK_MODULE_KW, TOK_MODULEINFO_KW, TOK_IMPORT_KW, TOK_INCLUDE_KW, TOK_PRIVATE_KW, TOK_PUBLIC_KW, TOK_OR_KW, TOK_AND_KW, TOK_SHR_KW, TOK_SHL_KW, TOK_SAR_KW, TOK_MOD_KW, TOK_NOT_KW, TOK_WHILE_KW, TOK_WEND_KW, TOK_ENDWHILE_KW, TOK_FOR_KW, TOK_NEXT_KW, TOK_UNTIL_KW, TOK_TO_KW, TOK_EACHIN_KW, TOK_REPEAT_KW, TOK_FOREVER_KW, TOK_IF_KW, TOK_ENDIF_KW, TOK_ELSE_KW, TOK_ELSEIF_KW, TOK_THEN_KW, TOK_SELECT_KW, TOK_CASE_KW, TOK_DEFAULT_KW, TOK_ENDSELECT_KW, TOK_SELF_KW, TOK_SUPER_KW, TOK_PI_KW, TOK_NEW_KW, // extensions TOK_PROTOCOL_KW, TOK_ENDPROTOCOL_KW, TOK_AUTO_KW, TOK_IMPLEMENTS_KW, TOK_COLON, TOK_QUESTION, TOK_BANG, TOK_HASH, TOK_DOT, TOK_DOUBLEDOT, TOK_TRIPLEDOT, TOK_AT, TOK_DOUBLEAT, TOK_DOLLAR, TOK_PERCENT, TOK_SINGLEQUOTE, TOK_OPENPAREN, TOK_CLOSEPAREN, TOK_OPENBRACKET, TOK_CLOSEBRACKET, TOK_OPENCURL, TOK_CLOSECURL, TOK_GREATERTHAN, TOK_LESSTHAN, TOK_EQUALS, TOK_MINUS, TOK_PLUS, TOK_ASTERISK, TOK_CARET, TOK_TILDE, TOK_GRAVE, TOK_BACKSLASH, TOK_SLASH, TOK_COMMA, TOK_SEMICOLON, TOK_PIPE, TOK_AMPERSAND, TOK_NEWLINE, TOK_ASSIGN_ADD, TOK_ASSIGN_SUBTRACT, TOK_ASSIGN_DIVIDE, TOK_ASSIGN_MULTIPLY, TOK_ASSIGN_POWER, TOK_ASSIGN_SHL, TOK_ASSIGN_SHR, TOK_ASSIGN_SAR, TOK_ASSIGN_MOD, TOK_ASSIGN_XOR, TOK_ASSIGN_AND, TOK_ASSIGN_OR, TOK_ASSIGN_AUTO, TOK_DOUBLEMINUS, TOK_DOUBLEPLUS, TOK_NUMBER_LIT, TOK_HEX_LIT, TOK_BIN_LIT, TOK_STRING_LIT, TOK_LINE_COMMENT, TOK_BLOCK_COMMENT, TOK_EOF, 
TOK_LAST=TOK_EOF, TOK_COUNT } token_kind_t; typedef struct s_token { token_kind_t kind; const char from, to; int line, column; } token_t; typedef struct s_lexer lexer_t; /* allocates a new lexer for the range specified by source_begin and source_end and returns it / lexer_t lexer_new(const char source_begin, const char source_end); /* destroys the contents (tokens and such) of the lexer and releases its memory / void lexer_destroy(lexer_t lexer); /* runs the lexer - you should only do this once, doing it twice will result in the entire list of tokens being duplicated for no reason / int lexer_run(lexer_t lexer); /* returns the error string or NULL if there is no error / const char lexer_get_error(lexer_t lexer); / returns the number of tokens identified by the lexer / int lexer_get_num_tokens(lexer_t lexer); /* returns the kind of the token at the index and copies that token to the provided token if it isn't null / token_kind_t lexer_get_token(lexer_t lexer, int index, token_t token); / returns a copy of all tokens identified by the lexer / token_t lexer_copy_tokens(lexer_t lexer, int num_tokens); /* returns a copy of the string contents of the token, must be freed via free(str) / char token_to_string(const token_t* tok); #ifdef __cplusplus } #endif #endif /* end of include guard: LEXER_H_BICMCZIT / --v lexer.c* #include <stdio.h> #include <stdlib.h> #include <stddef.h> #include <stdbool.h> #include <ctype.h> #include <string.h> #include "lexer.h" const int LEXER_INITIAL_CAPACITY = 500; typedef struct s_token_mark { const char place; int line, column; int token; } token_mark_t; struct s_lexer { int capacity; token_t tokens; const char source_begin, source_end; token_mark_t current; char error; }; static void lexer_tokens_fit(lexer_t lexer, size_t n); static token_t lexer_new_token(lexer_t lexer); static token_t lexer_merge_tokens(lexer_t lexer, int from, int to); static token_mark_t lexer_mark(lexer_t lexer); static void lexer_reset(lexer_t lexer, token_mark_t 
mark); static char lexer_current(lexer_t lexer); static bool lexer_has_next(lexer_t lexer); static char lexer_next(lexer_t lexer); static char lexer_peek(lexer_t lexer); static void lexer_skip_whitespace(lexer_t lexer); static token_t lexer_read_base_number(lexer_t lexer); static token_kind_t token_kind_for_single(const char single, size_t len); static token_t lexer_read_number(lexer_t lexer); token_t lexer_read_word(lexer_t lexer); static token_t lexer_read_string(lexer_t lexer); static token_t lexer_read_line_comment(lexer_t lexer); static const char token_strings[] = { "INVALID", "Identifier", "End", "Function", "End Function", "Method", "End Method", "Type", "Extends", "Abstract", "Final", "No Debug", "End Type", "Extern", "End Extern", "Rem", "End Rem", "Float", "Double", "Byte", "Short", "Int", "String", "Object", "Local", "Global", "Const", "VarPtr", "Ptr", "Var", "Null", "Strict", "SuperStrict", "Framework", "Module", "ModuleInfo", "Import", "Include", "Private", "Public", "Or", "And", "Shr", "Shl", "Sar", "Mod", "Not", "While", "Wend", "End While", "For", "Next", "Until", "To", "EachIn", "Repeat", "Forever", "If", "End If", "Else", "Else If", "Then", "Select", "Case", "Default", "End Select", "Self", "Super", "Pi", "New", "Protocol", "End Protocol", "Auto", "Implements", ":", "?", "!", "#", ".", "..", "...", "@", "@@", "$", "%", "'", "(", ")", "[", "]", "{", "}", ">", "<", "=", "-", "+", "", "^", "~", "`", "\\", "/", ",", ";", "\|", "&", "\\n", ":+", ":-", ":/", ":", ":^", ":Shl", ":Shr", ":Sar", ":Mod", ":~", ":&", ":\|", ":=", "--", "++", "Number Literal", "Hex Literal", "Bin Literal", "String Literal", "Line Comment", "Block Comment", "<EOF>", }; typedef struct s_token_single { token_kind_t kind; const char matches; bool case_sensitive; } token_single_t; static token_single_t const token_singles[] = { { .kind = TOK_END_KW, .case_sensitive = false, .matches = "end" }, { .kind = TOK_FUNCTION_KW, .case_sensitive = false, .matches = "function" }, { .kind = 
TOK_ENDFUNCTION_KW, .case_sensitive = false, .matches = "endfunction" }, { .kind = TOK_METHOD_KW, .case_sensitive = false, .matches = "method" }, { .kind = TOK_ENDMETHOD_KW, .case_sensitive = false, .matches = "endmethod" }, { .kind = TOK_TYPE_KW, .case_sensitive = false, .matches = "type" }, { .kind = TOK_EXTENDS_KW, .case_sensitive = false, .matches = "extends" }, { .kind = TOK_ABSTRACT_KW, .case_sensitive = false, .matches = "abstract" }, { .kind = TOK_FINAL_KW, .case_sensitive = false, .matches = "final" }, { .kind = TOK_NODEBUG_KW, .case_sensitive = false, .matches = "nodebug" }, { .kind = TOK_ENDTYPE_KW, .case_sensitive = false, .matches = "endtype" }, { .kind = TOK_EXTERN_KW, .case_sensitive = false, .matches = "extern" }, { .kind = TOK_ENDEXTERN_KW, .case_sensitive = false, .matches = "endextern" }, { .kind = TOK_REM_KW, .case_sensitive = false, .matches = "rem" }, { .kind = TOK_ENDREM_KW, .case_sensitive = false, .matches = "endrem" }, { .kind = TOK_FLOAT_KW, .case_sensitive = false, .matches = "float" }, { .kind = TOK_DOUBLE_KW, .case_sensitive = false, .matches = "double" }, { .kind = TOK_BYTE_KW, .case_sensitive = false, .matches = "byte" }, { .kind = TOK_SHORT_KW, .case_sensitive = false, .matches = "short" }, { .kind = TOK_INT_KW, .case_sensitive = false, .matches = "int" }, { .kind = TOK_STRING_KW, .case_sensitive = false, .matches = "string" }, { .kind = TOK_OBJECT_KW, .case_sensitive = false, .matches = "object" }, { .kind = TOK_LOCAL_KW, .case_sensitive = false, .matches = "local" }, { .kind = TOK_GLOBAL_KW, .case_sensitive = false, .matches = "global" }, { .kind = TOK_CONST_KW, .case_sensitive = false, .matches = "const" }, { .kind = TOK_VARPTR_KW, .case_sensitive = false, .matches = "varptr" }, { .kind = TOK_PTR_KW, .case_sensitive = false, .matches = "ptr" }, { .kind = TOK_VAR_KW, .case_sensitive = false, .matches = "var" }, { .kind = TOK_NULL_KW, .case_sensitive = false, .matches = "null" }, { .kind = TOK_STRICT_KW, .case_sensitive = false, 
.matches = "strict" }, { .kind = TOK_SUPERSTRICT_KW, .case_sensitive = false, .matches = "superstrict" }, { .kind = TOK_FRAMEWORK_KW, .case_sensitive = false, .matches = "framework" }, { .kind = TOK_MODULE_KW, .case_sensitive = false, .matches = "module" }, { .kind = TOK_MODULEINFO_KW, .case_sensitive = false, .matches = "moduleinfo" }, { .kind = TOK_IMPORT_KW, .case_sensitive = false, .matches = "import" }, { .kind = TOK_INCLUDE_KW, .case_sensitive = false, .matches = "include" }, { .kind = TOK_PRIVATE_KW, .case_sensitive = false, .matches = "private" }, { .kind = TOK_PUBLIC_KW, .case_sensitive = false, .matches = "public" }, { .kind = TOK_OR_KW, .case_sensitive = false, .matches = "or" }, { .kind = TOK_AND_KW, .case_sensitive = false, .matches = "and" }, { .kind = TOK_SHR_KW, .case_sensitive = false, .matches = "shr" }, { .kind = TOK_SHL_KW, .case_sensitive = false, .matches = "shl" }, { .kind = TOK_SAR_KW, .case_sensitive = false, .matches = "sar" }, { .kind = TOK_MOD_KW, .case_sensitive = false, .matches = "mod" }, { .kind = TOK_NOT_KW, .case_sensitive = false, .matches = "not" }, { .kind = TOK_WHILE_KW, .case_sensitive = false, .matches = "while" }, { .kind = TOK_WEND_KW, .case_sensitive = false, .matches = "wend" }, { .kind = TOK_ENDWHILE_KW, .case_sensitive = false, .matches = "endwhile" }, { .kind = TOK_FOR_KW, .case_sensitive = false, .matches = "for" }, { .kind = TOK_NEXT_KW, .case_sensitive = false, .matches = "next" }, { .kind = TOK_UNTIL_KW, .case_sensitive = false, .matches = "until" }, { .kind = TOK_TO_KW, .case_sensitive = false, .matches = "to" }, { .kind = TOK_EACHIN_KW, .case_sensitive = false, .matches = "eachin" }, { .kind = TOK_REPEAT_KW, .case_sensitive = false, .matches = "repeat" }, { .kind = TOK_FOREVER_KW, .case_sensitive = false, .matches = "forever" }, { .kind = TOK_IF_KW, .case_sensitive = false, .matches = "if" }, { .kind = TOK_ENDIF_KW, .case_sensitive = false, .matches = "endif" }, { .kind = TOK_ELSE_KW, .case_sensitive = false, 
.matches = "else" }, { .kind = TOK_ELSEIF_KW, .case_sensitive = false, .matches = "elseif" }, { .kind = TOK_THEN_KW, .case_sensitive = false, .matches = "then" }, { .kind = TOK_SELECT_KW, .case_sensitive = false, .matches = "select" }, { .kind = TOK_CASE_KW, .case_sensitive = false, .matches = "case" }, { .kind = TOK_DEFAULT_KW, .case_sensitive = false, .matches = "default" }, { .kind = TOK_ENDSELECT_KW, .case_sensitive = false, .matches = "endselect" }, { .kind = TOK_SELF_KW, .case_sensitive = false, .matches = "self" }, { .kind = TOK_SUPER_KW, .case_sensitive = false, .matches = "super" }, { .kind = TOK_PI_KW, .case_sensitive = false, .matches = "pi" }, { .kind = TOK_NEW_KW, .case_sensitive = false, .matches = "new" }, { .kind = TOK_PROTOCOL_KW, .case_sensitive = false, .matches = "protocol" }, { .kind = TOK_ENDPROTOCOL_KW, .case_sensitive = false, .matches = "endprotocol" }, { .kind = TOK_AUTO_KW, .case_sensitive = false, .matches = "auto" }, { .kind = TOK_IMPLEMENTS_KW, .case_sensitive = false, .matches = "implements" }, { .kind = TOK_COLON, .case_sensitive = false, .matches = ":" }, { .kind = TOK_QUESTION, .case_sensitive = false, .matches = "?" }, { .kind = TOK_BANG, .case_sensitive = false, .matches = "!" 
}, { .kind = TOK_HASH, .case_sensitive = false, .matches = "#" }, { .kind = TOK_DOLLAR, .case_sensitive = false, .matches = "$" }, { .kind = TOK_PERCENT, .case_sensitive = false, .matches = "%" }, { .kind = TOK_OPENPAREN, .case_sensitive = false, .matches = "(" }, { .kind = TOK_CLOSEPAREN, .case_sensitive = false, .matches = ")" }, { .kind = TOK_OPENBRACKET, .case_sensitive = false, .matches = "[" }, { .kind = TOK_CLOSEBRACKET, .case_sensitive = false, .matches = "]" }, { .kind = TOK_OPENCURL, .case_sensitive = false, .matches = "{" }, { .kind = TOK_CLOSECURL, .case_sensitive = false, .matches = "}" }, { .kind = TOK_GREATERTHAN, .case_sensitive = false, .matches = ">" }, { .kind = TOK_LESSTHAN, .case_sensitive = false, .matches = "<" }, { .kind = TOK_EQUALS, .case_sensitive = false, .matches = "=" }, { .kind = TOK_MINUS, .case_sensitive = false, .matches = "-" }, { .kind = TOK_PLUS, .case_sensitive = false, .matches = "+" }, { .kind = TOK_ASTERISK, .case_sensitive = false, .matches = "" }, { .kind = TOK_CARET, .case_sensitive = false, .matches = "^" }, { .kind = TOK_TILDE, .case_sensitive = false, .matches = "~" }, { .kind = TOK_GRAVE, .case_sensitive = false, .matches = "`" }, { .kind = TOK_BACKSLASH, .case_sensitive = false, .matches = "\\" }, { .kind = TOK_SLASH, .case_sensitive = false, .matches = "/" }, { .kind = TOK_COMMA, .case_sensitive = false, .matches = "," }, { .kind = TOK_SEMICOLON, .case_sensitive = false, .matches = ";" }, { .kind = TOK_PIPE, .case_sensitive = false, .matches = "\|" }, { .kind = TOK_AMPERSAND, .case_sensitive = false, .matches = "&" }, { .kind = TOK_NEWLINE, .case_sensitive = false, .matches = "\n" }, { .kind = TOK_INVALID, .case_sensitive = false, .matches = NULL }, }; typedef struct s_token_pair { token_kind_t left, right; token_kind_t kind; size_t range; } token_pair_t; static token_pair_t const token_pairs[] = { { .left = TOK_END_KW, .right = TOK_REM_KW, .kind = TOK_ENDREM_KW, .range = 1 }, { .left = TOK_END_KW, .right = 
TOK_METHOD_KW, .kind = TOK_ENDMETHOD_KW, .range = 1 }, { .left = TOK_END_KW, .right = TOK_FUNCTION_KW, .kind = TOK_ENDFUNCTION_KW, .range = 1 }, { .left = TOK_END_KW, .right = TOK_TYPE_KW, .kind = TOK_ENDTYPE_KW, .range = 1 }, { .left = TOK_END_KW, .right = TOK_EXTERN_KW, .kind = TOK_ENDEXTERN_KW, .range = 1 }, { .left = TOK_END_KW, .right = TOK_IF_KW, .kind = TOK_ENDIF_KW, .range = 1 }, { .left = TOK_END_KW, .right = TOK_SELECT_KW, .kind = TOK_ENDSELECT_KW, .range = 1 }, { .left = TOK_END_KW, .right = TOK_WHILE_KW, .kind = TOK_ENDWHILE_KW, .range = 1 }, { .left = TOK_END_KW, .right = TOK_PROTOCOL_KW, .kind = TOK_ENDPROTOCOL_KW, .range = 1 }, { .left = TOK_COLON, .right = TOK_PLUS, .kind = TOK_ASSIGN_ADD, .range = 0 }, { .left = TOK_COLON, .right = TOK_MINUS, .kind = TOK_ASSIGN_SUBTRACT, .range = 0 }, { .left = TOK_COLON, .right = TOK_SLASH, .kind = TOK_ASSIGN_DIVIDE, .range = 0 }, { .left = TOK_COLON, .right = TOK_ASTERISK, .kind = TOK_ASSIGN_MULTIPLY, .range = 0 }, { .left = TOK_COLON, .right = TOK_CARET, .kind = TOK_ASSIGN_POWER, .range = 0 }, { .left = TOK_COLON, .right = TOK_SHL_KW, .kind = TOK_ASSIGN_SHL, .range = 0 }, { .left = TOK_COLON, .right = TOK_SHR_KW, .kind = TOK_ASSIGN_SHR, .range = 0 }, { .left = TOK_COLON, .right = TOK_SAR_KW, .kind = TOK_ASSIGN_SAR, .range = 0 }, { .left = TOK_COLON, .right = TOK_MOD_KW, .kind = TOK_ASSIGN_MOD, .range = 0 }, { .left = TOK_COLON, .right = TOK_TILDE, .kind = TOK_ASSIGN_XOR, .range = 0 }, { .left = TOK_COLON, .right = TOK_AMPERSAND, .kind = TOK_ASSIGN_AND, .range = 0 }, { .left = TOK_COLON, .right = TOK_PIPE, .kind = TOK_ASSIGN_OR, .range = 0 }, { .left = TOK_COLON, .right = TOK_EQUALS, .kind = TOK_ASSIGN_AUTO, .range = 0 }, { .left = TOK_MINUS, .right = TOK_MINUS, .kind = TOK_DOUBLEMINUS, .range = 0 }, { .left = TOK_PLUS, .right = TOK_PLUS, .kind = TOK_DOUBLEPLUS, .range = 0 }, // { .left = TOK_MINUS, .right = TOK_NUMBER_LIT, .kind = TOK_NUMBER_LIT, .range = 0 }, { .left = TOK_INVALID, .right = TOK_INVALID, .kind = 
TOK_INVALID, .range = -1 }, }; char token_to_string(const token_t tok) { const char orig; char buf = NULL; size_t len; if (tok == NULL) { orig = token_strings[TOK_INVALID]; len = strlen(orig); } else if (tok->from != NULL && tok->to != NULL && tok->kind != TOK_EOF && tok->kind != TOK_INVALID && tok->kind != TOK_NEWLINE) { orig = tok->from; len = (size_t)(tok->to-tok->from); } else { orig = token_strings[tok->kind]; if (orig != NULL) { len = strlen(orig); } } if (orig != NULL) { buf = (char)calloc(len+1, sizeof(char)); strncpy(buf, orig, len); } return buf; } lexer_t lexer_new(const char source_begin, const char source_end) { lexer_t lexer = malloc(sizeof(lexer_t)); if (source_begin == NULL \|\| source_end == NULL \|\| source_begin > source_end) { return NULL; } lexer->capacity = 0; lexer->tokens = NULL; lexer->source_begin = source_begin; lexer->source_end = source_end; lexer->current.place = source_begin; lexer->current.line = 1; lexer->current.column = 1; lexer->current.token = 0; lexer->error = NULL; lexer_tokens_fit(lexer, LEXER_INITIAL_CAPACITY); return lexer; } void lexer_destroy(lexer_t lexer) { if (lexer == NULL) { return; } if (lexer->tokens != NULL) { free(lexer->tokens); lexer->tokens = NULL; } if (lexer->error != NULL) { free(lexer->error); lexer->error = NULL; } lexer->source_begin = NULL; lexer->source_end = NULL; lexer->current.place = NULL; free(lexer); } static void lexer_tokens_fit(lexer_t lexer, size_t n) { if (n < lexer->capacity) { return; } size_t sz = lexer->capacity2; if (sz < n) { sz = n; } lexer->tokens = realloc(lexer->tokens, szsizeof(token_t)); lexer->capacity = sz; } static token_t lexer_new_token(lexer_t lexer) { int index = lexer->current.token + 1; lexer_tokens_fit(lexer, index+1); token_t token = lexer->tokens+lexer->current.token; lexer->current.token = index; token->kind = TOK_INVALID; token->from = token->to = NULL; token->line = 0; token->column = 0; return token; } static token_t lexer_merge_tokens(lexer_t lexer, int from, int 
to) { lexer->tokens[from].to = lexer->tokens[to].to; int offset = from - to; int idx = to+1; for (; idx < lexer->current.token; ++idx) lexer->tokens[idx+offset] = lexer->tokens[idx]; lexer->current.token += offset; return NULL; } static token_mark_t lexer_mark(lexer_t lexer) { return lexer->current; } static void lexer_reset(lexer_t lexer, token_mark_t mark) { lexer->current = mark; } static char lexer_current(lexer_t lexer) { if (lexer->source_end < lexer->current.place) return 0; return (lexer->current.place); } static bool lexer_has_next(lexer_t lexer) { return (bool)((lexer->current.place) < lexer->source_end); } static char lexer_next(lexer_t lexer) { if (lexer_current(lexer) == '\n') { lexer->current.line += 1; lexer->current.column = 1; } else { ++lexer->current.column; } return lexer_has_next(lexer) ? (++lexer->current.place) : 0; } static char lexer_peek(lexer_t lexer) { return lexer_has_next(lexer) ? (lexer->current.place+1) : 0; } static void lexer_skip_whitespace(lexer_t lexer) { char cur; while ((cur = lexer_current(lexer)) != 0 && (cur == ' ' \|\| cur == '\t' \|\| cur == '\r')) { lexer_next(lexer); } } static token_t lexer_read_base_number(lexer_t lexer) { char cur = lexer_current(lexer); token_mark_t mark = lexer_mark(lexer); token_t token = { .kind = TOK_NUMBER_LIT, .line = mark.line, .column = mark.column, .from = mark.place, .to = NULL, }; if (cur == '%') { // bin while (lexer_has_next(lexer) && (cur = lexer_next(lexer)) == '0' \|\| cur == '1'); } else if (cur == '$') { // hex while (lexer_has_next(lexer) && isxdigit(lexer_next(lexer))); } else { asprintf(&lexer->error, "[%d:%d] Malformed number literal encountered, not a number\n", lexer->current.line, lexer->current.column); token.kind = TOK_INVALID; return token; } lexer_next(lexer); token.to = lexer->current.place; return token; } static token_kind_t token_kind_for_single(const char single, size_t len) { const token_single_t* iter = token_singles; while (iter->kind != TOK_INVALID) { if 
(strlen(iter->matches) == len && (iter->case_sensitive ? strncmp(iter->matches, single, len) : strncasecmp(iter->matches, single, len)) == 0) { break; } ++iter; } return iter->kind; } static token_t lexer_read_number(lexer_t lexer) { char cur = lexer_current(lexer); token_mark_t mark = lexer_mark(lexer); bool isDec = (cur == '.'); bool isExp = false; token_t token = { .kind = TOK_NUMBER_LIT, .line = mark.line, .column = mark.column, .from = mark.place, .to = NULL, }; while (lexer_has_next(lexer) && (cur = lexer_next(lexer)) != 0) { if (cur == '.') { if (isDec) { break; } isDec = true; continue; } if (isdigit(cur)) { continue; } if (tolower(cur) == 'e') { if (isExp) { asprintf(&lexer->error, "[%d:%d] Malformed number literal encountered, exponent already provided\n", lexer->current.line, lexer->current.column); token.kind = TOK_INVALID; return token; } isExp = true; cur = lexer_peek(lexer); if (cur == '-' \|\| cur == '+') { lexer_next(lexer); cur = lexer_peek(lexer); } if (!isdigit(cur)) { asprintf(&lexer->error, "[%d:%d] Malformed number literal encountered, exponent expected but not found (%c)\n", lexer->current.line, lexer->current.column, cur); token.kind = TOK_INVALID; return token; } continue; } break; } token.to = lexer->current.place; return token; } token_t lexer_read_word(lexer_t lexer) { token_mark_t mark = lexer_mark(lexer); token_t token = { .kind = TOK_ID, .line = mark.line, .column = mark.column, .from = mark.place, .to = NULL, }; while (lexer_has_next(lexer)) { char cur = lexer_peek(lexer); if (cur != '_' && !isalnum(cur)) { break; } lexer_next(lexer); } lexer_next(lexer); token.to = lexer->current.place; token_kind_t alter = token_kind_for_single(token.from, (size_t)(token.to-token.from)); if (alter != TOK_INVALID) { token.kind = alter; } return token; } static token_t lexer_read_string(lexer_t lexer) { char cur = lexer_current(lexer); token_mark_t mark = lexer_mark(lexer); token_t token = { .kind = TOK_STRING_LIT, .line = mark.line, .column = 
mark.column, .from = mark.place, .to = NULL, }; while (lexer_has_next(lexer) && (cur = lexer_next(lexer)) != '"') { if (cur == '\n') { asprintf(&lexer->error, "[%d:%d] String literal does not terminate before newline or EOF\n", lexer->current.line, lexer->current.column); token.kind = TOK_INVALID; return token; } } lexer_next(lexer); token.to = lexer->current.place; return token; } static token_t lexer_read_line_comment(lexer_t lexer) { char cur = lexer_current(lexer); token_mark_t mark = lexer_mark(lexer); token_t token = { .kind = TOK_LINE_COMMENT, .line = mark.line, .column = mark.column, .from = mark.place, .to = NULL, }; do { cur = lexer_next(lexer); } while(cur != 0 && cur != '\n'); lexer_next(lexer); token.to = lexer->current.place; return token; } int lexer_run(lexer_t lexer) { if (lexer == NULL \|\| lexer->error != NULL) { return 1; } token_mark_t mark; token_t comment = {.kind=TOK_INVALID}; token_t token; char cur; while(lexer_current(lexer) != 0) { token.kind = TOK_INVALID; lexer_skip_whitespace(lexer); mark = lexer_mark(lexer); cur = lexer_current(lexer); if (comment.kind == TOK_INVALID) { if (cur == '@') { token.kind = TOK_AT; if (lexer_next(lexer) == '@') { token.kind = TOK_DOUBLEAT; lexer_next(lexer); } token.from = mark.place; token.to = lexer->current.place; token.line = mark.line; token.column = mark.column; } if (cur == '.') { if (isdigit(lexer_peek(lexer))) { token = lexer_read_number(lexer); } else { token.kind = TOK_DOT; while(token.kind <= TOK_TRIPLEDOT && lexer_next(lexer) == '.') { ++token.kind; } token.from = mark.place; token.to = lexer->current.place; token.line = mark.line; token.column = mark.column; } } if (cur == '\'') { token = lexer_read_line_comment(lexer); } if (cur == '%') { char peek = lexer_peek(lexer); if (peek == '1' \|\| peek == '0') { token = lexer_read_base_number(lexer); } } if (cur == '$' && isxdigit(lexer_peek(lexer))) { token = lexer_read_base_number(lexer); } if (cur == '"') { token = lexer_read_string(lexer); } if 
(isdigit(cur)) { token = lexer_read_number(lexer); } if (token.kind == TOK_INVALID) { token_kind_t alter = token_kind_for_single(&cur, 1); if (alter != TOK_INVALID) { token.kind = alter; token.from = mark.place; token.line = mark.line; token.column = mark.column; lexer_next(lexer); token.to = lexer->current.place; } } } if (cur == '_' \|\| isalpha(cur)) { token = lexer_read_word(lexer); } if (comment.kind != TOK_INVALID) { if (token.kind == TOK_END_KW) { if (lexer_current(lexer) == ' ') { lexer_next(lexer); } if ((cur = lexer_current(lexer)) == '_' \|\| isalpha(cur)) { token_mark_t next_mark = lexer_mark(lexer); token_t next = lexer_read_word(lexer); if (next.kind == TOK_REM_KW) { token.kind = TOK_ENDREM_KW; token.to = next.to; } else { lexer_reset(lexer, next_mark); } } } if (token.kind == TOK_ENDREM_KW) { token_t block = { .kind = TOK_BLOCK_COMMENT, .line = comment.line, .column = comment.column, .from = comment.to + 1, .to = token.from - 1, }; comment.kind = TOK_INVALID; lexer_new_token(lexer) = block; } if (token.kind == TOK_INVALID) { lexer_next(lexer); lexer_skip_whitespace(lexer); } } if (token.kind != TOK_INVALID && comment.kind == TOK_INVALID) { lexer_new_token(lexer) = token; } if (comment.kind == TOK_INVALID && token.kind == TOK_REM_KW) { comment = token; } if (token.kind == TOK_INVALID && lexer->error == NULL) { asprintf(&lexer->error, "[%d:%d] Invalid token: %c\n", lexer->current.line, lexer->current.column, cur); } if (lexer->error != NULL) { return 1; } } lexer_new_token(lexer)->kind = TOK_EOF; unsigned int tok_index = 0; while (lexer->tokens[tok_index].kind != TOK_EOF) { token_t left, right; bool merged = false; left = lexer->tokens[tok_index]; right = lexer->tokens[tok_index+1]; const token_pair_t pair_iter = token_pairs; while (pair_iter->left != TOK_INVALID && !merged) { if (pair_iter->left == left.kind && pair_iter->right == right.kind && right.from <= left.to+pair_iter->range) { lexer_merge_tokens(lexer, tok_index, tok_index+1); merged = true; 
} ++pair_iter; } if (!merged) ++tok_index; } return 0; } token_t lexer_copy_tokens(lexer_t lexer, int num_tokens) { if (lexer == NULL \|\| num_tokens == NULL) return NULL; int num = lexer->current.token; token_t tokens = (token_t)calloc(lexer->current.token, sizeof(token_t)); memcpy(tokens, lexer->tokens, sizeof(token_t)num); num_tokens = num; return tokens; } int lexer_get_num_tokens(lexer_t lexer) { return lexer->current.token; } token_kind_t lexer_get_token(lexer_t lexer, int index, token_t token) { if (lexer == NULL) { return TOK_INVALID; } if (index < 0 \|\| lexer->current.token <= index) { if (token != NULL) { token->kind = TOK_INVALID; } return TOK_INVALID; } if (token != NULL) { token = lexer->tokens[index]; } return token->kind; } const char lexer_get_error(lexer_t lexer) { return (const char)(lexer != NULL ? lexer->error : NULL); } --v bmxlexer.bmx SuperStrict Module Cower.BMXLexer ModuleInfo "Name: BlitzMax Lexer" ModuleInfo "Description: Wrapped lexer for BlitzMax source code" ModuleInfo "Author: Noel Cower" ModuleInfo "License: Public Domain" Import "lexer.c" Private Extern "C" Function lexer_new@Ptr(source_begin@Ptr, source_end@Ptr) Function lexer_destroy(lexer@Ptr) Function lexer_run:Int(lexer@Ptr) Function lexer_get_error$z(lexer@Ptr) Function lexer_get_num_tokens:Int(lexer@Ptr) Function lexer_get_token:Int(lexer@Ptr, index%, token@Ptr) ' Function lexer_copy_tokens@Ptr(lexer@Ptr, num_tokens%Ptr)'unused Function token_to_string@Ptr(tok@Ptr) Function free(b@Ptr) End Extern Public Type TToken Field kind% ' token_kind_t Field _from:Byte Ptr ' const char * Field _to_:Byte Ptr ' const char * Field line% ' int Field column% ' int Field _cachedStr$=Null Method ToString$() If _cachedStr = Null Then Local cstr@Ptr = token_to_string(Self) _cachedStr = String.FromCString(cstr) free(cstr) EndIf Return _cachedStr End Method '#region token_kind_t Const TOK_INVALID% = 0 Const TOK_ID% = 1 Const TOK_END_KW% = 2 Const TOK_FUNCTION_KW% = 3 Const TOK_ENDFUNCTION_KW% = 4 
Const TOK_METHOD_KW% = 5 Const TOK_ENDMETHOD_KW% = 6 Const TOK_TYPE_KW% = 7 Const TOK_EXTENDS_KW% = 8 Const TOK_ABSTRACT_KW% = 9 Const TOK_FINAL_KW% = 10 Const TOK_NODEBUG_KW% = 11 Const TOK_ENDTYPE_KW% = 12 Const TOK_EXTERN_KW% = 13 Const TOK_ENDEXTERN_KW% = 14 Const TOK_REM_KW% = 15 Const TOK_ENDREM_KW% = 16 Const TOK_FLOAT_KW% = 17 Const TOK_DOUBLE_KW% = 18 Const TOK_BYTE_KW% = 19 Const TOK_SHORT_KW% = 20 Const TOK_INT_KW% = 21 Const TOK_STRING_KW% = 22 Const TOK_OBJECT_KW% = 23 Const TOK_LOCAL_KW% = 24 Const TOK_GLOBAL_KW% = 25 Const TOK_CONST_KW% = 26 Const TOK_VARPTR_KW% = 27 Const TOK_PTR_KW% = 28 Const TOK_VAR_KW% = 29 Const TOK_NULL_KW% = 30 Const TOK_STRICT_KW% = 31 Const TOK_SUPERSTRICT_KW% = 32 Const TOK_FRAMEWORK_KW% = 33 Const TOK_MODULE_KW% = 34 Const TOK_MODULEINFO_KW% = 35 Const TOK_IMPORT_KW% = 36 Const TOK_INCLUDE_KW% = 37 Const TOK_PRIVATE_KW% = 38 Const TOK_PUBLIC_KW% = 39 Const TOK_OR_KW% = 40 Const TOK_AND_KW% = 41 Const TOK_SHR_KW% = 42 Const TOK_SHL_KW% = 43 Const TOK_SAR_KW% = 44 Const TOK_MOD_KW% = 45 Const TOK_NOT_KW% = 46 Const TOK_WHILE_KW% = 47 Const TOK_WEND_KW% = 48 Const TOK_ENDWHILE_KW% = 49 Const TOK_FOR_KW% = 50 Const TOK_NEXT_KW% = 51 Const TOK_UNTIL_KW% = 52 Const TOK_TO_KW% = 53 Const TOK_EACHIN_KW% = 54 Const TOK_REPEAT_KW% = 55 Const TOK_FOREVER_KW% = 56 Const TOK_IF_KW% = 57 Const TOK_ENDIF_KW% = 58 Const TOK_ELSE_KW% = 59 Const TOK_ELSEIF_KW% = 60 Const TOK_THEN_KW% = 61 Const TOK_SELECT_KW% = 62 Const TOK_CASE_KW% = 63 Const TOK_DEFAULT_KW% = 64 Const TOK_ENDSELECT_KW% = 65 Const TOK_SELF_KW% = 66 Const TOK_SUPER_KW% = 67 Const TOK_PI_KW% = 68 Const TOK_NEW_KW% = 69 Const TOK_PROTOCOL_KW% = 70 Const TOK_ENDPROTOCOL_KW% = 71 Const TOK_AUTO_KW% = 72 Const TOK_IMPLEMENTS_KW% = 73 Const TOK_COLON% = 74 Const TOK_QUESTION% = 75 Const TOK_BANG% = 76 Const TOK_HASH% = 77 Const TOK_DOT% = 78 Const TOK_DOUBLEDOT% = 79 Const TOK_TRIPLEDOT% = 80 Const TOK_AT% = 81 Const TOK_DOUBLEAT% = 82 Const TOK_DOLLAR% = 83 Const TOK_PERCENT% = 
84 Const TOK_SINGLEQUOTE% = 85 Const TOK_OPENPAREN% = 86 Const TOK_CLOSEPAREN% = 87 Const TOK_OPENBRACKET% = 88 Const TOK_CLOSEBRACKET% = 89 Const TOK_OPENCURL% = 90 Const TOK_CLOSECURL% = 91 Const TOK_GREATERTHAN% = 92 Const TOK_LESSTHAN% = 93 Const TOK_EQUALS% = 94 Const TOK_MINUS% = 95 Const TOK_PLUS% = 96 Const TOK_ASTERISK% = 97 Const TOK_CARET% = 98 Const TOK_TILDE% = 99 Const TOK_GRAVE% = 100 Const TOK_BACKSLASH% = 101 Const TOK_SLASH% = 102 Const TOK_COMMA% = 103 Const TOK_SEMICOLON% = 104 Const TOK_PIPE% = 105 Const TOK_AMPERSAND% = 106 Const TOK_NEWLINE% = 107 Const TOK_ASSIGN_ADD% = 108 Const TOK_ASSIGN_SUBTRACT% = 109 Const TOK_ASSIGN_DIVIDE% = 110 Const TOK_ASSIGN_MULTIPLY% = 111 Const TOK_ASSIGN_POWER% = 112 Const TOK_ASSIGN_SHL% = 113 Const TOK_ASSIGN_SHR% = 114 Const TOK_ASSIGN_SAR% = 115 Const TOK_ASSIGN_MOD% = 116 Const TOK_ASSIGN_XOR% = 117 Const TOK_ASSIGN_AND% = 118 Const TOK_ASSIGN_OR% = 119 Const TOK_ASSIGN_AUTO% = 120 Const TOK_DOUBLEMINUS% = 121 Const TOK_DOUBLEPLUS% = 122 Const TOK_NUMBER_LIT% = 123 Const TOK_HEX_LIT% = 124 Const TOK_BIN_LIT% = 125 Const TOK_STRING_LIT% = 126 Const TOK_LINE_COMMENT% = 127 Const TOK_BLOCK_COMMENT% = 128 Const TOK_EOF% = 129 Const TOK_LAST%=TOK_EOF Const TOK_COUNT%=TOK_LAST+1 '#endregion End Type Type TLexer Field _lexer@Ptr ' lexer_t Field _run:Int = False Field _cstr_source@Ptr Field _length% Field _tokens:TToken[] Field _error:String = Null Method InitWithSource:TLexer(source$) Assert _cstr_source=Null Else "Lexer already initialized" _cstr_source = source.ToCString() _length = source.Length _lexer = lexer_new(_cstr_source, _cstr_source+_length) Return Self End Method Method Delete() If _cstr_source Then MemFree(_cstr_source) EndIf If _lexer Then lexer_destroy(_lexer) EndIf End Method Method Run:Int() Assert _run = False Else "Lexer has already run" _run = True Local r% = lexer_run(_lexer) If r <> 0 Then _error = lexer_get_error(_lexer) EndIf Return (r=0) End Method Method _cacheTokens() If _tokens = Null 
Then _tokens = New TToken[lexer_get_num_tokens(_lexer)] For Local init_idx:Int = 0 Until _tokens.Length _tokens[init_idx] = New TToken lexer_get_token(_lexer, init_idx, _tokens[init_idx]) Next EndIf End Method Method GetToken:TToken(index%) _cacheTokens() Return _tokens[index] End Method Method GetTokens:TToken[]() _cacheTokens() Return _tokens End Method Method NumTokens:Int() If _tokens Then Return _tokens.Length EndIf Return lexer_get_num_tokens(_lexer) End Method Method GetError$() Return _error End Method End Type --v

Comments

Comments