#include "lex.h"

// NOTE(review): the names of the three system headers were lost in this copy
// of the file. They are reconstructed here from the library functions the
// file uses (fmin, printf, strlen/memcmp/memcpy) -- confirm against the
// original before committing.
#include <math.h>
#include <stdio.h>
#include <string.h>

#include "dyn_arr.h"

// Consumes one character: moves the read head forward and bumps the column.
#define advance(lex) (++(lex)->head, ++(lex)->col)
// The character under the read head (not yet consumed).
#define current_char(lex) (*(lex)->head)

// For printing purposes. Entry 0 is TOKEN_EOF, the rest are generated from
// XTOKENS; token_kind_name() indexes this table with (kind - TOKEN_EOF).
static const char *const token_names[] = {
	"TOKEN_EOF",
#define TOKEN_NAME(name) "TOKEN_" #name,
	XTOKENS(TOKEN_NAME)
#undef TOKEN_NAME
};

// True for the ASCII digits '0'..'9'.
static bool is_digit(char c)
{
	return c >= '0' && c <= '9';
}

// True for the ASCII letters 'A'..'Z'.
static bool is_upper(char c)
{
	return c >= 'A' && c <= 'Z';
}

// True for the ASCII letters 'a'..'z'.
static bool is_lower(char c)
{
	return c >= 'a' && c <= 'z';
}

// True for characters that may start an identifier: a letter or '_'.
static bool is_ident(char c)
{
	return is_lower(c) || is_upper(c) || c == '_';
}

// True if the text between the token start (base) and the read head (head)
// is exactly the keyword b.
static bool match_kw(const struct lexer *lex, const char *b)
{
	size_t alen = (size_t)(lex->head - lex->base);
	size_t blen = strlen(b);

	return alen == blen && memcmp(lex->base, b, alen) == 0;
}

// Advances if the given character is a match.
static bool match_char(struct lexer *lex, char c)
{
	if (current_char(lex) != c)
		return false;

	advance(lex);
	return true;
}

// Looks for a non-whitespace character and sets the start of the token to
// that. If it sees a newline, it will also update all relevant data. And
// finally, if it sees a #, it will interpret it as a comment.
static void goto_token_start(struct lexer *lex)
{
	while (true) {
		switch (current_char(lex)) {
		case '#':
			// Skip to the end of the line (or input). The newline
			// itself is handled by the next loop iteration.
			while (
				current_char(lex) != '\n' &&
				current_char(lex) != '\0'
			) {
				advance(lex);
			}
			break;
		case '\n':
			advance(lex);
			lex->line++;
			lex->col = 0;
			break;
		case ' ':
		case '\t':
		case '\r':
			advance(lex);
			break;
		default:
			lex->base = lex->head;
			return;
		}
	}
}

// Builds a token of the given kind spanning base..head. The column is that
// of the token's first character; the value is initialised with
// create_zilch().
static struct token create_token(struct lexer *lex, u16 token_kind)
{
	struct token tok;

	tok.kind = token_kind;
	tok.start = lex->base;
	tok.len = (int)(lex->head - lex->base);
	tok.line = lex->line;
	tok.col = lex->col - tok.len;
	tok.val = create_zilch();
	return tok;
}

// Builds a TOKEN_ERR token whose text is the message itself (not the source
// text), positioned at the current line and column.
static struct token err_token(struct lexer *lex, const char *msg)
{
	struct token tok;

	tok.kind = TOKEN_ERR;
	tok.start = msg;
	tok.len = (int)strlen(msg);
	tok.line = lex->line;
	tok.col = lex->col;
	tok.val = create_zilch();
	return tok;
}

// Lexes the rest of a number whose first digit has already been consumed:
// digits, optionally followed by '.' and at least one more digit.
static struct token num_token(struct lexer *lex)
{
	while (is_digit(current_char(lex)))
		advance(lex);

	// Only treat '.' as a decimal point when a digit follows it. Otherwise
	// "1..5" would be split into "1." "." "5" and the ".." operator could
	// never follow a number. Reading head[1] is safe: head[0] is '.', so
	// the terminating NUL has not been reached yet.
	if (current_char(lex) == '.' && is_digit(lex->head[1])) {
		advance(lex); // eat '.'
		while (is_digit(current_char(lex)))
			advance(lex);
	}

	return create_token(lex, TOKEN_NUM);
}

// An ident(ifier) token is either a keyword, or an actual identifier. The
// first character has already been consumed by the caller.
static struct token ident_token(struct lexer *lex)
{
	// Keyword spellings and their token kinds. "nada" and "zilch" are two
	// spellings of the same token.
	static const struct {
		const char *text;
		u16 kind;
	} keywords[] = {
		{"and", TOKEN_AND},
		{"break", TOKEN_BREAK},
		{"do", TOKEN_DO},
		{"else", TOKEN_ELSE},
		{"elseif", TOKEN_ELSEIF},
		{"end", TOKEN_END},
		{"false", TOKEN_FALSE},
		{"fun", TOKEN_FUN},
		{"global", TOKEN_GLOBAL},
		{"if", TOKEN_IF},
		{"in", TOKEN_IN},
		{"let", TOKEN_LET},
		{"loop", TOKEN_LOOP},
		{"nada", TOKEN_ZILCH},
		{"next", TOKEN_NEXT},
		{"not", TOKEN_NOT},
		{"or", TOKEN_OR},
		{"ret", TOKEN_RET},
		{"true", TOKEN_TRUE},
		{"zilch", TOKEN_ZILCH},
	};

	while (true) {
		char c = current_char(lex);

		if (c == ':') {
			// If a : appears in the middle of an identifier, allow
			// it. That means it must be followed by an identifier
			// start character; otherwise the identifier ends here
			// and the ':' is left for the next token.
			if (!is_ident(lex->head[1]))
				break;
		} else if (!is_ident(c) && !is_digit(c)) {
			break;
		}
		advance(lex);
	}

	// : is just to be used as a namespacer. So obviously it should be
	// disallowed at the beginning and end of identifiers. The loop above
	// never consumes a ':' that is not followed by an identifier
	// character, so this check is purely defensive.
	if (lex->head[-1] == ':')
		return err_token(lex, "cannot end an identifier in ':'");

	u16 kind = TOKEN_IDENT;
	for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
		if (match_kw(lex, keywords[i].text)) {
			kind = keywords[i].kind;
			break;
		}
	}

	return create_token(lex, kind);
}

// Lexes a string literal whose opening quote has already been consumed.
// term is the quote character that closes it. Supported escapes are
// \\ \' \" \n \t and \r; anything else after a backslash is an error. On
// success the token's value is a string built from the unescaped characters.
static struct token str_token(struct lexer *lex, char term)
{
	char *chars = da_create(char, 0);

	while (current_char(lex) != term && current_char(lex) != '\0') {
		if (current_char(lex) != '\\') {
			da_append(char, &chars, current_char(lex));
			advance(lex);
			continue;
		}

		advance(lex); // eat backslash
		switch (current_char(lex)) {
		case '\\': da_append(char, &chars, '\\'); break;
		case '\'': da_append(char, &chars, '\''); break;
		case '"': da_append(char, &chars, '"'); break;
		case 'n': da_append(char, &chars, '\n'); break;
		case 't': da_append(char, &chars, '\t'); break;
		case 'r': da_append(char, &chars, '\r'); break;
		default:
			da_free(chars);
			return err_token(lex, "invalid escape sequence");
		}
		advance(lex); // eat escape
	}

	if (current_char(lex) == '\0') {
		// Release the scratch buffer on this error path too; it used
		// to be leaked here.
		da_free(chars);
		return err_token(lex, "string never terminates");
	}

	da_append(char, &chars, 0);
	advance(lex); // eat terminator

	struct token str = create_token(lex, TOKEN_STR);
	// The trailing NUL appended above is not counted in the length.
	str.val = wrap_str(copy_str(chars, da_len(chars) - 1));
	da_free(chars);
	return str;
}

// A symbol token is any token that contains no alphanumeric characters.
// c is the symbol's first character, already consumed by the caller.
// Single-character symbols use the character itself as the token kind.
static struct token symbol_token(struct lexer *lex, char c)
{
	switch (c) {
	case '(':
	case ')':
	case '{':
	case '}':
	case '[':
	case ']':
	case ',':
	case ';':
	case ':':
		return create_token(lex, c);
	case '.':
		return create_token(
			lex, match_char(lex, '.') ? TOKEN_DOT_DOT : c
		);
	case '+':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_PLUS_EQL : c
		);
	case '-':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_MINUS_EQL : c
		);
	case '*':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_MULT_EQL : c
		);
	case '/':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_DIV_EQL : c
		);
	case '%':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_MOD_EQL : c
		);
	case '<':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_LTEQL : c
		);
	case '>':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_GTEQL : c
		);
	case '=':
		return create_token(
			lex, match_char(lex, '=') ? TOKEN_EQL : c
		);
	case '!':
		// A lone '!' is not a token; only "!=" is.
		if (match_char(lex, '='))
			return create_token(lex, TOKEN_NEQL);
		break;
	default:
		break;
	}

	// TODO: log unknown character
	return err_token(lex, "unknown character");
}

// Prepares lex to read tokens from the NUL-terminated string src. Lines are
// counted from 1 and columns from 0.
void lex_init(struct lexer *lex, const char *src)
{
	lex->src = src;
	lex->base = src;
	lex->head = src;
	lex->line = 1;
	lex->col = 0;
}

// Skips whitespace and comments, then returns the next token. Returns a
// TOKEN_EOF token at the end of input and a TOKEN_ERR token on a lexing
// error.
struct token lex_next_token(struct lexer *lex)
{
	goto_token_start(lex);

	char c = current_char(lex);
	if (c == '\0')
		return create_token(lex, TOKEN_EOF);

	// Every token kind below expects its first character to be consumed.
	advance(lex);

	if (c == '"' || c == '\'')
		return str_token(lex, c);
	if (is_digit(c))
		return num_token(lex);
	if (is_ident(c))
		return ident_token(lex);
	return symbol_token(lex, c);
}

// Writes a printable name for the token kind into dst as a NUL-terminated
// string. len is the size of the dst buffer in bytes; the name is truncated
// to fit and nothing is written when len is 0. Kinds below TOKEN_EOF are
// single-character tokens and are named by that character.
void token_kind_name(char *dst, size_t len, u16 kind)
{
	char char_name[2] = {0, 0};
	const char *name = char_name;

	if (len == 0)
		return;

	if (kind < TOKEN_EOF) {
		char_name[0] = (char)kind;
	} else {
		size_t idx = (size_t)(kind - TOKEN_EOF);
		size_t count = sizeof token_names / sizeof token_names[0];

		// Never read past the end of the name table.
		name = idx < count ? token_names[idx] : "TOKEN_UNKNOWN";
	}

	// Leave room for the terminator. The previous code clamped to len and
	// then wrote dst[len], one byte past the buffer, for long names.
	size_t n = strlen(name);
	if (n > len - 1)
		n = len - 1;
	memcpy(dst, name, n);
	dst[n] = '\0';
}

// Prints one line describing the token: kind name, line:column and text.
void print_token(struct token tok)
{
	char kind_name[128];

	token_kind_name(kind_name, sizeof kind_name, tok.kind);
	printf(
		"%-15s | %3d:%-2d | %-24.*s\n",
		kind_name, tok.line, tok.col, tok.len, tok.start
	);
}