diff options
| author | iamcheeseman <[email protected]> | 2026-04-06 17:04:05 -0400 |
|---|---|---|
| committer | iamcheeseman <[email protected]> | 2026-04-06 17:06:53 -0400 |
| commit | 957c64c7b8b5e98d8a03dd84c7e27e7991fb9dbc (patch) | |
| tree | f5fc230703791cee8d8e7851fb87eaef07ae63a2 /uscript/lex.c | |
Initial commit
Diffstat (limited to 'uscript/lex.c')
| -rw-r--r-- | uscript/lex.c | 382 |
1 files changed, 382 insertions, 0 deletions
diff --git a/uscript/lex.c b/uscript/lex.c new file mode 100644 index 0000000..cc41b2a --- /dev/null +++ b/uscript/lex.c @@ -0,0 +1,382 @@ +#include "lex.h" + +#include <string.h> +#include <stdio.h> +#include <math.h> + +#include "dyn_arr.h" + +#define advance(lex) (++lex->head, ++lex->col) +#define current_char(lex) (*lex->head) + +// For printing purposes +static +char *token_names[] = { + "TOKEN_EOF", +#define TOKEN_NAME(name) "TOKEN_" #name, + XTOKENS(TOKEN_NAME) +#undef TOKEN_NAME +}; + +static +bool is_digit(char c) +{ + return c >= '0' && c <= '9'; +} + +static +bool is_upper(char c) +{ + return c >= 'A' && c <= 'Z'; +} + +static +bool is_lower(char c) +{ + return c >= 'a' && c <= 'z'; +} + +static +bool is_ident(char c) +{ + return is_lower(c) || + is_upper(c) || + c == '_'; +} + +static +bool match_kw(const struct lexer *lex, const char *b) +{ + size_t alen = (size_t)(lex->head - lex->base); + size_t blen = strlen(b); + return alen == blen && memcmp(lex->base, b, alen) == 0; +} + +// Advances if the given character is a match. +static +bool match_char(struct lexer *lex, char c) +{ + if (current_char(lex) == c) { + advance(lex); + return true; + } + return false; +} + +// Looks for a non-whitespace character and sets the start of the token to +// that. If it sees a newline, it will also update all relevant data. And +// finally, if it sees a #, it will interpret it as a comment. +static +void goto_token_start(struct lexer *lex) +{ + while (true) { + switch (current_char(lex)) { + case '#': + while ( + current_char(lex) != '\n' && + current_char(lex) != '\0' + ) + advance(lex); + break; + case '\n': + advance(lex); + lex->line++; + lex->col = 0; + break; + case ' ': + case '\t': + case '\r': + advance(lex); + break; + default: + lex->base = lex->head; + return; + } + } +} + +static +struct token create_token(struct lexer *lex, u16 token_kind) +{ + struct token tok; + tok.kind = token_kind; + tok.start = lex->base; + tok.len = (int)(lex->head - lex->base); + tok.line = lex->line; + tok.col = lex->col - tok.len; + tok.val = create_zilch(); + return tok; +} + +static +struct token err_token(struct lexer *lex, const char *msg) +{ + struct token tok; + tok.kind = TOKEN_ERR; + tok.start = msg; + tok.len = (int)strlen(msg); + tok.line = lex->line; + tok.col = lex->col; + tok.val = create_zilch(); + return tok; +} + +static +struct token num_token(struct lexer *lex) +{ + while (is_digit(current_char(lex))) { + advance(lex); + } + + if (current_char(lex) == '.') { + advance(lex); + while (is_digit(current_char(lex))) { + advance(lex); + } + } + + return create_token(lex, TOKEN_NUM); +} + +// And ident(ifier) token is either a keyword, or an actual identifier +static +struct token ident_token(struct lexer *lex) +{ + while (true) { + char c = current_char(lex); + if (c == ':') { + // If a : appears in the middle of an identifier, allow + // it + if (!is_ident(lex->head[1])) + break; + + } else if (!is_ident(c) && !is_digit(c)) { + break; + } + + + advance(lex); + } + + // : is just to be used as a namespacer. So obviously it should be + // disallowed at the beginning and end of identifiers. + if (lex->head[-1] == ':') + return err_token(lex, "cannot end an identifier in ':'"); + + u16 kind = TOKEN_IDENT; + + if (match_kw(lex, "if")) + kind = TOKEN_IF; + else if (match_kw(lex, "elseif")) + kind = TOKEN_ELSEIF; + else if (match_kw(lex, "else")) + kind = TOKEN_ELSE; + else if (match_kw(lex, "loop")) + kind = TOKEN_LOOP; + else if (match_kw(lex, "mod")) + kind = TOKEN_MOD; + else if (match_kw(lex, "true")) + kind = TOKEN_TRUE; + else if (match_kw(lex, "false")) + kind = TOKEN_FALSE; + else if (match_kw(lex, "zilch")) + kind = TOKEN_ZILCH; + else if (match_kw(lex, "nada")) + kind = TOKEN_ZILCH; + else if (match_kw(lex, "do")) + kind = TOKEN_DO; + else if (match_kw(lex, "break")) + kind = TOKEN_BREAK; + else if (match_kw(lex, "next")) + kind = TOKEN_NEXT; + else if (match_kw(lex, "in")) + kind = TOKEN_IN; + else if (match_kw(lex, "fun")) + kind = TOKEN_FUN; + else if (match_kw(lex, "ret")) + kind = TOKEN_RET; + else if (match_kw(lex, "let")) + kind = TOKEN_LET; + else if (match_kw(lex, "end")) + kind = TOKEN_END; + else if (match_kw(lex, "global")) + kind = TOKEN_GLOBAL; + else if (match_kw(lex, "print")) + kind = TOKEN_PRINT; + + return create_token(lex, kind); +} + +static +struct token str_token(struct lexer *lex, char term) +{ + // TODO: escape sequences + char *chars = da_create(char, 0); + + while (current_char(lex) != term && current_char(lex) != '\0') { + if (current_char(lex) != '\\') { + da_append(char, &chars, current_char(lex)); + advance(lex); + continue; + } + advance(lex); + + switch (current_char(lex)) { + case '\\': da_append(char, &chars, '\\'); break; + case '\'': da_append(char, &chars, '\''); break; + case '"': da_append(char, &chars, '"'); break; + case 'n': da_append(char, &chars, '\n'); break; + case 't': da_append(char, &chars, '\t'); break; + case 'r': da_append(char, &chars, '\r'); break; + default: + da_free(chars); + return err_token(lex, "invalid escape sequence"); + } + + advance(lex); // eat escape + } + + da_append(char, &chars, 0); + + if (current_char(lex) == '\0') + return err_token(lex, "string never terminates"); + + advance(lex); // eat terminator + + struct token str = create_token(lex, TOKEN_STR); + str.val = wrap_str(copy_str(chars, da_len(chars) - 1)); + da_free(chars); + return str; +} + +// A symbol token is any token that contains no alphanumeric characters. +static +struct token symbol_token(struct lexer *lex, char c) +{ + switch (c) { + case '(': + case ')': + case '{': + case '}': + case '[': + case ']': + case ',': + case ';': + case ':': + return create_token(lex, c); + case '.': + return create_token( + lex, + match_char(lex, '.') ? TOKEN_DOT_DOT : c + ); + case '+': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_PLUS_EQL : c + ); + case '-': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_MINUS_EQL : c + ); + case '*': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_MULT_EQL : c + ); + case '/': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_DIV_EQL : c + ); + case '%': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_MOD_EQL : c + ); + case '<': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_LTEQL : c + ); + case '>': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_GTEQL : c + ); + case '=': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_EQL : c + ); + case '!': + return create_token( + lex, + match_char(lex, '=') ? TOKEN_NEQL : c + ); + } + + // TODO: log unknown character + return err_token(lex, "unknown character"); +} + +void lex_init(struct lexer *lex, const char *src) +{ + lex->src = src; + lex->base = src; + lex->head = src; + + lex->line = 1; + lex->col = 0; +} + +struct token lex_next_token(struct lexer *lex) +{ + goto_token_start(lex); + + char c = current_char(lex); + + if (c == '\0') + return create_token(lex, TOKEN_EOF); + + advance(lex); + + if (c == '"' || c == '\'') + return str_token(lex, c); + if (is_digit(c)) + return num_token(lex); + if (is_ident(c)) + return ident_token(lex); + + return symbol_token(lex, c); +} + +void token_kind_name(char *dst, size_t len, u16 kind) +{ + char char_name[2] = {0, 0}; + const char *name = char_name; + + if (kind < TOKEN_EOF) + char_name[0] = kind; + else + name = token_names[kind - TOKEN_EOF]; + + len = fmin(len, strlen(name)); + memcpy(dst, name, len); + dst[len] = '\0'; +} + +void print_token(struct token tok) +{ + char kind_name[128]; + token_kind_name(kind_name, 128, tok.kind); + + printf( + "%-15s | %3d:%-2d | %-24.*s\n", + kind_name, + tok.line, + tok.col, + tok.len, + tok.start + ); +} |
