Initial commit

author: iamcheeseman <[email protected]> 2026-04-06 17:04:05 -0400
committer: iamcheeseman <[email protected]> 2026-04-06 17:06:53 -0400
commit: 957c64c7b8b5e98d8a03dd84c7e27e7991fb9dbc (patch)
tree: f5fc230703791cee8d8e7851fb87eaef07ae63a2 /uscript/lex.c
1 files changed, 382 insertions, 0 deletions
diff --git a/uscript/lex.c b/uscript/lex.c
new file mode 100644
index 0000000..cc41b2a
--- /dev/null
+++ b/uscript/lex.c
@@ -0,0 +1,382 @@
+#include "lex.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "dyn_arr.h"
+
+// Steps the read head one character forward and bumps the column counter.
+// The argument is parenthesized so that any pointer expression (for example
+// &lexer) can be passed; note that it is evaluated twice.
+#define advance(lex) (++(lex)->head, ++(lex)->col)
+// The character under the read head. Nothing is consumed.
+#define current_char(lex) (*(lex)->head)
+
+// Printable names for the named token kinds, for printing purposes. The
+// table is indexed by (kind - TOKEN_EOF): entry 0 is TOKEN_EOF and the rest
+// are generated from the XTOKENS list. Both the strings and the table are
+// read-only.
+static
+const char *const token_names[] = {
+        "TOKEN_EOF",
+#define TOKEN_NAME(name) "TOKEN_" #name,
+        XTOKENS(TOKEN_NAME)
+#undef TOKEN_NAME
+};
+
+// True when c is one of the decimal digit characters '0' through '9'.
+static
+bool is_digit(char c)
+{
+        if (c < '0')
+                return false;
+        return c <= '9';
+}
+
+// True when c lies in the range 'A' through 'Z'.
+static
+bool is_upper(char c)
+{
+        return !(c < 'A' || c > 'Z');
+}
+
+// True when c lies in the range 'a' through 'z'.
+static
+bool is_lower(char c)
+{
+        return 'a' <= c && c <= 'z';
+}
+
+// True when c may start an identifier: a letter or an underscore. Digits are
+// not included; callers that allow them test is_digit separately.
+static
+bool is_ident(char c)
+{
+        if (c == '_')
+                return true;
+        return is_upper(c) || is_lower(c);
+}
+
+// Reports whether the text scanned so far for the current token (from
+// lex->base up to, but not including, lex->head) is exactly the string b.
+static
+bool match_kw(const struct lexer *lex, const char *b)
+{
+        size_t scanned = (size_t)(lex->head - lex->base);
+
+        if (scanned != strlen(b))
+                return false;
+        return memcmp(lex->base, b, scanned) == 0;
+}
+
+// Consumes the current character only when it equals c. Returns whether a
+// character was consumed.
+static
+bool match_char(struct lexer *lex, char c)
+{
+        if (current_char(lex) != c)
+                return false;
+
+        advance(lex);
+        return true;
+}
+
+// Skips everything that cannot start a token: spaces, tabs, carriage
+// returns, newlines, and '#' comments, which run to the end of their line.
+// Every newline bumps lex->line and resets lex->col. On return lex->base and
+// lex->head both point at the first character of the next token, or at the
+// terminating '\0'.
+static
+void goto_token_start(struct lexer *lex)
+{
+        for (;;) {
+                char c = current_char(lex);
+
+                if (c == '#') {
+                        // Stop on the newline itself rather than past it, so
+                        // the newline branch below does the line accounting.
+                        do {
+                                advance(lex);
+                                c = current_char(lex);
+                        } while (c != '\n' && c != '\0');
+                        continue;
+                }
+
+                if (c == '\n') {
+                        advance(lex);
+                        lex->line++;
+                        lex->col = 0;
+                        continue;
+                }
+
+                if (c == ' ' || c == '\t' || c == '\r') {
+                        advance(lex);
+                        continue;
+                }
+
+                // Anything else starts a token (or is the end of input).
+                lex->base = lex->head;
+                return;
+        }
+}
+
+// Builds a token of the given kind covering the text from lex->base up to
+// lex->head. The recorded column is where the token starts, obtained by
+// backing the current column up by the token length. The value is zilch.
+static
+struct token create_token(struct lexer *lex, u16 token_kind)
+{
+        struct token tok;
+
+        tok.val = create_zilch();
+        tok.kind = token_kind;
+        tok.line = lex->line;
+
+        tok.start = lex->base;
+        tok.len = (int)(lex->head - lex->base);
+        // Must follow the assignment to tok.len, which it reads back.
+        tok.col = lex->col - tok.len;
+
+        return tok;
+}
+
+// Builds a TOKEN_ERR token. Unlike other tokens, its start/len describe the
+// error message msg rather than source text; the message pointer is stored
+// as-is, not copied. Line and column are the lexer's current position.
+static
+struct token err_token(struct lexer *lex, const char *msg)
+{
+        struct token tok;
+
+        tok.val = create_zilch();
+        tok.kind = TOKEN_ERR;
+
+        tok.line = lex->line;
+        tok.col = lex->col;
+
+        tok.start = msg;
+        tok.len = (int)strlen(msg);
+
+        return tok;
+}
+
+// Scans the rest of a number literal; the first digit has already been
+// consumed by the caller. Accepts an integer part, optionally followed by a
+// '.' and a fractional part.
+//
+// The '.' is only taken when a digit follows it. Otherwise it is left for
+// the next token, so that a range such as 1..5 lexes as NUM, DOT_DOT, NUM
+// instead of the first dot being swallowed into the number.
+static
+struct token num_token(struct lexer *lex)
+{
+        while (is_digit(current_char(lex))) {
+                advance(lex);
+        }
+
+        // Reading head[1] is safe here: the current character is '.', so
+        // the terminating '\0' has not been reached yet.
+        if (current_char(lex) == '.' && is_digit(lex->head[1])) {
+                advance(lex);
+                while (is_digit(current_char(lex))) {
+                        advance(lex);
+                }
+        }
+
+        return create_token(lex, TOKEN_NUM);
+}
+
+// An ident(ifier) token is either a keyword or an actual identifier. The
+// first character has already been consumed by the caller.
+static
+struct token ident_token(struct lexer *lex)
+{
+        // Keyword spellings and the token kind each one produces. "zilch"
+        // and "nada" are two spellings of the same token.
+        static const struct {
+                const char *text;
+                u16 kind;
+        } keywords[] = {
+                {"if", TOKEN_IF},
+                {"elseif", TOKEN_ELSEIF},
+                {"else", TOKEN_ELSE},
+                {"loop", TOKEN_LOOP},
+                {"mod", TOKEN_MOD},
+                {"true", TOKEN_TRUE},
+                {"false", TOKEN_FALSE},
+                {"zilch", TOKEN_ZILCH},
+                {"nada", TOKEN_ZILCH},
+                {"do", TOKEN_DO},
+                {"break", TOKEN_BREAK},
+                {"next", TOKEN_NEXT},
+                {"in", TOKEN_IN},
+                {"fun", TOKEN_FUN},
+                {"ret", TOKEN_RET},
+                {"let", TOKEN_LET},
+                {"end", TOKEN_END},
+                {"global", TOKEN_GLOBAL},
+                {"print", TOKEN_PRINT},
+        };
+
+        for (;;) {
+                char c = current_char(lex);
+
+                // ':' is a namespace separator: it is accepted inside an
+                // identifier only when an identifier-start character
+                // follows it. Everything else must be a letter, '_' or a
+                // digit.
+                bool accepted = (c == ':')
+                        ? is_ident(lex->head[1])
+                        : (is_ident(c) || is_digit(c));
+                if (!accepted)
+                        break;
+
+                advance(lex);
+        }
+
+        // ':' must not end an identifier.
+        // NOTE(review): the loop above only steps over a ':' when an
+        // identifier character follows it, so this guard looks unreachable;
+        // it is kept as a safety net.
+        if (lex->head[-1] == ':')
+                return err_token(lex, "cannot end an identifier in ':'");
+
+        // Comparison is on the whole scanned text, so table order does not
+        // affect which keyword matches.
+        u16 kind = TOKEN_IDENT;
+        for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
+                if (match_kw(lex, keywords[i].text)) {
+                        kind = keywords[i].kind;
+                        break;
+                }
+        }
+
+        return create_token(lex, kind);
+}
+
+// Scans a string literal whose opening quote has already been consumed.
+// term is that quote character; the literal ends at the next occurrence of
+// it that is not part of an escape. The escapes \\ \' \" \n \t \r are
+// decoded. Any other escape yields an error token, as does reaching the end
+// of input before the closing quote.
+//
+// The decoded characters are collected in a temporary dynamic array that is
+// freed on every exit path. On success the token's val is the result of
+// wrap_str(copy_str(...)) over those characters.
+static
+struct token str_token(struct lexer *lex, char term)
+{
+        char *chars = da_create(char, 0);
+
+        while (current_char(lex) != term && current_char(lex) != '\0') {
+                if (current_char(lex) != '\\') {
+                        da_append(char, &chars, current_char(lex));
+                        advance(lex);
+                        continue;
+                }
+                advance(lex); // eat backslash
+
+                switch (current_char(lex)) {
+                case '\\': da_append(char, &chars, '\\'); break;
+                case '\'': da_append(char, &chars, '\''); break;
+                case '"': da_append(char, &chars, '"'); break;
+                case 'n': da_append(char, &chars, '\n'); break;
+                case 't': da_append(char, &chars, '\t'); break;
+                case 'r': da_append(char, &chars, '\r'); break;
+                default:
+                        da_free(chars);
+                        return err_token(lex, "invalid escape sequence");
+                }
+
+                advance(lex); // eat escape
+        }
+
+        if (current_char(lex) == '\0') {
+                // Release the scratch buffer here too; it used to leak on
+                // this path.
+                da_free(chars);
+                return err_token(lex, "string never terminates");
+        }
+
+        advance(lex); // eat terminator
+
+        // NUL-terminate the buffer; the terminator is excluded from the
+        // length handed to copy_str.
+        da_append(char, &chars, 0);
+
+        struct token str = create_token(lex, TOKEN_STR);
+        str.val = wrap_str(copy_str(chars, da_len(chars) - 1));
+        da_free(chars);
+        return str;
+}
+
+// A symbol token is any token that contains no alphanumeric characters. c is
+// the token's first character, which the caller has already consumed.
+// Single-character symbols use the character itself as their token kind;
+// ".." and the operators that take a trailing '=' have named kinds.
+static
+struct token symbol_token(struct lexer *lex, char c)
+{
+        // Kind of the two-character token formed when '=' follows c.
+        u16 with_eql;
+
+        switch (c) {
+        case '(':
+        case ')':
+        case '{':
+        case '}':
+        case '[':
+        case ']':
+        case ',':
+        case ';':
+        case ':':
+                return create_token(lex, c);
+        case '.':
+                if (match_char(lex, '.'))
+                        return create_token(lex, TOKEN_DOT_DOT);
+                return create_token(lex, c);
+        case '+': with_eql = TOKEN_PLUS_EQL; break;
+        case '-': with_eql = TOKEN_MINUS_EQL; break;
+        case '*': with_eql = TOKEN_MULT_EQL; break;
+        case '/': with_eql = TOKEN_DIV_EQL; break;
+        case '%': with_eql = TOKEN_MOD_EQL; break;
+        case '<': with_eql = TOKEN_LTEQL; break;
+        case '>': with_eql = TOKEN_GTEQL; break;
+        case '=': with_eql = TOKEN_EQL; break;
+        case '!': with_eql = TOKEN_NEQL; break;
+        default:
+                // TODO: log unknown character
+                return err_token(lex, "unknown character");
+        }
+
+        // Only the operators that can take a trailing '=' reach this point.
+        if (match_char(lex, '='))
+                return create_token(lex, with_eql);
+        return create_token(lex, c);
+}
+
+// Points the lexer at the start of src and resets its position to line 1,
+// column 0. The lexer stores pointers into src rather than copying it, and
+// tokens point into it as well, so src has to outlive both. Scanning stops
+// at the first '\0'.
+void lex_init(struct lexer *lex, const char *src)
+{
+        lex->line = 1;
+        lex->col = 0;
+
+        lex->head = src;
+        lex->base = src;
+        lex->src = src;
+}
+
+// Scans and returns the next token. Leading whitespace and comments are
+// skipped first. At the end of input a TOKEN_EOF token is returned, and it
+// is returned again on every later call because nothing is consumed.
+struct token lex_next_token(struct lexer *lex)
+{
+        goto_token_start(lex);
+
+        const char first = current_char(lex);
+        if (first == '\0')
+                return create_token(lex, TOKEN_EOF);
+
+        // Every scanner below expects its first character to be consumed
+        // already.
+        advance(lex);
+
+        if (first == '\'' || first == '"') {
+                // The opening quote doubles as the terminator to look for.
+                return str_token(lex, first);
+        } else if (is_digit(first)) {
+                return num_token(lex);
+        } else if (is_ident(first)) {
+                return ident_token(lex);
+        }
+
+        return symbol_token(lex, first);
+}
+
+// Writes a printable, NUL-terminated name for a token kind into dst.
+//
+// Kinds below TOKEN_EOF are single characters and are written as that
+// character. Other kinds are looked up in token_names; a kind past the end
+// of that table is written as "TOKEN_UNKNOWN" instead of reading out of
+// bounds.
+//
+// len is the total size of dst in bytes, terminator included. A name that
+// does not fit is truncated to len - 1 characters. Nothing is written when
+// len is 0.
+void token_kind_name(char *dst, size_t len, u16 kind)
+{
+        if (len == 0)
+                return;
+
+        char char_name[2] = {0, 0};
+        const char *name = char_name;
+
+        if (kind < TOKEN_EOF) {
+                char_name[0] = (char)kind;
+        } else {
+                size_t idx = (size_t)(kind - TOKEN_EOF);
+                size_t count = sizeof token_names / sizeof token_names[0];
+
+                name = idx < count ? token_names[idx] : "TOKEN_UNKNOWN";
+        }
+
+        // Keep one byte back for the terminator so dst[n] stays in bounds.
+        size_t n = strlen(name);
+        if (n > len - 1)
+                n = len - 1;
+
+        memcpy(dst, name, n);
+        dst[n] = '\0';
+}
+
+// Prints one line describing tok to stdout: the kind name, then line:col,
+// then the token text (tok.len characters starting at tok.start).
+void print_token(struct token tok)
+{
+        char kind_name[128];
+
+        token_kind_name(kind_name, sizeof kind_name, tok.kind);
+
+        printf("%-15s | %3d:%-2d | %-24.*s\n",
+               kind_name, tok.line, tok.col, tok.len, tok.start);
+}
author	iamcheeseman <[email protected]>	2026-04-06 17:04:05 -0400
committer	iamcheeseman <[email protected]>	2026-04-06 17:06:53 -0400
commit	957c64c7b8b5e98d8a03dd84c7e27e7991fb9dbc (patch)
tree	f5fc230703791cee8d8e7851fb87eaef07ae63a2 /uscript/lex.c