summaryrefslogtreecommitdiff
path: root/uscript/lex.c
diff options
context:
space:
mode:
authoriamcheeseman <[email protected]>2026-04-06 17:04:05 -0400
committeriamcheeseman <[email protected]>2026-04-06 17:06:53 -0400
commit957c64c7b8b5e98d8a03dd84c7e27e7991fb9dbc (patch)
treef5fc230703791cee8d8e7851fb87eaef07ae63a2 /uscript/lex.c
Initial commit
Diffstat (limited to 'uscript/lex.c')
-rw-r--r--uscript/lex.c382
1 files changed, 382 insertions, 0 deletions
diff --git a/uscript/lex.c b/uscript/lex.c
new file mode 100644
index 0000000..cc41b2a
--- /dev/null
+++ b/uscript/lex.c
@@ -0,0 +1,382 @@
+#include "lex.h"
+
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+
+#include "dyn_arr.h"
+
+// Step the lexer one character forward, keeping the column counter in sync
+// with the head pointer. The argument is parenthesized so that any pointer
+// expression (not just a plain identifier) expands correctly.
+#define advance(lex) (++(lex)->head, ++(lex)->col)
+// The character currently under the lexer's head pointer.
+#define current_char(lex) (*(lex)->head)
+
+// Printable names for the named token kinds, used by token_kind_name().
+// Entry 0 is TOKEN_EOF; the remaining entries are generated from the XTOKENS
+// list, so the table is indexed by (kind - TOKEN_EOF).
+// Both the strings and the table itself are read-only.
+static
+const char *const token_names[] = {
+	"TOKEN_EOF",
+#define TOKEN_NAME(name) "TOKEN_" #name,
+	XTOKENS(TOKEN_NAME)
+#undef TOKEN_NAME
+};
+
+// True when c is one of the decimal digits '0' through '9'.
+static
+bool is_digit(char c)
+{
+	// A single unsigned comparison covers both bounds: anything below
+	// '0' wraps around to a huge value.
+	return (unsigned)(c - '0') <= (unsigned)('9' - '0');
+}
+
+// True when c lies in the range 'A' through 'Z'.
+static
+bool is_upper(char c)
+{
+	if (c < 'A')
+		return false;
+	return c <= 'Z';
+}
+
+// True when c lies in the range 'a' through 'z'.
+static
+bool is_lower(char c)
+{
+	return !(c < 'a' || c > 'z');
+}
+
+// True when c may appear in an identifier: a letter of either case or an
+// underscore. Digits are deliberately not included here; callers that allow
+// them check is_digit() separately.
+static
+bool is_ident(char c)
+{
+	if (c == '_')
+		return true;
+	return is_upper(c) || is_lower(c);
+}
+
+// Compares the lexeme currently spanned by [lex->base, lex->head) against
+// the NUL-terminated keyword b. Returns true only on an exact match, i.e.
+// same length and same bytes.
+static
+bool match_kw(const struct lexer *lex, const char *b)
+{
+	const size_t lexeme_len = (size_t)(lex->head - lex->base);
+
+	if (strlen(b) != lexeme_len)
+		return false;
+
+	return memcmp(lex->base, b, lexeme_len) == 0;
+}
+
+// Consumes the current character if, and only if, it equals c.
+// Returns true when a character was consumed, false otherwise.
+static
+bool match_char(struct lexer *lex, char c)
+{
+	if (current_char(lex) != c)
+		return false;
+
+	advance(lex);
+	return true;
+}
+
+// Skips everything that is not part of a token and leaves lex->base (and
+// lex->head) on the first character of the next token:
+//   - spaces, tabs and carriage returns are stepped over;
+//   - a newline is stepped over, bumps the line counter and resets the
+//     column to 0;
+//   - '#' starts a comment that runs up to (but not including) the next
+//     newline or the end of input.
+static
+void goto_token_start(struct lexer *lex)
+{
+	for (;;) {
+		const char c = current_char(lex);
+
+		if (c == '#') {
+			// The '#' itself is neither '\n' nor '\0', so at least
+			// one character is always consumed here.
+			do {
+				advance(lex);
+			} while (
+				current_char(lex) != '\n' &&
+				current_char(lex) != '\0'
+			);
+		} else if (c == '\n') {
+			advance(lex);
+			lex->line++;
+			lex->col = 0;
+		} else if (c == ' ' || c == '\t' || c == '\r') {
+			advance(lex);
+		} else {
+			// First character of a token (or the end of input)
+			lex->base = lex->head;
+			return;
+		}
+	}
+}
+
+// Builds a token of the given kind from the text spanned by
+// [lex->base, lex->head). The token's column is the lexer's current column
+// minus the token length, and its value is initialised with create_zilch().
+static
+struct token create_token(struct lexer *lex, u16 token_kind)
+{
+	const int span = (int)(lex->head - lex->base);
+	struct token tok;
+
+	tok.kind = token_kind;
+	tok.start = lex->base;
+	tok.len = span;
+	tok.line = lex->line;
+	tok.col = lex->col - span;
+	tok.val = create_zilch();
+
+	return tok;
+}
+
+// Builds a TOKEN_ERR token. Unlike ordinary tokens, start/len describe the
+// error message msg rather than source text; line and col are the lexer's
+// current position. msg is stored by pointer, not copied.
+static
+struct token err_token(struct lexer *lex, const char *msg)
+{
+	struct token err;
+
+	err.kind = TOKEN_ERR;
+	err.line = lex->line;
+	err.col = lex->col;
+	err.start = msg;
+	err.len = (int)strlen(msg);
+	err.val = create_zilch();
+
+	return err;
+}
+
+// Lexes a numeric literal: one or more digits, optionally followed by a
+// fractional part ('.' and at least one digit). The first digit has already
+// been consumed by the caller.
+//
+// A '.' is only treated as a decimal point when a digit follows it, so that
+// a range such as `1..5` lexes as NUM, DOT_DOT, NUM instead of swallowing the
+// first dot of the `..` operator into the number.
+static
+struct token num_token(struct lexer *lex)
+{
+	while (is_digit(current_char(lex))) {
+		advance(lex);
+	}
+
+	// Peeking at head[1] is safe: the current character is '.', so the
+	// terminating NUL cannot come before head[1].
+	if (current_char(lex) == '.' && is_digit(lex->head[1])) {
+		advance(lex); // eat the decimal point
+		while (is_digit(current_char(lex))) {
+			advance(lex);
+		}
+	}
+
+	return create_token(lex, TOKEN_NUM);
+}
+
+// An ident(ifier) token is either a keyword or an actual identifier.
+// The first character has already been consumed by the caller. The rest may
+// be letters, underscores and digits; a ':' is accepted as a namespace
+// separator only when an identifier character comes right after it.
+static
+struct token ident_token(struct lexer *lex)
+{
+	// Reserved words and the token kind each one maps to. "zilch" and
+	// "nada" are two spellings of the same token.
+	static const struct {
+		const char *text;
+		u16 kind;
+	} keywords[] = {
+		{ "if",     TOKEN_IF     },
+		{ "elseif", TOKEN_ELSEIF },
+		{ "else",   TOKEN_ELSE   },
+		{ "loop",   TOKEN_LOOP   },
+		{ "mod",    TOKEN_MOD    },
+		{ "true",   TOKEN_TRUE   },
+		{ "false",  TOKEN_FALSE  },
+		{ "zilch",  TOKEN_ZILCH  },
+		{ "nada",   TOKEN_ZILCH  },
+		{ "do",     TOKEN_DO     },
+		{ "break",  TOKEN_BREAK  },
+		{ "next",   TOKEN_NEXT   },
+		{ "in",     TOKEN_IN     },
+		{ "fun",    TOKEN_FUN    },
+		{ "ret",    TOKEN_RET    },
+		{ "let",    TOKEN_LET    },
+		{ "end",    TOKEN_END    },
+		{ "global", TOKEN_GLOBAL },
+		{ "print",  TOKEN_PRINT  },
+	};
+
+	for (;;) {
+		const char c = current_char(lex);
+		// head[1] is only inspected when c is ':', so the read never
+		// goes past the terminating NUL.
+		const bool namespacer = c == ':' && is_ident(lex->head[1]);
+
+		if (!namespacer && !is_ident(c) && !is_digit(c))
+			break;
+
+		advance(lex);
+	}
+
+	// ':' is purely a namespacer, so an identifier must not end in one.
+	// NOTE(review): the loop above only steps over a ':' when an
+	// identifier character follows, so this guard looks unreachable;
+	// kept as-is.
+	if (lex->head[-1] == ':')
+		return err_token(lex, "cannot end an identifier in ':'");
+
+	u16 kind = TOKEN_IDENT;
+
+	for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
+		if (match_kw(lex, keywords[i].text)) {
+			kind = keywords[i].kind;
+			break;
+		}
+	}
+
+	return create_token(lex, kind);
+}
+
+// Lexes the body of a string literal whose opening quote has already been
+// consumed. term is the quote character that closes the literal.
+//
+// Recognised escapes: \\ \' \" \n \t \r. Any other character after a
+// backslash (including the end of input) yields an "invalid escape sequence"
+// error token; reaching the end of input before term yields a "string never
+// terminates" error token. The scratch buffer is released on every path.
+//
+// On success the returned TOKEN_STR token spans the source text including
+// the closing quote, and its val is built from the decoded characters
+// (without the trailing NUL) via wrap_str(copy_str(...)).
+static
+struct token str_token(struct lexer *lex, char term)
+{
+	char *chars = da_create(char, 0);
+
+	while (current_char(lex) != term && current_char(lex) != '\0') {
+		if (current_char(lex) != '\\') {
+			da_append(char, &chars, current_char(lex));
+			advance(lex);
+			continue;
+		}
+		advance(lex); // eat backslash
+
+		switch (current_char(lex)) {
+		case '\\': da_append(char, &chars, '\\'); break;
+		case '\'': da_append(char, &chars, '\''); break;
+		case '"': da_append(char, &chars, '"'); break;
+		case 'n': da_append(char, &chars, '\n'); break;
+		case 't': da_append(char, &chars, '\t'); break;
+		case 'r': da_append(char, &chars, '\r'); break;
+		default:
+			da_free(chars);
+			return err_token(lex, "invalid escape sequence");
+		}
+
+		advance(lex); // eat escape
+	}
+
+	// Unterminated literal: release the scratch buffer before bailing
+	// out, otherwise it would leak.
+	if (current_char(lex) == '\0') {
+		da_free(chars);
+		return err_token(lex, "string never terminates");
+	}
+
+	advance(lex); // eat terminator
+
+	da_append(char, &chars, 0);
+
+	struct token str = create_token(lex, TOKEN_STR);
+	// da_len() counts the NUL appended above, hence the - 1
+	str.val = wrap_str(copy_str(chars, da_len(chars) - 1));
+	da_free(chars);
+	return str;
+}
+
+// A symbol token is any token that contains no alphanumeric characters.
+// c is the symbol's first character, which the caller has already consumed.
+// Single-character symbols use the character itself as the token kind;
+// ".." and the operators followed by '=' get their own kinds.
+static
+struct token symbol_token(struct lexer *lex, char c)
+{
+	u16 with_eql; // kind to use when the operator is followed by '='
+
+	switch (c) {
+	case '(':
+	case ')':
+	case '{':
+	case '}':
+	case '[':
+	case ']':
+	case ',':
+	case ';':
+	case ':':
+		return create_token(lex, c);
+	case '.':
+		if (match_char(lex, '.'))
+			return create_token(lex, TOKEN_DOT_DOT);
+		return create_token(lex, c);
+	case '+': with_eql = TOKEN_PLUS_EQL; break;
+	case '-': with_eql = TOKEN_MINUS_EQL; break;
+	case '*': with_eql = TOKEN_MULT_EQL; break;
+	case '/': with_eql = TOKEN_DIV_EQL; break;
+	case '%': with_eql = TOKEN_MOD_EQL; break;
+	case '<': with_eql = TOKEN_LTEQL; break;
+	case '>': with_eql = TOKEN_GTEQL; break;
+	case '=': with_eql = TOKEN_EQL; break;
+	case '!': with_eql = TOKEN_NEQL; break;
+	default:
+		// TODO: log unknown character
+		return err_token(lex, "unknown character");
+	}
+
+	// Every operator that reaches this point has a two-character form
+	// ending in '='; otherwise it stands for itself.
+	if (match_char(lex, '='))
+		return create_token(lex, with_eql);
+
+	return create_token(lex, c);
+}
+
+// Prepares lex to tokenize the source text src. The lexer keeps pointers
+// into src rather than a copy. Position tracking starts at line 1,
+// column 0.
+void lex_init(struct lexer *lex, const char *src)
+{
+	// Position bookkeeping
+	lex->line = 1;
+	lex->col = 0;
+
+	// All three cursors start at the beginning of the source
+	lex->src = src;
+	lex->head = src;
+	lex->base = src;
+}
+
+// Produces the next token from the source. Whitespace and comments are
+// skipped first; at the end of input a TOKEN_EOF token is returned (and
+// nothing is consumed, so further calls keep returning TOKEN_EOF).
+// Otherwise the first character is consumed and decides which specialised
+// lexing routine finishes the token.
+struct token lex_next_token(struct lexer *lex)
+{
+	goto_token_start(lex);
+
+	const char first = current_char(lex);
+
+	if (first == '\0')
+		return create_token(lex, TOKEN_EOF);
+
+	advance(lex);
+
+	if (is_digit(first))
+		return num_token(lex);
+	if (is_ident(first))
+		return ident_token(lex);
+	if (first == '\'' || first == '"')
+		return str_token(lex, first);
+
+	return symbol_token(lex, first);
+}
+
+// Writes a printable, NUL-terminated name for the token kind into dst.
+//
+// dst  - destination buffer
+// len  - capacity of dst in bytes, including room for the terminator; at
+//        most len - 1 characters of the name are written. Nothing is
+//        written when len is 0.
+// kind - kinds below TOKEN_EOF are single-character tokens and are named by
+//        that character; other kinds are looked up in token_names. A kind
+//        that falls outside the table is reported as "TOKEN_UNKNOWN" instead
+//        of reading past the end of the table.
+void token_kind_name(char *dst, size_t len, u16 kind)
+{
+	// No room even for the terminator
+	if (len == 0)
+		return;
+
+	char char_name[2] = {0, 0};
+	const char *name = char_name;
+
+	if (kind < TOKEN_EOF) {
+		char_name[0] = (char)kind;
+	} else {
+		size_t idx = (size_t)(kind - TOKEN_EOF);
+		size_t count = sizeof token_names / sizeof token_names[0];
+		name = idx < count ? token_names[idx] : "TOKEN_UNKNOWN";
+	}
+
+	// Truncate so that the terminator always fits inside dst
+	size_t n = strlen(name);
+	if (n > len - 1)
+		n = len - 1;
+
+	memcpy(dst, name, n);
+	dst[n] = '\0';
+}
+
+// Prints one line describing tok to stdout: the kind name, the line:col
+// position, and the token text. The text is printed with an explicit
+// precision of tok.len because tok.start is not NUL-terminated at the end of
+// the token.
+void print_token(struct token tok)
+{
+	char kind_name[128];
+
+	token_kind_name(kind_name, sizeof kind_name, tok.kind);
+	printf(
+		"%-15s | %3d:%-2d | %-24.*s\n",
+		kind_name, tok.line, tok.col, tok.len, tok.start
+	);
+}