#include #include #include // For debugging, can be removed #include // For debugging, can be removed #include // For malloc, realloc, free #include // For strcmp, strncmp, strchr, strstr #include // For isalnum, isdigit, isalpha, isspace #include // For errno #include // Required for stat structure and functions #include // Required for fileno() #include "waterbow.c" // --- Type Definitions --- // Structure for string slices (references to the input string) typedef struct { const char* ptr; // Pointer to the beginning of the string slice size_t len; // Length of the string slice } str_t; typedef struct { char* ptr; size_t len; } slc_t; // Enum for token types (tags) typedef enum { IDENTIFIER, KEYWORD, NUMBER, STRING, COMMENT, SYMBOL, // e.g., '(', ')', '[', ']', '{', '}', ';', ',', '.' OPERATOR, // e.g., '+', '-', '*', '/', '=', '==', '!=', '<', '>', '&&', '||' WHITESPACE, // e.g., ' ', '\t', '\n', '\r', '\v', '\f' } tag_t; const char* tag_str[] = { "IDENTIFIER", "KEYWORD", "NUMBER", "STRING", "COMMENT", "SYMBOL", "OPERATOR", "WHITESPACE" }; // Token structure typedef struct { tag_t tag; // The type of the token str_t str; // The string representation of the token's lexeme } token_t; // Enum for error codes typedef enum { SUCCESS = 0, // No error MALLOC_FAIL, // Memory allocation failed UNTERMINMINATED_STRING, // Unterminated string literal INVALID_CHARACTER_LITERAL, INVALID_NUMBER, // Malformed number literal (e.g., "123a") UNRECOGNIZED_CHAR, // Unrecognized byte/character sequence or invalid character OUT_OF_BOUNDS, // Attempt to read past end of input string (internal error, should ideally not happen) } error_t; const char* error_str[] = { "No error", "Memory allocation failed", "Unterminated string literal", "Invalid character literal ('x','\\n')", "Malformed number literal (e.g., \"123a\")", "Unrecognized byte/character sequence or invalid character", "Attempt to read past end of input string (internal error, should ideally not happen)", }; // --- Token Array Management Type --- typedef struct { token_t* tokens; // Pointer to the dynamically allocated array of tokens size_t count; // Current number of tokens in the array size_t capacity; // Current allocated capacity of the array } token_array_t; // --- Forward Declarations for Helper Functions --- static int is_c_digit(char c); static int is_c_alpha(char c); static int is_c_alnum(char c); static int is_c_whitespace(char c); static int is_c_symbol(char c); static int is_c_operator_start(char c); // token_array_t methods static error_t token_array_init(token_array_t* arr); static error_t token_array_reserve(token_array_t* arr, size_t new_capacity); static error_t token_array_add(token_array_t* arr, token_t token); static void token_array_free(token_array_t* arr); // --- Keywords List --- static const char* const C_KEYWORDS[] = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else", "enum", "extern", "float", "for", "goto", "if", "inline", "int", "long", "register", "restrict", "return", "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while", "_Alignas", "_Alignof", "_Atomic", "_Bool", "_Complex", "_Generic", "_Imaginary", "_Noreturn", "_Static_assert", "_Thread_local" }; static const size_t C_KEYWORDS_COUNT = sizeof(C_KEYWORDS) / sizeof(C_KEYWORDS[0]); // --- Helper Functions Implementation --- static int is_c_digit(char c) { return c >= '0' && c <= '9'; } static int is_c_alpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; } static int is_c_alnum(char c) { return is_c_alpha(c) || is_c_digit(c); } static int is_c_whitespace(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\v' || c == '\f'; } static int is_c_symbol(char c) { return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}' || c == ';' || c == ',' || c == '.' || c == ':' || c == '?' || c == '~' || c == '#'; } static int is_c_operator_start(char c) { return c == '+' || c == '-' || c == '*' || c == '/' || c == '%' || c == '=' || c == '!' || c == '<' || c == '>' || c == '&' || c == '|' || c == '^' || c == '~' || c == '?'; } // token_array_t method implementations static error_t token_array_init(token_array_t* arr) { if (arr == NULL) return MALLOC_FAIL; arr->tokens = NULL; arr->count = 0; arr->capacity = 0; return SUCCESS; } // Ensures array has at least 'new_capacity' static error_t token_array_reserve(token_array_t* arr, size_t new_capacity) { if (new_capacity <= arr->capacity) { return SUCCESS; // Already has enough capacity } // Prevent overflow in realloc size calculation if (new_capacity > (SIZE_MAX / sizeof(token_t))) { return MALLOC_FAIL; // Capacity too large } token_t* new_array = (token_t*)realloc(arr->tokens, new_capacity * sizeof(token_t)); if (new_array == NULL) { return MALLOC_FAIL; } arr->tokens = new_array; arr->capacity = new_capacity; return SUCCESS; } static error_t token_array_add(token_array_t* arr, token_t token) { if (arr->count >= arr->capacity) { // Double capacity, or start with 16 if currently empty error_t err = token_array_reserve(arr, (arr->capacity == 0) ? 16 : arr->capacity * 2); if (err != SUCCESS) { return err; } } arr->tokens[arr->count] = token; // Assign the entire token struct arr->count++; return SUCCESS; } static void token_array_free(token_array_t* arr) { if (arr != NULL) { free(arr->tokens); arr->tokens = NULL; arr->count = 0; arr->capacity = 0; } } // static inline size_t min( size_t a, size_t b ) { return ( b < a ? b : a ); } // static inline size_t max( size_t a, size_t b ) { return ( b > a ? b : a ); } // --- Main Tokenization Function --- //* // @brief Tokenizes a C-like source code string into an array of tokens. // // This function parses the input C-like source string and breaks it down into a // sequence of tokens, such as identifiers, keywords, numbers, strings, comments, // symbols, operators, and whitespace. The tokens are returned as a // dynamically allocated array of 'token_t' structs within the provided // 'token_array_t' structure. // // @param src A 'str_t' struct representing the input C-like source code string. // The function will only reference this string; no copies are made, // and the function does not free the source string's memory. // @param out_tokens A pointer to a 'token_array_t' structure. On success, // this structure will be populated with the dynamically allocated array of tokens // and its count and capacity. On error, its internal array will be freed and // count/capacity reset. // The caller is responsible for initializing this structure before the call // and for calling `token_array_free` on it when the tokens are no longer needed. // // @return An `error_t` enum value indicating the result of the operation. // `SUCCESS` (0) on success, or a specific error code on failure. // If an error occurs, the internal `tokens` array within `out_tokens` will be freed // and `out_tokens->count` and `out_tokens->capacity` will be set to 0. /// error_t tokenize( str_t src, token_array_t* out_tokens, slc_t* out_details ) { error_t err = token_array_init(out_tokens); if (err != SUCCESS) { return err; } // printf( "src: `%.*s`\n", (int)src.len, src.ptr ); size_t current_pos = 0; while (current_pos < src.len) { // printf( "pos: %d\n", (int)current_pos ); // printf( "( current_pos = %zu ) < ( src.len = %zu )\n", current_pos, src.len ); // Store the starting position of the current token size_t token_start_pos = current_pos; size_t token_len = 0; token_t new_token; char c = src.ptr[current_pos]; if ((c < ' ' && !is_c_whitespace(c)) || c == 127) { err = UNRECOGNIZED_CHAR; goto error; } // 1. Whitespace if (is_c_whitespace(c)) { while (current_pos < src.len && is_c_whitespace(src.ptr[current_pos])) { if( src.ptr[current_pos] == '\n' ) { current_pos++; break; } current_pos++; } token_len = current_pos - token_start_pos; // Simplified calculation new_token = (token_t){ .tag = WHITESPACE, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } // 2. Identifiers and Keywords if (is_c_alpha(c)) { while (current_pos < src.len && is_c_alnum(src.ptr[current_pos])) { current_pos++; } token_len = current_pos - token_start_pos; // Simplified calculation // Check if it's a keyword tag_t tag = IDENTIFIER; for (size_t i = 0; i < C_KEYWORDS_COUNT; ++i) { if (token_len == strlen(C_KEYWORDS[i]) && strncmp(src.ptr + token_start_pos, C_KEYWORDS[i], token_len) == 0) { tag = KEYWORD; break; } } new_token = (token_t){ .tag = tag, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } // 3. Numbers (integers only for now, can be extended) if (is_c_digit(c)) { const char *digits = "0123456789_"; if (src.ptr[current_pos] == '0' && current_pos + 1 < src.len && src.ptr[current_pos + 1] == 'x') { current_pos += 2; digits = "0123456789ABCDEFabcdef_"; } if (src.ptr[current_pos] == '0' && current_pos + 1 < src.len && src.ptr[current_pos + 1] == 'b') { current_pos += 2; digits = "01_"; } // while (current_pos < src.len && is_c_digit(src.ptr[current_pos])) while (current_pos < src.len && strchr(digits, src.ptr[current_pos])) { current_pos++; } if (current_pos < src.len && src.ptr[current_pos] == '.') { current_pos++; int decimal_pos = current_pos; while (current_pos < src.len && strchr(digits, src.ptr[current_pos])) { current_pos++; } if( current_pos == decimal_pos ) { err = INVALID_NUMBER; goto error; } } token_len = current_pos - token_start_pos; // Simplified calculation // Basic validation: ensure no non-digit characters immediately follow a number that would make it invalid // BK ?? if (current_pos < src.len && strchr(digits, src.ptr[current_pos])) { err = INVALID_NUMBER; goto error; } new_token = (token_t){ .tag = NUMBER, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } // 4. String Literals if (c == '"') { current_pos++; // Consume the opening quote while (current_pos < src.len) { char current_char_in_string = src.ptr[current_pos]; if (current_char_in_string == '"') { break; // End of string } if (current_char_in_string == '\\') { // Handle escape sequences current_pos++; if (current_pos >= src.len) { err = UNTERMINMINATED_STRING; goto error; } // UTF-8 in string literals: allowed, so no ASCII-range check on individual bytes here. } current_pos++; } if (current_pos >= src.len) { err = UNTERMINMINATED_STRING; goto error; } // `token_start_pos` is at the opening quote. `current_pos` is now at the closing quote. token_len = (current_pos - token_start_pos) + 1; // Length includes both opening and closing quotes. current_pos++; // Consume the closing quote new_token = (token_t){ .tag = STRING, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } if (c == '\'') { current_pos++; if ( src.ptr[current_pos] != '\'' && src.ptr[current_pos] != '\\' && current_pos + 1 < src.len && src.ptr[current_pos + 1] == '\'' ) { current_pos += 2; } else if( src.ptr[current_pos] == '\\' && current_pos + 2 < src.len && src.ptr[current_pos + 2] == '\'' ) { current_pos += 3; } else { err = INVALID_CHARACTER_LITERAL; goto error; } // TODO consider unicode escapes '\u...' token_len = current_pos - token_start_pos; // Simplified calculation new_token = (token_t){ .tag = STRING, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } // 5. Comments (Single-line //) if (c == '/' && current_pos + 1 < src.len && src.ptr[current_pos + 1] == '/') { current_pos += 2; // Consume "//" // `token_start_pos` is already at the first '/' while (current_pos < src.len && src.ptr[current_pos] != '\n' && src.ptr[current_pos] != '\r') { // UTF-8 in comments: allowed, so no ASCII-range check on individual bytes here. current_pos++; } // Include newline if present if (current_pos < src.len) { if (src.ptr[current_pos] == '\r') { current_pos++; // Consume '\r' if (current_pos < src.len && src.ptr[current_pos] == '\n') { current_pos++; // Consume '\n' for CRLF } } else if (src.ptr[current_pos] == '\n') { current_pos++; // Consume '\n' } token_len = current_pos - token_start_pos; // Final length including newline(s) } token_len = current_pos - token_start_pos; // Length up to but not including newline new_token = (token_t){ .tag = COMMENT, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } if (c == '#') { // NOTE: ignoring '\r' while (current_pos < src.len && src.ptr[current_pos] != '\n') { current_pos++; if (src.ptr[ current_pos - 1 ] == '\\' && current_pos < src.len) { current_pos++; } } if (src.ptr[current_pos] == '\n') { current_pos++; } token_len = current_pos - token_start_pos; new_token = (token_t){ .tag = SYMBOL, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; // fprintf( stderr, "macro: `%.*s`\n", (int)token_len, &src.ptr[token_start_pos] ); continue; } // 6. Operators (multi-character first, then single-character) if (is_c_operator_start(c)) { // Check for two-character operators first if (current_pos + 1 < src.len) { char next_c = src.ptr[current_pos + 1]; // Common two-char operators. Order matters for greedy matching. if ((c == '=' && next_c == '=') || (c == '!' && next_c == '=') || (c == '>' && next_c == '=') || (c == '<' && next_c == '=') || (c == '&' && next_c == '&') || (c == '|' && next_c == '|') || (c == '+' && next_c == '+') || (c == '-' && next_c == '-') || (c == '-' && next_c == '>') || (c == '<' && next_c == '<') || (c == '>' && next_c == '>') || (c == '+' && next_c == '=') || (c == '-' && next_c == '=') || (c == '*' && next_c == '=') || (c == '/' && next_c == '=') || (c == '%' && next_c == '=') || (c == '&' && next_c == '=') || (c == '|' && next_c == '=') || (c == '^' && next_c == '=') ) { token_len = 2; current_pos += 2; new_token = (token_t){ .tag = OPERATOR, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } } // Single-character operators (if not part of a two-char operator) if (c == '+' || c == '-' || c == '*' || c == '%' || c == '=' || c == '!' || c == '<' || c == '>' || c == '&' || c == '|' || c == '^' || c == '~' || c == '/' ) { token_len = 1; current_pos += 1; new_token = (token_t){ .tag = OPERATOR, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } } // 7. Symbols (single character) if (is_c_symbol(c)) { token_len = 1; current_pos += 1; new_token = (token_t){ .tag = SYMBOL, .str = { .ptr = src.ptr + token_start_pos, .len = token_len } }; err = token_array_add(out_tokens, new_token); if (err != SUCCESS) goto error; continue; } // If none of the above matched, it's an unrecognized character err = UNRECOGNIZED_CHAR; goto error; } printf( "happy loop exit\n" ); printf( "pos: %d\n", (int)current_pos ); printf( "( current_pos = %zu ) <= ( src.len = %zu )\n", current_pos, src.len ); // Success: Tokenization complete // fprintf( stderr, "error: %.*s\n", (int)strlen( error_str[err] ), error_str[err] ); assert( current_pos == src.len ); assert( err == SUCCESS ); return SUCCESS; error: // On error, free any allocated tokens within the token_array_t and reset it printf( "error\n" ); fprintf( stderr, "context: `%.*s`\n", 64, &src.ptr[current_pos] ); token_array_free(out_tokens); if( err == UNRECOGNIZED_CHAR ) { fprintf( stderr, "position: %zu <= src.len: %zu\n" , current_pos, src.len ); fprintf( stderr, "char: '%c'\n" , src.ptr[current_pos] ); fprintf( stderr, "char code: 0x%x\n" , src.ptr[current_pos] ); } return err; } int main( int argc, const char* argv[] ) { printf( "hello world\n" ); const char abc[] = "abc"; assert( sizeof( abc ) - 1 == 3 ); // FILE *f = fopen( "c-like-tokenizer.c", "r" ); FILE *f = fopen( "waterbow.c", "r" ); struct stat st = {0}; fstat( fileno( f ), &st ); size_t size = st.st_size; char *buf = malloc( size ); if( !buf ) { fprintf( stderr, "malloc failed\n" ); exit( 1 ); } size_t read = fread( buf, 1, size, f ); if( size != read ) { fprintf( stderr, "failed to read full file\n" ); exit( 2 ); } // note: `sizeof( array ) == array.len` but `sizeof( ptr ) == sizeof( usize )` // meaning `sizeof( const char* ) == MACHINE_WORD_SIZE` but `sizeof( const char[] ) == len` const char src[] = // XXX const char* src = "#include \n" "\n" "int main( int argc, const char* argv[] ) {\n" " printf( \"hello world\\n\" );" " return 0;\n" "}\n" "\n" ; // char buffer[4096] = {0}; // slc_t buf_str = { .ptr = (char*)&buffer, .len = sizeof(buffer) }; // str_t str = { .ptr = src, .len = sizeof( src ) - 1 }; // printf( "sizeof( src ) == %zu\n", sizeof( src ) ); str_t str = { .ptr = buf, .len = size }; printf( "str.len == %zu\n", size ); token_array_t tokens = {0}; error_t err; err = token_array_init( &tokens ); if( err != SUCCESS ) { fprintf( stderr, "token_array_init failed\n" ); goto err1; }; err = tokenize( str, &tokens, NULL ); if( err != SUCCESS ) { fprintf( stderr, "tokenize failed\n" ); goto err0; } // void f_color_clear ( FILE *f ) // void f_color_palette ( FILE *f, ground_t ground, unsigned char index ) // void f_color_palette_rgb ( FILE *f, ground_t ground, size_t r, size_t g, size_t b ) // void f_color_palette_mono ( FILE *f, ground_t ground, size_t white ) // void f_color_truecolor ( FILE *f, ground_t ground, unsigned char r, unsigned char g, unsigned char b ) // BLACK = 0, // RED = 1, // GREEN = 2, // ORANGE = 3, // BLUE = 4, // PURPLE = 5, // CYAN = 6, // WHITE = 7, // https://alloc.dev/2025/05/25/syntax_highlighting // Comments: #808080 (RGB: 128, 128, 128) // Builtins: #ff7065 (RGB: 255, 112, 101) // Keywords: #ffbb65 (RGB: 255, 187, 101) // Strings: #deff65 (RGB: 222, 255, 101) // Numbers: #65ffc3 (RGB: 101, 255, 195) // Types: #65dfff (RGB: 101, 223, 255) // functions(): #659cff (RGB: 101, 156, 255) // var_names: #b565ff (RGB: 181, 101, 255) // Primitives: #ff65d3 (RGB: 255, 101, 211) // Default: #ffffff (RGB: 255, 255, 255) #define COLOR_GREY 128, 128, 128 #define COLOR_RED 255, 112, 101 #define COLOR_ORANGE 255, 187, 101 #define COLOR_YELLOW 222, 255, 101 #define COLOR_GREEN 101, 255, 195 #define COLOR_TEAL 101, 223, 255 #define COLOR_BLUE 101, 156, 255 #define COLOR_PURPLE 181, 101, 255 #define COLOR_PINK 255, 101, 211 #define COLOR_WHITE 255, 255, 255 for( int i = 0; i < tokens.count; i++ ) { // tokens, count, capacity token_t* token = &tokens.tokens[i]; // ignore whitespace at end of lines // TODO handle whitespace in comments and macros if( token->tag == WHITESPACE && 0 < token->str.len && token->str.ptr[ token->str.len - 1 ] == '\n' ) { fprintf( stdout, "\n" ); continue; } switch( token->tag ) { // // case IDENTIFIER : f_color_palette ( stdout, FOREGROUND, WHITE ); break; // case IDENTIFIER : f_color_truecolor ( stdout, FOREGROUND, 101,223,255 ); break; // case KEYWORD : f_color_palette ( stdout, FOREGROUND, ORANGE ); break; // case NUMBER : f_color_palette ( stdout, FOREGROUND, GREEN ); break; // case STRING : f_color_truecolor ( stdout, FOREGROUND, 222,255,101 ); break; // case COMMENT : f_color_palette_mono ( stdout, FOREGROUND, 12 ); break; // case SYMBOL : f_color_palette ( stdout, FOREGROUND, RED ); break; // case OPERATOR : f_color_palette ( stdout, FOREGROUND, PURPLE ); break; // case WHITESPACE : break; case IDENTIFIER : f_color_truecolor( stdout, FOREGROUND, COLOR_BLUE ); break; case KEYWORD : f_color_truecolor( stdout, FOREGROUND, COLOR_TEAL ); break; case NUMBER : f_color_truecolor( stdout, FOREGROUND, COLOR_ORANGE ); break; case STRING : f_color_truecolor( stdout, FOREGROUND, COLOR_GREEN ); break; case COMMENT : f_color_truecolor( stdout, FOREGROUND, COLOR_GREY ); break; case SYMBOL : f_color_truecolor( stdout, FOREGROUND, COLOR_PURPLE ); break; case OPERATOR : f_color_truecolor( stdout, FOREGROUND, COLOR_PINK ); break; case WHITESPACE : break; default: // fprintf( stdout, "!" ); f_color_palette( stdout, BACKGROUND, WHITE ); } fprintf( // stdout, "`%.*s", stdout, "%.*s", (int)token->str.len, token->str.ptr ); f_color_clear( stdout ); } printf( "fin / success\n" ); err0: token_array_free( &tokens ); err1: if( err != SUCCESS ) { fprintf( stderr, "error code %d: %s\n", err, error_str[err] ); } free( buf ); fclose( f ); return err; }