//! Proof-of-concept tokenizer with ANSI-colored token output.
//!
//! Command line (first argument):
//!   hello     write the embedded sample program (`hello`) to stdout
//!   tokenize  read source text from stdin, tokenize it and print every
//!             token in color via `std.debug.print`

const std = @import( "std" );
// zig run poc-tokenize.zig -freference-trace=16 -- hello | zig run poc-tokenize.zig -- tokenize

// const escape = @import( "../terminal-escape-codes/ansi-lib.zig" );
const escape = @import( "ansi-lib.zig" );
const rgb = escape.fmt_g_rgb;

/// Debug dump: prints one line per token (kind and source text),
/// skipping whitespace tokens.
fn print_tokens( tokens :[]const Token ) void {
    for( tokens ) |token| {
        if( token.kind == .whitespace ) { continue; }
        // std.debug.print( "token: {any}\n", .{ token } );
        std.debug.print( "token: .kind = {s: >12}, .src = {s}\n", .{ @tagName( token.kind ), token.src } );
    }
}

// https://alloc.dev/2025/05/25/syntax_highlighting
const alloc_dev = .{
    .comment   = rgb( .foreground, 0x80, 0x80, 0x80 ), // grey
    .builtin   = rgb( .foreground, 0xff, 0x70, 0x65 ), // red
    .keyword   = rgb( .foreground, 0xff, 0xbb, 0x65 ), // brighter red
    .string    = rgb( .foreground, 0xde, 0xff, 0x65 ), // yellow / green
    .number    = rgb( .foreground, 0x65, 0xff, 0xc3 ), // green / teal
    .xtype     = rgb( .foreground, 0x65, 0xdf, 0xff ), // blue / green
    .function  = rgb( .foreground, 0x65, 0x9c, 0xff ), // blue
    .variable  = rgb( .foreground, 0xb5, 0x65, 0xff ), // blue /purple
    .primitive = rgb( .foreground, 0xff, 0x65, 0xd3 ), // red /purple
    .other     = rgb( .foreground, 0xff, 0xff, 0xff ), // white
};

// https://www.nordtheme.com/
const nord = .{
    .polar_night = .{
        .dark0 = rgb( .foreground, 0x2e, 0x34, 0x40 ),
        .dark1 = rgb( .foreground, 0x3b, 0x42, 0x52 ),
        .dark2 = rgb( .foreground, 0x43, 0x4c, 0x5e ),
        .dark3 = rgb( .foreground, 0x4c, 0x56, 0x6a ),
    },
    .snow_storm = .{
        .light0 = rgb( .foreground, 0xd8, 0xde, 0xe9 ),
        .light1 = rgb( .foreground, 0xe5, 0xe9, 0xf0 ),
        .light2 = rgb( .foreground, 0xec, 0xef, 0xf4 ),
    },
    .frost = .{
        .moss      = rgb( .foreground, 0x8f, 0xbc, 0xbb ),
        .turquoise = rgb( .foreground, 0x88, 0xc0, 0xd0 ),
        .slate     = rgb( .foreground, 0x81, 0xa1, 0xc1 ),
        .navy      = rgb( .foreground, 0x5e, 0x81, 0xac ),
    },
    // aurora
    .aurora = .{
        .red    = rgb( .foreground, 0xbf, 0x61, 0x6a ),
        .orange = rgb( .foreground, 0xd0, 0x87, 0x70 ),
        .yellow = rgb( .foreground, 0xeb, 0xcb, 0x8b ),
        .green  = rgb( .foreground, 0xa3, 0xbe, 0x8c ),
        .purple = rgb( .foreground, 0xb4, 0x8e, 0xad ),
    },
};

// https://draculatheme.com/spec#color-palette
// https://github.com/dracula/dracula-theme
const dracula = .{
    .background   = rgb( .foreground, 0x28, 0x2a, 0x36 ), //  40,  42,  54  231°,  15%, 18%  Main background
    .current_line = rgb( .foreground, 0x62, 0x72, 0xa4 ), //  98, 114, 164  225°,  27%, 51%  Comments, disabled code
    .selection    = rgb( .foreground, 0x44, 0x47, 0x5a ), //  68,  71,  90  232°,  14%, 31%  Text selection
    .foreground   = rgb( .foreground, 0xf8, 0xf8, 0xf2 ), // 248, 248, 242   60°,  30%, 96%  Default text
    .comment      = rgb( .foreground, 0x62, 0x72, 0xa4 ), //  98, 114, 164  225°,  27%, 51%  Comments, disabled code
    .red          = rgb( .foreground, 0xff, 0x55, 0x55 ), // 255,  85,  85    0°, 100%, 67%  Errors, warnings, deletion
    .orange       = rgb( .foreground, 0xff, 0xb8, 0x6c ), // 255, 184, 108   31°, 100%, 71%  Numbers, constants, booleans
    .yellow       = rgb( .foreground, 0xf1, 0xfa, 0x8c ), // 241, 250, 140   65°,  92%, 76%  Functions, methods
    .green        = rgb( .foreground, 0x50, 0xfa, 0x7b ), //  80, 250, 123  135°,  94%, 65%  Strings, inherited classes
    .cyan         = rgb( .foreground, 0x8b, 0xe9, 0xfd ), // 139, 233, 253  191°,  97%, 77%  Support functions, regex
    .purple       = rgb( .foreground, 0xbd, 0x93, 0xf9 ), // 189, 147, 249  265°,  89%, 78%  Classes, types, variables
    .pink         = rgb( .foreground, 0xff, 0x79, 0xc6 ), // 255, 121, 198  326°, 100%, 74%  Keywords, storage types
};

/// Prints every token's source text prefixed with the color of its kind and
/// followed by `escape._fmt_clear_color`.
///
/// The theme is chosen by the literal in the outer `switch`
/// (currently `.alloc_dev_2`); edit that literal to try another theme.
fn print_tokens_color( tokens :[]const Token ) void {
    for( tokens ) |token| {
        std.debug.print( "{s}{s}" ++ escape._fmt_clear_color, .{
            switch( .alloc_dev_2 ) {
                inline else => @panic( "unreachable" ),
                inline .alloc_dev => switch( token.kind ) { // --- alloc_dev ---
                    .keyword    => alloc_dev.keyword,
                    .whitespace => alloc_dev.comment,
                    .identifier => alloc_dev.other,
                    .string     => alloc_dev.string,
                    .number     => alloc_dev.number,
                    .symbol     => alloc_dev.builtin, // .primitive, // .function,
                    .operator   => alloc_dev.primitive,
                },
                inline .alloc_dev_2 => switch( token.kind ) {
                    .symbol     => alloc_dev.builtin,
                    .operator   => alloc_dev.primitive, // rgb( .foreground, 0xe0, 0xa0, 0xa0 ),
                    .identifier => alloc_dev.other,
                    .whitespace => rgb( .foreground, 0x80, 0x8b, 0x98 ), // grey // "\x1b[38;2;128;139;150m", // .{ r, g, b }
                    .keyword    => rgb( .foreground, 0xb5, 0x65, 0xff ), // purple
                    .number     => alloc_dev.number,
                    .string     => alloc_dev.string,
                },
                inline .nord => switch( token.kind ) { // --- nord ---
                    .keyword    => nord.aurora.purple,
                    .whitespace => nord.snow_storm.light2,
                    .identifier => nord.snow_storm.light0,
                    .string     => nord.aurora.green,
                    .number     => nord.aurora.yellow,
                    .symbol     => nord.aurora.orange,
                    .operator   => nord.aurora.red,
                },
                inline .dracula => switch( token.kind ) { // --- dracula ---
                    .keyword    => dracula.purple,
                    .whitespace => dracula.comment,
                    .identifier => dracula.foreground,
                    .string     => dracula.green,
                    .number     => dracula.yellow,
                    .symbol     => dracula.orange,
                    .operator   => dracula.red,
                },
            },
            token.src,
        } );
    }
}

/// Entry point: dispatches on the first command-line argument
/// (`hello` or `tokenize`); anything else prints "unrecognized arg".
pub fn main() !void {
    // std.debug.print( "hello world\n", .{} );
    // TODO character literals 'x'
    // TODO builtins @import

    // std.mem.page_allocator;
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer { _ = gpa.deinit(); }
    const all = gpa.allocator();

    var args = std.process.args();
    _ = args.next(); // bin
    const arg1 = args.next() orelse "";

    if( std.mem.eql( u8, arg1, "hello" ) ) {
        // std.debug.print( "\"\"\"{s}\"\"\"\n", .{ hello } );
        try std.fs.File.stdout().writeAll( hello );
    } else if( std.mem.eql( u8, arg1, "tokenize" ) ) {
        const data = try std.fs.File.stdin().readToEndAlloc( all, std.math.maxInt( usize ) );
        defer { all.free( data ); }

        const tokens = try tokenize( all, data );
        defer { all.free( tokens ); }

        // print_tokens( tokens );
        print_tokens_color( tokens );
        std.debug.print( "\n", .{} );
    } else {
        std.debug.print( "unrecognized arg\n", .{} );
    }
}

/// Sample program emitted by the `hello` sub-command; intended to be piped
/// back into the `tokenize` sub-command.
const hello =
    \\
    \\var std = @import( "std" );
    \\
    \\pub var main = fn :void {
    \\    std.debug.print( "hello world\n", .() );
    \\};
    \\
    \\var dead_beef = 0xdead_beef;
    \\var ten_five = 0b_1010_0101;
    \\var flags = 0b_1_0_0_0_0_1_0_0_;
    \\var permissions = 0o777; // XXX
    \\var BOOBS = 80085;
    \\
    \\var poem =
    \\    /" roses are red,
    \\    /"violets are blue,
    \\    /" here there is me,
    \\    /"there there is you.
    \\;
    \\
;

/// One lexical token. `src` is a sub-slice of the text handed to `tokenize`,
/// so it carries both the token text and its location in the input.
const Token = struct {
    src :[]const u8,
    kind :Kind,

    const Kind = enum {
        keyword,
        identifier,
        number,
        string,
        symbol,
        operator,
        whitespace,
    };
};

// TODO consider a reader instead of src
// XXX reader is suboptimal because a second buffer would be required
//     all tokens refer to their own source via a slice into the original;
//     this also provides location.

/// Splits `src` into tokens.
///
/// The caller owns the returned slice and frees it with `all`; the tokens
/// themselves borrow from `src`, which must outlive them.
///
/// Token kinds produced:
///   whitespace  runs of ' ', '\t', '\n', and `//` comments up to end of line
///   keyword / identifier
///               [A-Za-z@][A-Za-z0-9@_]*, keyword when listed in `pan_keywords`
///   number      decimal, or `0b` / `0o` / `0x` prefixed, `_` allowed as separator
///   string      "..." with backslash escapes, '.' character literals, and
///               `/"` string lines up to end of line
///   operator    any entry of the `operators` table
///   symbol      any other single punctuation character
///
/// Errors: `error.partial_string`, `error.partial_character`,
/// `error.unrecognized`, or an allocation failure. On any error the offending
/// line and a caret under the current position are printed with
/// `std.debug.print`.
fn tokenize( all :std.mem.Allocator, src :[]const u8 ) ![]Token {
    var list :std.ArrayList( Token ) = .empty;
    // On success `toOwnedSlice` hands the buffer to the caller, so the list
    // only needs releasing on the error path.
    errdefer { list.deinit( all ); }

    // `str` is always the not-yet-consumed tail of `src`.
    var str = src;

    errdefer {
        std.debug.print( "error details\n", .{} );
        const parsed = src[ 0..str.ptr-src.ptr ];
        // Bounds of the line that contains the current position.
        const from = if( std.mem.lastIndexOf( u8, parsed, "\n" ) ) |i| ( i+1 ) else ( 0 );
        const to = std.mem.indexOf( u8, str, "\n" ) orelse str.len;
        const line = src[ from..( str.ptr - src.ptr + to ) ];
        std.debug.print( "----\n", .{} );
        std.debug.print( "{s}\n", .{ line } );
        for( from..( str.ptr - src.ptr ) ) |_| { std.debug.print( " ", .{} ); }
        std.debug.print( "^\n", .{} );
        std.debug.print( "----\n", .{} );
    }

    while( 0 < str.len ) {
        switch( str[0] ) {
            ' ','\t','\n' => {
                const start = str.ptr;
                str = for( str[0..], 0.. ) |c,i| {
                    const more = ( c == ' ' or c == '\t' or c == '\n' );
                    if( !more ) { break str[ i.. ]; }
                } else ( str[ str.len.. ] );
                const end = str.ptr;
                try list.append( all, .{ .kind = .whitespace, .src = start[ 0..end-start ] } );
            },
            'a'...'z', 'A'...'Z', '@' => {
                const start = str.ptr;
                str = for( str[0..], 0.. ) |c,i| {
                    const more =
                           ( 'A' <= c and c <= 'Z' )
                        or ( 'a' <= c and c <= 'z' )
                        or ( '0' <= c and c <= '9' )
                        or ( c == '@' or c == '_' )
                    ;
                    if( !more ) { break str[ i.. ]; }
                } else ( str[ str.len.. ] );
                const end = str.ptr;
                const token_src = start[ 0..end-start ];

                const pan_keywords = [_][]const u8 {
                    "nil", "undefined",
                    "true", "false",
                    "void", "bool",
                    "usize", "isize",
                    "u8", "i8", "s8", // TODO pick 'i' or 's'
                    "record", "class", // "open" inheritance
                    "enum", "variant",
                    "pub", "var", "mut", "ref",
                    "and", "or",
                    "fn", "infer", "out", "err", "return", // "trap" -> `@panic()`
                    "if", "elif", "else", // "when" alignment..
                    "for", "while", "continue", "break", "fin",
                    "switch", "case", "default", "goto",
                    "error", "catch", "defer", "errdefer", "ifnil", "iferr",
                };
                const kind :Token.Kind = for( pan_keywords ) |key| {
                    if( std.mem.eql( u8, token_src, key ) ) { break .keyword; }
                } else ( .identifier );
                try list.append( all, .{ .kind = kind, .src = token_src } );
            },
            '0'...'9' => {
                // TODO float
                const Radix = enum( u8 ) {
                    bin = 'b',
                    oct = 'o',
                    hex = 'x',
                    dec = '0',

                    /// Maps the character following a leading '0' to a radix.
                    fn fromChar( char :u8 ) @This() {
                        return switch( char ) {
                            'b', 'o', 'x' => ( @enumFromInt( char ) ),
                            else => ( .dec ),
                        };
                    }

                    /// Whether `cc` may appear in a number of this radix
                    /// ('_' is always accepted as a separator).
                    fn valid( radix :@This(), cc :u8 ) bool {
                        if( cc == '_' ) { return true; }
                        return switch( radix ) {
                            .bin => ( '0' <= cc and cc <  '2' ),
                            .oct => ( '0' <= cc and cc <  '8' ),
                            .dec => ( '0' <= cc and cc <= '9' ),
                            .hex => ( ( '0' <= cc and cc <= '9' ) or ( 'a' <= cc and cc <= 'f' ) ),
                        };
                    }
                };
                const start = str.ptr;
                const radix = if( str[0] == '0' and 1 < str.len ) radix: {
                    const detected = Radix.fromChar( str[1] );
                    // Skip the two-character prefix ("0b", "0o", "0x").
                    if( detected != .dec ) { str = str[ 2.. ]; }
                    break :radix detected;
                } else ( Radix.dec );
                str = for( str[0..], 0.. ) |c,i| {
                    const more = radix.valid( c );
                    if( !more ) { break str[ i.. ]; }
                } else ( str[ str.len.. ] );
                const end = str.ptr;
                try list.append( all, .{ .kind = .number, .src = start[ 0..end-start ] } );
            },
            '"' => {
                const start = str.ptr;
                str = str[ 1.. ]; // consume opening quote
                // TODO switch loop
                while( 0 < str.len ) {
                    switch( str[0] ) {
                        '\\' => {
                            // A backslash always consumes the following byte.
                            if( str.len < 2 ) { return error.partial_string; }
                            str = str[ @min( 2, str.len ).. ];
                        },
                        '"' => {
                            str = str[ 1.. ]; // consume closing quote
                            break;
                        },
                        else => { str = str[ 1.. ]; }
                    }
                } else {
                    // Input ended before the closing quote.
                    return error.partial_string;
                }
                try list.append( all, .{ .kind = .string, .src = start[ 0..str.ptr-start ] } );
            },
            '\'' => { // take_character
                const start = str.ptr;
                str = str[ 1.. ];
                if( str.len < 1 ) { return error.partial_character; }
                // Index of the closing apostrophe: after one byte, or after
                // two when the character is a backslash escape.
                const apos2 :u8 = if( str[0] == '\\' ) ( 2 ) else ( 1 );
                // `<=` (not `<`): `str[ apos2 ]` is only valid when
                // `apos2 < str.len`; otherwise the literal is truncated.
                if( str.len <= apos2 or str[ apos2 ] != '\'' ) { return error.partial_character; }
                str = str[ apos2+1.. ];
                // TODO consider `.kind = .character`
                try list.append( all, .{ .kind = .string, .src = start[ 0..str.ptr-start ] } );
            },
            '!',     '#','$','%','^','&','*','(',')',
            '`','-','=',
            '~','_','+',
            '[',']','\\',
            '{','}','|',
            ';', // '\'',
            ':', // '"',
            ',','.','/',
            '<','>','?',
            => {
                const is_comment = std.mem.startsWith( u8, str, "//" );
                const is_stringline = std.mem.startsWith( u8, str, "/\"" ); // XXX consider `my_numerator/"sample".len`

                // TODO multi symbol operators
                // The first matching entry wins, so an operator must be
                // listed before any shorter operator that is its prefix.
                const operators = [_][]const u8 {
                    "==", "<>", // equality
                    "+=", "-=", "%=", "*=", "/=", // arithmetic assignment
                    // "++", "--", "**",
                    "..", // range, slice
                    "<<<", ">>>", "<<", ">>", // bitwise shifts (longest first)
                    ".!", "!!", // errors ?
                    ".:", ":.", // TODO which ?
                    // ".(", ".|", ".{", // XXX inferred records, fn
                    "<+>", "<->", // bitwise and, bitwise or
                    // ? "<*>", "", "<^>", "<~>", "",
                    "[|", "|]",
                    // "***", "%%", "^^", "$$",
                };

                const token :Token = if( is_comment or is_stringline ) b: {
                    // Both forms extend to the end of the line (newline excluded).
                    const index = std.mem.indexOf( u8, str, "\n" ) orelse ( str.len );
                    // std.debug.print( "comment: \"{s}\"\n", .{ start[ 0..index ] } );
                    const kind :Token.Kind = if( is_comment ) ( .whitespace ) else ( .string );
                    break :b .{ .kind = kind, .src = str[ 0..index ] };
                } else for( operators ) |op| {
                    if( std.mem.startsWith( u8, str, op ) ) {
                        break .{ .kind = .operator, .src = str[ 0..op.len ] };
                    }
                } else (
                    // std.debug.print( "symbol: '{c}'\n", .{ str[0] } );
                    .{ .kind = .symbol, .src = str[ 0..1 ] }
                );
                str = str[ token.src.len.. ];
                try list.append( all, token );
            },
            else => {
                // std.debug.print( "value: {d}\n", .{ x } );
                // std.debug.print( "char: '{c}'\n", .{ x } );
                // std.debug.print( "index: src[{d}]\n", .{ str.ptr - src.ptr } );
                return error.unrecognized;
                // @panic( "unrecognized\n" );
                // unreachable;
                // XXX panic is for the dev to communicate to the compiler that:
                //     "THIS STATEMENT IS UNREACHABLE"
            },
        }
    }

    return list.toOwnedSlice( all );
}