const std = @import( "std" ); const testing = std.testing; const mem = std.mem; const ArrayList = std.ArrayList; const Allocator = std.mem.Allocator; // create & destroy // alloc & free // const print = std.debug.print; const print = struct { fn noop( _ :[]const u8, _ :anytype ) void {} }.noop; // while inotifywait main.zig ; do // clear ; // date --iso-8601=seconds ; // echo ; // zig test Grammar.zig ; // echo ; // done // https://github.com/ziglang/zig/blob/master/lib/std/fmt.zig#L69C1-L77C1 // pub fn format(value: ?, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype) !void const expected_grammar :Grammar = Grammar { .rules = &.{ Rule { .identifier = "start", .choice = &.{ &.{ Expression { .identifier = .{ .identifier = "_" } }, Expression { .zero_or_more = .{ .expression = &Expression { .identifier = .{ .identifier = "rule" } } } }, }, } }, Rule { .identifier = "_", .choice = &.{ &.{ Expression { .zero_or_more = .{ .expression = &Expression { .identifier = .{ .identifier = "space" } } } }, }, } }, Rule { .identifier = "space", .choice = &.{ &.{ Expression { .literal = .{ .literal = " " } }, }, &.{ Expression { .literal = .{ .literal = "\t" } }, }, &.{ Expression { .literal = .{ .literal = "\n" } }, }, &.{ Expression { .identifier = .{ .identifier = "comment" } }, }, } }, Rule { .identifier = "comment", .choice = &.{ &.{ Expression { .literal = .{ .literal = "//" } }, Expression { .zero_or_more = .{ .expression = &Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xfffffffffffffbff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } } } }, Expression { .literal = .{ .literal = "\n" } }, }, } }, Rule { .identifier = "rule", .choice = &.{ &.{ Expression { .identifier = .{ .identifier = "identifier" } }, Expression { .literal = .{ .literal = "=" } }, Expression { .identifier = .{ .identifier = "_" } }, Expression { .identifier = .{ .identifier = "choice" } }, Expression { .literal = .{ .literal = ";" } }, Expression { .identifier = .{ .identifier = "_" } }, }, } }, Rule { .identifier = "identifier", .choice = &.{ &.{ Expression { .one_or_more = .{ .expression = &Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x07fffffe87fffffe, 0x0000000000000000, 0x0000000000000000, } } } } } }, Expression { .identifier = .{ .identifier = "_" } }, }, } }, Rule { .identifier = "choice", .choice = &.{ &.{ Expression { .identifier = .{ .identifier = "sequence" } }, Expression { .zero_or_more = .{ .expression = &Expression { .parenthesis = .{ .choice = &.{ &.{ Expression { .literal = .{ .literal = "|" } }, Expression { .identifier = .{ .identifier = "_" } }, Expression { .identifier = .{ .identifier = "sequence" } }, }, } } } } }, }, } }, // XXX // Rule { .identifier = "choice_", .choice = &.{ &.{ // Expression { .literal = .{ .literal = "|" } }, // Expression { .identifier = .{ .identifier = "_" } }, // Expression { .identifier = .{ .identifier = "sequence" } }, // }, } }, Rule { .identifier = "sequence", .choice = &.{ &.{ Expression { .identifier = .{ .identifier = "expression" } }, Expression { .zero_or_more = .{ .expression = &Expression { .parenthesis = .{ .choice = &.{ &.{ Expression { .identifier = .{ .identifier = "space" } }, Expression { .identifier = .{ .identifier = "expression" } }, }, } } } } }, }, } }, // XXX // Rule { .identifier = "sequence_", .choice = &.{ &.{ // Expression { .identifier = .{ .identifier = "space" } }, // Expression { .identifier = .{ .identifier = "expression" } }, // }, } }, Rule { .identifier = "expression", .choice = &.{ &.{ Expression { .literal = .{ .literal = "(" } }, Expression { .identifier = .{ .identifier = "_" } }, Expression { .identifier = .{ .identifier = "choice" } }, Expression { .literal = .{ .literal = ")" } }, Expression { .identifier = .{ .identifier = "_" } }, }, &.{ Expression { .identifier = .{ .identifier = "expression" } }, Expression { .literal = .{ .literal = "?" } }, Expression { .identifier = .{ .identifier = "_" } }, }, &.{ Expression { .identifier = .{ .identifier = "expression" } }, Expression { .literal = .{ .literal = "+" } }, Expression { .identifier = .{ .identifier = "_" } }, }, &.{ Expression { .identifier = .{ .identifier = "expression" } }, Expression { .literal = .{ .literal = "*" } }, Expression { .identifier = .{ .identifier = "_" } }, }, &.{ Expression { .identifier = .{ .identifier = "item" } }, }, } }, Rule { .identifier = "item", .choice = &.{ &.{ Expression { .identifier = .{ .identifier = "identifier" } }, }, &.{ Expression { .identifier = .{ .identifier = "literal" } }, }, &.{ Expression { .identifier = .{ .identifier = "set" } }, }, } }, Rule { .identifier = "literal", .choice = &.{ &.{ Expression { .literal = .{ .literal = "\"" } }, Expression { .one_or_more = .{ .expression = &Expression { .parenthesis = .{ .choice = &.{ &.{ Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xfffffffbfffffbff, 0xffffffffefffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, }, &.{ Expression { .literal = .{ .literal = "\\" } }, Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xfffffffffffffbff, 0xfeffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, }, &.{ Expression { .literal = .{ .literal = "\\x" } }, Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x0000007e0000007e, 0x0000000000000000, 0x0000000000000000, } } } }, Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x0000007e0000007e, 0x0000000000000000, 0x0000000000000000, } } } }, }, } } } } }, Expression { .literal = .{ .literal = "\"" } }, Expression { .identifier = .{ .identifier = "_" } }, }, } }, // XXX // Rule { .identifier = "literal_", .choice = &.{ // &.{ // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xfffffffbfffffbff, 0xffffffffefffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, // }, // &.{ // Expression { .literal = .{ .literal = "\\" } }, // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xfffffffffffffbff, 0xfeffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, // }, // &.{ // Expression { .literal = .{ .literal = "\\x" } }, // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x0000007e0000007e, 0x0000000000000000, 0x0000000000000000, } } } }, // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x0000007e0000007e, 0x0000000000000000, 0x0000000000000000, } } } }, // }, // } }, Rule { .identifier = "set", .choice = &.{ &.{ Expression { .literal = .{ .literal = "[" } }, Expression { .one_or_more = .{ .expression = &Expression { .parenthesis = .{ .choice = &.{ &.{ Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xffffffffffffffff, 0xffffffffcfffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, }, &.{ Expression { .literal = .{ .literal = "\\" } }, Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xffffffffffffffff, 0xfeffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, }, &.{ Expression { .literal = .{ .literal = "\\x" } }, Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x07fffffe07fffffe, 0x0000000000000000, 0x0000000000000000, } } } }, Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x07fffffe07fffffe, 0x0000000000000000, 0x0000000000000000, } } } }, }, } } } } }, Expression { .literal = .{ .literal = "]" } }, Expression { .identifier = .{ .identifier = "_" } }, }, } }, // XXX // Rule { .identifier = "set_", .choice = &.{ // &.{ // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xffffffffffffffff, 0xffffffffcfffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, // }, // &.{ // Expression { .literal = .{ .literal = "\\" } }, // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0xffffffffffffffff, 0xfeffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, } } } }, // }, // &.{ // Expression { .literal = .{ .literal = "\\x" } }, // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x07fffffe07fffffe, 0x0000000000000000, 0x0000000000000000, } } } }, // Expression { .character_set = .{ .char_set = Char_Set { .bits = .{ 0x03ff000000000000, 0x07fffffe07fffffe, 0x0000000000000000, 0x0000000000000000, } } } }, // }, // } }, } }; test "parse Grammar" { var grammar_str = \\ \\// after every literal, whitespace is optional \\// except in expression-sequences \\ \\start = _ rule* ; \\_ = space* ; \\space = " " | "\t" | "\n" | comment ; \\comment = "//" [^\n]* "\n" ; \\rule = identifier "=" _ choice ";" _ ; \\identifier = [_a-zA-Z0-9]+ _ ; \\choice = sequence ( "|" _ sequence )* ; // \\choice_ = "|" _ sequence ; \\sequence = expression ( space expression )* ; // \\sequence_ = space expression ; \\expression = "(" _ choice ")" _ \\ | expression "?" _ \\ | expression "+" _ \\ | expression "*" _ \\ | item \\ ; \\item = identifier \\ | literal \\ | set \\ ; \\literal = "\"" ( [^\"\n\\] | "\\" [^x\n] | "\\x" [0-9a-fA-F] [0-9a-fA-F] )+ "\"" _ ; // \\literal_ = [^\"\n\\] | "\\" [^x\n] | "\\x" [0-9a-fA-F] [0-9a-fA-F] ; \\set = "[" ( [^\]\\] | "\\" [^x] | "\\x" [0-9a-zA-Z] [0-9a-zA-Z] )+ "]" _ ; // \\set_ = [^\]\\] | "\\" [^x] | "\\x" [0-9a-zA-Z] [0-9a-zA-Z] ; \\ ; // var buffer = [_]u8 { undefined } ** grammar_str.len; // @memcpy( &buffer, grammar_str ); var buffer // :[grammar_str.len]u8 = grammar_str.* ; _ = buffer; // var var_str :[comptime_const_str.len]u8 = comptime_const_str.*; // print( "\n", .{} ); // var allocator = std.heap.page_allocator; var allocator = testing.allocator; // noisy when debugging logic // var grammar_slice :[]const u8 = &buffer; // ^^^^^ var grammar_slice :[]const u8 = grammar_str[0..]; const grammar = Grammar.parse( allocator, &grammar_slice ) catch |e| { print( "\nerror: {s}\n", .{ @errorName( e ) } ); print( "\nremaining:\n{s}\n", .{ grammar_slice } ); return e; }; defer grammar.free( allocator ); try testing.expectEqualDeep( expected_grammar, grammar ); print( "\n", .{} ); print( "{}\n", .{ grammar } ); try testing.expect( true ); } test "Grammar negative" { const grammar_str = \\ \\// after every literal, whitespace is optional \\// except in expression-sequences \\ \\start = _ rule* ; \\_ = space* ; \\space = " " | "\t" | "\n" | comment ; \\comment = "//" [^\n]* "\n" ; \\rule = identifier "=" _ choice ";" _ ; \\identifier = [_a-zA-Z0-9]+ _ ; \\choice = sequence ( "|" _ sequence )* ; // \\choice_ = "|" _ sequence ; \\sequence = expression ( space expression )* ; // \\sequence_ = space expression ; \\expression = "(" _ choice ")" _ \\ | expression "?" _ \\ | expression "+" _ \\ | expression "*" _ \\ | item \\ ; \\item = identifier \\ | literal \\ | set \\ ; \\literal = "\"" ( [^\"\n\\] | "\\" [^x\n] | "\\x" [0-9a-fA-F] [0-9a-fA-F] )+ "\"" _ ; // \\literal_ = [^\"\n\\] | "\\" [^x\n] | "\\x" [0-9a-fA-F] [0-9a-fA-F] ; \\set = "[" ( [^\]\\] | "\\" [^x] | "\\x" [0-9a-zA-Z] [0-9a-zA-Z] )+ "]" _ ; // \\set_ = [^\]\\] | "\\" [^x] | "\\x" [0-9a-zA-Z] [0-9a-zA-Z] ; \\x ; const err = testParse( grammar_str, Grammar, testing.allocator ); try testing.expectError( error.end_of_input, err ); } test "Grammar Expression Sequence Space" { const str = "start = \"hello\"\"world\" ;"; var buf = str.*; var slc :[]u8 = buf[0..]; const grammar_err = Grammar.parse( testing.allocator, &slc ); try testing.expectError( error.no_whitespace, grammar_err ); const err = testParse( "start = \"hello\"\"world\" ;", Grammar, testing.allocator ); try testing.expectError( error.no_whitespace, err ); const hello_world = try testParse( "start = \"hello\" \"world\" ;", Grammar, testing.allocator ); defer hello_world.free( testing.allocator ); const expected_hello_world :Grammar = Grammar { .rules = &.{ Rule { .identifier = "start", .choice = &.{ &.{ Expression { .literal = .{ .literal = "hello" } }, Expression { .literal = .{ .literal = "world" } }, }, } }, } }; try testing.expectEqualDeep( expected_hello_world, hello_world ); } /// `str` is : `*[_]const u8` and /// `TypeWithParse` must have : `fn TWP.parse( allocator :Allocator, ptrToSlice :*[]u8 ) !TWP` ; fn testParse( comptime str :anytype , comptime TypeWithParse :type, allocator :Allocator ) !TypeWithParse { var buf = str.*; var slc :[]u8 = buf[0..]; return TypeWithParse.parse( allocator, &slc ); } // `*[]const u8` vs `std.fmt.Parser` // https://ziglang.org/documentation/master/std/src/std/fmt.zig.html#L297 // Parser { char, peek, maybe, until } const Parser = struct { slc :*[]const u8, /// are there still bytes to consume ? pub fn more( self :Parser ) bool { return ( 0 < self.slc.*.len ); } /// the next byte to consume or null pub fn peek( self :Parser ) ?u8 { return if( self.more() ) self.slc.*[0] else null; } /// consume the next byte or null pub fn char( self :Parser ) ?u8 { if( self.peek() ) |c| { defer self.slc.* = self.slc.*[1..]; return c; } else { return null; } } /// if `peek == c` then consume and true else false pub fn maybe( self :Parser, c :u8 ) bool { if( self.peek() == c ) { _ = self.char(); return true; } else { return false; } } /// get slice up to `c` /// error if EOF pub fn until( self :Parser, c :u8 ) ![]const u8 { // while( self.more() != self.peek() ) const slice = self.slc.*; var i :usize = 0; while( self.char() ) |c2| :( i += 1 ) { // std.debug.print( "char {c}, str \"{s}\"\n", .{ c2, slice[0..i] } ); if( c == c2 ) { // break; return slice[0..i]; } } // return slice[0..i]; return error.expected_terminal; } }; test "Parser" { var slice :[]const u8 = "hello world"; const parser = Parser { .slc = &slice }; // try testing.expect( false ); try testing.expect( parser.more() ); try testing.expectEqual( @as( u8, 'h' ), parser.peek().? ); try testing.expectEqual( @as( u8, 'h' ), parser.char().? ); try testing.expectEqual( @as( u8, 'e' ), parser.char().? ); try testing.expectEqual( @as( u8, 'l' ), parser.char().? ); try testing.expectEqual( @as( u8, 'l' ), parser.char().? ); try testing.expectEqual( @as( u8, 'o' ), parser.char().? ); try testing.expectEqual( @as( u8, ' ' ), parser.char().? ); try testing.expect( !parser.maybe( 'x' ) ); try testing.expect( parser.maybe( 'w' ) ); try testing.expectEqualStrings( "orl", try parser.until( 'd' ) ); try testing.expect( !parser.more() ); try testing.expect( parser.peek() == null ); try testing.expect( parser.char() == null ); try testing.expect( !parser.maybe( '$' ) ); try testing.expectError( error.expected_terminal, parser.until( '$' ) ); } test "Parser 2" { var slice :[]const u8 = "[alpha|omega]"; const parser = Parser { .slc = &slice }; try testing.expect( parser.maybe( '[' ) ); try testing.expectEqualStrings( "alpha|omega", try parser.until( ']' ) ); } const Grammar = struct { // start rules :[]const Rule, pub fn parse( allocator :Allocator, src :*[]const u8 ) !Grammar { parse_whitespace( src ); var list = ArrayList( Rule ).init( allocator ); // defer list.deinit(); // errdefer Grammar.free( &.{ .rules = list.items }, allocator ); errdefer { for( list.items ) |rule| { rule.free( allocator ); } list.deinit(); } while( 0 < src.len ) { switch( src.*[0] ) { '_','a'...'z','A'...'Z','0'...'9' => { // identifier lead var rule = try Rule.parse( allocator, src ); try list.append( rule ); }, else => { return error.bad_char; }, } } return .{ .rules = try list.toOwnedSlice(), }; } pub fn free( self :*const Grammar, allocator :Allocator ) void { for( self.rules ) |rule| { rule.free( allocator ); } allocator.free( self.rules ); } pub fn format( value: Grammar, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) !void { _ = options; _ = fmt; // print( "options: {}\n", .{ options } ); // XXX deadlock // print( "fmt: {any}\n", .{ fmt } ); // XXX deadlock // print( "writer: {}\n", .{ writer } ); // XXX deadlock // @compileLog( options ); // fmt.FormatOptions, presumably parsed fmt... // @compileLog( fmt ); // "", value inside the curlies // @compileLog( writer ); //" @as( io.writer.Writer( fs.file.File, //" error{ AccessDenied,Unexpected,DiskQuota,FileTooBig,InputOutput,NoSpaceLeft,DeviceBusy,InvalidArgument,BrokenPipe,SystemResources,OperationAborted,NotOpenForWriting,LockViolation,WouldBlock,ConnectionResetByPeer }, //" (function 'write')), //" [runtime value] ) // try writer.writeAll( "from Grammar.format()\n" ); try writer.print( "Grammar {{ .rules = [", .{} ); if( value.rules.len != 0 ) { for( value.rules ) |rule| { try writer.print( " {},", .{ rule } ); } try writer.print( " ", .{} ); } else { // zero rules } try writer.print( "] }}", .{} ); } }; test "Rule" { const err = testParse( "start = \"hello\"\"world\" ;", Rule, testing.allocator ); try testing.expectError( error.no_whitespace, err ); const hello_world = try testParse( "start = \"hello\" \"world\" ;", Rule, testing.allocator ); defer hello_world.free( testing.allocator ); const expected_hello_world :Rule = Rule { .identifier = "start", .choice = &.{ &.{ Expression { .literal = .{ .literal = "hello" } }, Expression { .literal = .{ .literal = "world" } }, }, } }; try testing.expectEqualDeep( expected_hello_world, hello_world ); } const Rule = struct { identifier :[]const u8, choice :[]const []const Expression, pub fn parse( allocator :Allocator, src :*[]const u8 ) !Rule { // parse identifier var identifier :[]u8 = try Identifier.parse( allocator, src ); errdefer Identifier.free( identifier, allocator ); try consume( src, '=' ); parse_whitespace( src ); var choice :[][]Expression = try Choice.parse( allocator, src ); errdefer Choice.free( choice, allocator ); try consume( src, ';' ); parse_whitespace( src ); return .{ .identifier = identifier, .choice = choice, }; } pub fn free( self :*const Rule, allocator :Allocator ) void { Identifier .free( self.identifier, allocator ); Choice .free( self.choice, allocator ); // allocator .free( self.choice ); } pub fn format( value: Rule, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) !void { // try writer.print( "", .{} ); // Rule { .identifier = "asdf", choice: [ ... ] } try writer.print( "Rule {{ .identifier = ", .{} ); try Identifier.format( value.identifier, fmt, options, writer ); try writer.print( ", choice: ", .{} ); try Choice.format( value.choice, fmt, options, writer ); // if( value.choice.len != 0 ) { // for( value.choice ) |sequence| { // try writer.print( " ", .{} ); // try Sequence.format( sequence, fmt, options, writer ); // try writer.print( ",", .{} ); // } // try writer.print( " ", .{} ); // } try writer.print( " }}", .{} ); } }; pub fn parse_whitespace( src :*[]const u8 ) void { // var len = src.len; while( 0 < src.len ) { switch( src.*[0] ) { ' ','\t','\r','\n', 0xb, 0xc, => { // '\v','\f' => { // is_whitespace( src.*[0] ) var i :usize = 0; while( i < src.len and is_whitespace( src.*[i] ) ) { i += 1; } src.* = src.*[i..]; }, '/' => { if( mem.startsWith( u8, src.*, "//" ) ) { // src.* = src[ "//".len.. ]; // while( 0 < src.len and src.*[0] != '\n' ) { src.* = src.*[1..]; }; var i :usize = "//".len; while( i < src.len and src.*[i] != '\n' ) { i += 1; } src.* = src.*[i..]; } else { break; } }, else => { break; }, } } // return len - src.len; } pub fn is_whitespace( char :u8 ) bool { return switch( char ) { ' ','\t','\r','\n',0xb,0xc, => true, // '\v','\f' => true, else => false, }; } // const mem = struct { // non-struct namespace ?? // pub fn startsWith( slice :[]u8, prefix :[]u8 ) bool { // if( slice.len < prefix.len ) { return false; } // return for( slice[ 0..prefix.len ], prefix ) |a,b| { // if( a != b ) { break false; } // } orelse true; // } // }; const assert = testing.expect; test "Choice" { var slice :[]const u8 = "\"hello\" | \"world\" "[0..]; // also okay: &buf; const helloWorld = try Choice.parse( testing.allocator, &slice ); defer Choice.free( helloWorld, testing.allocator ); const expectedHelloWorld :[]const []const Expression = &.{ &.{ Expression { .literal = .{ .literal = "hello" } }, }, &.{ Expression { .literal = .{ .literal = "world" } }, }, }; try testing.expectEqualDeep( expectedHelloWorld, helloWorld ); } test "Choice negative" { var slice :[]const u8 = "alpha? beta+ | gamma* delta | ".*[0..]; // @compileLog( @TypeOf( "str" ) ); // *const [_:0]u8 try testing.expectError( error.empty_source, Choice.parse( testing.allocator, &slice ) ); } const Choice = struct { const Choice_parse_errors = error { bad_char, early_eof, empty_identifier, empty_source, end_of_input, no_whitespace, invalid_expression, invalid_hex, partial_atom_0, partial_atom_1, partial_atom_4, unmatched_paren, OutOfMemory, TestUnexpectedResult, }; pub fn parse( allocator :Allocator, src :*[]const u8 ) Choice_parse_errors![][]Expression { var sequence_list = ArrayList( []Expression ).init( allocator ); // defer sequence_list.deinit(); errdefer { // Choice.free( sequence_list.items, allocator ); for( sequence_list.items ) |item| { Sequence.free( item, allocator ); } sequence_list.deinit(); } try sequence_list.append( try Sequence.parse( allocator, src ) ); while( 0 < src.len and src.*[0] == '|' ) { src.* = src.*[1..]; parse_whitespace( src ); try sequence_list.append( try Sequence.parse( allocator, src ) ); } return try sequence_list.toOwnedSlice(); } pub fn free( self :[]const []const Expression, allocator :Allocator ) void { for( self ) |sequence| { Sequence.free( sequence, allocator ); } allocator.free( self ); } pub fn format( value: []const []const Expression, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) !void { try writer.print( "Choice [", .{} ); if( value.len != 0 ) { for( value ) |sequence| { try writer.print( " ", .{} ); try Sequence.format( sequence, fmt, options, writer ); try writer.print( ",", .{} ); } try writer.print( " ", .{} ); } try writer.print( "]", .{} ); } }; test "Sequence positive" { var slc :[]const u8 = "\"hello\" \"world\" "[0..]; const helloWorld = try Sequence.parse( testing.allocator, &slc ); defer Sequence.free( helloWorld, testing.allocator ); const expectedHelloWorld :[]const Expression = &.{ Expression { .literal = .{ .literal = "hello" } }, Expression { .literal = .{ .literal = "world" } }, }; try testing.expectEqualDeep( expectedHelloWorld, helloWorld ); } test "Sequence negative" { // leading space in the buffer is important for Sequence.parse() var slc :[]const u8 = " \"hello\"\"world\" "[1..]; const err = Sequence.parse( testing.allocator, &slc ); // defer Sequence.free( err, testing.allocator ); print( "sequence slice: {any}\n", .{ err } ); try testing.expectError( error.no_whitespace, err ); } const Sequence = struct { pub fn parse( allocator :Allocator, src :*[]const u8 ) ![]Expression { var ptr_0 = src.ptr; var expression_list = ArrayList( Expression ).init( allocator ); errdefer { // Sequence.free( expression_list.items, allocator ); for( expression_list.items ) |expr| { expr.free( allocator ); } expression_list.deinit(); // hmm.. so slices need to be freed with len, not just the base pointer } try expression_list.append( try Expression.parse( allocator, src ) ); try assert( @intFromPtr( ptr_0 ) < @intFromPtr( src.ptr ) ); try testing.expectEqual( mem.byte_size_in_bits, @typeInfo( u8 ).Int.bits ); while( 0 < src.len and switch( src.*[0] ) { '(', '_','a'...'z','A'...'Z','0'...'9', '[', '"' => true, else => false, } ) { // src.*.ptr[-1] // type 'usize' cannot represent integer value '-1' var prev_u8 :*u8 = @ptrFromInt( @intFromPtr( src.*.ptr ) - 1 ); if( is_whitespace( prev_u8.* ) ) { var expression = try Expression.parse( allocator, src ); try expression_list.append( expression ); } else { // break; return error.no_whitespace; // return error.bad_char; } // XXX this will accept expressions without whitespace between... // `"hello""world"` -vs- `"hello" "world"` } return try expression_list.toOwnedSlice(); } pub fn free( self :[]const Expression, allocator :Allocator ) void { for( self ) |expression| { expression.free( allocator ); } allocator.free( self ); } pub fn format( value: []const Expression, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) !void { try writer.print( "Sequence [", .{} ); if( value.len != 0 ) { for( value ) |expression| { try writer.print( " ", .{} ); try expression.format( fmt, options, writer ); try writer.print( ",", .{} ); } try writer.print( " ", .{} ); } try writer.print( "]", .{} ); } }; const Expression = union(enum) { // LEAD is '(', [_a-zA-Z0-9], '"', '[' parenthesis : struct { choice : []const []const Expression , }, // choice (|) optional : struct { expression : *const Expression , }, // optionals ? one_or_more : struct { expression : *const Expression , }, // one_or_more + zero_or_more : struct { expression : *const Expression , }, // zero_or_more * identifier : struct { identifier : []const u8 , }, // - identifier [_a-zA-Z0-9] character_set : struct { char_set : Char_Set , }, // - literal " literal : struct { literal : []const u8 , }, // - char_set [ pub fn parse( allocator :Allocator, src :*[]const u8 ) !Expression { if( src.len == 0 ) { return error.empty_source; } const expression_item :Expression = switch( src.*[0] ) { '(' => b1: { // parse `choice` src.* = src.*[1..]; // consume '(' parse_whitespace( src ); // consume whitespace() const choice = try Choice.parse( allocator, src ); errdefer Choice.free( choice, allocator ); var expr :Expression = .{ .parenthesis = .{ .choice = choice } }; if( 0 < src.len and src.*[0] == ')' ) { src.* = src.*[1..]; } else { return error.unmatched_paren; } parse_whitespace( src ); // consume whitespace() break :b1 expr; }, // .( '_, 'a..'z,'z, 'A..'Z,'Z, '0..'9,'9 ) { // .( '_', 'a'..'z','z', 'A'..'Z','Z', '0'..'9','9' ) { '_', 'a'...'z', 'A'...'Z', '0'...'9' => .{ .identifier = .{ .identifier = try Identifier .parse( allocator, src ) } }, '[' => .{ .character_set = .{ .char_set = try Char_Set .parse( src ) } }, '"' => .{ .literal = .{ .literal = try Literal .parse( allocator, src ) } }, else => return error.invalid_expression, }; var expression_wrapped = expression_item; errdefer expression_wrapped.free( allocator ); while( 0 < src.len ) { switch( src.*[0] ) { '?', '+', '*' => { var symbol = src.*[0]; src.* = src.*[1..]; var expr = try allocator.create( Expression ); expr.* = expression_wrapped; expression_wrapped = switch( symbol ) { '?' => .{ .optional = .{ .expression = expr } }, '+' => .{ .one_or_more = .{ .expression = expr } }, '*' => .{ .zero_or_more = .{ .expression = expr } }, else => unreachable, }; }, else => break, } } parse_whitespace( src ); return expression_wrapped; } pub fn free( self :*const Expression, allocator :Allocator ) void { switch( self.* ) { .parenthesis => |x| { Choice .free( x.choice, allocator ); }, // struct { choice : [][]Expression , }, .optional => |x| { x.expression .free( allocator ); allocator.destroy( x.expression ); }, // struct { expression : *Expression , }, .one_or_more => |x| { x.expression .free( allocator ); allocator.destroy( x.expression ); }, // struct { expression : *Expression , }, .zero_or_more => |x| { x.expression .free( allocator ); allocator.destroy( x.expression ); }, // struct { expression : *Expression , }, .identifier => |x| { Identifier .free( x.identifier, allocator ); }, // struct { identifier : []u8 , }, .character_set => |_| { }, // struct { char_set : Char_Set , }, .literal => |x| { Literal .free( x.literal, allocator ); }, // struct { literal : []u8 , }, } } const Expression_format_errors = error { AccessDenied, BrokenPipe, ConnectionResetByPeer, DeviceBusy, DiskQuota, FileTooBig, InputOutput, InvalidArgument, LockViolation, NoSpaceLeft, NotOpenForWriting, OperationAborted, SystemResources, Unexpected, WouldBlock, }; // manually declare empty error_set directly on fn // then compile, and see what zig says is missing pub fn format( value: Expression, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) Expression_format_errors!void { try writer.print( "Expression {{ ", .{} ); switch( value ) { .parenthesis => |x| { try writer.print( ".parenthesis = {{ choice = ", .{} ); try Choice.format( x.choice, fmt, options, writer ); try writer.print( " }}", .{} ); }, .optional => |x| { try writer.print( ".optional = {{ expression = ", .{} ); try x.expression.format( fmt, options, writer ); try writer.print( " }}", .{} ); }, .one_or_more => |x| { try writer.print( ".one_or_more = {{ expression = ", .{} ); try x.expression.format( fmt, options, writer ); try writer.print( " }}", .{} ); }, .zero_or_more => |x| { try writer.print( ".zero_or_more = {{ expression = ", .{} ); try x.expression.format( fmt, options, writer ); try writer.print( " }}", .{} ); }, .identifier => |x| { try writer.print( ".identifier = {{ identifier = ", .{} ); try Identifier.format( x.identifier, fmt, options, writer ); try writer.print( " }}", .{} ); }, .character_set => |x| { try writer.print( ".character_set = {{ char_set = ", .{} ); try x.char_set.format( fmt, options, writer ); try writer.print( " }}", .{} ); }, .literal => |x| { try writer.print( ".literal = {{ literal = ", .{} ); try Literal.format( x.literal, fmt, options, writer ); try writer.print( " }}", .{} ); }, } try writer.print( " }}", .{} ); } }; // • // state transition table ( "shift-reduce" logic ) // | state_0 // | • start = • _ rule* ; // | • _ | ' ','\t','\n' | shift, state_0 // | • rule | [_a-zA-Z0-9]* | shift, state_1 // | • $ | $ | reduce, state_2 // | state_1 // | • start = _ • rule* ; // | • rule = • identifier _ "=" _ expression ( "|" expression )* ";" _ ; // | • identifier = • [_a-zA-Z0-9]+ _ ; | [_a-zA-Z0-9] | shift, state_0 // | start • = _ rule* • ; // | state_x // | • // _ = // \\ rule = r1 | r2 | r3 | r4 | r5 | r6 ; // \\ r1 = "literal" ; // \\ r2 = r3 r4 ; // sequence // \\ r3 = r5 | r6 ; // choice // \\ r4 = r1? ; // optional // \\ r5 = r2+ ; // one-or-more // \\ r6 = r3* ; // zero-or-more // ; // pub fn parse_token( allocator :*Allocator, source :[]u8 ) !*Token { // if( source.len == 0 ) { return error.empty_source; } // // probably a `while` up here somewhere // var char = source[0]; // // source = source[1..]; // switch( char ) { // .( ' ', '\t', '\r', '\n' ) { // source = source[1..]; // continue; // }, // .( '"' ) { }, // parse literal // .( '(' ) { }, // expression // .( '=' ) { }, // equal, parse rule // .( ';' ) { }, // semicolon, parse rule // } // }; // var Expression :type = enum { // parenthesis : struct { choice : [][]Expression , }, // optional : struct { expression : *Expression , }, // one_or_more : struct { expression : *Expression , }, // zero_or_more : struct { expression : *Expression , }, // identifier : struct { identifier : *Identifier , }, // character_set : struct { char_set : *Char_Set , }, // literal : struct { literal : *Literal , }, // }; // var expression = null; // var token_kind :type = enum ( // .whitespace, // multiple characters // .comment, // multiple characters // .identifier, // multiple characters // .equal, // .semicolon, // .pipe, // .parenthesis_open, // .parenthesis_close, // .question, // .plus, // .star, // .dot, // .literal, // // .quote, // // .backslash, // .set, // // .bracket_open, // // .bracket_close, // // .hyphen, // // .carot, // ); // const token_literal :type = struct ( // .src :[]u8, // undefined // .kind :token_kind, // ); // var token_class :type = enum ( // . // ); // const token :type = struct ( // .components :Array_List(), // ); // var tokenize_grammar = fn( input :[]u8 ) :TODO { // }; const Identifier = struct { // LEAD is [_a-zA-Z0-9] // ( '_','a'..'z'+1,'A'..'Z'+1,'0'..'9'+1 ) // identifier = [_a-zA-Z0-9]+ _ ; pub fn parse( allocator :Allocator, src :*[]const u8 ) ![]u8 { var list = ArrayList( u8 ).init( allocator ); // defer list.deinit(); // errdefer Identifier.free( list.items, allocator ); errdefer list.deinit(); // XXX var char_set = Char_Set.parse( &"_a-zA-Z0-9]" ); const char_set = b5: { // mutable pattern slice // const pattern = "[_a-zA-Z0-9]"; // var buffer = ( [_]u8 { undefined } ** pattern.len ); // @memcpy( &buffer, pattern ); // var buffer = "[_a-zA-Z0-9]".*; // pattern.*; var slice :[]const u8 = "[_a-zA-Z0-9]"[0..]; // pattern[0..]; break :b5 try Char_Set.parse( &slice ); }; // print( "identifier begin\n", .{} ); // defer print( "identifier fin\n", .{} ); while( 0 < src.len ) { // print( "char: '{c}', match: {}\n", .{ src.*[0], char_set.matches( src.*[0] ) } ); if( char_set.matches( src.*[0] ) ) { try list.append( src.*[0] ); src.* = src.*[1..]; } else if( list.items.len == 0 ) { return error.empty_identifier; } else { parse_whitespace( src ); return try list.toOwnedSlice(); } // } else { // return if( list.items.len == 0 ) { // error.empty_identifier // } else { // list.toOwnedSlice().! // }; // }; } // orelse { return error.end_of_input; }; return error.end_of_input; // TODO `end_of_input` -vs- `early_eof` } pub fn free( self :[]const u8, allocator :Allocator ) void { allocator.free( self ); } pub fn format( value: []const u8, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) !void { _ = options; _ = fmt; // Identifier "asdf" try writer.print( "Identifier \"{s}\"", .{ value } ); } }; const Literal = struct { // only a namespace, Literal is a `[]u8` // LEAD is '"' pub fn parse( allocator :Allocator, src :*[]const u8 ) ![]u8 { var list = ArrayList( u8 ).init( allocator ); // defer list.deinit(); // errdefer Literal.free( list.items, allocator ); errdefer list.deinit(); // print( "literal begin\n", .{} ); // defer print( "literal fin\n", .{} ); try consume( src, '"' ); while( 0 < src.len ) { const char = src.*[0]; _ = char; const error_atom = try parse_atom( src, '"' ); // print( "char: '{c}', atom: 0x{?x}\n", .{ char, error_atom } ); var atom = ( error_atom ) orelse break; try list.append( atom ); } try consume( src, '"' ); parse_whitespace( src ); return try list.toOwnedSlice(); } pub fn free( self :[]const u8, allocator :Allocator ) void { allocator.free( self ); } pub fn format( value: []const u8, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) !void { _ = options; _ = fmt; // Literal "asdf" try writer.print( "Literal \"{s}\"", .{ value } ); } }; const Char_Set = struct { bits :[4]u64 = .{ 0, 0, 0, 0 }, // 256 bits // LEAD is '[' pub fn matches( self :*const Char_Set, char :u8 ) bool { var chunk = self.*.bits[ char >> 6 ]; var chunk_mask :u64 = @as( u64, 1 ) << @intCast( 0b0011_1111 & char ); var flag = chunk & chunk_mask; return flag != 0; } /// for parsing ie. "[^a-z234\n\xFF]". /// on success, consumes the closing bracket ']'. /// XXX atoms toggle, double negation /// `[a-cd-f]` == `[abcdef]` /// XXX `[a-dc-f]` == `[abef]`, should be `[abcdef]` pub fn parse( src :*[]const u8 ) !Char_Set { try consume( src, '[' ); // TODO consider reseting src on error // var original_src = src; // errdefer { src = original_src; }; var char_set :Char_Set = .{}; var is_inverted = ( 0 < src.len and src.*[0] == '^' ); var fill :u64 = if( is_inverted ) 0xffff_ffff_ffff_ffff else 0; for( &char_set.bits ) |*chunk| { chunk.* = fill; } var begin :usize = if( is_inverted ) 1 else 0; src.* = src.*[begin..]; while( 0 < src.len ) { var atom = try parse_atom( src, ']' ) orelse { src.* = src.*[1..]; // consume( src, ']' ); return char_set; }; if( 0 < src.len and src.*[0] == '-' ) { src.* = src.*[1..]; // consume( src, '-' ); var atom_range_end = try parse_atom( src, ']' ) orelse { src.* = src.*[1..]; // consume( src, ']' ); char_set.toggle_char( atom ); char_set.toggle_char( '-' ); return char_set; }; char_set.toggle_range( atom, atom_range_end ); } else { char_set.toggle_char( atom ); } } return error.end_of_input; //return while( 0 < src.len ) { // var atom = Char_Set.parse_atom( src ).! // orelse { break char_set; } // ; // if( 1 < src.len and src.*[0] == '-' ) { // src.* = src.*[1..]; // var atom_range_end = Char_Set.parse_atom( src ).! orelse { // char_set.toggle_char( atom ); // char_set.toggle_char( '-' ); // break char_set; // }; // char_set.toggle_range( atom, atom_range_end ); // } else { // char_set.toggle_char( atom ); // }; //} orelse { error.end_of_input }; } //Char_Set.parse_atom = fn( content :*mut []u8 ) !u8 { // // XXX using ptr_to_slice.*[index] syntax here feels bad.. // // parse atoms: `a`, `\n`, `\xff` // if( 1 <= content.*.len and content.*[0] != '\\' ) { // // content.*[0..1]; // defer { content.* = content.*[1..]; }; // return content.*[0]; // }; // if( 2 <= content.*.len and content.*[1] != 'x' ) { // // content.*[0..2]; // defer { content.* = content.*[2..]; }; // // https://ziglang.org/documentation/master/#Escape-Sequences // // https://en.wikipedia.org/wiki/ASCII#Control_code_chart // return switch( content.*[1] ) { // .( '0' ) { 0x00 }, // null // .( 't' ) { 0x09 }, // tab // .( 'n' ) { 0x0A }, // line feed // .( 'v' ) { 0x0B }, // vertical tab // .( 'f' ) { 0x0C }, // form feed // .( 'r' ) { 0x0D }, // carriage return // .() { content.*[1] }, // }; // }; // if( 4 <= content.*.len ) { // // content.*[0..4]; // defer { content.* = content.*[4..]; }; // return hex_to_byte( content.*[2], content.*[3] ).!; // }; // return error.partial_atom; //}; /// for parsing char_set atoms like `a`, `\n`, `\xff`. /// modifies `content`, consumes atom. /// - errors on end-of-input or invalid-hex. /// - null on ']', end-of-char_set. /// * ']' can be returned as u8 via "\]" or "\x5d". /// * '\' can be returned as u8 via "\\" or "\x__". /// - u8 on atom, normal. // pub fn parse_atom( content :*[]u8 ) !?u8 { // return try parse_atom( content, ']' ); // } pub fn toggle_char( self :*Char_Set, char :u8 ) void { var chunk_mask :u64 = @as( u64, 1 ) << @intCast( 0b0011_1111 & char ); self.*.bits[ char >> 6 ] ^= chunk_mask; } pub fn toggle_range( self :*Char_Set, begin :u8, end :u8 ) void { // XXX terribly inefficient, but functional // also, what if { end < begin } ..? for( begin..end ) |i| { self.*.toggle_char( @intCast( i ) ); } self.*.toggle_char( end ); // inclusive } pub fn format( value: Char_Set, comptime fmt: []const u8, options: std.fmt.FormatOptions, writer: anytype ) !void { _ = options; _ = fmt; // Char_Set { bits = [ 0xfeed, 0xbeef, 0x0000, 0xffff ] } try writer.print( "Char_Set {{ bits = [", .{} ); for( value.bits ) |bits| { try writer.print( " 0x{x:0>16},", .{ bits } ); } try writer.print( " ] }}", .{} ); } }; test "Char_Set" { // print( "\n", .{} ); // var char :u8 = 'a'; // // https://github.com/ziglang/zig/issues/16075 // // var a_slice = (&char)[0..1]; // // https://github.com/ziglang/zig/issues/6391 // var a_ptr :[*]u8 = @ptrCast( &char ); // var a_slice :[]u8 = a_ptr[ 0..1 ]; // var a_charset = try Char_Set.parse( &a_slice ); // var i : u64 = 1; // var expected :Char_Set = .{ // .bits = .{ // 0, // ( i << ( 'a' - 64 ) ), // 0, // 0, // } // }; // try testing.expectEqualDeep( expected, a_charset ); var one :u64 = 1; var buf = [3]u8 { '[', 'b', ']' }; var buf_ptr :[]u8 = &buf; var b_charset = try Char_Set.parse( &buf_ptr ); var b_expected :Char_Set = .{ .bits = .{ 0, // ( one << ( 'b' - 0 * 64 ) ), ( one << ( 'b' - 1 * 64 ) ), 0, // ( one << ( 'b' - 2 * 64 ) ), 0, // ( one << ( 'b' - 3 * 64 ) ), } }; try testing.expectEqualDeep( b_expected, b_charset ); try testing.expect( b_charset.matches( 'b' ) ); var pattern = "[_a-zA-Z0-9]"; var buffer = [_]u8 { undefined } ** pattern.len; @memcpy( &buffer, pattern ); var slice :[]u8 = &buffer; var identifier = try Char_Set.parse( &slice ); try testing.expect( identifier.matches( 'a' ) ); try testing.expect( identifier.matches( 'b' ) ); try testing.expect( identifier.matches( 'z' ) ); try testing.expect( identifier.matches( 'y' ) ); try testing.expect( identifier.matches( 'A' ) ); try testing.expect( identifier.matches( 'B' ) ); try testing.expect( identifier.matches( 'Z' ) ); try testing.expect( identifier.matches( 'Y' ) ); try testing.expect( identifier.matches( '0' ) ); try testing.expect( identifier.matches( '1' ) ); try testing.expect( identifier.matches( '8' ) ); try testing.expect( identifier.matches( '9' ) ); try testing.expect( identifier.matches( '_' ) ); try testing.expect( !identifier.matches( 0 ) ); try testing.expect( !identifier.matches( 128 ) ); try testing.expect( !identifier.matches( 196 ) ); try testing.expect( !identifier.matches( 255 ) ); } /// for parsing char_set atoms like `a`, `\n`, `\xff`. /// modifies `src`, consumes atom. /// - errors on end-of-input or invalid-hex. /// - null on terminal ( end of char_set ']' or literal '"' ) /// * '"' can be returned as u8 via "\"" or "\x22". /// * ']' can be returned as u8 via "\]" or "\x5d". /// * '\' can be returned as u8 via "\\" or "\x5c". /// - u8 on atom, normal. fn parse_atom( src :*[]const u8, terminal :u8 ) !?u8 { // print( "atom begin, terminal: '{c}'\n", .{ terminal } ); // defer print( "atom end\n", .{} ); // parse atoms: `a`, `\n`, `\xff` var atom_len :usize = 0; defer src.* = src.*[atom_len..]; // XXX if( src.len < 1 ) { return error.partial_atom_0; } var a0 = src.*[0]; if( a0 == terminal ) { return null; } // print( "atom 0: '{c}'\n", .{ a0 } ); atom_len = 1; if( a0 != '\\' ) { return a0; } // `a` // a0 is `\` if( src.len < 2 ) { return error.partial_atom_1; } atom_len = 2; var a1 = src.*[1]; // print( "atom 1: '{c}'\n", .{ a1 } ); if( a1 != 'x' ) { // https://ziglang.org/documentation/master/#Escape-Sequences // https://en.wikipedia.org/wiki/ASCII#Control_code_chart return switch( a1 ) { // `\q` '0' => 0x00, // null 't' => 0x09, // tab 'n' => 0x0A, // line feed 'v' => 0x0B, // vertical tab 'f' => 0x0C, // form feed 'r' => 0x0D, // carriage return else => a1, }; } // a0 is `\`, a1 is `x`, src[0..2] is `\x` if( src.len < 4 ) { return error.partial_atom_4; } atom_len = 4; var a2 = src.*[2]; var a3 = src.*[3]; // print( "atom 3, 4: '{c}', '{c}'\n", .{ a2, a3 } ); return try hex_to_byte( a2, a3 ); // `\xff` } fn hex_to_nibble( hex :u8 ) !u8 { return if( '0' <= hex and hex <= '9' ) ( hex - '0' ) else if( 'a' <= hex and hex <= 'f' ) ( hex - 'a' ) else if( 'A' <= hex and hex <= 'F' ) ( hex - 'A' ) else ( error.invalid_hex ); } fn hex_to_byte( hi :u8, lo :u8 ) !u8 { return ( try hex_to_nibble( hi ) << 4 ) | ( try hex_to_nibble( lo ) ); } fn consume( src :*[]const u8, char :u8 ) !void { if( 0 < src.len ) { if( src.*[0] == char ) { src.* = src.*[1..]; } else { return error.bad_char; } } else { return error.early_eof; } }