// pon v0.2 2024-09-10 // i should be using git... haha.. const std = @import( "std" ); const Allocator = std.mem.Allocator; const ArrayList = std.ArrayList; const assert = std.debug.assert; const debug = std.debug.print; const tst = std.testing; // zig test --test-no-exec -femit-bin=test pon-parser.zig // https://ziglang.org/documentation/master/std/#std.heap.GeneralPurposeAllocator // https://ziglang.org/documentation/master/std/#std.mem.Allocator // https://ziglang.org/documentation/master/#Tagged-union /// api around a slice of bytes /// buffering is a non-issue because the slice is all in-memory /// not intended for actual streaming; TODO find better name pub const Stream = struct { src :[]const u8, off :usize = 0, /// initialize a Stream object /// do not inc past end_of_stream pub fn init( src :[]const u8 ) Stream { return .{ .src = src }; } // /// is no more content available ? // pub fn done( stream :*const Stream ) bool { // return stream.src.len <= stream.off; // } /// is more content available ? pub fn more( stream :*const Stream ) bool { return stream.off < stream.src.len; } /// get next char /// null if eos /// does not increment pub fn peek( stream :*const Stream ) ?u8 { return if( !stream.more() ) null else stream.src[ stream.off ] ; } /// increments stream pub fn inc( stream :*Stream ) void { if( stream.more() ) { stream.off += 1; } } /// peek & increment pub fn step( stream :*Stream ) ?u8 { defer { stream.inc(); } return stream.peek(); } /// peek & compare /// does not increment /// XXX dead method pub fn next( stream :*const Stream, n :u8 ) bool { const p = stream.peek() orelse return false; return p == n; } /// step & compare /// error if char does not match stream /// increments stream pub fn consume( stream :*Stream, char :u8 ) Pon_Parse_Error!void { const c = stream.step() orelse { return error.end_of_stream; }; if( c != char ) { return error.not_equal; } } }; test Stream { var abc = Stream.init( "abc" ); try tst.expect( abc.more() ); try tst.expectEqual( abc.peek().?, 'a' ); try tst.expectEqual( abc.step().?, 'a' ); try tst.expect( abc.more() ); try abc.consume( 'b' ); try tst.expect( abc.more() ); // try tst.expect( abc.next( 'c' ) ); while( abc.more() ) { abc.inc(); } // try tst.expect( abc.done() ); try tst.expect( abc.peek() == null ); } const Pon = struct { allocator :Allocator, kind :[]const u8, data :Data, pub fn deinit( pon :*const Pon ) void { pon.*.allocator.free( pon.*.kind ); switch( pon.*.data ) { .primitive => |p| { pon.*.allocator.free( p ); }, .records => |rs| { for( rs ) |r| { r.deinit(); } pon.allocator.free( rs ); }, .list => |l| { for( l ) |x| { x.deinit(); } pon.allocator.free( l ); }, } } }; /// ( _ ) /// ( _ _ ) /// ( _ _ (_) ) /// ( _ (_) ) const Data = union(enum) { primitive :[]const u8, records :[]const Pair, list :[]const Pon, }; /// ( _ _ (_) ) /// ^^^^^^ const Pair = struct { allocator :Allocator, key :[]const u8, val :Pon, pub fn deinit( pair :*const Pair ) void { pair.*.allocator.free( pair.*.key ); pair.*.val.deinit(); } }; const Pon_Parse_Error = error { bad_escape, end_of_stream, end_of_stream_during_data, end_of_stream_during_quoted, end_of_stream_during_record_or_string, end_of_stream_during_string, missing_string, not_equal, // TODO, unexpected_character, unexpected_whitespace, unexpected_open_paren, OutOfMemory, }; const Parser = struct { allocator :Allocator, stream :Stream, err_off :?*?usize, fn init( allocator :Allocator, source :[]const u8, err_off :?*?usize ) Parser { return .{ .allocator = allocator, .stream = Stream.init( source ), .err_off = err_off, }; } fn whitespace( p :*Parser ) void { while( p.stream.peek() ) |c| { switch( c ) { ' ', '\t', '\r', '\n' => { p.stream.inc(); }, ';' => { p.stream.inc(); // try p.stream.consume( c ); while( p.stream.peek() ) |cc| { if( cc == '\n' ) { break; } else { p.stream.inc(); } } }, else => { break; }, } } } // X fn primitive( p :*Parser ) Pon_Parse_Error!XYZ {} // X fn record( p :*Parser ) Pon_Parse_Error!XYZ {} // X fn list( p :*Parser ) Pon_Parse_Error!XYZ {} /// ``` /// ( _ ) /// ( _ _ ) /// ( _ _ (_) ) /// ( _ (_) ) /// ``` fn pon( p :*Parser ) Pon_Parse_Error!Pon { // debug( "{{ pon ", .{} ); defer { debug( " pon }}\n", .{} ); } try p.stream.consume( '(' ); p.whitespace(); const kind = try p.string(); errdefer { p.allocator.free( kind ); } // p.whitespace(); const body = try p.data(); // _ = body; // return error.TODO; return .{ .allocator = p.allocator, .kind = kind, .data = body, }; } /// ``` /// string = /// | literal /// | quoted /// ; /// ``` fn string( p :*Parser ) Pon_Parse_Error![]const u8 { // debug( "{{ string ", .{} ); defer { debug( " string }}\n", .{} ); } const c = p.stream.peek() orelse { if( p.err_off ) |err| { err.* = p.stream.off; } return error.end_of_stream_during_string; }; // switch( c ) { // '"' => { return p.quoted(); }, // '(', ')', // pon boundaries // ' ', '\t', '\r', '\n', // whitespace // ';', // comment // => { return error.missing_string; }, // else => { return p.literal(); }, // } const ret = switch( c ) { '"' => p.quoted(), '(', ')', // pon boundaries ' ', '\t', '\r', '\n', // whitespace ';', // comment => error.missing_string, else => p.literal(), }; if( ret ) |val| { _ = val; // debug( "<{s}>\n", .{ val } ); // p.whitespace(); // see quoted & literal whitespace() // // also, it seems defer-for-the-happy-path has use as well // // tho that would nerf tail-call-optimization } // XXX careful removing this `else`, zig complains else |_| { // debug( "error: {s}\n", .{ @errorName( e ) } ); if( p.err_off ) |err| { err.* = p.stream.off; } } return ret; } /// ``` /// quoted =| QUOTE char* QUOTE _ ; /// /// char = /// | !QUOTE !BACKSLASH ANY /// | BACKSLASH QUOTE /// | BACKSLASH BACKSLASH /// ; /// ``` fn quoted( p :*Parser ) Pon_Parse_Error![]const u8 { // debug( "{{ quoted ", .{} ); defer { debug( " quoted }}\n", .{} ); } try p.stream.consume( '"' ); var list = ArrayList( u8 ).init( p.allocator ); defer { list.deinit(); } while( p.stream.step() ) |c| { switch( c ) { '"' => { p.whitespace(); return list.toOwnedSlice(); }, '\\' => { const c2 = p.stream.step() orelse { if( p.err_off ) |err| { err.* = p.stream.off; } return error.end_of_stream_during_quoted; }; switch( c2 ) { '"', '\\' => { try list.append( c2 ); p.stream.inc(); }, else => { if( p.err_off ) |err| { err.* = p.stream.off; } return error.bad_escape; }, } }, else => { try list.append( c ); }, } } if( p.err_off ) |err| { err.* = p.stream.off; } return error.end_of_stream_during_quoted; } /// ``` /// literal =| ( /// !PAREN_OPEN !PAREN_CLOSE /// !SPACE !TAB !RETURN !NEWLINE !SEMICOLON /// ANY /// )+ _ ; /// ``` fn literal( p :*Parser ) Pon_Parse_Error![]const u8 { // debug( "{{ literal ", .{} ); defer { debug( " literal }}\n", .{} ); } var list = ArrayList( u8 ).init( p.allocator ); defer { list.deinit(); } while( p.stream.peek() ) |c| { switch( c ) { // TODO consider adding '"' restrictions '(', ')', // pon boundaries ' ', '\t', '\r', '\n', // whitespace ';', // comment => { break; }, else => { try list.append( c ); // debug( "<{c}>", .{ c } ); p.stream.inc(); }, } } p.whitespace(); return try list.toOwnedSlice(); } /// ``` /// ( _ ) /// ^ /// ( _ _ ) /// ^^^ /// ( _ _ (_) ) /// ^^^^^^^ /// ( _ (_) ) /// ^^^^^ /// ``` fn data( p :*Parser ) Pon_Parse_Error!Data { // debug( "{{ data ", .{} ); defer { debug( " data }}\n", .{} ); } const c = p.stream.peek() orelse { if( p.err_off ) |err| { err.* = p.stream.off; } return error.end_of_stream_during_data; }; switch( c ) { ')' => { p.stream.inc(); p.whitespace(); // return .{ .list = .{ // .ptr = null, // undefined, // .len = 0, // } }; return .{ .list = ( [_]Pon {} )[0..] }; }, '(' => { return try p.data_list(); }, ' ', '\t', '\r', '\n', // whitespace ';', // comment => { if( p.err_off ) |err| { err.* = p.stream.off; } return error.unexpected_whitespace; }, else => { return try p.data_record_or_string(); }, } } /// ``` /// ( _ (_) ) /// ^^^^^ /// ( _ (_) (_) ) /// ^^^^^^^^^ /// ++++ /// ``` fn data_list( p :*Parser ) Pon_Parse_Error!Data { // debug( "{{ data_list ", .{} ); defer { debug( " data_list }}\n", .{} ); } var list = ArrayList( Pon ).init( p.allocator ); defer { for( list.items ) |l| { l.deinit(); } list.deinit(); } while( p.stream.peek() ) |c| { switch( c ) { '(' => { try list.append( try p.pon() ); }, ')' => { p.stream.inc(); p.whitespace(); break; }, else => { if( p.err_off ) |err| { err.* = p.stream.off; } return error.unexpected_character; }, } } return .{ .list = try list.toOwnedSlice() }; } /// ``` /// ( _ _ ) /// ^^^ /// ( _ _ (_) ) /// ^^^^^^^ /// ``` fn data_record_or_string( p :*Parser ) Pon_Parse_Error!Data { // debug( "{{ dros ", .{} ); defer { debug( " dros }}\n", .{} ); } const s = try p.string(); // resource // errdefer { p.allocator.free( s ); } const c = p.stream.peek() orelse { p.allocator.free( s ); if( p.err_off ) |err| { err.* = p.stream.off; } return error.end_of_stream_during_record_or_string; }; switch( c ) { ')' => { p.stream.inc(); p.whitespace(); return .{ .primitive = s }; }, '(' => { // `data_record()` takes ownership-of and // responsibility-for-freeing `s` return p.data_record( s ); }, else => { p.allocator.free( s ); if( p.err_off ) |err| { err.* = p.stream.off; } return error.unexpected_character; }, } } /// ``` /// ( _ _ (_) ) /// v ^^^^^ /// key /// ( _ _ (_) _ (_) ) /// ^^^^^^^^^^^ /// ++++++ /// ``` fn data_record( p :*Parser, key :[]const u8 ) Pon_Parse_Error!Data { // debug( "{{ data_record ", .{} ); defer { debug( " data_record }}\n", .{} ); } // errdefer { p.allocator.free( key ); } const val = p.pon() catch |e| { p.allocator.free( key ); return e; }; // errdefer { val.deinit(); } var list = ArrayList( Pair ).init( p.allocator ); errdefer { for( list.items ) |i| { i.deinit(); } list.deinit(); } list.append( .{ .allocator = p.allocator, .key = key, .val = val, } ) catch |e| { p.allocator.free( key ); val.deinit(); return e; }; while( p.stream.peek() ) |c| { switch( c ) { ')' => { p.stream.inc(); p.whitespace(); break; }, '(' => { if( p.err_off ) |err| { err.* = p.stream.off; } return error.unexpected_open_paren; }, ' ', '\t', '\r', '\n', // whitespace ';', // comment => { if( p.err_off ) |err| { err.* = p.stream.off; } return error.unexpected_whitespace; }, else => { const field = try p.string(); // errdefer { p.allocator.free( field ); } const value = p.pon() catch |e| { p.allocator.free( field ); return e; }; list.append( .{ .allocator = p.allocator, .key = field, .val = value, } ) catch |e| { p.allocator.free( field ); value.deinit(); return e; }; }, } } return .{ .records = try list.toOwnedSlice() }; } }; pub fn parse( allocator :Allocator, source :[]const u8, err_off :?*?usize ) Pon_Parse_Error!Pon { var parser = Parser.init( allocator, source, err_off ); parser.whitespace(); return parser.pon(); } // zig test --test-no-exec -femit-bin=test pon-parser.zig test "atom" { const atom = try parse( tst.allocator, "( null )", null ); defer { atom.deinit(); } // debug( "atom: {any}\n", .{ atom } ); } test "primitive" { const primitive = try parse( tst.allocator, "( integer 1 )", null ); defer { primitive.deinit(); } } test "record" { const record = try parse( tst.allocator, "( rec key( str val ) )", null ); defer { record.deinit(); } } test "record2" { const record2 = try parse( tst.allocator, "( rec f1( str value ) f2( int 2 ) )", null ); defer { record2.deinit(); } } test "list" { const list = try parse( tst.allocator, "( list ( x ) )", null ); defer { list.deinit(); } } test "list2" { const list2 = try parse( tst.allocator, "( list2 ( y ) ( z ) )", null ); defer { list2.deinit(); } } const Xoshiro256 = std.Random.Xoshiro256; /// TODO /// fuzzer that generates weighted characters /// objective is not to generate *valid* input, /// but to hammer the error handling fn fuzz_fill_buffer( buffer :[]u8 ) void { // var buffer = [1] u8 { undefined } ** 65536; // https://ziglang.org/documentation/master/std/#src/std/Random/Xoshiro256.zig const seed :u64 = 0; var r = Xoshiro256.init( seed ); // const x :u64 = r.next(); for( buffer ) |*b| { // PAREN_OPEN =| "(" ; // PAREN_CLOSE =| ")" ; // QUOTE =| "\"" ; // BACKSLASH =| "\\" ; // SPACE =| " " ; // TAB =| "\t" ; // RETURN =| "\r" ; // NEWLINE =| "\n" ; // SEMICOLON =| ";" ; // ANY =| . ; b.* = switch( r.next() % 6 ) { 0 => '(', 1 => ')', 3 => '"', 2 => '\\', 4 => switch( r.next() % 5 ) { 0 => ' ', 1 => '\t', 2 => '\r', 3 => '\n', 4 => ';', else => 'y', }, 5 => '\n', else => 'x', }; } } test "fuzzing Xoshiro256 0" { // if( true ) { return error.SkipZigTest; } var buffer :[65536*128]u8 = undefined; fuzz_fill_buffer( buffer[0..] ); fuzz_fill_buffer( &buffer ); for( 0..buffer.len ) |i| { // for( 411..412 ) |i| { // debug( "fuzz iteration {d} {{ ", .{ i } ); // defer { debug( " }}\n", .{} ); } var err :?usize = null; var pon = parse( std.testing.allocator, buffer[i..i], &err ) catch { // debug( "hello?\n", .{} ); // if( err ) |off| { // debug( "here\n", .{} ); // debug( // "{{ <{d}>, <{s}>, <{d}>, <{c}> }}\n", // .{ i, buffer[i..i+64], off, buffer[i..i+64][ off ] } // ); // debug( "there\n", .{} ); // } else { // debug( "{{ succeeded?, <{d}>, <{s}> }}\n", .{ i, buffer[i..i+64] } ); // } continue; }; defer pon.deinit(); // debug( "\n", .{} ); } } test "fuzz-411" { debug( "fuzz-411\n", .{} ); // when freeing lists, // remember to free the *content of the list* as well const input = \\(\(\"\\ \\(\"")" ; // const input = // \\( l1 ( l2 )( // ; var err :?usize = null; // const pon = parse( // std.testing.allocator, // input, // &err, // ) catch |e| { // debug( "catch\n", .{} ); // if( err ) |off| { // debug( "error: {s} {d} '{?c}'\n", .{ // @errorName( e ), // off, // ( if( off < input.len ) input[off] else null ) // } ); // } else { // debug( "umm..\n", .{} ); // } // return; // }; const pon = parse( std.testing.allocator, input, &err ) catch { return; }; defer { pon.deinit(); } }