(** Jam/cue serialization for nouns Based on the Vere implementation in pkg/noun/serial.c Jam encoding: - Atoms: tag bit 0, then mat-encoded value - Cells: tag bits 01, then recursively encode head and tail - Backrefs: tag bits 11, then mat-encoded position Mat encoding (length-prefixed): - For 0: just bit 1 - For n > 0: - Let a = bit-width of n - Let b = bit-width of a - Encode: [1 repeated b times][0][a in b-1 bits][n in a bits] *) open Noun open Bitstream type cue_progress = nouns:int -> bits:int -> depth:int -> max_depth:int -> unit type cue_event = | Cue_atom_begin of { position : int; value_bits : int } | Cue_atom_end of { position : int; total_bits : int; value_bits : int } | Cue_backref of { position : int; ref_pos : int } | Cue_emit of { nouns : int; depth : int; max_depth : int } (** Mat-encode a number into the bitstream Mat encoding is a variable-length integer encoding: - 0 is encoded as a single 1 bit - For n > 0: - a = number of bits in n (met 0 n) - b = number of bits needed to represent a - Write b 0-bits, then one 1-bit - Write a in b-1 bits - Write n in a bits *) let mat_encode w n = if Z.equal n Z.zero then write_bit w true else begin let a = Z.numbits n in (* bit-width of n *) let b = Z.numbits (Z.of_int a) in (* bit-width of a *) (* Write b 0-bits followed by one 1-bit *) for _i = 1 to b do write_bit w false done; write_bit w true; (* Write a in b-1 bits *) write_bits w (Z.of_int a) (b - 1); (* Write n in a bits *) write_bits w n a end (** Mat-decode from bitstream, returns (value, bits_read) *) let mat_decode ?on_value_bits r = let start_pos = reader_pos r in (* Count leading 0 bits until we hit the terminating 1 bit. *) let b = Bitstream.count_zero_bits_until_one r in if b = 0 then (* Just a single 1 bit means 0 *) (Option.iter (fun f -> f 0) on_value_bits; (Z.zero, reader_pos r - start_pos, 0)) else begin (* Read the length bits and compute a = 2^(b-1) + bits_read *) let bits_val = read_bits r (b - 1) in let a = Z.to_int (Z.add (Z.shift_left Z.one (b - 1)) bits_val) in Option.iter (fun f -> f a) on_value_bits; (* Read n in a bits *) let n = read_bits r a in (n, reader_pos r - start_pos, a) end (** Jam: serialize a noun to bytes Uses a hash table to track positions for backreferences. Returns the serialized bytes. *) let jam noun = let w = writer_create () in let positions = Hashtbl.create 256 in (* noun -> bit position *) let rec jam_noun n = match n with | Atom a -> (* Check if we've seen this atom before *) begin match Hashtbl.find_opt positions n with | Some pos -> (* Backref might be smaller than re-encoding *) let atom_size = 1 + (Z.numbits a) in (* rough estimate *) let backref_size = 2 + (Z.numbits (Z.of_int pos)) in if backref_size < atom_size then begin (* Encode backref: tag bits 11 *) write_bit w true; write_bit w true; mat_encode w (Z.of_int pos) end else begin (* Encode atom *) write_bit w false; mat_encode w a end | None -> (* Record position and encode atom *) Hashtbl.add positions n w.bit_pos; write_bit w false; mat_encode w a end | Cell (head, tail) -> (* Check for backref *) begin match Hashtbl.find_opt positions n with | Some pos -> (* Encode backref: tag bits 11 *) write_bit w true; write_bit w true; mat_encode w (Z.of_int pos) | None -> (* Record position and encode cell *) Hashtbl.add positions n w.bit_pos; (* Tag bits 01 for cell *) write_bit w true; write_bit w false; (* Recursively encode head and tail *) jam_noun head; jam_noun tail end in jam_noun noun; writer_to_bytes w (** Cue: deserialize bytes to a noun Uses a hash table to store nouns by bit position for backreferences. *) module IntTable = Hashtbl.Make (struct type t = int let equal = Int.equal let hash x = x land max_int end) let cue ?progress ?(progress_interval = 200_000) ?inspect bytes = let r = reader_create bytes in (* Pre-size the backref table based on payload size to minimise rehashing. *) let estimated_nouns = let approx = Bytes.length bytes / 8 in if approx < 1024 then 1024 else approx in let backref_table = IntTable.create estimated_nouns in (* Manual stack used to eliminate recursion while tracking unfinished cells. *) let initial_stack_capacity = 1024 in let stack_pos = ref (Array.make initial_stack_capacity 0) in let stack_head = ref (Array.make initial_stack_capacity None) in let stack_size = ref 0 in let max_depth = ref 0 in (* Noun counter is used for periodic progress callbacks. *) let nouns_processed = ref 0 in let report_tick, report_final = match progress with | None -> ( (fun ~nouns:_ -> ()), (fun ~nouns:_ -> ()) ) | Some callback -> let interval = if progress_interval <= 0 then 1 else progress_interval in let next_report = ref interval in let call_callback nouns = callback ~nouns ~bits:(reader_pos r) ~depth:!stack_size ~max_depth:!max_depth in let tick ~nouns = if nouns >= !next_report then begin call_callback nouns; next_report := nouns + interval end in let final ~nouns = if nouns < !next_report then call_callback nouns in (tick, final) in let inspect_event = match inspect with Some f -> f | None -> fun _ -> () in let grow_stack () = let old_pos = !stack_pos in let old_head = !stack_head in let old_len = Array.length old_pos in let new_len = old_len * 2 in let new_pos = Array.make new_len 0 in let new_head = Array.make new_len None in Array.blit old_pos 0 new_pos 0 old_len; Array.blit old_head 0 new_head 0 old_len; stack_pos := new_pos; stack_head := new_head in let push_frame pos = if !stack_size = Array.length !stack_pos then grow_stack (); let idx = !stack_size in let pos_arr = !stack_pos in let head_arr = !stack_head in pos_arr.(idx) <- pos; head_arr.(idx) <- None; stack_size := idx + 1; if !stack_size > !max_depth then max_depth := !stack_size in let result = ref None in let rec emit noun = incr nouns_processed; report_tick ~nouns:!nouns_processed; inspect_event (Cue_emit { nouns = !nouns_processed; depth = !stack_size; max_depth = !max_depth }); if !stack_size = 0 then result := Some noun else begin let idx = !stack_size - 1 in let head_arr = !stack_head in match head_arr.(idx) with | None -> head_arr.(idx) <- Some noun | Some head -> let pos_arr = !stack_pos in let cell_pos = pos_arr.(idx) in head_arr.(idx) <- None; stack_size := idx; let cell = Cell (head, noun) in IntTable.replace backref_table cell_pos cell; emit cell end in while Option.is_none !result do let pos = reader_pos r in let tag0 = read_bit r in if not tag0 then begin (* Atom: tag bit 0 *) let on_value_bits bits = inspect_event (Cue_atom_begin { position = pos; value_bits = bits }) in let (value, total_bits, value_bits) = mat_decode ~on_value_bits r in let atom = Atom value in IntTable.replace backref_table pos atom; inspect_event (Cue_atom_end { position = pos; total_bits; value_bits }); emit atom end else begin let tag1 = read_bit r in if tag1 then begin (* Backref: tag bits 11 *) let (ref_pos, _width, _value_bits) = mat_decode r in let ref_int = Z.to_int ref_pos in inspect_event (Cue_backref { position = pos; ref_pos = ref_int }); match IntTable.find_opt backref_table ref_int with | Some noun -> emit noun | None -> raise (Invalid_argument (Printf.sprintf "cue: invalid backref to position %d" ref_int)) end else begin (* Cell: tag bits 01 – push frame and continue decoding head. *) push_frame pos end end done; report_final ~nouns:!nouns_processed; Option.get !result (** Convert bytes to a hex string for debugging *) let bytes_to_hex bytes = let len = Bytes.length bytes in let buf = Buffer.create (len * 2) in for i = 0 to len - 1 do Buffer.add_string buf (Printf.sprintf "%02x" (Bytes.get_uint8 bytes i)) done; Buffer.contents buf