diff options
Diffstat (limited to 'ocaml/lib')
-rw-r--r-- | ocaml/lib/bitstream.ml | 66 | ||||
-rw-r--r-- | ocaml/lib/serial.ml | 181 |
2 files changed, 212 insertions, 35 deletions
diff --git a/ocaml/lib/bitstream.ml b/ocaml/lib/bitstream.ml index 39bfd6a..e758c2a 100644 --- a/ocaml/lib/bitstream.ml +++ b/ocaml/lib/bitstream.ml @@ -67,6 +67,18 @@ let reader_create buf = len = Bytes.length buf * 8; } +(* Lookup table for trailing zero counts within a byte. Value for 0 is 8 so + callers can detect the "no one-bit present" case. *) +let trailing_zeros = + let tbl = Array.make 256 8 in + for i = 1 to 255 do + let rec count n value = + if value land 1 = 1 then n else count (n + 1) (value lsr 1) + in + tbl.(i) <- count 0 i + done; + tbl + (** Read a single bit *) let read_bit r = if r.bit_pos >= r.len then @@ -80,7 +92,36 @@ let read_bit r = (** Read multiple bits as a Z.t - optimized for bulk reads *) let read_bits r nbits = if nbits = 0 then Z.zero - else if nbits <= 64 && (r.bit_pos mod 8 = 0) && nbits mod 8 = 0 then begin + else if nbits > 4096 then begin + (* Bulk path: copy bytes then convert to Z. *) + let byte_len = (nbits + 7) / 8 in + let buf = Bytes.make byte_len '\x00' in + let bits_done = ref 0 in + + while !bits_done < nbits do + if (!bits_done land 7) = 0 && (r.bit_pos land 7) = 0 then begin + let rem_bits = nbits - !bits_done in + let bytes_to_copy = rem_bits / 8 in + if bytes_to_copy > 0 then begin + Bytes.blit r.buf (r.bit_pos / 8) buf (!bits_done / 8) bytes_to_copy; + r.bit_pos <- r.bit_pos + (bytes_to_copy * 8); + bits_done := !bits_done + (bytes_to_copy * 8) + end + end; + + if !bits_done < nbits then begin + if read_bit r then begin + let byte_idx = !bits_done / 8 in + let bit_idx = !bits_done mod 8 in + let existing = Bytes.get_uint8 buf byte_idx in + Bytes.set_uint8 buf byte_idx (existing lor (1 lsl bit_idx)) + end; + incr bits_done + end + done; + + Z.of_bits (Bytes.unsafe_to_string buf) + end else if nbits <= 64 && (r.bit_pos mod 8 = 0) && nbits mod 8 = 0 then begin (* Fast path: byte-aligned, <= 8 bytes *) let byte_pos = r.bit_pos / 8 in let num_bytes = nbits / 8 in @@ -144,3 +185,26 @@ let reader_pos r = r.bit_pos (** Check if at end of stream *) let reader_at_end r = r.bit_pos >= r.len + +let count_zero_bits_until_one r = + let buf = r.buf in + let len_bits = r.len in + let rec scan count bit_pos = + if bit_pos >= len_bits then + raise (Invalid_argument "count_zero_bits_until_one: end of stream") + else begin + let byte_idx = bit_pos lsr 3 in + let bit_off = bit_pos land 7 in + let byte = Bytes.get_uint8 buf byte_idx in + let masked = byte lsr bit_off in + if masked <> 0 then begin + let tz = trailing_zeros.(masked land 0xff) in + let zeros = count + tz in + r.bit_pos <- bit_pos + tz + 1; (* skip zeros and the terminating 1 bit *) + zeros + end else + let remaining = 8 - bit_off in + scan (count + remaining) (bit_pos + remaining) + end + in + scan 0 r.bit_pos diff --git a/ocaml/lib/serial.ml b/ocaml/lib/serial.ml index 9ededf1..ac97421 100644 --- a/ocaml/lib/serial.ml +++ b/ocaml/lib/serial.ml @@ -18,6 +18,15 @@ open Noun open Bitstream +type cue_progress = + nouns:int -> bits:int -> depth:int -> max_depth:int -> unit + +type cue_event = + | Cue_atom_begin of { position : int; value_bits : int } + | Cue_atom_end of { position : int; total_bits : int; value_bits : int } + | Cue_backref of { position : int; ref_pos : int } + | Cue_emit of { nouns : int; depth : int; max_depth : int } + (** Mat-encode a number into the bitstream Mat encoding is a variable-length integer encoding: @@ -50,28 +59,26 @@ let mat_encode w n = end (** Mat-decode from bitstream, returns (value, bits_read) *) -let mat_decode r = +let mat_decode ?on_value_bits r = let start_pos = reader_pos r in - (* Count leading 0 bits until we hit a 1 bit *) - let b = ref 0 in - while not (read_bit r) do - b := !b + 1 - done; - - let b = !b in + (* Count leading 0 bits until we hit the terminating 1 bit. *) + let b = Bitstream.count_zero_bits_until_one r in if b = 0 then (* Just a single 1 bit means 0 *) - (Z.zero, reader_pos r - start_pos) + (Option.iter (fun f -> f 0) on_value_bits; + (Z.zero, reader_pos r - start_pos, 0)) else begin (* Read the length bits and compute a = 2^(b-1) + bits_read *) let bits_val = read_bits r (b - 1) in let a = Z.to_int (Z.add (Z.shift_left Z.one (b - 1)) bits_val) in + Option.iter (fun f -> f a) on_value_bits; + (* Read n in a bits *) let n = read_bits r a in - (n, reader_pos r - start_pos) + (n, reader_pos r - start_pos, a) end (** Jam: serialize a noun to bytes @@ -137,45 +144,151 @@ let jam noun = Uses a hash table to store nouns by bit position for backreferences. *) -let cue bytes = +module IntTable = Hashtbl.Make (struct + type t = int + + let equal = Int.equal + let hash x = x land max_int +end) + +let cue ?progress ?(progress_interval = 200_000) ?inspect bytes = let r = reader_create bytes in - let backref_table = Hashtbl.create 256 in (* bit position -> noun *) - let rec cue_noun () = - let pos = reader_pos r in + (* Pre-size the backref table based on payload size to minimise rehashing. *) + let estimated_nouns = + let approx = Bytes.length bytes / 8 in + if approx < 1024 then 1024 else approx + in + let backref_table = IntTable.create estimated_nouns in + + (* Manual stack used to eliminate recursion while tracking unfinished cells. *) + let initial_stack_capacity = 1024 in + let stack_pos = ref (Array.make initial_stack_capacity 0) in + let stack_head = ref (Array.make initial_stack_capacity None) in + let stack_size = ref 0 in + let max_depth = ref 0 in + + (* Noun counter is used for periodic progress callbacks. *) + let nouns_processed = ref 0 in + + let report_tick, report_final = + match progress with + | None -> ( (fun ~nouns:_ -> ()), (fun ~nouns:_ -> ()) ) + | Some callback -> + let interval = if progress_interval <= 0 then 1 else progress_interval in + let next_report = ref interval in + + let call_callback nouns = + callback + ~nouns + ~bits:(reader_pos r) + ~depth:!stack_size + ~max_depth:!max_depth + in + + let tick ~nouns = + if nouns >= !next_report then begin + call_callback nouns; + next_report := nouns + interval + end + in + + let final ~nouns = + if nouns < !next_report then call_callback nouns + in + + (tick, final) + in - (* Read tag bit *) + let inspect_event = match inspect with Some f -> f | None -> fun _ -> () in + + let grow_stack () = + let old_pos = !stack_pos in + let old_head = !stack_head in + let old_len = Array.length old_pos in + let new_len = old_len * 2 in + let new_pos = Array.make new_len 0 in + let new_head = Array.make new_len None in + Array.blit old_pos 0 new_pos 0 old_len; + Array.blit old_head 0 new_head 0 old_len; + stack_pos := new_pos; + stack_head := new_head + in + + let push_frame pos = + if !stack_size = Array.length !stack_pos then grow_stack (); + let idx = !stack_size in + let pos_arr = !stack_pos in + let head_arr = !stack_head in + pos_arr.(idx) <- pos; + head_arr.(idx) <- None; + stack_size := idx + 1; + if !stack_size > !max_depth then max_depth := !stack_size + in + + let result = ref None in + + let rec emit noun = + incr nouns_processed; + report_tick ~nouns:!nouns_processed; + inspect_event (Cue_emit { nouns = !nouns_processed; depth = !stack_size; max_depth = !max_depth }); + + if !stack_size = 0 then + result := Some noun + else begin + let idx = !stack_size - 1 in + let head_arr = !stack_head in + match head_arr.(idx) with + | None -> + head_arr.(idx) <- Some noun + | Some head -> + let pos_arr = !stack_pos in + let cell_pos = pos_arr.(idx) in + head_arr.(idx) <- None; + stack_size := idx; + let cell = Cell (head, noun) in + IntTable.replace backref_table cell_pos cell; + emit cell + end + in + + while Option.is_none !result do + let pos = reader_pos r in let tag0 = read_bit r in if not tag0 then begin (* Atom: tag bit 0 *) - let (value, _width) = mat_decode r in - let result = Atom value in - Hashtbl.add backref_table pos result; - result + let on_value_bits bits = + inspect_event (Cue_atom_begin { position = pos; value_bits = bits }) + in + let (value, total_bits, value_bits) = mat_decode ~on_value_bits r in + let atom = Atom value in + IntTable.replace backref_table pos atom; + inspect_event (Cue_atom_end { position = pos; total_bits; value_bits }); + emit atom end else begin - (* Read second tag bit *) let tag1 = read_bit r in - if tag1 then begin (* Backref: tag bits 11 *) - let (ref_pos, _width) = mat_decode r in - let ref_pos = Z.to_int ref_pos in - match Hashtbl.find_opt backref_table ref_pos with - | Some noun -> noun - | None -> raise (Invalid_argument (Printf.sprintf "cue: invalid backref to position %d" ref_pos)) + let (ref_pos, _width, _value_bits) = mat_decode r in + let ref_int = Z.to_int ref_pos in + inspect_event (Cue_backref { position = pos; ref_pos = ref_int }); + match IntTable.find_opt backref_table ref_int with + | Some noun -> emit noun + | None -> + raise + (Invalid_argument + (Printf.sprintf "cue: invalid backref to position %d" ref_int)) end else begin - (* Cell: tag bits 01 *) - let head = cue_noun () in - let tail = cue_noun () in - let result = Cell (head, tail) in - Hashtbl.add backref_table pos result; - result + (* Cell: tag bits 01 – push frame and continue decoding head. *) + push_frame pos end end - in + done; + + report_final ~nouns:!nouns_processed; - cue_noun () + Option.get !result (** Convert bytes to a hex string for debugging *) let bytes_to_hex bytes = |