summaryrefslogtreecommitdiff
path: root/ocaml/lib
diff options
context:
space:
mode:
authorpolwex <polwex@sortug.com>2025-10-06 12:02:02 +0700
committerpolwex <polwex@sortug.com>2025-10-06 12:02:02 +0700
commit0ca55c93a7c21f81c8f21048889d1c9608a961c7 (patch)
tree5e43c94c6cae1f789b8892be737a38b10eddde83 /ocaml/lib
parent2d6c3bab18cf5063246fcdb869ae36132bbfe3fc (diff)
pretty good actuallycodex
Diffstat (limited to 'ocaml/lib')
-rw-r--r--ocaml/lib/bitstream.ml66
-rw-r--r--ocaml/lib/serial.ml181
2 files changed, 212 insertions, 35 deletions
diff --git a/ocaml/lib/bitstream.ml b/ocaml/lib/bitstream.ml
index 39bfd6a..e758c2a 100644
--- a/ocaml/lib/bitstream.ml
+++ b/ocaml/lib/bitstream.ml
@@ -67,6 +67,18 @@ let reader_create buf =
len = Bytes.length buf * 8;
}
+(* Lookup table for trailing zero counts within a byte. Value for 0 is 8 so
+ callers can detect the "no one-bit present" case. *)
+let trailing_zeros =
+ let tbl = Array.make 256 8 in
+ for i = 1 to 255 do
+ let rec count n value =
+ if value land 1 = 1 then n else count (n + 1) (value lsr 1)
+ in
+ tbl.(i) <- count 0 i
+ done;
+ tbl
+
(** Read a single bit *)
let read_bit r =
if r.bit_pos >= r.len then
@@ -80,7 +92,36 @@ let read_bit r =
(** Read multiple bits as a Z.t - optimized for bulk reads *)
let read_bits r nbits =
if nbits = 0 then Z.zero
- else if nbits <= 64 && (r.bit_pos mod 8 = 0) && nbits mod 8 = 0 then begin
+ else if nbits > 4096 then begin
+ (* Bulk path: copy bytes then convert to Z. *)
+ let byte_len = (nbits + 7) / 8 in
+ let buf = Bytes.make byte_len '\x00' in
+ let bits_done = ref 0 in
+
+ while !bits_done < nbits do
+ if (!bits_done land 7) = 0 && (r.bit_pos land 7) = 0 then begin
+ let rem_bits = nbits - !bits_done in
+ let bytes_to_copy = rem_bits / 8 in
+ if bytes_to_copy > 0 then begin
+ Bytes.blit r.buf (r.bit_pos / 8) buf (!bits_done / 8) bytes_to_copy;
+ r.bit_pos <- r.bit_pos + (bytes_to_copy * 8);
+ bits_done := !bits_done + (bytes_to_copy * 8)
+ end
+ end;
+
+ if !bits_done < nbits then begin
+ if read_bit r then begin
+ let byte_idx = !bits_done / 8 in
+ let bit_idx = !bits_done mod 8 in
+ let existing = Bytes.get_uint8 buf byte_idx in
+ Bytes.set_uint8 buf byte_idx (existing lor (1 lsl bit_idx))
+ end;
+ incr bits_done
+ end
+ done;
+
+ Z.of_bits (Bytes.unsafe_to_string buf)
+ end else if nbits <= 64 && (r.bit_pos mod 8 = 0) && nbits mod 8 = 0 then begin
(* Fast path: byte-aligned, <= 8 bytes *)
let byte_pos = r.bit_pos / 8 in
let num_bytes = nbits / 8 in
@@ -144,3 +185,26 @@ let reader_pos r = r.bit_pos
(** Check if at end of stream *)
let reader_at_end r = r.bit_pos >= r.len
+
+let count_zero_bits_until_one r =
+ let buf = r.buf in
+ let len_bits = r.len in
+ let rec scan count bit_pos =
+ if bit_pos >= len_bits then
+ raise (Invalid_argument "count_zero_bits_until_one: end of stream")
+ else begin
+ let byte_idx = bit_pos lsr 3 in
+ let bit_off = bit_pos land 7 in
+ let byte = Bytes.get_uint8 buf byte_idx in
+ let masked = byte lsr bit_off in
+ if masked <> 0 then begin
+ let tz = trailing_zeros.(masked land 0xff) in
+ let zeros = count + tz in
+ r.bit_pos <- bit_pos + tz + 1; (* skip zeros and the terminating 1 bit *)
+ zeros
+ end else
+ let remaining = 8 - bit_off in
+ scan (count + remaining) (bit_pos + remaining)
+ end
+ in
+ scan 0 r.bit_pos
diff --git a/ocaml/lib/serial.ml b/ocaml/lib/serial.ml
index 9ededf1..ac97421 100644
--- a/ocaml/lib/serial.ml
+++ b/ocaml/lib/serial.ml
@@ -18,6 +18,15 @@
open Noun
open Bitstream
+type cue_progress =
+ nouns:int -> bits:int -> depth:int -> max_depth:int -> unit
+
+type cue_event =
+ | Cue_atom_begin of { position : int; value_bits : int }
+ | Cue_atom_end of { position : int; total_bits : int; value_bits : int }
+ | Cue_backref of { position : int; ref_pos : int }
+ | Cue_emit of { nouns : int; depth : int; max_depth : int }
+
(** Mat-encode a number into the bitstream
Mat encoding is a variable-length integer encoding:
@@ -50,28 +59,26 @@ let mat_encode w n =
end
(** Mat-decode from bitstream, returns (value, bits_read) *)
-let mat_decode r =
+let mat_decode ?on_value_bits r =
let start_pos = reader_pos r in
- (* Count leading 0 bits until we hit a 1 bit *)
- let b = ref 0 in
- while not (read_bit r) do
- b := !b + 1
- done;
-
- let b = !b in
+ (* Count leading 0 bits until we hit the terminating 1 bit. *)
+ let b = Bitstream.count_zero_bits_until_one r in
if b = 0 then
(* Just a single 1 bit means 0 *)
- (Z.zero, reader_pos r - start_pos)
+ (Option.iter (fun f -> f 0) on_value_bits;
+ (Z.zero, reader_pos r - start_pos, 0))
else begin
(* Read the length bits and compute a = 2^(b-1) + bits_read *)
let bits_val = read_bits r (b - 1) in
let a = Z.to_int (Z.add (Z.shift_left Z.one (b - 1)) bits_val) in
+ Option.iter (fun f -> f a) on_value_bits;
+
(* Read n in a bits *)
let n = read_bits r a in
- (n, reader_pos r - start_pos)
+ (n, reader_pos r - start_pos, a)
end
(** Jam: serialize a noun to bytes
@@ -137,45 +144,151 @@ let jam noun =
Uses a hash table to store nouns by bit position for backreferences.
*)
-let cue bytes =
+module IntTable = Hashtbl.Make (struct
+ type t = int
+
+ let equal = Int.equal
+ let hash x = x land max_int
+end)
+
+let cue ?progress ?(progress_interval = 200_000) ?inspect bytes =
let r = reader_create bytes in
- let backref_table = Hashtbl.create 256 in (* bit position -> noun *)
- let rec cue_noun () =
- let pos = reader_pos r in
+ (* Pre-size the backref table based on payload size to minimise rehashing. *)
+ let estimated_nouns =
+ let approx = Bytes.length bytes / 8 in
+ if approx < 1024 then 1024 else approx
+ in
+ let backref_table = IntTable.create estimated_nouns in
+
+ (* Manual stack used to eliminate recursion while tracking unfinished cells. *)
+ let initial_stack_capacity = 1024 in
+ let stack_pos = ref (Array.make initial_stack_capacity 0) in
+ let stack_head = ref (Array.make initial_stack_capacity None) in
+ let stack_size = ref 0 in
+ let max_depth = ref 0 in
+
+ (* Noun counter is used for periodic progress callbacks. *)
+ let nouns_processed = ref 0 in
+
+ let report_tick, report_final =
+ match progress with
+ | None -> ( (fun ~nouns:_ -> ()), (fun ~nouns:_ -> ()) )
+ | Some callback ->
+ let interval = if progress_interval <= 0 then 1 else progress_interval in
+ let next_report = ref interval in
+
+ let call_callback nouns =
+ callback
+ ~nouns
+ ~bits:(reader_pos r)
+ ~depth:!stack_size
+ ~max_depth:!max_depth
+ in
+
+ let tick ~nouns =
+ if nouns >= !next_report then begin
+ call_callback nouns;
+ next_report := nouns + interval
+ end
+ in
+
+ let final ~nouns =
+ if nouns < !next_report then call_callback nouns
+ in
+
+ (tick, final)
+ in
- (* Read tag bit *)
+ let inspect_event = match inspect with Some f -> f | None -> fun _ -> () in
+
+ let grow_stack () =
+ let old_pos = !stack_pos in
+ let old_head = !stack_head in
+ let old_len = Array.length old_pos in
+ let new_len = old_len * 2 in
+ let new_pos = Array.make new_len 0 in
+ let new_head = Array.make new_len None in
+ Array.blit old_pos 0 new_pos 0 old_len;
+ Array.blit old_head 0 new_head 0 old_len;
+ stack_pos := new_pos;
+ stack_head := new_head
+ in
+
+ let push_frame pos =
+ if !stack_size = Array.length !stack_pos then grow_stack ();
+ let idx = !stack_size in
+ let pos_arr = !stack_pos in
+ let head_arr = !stack_head in
+ pos_arr.(idx) <- pos;
+ head_arr.(idx) <- None;
+ stack_size := idx + 1;
+ if !stack_size > !max_depth then max_depth := !stack_size
+ in
+
+ let result = ref None in
+
+ let rec emit noun =
+ incr nouns_processed;
+ report_tick ~nouns:!nouns_processed;
+ inspect_event (Cue_emit { nouns = !nouns_processed; depth = !stack_size; max_depth = !max_depth });
+
+ if !stack_size = 0 then
+ result := Some noun
+ else begin
+ let idx = !stack_size - 1 in
+ let head_arr = !stack_head in
+ match head_arr.(idx) with
+ | None ->
+ head_arr.(idx) <- Some noun
+ | Some head ->
+ let pos_arr = !stack_pos in
+ let cell_pos = pos_arr.(idx) in
+ head_arr.(idx) <- None;
+ stack_size := idx;
+ let cell = Cell (head, noun) in
+ IntTable.replace backref_table cell_pos cell;
+ emit cell
+ end
+ in
+
+ while Option.is_none !result do
+ let pos = reader_pos r in
let tag0 = read_bit r in
if not tag0 then begin
(* Atom: tag bit 0 *)
- let (value, _width) = mat_decode r in
- let result = Atom value in
- Hashtbl.add backref_table pos result;
- result
+ let on_value_bits bits =
+ inspect_event (Cue_atom_begin { position = pos; value_bits = bits })
+ in
+ let (value, total_bits, value_bits) = mat_decode ~on_value_bits r in
+ let atom = Atom value in
+ IntTable.replace backref_table pos atom;
+ inspect_event (Cue_atom_end { position = pos; total_bits; value_bits });
+ emit atom
end else begin
- (* Read second tag bit *)
let tag1 = read_bit r in
-
if tag1 then begin
(* Backref: tag bits 11 *)
- let (ref_pos, _width) = mat_decode r in
- let ref_pos = Z.to_int ref_pos in
- match Hashtbl.find_opt backref_table ref_pos with
- | Some noun -> noun
- | None -> raise (Invalid_argument (Printf.sprintf "cue: invalid backref to position %d" ref_pos))
+ let (ref_pos, _width, _value_bits) = mat_decode r in
+ let ref_int = Z.to_int ref_pos in
+ inspect_event (Cue_backref { position = pos; ref_pos = ref_int });
+ match IntTable.find_opt backref_table ref_int with
+ | Some noun -> emit noun
+ | None ->
+ raise
+ (Invalid_argument
+ (Printf.sprintf "cue: invalid backref to position %d" ref_int))
end else begin
- (* Cell: tag bits 01 *)
- let head = cue_noun () in
- let tail = cue_noun () in
- let result = Cell (head, tail) in
- Hashtbl.add backref_table pos result;
- result
+ (* Cell: tag bits 01 – push frame and continue decoding head. *)
+ push_frame pos
end
end
- in
+ done;
+
+ report_final ~nouns:!nouns_processed;
- cue_noun ()
+ Option.get !result
(** Convert bytes to a hex string for debugging *)
let bytes_to_hex bytes =