diff options
author | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700 |
commit | fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch) | |
tree | 51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /ocaml/serial.ml |
claude is gud
Diffstat (limited to 'ocaml/serial.ml')
-rw-r--r-- | ocaml/serial.ml | 191 |
1 files changed, 191 insertions, 0 deletions
diff --git a/ocaml/serial.ml b/ocaml/serial.ml new file mode 100644 index 0000000..039cd2f --- /dev/null +++ b/ocaml/serial.ml @@ -0,0 +1,191 @@ +(** Jam/cue serialization for nouns + + Based on the Vere implementation in pkg/noun/serial.c + + Jam encoding: + - Atoms: tag bit 0, then mat-encoded value + - Cells: tag bits 01, then recursively encode head and tail + - Backrefs: tag bits 11, then mat-encoded position + + Mat encoding (length-prefixed): + - For 0: just bit 1 + - For n > 0: + - Let a = bit-width of n + - Let b = bit-width of a + - Encode: [1 repeated b times][0][a in b-1 bits][n in a bits] +*) + +open Noun +open Bitstream + +(** Mat-encode a number into the bitstream + + Mat encoding is a variable-length integer encoding: + - 0 is encoded as a single 1 bit + - For n > 0: + - a = number of bits in n (met 0 n) + - b = number of bits needed to represent a + - Write b 1-bits, then a 0-bit + - Write a-1 in b-1 bits + - Write n in a bits +*) +let rec mat_encode w n = + if Z.equal n Z.zero then + write_bit w true + else begin + let a = Z.numbits n in (* bit-width of n *) + let b = Z.numbits (Z.of_int a) in (* bit-width of a *) + + (* Write b 1-bits followed by a 0-bit *) + for _i = 1 to b do + write_bit w true + done; + write_bit w false; + + (* Write a-1 in b-1 bits *) + write_bits w (Z.of_int (a - 1)) (b - 1); + + (* Write n in a bits *) + write_bits w n a + end + +(** Mat-decode from bitstream, returns (value, bits_read) *) +let rec mat_decode r = + let start_pos = reader_pos r in + + if not (read_bit r) then + (Z.zero, reader_pos r - start_pos) + else begin + (* Count leading 1 bits *) + let b = ref 1 in + while read_bit r do + b := !b + 1 + done; + + let b = !b in + + if b = 1 then + (* Special case: just "10" means 1 *) + (Z.one, reader_pos r - start_pos) + else begin + (* Read a-1 in b-1 bits *) + let a_minus_1 = read_bits r (b - 1) in + let a = Z.to_int (Z.add a_minus_1 Z.one) in + + (* Read n in a bits *) + let n = read_bits r a in + (n, reader_pos r - start_pos) + end + end + +(** Jam: serialize a noun to bytes + + Uses a hash table to track positions for backreferences. + Returns the serialized bytes. +*) +let jam noun = + let w = writer_create () in + let positions = Hashtbl.create 256 in (* noun -> bit position *) + + let rec jam_noun n = + match n with + | Atom a -> + (* Check if we've seen this atom before *) + begin match Hashtbl.find_opt positions n with + | Some pos -> + (* Backref might be smaller than re-encoding *) + let atom_size = 1 + (Z.numbits a) in (* rough estimate *) + let backref_size = 2 + (Z.numbits (Z.of_int pos)) in + + if backref_size < atom_size then begin + (* Encode backref: tag bits 11 *) + write_bit w true; + write_bit w true; + mat_encode w (Z.of_int pos) + end else begin + (* Encode atom *) + write_bit w false; + mat_encode w a + end + | None -> + (* Record position and encode atom *) + Hashtbl.add positions n w.bit_pos; + write_bit w false; + mat_encode w a + end + + | Cell (head, tail) -> + (* Check for backref *) + begin match Hashtbl.find_opt positions n with + | Some pos -> + (* Encode backref: tag bits 11 *) + write_bit w true; + write_bit w true; + mat_encode w (Z.of_int pos) + | None -> + (* Record position and encode cell *) + Hashtbl.add positions n w.bit_pos; + (* Tag bits 01 for cell *) + write_bit w true; + write_bit w false; + (* Recursively encode head and tail *) + jam_noun head; + jam_noun tail + end + in + + jam_noun noun; + writer_to_bytes w + +(** Cue: deserialize bytes to a noun + + Uses a hash table to store nouns by bit position for backreferences. +*) +let cue bytes = + let r = reader_create bytes in + let backref_table = Hashtbl.create 256 in (* bit position -> noun *) + + let rec cue_noun () = + let pos = reader_pos r in + + (* Read tag bit *) + let tag0 = read_bit r in + + if not tag0 then begin + (* Atom: tag bit 0 *) + let (value, _width) = mat_decode r in + let result = Atom value in + Hashtbl.add backref_table pos result; + result + end else begin + (* Read second tag bit *) + let tag1 = read_bit r in + + if tag1 then begin + (* Backref: tag bits 11 *) + let (ref_pos, _width) = mat_decode r in + let ref_pos = Z.to_int ref_pos in + match Hashtbl.find_opt backref_table ref_pos with + | Some noun -> noun + | None -> raise (Invalid_argument (Printf.sprintf "cue: invalid backref to position %d" ref_pos)) + end else begin + (* Cell: tag bits 01 *) + let head = cue_noun () in + let tail = cue_noun () in + let result = Cell (head, tail) in + Hashtbl.add backref_table pos result; + result + end + end + in + + cue_noun () + +(** Convert bytes to a hex string for debugging *) +let bytes_to_hex bytes = + let len = Bytes.length bytes in + let buf = Buffer.create (len * 2) in + for i = 0 to len - 1 do + Buffer.add_string buf (Printf.sprintf "%02x" (Bytes.get_uint8 bytes i)) + done; + Buffer.contents buf |