summaryrefslogtreecommitdiff
path: root/ocaml/serial.ml
diff options
context:
space:
mode:
Diffstat (limited to 'ocaml/serial.ml')
-rw-r--r--ocaml/serial.ml191
1 files changed, 191 insertions, 0 deletions
diff --git a/ocaml/serial.ml b/ocaml/serial.ml
new file mode 100644
index 0000000..039cd2f
--- /dev/null
+++ b/ocaml/serial.ml
@@ -0,0 +1,191 @@
+(** Jam/cue serialization for nouns
+
+ Based on the Vere implementation in pkg/noun/serial.c
+
+ Jam encoding:
+ - Atoms: tag bit 0, then mat-encoded value
+ - Cells: tag bits 01, then recursively encode head and tail
+ - Backrefs: tag bits 11, then mat-encoded position
+
+ Mat encoding (length-prefixed):
+ - For 0: just bit 1
+ - For n > 0:
+ - Let a = bit-width of n
+ - Let b = bit-width of a
+ - Encode: [1 repeated b times][0][a in b-1 bits][n in a bits]
+*)
+
+open Noun
+open Bitstream
+
+(** Mat-encode a number into the bitstream
+
+ Mat encoding is a variable-length integer encoding:
+ - 0 is encoded as a single 1 bit
+ - For n > 0:
+ - a = number of bits in n (met 0 n)
+ - b = number of bits needed to represent a
+ - Write b 1-bits, then a 0-bit
+ - Write a-1 in b-1 bits
+ - Write n in a bits
+*)
+let rec mat_encode w n =
+ if Z.equal n Z.zero then
+ write_bit w true
+ else begin
+ let a = Z.numbits n in (* bit-width of n *)
+ let b = Z.numbits (Z.of_int a) in (* bit-width of a *)
+
+ (* Write b 1-bits followed by a 0-bit *)
+ for _i = 1 to b do
+ write_bit w true
+ done;
+ write_bit w false;
+
+ (* Write a-1 in b-1 bits *)
+ write_bits w (Z.of_int (a - 1)) (b - 1);
+
+ (* Write n in a bits *)
+ write_bits w n a
+ end
+
+(** Mat-decode from bitstream, returns (value, bits_read) *)
+let rec mat_decode r =
+ let start_pos = reader_pos r in
+
+ if not (read_bit r) then
+ (Z.zero, reader_pos r - start_pos)
+ else begin
+ (* Count leading 1 bits *)
+ let b = ref 1 in
+ while read_bit r do
+ b := !b + 1
+ done;
+
+ let b = !b in
+
+ if b = 1 then
+ (* Special case: just "10" means 1 *)
+ (Z.one, reader_pos r - start_pos)
+ else begin
+ (* Read a-1 in b-1 bits *)
+ let a_minus_1 = read_bits r (b - 1) in
+ let a = Z.to_int (Z.add a_minus_1 Z.one) in
+
+ (* Read n in a bits *)
+ let n = read_bits r a in
+ (n, reader_pos r - start_pos)
+ end
+ end
+
+(** Jam: serialize a noun to bytes
+
+ Uses a hash table to track positions for backreferences.
+ Returns the serialized bytes.
+*)
+let jam noun =
+ let w = writer_create () in
+ let positions = Hashtbl.create 256 in (* noun -> bit position *)
+
+ let rec jam_noun n =
+ match n with
+ | Atom a ->
+ (* Check if we've seen this atom before *)
+ begin match Hashtbl.find_opt positions n with
+ | Some pos ->
+ (* Backref might be smaller than re-encoding *)
+ let atom_size = 1 + (Z.numbits a) in (* rough estimate *)
+ let backref_size = 2 + (Z.numbits (Z.of_int pos)) in
+
+ if backref_size < atom_size then begin
+ (* Encode backref: tag bits 11 *)
+ write_bit w true;
+ write_bit w true;
+ mat_encode w (Z.of_int pos)
+ end else begin
+ (* Encode atom *)
+ write_bit w false;
+ mat_encode w a
+ end
+ | None ->
+ (* Record position and encode atom *)
+ Hashtbl.add positions n w.bit_pos;
+ write_bit w false;
+ mat_encode w a
+ end
+
+ | Cell (head, tail) ->
+ (* Check for backref *)
+ begin match Hashtbl.find_opt positions n with
+ | Some pos ->
+ (* Encode backref: tag bits 11 *)
+ write_bit w true;
+ write_bit w true;
+ mat_encode w (Z.of_int pos)
+ | None ->
+ (* Record position and encode cell *)
+ Hashtbl.add positions n w.bit_pos;
+ (* Tag bits 01 for cell *)
+ write_bit w true;
+ write_bit w false;
+ (* Recursively encode head and tail *)
+ jam_noun head;
+ jam_noun tail
+ end
+ in
+
+ jam_noun noun;
+ writer_to_bytes w
+
+(** Cue: deserialize bytes to a noun
+
+ Uses a hash table to store nouns by bit position for backreferences.
+*)
+let cue bytes =
+ let r = reader_create bytes in
+ let backref_table = Hashtbl.create 256 in (* bit position -> noun *)
+
+ let rec cue_noun () =
+ let pos = reader_pos r in
+
+ (* Read tag bit *)
+ let tag0 = read_bit r in
+
+ if not tag0 then begin
+ (* Atom: tag bit 0 *)
+ let (value, _width) = mat_decode r in
+ let result = Atom value in
+ Hashtbl.add backref_table pos result;
+ result
+ end else begin
+ (* Read second tag bit *)
+ let tag1 = read_bit r in
+
+ if tag1 then begin
+ (* Backref: tag bits 11 *)
+ let (ref_pos, _width) = mat_decode r in
+ let ref_pos = Z.to_int ref_pos in
+ match Hashtbl.find_opt backref_table ref_pos with
+ | Some noun -> noun
+ | None -> raise (Invalid_argument (Printf.sprintf "cue: invalid backref to position %d" ref_pos))
+ end else begin
+ (* Cell: tag bits 01 *)
+ let head = cue_noun () in
+ let tail = cue_noun () in
+ let result = Cell (head, tail) in
+ Hashtbl.add backref_table pos result;
+ result
+ end
+ end
+ in
+
+ cue_noun ()
+
+(** Convert bytes to a hex string for debugging *)
+let bytes_to_hex bytes =
+ let len = Bytes.length bytes in
+ let buf = Buffer.create (len * 2) in
+ for i = 0 to len - 1 do
+ Buffer.add_string buf (Printf.sprintf "%02x" (Bytes.get_uint8 bytes i))
+ done;
+ Buffer.contents buf