summaryrefslogtreecommitdiff
path: root/ocaml/lib/serial.ml
blob: 9ededf1c5a2968f536dcb6851dd00ab7d7e34bb0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
(** Jam/cue serialization for nouns

    Based on the Vere implementation in pkg/noun/serial.c

    Jam encoding:
    - Atoms: tag bit 0, then mat-encoded value
    - Cells: tag bits 01, then recursively encode head and tail
    - Backrefs: tag bits 11, then mat-encoded position

    Mat encoding (length-prefixed):
    - For 0: just bit 1
    - For n > 0:
      - Let a = bit-width of n
      - Let b = bit-width of a
      - Encode: [1 repeated b times][0][a in b-1 bits][n in a bits]
*)

open Noun
open Bitstream

(** Mat-encode a number into the bitstream

    Mat encoding is a variable-length integer encoding:
    - 0 is encoded as a single 1 bit
    - For n > 0:
      - a = number of bits in n (met 0 n)
      - b = number of bits needed to represent a
      - Write b 0-bits, then one 1-bit
      - Write a in b-1 bits
      - Write n in a bits
*)
let mat_encode w n =
  if Z.equal n Z.zero then
    write_bit w true
  else begin
    let a = Z.numbits n in  (* bit-width of n *)
    let b = Z.numbits (Z.of_int a) in  (* bit-width of a *)

    (* Write b 0-bits followed by one 1-bit *)
    for _i = 1 to b do
      write_bit w false
    done;
    write_bit w true;

    (* Write a in b-1 bits *)
    write_bits w (Z.of_int a) (b - 1);

    (* Write n in a bits *)
    write_bits w n a
  end

(** Mat-decode from bitstream, returns (value, bits_read) *)
let mat_decode r =
  let start_pos = reader_pos r in

  (* Count leading 0 bits until we hit a 1 bit *)
  let b = ref 0 in
  while not (read_bit r) do
    b := !b + 1
  done;

  let b = !b in

  if b = 0 then
    (* Just a single 1 bit means 0 *)
    (Z.zero, reader_pos r - start_pos)
  else begin
    (* Read the length bits and compute a = 2^(b-1) + bits_read *)
    let bits_val = read_bits r (b - 1) in
    let a = Z.to_int (Z.add (Z.shift_left Z.one (b - 1)) bits_val) in

    (* Read n in a bits *)
    let n = read_bits r a in
    (n, reader_pos r - start_pos)
  end

(** Jam: serialize a noun to bytes

    Uses a hash table to track positions for backreferences.
    Returns the serialized bytes.
*)
let jam noun =
  let w = writer_create () in
  let positions = Hashtbl.create 256 in  (* noun -> bit position *)

  let rec jam_noun n =
    match n with
    | Atom a ->
        (* Check if we've seen this atom before *)
        begin match Hashtbl.find_opt positions n with
        | Some pos ->
            (* Backref might be smaller than re-encoding *)
            let atom_size = 1 + (Z.numbits a) in  (* rough estimate *)
            let backref_size = 2 + (Z.numbits (Z.of_int pos)) in

            if backref_size < atom_size then begin
              (* Encode backref: tag bits 11 *)
              write_bit w true;
              write_bit w true;
              mat_encode w (Z.of_int pos)
            end else begin
              (* Encode atom *)
              write_bit w false;
              mat_encode w a
            end
        | None ->
            (* Record position and encode atom *)
            Hashtbl.add positions n w.bit_pos;
            write_bit w false;
            mat_encode w a
        end

    | Cell (head, tail) ->
        (* Check for backref *)
        begin match Hashtbl.find_opt positions n with
        | Some pos ->
            (* Encode backref: tag bits 11 *)
            write_bit w true;
            write_bit w true;
            mat_encode w (Z.of_int pos)
        | None ->
            (* Record position and encode cell *)
            Hashtbl.add positions n w.bit_pos;
            (* Tag bits 01 for cell *)
            write_bit w true;
            write_bit w false;
            (* Recursively encode head and tail *)
            jam_noun head;
            jam_noun tail
        end
  in

  jam_noun noun;
  writer_to_bytes w

(** Cue: deserialize bytes to a noun

    Uses a hash table to store nouns by bit position for backreferences.
*)
let cue bytes =
  let r = reader_create bytes in
  let backref_table = Hashtbl.create 256 in  (* bit position -> noun *)

  let rec cue_noun () =
    let pos = reader_pos r in

    (* Read tag bit *)
    let tag0 = read_bit r in

    if not tag0 then begin
      (* Atom: tag bit 0 *)
      let (value, _width) = mat_decode r in
      let result = Atom value in
      Hashtbl.add backref_table pos result;
      result
    end else begin
      (* Read second tag bit *)
      let tag1 = read_bit r in

      if tag1 then begin
        (* Backref: tag bits 11 *)
        let (ref_pos, _width) = mat_decode r in
        let ref_pos = Z.to_int ref_pos in
        match Hashtbl.find_opt backref_table ref_pos with
        | Some noun -> noun
        | None -> raise (Invalid_argument (Printf.sprintf "cue: invalid backref to position %d" ref_pos))
      end else begin
        (* Cell: tag bits 01 *)
        let head = cue_noun () in
        let tail = cue_noun () in
        let result = Cell (head, tail) in
        Hashtbl.add backref_table pos result;
        result
      end
    end
  in

  cue_noun ()

(** Convert bytes to a hex string for debugging *)
let bytes_to_hex bytes =
  let len = Bytes.length bytes in
  let buf = Buffer.create (len * 2) in
  for i = 0 to len - 1 do
    Buffer.add_string buf (Printf.sprintf "%02x" (Bytes.get_uint8 bytes i))
  done;
  Buffer.contents buf