1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
|
(** Jam/cue serialization for nouns
Based on the Vere implementation in pkg/noun/serial.c
Jam encoding:
- Atoms: tag bit 0, then mat-encoded value
- Cells: tag bits 01, then recursively encode head and tail
- Backrefs: tag bits 11, then mat-encoded position
Mat encoding (length-prefixed):
- For 0: just bit 1
- For n > 0:
- Let a = bit-width of n
- Let b = bit-width of a
- Encode: [1 repeated b times][0][a in b-1 bits][n in a bits]
*)
open Noun
open Bitstream
(** Mat-encode a number into the bitstream
Mat encoding is a variable-length integer encoding:
- 0 is encoded as a single 1 bit
- For n > 0:
- a = number of bits in n (met 0 n)
- b = number of bits needed to represent a
- Write b 1-bits, then a 0-bit
- Write a-1 in b-1 bits
- Write n in a bits
*)
let rec mat_encode w n =
if Z.equal n Z.zero then
write_bit w true
else begin
let a = Z.numbits n in (* bit-width of n *)
let b = Z.numbits (Z.of_int a) in (* bit-width of a *)
(* Write b 1-bits followed by a 0-bit *)
for _i = 1 to b do
write_bit w true
done;
write_bit w false;
(* Write a-1 in b-1 bits *)
write_bits w (Z.of_int (a - 1)) (b - 1);
(* Write n in a bits *)
write_bits w n a
end
(** Mat-decode from bitstream, returns (value, bits_read) *)
let rec mat_decode r =
let start_pos = reader_pos r in
if not (read_bit r) then
(Z.zero, reader_pos r - start_pos)
else begin
(* Count leading 1 bits *)
let b = ref 1 in
while read_bit r do
b := !b + 1
done;
let b = !b in
if b = 1 then
(* Special case: just "10" means 1 *)
(Z.one, reader_pos r - start_pos)
else begin
(* Read a-1 in b-1 bits *)
let a_minus_1 = read_bits r (b - 1) in
let a = Z.to_int (Z.add a_minus_1 Z.one) in
(* Read n in a bits *)
let n = read_bits r a in
(n, reader_pos r - start_pos)
end
end
(** Jam: serialize a noun to bytes
Uses a hash table to track positions for backreferences.
Returns the serialized bytes.
*)
let jam noun =
let w = writer_create () in
let positions = Hashtbl.create 256 in (* noun -> bit position *)
let rec jam_noun n =
match n with
| Atom a ->
(* Check if we've seen this atom before *)
begin match Hashtbl.find_opt positions n with
| Some pos ->
(* Backref might be smaller than re-encoding *)
let atom_size = 1 + (Z.numbits a) in (* rough estimate *)
let backref_size = 2 + (Z.numbits (Z.of_int pos)) in
if backref_size < atom_size then begin
(* Encode backref: tag bits 11 *)
write_bit w true;
write_bit w true;
mat_encode w (Z.of_int pos)
end else begin
(* Encode atom *)
write_bit w false;
mat_encode w a
end
| None ->
(* Record position and encode atom *)
Hashtbl.add positions n w.bit_pos;
write_bit w false;
mat_encode w a
end
| Cell (head, tail) ->
(* Check for backref *)
begin match Hashtbl.find_opt positions n with
| Some pos ->
(* Encode backref: tag bits 11 *)
write_bit w true;
write_bit w true;
mat_encode w (Z.of_int pos)
| None ->
(* Record position and encode cell *)
Hashtbl.add positions n w.bit_pos;
(* Tag bits 01 for cell *)
write_bit w true;
write_bit w false;
(* Recursively encode head and tail *)
jam_noun head;
jam_noun tail
end
in
jam_noun noun;
writer_to_bytes w
(** Cue: deserialize bytes to a noun
Uses a hash table to store nouns by bit position for backreferences.
*)
let cue bytes =
let r = reader_create bytes in
let backref_table = Hashtbl.create 256 in (* bit position -> noun *)
let rec cue_noun () =
let pos = reader_pos r in
(* Read tag bit *)
let tag0 = read_bit r in
if not tag0 then begin
(* Atom: tag bit 0 *)
let (value, _width) = mat_decode r in
let result = Atom value in
Hashtbl.add backref_table pos result;
result
end else begin
(* Read second tag bit *)
let tag1 = read_bit r in
if tag1 then begin
(* Backref: tag bits 11 *)
let (ref_pos, _width) = mat_decode r in
let ref_pos = Z.to_int ref_pos in
match Hashtbl.find_opt backref_table ref_pos with
| Some noun -> noun
| None -> raise (Invalid_argument (Printf.sprintf "cue: invalid backref to position %d" ref_pos))
end else begin
(* Cell: tag bits 01 *)
let head = cue_noun () in
let tail = cue_noun () in
let result = Cell (head, tail) in
Hashtbl.add backref_table pos result;
result
end
end
in
cue_noun ()
(** Convert bytes to a hex string for debugging *)
let bytes_to_hex bytes =
let len = Bytes.length bytes in
let buf = Buffer.create (len * 2) in
for i = 0 to len - 1 do
Buffer.add_string buf (Printf.sprintf "%02x" (Bytes.get_uint8 bytes i))
done;
Buffer.contents buf
|