diff options
author | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700 |
commit | fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch) | |
tree | 51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /vere/ext/nasm/perllib/phash.ph |
claude is gud
Diffstat (limited to 'vere/ext/nasm/perllib/phash.ph')
-rw-r--r-- | vere/ext/nasm/perllib/phash.ph | 203 |
1 files changed, 203 insertions, 0 deletions
diff --git a/vere/ext/nasm/perllib/phash.ph b/vere/ext/nasm/perllib/phash.ph new file mode 100644 index 0000000..b5e4411 --- /dev/null +++ b/vere/ext/nasm/perllib/phash.ph @@ -0,0 +1,203 @@ +# -*- perl -*- +# +# Perfect Minimal Hash Generator written in Perl, which produces +# C output. +# + +require 'random_sv_vectors.ph'; +require 'crc64.ph'; + +# +# Compute the prehash for a key +# +# prehash(key, sv, N) +# +sub prehash($$$) { + my($key, $n, $sv) = @_; + my @c = crc64($sv, $key); + my $nmask = ($n << 1) - 2; + + # Create a bipartite graph... + $k1 = ($c[1] & $nmask) + 0; # low word + $k2 = ($c[0] & $nmask) + 1; # high word + + return ($k1, $k2); +} + +# +# Walk the assignment graph, return true on success +# +sub walk_graph($$$$) { + my($nodeval,$nodeneigh,$n,$v) = @_; + my $nx; + + # print STDERR "Vertex $n value $v\n"; + $$nodeval[$n] = $v; + + foreach $nx (@{$$nodeneigh[$n]}) { + # $nx -> [neigh, hash] + my ($o, $e) = @$nx; + + # print STDERR "Edge $n,$o value $e: "; + my $ov; + if (defined($ov = $$nodeval[$o])) { + if ($v+$ov != $e) { + # Cyclic graph with collision + # print STDERR "error, should be ", $v+$ov, "\n"; + return 0; + } else { + # print STDERR "ok\n"; + } + } else { + return 0 unless (walk_graph($nodeval, $nodeneigh, $o, $e-$v)); + } + } + return 1; +} + +# +# Generate the function assuming a given N. +# +# gen_hash_n(N, sv, \%data, run) +# +sub gen_hash_n($$$$) { + my($n, $sv, $href, $run) = @_; + my @keys = keys(%{$href}); + my $i; + my $gr; + my ($k, $v); + my $gsize = 2*$n; + my @nodeval; + my @nodeneigh; + my %edges; + + for ($i = 0; $i < $gsize; $i++) { + $nodeneigh[$i] = []; + } + + %edges = (); + foreach $k (@keys) { + my ($pf1, $pf2) = prehash($k, $n, $sv); + ($pf1,$pf2) = ($pf2,$pf1) if ($pf1 > $pf2); # Canonicalize order + + my $pf = "$pf1,$pf2"; + my $e = ${$href}{$k}; + my $xkey; + + if (defined($xkey = $edges{$pf})) { + next if ($e == ${$href}{$xkey}); # Duplicate hash, safe to ignore + if (defined($run)) { + print STDERR "$run: Collision: $pf: $k with $xkey\n"; + } + return; + } + + # print STDERR "Edge $pf value $e from $k\n"; + + $edges{$pf} = $k; + push(@{$nodeneigh[$pf1]}, [$pf2, $e]); + push(@{$nodeneigh[$pf2]}, [$pf1, $e]); + } + + # Now we need to assign values to each vertex, so that for each + # edge, the sum of the values for the two vertices give the value + # for the edge (which is our hash index.) If we find an impossible + # situation, the graph was cyclic. + @nodeval = (undef) x $gsize; + + for ($i = 0; $i < $gsize; $i++) { + if (scalar(@{$nodeneigh[$i]})) { + # This vertex has neighbors (is used) + if (!defined($nodeval[$i])) { + # First vertex in a cluster + unless (walk_graph(\@nodeval, \@nodeneigh, $i, 0)) { + if (defined($run)) { + print STDERR "$run: Graph is cyclic\n"; + } + return; + } + } + } + } + + # for ($i = 0; $i < $n; $i++) { + # print STDERR "Vertex ", $i, ": ", $g[$i], "\n"; + # } + + if (defined($run)) { + printf STDERR "$run: Done: n = $n, sv = [0x%08x, 0x%08x]\n", + $$sv[0], $$sv[1]; + } + + return ($n, $sv, \@nodeval); +} + +# +# Driver for generating the function +# +# gen_perfect_hash(\%data) +# +sub gen_perfect_hash($) { + my($href) = @_; + my @keys = keys(%{$href}); + my @hashinfo; + my ($n, $i, $j, $sv, $maxj); + my $run = 1; + + # Minimal power of 2 value for N with enough wiggle room. + # The scaling constant must be larger than 0.5 in order for the + # algorithm to ever terminate. The higher the scaling constant, + # the more space does the hash take up, but the less likely is it + # that an invalid token will require a string comparison. + my $room = int(scalar(@keys)*1.6); + $n = 1; + while ($n < $room) { + $n <<= 1; + } + + # Number of times to try... + $maxj = scalar @random_sv_vectors; + + for ($i = 0; $i < 4; $i++) { + printf STDERR "%d vectors, trying n = %d...\n", + scalar @keys, $n; + for ($j = 0; $j < $maxj; $j++) { + $sv = $random_sv_vectors[$j]; + @hashinfo = gen_hash_n($n, $sv, $href, $run++); + return @hashinfo if (@hashinfo); + } + $n <<= 1; + } + + return; +} + +# +# Verify that the hash table is actually correct... +# +sub verify_hash_table($$) +{ + my ($href, $hashinfo) = @_; + my ($n, $sv, $g) = @{$hashinfo}; + my $k; + my $err = 0; + + foreach $k (keys(%$href)) { + my ($pf1, $pf2) = prehash($k, $n, $sv); + my $g1 = ${$g}[$pf1]; + my $g2 = ${$g}[$pf2]; + + if ($g1+$g2 != ${$href}{$k}) { + printf STDERR "%s(%d,%d): %d+%d = %d != %d\n", + $k, $pf1, $pf2, $g1, $g2, $g1+$g2, ${$href}{$k}; + $err = 1; + } else { + # printf STDERR "%s: %d+%d = %d ok\n", + # $k, $g1, $g2, $g1+$g2; + } + } + + die "$0: hash validation error\n" if ($err); +} + +1; |