summaryrefslogtreecommitdiff
path: root/vere/ext/gmp/gen/aarch64-macos/mpn/hamdist.s
diff options
context:
space:
mode:
author polwex <polwex@sortug.com> 2025-10-05 21:56:51 +0700
committer polwex <polwex@sortug.com> 2025-10-05 21:56:51 +0700
commit fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch)
tree 51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /vere/ext/gmp/gen/aarch64-macos/mpn/hamdist.s
claude is gud
Diffstat (limited to 'vere/ext/gmp/gen/aarch64-macos/mpn/hamdist.s')
-rw-r--r-- vere/ext/gmp/gen/aarch64-macos/mpn/hamdist.s 193
1 files changed, 193 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/aarch64-macos/mpn/hamdist.s b/vere/ext/gmp/gen/aarch64-macos/mpn/hamdist.s
new file mode 100644
index 0000000..d492362
--- /dev/null
+++ b/vere/ext/gmp/gen/aarch64-macos/mpn/hamdist.s
@@ -0,0 +1,193 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// ___gmpn_hamdist: Hamming distance of two limb vectors (AArch64 NEON).
+//
+// C view per the GMP manual:
+//   mp_bitcnt_t mpn_hamdist (const mp_limb_t *s1p, const mp_limb_t *s2p, mp_size_t n)
+// In:    x0 = s1p, x1 = s2p, x2 = n, counted in 64-bit limbs
+// Out:   x0 = number of bit positions in which the two operands differ
+// Clobb: x1, x2, x4-x11, v0-v7, v16-v19, flags; x30 is saved in x8 and restored.
+//
+// Differing bits are counted per byte (cnt) and summed into 16-bit lanes of
+// v4/v5, which are merged into eight 16-bit lanes before widening.  A lane can
+// gain 16 per limb routed to its half of a vector, and for odd n the low half
+// sees (n+1)/2 limbs, so a single pass is limited to 0x1ffe limbs (4095*16 =
+// 65520 fits in 16 bits; 0x1fff would reach 65536 and wrap to 0).
+ .text
+ .align 3
+ .globl ___gmpn_hamdist
+
+___gmpn_hamdist:
+// Anything above the single-pass limit is split into chunks at Lgt8k.
+ mov x11, #0x1ffe  // single-pass limit; x11 is also read by the Lgt8k loop
+ cmp x2, x11
+ b.hi Lgt8k
+// Single pass, n <= limit.  Lgt8k also calls this entry for the final part.
+Llt8k:
+ movi v4.16b, #0  // clear both accumulators
+ movi v5.16b, #0
+// Peel off n mod 8 limbs, driven by the low three bits of n.
+ tbz x2, #0, Lxx0
+ sub x2, x2, #1
+ ld1 {v0.1d}, [x0], #8  // one limb from each operand
+ ld1 {v16.1d}, [x1], #8
+ eor v0.16b, v0.16b, v16.16b  // set bits = differing bits
+ cnt v6.16b, v0.16b  // popcount of each byte
+ uadalp v4.8h, v6.16b  // add adjacent byte counts into the 16-bit lanes
+
+Lxx0: tbz x2, #1, Lx00
+ sub x2, x2, #2
+ ld1 {v0.2d}, [x0], #16  // two limbs from each operand
+ ld1 {v16.2d}, [x1], #16
+ eor v0.16b, v0.16b, v16.16b
+ cnt v6.16b, v0.16b
+ uadalp v4.8h, v6.16b
+
+Lx00: tbz x2, #2, L000
+ subs x2, x2, #4
+ ld1 {v0.2d,v1.2d}, [x0], #32  // four limbs from each operand
+ ld1 {v16.2d,v17.2d}, [x1], #32
+ b.ls Lsum  // nothing left after these four: just count them
+
+Lgt4: ld1 {v2.2d,v3.2d}, [x0], #32  // more to do: load ahead and join the loop at Lmid
+ ld1 {v18.2d,v19.2d}, [x1], #32
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ sub x2, x2, #4
+ cnt v6.16b, v0.16b
+ cnt v7.16b, v1.16b
+ b Lmid
+
+L000: subs x2, x2, #8
+ b.lo Le0  // fewer than 8 limbs left; v5 is still zero here, so only v4 is reduced
+// Entry for Lgt8k chunks: x2 = chunk limbs - 8, v4/v5 already cleared.
+Lchu: ld1 {v2.2d,v3.2d}, [x0], #32
+ ld1 {v0.2d,v1.2d}, [x0], #32
+ ld1 {v18.2d,v19.2d}, [x1], #32
+ ld1 {v16.2d,v17.2d}, [x1], #32
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ cnt v6.16b, v2.16b
+ cnt v7.16b, v3.16b
+ subs x2, x2, #8
+ b.lo Lend
+// Main loop, 8 limbs per pass: loads run ahead of eor/cnt, which run ahead of uadalp.
+Ltop: ld1 {v2.2d,v3.2d}, [x0], #32
+ ld1 {v18.2d,v19.2d}, [x1], #32
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v0.16b
+ uadalp v5.8h, v7.16b
+ cnt v7.16b, v1.16b
+Lmid: ld1 {v0.2d,v1.2d}, [x0], #32
+ ld1 {v16.2d,v17.2d}, [x1], #32
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ subs x2, x2, #8
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v2.16b
+ uadalp v5.8h, v7.16b
+ cnt v7.16b, v3.16b
+ b.hs Ltop
+// Drain: first the pending counts in v6/v7, then the last loaded group in v0/v1.
+Lend: uadalp v4.8h, v6.16b
+ uadalp v5.8h, v7.16b
+Lsum: eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ cnt v6.16b, v0.16b
+ cnt v7.16b, v1.16b
+ uadalp v4.8h, v6.16b
+ uadalp v5.8h, v7.16b
+ add v4.8h, v4.8h, v5.8h  // merge into eight 16-bit lanes (must not wrap)
+// Widen and reduce to a single 64-bit total.
+Le0: uaddlp v4.4s, v4.8h  // four 32-bit sums
+ uaddlp v4.2d, v4.4s  // two 64-bit sums
+ mov x0, v4.d[0]
+ mov x1, v4.d[1]
+ add x0, x0, x1
+ ret  // returns to the caller, or to a bl site in Lgt8k
+
+// n above the single-pass limit: run 0x1ff0-limb chunks through Lchu, then
+// the remainder through Llt8k, summing the per-call results in x4.
+// Uses only registers the code above leaves untouched (x4-x10).
+Lgt8k:
+ mov x8, x30  // keep the return address; bl below overwrites x30
+ mov x7, x2  // limbs still to process
+ mov x4, #0  // running total
+ mov x9, #0x1ff0*8  // chunk size in bytes
+ mov x10, #0x1ff0  // chunk size in limbs
+
+1: add x5, x0, x9  // start of the following chunk in each operand
+ add x6, x1, x9
+ mov x2, #0x1ff0-8  // Lchu is entered past the "subs x2, x2, #8" at L000
+ movi v4.16b, #0
+ movi v5.16b, #0
+ bl Lchu  // x0 = count for this chunk
+ add x4, x4, x0
+ mov x0, x5  // x0/x1 were overwritten by the callee
+ mov x1, x6
+ sub x7, x7, x10
+ cmp x7, x11  // x11 still holds the single-pass limit
+ b.hi 1b
+
+ mov x2, x7  // what remains fits in a single pass
+ bl Llt8k
+ add x0, x4, x0
+ mov x30, x8
+ ret
+