summaryrefslogtreecommitdiff
path: root/vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s
diff options
context:
space:
mode:
Diffstat (limited to 'vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s')
-rw-r--r--vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s193
1 files changed, 193 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s b/vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s
new file mode 100644
index 0000000..624b72d
--- /dev/null
+++ b/vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s
@@ -0,0 +1,193 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .text
+ .align 3
+ .globl __gmpn_hamdist
+ .type __gmpn_hamdist,@function
+__gmpn_hamdist:
+
+ mov x11, #0x1fff
+ cmp x2, x11
+ b.hi .Lgt8k
+
+.Llt8k:
+ movi v4.16b, #0
+ movi v5.16b, #0
+
+ tbz x2, #0, .Lxx0
+ sub x2, x2, #1
+ ld1 {v0.1d}, [x0], #8
+ ld1 {v16.1d}, [x1], #8
+ eor v0.16b, v0.16b, v16.16b
+ cnt v6.16b, v0.16b
+ uadalp v4.8h, v6.16b
+
+.Lxx0: tbz x2, #1, .Lx00
+ sub x2, x2, #2
+ ld1 {v0.2d}, [x0], #16
+ ld1 {v16.2d}, [x1], #16
+ eor v0.16b, v0.16b, v16.16b
+ cnt v6.16b, v0.16b
+ uadalp v4.8h, v6.16b
+
+.Lx00: tbz x2, #2, .L000
+ subs x2, x2, #4
+ ld1 {v0.2d,v1.2d}, [x0], #32
+ ld1 {v16.2d,v17.2d}, [x1], #32
+ b.ls .Lsum
+
+.Lgt4: ld1 {v2.2d,v3.2d}, [x0], #32
+ ld1 {v18.2d,v19.2d}, [x1], #32
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ sub x2, x2, #4
+ cnt v6.16b, v0.16b
+ cnt v7.16b, v1.16b
+ b .Lmid
+
+.L000: subs x2, x2, #8
+ b.lo .Le0
+
+.Lchu: ld1 {v2.2d,v3.2d}, [x0], #32
+ ld1 {v0.2d,v1.2d}, [x0], #32
+ ld1 {v18.2d,v19.2d}, [x1], #32
+ ld1 {v16.2d,v17.2d}, [x1], #32
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ cnt v6.16b, v2.16b
+ cnt v7.16b, v3.16b
+ subs x2, x2, #8
+ b.lo .Lend
+
+.Ltop: ld1 {v2.2d,v3.2d}, [x0], #32
+ ld1 {v18.2d,v19.2d}, [x1], #32
+ eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v0.16b
+ uadalp v5.8h, v7.16b
+ cnt v7.16b, v1.16b
+.Lmid: ld1 {v0.2d,v1.2d}, [x0], #32
+ ld1 {v16.2d,v17.2d}, [x1], #32
+ eor v2.16b, v2.16b, v18.16b
+ eor v3.16b, v3.16b, v19.16b
+ subs x2, x2, #8
+ uadalp v4.8h, v6.16b
+ cnt v6.16b, v2.16b
+ uadalp v5.8h, v7.16b
+ cnt v7.16b, v3.16b
+ b.hs .Ltop
+
+.Lend: uadalp v4.8h, v6.16b
+ uadalp v5.8h, v7.16b
+.Lsum: eor v0.16b, v0.16b, v16.16b
+ eor v1.16b, v1.16b, v17.16b
+ cnt v6.16b, v0.16b
+ cnt v7.16b, v1.16b
+ uadalp v4.8h, v6.16b
+ uadalp v5.8h, v7.16b
+ add v4.8h, v4.8h, v5.8h
+
+.Le0: uaddlp v4.4s, v4.8h
+ uaddlp v4.2d, v4.4s
+ mov x0, v4.d[0]
+ mov x1, v4.d[1]
+ add x0, x0, x1
+ ret
+
+
+
+
+.Lgt8k:
+ mov x8, x30
+ mov x7, x2
+ mov x4, #0
+ mov x9, #0x1ff0*8
+ mov x10, #0x1ff0
+
+1: add x5, x0, x9
+ add x6, x1, x9
+ mov x2, #0x1ff0-8
+ movi v4.16b, #0
+ movi v5.16b, #0
+ bl .Lchu
+ add x4, x4, x0
+ mov x0, x5
+ mov x1, x6
+ sub x7, x7, x10
+ cmp x7, x11
+ b.hi 1b
+
+ mov x2, x7
+ bl .Llt8k
+ add x0, x4, x0
+ mov x30, x8
+ ret
+ .size __gmpn_hamdist,.-__gmpn_hamdist