diff options
Diffstat (limited to 'vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s')
-rw-r--r-- | vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s | 193 |
1 files changed, 193 insertions, 0 deletions
// vere/ext/gmp/gen/aarch64-linux/mpn/hamdist.s
//
// __gmpn_hamdist -- count the bit positions in which two limb vectors differ.
//
// Syntax: GNU as, AArch64 (Advanced SIMD).
//
// In:    x0 = pointer to first operand  (64-bit limbs, read sequentially)
//        x1 = pointer to second operand (64-bit limbs, read sequentially)
//        x2 = number of limbs
// Out:   x0 = population count of (first XOR second) over all limbs
// Clobb: x1, x2, x4-x11, v0-v7, v16-v19, flags
//        (x30 is parked in x8 around the internal bl calls and restored)
//
// Method: XOR the operands 16 bytes at a time, take per-byte popcounts with
// CNT, and accumulate adjacent byte pairs into 16-bit lanes of v4 and v5 with
// UADALP.  v4 and v5 are finally added lane-wise (still 16-bit) and widened to
// two 64-bit halves which are summed in x0.
//
// Overflow bound: every 4 limbs add at most 16 to each lane of v4 and 16 to
// each lane of v5.  The optional 2-limb step adds at most 16 to every lane of
// v4, and the optional 1-limb step adds at most 16 to lanes 0-3 of v4 only.
// For the lane-wise 16-bit add of v4 and v5 to stay below 65536 the direct
// path may therefore handle at most 0x1ffe limbs: with 0x1fff limbs lanes 0-3
// could reach 16 + 16 + 2047*32 = 65536 and wrap to zero.  Larger counts are
// split into chunks of 0x1ff0 limbs (worst-case lane sum 2044*32 = 65408).

	.text
	.align	3
	.globl	__gmpn_hamdist
	.type	__gmpn_hamdist,@function
__gmpn_hamdist:
	mov	x11, #0x1ffe			// largest count the direct path can sum without lane overflow
	cmp	x2, x11
	b.hi	.Lgt8k

// Direct path.  Also entered with bl from .Lgt8k for the final remainder.
.Llt8k:
	movi	v4.16b, #0			// clear both banks of 16-bit accumulators
	movi	v5.16b, #0

	tbz	x2, #0, .Lxx0			// odd count: peel one limb
	sub	x2, x2, #1
	ld1	{v0.1d}, [x0], #8
	ld1	{v16.1d}, [x1], #8
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

.Lxx0:
	tbz	x2, #1, .Lx00			// count bit 1 set: peel two limbs
	sub	x2, x2, #2
	ld1	{v0.2d}, [x0], #16
	ld1	{v16.2d}, [x1], #16
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

.Lx00:
	tbz	x2, #2, .L000			// count bit 2 set: preload four limbs
	subs	x2, x2, #4
	ld1	{v0.2d,v1.2d}, [x0], #32
	ld1	{v16.2d,v17.2d}, [x1], #32
	b.ls	.Lsum				// nothing left beyond these four limbs

.Lgt4:
	ld1	{v2.2d,v3.2d}, [x0], #32
	ld1	{v18.2d,v19.2d}, [x1], #32
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	x2, x2, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	.Lmid				// join the loop at its second half

.L000:
	subs	x2, x2, #8			// remaining count is a multiple of 8 here
	b.lo	.Le0				// no full 8-limb group left

// Chunk entry.  .Lgt8k enters here with bl, x2 = chunk size - 8 and v4/v5
// already cleared, which matches the state after the subs at .L000.
.Lchu:
	ld1	{v2.2d,v3.2d}, [x0], #32
	ld1	{v0.2d,v1.2d}, [x0], #32
	ld1	{v18.2d,v19.2d}, [x1], #32
	ld1	{v16.2d,v17.2d}, [x1], #32
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	x2, x2, #8
	b.lo	.Lend

// Main loop: 8 limbs per iteration.  Loads run ahead of the accumulation;
// v6/v7 always hold byte counts that have not yet been added to v4/v5.
.Ltop:
	ld1	{v2.2d,v3.2d}, [x0], #32
	ld1	{v18.2d,v19.2d}, [x1], #32
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v1.16b
.Lmid:
	ld1	{v0.2d,v1.2d}, [x0], #32
	ld1	{v16.2d,v17.2d}, [x1], #32
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	x2, x2, #8
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v3.16b
	b.hs	.Ltop

.Lend:
	uadalp	v4.8h, v6.16b			// flush the pending byte counts
	uadalp	v5.8h, v7.16b
.Lsum:
	eor	v0.16b, v0.16b, v16.16b		// last four limbs, loaded but not yet counted
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
	add	v4.8h, v4.8h, v5.8h		// 16-bit lane add; the size limits above keep it below 65536

.Le0:
	uaddlp	v4.4s, v4.8h			// 8 x 16-bit -> 4 x 32-bit
	uaddlp	v4.2d, v4.4s			// 4 x 32-bit -> 2 x 64-bit
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

// Large operands: process 0x1ff0-limb chunks through .Lchu, then hand the
// remainder (at most the limit held in x11) to .Llt8k.
//   x4 = running total        x5, x6 = operand pointers for the next chunk
//   x7 = limbs still to do    x8     = caller's return address
//   x9 = chunk size in bytes  x10    = chunk size in limbs
.Lgt8k:
	mov	x8, x30				// bl below overwrites x30
	mov	x7, x2
	mov	x4, #0
	mov	x9, #0x1ff0*8
	mov	x10, #0x1ff0

1:	add	x5, x0, x9
	add	x6, x1, x9
	mov	x2, #0x1ff0-8			// .Lchu expects the count already reduced by 8
	movi	v4.16b, #0
	movi	v5.16b, #0
	bl	.Lchu
	add	x4, x4, x0
	mov	x0, x5
	mov	x1, x6
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b

	mov	x2, x7
	bl	.Llt8k
	add	x0, x4, x0
	mov	x30, x8
	ret
	.size	__gmpn_hamdist,.-__gmpn_hamdist