diff options
Diffstat (limited to 'vere/ext/gmp/gen/aarch64-linux/mpn/popcount.s')
-rw-r--r-- | vere/ext/gmp/gen/aarch64-linux/mpn/popcount.s | 169 |
1 files changed, 169 insertions, 0 deletions
// ---------------------------------------------------------------------------
// __gmpn_popcount -- count the set bits in an array of 64-bit limbs.
// File (per the diff header): vere/ext/gmp/gen/aarch64-linux/mpn/popcount.s
//
// Syntax: GNU as, AArch64.  Register usage follows AAPCS64 (only volatile
// registers are written).
//
// In:    x0 = pointer to the first limb (8 bytes are consumed per limb)
//        x1 = number of limbs
// Out:   x0 = total number of set bits
// Clobb: x1, x4, x5, x7-x11, v0-v7, flags.
//        x30 is overwritten by `bl` on the large-count path only; it is
//        saved in x8 and restored before returning.
//
// Method: `cnt` produces per-byte bit counts; `uadalp` folds byte pairs into
// 8 x 16-bit accumulator lanes in v4 and v5.  Each uadalp adds at most 16 to
// a lane.  Inputs of more than 0x1fff limbs are split into chunks of 0x1ff0
// limbs so that the 16-bit lanes of v4 and v5 cannot overflow individually.
//
// The two accumulators are merged while widening to 32 bits.  A 16-bit
// `add v4.8h, v4.8h, v5.8h` is not safe here: for 0x1fff all-ones limbs,
// lanes 0-3 reach 32784 in v4 and 32752 in v5, and 32784 + 32752 = 65536
// wraps to zero.
// ---------------------------------------------------------------------------

        .text
        .align  3
        .globl  __gmpn_popcount
        .type   __gmpn_popcount,@function
__gmpn_popcount:
        mov     x11, #0x1fff                    // largest count done in one pass
        cmp     x1, x11
        b.hi    .Lgt8k                          // unsigned: count > 0x1fff

// Single-pass code.  Also called (via bl) from the large-count path.
.Llt8k:
        movi    v4.16b, #0                      // clear accumulator A
        movi    v5.16b, #0                      // clear accumulator B

        // Peel 1, 2 and 4 limbs according to the low three bits of the
        // count, so that the remainder is a multiple of 8 limbs.
        tbz     x1, #0, .Lxx0
        sub     x1, x1, #1
        ld1     {v0.1d}, [x0], #8               // 1 limb; upper 64 bits zeroed
        cnt     v6.16b, v0.16b
        uadalp  v4.8h, v6.16b

.Lxx0:  tbz     x1, #1, .Lx00
        sub     x1, x1, #2
        ld1     {v0.2d}, [x0], #16              // 2 limbs
        cnt     v6.16b, v0.16b
        uadalp  v4.8h, v6.16b

.Lx00:  tbz     x1, #2, .L000
        subs    x1, x1, #4
        ld1     {v0.2d,v1.2d}, [x0], #32        // 4 limbs (ld1 leaves flags intact)
        b.ls    .Lsum                           // nothing left after these 4 limbs

.Lgt4:  ld1     {v2.2d,v3.2d}, [x0], #32        // 4 more limbs
        sub     x1, x1, #4                      // pre-adjust for the subs at .Lmid
        cnt     v6.16b, v0.16b
        cnt     v7.16b, v1.16b
        b       .Lmid                           // enter the loop half-way

.L000:  subs    x1, x1, #8
        b.lo    .Le0                            // fewer than 8 limbs left: v5 is still 0

// Entry point for the large-count path: x1 = limbs to process minus 8,
// v4/v5 already cleared by the caller.
.Lchu:  ld1     {v2.2d,v3.2d}, [x0], #32        // 4 limbs
        ld1     {v0.2d,v1.2d}, [x0], #32        // 4 limbs
        cnt     v6.16b, v2.16b
        cnt     v7.16b, v3.16b
        subs    x1, x1, #8
        b.lo    .Lend

// Main loop, 8 limbs per iteration.  It is software-pipelined: the byte
// counts in v6/v7 are accumulated one step after they are produced, while
// the next loads are in flight.
.Ltop:  ld1     {v2.2d,v3.2d}, [x0], #32
        uadalp  v4.8h, v6.16b
        cnt     v6.16b, v0.16b
        uadalp  v5.8h, v7.16b
        cnt     v7.16b, v1.16b
.Lmid:  ld1     {v0.2d,v1.2d}, [x0], #32
        subs    x1, x1, #8
        uadalp  v4.8h, v6.16b
        cnt     v6.16b, v2.16b
        uadalp  v5.8h, v7.16b
        cnt     v7.16b, v3.16b
        b.hs    .Ltop                           // loop while the subs did not borrow

// Drain the pipeline: v6/v7 hold pending counts, v0/v1 hold uncounted data.
.Lend:  uadalp  v4.8h, v6.16b
        uadalp  v5.8h, v7.16b
.Lsum:  cnt     v6.16b, v0.16b
        cnt     v7.16b, v1.16b
        uadalp  v4.8h, v6.16b
        uadalp  v5.8h, v7.16b

// Horizontal reduction.  v4 is widened first and v5 is accumulated into the
// 32-bit lanes, so the merge of the two accumulators cannot wrap.
.Le0:   uaddlp  v4.4s, v4.8h                    // 4 x 32-bit counts from v4
        uadalp  v4.4s, v5.8h                    // add v5, widening (v5 == 0 when entered from .L000)
        uaddlp  v4.2d, v4.4s                    // 2 x 64-bit counts
        mov     x0, v4.d[0]
        mov     x1, v4.d[1]
        add     x0, x0, x1
        ret

// Large-count path (count > 0x1fff): process chunks of 0x1ff0 limbs by
// calling into the code above, then finish with one call for the remainder.
//   x4  = running total         x5  = pointer to the next chunk
//   x7  = limbs still to do     x8  = saved return address
//   x9  = chunk size in bytes   x10 = chunk size in limbs
.Lgt8k:
        mov     x8, x30                         // bl below overwrites x30
        mov     x7, x1
        mov     x4, #0
        mov     x9, #0x1ff0*8
        mov     x10, #0x1ff0

1:      add     x5, x0, x9                      // start of the following chunk
        mov     x1, #0x1ff0-8                   // .Lchu expects the count minus 8
        movi    v4.16b, #0
        movi    v5.16b, #0
        bl      .Lchu
        add     x4, x4, x0                      // accumulate chunk result
        mov     x0, x5
        sub     x7, x7, x10
        cmp     x7, x11
        b.hi    1b                              // remainder still exceeds 0x1fff

        mov     x1, x7                          // final remainder, <= 0x1fff limbs
        bl      .Llt8k
        add     x0, x4, x0
        mov     x30, x8
        ret
        .size   __gmpn_popcount,.-__gmpn_popcount