| author | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700 |
|---|---|---|
| committer | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700 |
| commit | fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch) | |
| tree | 51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s | |
claude is gud
Diffstat (limited to 'vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s')
| -rw-r--r-- | vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s | 230 |
|---|---|---|
1 file changed, 230 insertions, 0 deletions
```diff
diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s b/vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s
new file mode 100644
index 0000000..7528e27
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s
@@ -0,0 +1,230 @@
+
+	.text
+	.align	6, 0x90
+	.globl	___gmpn_rshift
+
+___gmpn_rshift:
+
+	movd	%ecx, %xmm4
+	mov	$64, %eax
+	sub	%ecx, %eax
+	movd	%eax, %xmm5
+
+	neg	%ecx
+	mov	(%rsi), %rax
+	shl	%cl, %rax
+
+	cmp	$3, %rdx
+	jle	Lbc
+
+	test	$8, %dil
+	jz	Lrp_aligned
+
+	movq	(%rsi), %xmm0
+	movq	8(%rsi), %xmm1
+	psrlq	%xmm4, %xmm0
+	psllq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movq	%xmm0, (%rdi)
+	lea	8(%rsi), %rsi
+	lea	8(%rdi), %rdi
+	dec	%rdx
+
+Lrp_aligned:
+	lea	1(%rdx), %r8d
+	lea	(%rsi,%rdx,8), %rsi
+	lea	(%rdi,%rdx,8), %rdi
+	neg	%rdx
+
+	and	$6, %r8d
+	jz	Lbu0
+	cmp	$4, %r8d
+	jz	Lbu4
+	jc	Lbu2
+Lbu6:	add	$4, %rdx
+	jmp	Li56
+Lbu0:	add	$6, %rdx
+	jmp	Li70
+Lbu4:	add	$2, %rdx
+	jmp	Li34
+Lbu2:	add	$8, %rdx
+	jge	Lend
+
+	.align	4, 0x90
+Ltop:	movdqu	-64(%rsi,%rdx,8), %xmm1
+	movdqu	-56(%rsi,%rdx,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -64(%rdi,%rdx,8)
+Li70:
+	movdqu	-48(%rsi,%rdx,8), %xmm1
+	movdqu	-40(%rsi,%rdx,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -48(%rdi,%rdx,8)
+Li56:
+	movdqu	-32(%rsi,%rdx,8), %xmm1
+	movdqu	-24(%rsi,%rdx,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -32(%rdi,%rdx,8)
+Li34:
+	movdqu	-16(%rsi,%rdx,8), %xmm1
+	movdqu	-8(%rsi,%rdx,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -16(%rdi,%rdx,8)
+	add	$8, %rdx
+	jl	Ltop
+
+Lend:	test	$1, %dl
+	jnz	Le1
+
+	movdqu	-16(%rsi), %xmm1
+	movq	-8(%rsi), %xmm0
+	psrlq	%xmm4, %xmm1
+	psllq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -16(%rdi)
+
+	ret
+
+Le1:	movq	-8(%rsi), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, -8(%rdi)
+
+	ret
+
+	.align	4, 0x90
+Lbc:	dec	%edx
+	jnz	1f
+	movq	(%rsi), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, (%rdi)
+
+	ret
+
+1:	movq	(%rsi), %xmm1
+	movq	8(%rsi), %xmm0
+	psrlq	%xmm4, %xmm1
+	psllq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movq	%xmm0, (%rdi)
+	dec	%edx
+	jnz	1f
+	movq	8(%rsi), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, 8(%rdi)
+
+	ret
+
+1:	movq	8(%rsi), %xmm1
+	movq	16(%rsi), %xmm0
+	psrlq	%xmm4, %xmm1
+	psllq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movq	%xmm0, 8(%rdi)
+	movq	16(%rsi), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, 16(%rdi)
+
+	ret
```
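For reviewers of this vendored, machine-generated file: `___gmpn_rshift` is GMP's `mpn_rshift` for x86_64 Darwin, using SSE2. Under the System V AMD64 convention, `%rdi` holds the result pointer, `%rsi` the source pointer, `%rdx` the limb count n, and `%ecx` the shift count (1–63). The prologue computes the return value by shifting the low limb left by `64 - cnt` (`neg %ecx` then `shl %cl, %rax`, since x86 shifts mask the count mod 64); small operands (n ≤ 3) take the scalar `Lbc` tail; the `test $8, %dil` fixup handles one limb up front so later stores can use aligned `movdqa`; and the unrolled loop merges `psrlq` of each limb pair with `psllq` of the next-higher pair via `por`, four 16-byte stores per iteration. Below is a minimal C sketch of the contract this file implements, checking GMP's public `mpn_rshift` entry point against a portable limb loop; `ref_rshift` is a hypothetical helper written for this note, and the build line is an assumption:

```c
/* Sketch only: assumes a system GMP; build e.g. `cc rshift_demo.c -lgmp`. */
#include <stdio.h>
#include <gmp.h>

/* Portable rendering of the limb recurrence the assembly implements
 * (hypothetical helper, not part of GMP's API). Requires 1 <= cnt <= 63:
 *   rp[i]  = (sp[i] >> cnt) | (sp[i+1] << (GMP_NUMB_BITS - cnt))
 *   return = sp[0] << (GMP_NUMB_BITS - cnt)   -- the bits shifted out   */
static mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *sp,
                            mp_size_t n, unsigned cnt)
{
    mp_limb_t out = sp[0] << (GMP_NUMB_BITS - cnt);
    for (mp_size_t i = 0; i < n - 1; i++)
        rp[i] = (sp[i] >> cnt) | (sp[i + 1] << (GMP_NUMB_BITS - cnt));
    rp[n - 1] = sp[n - 1] >> cnt;
    return out;
}

int main(void)
{
    /* {sp, 2} is a 128-bit value, least-significant limb first. */
    mp_limb_t sp[2] = { 0xfedcba9876543210UL, 0x0123456789abcdefUL };
    mp_limb_t rp[2], rr[2];

    mp_limb_t out  = mpn_rshift(rp, sp, 2, 8); /* assembly-backed entry */
    mp_limb_t out2 = ref_rshift(rr, sp, 2, 8); /* portable reference    */

    /* Both lines should agree: out = 0x1000000000000000,
     * rp = { 0xeffedcba98765432, 0x000123456789abcd }. */
    printf("gmp: out=%#018lx rp={%#018lx, %#018lx}\n",
           (unsigned long) out, (unsigned long) rp[0], (unsigned long) rp[1]);
    printf("ref: out=%#018lx rp={%#018lx, %#018lx}\n",
           (unsigned long) out2, (unsigned long) rr[0], (unsigned long) rr[1]);
    return 0;
}
```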