author     polwex <polwex@sortug.com>    2025-10-05 21:56:51 +0700
committer  polwex <polwex@sortug.com>    2025-10-05 21:56:51 +0700
commit     fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch)
tree       51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s
claude is gud
Diffstat (limited to 'vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s')
-rw-r--r-- | vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s | 831
1 file changed, 831 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s b/vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s
new file mode 100644
index 0000000..852ca8f
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s
@@ -0,0 +1,831 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+        .text
+        .align 16, 0x90
+        .globl __gmpn_sqr_basecase
+
+        .def __gmpn_sqr_basecase
+        .scl 2
+        .type 32
+        .endef
+__gmpn_sqr_basecase:
+
+        push %rdi
+        push %rsi
+        mov %rcx, %rdi
+        mov %rdx, %rsi
+        mov %r8, %rdx
+
+        mov %edx, %ecx
+        mov %edx, %r11d
+
+        add $-40, %rsp
+
+        and $3, %ecx
+        cmp $4, %edx
+        lea 4(%rcx), %r8
+
+        mov %rbx, 32(%rsp)
+        mov %rbp, 24(%rsp)
+        mov %r12, 16(%rsp)
+        mov %r13, 8(%rsp)
+        mov %r14, (%rsp)
+
+        cmovg %r8, %rcx
+
+        lea Ltab(%rip), %rax
+        movslq (%rax,%rcx,4), %r10
+        add %r10, %rax
+        jmp *%rax
+
+        .section .rdata,"dr"
+        .align 8, 0x90
+Ltab:   .long L4-Ltab
+        .long L1-Ltab
+        .long L2-Ltab
+        .long L3-Ltab
+        .long L0m4-Ltab
+        .long L1m4-Ltab
+        .long L2m4-Ltab
+        .long L3m4-Ltab
+        .text
+
+L1:     mov (%rsi), %rax
+        mul %rax
+        add $40, %rsp
+        mov %rax, (%rdi)
+        mov %rdx, 8(%rdi)
+        pop %rsi
+        pop %rdi
+        ret
+
+L2:     mov (%rsi), %rax
+        mov %rax, %r8
+        mul %rax
+        mov 8(%rsi), %r11
+        mov %rax, (%rdi)
+        mov %r11, %rax
+        mov %rdx, %r9
+        mul %rax
+        add $40, %rsp
+        mov %rax, %r10
+        mov %r11, %rax
+        mov %rdx, %r11
+        mul %r8
+        xor %r8, %r8
+        add %rax, %r9
+        adc %rdx, %r10
+        adc %r8, %r11
+        add %rax, %r9
+        mov %r9, 8(%rdi)
+        adc %rdx, %r10
+        mov %r10, 16(%rdi)
+        adc %r8, %r11
+        mov %r11, 24(%rdi)
+        pop %rsi
+        pop %rdi
+        ret
+
+L3:     mov (%rsi), %rax
+        mov %rax, %r10
+        mul %rax
+        mov 8(%rsi), %r11
+        mov %rax, (%rdi)
+        mov %r11, %rax
+        mov %rdx, 8(%rdi)
+        mul %rax
+        mov 16(%rsi), %rcx
+        mov %rax, 16(%rdi)
+        mov %rcx, %rax
+        mov %rdx, 24(%rdi)
+        mul %rax
+        mov %rax, 32(%rdi)
+        mov %rdx, 40(%rdi)
+
+        mov %r11, %rax
+        mul %r10
+        mov %rax, %r8
+        mov %rcx, %rax
+        mov %rdx, %r9
+        mul %r10
+        xor %r10, %r10
+        add %rax, %r9
+        mov %r11, %rax
+        mov %r10, %r11
+        adc %rdx, %r10
+
+        mul %rcx
+        add $40, %rsp
+        add %rax, %r10
+        adc %r11, %rdx
+        add %r8, %r8
+        adc %r9, %r9
+        adc %r10, %r10
+        adc %rdx, %rdx
+        adc %r11, %r11
+        add %r8, 8(%rdi)
+        adc %r9, 16(%rdi)
+        adc %r10, 24(%rdi)
+        adc %rdx, 32(%rdi)
+        adc %r11, 40(%rdi)
+        pop %rsi
+        pop %rdi
+        ret
+
+L4:     mov (%rsi), %rax
+        mov %rax, %r11
+        mul %rax
+        mov 8(%rsi), %rbx
+        mov %rax, (%rdi)
+        mov %rbx, %rax
+        mov %rdx, 8(%rdi)
+        mul %rax
+        mov %rax, 16(%rdi)
+        mov %rdx, 24(%rdi)
+        mov 16(%rsi), %rax
+        mul %rax
+        mov %rax, 32(%rdi)
+        mov %rdx, 40(%rdi)
+        mov 24(%rsi), %rax
+        mul %rax
+        mov %rax, 48(%rdi)
+        mov %rbx, %rax
+        mov %rdx, 56(%rdi)
+
+        mul %r11
+        add $32, %rsp
+        mov %rax, %r8
+        mov %rdx, %r9
+        mov 16(%rsi), %rax
+        mul %r11
+        xor %r10, %r10
+        add %rax, %r9
+        adc %rdx, %r10
+        mov 24(%rsi), %rax
+        mul %r11
+        xor %r11, %r11
+        add %rax, %r10
+        adc %rdx, %r11
+        mov 16(%rsi), %rax
+        mul %rbx
+        xor %rcx, %rcx
+        add %rax, %r10
+        adc %rdx, %r11
+        adc $0, %rcx
+        mov 24(%rsi), %rax
+        mul %rbx
+        pop %rbx
+        add %rax, %r11
+        adc %rdx, %rcx
+        mov 16(%rsi), %rdx
+        mov 24(%rsi), %rax
+        mul %rdx
+        add %rax, %rcx
+        adc $0, %rdx
+
+        add %r8, %r8
+        adc %r9, %r9
+        adc %r10, %r10
+        adc %r11, %r11
+        adc %rcx, %rcx
+        mov $0, %eax
+        adc %rdx, %rdx
+
+        adc %rax, %rax
+        add %r8, 8(%rdi)
+        adc %r9, 16(%rdi)
+        adc %r10, 24(%rdi)
+        adc %r11, 32(%rdi)
+        adc %rcx, 40(%rdi)
+        adc %rdx, 48(%rdi)
+        adc %rax, 56(%rdi)
+        pop %rsi
+        pop %rdi
+        ret
+
+
+L0m4:
+        lea -16(%rdi,%r11,8), %r12
+        mov (%rsi), %r13
+        mov 8(%rsi), %rax
+        lea (%rsi,%r11,8), %rsi
+
+        lea -4(%r11), %r8
+
+        xor %r9d, %r9d
+        sub %r11, %r9
+
+        mul %r13
+        xor %ebp, %ebp
+        mov %rax, %rbx
+        mov 16(%rsi,%r9,8), %rax
+        mov %rdx, %r10
+        jmp LL3
+
+        .align 16, 0x90
+Lmul_1_m3_top:
+        add %rax, %rbp
+        mov %r10, (%r12,%r9,8)
+        mov (%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+        xor %ebx, %ebx
+        mul %r13
+        xor %r10d, %r10d
+        mov %rbp, 8(%r12,%r9,8)
+        add %rax, %rcx
+        adc %rdx, %rbx
+        mov 8(%rsi,%r9,8), %rax
+        mov %rcx, 16(%r12,%r9,8)
+        xor %ebp, %ebp
+        mul %r13
+        add %rax, %rbx
+        mov 16(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+LL3:    xor %ecx, %ecx
+        mul %r13
+        add %rax, %r10
+        mov 24(%rsi,%r9,8), %rax
+        adc %rdx, %rbp
+        mov %rbx, 24(%r12,%r9,8)
+        mul %r13
+        add $4, %r9
+        js Lmul_1_m3_top
+
+        add %rax, %rbp
+        mov %r10, (%r12)
+        adc %rdx, %rcx
+        mov %rbp, 8(%r12)
+        mov %rcx, 16(%r12)
+
+        lea 16(%r12), %r12
+        lea -8(%rsi), %rsi
+        jmp Ldowhile
+
+
+L1m4:
+        lea 8(%rdi,%r11,8), %r12
+        mov (%rsi), %r13
+        mov 8(%rsi), %rax
+        lea 8(%rsi,%r11,8), %rsi
+
+        lea -3(%r11), %r8
+
+        lea -3(%r11), %r9
+        neg %r9
+
+        mov %rax, %r14
+        mul %r13
+        mov %rdx, %rcx
+        xor %ebp, %ebp
+        mov %rax, 8(%rdi)
+        jmp Lm0
+
+        .align 16, 0x90
+Lmul_2_m0_top:
+        mul %r14
+        add %rax, %rbx
+        adc %rdx, %rcx
+        mov -24(%rsi,%r9,8), %rax
+        mov $0, %ebp
+        mul %r13
+        add %rax, %rbx
+        mov -24(%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+        adc $0, %ebp
+        mul %r14
+        add %rax, %rcx
+        mov %rbx, -24(%r12,%r9,8)
+        adc %rdx, %rbp
+Lm0:    mov -16(%rsi,%r9,8), %rax
+        mul %r13
+        mov $0, %r10d
+        add %rax, %rcx
+        adc %rdx, %rbp
+        mov -16(%rsi,%r9,8), %rax
+        adc $0, %r10d
+        mov $0, %ebx
+        mov %rcx, -16(%r12,%r9,8)
+        mul %r14
+        add %rax, %rbp
+        mov -8(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+        mov $0, %ecx
+        mul %r13
+        add %rax, %rbp
+        mov -8(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+        adc $0, %ebx
+        mul %r14
+        add %rax, %r10
+        mov %rbp, -8(%r12,%r9,8)
+        adc %rdx, %rbx
+Lm2x:   mov (%rsi,%r9,8), %rax
+        mul %r13
+        add %rax, %r10
+        adc %rdx, %rbx
+        adc $0, %ecx
+        add $4, %r9
+        mov -32(%rsi,%r9,8), %rax
+        mov %r10, -32(%r12,%r9,8)
+        js Lmul_2_m0_top
+
+        mul %r14
+        add %rax, %rbx
+        adc %rdx, %rcx
+        mov %rbx, -8(%r12)
+        mov %rcx, (%r12)
+
+        lea -16(%rsi), %rsi
+        lea 0(%r12), %r12
+        jmp Ldowhile_end
+
+
+L2m4:
+        lea -16(%rdi,%r11,8), %r12
+        mov (%rsi), %r13
+        mov 8(%rsi), %rax
+        lea (%rsi,%r11,8), %rsi
+
+        lea -4(%r11), %r8
+
+        lea -2(%r11), %r9
+        neg %r9
+
+        mul %r13
+        mov %rax, %rbp
+        mov (%rsi,%r9,8), %rax
+        mov %rdx, %rcx
+        jmp LL1
+
+        .align 16, 0x90
+Lmul_1_m1_top:
+        add %rax, %rbp
+        mov %r10, (%r12,%r9,8)
+        mov (%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+LL1:    xor %ebx, %ebx
+        mul %r13
+        xor %r10d, %r10d
+        mov %rbp, 8(%r12,%r9,8)
+        add %rax, %rcx
+        adc %rdx, %rbx
+        mov 8(%rsi,%r9,8), %rax
+        mov %rcx, 16(%r12,%r9,8)
+        xor %ebp, %ebp
+        mul %r13
+        add %rax, %rbx
+        mov 16(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+        xor %ecx, %ecx
+        mul %r13
+        add %rax, %r10
+        mov 24(%rsi,%r9,8), %rax
+        adc %rdx, %rbp
+        mov %rbx, 24(%r12,%r9,8)
+        mul %r13
+        add $4, %r9
+        js Lmul_1_m1_top
+
+        add %rax, %rbp
+        mov %r10, (%r12)
+        adc %rdx, %rcx
+        mov %rbp, 8(%r12)
+        mov %rcx, 16(%r12)
+
+        lea 16(%r12), %r12
+        lea -8(%rsi), %rsi
+        jmp Ldowhile_mid
+
+
+L3m4:
+        lea 8(%rdi,%r11,8), %r12
+        mov (%rsi), %r13
+        mov 8(%rsi), %rax
+        lea 8(%rsi,%r11,8), %rsi
+
+        lea -5(%r11), %r8
+
+        lea -1(%r11), %r9
+        neg %r9
+
+        mov %rax, %r14
+        mul %r13
+        mov %rdx, %r10
+        xor %ebx, %ebx
+        xor %ecx, %ecx
+        mov %rax, 8(%rdi)
+        jmp Lm2
+
+        .align 16, 0x90
+Lmul_2_m2_top:
+        mul %r14
+        add %rax, %rbx
+        adc %rdx, %rcx
+        mov -24(%rsi,%r9,8), %rax
+        mov $0, %ebp
+        mul %r13
+        add %rax, %rbx
+        mov -24(%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+        adc $0, %ebp
+        mul %r14
+        add %rax, %rcx
+        mov %rbx, -24(%r12,%r9,8)
+        adc %rdx, %rbp
+        mov -16(%rsi,%r9,8), %rax
+        mul %r13
+        mov $0, %r10d
+        add %rax, %rcx
+        adc %rdx, %rbp
+        mov -16(%rsi,%r9,8), %rax
+        adc $0, %r10d
+        mov $0, %ebx
+        mov %rcx, -16(%r12,%r9,8)
+        mul %r14
+        add %rax, %rbp
+        mov -8(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+        mov $0, %ecx
+        mul %r13
+        add %rax, %rbp
+        mov -8(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+        adc $0, %ebx
+        mul %r14
+        add %rax, %r10
+        mov %rbp, -8(%r12,%r9,8)
+        adc %rdx, %rbx
+Lm2:    mov (%rsi,%r9,8), %rax
+        mul %r13
+        add %rax, %r10
+        adc %rdx, %rbx
+        adc $0, %ecx
+        add $4, %r9
+        mov -32(%rsi,%r9,8), %rax
+        mov %r10, -32(%r12,%r9,8)
+        js Lmul_2_m2_top
+
+        mul %r14
+        add %rax, %rbx
+        adc %rdx, %rcx
+        mov %rbx, -8(%r12)
+        mov %rcx, (%r12)
+
+        lea -16(%rsi), %rsi
+        jmp Ldowhile_mid
+
+Ldowhile:
+
+        lea 4(%r8), %r9
+        neg %r9
+
+        mov 16(%rsi,%r9,8), %r13
+        mov 24(%rsi,%r9,8), %r14
+        mov 24(%rsi,%r9,8), %rax
+        mul %r13
+        xor %r10d, %r10d
+        add %rax, 24(%r12,%r9,8)
+        adc %rdx, %r10
+        xor %ebx, %ebx
+        xor %ecx, %ecx
+        jmp Lam2
+
+        .align 16, 0x90
+Laddmul_2_m2_top:
+        add %r10, (%r12,%r9,8)
+        adc %rax, %rbx
+        mov 8(%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+        mov $0, %ebp
+        mul %r13
+        add %rax, %rbx
+        mov 8(%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+        adc $0, %ebp
+        mul %r14
+        add %rbx, 8(%r12,%r9,8)
+        adc %rax, %rcx
+        adc %rdx, %rbp
+        mov 16(%rsi,%r9,8), %rax
+        mov $0, %r10d
+        mul %r13
+        add %rax, %rcx
+        mov 16(%rsi,%r9,8), %rax
+        adc %rdx, %rbp
+        adc $0, %r10d
+        mul %r14
+        add %rcx, 16(%r12,%r9,8)
+        adc %rax, %rbp
+        mov 24(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+        mul %r13
+        mov $0, %ebx
+        add %rax, %rbp
+        adc %rdx, %r10
+        mov $0, %ecx
+        mov 24(%rsi,%r9,8), %rax
+        adc $0, %ebx
+        mul %r14
+        add %rbp, 24(%r12,%r9,8)
+        adc %rax, %r10
+        adc %rdx, %rbx
+Lam2:   mov 32(%rsi,%r9,8), %rax
+        mul %r13
+        add %rax, %r10
+        mov 32(%rsi,%r9,8), %rax
+        adc %rdx, %rbx
+        adc $0, %ecx
+        mul %r14
+        add $4, %r9
+        js Laddmul_2_m2_top
+
+        add %r10, (%r12)
+        adc %rax, %rbx
+        adc %rdx, %rcx
+        mov %rbx, 8(%r12)
+        mov %rcx, 16(%r12)
+
+        lea 16(%r12), %r12
+
+        add $-2, %r8d
+
+Ldowhile_mid:
+
+        lea 2(%r8), %r9
+        neg %r9
+
+        mov (%rsi,%r9,8), %r13
+        mov 8(%rsi,%r9,8), %r14
+        mov 8(%rsi,%r9,8), %rax
+        mul %r13
+        xor %ecx, %ecx
+        add %rax, 8(%r12,%r9,8)
+        adc %rdx, %rcx
+        xor %ebp, %ebp
+        jmp L20
+
+        .align 16, 0x90
+Laddmul_2_m0_top:
+        add %r10, (%r12,%r9,8)
+        adc %rax, %rbx
+        mov 8(%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+        mov $0, %ebp
+        mul %r13
+        add %rax, %rbx
+        mov 8(%rsi,%r9,8), %rax
+        adc %rdx, %rcx
+        adc $0, %ebp
+        mul %r14
+        add %rbx, 8(%r12,%r9,8)
+        adc %rax, %rcx
+        adc %rdx, %rbp
+L20:    mov 16(%rsi,%r9,8), %rax
+        mov $0, %r10d
+        mul %r13
+        add %rax, %rcx
+        mov 16(%rsi,%r9,8), %rax
+        adc %rdx, %rbp
+        adc $0, %r10d
+        mul %r14
+        add %rcx, 16(%r12,%r9,8)
+        adc %rax, %rbp
+        mov 24(%rsi,%r9,8), %rax
+        adc %rdx, %r10
+        mul %r13
+        mov $0, %ebx
+        add %rax, %rbp
+        adc %rdx, %r10
+        mov $0, %ecx
+        mov 24(%rsi,%r9,8), %rax
+        adc $0, %ebx
+        mul %r14
+        add %rbp, 24(%r12,%r9,8)
+        adc %rax, %r10
+        adc %rdx, %rbx
+        mov 32(%rsi,%r9,8), %rax
+        mul %r13
+        add %rax, %r10
+        mov 32(%rsi,%r9,8), %rax
+        adc %rdx, %rbx
+        adc $0, %ecx
+        mul %r14
+        add $4, %r9
+        js Laddmul_2_m0_top
+
+        add %r10, (%r12)
+        adc %rax, %rbx
+        adc %rdx, %rcx
+        mov %rbx, 8(%r12)
+        mov %rcx, 16(%r12)
+
+        lea 16(%r12), %r12
+Ldowhile_end:
+
+        add $-2, %r8d
+        jne Ldowhile
+
+
+        mov -16(%rsi), %r13
+        mov -8(%rsi), %r14
+        mov -8(%rsi), %rax
+        mul %r13
+        xor %r10d, %r10d
+        add %rax, -8(%r12)
+        adc %rdx, %r10
+        xor %ebx, %ebx
+        xor %ecx, %ecx
+        mov (%rsi), %rax
+        mul %r13
+        add %rax, %r10
+        mov (%rsi), %rax
+        adc %rdx, %rbx
+        mul %r14
+        add %r10, (%r12)
+        adc %rax, %rbx
+        adc %rdx, %rcx
+        mov %rbx, 8(%r12)
+        mov %rcx, 16(%r12)
+
+
+        lea -4(%r11,%r11), %r9
+
+        mov 8(%rdi), %r11
+        lea -8(%rsi), %rsi
+        lea (%rdi,%r9,8), %rdi
+        neg %r9
+        mov (%rsi,%r9,4), %rax
+        mul %rax
+        test $2, %r9b
+        jnz Lodd
+
+Levn:   add %r11, %r11
+        sbb %ebx, %ebx
+        add %rdx, %r11
+        mov %rax, (%rdi,%r9,8)
+        jmp Ld0
+
+Lodd:   add %r11, %r11
+        sbb %ebp, %ebp
+        add %rdx, %r11
+        mov %rax, (%rdi,%r9,8)
+        lea -2(%r9), %r9
+        jmp Ld1
+
+        .align 16, 0x90
+Ltop:   mov (%rsi,%r9,4), %rax
+        mul %rax
+        add %ebp, %ebp
+        adc %rax, %r10
+        adc %rdx, %r11
+        mov %r10, (%rdi,%r9,8)
+Ld0:    mov %r11, 8(%rdi,%r9,8)
+        mov 16(%rdi,%r9,8), %r10
+        adc %r10, %r10
+        mov 24(%rdi,%r9,8), %r11
+        adc %r11, %r11
+        nop
+        sbb %ebp, %ebp
+        mov 8(%rsi,%r9,4), %rax
+        mul %rax
+        add %ebx, %ebx
+        adc %rax, %r10
+        adc %rdx, %r11
+        mov %r10, 16(%rdi,%r9,8)
+Ld1:    mov %r11, 24(%rdi,%r9,8)
+        mov 32(%rdi,%r9,8), %r10
+        adc %r10, %r10
+        mov 40(%rdi,%r9,8), %r11
+        adc %r11, %r11
+        sbb %ebx, %ebx
+        add $4, %r9
+        js Ltop
+
+        mov (%rsi), %rax
+        mul %rax
+        add %ebp, %ebp
+        adc %rax, %r10
+        adc %rdx, %r11
+        mov %r10, (%rdi)
+        mov %r11, 8(%rdi)
+        mov 16(%rdi), %r10
+        adc %r10, %r10
+        sbb %ebp, %ebp
+        neg %ebp
+        mov 8(%rsi), %rax
+        mul %rax
+        add %ebx, %ebx
+        adc %rax, %r10
+        adc %rbp, %rdx
+        mov %r10, 16(%rdi)
+        mov %rdx, 24(%rdi)
+
+        pop %r14
+        pop %r13
+        pop %r12
+        pop %rbp
+        pop %rbx
+        pop %rsi
+        pop %rdi
+        ret
+
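For orientation only (not part of the patch): `__gmpn_sqr_basecase` is GMP's small-operand squaring primitive; it writes the 2n-limb square of an n-limb operand. The generated code above follows the usual basecase shape: the `Lmul_1_*`/`Lmul_2_*` and `Laddmul_2_*` loops accumulate the cross products u[i]*u[j] for i < j, and the final `Ltop` pass doubles that triangle with add-with-carry while folding in the diagonal squares u[i]^2. The C sketch below is an illustrative model of that structure under those assumptions, not GMP's code; the function name `sqr_basecase_model` is made up here, and it assumes 64-bit limbs and a compiler providing `unsigned __int128` (GCC/Clang).

```c
/* Rough model of {rp, 2n} = {up, n}^2, mirroring the basecase shape of the
   assembly above: cross products once, then double them and add the diagonal. */
#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb;

/* rp must hold 2*n limbs, must not overlap up, and n must be >= 1. */
static void sqr_basecase_model(limb *rp, const limb *up, size_t n)
{
    for (size_t k = 0; k < 2 * n; k++)
        rp[k] = 0;

    /* Triangle of cross products (the asm's mul_1/mul_2 and addmul_2 passes). */
    for (size_t i = 0; i + 1 < n; i++) {
        limb cy = 0;
        for (size_t j = i + 1; j < n; j++) {
            unsigned __int128 t = (unsigned __int128)up[i] * up[j] + rp[i + j] + cy;
            rp[i + j] = (limb)t;
            cy        = (limb)(t >> 64);
        }
        rp[i + n] = cy;
    }

    /* Double the cross products (the asm folds this into its final shift pass). */
    limb cy = 0;
    for (size_t k = 0; k < 2 * n; k++) {
        limb top = rp[k] >> 63;
        rp[k]    = (rp[k] << 1) | cy;
        cy       = top;
    }

    /* Add the diagonal squares up[i]^2 at position 2*i, propagating carries. */
    cy = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 sq = (unsigned __int128)up[i] * up[i];
        unsigned __int128 lo = (unsigned __int128)rp[2 * i] + (limb)sq + cy;
        unsigned __int128 hi = (unsigned __int128)rp[2 * i + 1] + (limb)(sq >> 64)
                             + (limb)(lo >> 64);
        rp[2 * i]     = (limb)lo;
        rp[2 * i + 1] = (limb)hi;
        cy            = (limb)(hi >> 64);
    }
}
```

The assembly additionally dispatches n = 1..4 to fully unrolled cases through the `Ltab` jump table and splits the general case by n mod 4 (`L0m4`..`L3m4`); the model above ignores those size cutoffs and exists only to make the loop structure easier to follow.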