author    polwex <polwex@sortug.com>  2025-10-05 21:56:51 +0700
committer polwex <polwex@sortug.com>  2025-10-05 21:56:51 +0700
commit    fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch)
tree      51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s
claude is gud
Diffstat (limited to 'vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s')
-rw-r--r--  vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s  831
1 file changed, 831 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s b/vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s
new file mode 100644
index 0000000..852ca8f
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-windows/mpn/sqr_basecase.s
@@ -0,0 +1,831 @@
+
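+ # __gmpn_sqr_basecase: {rp, 2n} = {up, n}^2.
+ # Generated GMP mpn assembly for x86_64 Windows (Win64 ABI: rp in %rcx,
+ # up in %rdx, n in %r8).  The off-diagonal cross products are accumulated
+ # first (a mul_1/mul_2 pass followed by addmul_2 passes), then the result
+ # is finished as twice that sum plus the diagonal squares up[i]^2.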
+ .text
+ .align 16, 0x90
+ .globl __gmpn_sqr_basecase
+
+ .def __gmpn_sqr_basecase
+ .scl 2
+ .type 32
+ .endef
+__gmpn_sqr_basecase:
+
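+ # Win64 -> SysV argument shuffle: rp -> %rdi, up -> %rsi, n -> %rdx.
+ # %rdi and %rsi are callee-saved in the Win64 ABI, so they are pushed first.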
+ push %rdi
+ push %rsi
+ mov %rcx, %rdi
+ mov %rdx, %rsi
+ mov %r8, %rdx
+
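+ # Keep n in %r11 for the large-n paths, compute the jump-table index
+ # (n mod 4 for n <= 4, else 4 + n mod 4), reserve 40 bytes of stack and
+ # spill the callee-saved registers used by the n > 4 code.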
+ mov %edx, %ecx
+ mov %edx, %r11d
+
+ add $-40, %rsp
+
+ and $3, %ecx
+ cmp $4, %edx
+ lea 4(%rcx), %r8
+
+ mov %rbx, 32(%rsp)
+ mov %rbp, 24(%rsp)
+ mov %r12, 16(%rsp)
+ mov %r13, 8(%rsp)
+ mov %r14, (%rsp)
+
+ cmovg %r8, %rcx
+
+ lea Ltab(%rip), %rax
+ movslq (%rax,%rcx,4), %r10
+ add %r10, %rax
+ jmp *%rax
+
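+ # Ltab holds 32-bit offsets relative to itself: entries 0-3 dispatch the
+ # straight-line code for n = 4, 1, 2, 3; entries 4-7 dispatch the n > 4
+ # paths by n mod 4.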
+ .section .rdata,"dr"
+ .align 8, 0x90
+Ltab: .long L4-Ltab
+ .long L1-Ltab
+ .long L2-Ltab
+ .long L3-Ltab
+ .long L0m4-Ltab
+ .long L1m4-Ltab
+ .long L2m4-Ltab
+ .long L3m4-Ltab
+ .text
+
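+ # n = 1: rp[1..0] = up[0]^2.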
+L1: mov (%rsi), %rax
+ mul %rax
+ add $40, %rsp
+ mov %rax, (%rdi)
+ mov %rdx, 8(%rdi)
+ pop %rsi
+ pop %rdi
+ ret
+
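+ # n = 2: the two diagonal squares plus the cross product up[1]*up[0]
+ # added in twice.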
+L2: mov (%rsi), %rax
+ mov %rax, %r8
+ mul %rax
+ mov 8(%rsi), %r11
+ mov %rax, (%rdi)
+ mov %r11, %rax
+ mov %rdx, %r9
+ mul %rax
+ add $40, %rsp
+ mov %rax, %r10
+ mov %r11, %rax
+ mov %rdx, %r11
+ mul %r8
+ xor %r8, %r8
+ add %rax, %r9
+ adc %rdx, %r10
+ adc %r8, %r11
+ add %rax, %r9
+ mov %r9, 8(%rdi)
+ adc %rdx, %r10
+ mov %r10, 16(%rdi)
+ adc %r8, %r11
+ mov %r11, 24(%rdi)
+ pop %rsi
+ pop %rdi
+ ret
+
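+ # n = 3: store the three diagonal squares, then double and add the
+ # cross products up[1]*up[0], up[2]*up[0], up[2]*up[1].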
+L3: mov (%rsi), %rax
+ mov %rax, %r10
+ mul %rax
+ mov 8(%rsi), %r11
+ mov %rax, (%rdi)
+ mov %r11, %rax
+ mov %rdx, 8(%rdi)
+ mul %rax
+ mov 16(%rsi), %rcx
+ mov %rax, 16(%rdi)
+ mov %rcx, %rax
+ mov %rdx, 24(%rdi)
+ mul %rax
+ mov %rax, 32(%rdi)
+ mov %rdx, 40(%rdi)
+
+ mov %r11, %rax
+ mul %r10
+ mov %rax, %r8
+ mov %rcx, %rax
+ mov %rdx, %r9
+ mul %r10
+ xor %r10, %r10
+ add %rax, %r9
+ mov %r11, %rax
+ mov %r10, %r11
+ adc %rdx, %r10
+
+ mul %rcx
+ add $40, %rsp
+ add %rax, %r10
+ adc %r11, %rdx
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %rdx, %rdx
+ adc %r11, %r11
+ add %r8, 8(%rdi)
+ adc %r9, 16(%rdi)
+ adc %r10, 24(%rdi)
+ adc %rdx, 32(%rdi)
+ adc %r11, 40(%rdi)
+ pop %rsi
+ pop %rdi
+ ret
+
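+ # n = 4: four diagonal squares, then the six doubled cross products;
+ # %rbx is the only extra register to restore before returning.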
+L4: mov (%rsi), %rax
+ mov %rax, %r11
+ mul %rax
+ mov 8(%rsi), %rbx
+ mov %rax, (%rdi)
+ mov %rbx, %rax
+ mov %rdx, 8(%rdi)
+ mul %rax
+ mov %rax, 16(%rdi)
+ mov %rdx, 24(%rdi)
+ mov 16(%rsi), %rax
+ mul %rax
+ mov %rax, 32(%rdi)
+ mov %rdx, 40(%rdi)
+ mov 24(%rsi), %rax
+ mul %rax
+ mov %rax, 48(%rdi)
+ mov %rbx, %rax
+ mov %rdx, 56(%rdi)
+
+ mul %r11
+ add $32, %rsp
+ mov %rax, %r8
+ mov %rdx, %r9
+ mov 16(%rsi), %rax
+ mul %r11
+ xor %r10, %r10
+ add %rax, %r9
+ adc %rdx, %r10
+ mov 24(%rsi), %rax
+ mul %r11
+ xor %r11, %r11
+ add %rax, %r10
+ adc %rdx, %r11
+ mov 16(%rsi), %rax
+ mul %rbx
+ xor %rcx, %rcx
+ add %rax, %r10
+ adc %rdx, %r11
+ adc $0, %rcx
+ mov 24(%rsi), %rax
+ mul %rbx
+ pop %rbx
+ add %rax, %r11
+ adc %rdx, %rcx
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %rax
+ mul %rdx
+ add %rax, %rcx
+ adc $0, %rdx
+
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %r11, %r11
+ adc %rcx, %rcx
+ mov $0, %eax
+ adc %rdx, %rdx
+
+ adc %rax, %rax
+ add %r8, 8(%rdi)
+ adc %r9, 16(%rdi)
+ adc %r10, 24(%rdi)
+ adc %r11, 32(%rdi)
+ adc %rcx, 40(%rdi)
+ adc %rdx, 48(%rdi)
+ adc %rax, 56(%rdi)
+ pop %rsi
+ pop %rdi
+ ret
+
+
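+ # n > 4, n ≡ 0 (mod 4): mul_1 first pass, rp[1..n] = up[1..n-1] * up[0].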
+L0m4:
+ lea -16(%rdi,%r11,8), %r12
+ mov (%rsi), %r13
+ mov 8(%rsi), %rax
+ lea (%rsi,%r11,8), %rsi
+
+ lea -4(%r11), %r8
+
+ xor %r9d, %r9d
+ sub %r11, %r9
+
+ mul %r13
+ xor %ebp, %ebp
+ mov %rax, %rbx
+ mov 16(%rsi,%r9,8), %rax
+ mov %rdx, %r10
+ jmp LL3
+
+ .align 16, 0x90
+Lmul_1_m3_top:
+ add %rax, %rbp
+ mov %r10, (%r12,%r9,8)
+ mov (%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+ xor %ebx, %ebx
+ mul %r13
+ xor %r10d, %r10d
+ mov %rbp, 8(%r12,%r9,8)
+ add %rax, %rcx
+ adc %rdx, %rbx
+ mov 8(%rsi,%r9,8), %rax
+ mov %rcx, 16(%r12,%r9,8)
+ xor %ebp, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov 16(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+LL3: xor %ecx, %ecx
+ mul %r13
+ add %rax, %r10
+ mov 24(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ mov %rbx, 24(%r12,%r9,8)
+ mul %r13
+ add $4, %r9
+ js Lmul_1_m3_top
+
+ add %rax, %rbp
+ mov %r10, (%r12)
+ adc %rdx, %rcx
+ mov %rbp, 8(%r12)
+ mov %rcx, 16(%r12)
+
+ lea 16(%r12), %r12
+ lea -8(%rsi), %rsi
+ jmp Ldowhile
+
+
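+ # n > 4, n ≡ 1 (mod 4): mul_2 first pass, accumulating the cross products
+ # of the multiplier limbs up[0] and up[1] into rp[1..].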
+L1m4:
+ lea 8(%rdi,%r11,8), %r12
+ mov (%rsi), %r13
+ mov 8(%rsi), %rax
+ lea 8(%rsi,%r11,8), %rsi
+
+ lea -3(%r11), %r8
+
+ lea -3(%r11), %r9
+ neg %r9
+
+ mov %rax, %r14
+ mul %r13
+ mov %rdx, %rcx
+ xor %ebp, %ebp
+ mov %rax, 8(%rdi)
+ jmp Lm0
+
+ .align 16, 0x90
+Lmul_2_m0_top:
+ mul %r14
+ add %rax, %rbx
+ adc %rdx, %rcx
+ mov -24(%rsi,%r9,8), %rax
+ mov $0, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul %r14
+ add %rax, %rcx
+ mov %rbx, -24(%r12,%r9,8)
+ adc %rdx, %rbp
+Lm0: mov -16(%rsi,%r9,8), %rax
+ mul %r13
+ mov $0, %r10d
+ add %rax, %rcx
+ adc %rdx, %rbp
+ mov -16(%rsi,%r9,8), %rax
+ adc $0, %r10d
+ mov $0, %ebx
+ mov %rcx, -16(%r12,%r9,8)
+ mul %r14
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mov $0, %ecx
+ mul %r13
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ adc $0, %ebx
+ mul %r14
+ add %rax, %r10
+ mov %rbp, -8(%r12,%r9,8)
+ adc %rdx, %rbx
+Lm2x: mov (%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ adc %rdx, %rbx
+ adc $0, %ecx
+ add $4, %r9
+ mov -32(%rsi,%r9,8), %rax
+ mov %r10, -32(%r12,%r9,8)
+ js Lmul_2_m0_top
+
+ mul %r14
+ add %rax, %rbx
+ adc %rdx, %rcx
+ mov %rbx, -8(%r12)
+ mov %rcx, (%r12)
+
+ lea -16(%rsi), %rsi
+ lea 0(%r12), %r12
+ jmp Ldowhile_end
+
+
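+ # n > 4, n ≡ 2 (mod 4): mul_1 first pass, rp[1..n] = up[1..n-1] * up[0].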
+L2m4:
+ lea -16(%rdi,%r11,8), %r12
+ mov (%rsi), %r13
+ mov 8(%rsi), %rax
+ lea (%rsi,%r11,8), %rsi
+
+ lea -4(%r11), %r8
+
+ lea -2(%r11), %r9
+ neg %r9
+
+ mul %r13
+ mov %rax, %rbp
+ mov (%rsi,%r9,8), %rax
+ mov %rdx, %rcx
+ jmp LL1
+
+ .align 16, 0x90
+Lmul_1_m1_top:
+ add %rax, %rbp
+ mov %r10, (%r12,%r9,8)
+ mov (%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+LL1: xor %ebx, %ebx
+ mul %r13
+ xor %r10d, %r10d
+ mov %rbp, 8(%r12,%r9,8)
+ add %rax, %rcx
+ adc %rdx, %rbx
+ mov 8(%rsi,%r9,8), %rax
+ mov %rcx, 16(%r12,%r9,8)
+ xor %ebp, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov 16(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ xor %ecx, %ecx
+ mul %r13
+ add %rax, %r10
+ mov 24(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ mov %rbx, 24(%r12,%r9,8)
+ mul %r13
+ add $4, %r9
+ js Lmul_1_m1_top
+
+ add %rax, %rbp
+ mov %r10, (%r12)
+ adc %rdx, %rcx
+ mov %rbp, 8(%r12)
+ mov %rcx, 16(%r12)
+
+ lea 16(%r12), %r12
+ lea -8(%rsi), %rsi
+ jmp Ldowhile_mid
+
+
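+ # n > 4, n ≡ 3 (mod 4): mul_2 first pass with multiplier limbs up[0] and up[1].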
+L3m4:
+ lea 8(%rdi,%r11,8), %r12
+ mov (%rsi), %r13
+ mov 8(%rsi), %rax
+ lea 8(%rsi,%r11,8), %rsi
+
+ lea -5(%r11), %r8
+
+ lea -1(%r11), %r9
+ neg %r9
+
+ mov %rax, %r14
+ mul %r13
+ mov %rdx, %r10
+ xor %ebx, %ebx
+ xor %ecx, %ecx
+ mov %rax, 8(%rdi)
+ jmp Lm2
+
+ .align 16, 0x90
+Lmul_2_m2_top:
+ mul %r14
+ add %rax, %rbx
+ adc %rdx, %rcx
+ mov -24(%rsi,%r9,8), %rax
+ mov $0, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul %r14
+ add %rax, %rcx
+ mov %rbx, -24(%r12,%r9,8)
+ adc %rdx, %rbp
+ mov -16(%rsi,%r9,8), %rax
+ mul %r13
+ mov $0, %r10d
+ add %rax, %rcx
+ adc %rdx, %rbp
+ mov -16(%rsi,%r9,8), %rax
+ adc $0, %r10d
+ mov $0, %ebx
+ mov %rcx, -16(%r12,%r9,8)
+ mul %r14
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mov $0, %ecx
+ mul %r13
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ adc $0, %ebx
+ mul %r14
+ add %rax, %r10
+ mov %rbp, -8(%r12,%r9,8)
+ adc %rdx, %rbx
+Lm2: mov (%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ adc %rdx, %rbx
+ adc $0, %ecx
+ add $4, %r9
+ mov -32(%rsi,%r9,8), %rax
+ mov %r10, -32(%r12,%r9,8)
+ js Lmul_2_m2_top
+
+ mul %r14
+ add %rax, %rbx
+ adc %rdx, %rcx
+ mov %rbx, -8(%r12)
+ mov %rcx, (%r12)
+
+ lea -16(%rsi), %rsi
+ jmp Ldowhile_mid
+
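+ # addmul_2 outer pass, first flavour: take the next pair of multiplier
+ # limbs from up, add their cross products into the partial triangle, and
+ # advance the result pointer by two limbs; %r8 drops by 2 per pass.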
+Ldowhile:
+
+ lea 4(%r8), %r9
+ neg %r9
+
+ mov 16(%rsi,%r9,8), %r13
+ mov 24(%rsi,%r9,8), %r14
+ mov 24(%rsi,%r9,8), %rax
+ mul %r13
+ xor %r10d, %r10d
+ add %rax, 24(%r12,%r9,8)
+ adc %rdx, %r10
+ xor %ebx, %ebx
+ xor %ecx, %ecx
+ jmp Lam2
+
+ .align 16, 0x90
+Laddmul_2_m2_top:
+ add %r10, (%r12,%r9,8)
+ adc %rax, %rbx
+ mov 8(%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+ mov $0, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov 8(%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul %r14
+ add %rbx, 8(%r12,%r9,8)
+ adc %rax, %rcx
+ adc %rdx, %rbp
+ mov 16(%rsi,%r9,8), %rax
+ mov $0, %r10d
+ mul %r13
+ add %rax, %rcx
+ mov 16(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ adc $0, %r10d
+ mul %r14
+ add %rcx, 16(%r12,%r9,8)
+ adc %rax, %rbp
+ mov 24(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mul %r13
+ mov $0, %ebx
+ add %rax, %rbp
+ adc %rdx, %r10
+ mov $0, %ecx
+ mov 24(%rsi,%r9,8), %rax
+ adc $0, %ebx
+ mul %r14
+ add %rbp, 24(%r12,%r9,8)
+ adc %rax, %r10
+ adc %rdx, %rbx
+Lam2: mov 32(%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ mov 32(%rsi,%r9,8), %rax
+ adc %rdx, %rbx
+ adc $0, %ecx
+ mul %r14
+ add $4, %r9
+ js Laddmul_2_m2_top
+
+ add %r10, (%r12)
+ adc %rax, %rbx
+ adc %rdx, %rcx
+ mov %rbx, 8(%r12)
+ mov %rcx, 16(%r12)
+
+ lea 16(%r12), %r12
+
+ add $-2, %r8d
+
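+ # addmul_2 outer pass, second flavour (different inner-loop alignment);
+ # alternates with Ldowhile above until %r8 reaches zero.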
+Ldowhile_mid:
+
+ lea 2(%r8), %r9
+ neg %r9
+
+ mov (%rsi,%r9,8), %r13
+ mov 8(%rsi,%r9,8), %r14
+ mov 8(%rsi,%r9,8), %rax
+ mul %r13
+ xor %ecx, %ecx
+ add %rax, 8(%r12,%r9,8)
+ adc %rdx, %rcx
+ xor %ebp, %ebp
+ jmp L20
+
+ .align 16, 0x90
+Laddmul_2_m0_top:
+ add %r10, (%r12,%r9,8)
+ adc %rax, %rbx
+ mov 8(%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+ mov $0, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov 8(%rsi,%r9,8), %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul %r14
+ add %rbx, 8(%r12,%r9,8)
+ adc %rax, %rcx
+ adc %rdx, %rbp
+L20: mov 16(%rsi,%r9,8), %rax
+ mov $0, %r10d
+ mul %r13
+ add %rax, %rcx
+ mov 16(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ adc $0, %r10d
+ mul %r14
+ add %rcx, 16(%r12,%r9,8)
+ adc %rax, %rbp
+ mov 24(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mul %r13
+ mov $0, %ebx
+ add %rax, %rbp
+ adc %rdx, %r10
+ mov $0, %ecx
+ mov 24(%rsi,%r9,8), %rax
+ adc $0, %ebx
+ mul %r14
+ add %rbp, 24(%r12,%r9,8)
+ adc %rax, %r10
+ adc %rdx, %rbx
+ mov 32(%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ mov 32(%rsi,%r9,8), %rax
+ adc %rdx, %rbx
+ adc $0, %ecx
+ mul %r14
+ add $4, %r9
+ js Laddmul_2_m0_top
+
+ add %r10, (%r12)
+ adc %rax, %rbx
+ adc %rdx, %rcx
+ mov %rbx, 8(%r12)
+ mov %rcx, 16(%r12)
+
+ lea 16(%r12), %r12
+Ldowhile_end:
+
+ add $-2, %r8d
+ jne Ldowhile
+
+
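+ # Finish the triangle: the remaining cross products among the top three
+ # limbs of up, done straight-line.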
+ mov -16(%rsi), %r13
+ mov -8(%rsi), %r14
+ mov -8(%rsi), %rax
+ mul %r13
+ xor %r10d, %r10d
+ add %rax, -8(%r12)
+ adc %rdx, %r10
+ xor %ebx, %ebx
+ xor %ecx, %ecx
+ mov (%rsi), %rax
+ mul %r13
+ add %rax, %r10
+ mov (%rsi), %rax
+ adc %rdx, %rbx
+ mul %r14
+ add %r10, (%r12)
+ adc %rax, %rbx
+ adc %rdx, %rcx
+ mov %rbx, 8(%r12)
+ mov %rcx, 16(%r12)
+
+
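+ # Final phase: rp = 2*T + diagonal squares.  Each unrolled iteration
+ # squares two limbs of up, doubles four limbs of the cross-product sum T
+ # in place in rp, and adds them, carrying through %ebp/%ebx via sbb.
+ # %r9 steps in rp limbs (two per up limb), hence the *4 scale on the up
+ # loads; Levn/Lodd pick the loop entry point from the parity of n.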
+ lea -4(%r11,%r11), %r9
+
+ mov 8(%rdi), %r11
+ lea -8(%rsi), %rsi
+ lea (%rdi,%r9,8), %rdi
+ neg %r9
+ mov (%rsi,%r9,4), %rax
+ mul %rax
+ test $2, %r9b
+ jnz Lodd
+
+Levn: add %r11, %r11
+ sbb %ebx, %ebx
+ add %rdx, %r11
+ mov %rax, (%rdi,%r9,8)
+ jmp Ld0
+
+Lodd: add %r11, %r11
+ sbb %ebp, %ebp
+ add %rdx, %r11
+ mov %rax, (%rdi,%r9,8)
+ lea -2(%r9), %r9
+ jmp Ld1
+
+ .align 16, 0x90
+Ltop: mov (%rsi,%r9,4), %rax
+ mul %rax
+ add %ebp, %ebp
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (%rdi,%r9,8)
+Ld0: mov %r11, 8(%rdi,%r9,8)
+ mov 16(%rdi,%r9,8), %r10
+ adc %r10, %r10
+ mov 24(%rdi,%r9,8), %r11
+ adc %r11, %r11
+ nop
+ sbb %ebp, %ebp
+ mov 8(%rsi,%r9,4), %rax
+ mul %rax
+ add %ebx, %ebx
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, 16(%rdi,%r9,8)
+Ld1: mov %r11, 24(%rdi,%r9,8)
+ mov 32(%rdi,%r9,8), %r10
+ adc %r10, %r10
+ mov 40(%rdi,%r9,8), %r11
+ adc %r11, %r11
+ sbb %ebx, %ebx
+ add $4, %r9
+ js Ltop
+
+ mov (%rsi), %rax
+ mul %rax
+ add %ebp, %ebp
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (%rdi)
+ mov %r11, 8(%rdi)
+ mov 16(%rdi), %r10
+ adc %r10, %r10
+ sbb %ebp, %ebp
+ neg %ebp
+ mov 8(%rsi), %rax
+ mul %rax
+ add %ebx, %ebx
+ adc %rax, %r10
+ adc %rbp, %rdx
+ mov %r10, 16(%rdi)
+ mov %rdx, 24(%rdi)
+
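+ # Restore the callee-saved registers (including Win64 %rsi/%rdi) and return.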
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rsi
+ pop %rdi
+ ret
+