| author    | polwex <polwex@sortug.com>                       | 2025-10-05 21:56:51 +0700 |
|-----------|--------------------------------------------------|---------------------------|
| committer | polwex <polwex@sortug.com>                       | 2025-10-05 21:56:51 +0700 |
| commit    | fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch) |                           |
| tree      | 51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /vere/ext/gmp/gen/x86_64-windows/mpn/mullo_basecase.s | |
claude is gud
Diffstat (limited to 'vere/ext/gmp/gen/x86_64-windows/mpn/mullo_basecase.s')
| -rw-r--r-- | vere/ext/gmp/gen/x86_64-windows/mpn/mullo_basecase.s | 452 |

1 file changed, 452 insertions, 0 deletions
```diff
diff --git a/vere/ext/gmp/gen/x86_64-windows/mpn/mullo_basecase.s b/vere/ext/gmp/gen/x86_64-windows/mpn/mullo_basecase.s
new file mode 100644
index 0000000..27bb7ab
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-windows/mpn/mullo_basecase.s
@@ -0,0 +1,452 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .text
+ .align 16, 0x90
+ .globl __gmpn_mullo_basecase
+
+ .def __gmpn_mullo_basecase
+ .scl 2
+ .type 32
+ .endef
+__gmpn_mullo_basecase:
+
+ push %rdi
+ push %rsi
+ mov %rcx, %rdi
+ mov %rdx, %rsi
+ mov %r8, %rdx
+ mov %r9, %rcx
+
+ cmp $4, %rcx
+ jge Lgen
+ mov (%rsi), %rax
+ mov (%rdx), %r8
+
+ lea Ltab(%rip), %r9
+ movslq (%r9,%rcx,4), %r10
+ add %r10, %r9
+ jmp *%r9
+
+ .section .rdata,"dr"
+ .align 8, 0x90
+Ltab: .long Ltab-Ltab
+ .long L1-Ltab
+ .long L2-Ltab
+ .long L3-Ltab
+ .text
+
+L1: imul %r8, %rax
+ mov %rax, (%rdi)
+ pop %rsi
+ pop %rdi
+ ret
+
+L2: mov 8(%rdx), %r11
+ imul %rax, %r11
+ mul %r8
+ mov %rax, (%rdi)
+ imul 8(%rsi), %r8
+ lea (%r11, %rdx), %rax
+ add %r8, %rax
+ mov %rax, 8(%rdi)
+ pop %rsi
+ pop %rdi
+ ret
+
+L3: mov 8(%rdx), %r9
+ mov 16(%rdx), %r11
+ mul %r8
+ mov %rax, (%rdi)
+ mov (%rsi), %rax
+ mov %rdx, %rcx
+ mul %r9
+ imul 8(%rsi), %r9
+ mov 16(%rsi), %r10
+ imul %r8, %r10
+ add %rax, %rcx
+ adc %rdx, %r9
+ add %r10, %r9
+ mov 8(%rsi), %rax
+ mul %r8
+ add %rax, %rcx
+ adc %rdx, %r9
+ mov %r11, %rax
+ imul (%rsi), %rax
+ add %rax, %r9
+ mov %rcx, 8(%rdi)
+ mov %r9, 16(%rdi)
+ pop %rsi
+ pop %rdi
+ ret
+
+L0m4:
+L1m4:
+L2m4:
+L3m4:
+Lgen: push %rbx
+ push %rbp
+ push %r13
+ push %r14
+ push %r15
+
+ mov (%rsi), %rax
+ mov (%rdx), %r13
+ mov %rdx, %r11
+
+ lea (%rdi,%rcx,8), %rdi
+ lea (%rsi,%rcx,8), %rsi
+ neg %rcx
+
+ mul %r13
+
+ test $1, %cl
+ jz Lmul_2
+
+Lmul_1:
+ lea -8(%rdi), %rdi
+ lea -8(%rsi), %rsi
+ test $2, %cl
+ jnz Lmul_1_prologue_3
+
+Lmul_1_prologue_2:
+ lea -1(%rcx), %r9
+ lea Laddmul_outer_1(%rip), %r8
+ mov %rax, %rbx
+ mov %rdx, %r15
+ xor %ebp, %ebp
+ xor %r10d, %r10d
+ mov 16(%rsi,%rcx,8), %rax
+ jmp Lmul_1_entry_2
+
+Lmul_1_prologue_3:
+ lea 1(%rcx), %r9
+ lea Laddmul_outer_3(%rip), %r8
+ mov %rax, %rbp
+ mov %rdx, %r10
+ xor %ebx, %ebx
+ jmp Lmul_1_entry_0
+
+ .align 16, 0x90
+Lmul_1_top:
+ mov %rbx, -16(%rdi,%r9,8)
+ add %rax, %r15
+ mov (%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ xor %ebx, %ebx
+ mul %r13
+ mov %r15, -8(%rdi,%r9,8)
+ add %rax, %rbp
+ adc %rdx, %r10
+Lmul_1_entry_0:
+ mov 8(%rsi,%r9,8), %rax
+ mul %r13
+ mov %rbp, (%rdi,%r9,8)
+ add %rax, %r10
+ adc %rdx, %rbx
+ mov 16(%rsi,%r9,8), %rax
+ mul %r13
+ mov %r10, 8(%rdi,%r9,8)
+ xor %ebp, %ebp
+ mov %rbp, %r10
+ add %rax, %rbx
+ mov 24(%rsi,%r9,8), %rax
+ mov %rbp, %r15
+ adc %rdx, %r15
+Lmul_1_entry_2:
+ mul %r13
+ add $4, %r9
+ js Lmul_1_top
+
+ mov %rbx, -16(%rdi)
+ add %rax, %r15
+ mov %r15, -8(%rdi)
+ adc %rdx, %rbp
+
+ imul (%rsi), %r13
+ add %r13, %rbp
+ mov %rbp, (%rdi)
+
+ add $1, %rcx
+ jz Lret
+
+ mov 8(%r11), %r13
+ mov 16(%r11), %r14
+
+ lea 16(%rsi), %rsi
+ lea 8(%r11), %r11
+ lea 24(%rdi), %rdi
+
+ jmp *%r8
+
+
+Lmul_2:
+ mov 8(%r11), %r14
+ test $2, %cl
+ jz Lmul_2_prologue_3
+
+ .align 16, 0x90
+Lmul_2_prologue_1:
+ lea 0(%rcx), %r9
+ mov %rax, %r10
+ mov %rdx, %rbx
+ xor %r15d, %r15d
+ mov (%rsi,%rcx,8), %rax
+ lea Laddmul_outer_3(%rip), %r8
+ jmp Lmul_2_entry_1
+
+ .align 16, 0x90
+Lmul_2_prologue_3:
+ lea 2(%rcx), %r9
+ mov $0, %r10d
+ mov %rax, %r15
+ mov (%rsi,%rcx,8), %rax
+ mov %rdx, %rbp
+ lea Laddmul_outer_1(%rip), %r8
+ jmp Lmul_2_entry_3
+
+ .align 16, 0x90
+Lmul_2_top:
+ mov -32(%rsi,%r9,8), %rax
+ mul %r14
+ add %rax, %rbx
+ adc %rdx, %r15
+ mov -24(%rsi,%r9,8), %rax
+ xor %ebp, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %r15
+ adc $0, %ebp
+ mul %r14
+ add %rax, %r15
+ mov %rbx, -24(%rdi,%r9,8)
+ adc %rdx, %rbp
+ mov -16(%rsi,%r9,8), %rax
+ mul %r13
+ mov $0, %r10d
+ add %rax, %r15
+ adc %rdx, %rbp
+ mov -16(%rsi,%r9,8), %rax
+ adc $0, %r10d
+Lmul_2_entry_3:
+ mov $0, %ebx
+ mov %r15, -16(%rdi,%r9,8)
+ mul %r14
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mov $0, %r15d
+ mul %r13
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ adc %r15d, %ebx
+ mul %r14
+ add %rax, %r10
+ mov %rbp, -8(%rdi,%r9,8)
+ adc %rdx, %rbx
+ mov (%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ adc %rdx, %rbx
+ adc $0, %r15d
+Lmul_2_entry_1:
+ add $4, %r9
+ mov %r10, -32(%rdi,%r9,8)
+ js Lmul_2_top
+
+ imul -16(%rsi), %r14
+ add %r14, %rbx
+ imul -8(%rsi), %r13
+ add %r13, %rbx
+ mov %rbx, -8(%rdi)
+
+ add $2, %rcx
+ jz Lret
+
+ mov 16(%r11), %r13
+ mov 24(%r11), %r14
+
+ lea 16(%r11), %r11
+ lea 16(%rdi), %rdi
+
+ jmp *%r8
+
+
+Laddmul_outer_1:
+ lea -2(%rcx), %r9
+ mov -16(%rsi,%rcx,8), %rax
+ mul %r13
+ mov %rax, %r10
+ mov -16(%rsi,%rcx,8), %rax
+ mov %rdx, %rbx
+ xor %r15d, %r15d
+ lea Laddmul_outer_3(%rip), %r8
+ jmp Laddmul_entry_1
+
+Laddmul_outer_3:
+ lea 0(%rcx), %r9
+ mov -16(%rsi,%rcx,8), %rax
+ xor %r10d, %r10d
+ mul %r13
+ mov %rax, %r15
+ mov -16(%rsi,%rcx,8), %rax
+ mov %rdx, %rbp
+ lea Laddmul_outer_1(%rip), %r8
+ jmp Laddmul_entry_3
+
+ .align 16, 0x90
+Laddmul_top:
+ add %r10, -32(%rdi,%r9,8)
+ adc %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %r15
+ xor %ebp, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %r15
+ adc %ebp, %ebp
+ mul %r14
+ xor %r10d, %r10d
+ add %rbx, -24(%rdi,%r9,8)
+ adc %rax, %r15
+ mov -16(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ mul %r13
+ add %rax, %r15
+ mov -16(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ adc $0, %r10d
+Laddmul_entry_3:
+ mul %r14
+ add %r15, -16(%rdi,%r9,8)
+ adc %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mul %r13
+ xor %ebx, %ebx
+ add %rax, %rbp
+ adc %rdx, %r10
+ mov $0, %r15d
+ mov -8(%rsi,%r9,8), %rax
+ adc %r15d, %ebx
+ mul %r14
+ add %rbp, -8(%rdi,%r9,8)
+ adc %rax, %r10
+ adc %rdx, %rbx
+ mov (%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ mov (%rsi,%r9,8), %rax
+ adc %rdx, %rbx
+ adc $0, %r15d
+Laddmul_entry_1:
+ mul %r14
+ add $4, %r9
+ js Laddmul_top
+
+ add %r10, -32(%rdi)
+ adc %rax, %rbx
+
+ imul -24(%rsi), %r13
+ add %r13, %rbx
+ add %rbx, -24(%rdi)
+
+ add $2, %rcx
+ jns Lret
+
+ lea 16(%r11), %r11
+
+ mov (%r11), %r13
+ mov 8(%r11), %r14
+
+ lea -16(%rsi), %rsi
+
+ jmp *%r8
+
+Lret: pop %r15
+ pop %r14
+ pop %r13
+ pop %rbp
+ pop %rbx
+ pop %rsi
+ pop %rdi
+ ret
+
```
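What the new file implements, for readers skimming the patch: `__gmpn_mullo_basecase` computes only the low `n` limbs of the product of two `n`-limb numbers. The first six instructions are a calling-convention shim: `%rdi` and `%rsi` are callee-saved under the Microsoft x64 ABI, so they are pushed, and the incoming arguments are moved from `%rcx`, `%rdx`, `%r8`, `%r9` into the System V registers (`%rdi`, `%rsi`, `%rdx`, `%rcx`) that the GMP-generated body expects. Sizes 1-3 dispatch through the `Ltab` jump table to the unrolled `L1`/`L2`/`L3` cases, while `n >= 4` takes the generic `Lgen` path. Below is a minimal C model of the operation built from GMP's public `mpn_mul_1`/`mpn_addmul_1`; `mullo_ref` is a hypothetical name for illustration, not the routine itself.

```c
#include <gmp.h>

/* Internal GMP prototype, per gmp-impl.h (not part of the public API):
     void mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n);

   Reference model: write only the low n limbs of {up,n} * {vp,n} to
   {rp,n}.  Row i of the schoolbook product is shifted left by i limbs,
   so only its low n-i limbs can land inside the result; carries out of
   limb n-1 are deliberately discarded.  */
static void
mullo_ref (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
{
  mpn_mul_1 (rp, up, n, vp[0]);               /* row 0: rp = up * vp[0] */
  for (mp_size_t i = 1; i < n; i++)
    mpn_addmul_1 (rp + i, up, n - i, vp[i]);  /* fold in truncated row i */
}
```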
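The `L2` case shows the truncation trick in miniature: one widening `mul` produces both limbs of `up[0] * vp[0]`, while the two cross products contribute only their low halves, so plain `imul` suffices and their carries are dropped. A C equivalent, assuming 64-bit limbs and a compiler with `unsigned __int128` (the helper name is again illustrative):

```c
#include <stdint.h>

/* The n == 2 path (label L2) in C: rp receives the low 128 bits of the
   256-bit product.  up[0]*vp[0] needs a full 64x64->128 multiply; the
   cross terms up[0]*vp[1] and up[1]*vp[0] matter only mod 2^64.  */
static void
mullo2_ref (uint64_t rp[2], const uint64_t up[2], const uint64_t vp[2])
{
  unsigned __int128 p = (unsigned __int128) up[0] * vp[0];
  rp[0] = (uint64_t) p;
  rp[1] = (uint64_t) (p >> 64) + up[0] * vp[1] + up[1] * vp[0];
}
```

The generic `Lgen` path scales the same idea: `mul_1`/`mul_2` passes write the first rows, the `addmul` passes fold in later multiplier limbs over progressively shorter spans, and each pass finishes its top limb with an `imul`, since only the low half of that final partial product survives the truncation.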