Diffstat (limited to 'vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s')
-rw-r--r--  vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s  573
1 file changed, 573 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s
new file mode 100644
index 0000000..b607e84
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s
@@ -0,0 +1,573 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
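+# __gmpn_mulmid_basecase(rp, up, un, vp, vn): basecase "middle product" of
+# {up,un} and {vp,vn}, writing un-vn+3 result limbs to rp (GMP's
+# mpn_mulmid_basecase).  SysV AMD64 arguments: %rdi = rp, %rsi = up,
+# %rdx = un, %rcx = vp, %r8 = vn.  Register roles, following the upstream
+# mpn/x86_64/mulmid_basecase.asm this file is generated from: %r15 = vp,
+# %r13 = row length un-vn+1, %r12/%r9 = current v limbs, %r11 = loop index,
+# %r14 = re-entry address for the outer loop.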
+ .text
+ .align 16, 0x90
+ .globl __gmpn_mulmid_basecase
+ .type __gmpn_mulmid_basecase,@function
+
+__gmpn_mulmid_basecase:
+
+
+
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
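+# Setup: %r13 = un - vn + 1 (limbs per output row); rp is advanced by that
+# many limbs so the unrolled loops can run a negative index up toward zero.
+# Rows shorter than 4 limbs take the .Ldiagonal path; otherwise vn odd
+# starts with a mul_1 pass and vn even with a mul_2 pass.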
+ mov %rcx, %r15
+
+
+ lea 1(%rdx), %r13
+ sub %r8, %r13
+
+ lea (%rdi,%r13,8), %rdi
+
+ cmp $4, %r13
+ jc .Ldiagonal
+
+ lea (%rsi,%rdx,8), %rsi
+
+ test $1, %r8
+ jz .Lmul_2
+
+
+
+
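+# mul_1: first pass when vn is odd.  Fills the output row with
+# {up row} * vp[0] (v0 in %r12), 4-way unrolled.  The prologue dispatches on
+# un mod 4 and leaves the matching addmul prologue address in %r14.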
+.Lmul_1:
+ mov %r13d, %ebx
+
+ neg %r13
+ mov (%rsi,%r13,8), %rax
+ mov (%r15), %r12
+ mul %r12
+
+ and $-4, %r13
+ mov %r13, %r11
+
+ and $3, %ebx
+ jz .Lmul_1_prologue_0
+ cmp $2, %ebx
+ jc .Lmul_1_prologue_1
+ jz .Lmul_1_prologue_2
+
+.Lmul_1_prologue_3:
+ mov %rax, %r10
+ mov %rdx, %rbx
+ lea .Laddmul_prologue_3(%rip), %r14
+ jmp .Lmul_1_entry_3
+
+ .align 16, 0x90
+.Lmul_1_prologue_0:
+ mov %rax, %rbp
+ mov %rdx, %r10
+ lea .Laddmul_prologue_0(%rip), %r14
+ jmp .Lmul_1_entry_0
+
+ .align 16, 0x90
+.Lmul_1_prologue_1:
+ add $4, %r11
+ mov %rax, %rcx
+ mov %rdx, %rbp
+ mov $0, %r10d
+ mov (%rsi,%r11,8), %rax
+ lea .Laddmul_prologue_1(%rip), %r14
+ jmp .Lmul_1_entry_1
+
+ .align 16, 0x90
+.Lmul_1_prologue_2:
+ mov %rax, %rbx
+ mov %rdx, %rcx
+ mov 24(%rsi,%r11,8), %rax
+ mov $0, %ebp
+ mov $0, %r10d
+ lea .Laddmul_prologue_2(%rip), %r14
+ jmp .Lmul_1_entry_2
+
+
+
+
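+# mul_1 inner loop: %r11 (negative) advances by 4 per iteration and the loop
+# runs until it reaches zero.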
+ .align 16, 0x90
+.Lmul_1_top:
+ mov %rbx, -16(%rdi,%r11,8)
+ add %rax, %rcx
+ mov (%rsi,%r11,8), %rax
+ adc %rdx, %rbp
+.Lmul_1_entry_1:
+ mov $0, %ebx
+ mul %r12
+ mov %rcx, -8(%rdi,%r11,8)
+ add %rax, %rbp
+ adc %rdx, %r10
+.Lmul_1_entry_0:
+ mov 8(%rsi,%r11,8), %rax
+ mul %r12
+ mov %rbp, (%rdi,%r11,8)
+ add %rax, %r10
+ adc %rdx, %rbx
+.Lmul_1_entry_3:
+ mov 16(%rsi,%r11,8), %rax
+ mul %r12
+ mov %r10, 8(%rdi,%r11,8)
+ mov $0, %ebp
+ mov %rbp, %r10
+ add %rax, %rbx
+ mov 24(%rsi,%r11,8), %rax
+ mov %rbp, %rcx
+ adc %rdx, %rcx
+.Lmul_1_entry_2:
+ mul %r12
+ add $4, %r11
+ js .Lmul_1_top
+
+ mov %rbx, -16(%rdi)
+ add %rax, %rcx
+ mov %rcx, -8(%rdi)
+ mov %rbp, 8(%rdi)
+ adc %rdx, %rbp
+ mov %rbp, (%rdi)
+
+ dec %r8
+ jz .Lret
+
+ lea -8(%rsi), %rsi
+ lea 8(%r15), %r15
+
+ mov %r13, %r11
+ mov (%r15), %r12
+ mov 8(%r15), %r9
+
+ jmp *%r14
+
+
+
+
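+# mul_2: first pass when vn is even.  Fills the output row with the products
+# of vp[0] (%r12) and vp[1] (%r9), handling two v limbs in one pass.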
+ .align 16, 0x90
+.Lmul_2:
+ mov %r13d, %ebx
+
+ neg %r13
+ mov -8(%rsi,%r13,8), %rax
+ mov (%r15), %r12
+ mov 8(%r15), %r9
+ mul %r9
+
+ and $-4, %r13
+ mov %r13, %r11
+
+ and $3, %ebx
+ jz .Lmul_2_prologue_0
+ cmp $2, %ebx
+ jc .Lmul_2_prologue_1
+ jz .Lmul_2_prologue_2
+
+.Lmul_2_prologue_3:
+ mov %rax, %rcx
+ mov %rdx, %rbp
+ lea .Laddmul_prologue_3(%rip), %r14
+ jmp .Lmul_2_entry_3
+
+ .align 16, 0x90
+.Lmul_2_prologue_0:
+ mov %rax, %rbx
+ mov %rdx, %rcx
+ lea .Laddmul_prologue_0(%rip), %r14
+ jmp .Lmul_2_entry_0
+
+ .align 16, 0x90
+.Lmul_2_prologue_1:
+ mov %rax, %r10
+ mov %rdx, %rbx
+ mov $0, %ecx
+ lea .Laddmul_prologue_1(%rip), %r14
+ jmp .Lmul_2_entry_1
+
+ .align 16, 0x90
+.Lmul_2_prologue_2:
+ mov %rax, %rbp
+ mov %rdx, %r10
+ mov $0, %ebx
+ mov 16(%rsi,%r11,8), %rax
+ lea .Laddmul_prologue_2(%rip), %r14
+ jmp .Lmul_2_entry_2
+
+
+
+
+ .align 16, 0x90
+.Lmul_2_top:
+ mov -8(%rsi,%r11,8), %rax
+ mul %r9
+ add %rax, %rbx
+ adc %rdx, %rcx
+.Lmul_2_entry_0:
+ mov $0, %ebp
+ mov (%rsi,%r11,8), %rax
+ mul %r12
+ add %rax, %rbx
+ mov (%rsi,%r11,8), %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul %r9
+ add %rax, %rcx
+ mov %rbx, (%rdi,%r11,8)
+ adc %rdx, %rbp
+.Lmul_2_entry_3:
+ mov 8(%rsi,%r11,8), %rax
+ mul %r12
+ mov $0, %r10d
+ add %rax, %rcx
+ adc %rdx, %rbp
+ mov $0, %ebx
+ adc $0, %r10d
+ mov 8(%rsi,%r11,8), %rax
+ mov %rcx, 8(%rdi,%r11,8)
+ mul %r9
+ add %rax, %rbp
+ mov 16(%rsi,%r11,8), %rax
+ adc %rdx, %r10
+.Lmul_2_entry_2:
+ mov $0, %ecx
+ mul %r12
+ add %rax, %rbp
+ mov 16(%rsi,%r11,8), %rax
+ adc %rdx, %r10
+ adc $0, %ebx
+ mul %r9
+ add %rax, %r10
+ mov %rbp, 16(%rdi,%r11,8)
+ adc %rdx, %rbx
+.Lmul_2_entry_1:
+ mov 24(%rsi,%r11,8), %rax
+ mul %r12
+ add %rax, %r10
+ adc %rdx, %rbx
+ adc $0, %ecx
+ add $4, %r11
+ mov %r10, -8(%rdi,%r11,8)
+ jnz .Lmul_2_top
+
+ mov %rbx, (%rdi)
+ mov %rcx, 8(%rdi)
+
+ sub $2, %r8
+ jz .Lret
+
+ lea 16(%r15), %r15
+ lea -16(%rsi), %rsi
+
+ mov %r13, %r11
+ mov (%r15), %r12
+ mov 8(%r15), %r9
+
+ jmp *%r14
+
+
+
+
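+# addmul_2 passes for the remaining v limbs: each pass accumulates the
+# products of two more v limbs (%r12, %r9) into the row already in rp.
+# The tail of each pass advances %r15 (vp) and rewinds %rsi (up), reloads
+# the v limbs, and re-enters through %r14 until %r8 (remaining vn) is zero.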
+ .align 16, 0x90
+.Laddmul_prologue_0:
+ mov -8(%rsi,%r11,8), %rax
+ mul %r9
+ mov %rax, %rcx
+ mov %rdx, %rbp
+ mov $0, %r10d
+ jmp .Laddmul_entry_0
+
+ .align 16, 0x90
+.Laddmul_prologue_1:
+ mov 16(%rsi,%r11,8), %rax
+ mul %r9
+ mov %rax, %rbx
+ mov %rdx, %rcx
+ mov $0, %ebp
+ mov 24(%rsi,%r11,8), %rax
+ jmp .Laddmul_entry_1
+
+ .align 16, 0x90
+.Laddmul_prologue_2:
+ mov 8(%rsi,%r11,8), %rax
+ mul %r9
+ mov %rax, %r10
+ mov %rdx, %rbx
+ mov $0, %ecx
+ jmp .Laddmul_entry_2
+
+ .align 16, 0x90
+.Laddmul_prologue_3:
+ mov (%rsi,%r11,8), %rax
+ mul %r9
+ mov %rax, %rbp
+ mov %rdx, %r10
+ mov $0, %ebx
+ mov $0, %ecx
+ jmp .Laddmul_entry_3
+
+
+
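+# 4-way unrolled addmul_2 inner loop over one output row.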
+ .align 16, 0x90
+.Laddmul_top:
+ mov $0, %r10d
+ add %rax, %rbx
+ mov -8(%rsi,%r11,8), %rax
+ adc %rdx, %rcx
+ adc $0, %ebp
+ mul %r9
+ add %rbx, -8(%rdi,%r11,8)
+ adc %rax, %rcx
+ adc %rdx, %rbp
+.Laddmul_entry_0:
+ mov (%rsi,%r11,8), %rax
+ mul %r12
+ add %rax, %rcx
+ mov (%rsi,%r11,8), %rax
+ adc %rdx, %rbp
+ adc $0, %r10d
+ mul %r9
+ add %rcx, (%rdi,%r11,8)
+ mov $0, %ecx
+ adc %rax, %rbp
+ mov $0, %ebx
+ adc %rdx, %r10
+.Laddmul_entry_3:
+ mov 8(%rsi,%r11,8), %rax
+ mul %r12
+ add %rax, %rbp
+ mov 8(%rsi,%r11,8), %rax
+ adc %rdx, %r10
+ adc $0, %ebx
+ mul %r9
+ add %rbp, 8(%rdi,%r11,8)
+ adc %rax, %r10
+ adc %rdx, %rbx
+.Laddmul_entry_2:
+ mov 16(%rsi,%r11,8), %rax
+ mul %r12
+ add %rax, %r10
+ mov 16(%rsi,%r11,8), %rax
+ adc %rdx, %rbx
+ adc $0, %ecx
+ mul %r9
+ add %r10, 16(%rdi,%r11,8)
+ nop
+ adc %rax, %rbx
+ mov $0, %ebp
+ mov 24(%rsi,%r11,8), %rax
+ adc %rdx, %rcx
+.Laddmul_entry_1:
+ mul %r12
+ add $4, %r11
+ jnz .Laddmul_top
+
+ add %rax, %rbx
+ adc %rdx, %rcx
+ adc $0, %ebp
+
+ add %rbx, -8(%rdi)
+ adc %rcx, (%rdi)
+ adc %rbp, 8(%rdi)
+
+ sub $2, %r8
+ jz .Lret
+
+ lea 16(%r15), %r15
+ lea -16(%rsi), %rsi
+
+ mov %r13, %r11
+ mov (%r15), %r12
+ mov 8(%r15), %r9
+
+ jmp *%r14
+
+
+
+
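+# Diagonal method for short rows (un - vn + 1 < 4): each output limb is a
+# dot product along one diagonal, accumulated in %rbx:%rcx:%rbp.  In
+# .Ldiag_prologue_0/1/2, "lea 0(%rip), %r14" records the address of the
+# following %rax reload as the outer loop's re-entry point.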
+ .align 16, 0x90
+.Ldiagonal:
+ xor %ebx, %ebx
+ xor %ecx, %ecx
+ xor %ebp, %ebp
+
+ neg %r13
+
+ mov %r8d, %eax
+ and $3, %eax
+ jz .Ldiag_prologue_0
+ cmp $2, %eax
+ jc .Ldiag_prologue_1
+ jz .Ldiag_prologue_2
+
+.Ldiag_prologue_3:
+ lea -8(%r15), %r15
+ mov %r15, %r10
+ add $1, %r8
+ mov %r8, %r11
+ lea .Ldiag_entry_3(%rip), %r14
+ jmp .Ldiag_entry_3
+
+.Ldiag_prologue_0:
+ mov %r15, %r10
+ mov %r8, %r11
+ lea 0(%rip), %r14
+ mov -8(%rsi,%r11,8), %rax
+ jmp .Ldiag_entry_0
+
+.Ldiag_prologue_1:
+ lea 8(%r15), %r15
+ mov %r15, %r10
+ add $3, %r8
+ mov %r8, %r11
+ lea 0(%rip), %r14
+ mov -8(%r10), %rax
+ jmp .Ldiag_entry_1
+
+.Ldiag_prologue_2:
+ lea -16(%r15), %r15
+ mov %r15, %r10
+ add $2, %r8
+ mov %r8, %r11
+ lea 0(%rip), %r14
+ mov 16(%r10), %rax
+ jmp .Ldiag_entry_2
+
+
+
+
+ .align 16, 0x90
+.Ldiag_top:
+ add %rax, %rbx
+ adc %rdx, %rcx
+ mov -8(%rsi,%r11,8), %rax
+ adc $0, %rbp
+.Ldiag_entry_0:
+ mulq (%r10)
+ add %rax, %rbx
+ adc %rdx, %rcx
+ adc $0, %rbp
+.Ldiag_entry_3:
+ mov -16(%rsi,%r11,8), %rax
+ mulq 8(%r10)
+ add %rax, %rbx
+ mov 16(%r10), %rax
+ adc %rdx, %rcx
+ adc $0, %rbp
+.Ldiag_entry_2:
+ mulq -24(%rsi,%r11,8)
+ add %rax, %rbx
+ mov 24(%r10), %rax
+ adc %rdx, %rcx
+ lea 32(%r10), %r10
+ adc $0, %rbp
+.Ldiag_entry_1:
+ mulq -32(%rsi,%r11,8)
+ sub $4, %r11
+ jnz .Ldiag_top
+
+ add %rax, %rbx
+ adc %rdx, %rcx
+ adc $0, %rbp
+
+ mov %rbx, (%rdi,%r13,8)
+
+ inc %r13
+ jz .Ldiag_end
+
+ mov %r8, %r11
+ mov %r15, %r10
+
+ lea 8(%rsi), %rsi
+ mov %rcx, %rbx
+ mov %rbp, %rcx
+ xor %ebp, %ebp
+
+ jmp *%r14
+
+.Ldiag_end:
+ mov %rcx, (%rdi)
+ mov %rbp, 8(%rdi)
+
+.Lret: pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+ .size __gmpn_mulmid_basecase,.-__gmpn_mulmid_basecase