Diffstat (limited to 'vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s')
-rw-r--r--	vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s	439
1 files changed, 439 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s
new file mode 100644
index 0000000..d76272c
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s
@@ -0,0 +1,439 @@
+
+
+ .text
+ .align 16, 0x90
+ .globl __gmpn_mullo_basecase
+ .type __gmpn_mullo_basecase,@function
+
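+# mpn_mullo_basecase(rp, up, vp, n): compute the low n limbs of the
+# product {up,n} * {vp,n} and store them at {rp,n}.
+# SysV x86-64 arguments: rdi = rp, rsi = up, rdx = vp, rcx = n.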
+__gmpn_mullo_basecase:
+
+
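+	# Sizes n >= 4 take the generic path; n = 1, 2, 3 are handled by the
+	# unrolled code reached through the .Ltab jump table.  u0 and v0 are
+	# preloaded for those small cases.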
+ cmp $4, %rcx
+ jge .Lgen
+ mov (%rsi), %rax
+ mov (%rdx), %r8
+
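+	# Position-independent dispatch: .Ltab holds 32-bit offsets of the
+	# small-size entry points relative to .Ltab itself.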
+ lea .Ltab(%rip), %r9
+ movslq (%r9,%rcx,4), %r10
+ add %r10, %r9
+ jmp *%r9
+
+ .section .data.rel.ro.local,"a",@progbits
+ .align 8, 0x90
+.Ltab: .long .Ltab-.Ltab
+ .long .L1-.Ltab
+ .long .L2-.Ltab
+ .long .L3-.Ltab
+ .text
+
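+	# n = 1: rp[0] = low limb of u0 * v0.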
+.L1: imul %r8, %rax
+ mov %rax, (%rdi)
+
+ ret
+
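+	# n = 2: rp[0] is the low half of u0*v0; rp[1] collects the high half
+	# of u0*v0 plus the low halves of the cross products u0*v1 and u1*v0.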
+.L2: mov 8(%rdx), %r11
+ imul %rax, %r11
+ mul %r8
+ mov %rax, (%rdi)
+ imul 8(%rsi), %r8
+ lea (%r11, %rdx), %rax
+ add %r8, %rax
+ mov %rax, 8(%rdi)
+
+ ret
+
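+	# n = 3: widening products feed rp[0] and rp[1]; the partial products
+	# that land only in rp[2] need just their low halves (imul).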
+.L3: mov 8(%rdx), %r9
+ mov 16(%rdx), %r11
+ mul %r8
+ mov %rax, (%rdi)
+ mov (%rsi), %rax
+ mov %rdx, %rcx
+ mul %r9
+ imul 8(%rsi), %r9
+ mov 16(%rsi), %r10
+ imul %r8, %r10
+ add %rax, %rcx
+ adc %rdx, %r9
+ add %r10, %r9
+ mov 8(%rsi), %rax
+ mul %r8
+ add %rax, %rcx
+ adc %rdx, %r9
+ mov %r11, %rax
+ imul (%rsi), %rax
+ add %rax, %r9
+ mov %rcx, 8(%rdi)
+ mov %r9, 16(%rdi)
+
+ ret
+
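+	# Generic case (n >= 4): one initial mul_1 pass (odd n) or mul_2 pass
+	# (even n), then addmul_2 passes, every pass truncated so that only the
+	# low n limbs of the product are formed.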
+.L0m4:
+.L1m4:
+.L2m4:
+.L3m4:
+.Lgen: push %rbx
+ push %rbp
+ push %r13
+ push %r14
+ push %r15
+
+ mov (%rsi), %rax
+ mov (%rdx), %r13
+ mov %rdx, %r11
+
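+	# Point rp and up just past their ends and negate n, so the loop index
+	# counts upward toward zero.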
+ lea (%rdi,%rcx,8), %rdi
+ lea (%rsi,%rcx,8), %rsi
+ neg %rcx
+
+ mul %r13
+
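+	# Branch on the parity of n.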
+ test $1, %cl
+ jz .Lmul_2
+
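+	# First pass for odd n: {rp,n} = {up,n} * v0.  Two prologues adjust for
+	# n mod 4 before entering the 4-way unrolled loop.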
+.Lmul_1:
+ lea -8(%rdi), %rdi
+ lea -8(%rsi), %rsi
+ test $2, %cl
+ jnz .Lmul_1_prologue_3
+
+.Lmul_1_prologue_2:
+ lea -1(%rcx), %r9
+ lea .Laddmul_outer_1(%rip), %r8
+ mov %rax, %rbx
+ mov %rdx, %r15
+ xor %ebp, %ebp
+ xor %r10d, %r10d
+ mov 16(%rsi,%rcx,8), %rax
+ jmp .Lmul_1_entry_2
+
+.Lmul_1_prologue_3:
+ lea 1(%rcx), %r9
+ lea .Laddmul_outer_3(%rip), %r8
+ mov %rax, %rbp
+ mov %rdx, %r10
+ xor %ebx, %ebx
+ jmp .Lmul_1_entry_0
+
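+	# mul_1 inner loop: four limbs of up[] times v0 per iteration.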
+ .align 16, 0x90
+.Lmul_1_top:
+ mov %rbx, -16(%rdi,%r9,8)
+ add %rax, %r15
+ mov (%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ xor %ebx, %ebx
+ mul %r13
+ mov %r15, -8(%rdi,%r9,8)
+ add %rax, %rbp
+ adc %rdx, %r10
+.Lmul_1_entry_0:
+ mov 8(%rsi,%r9,8), %rax
+ mul %r13
+ mov %rbp, (%rdi,%r9,8)
+ add %rax, %r10
+ adc %rdx, %rbx
+ mov 16(%rsi,%r9,8), %rax
+ mul %r13
+ mov %r10, 8(%rdi,%r9,8)
+ xor %ebp, %ebp
+ mov %rbp, %r10
+ add %rax, %rbx
+ mov 24(%rsi,%r9,8), %rax
+ mov %rbp, %r15
+ adc %rdx, %r15
+.Lmul_1_entry_2:
+ mul %r13
+ add $4, %r9
+ js .Lmul_1_top
+
+ mov %rbx, -16(%rdi)
+ add %rax, %r15
+ mov %r15, -8(%rdi)
+ adc %rdx, %rbp
+
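+	# Truncated top term of this pass: only the low half of up[n-1] * v0
+	# contributes to rp[n-1].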
+ imul (%rsi), %r13
+ add %r13, %rbp
+ mov %rbp, (%rdi)
+
+ add $1, %rcx
+ jz .Lret
+
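+	# More columns remain: fetch the next two multiplier limbs and enter
+	# the addmul_2 chain through the entry point held in %r8.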
+ mov 8(%r11), %r13
+ mov 16(%r11), %r14
+
+ lea 16(%rsi), %rsi
+ lea 8(%r11), %r11
+ lea 24(%rdi), %rdi
+
+ jmp *%r8
+
+
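+	# First pass for even n: {rp,n} = {up} * {v0,v1}, truncated to n limbs.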
+.Lmul_2:
+ mov 8(%r11), %r14
+ test $2, %cl
+ jz .Lmul_2_prologue_3
+
+ .align 16, 0x90
+.Lmul_2_prologue_1:
+ lea 0(%rcx), %r9
+ mov %rax, %r10
+ mov %rdx, %rbx
+ xor %r15d, %r15d
+ mov (%rsi,%rcx,8), %rax
+ lea .Laddmul_outer_3(%rip), %r8
+ jmp .Lmul_2_entry_1
+
+ .align 16, 0x90
+.Lmul_2_prologue_3:
+ lea 2(%rcx), %r9
+ mov $0, %r10d
+ mov %rax, %r15
+ mov (%rsi,%rcx,8), %rax
+ mov %rdx, %rbp
+ lea .Laddmul_outer_1(%rip), %r8
+ jmp .Lmul_2_entry_3
+
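+	# mul_2 inner loop: four limbs of up[] times the limb pair v0,v1 per
+	# iteration.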
+ .align 16, 0x90
+.Lmul_2_top:
+ mov -32(%rsi,%r9,8), %rax
+ mul %r14
+ add %rax, %rbx
+ adc %rdx, %r15
+ mov -24(%rsi,%r9,8), %rax
+ xor %ebp, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %r15
+ adc $0, %ebp
+ mul %r14
+ add %rax, %r15
+ mov %rbx, -24(%rdi,%r9,8)
+ adc %rdx, %rbp
+ mov -16(%rsi,%r9,8), %rax
+ mul %r13
+ mov $0, %r10d
+ add %rax, %r15
+ adc %rdx, %rbp
+ mov -16(%rsi,%r9,8), %rax
+ adc $0, %r10d
+.Lmul_2_entry_3:
+ mov $0, %ebx
+ mov %r15, -16(%rdi,%r9,8)
+ mul %r14
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mov $0, %r15d
+ mul %r13
+ add %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ adc %r15d, %ebx
+ mul %r14
+ add %rax, %r10
+ mov %rbp, -8(%rdi,%r9,8)
+ adc %rdx, %rbx
+ mov (%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ adc %rdx, %rbx
+ adc $0, %r15d
+.Lmul_2_entry_1:
+ add $4, %r9
+ mov %r10, -32(%rdi,%r9,8)
+ js .Lmul_2_top
+
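+	# Truncated top terms: the low halves of up[n-2]*v1 and up[n-1]*v0 go
+	# into rp[n-1].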
+ imul -16(%rsi), %r14
+ add %r14, %rbx
+ imul -8(%rsi), %r13
+ add %r13, %rbx
+ mov %rbx, -8(%rdi)
+
+ add $2, %rcx
+ jz .Lret
+
+ mov 16(%r11), %r13
+ mov 24(%r11), %r14
+
+ lea 16(%r11), %r11
+ lea 16(%rdi), %rdi
+
+ jmp *%r8
+
+
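+	# addmul_2 outer entry points: each pass adds a (two limbs shorter)
+	# prefix of up[] times the next limb pair of vp[] into the remaining
+	# high limbs of rp[].  The two entries handle the length mod 4, and
+	# each loads %r8 with the other for the following pass.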
+.Laddmul_outer_1:
+ lea -2(%rcx), %r9
+ mov -16(%rsi,%rcx,8), %rax
+ mul %r13
+ mov %rax, %r10
+ mov -16(%rsi,%rcx,8), %rax
+ mov %rdx, %rbx
+ xor %r15d, %r15d
+ lea .Laddmul_outer_3(%rip), %r8
+ jmp .Laddmul_entry_1
+
+.Laddmul_outer_3:
+ lea 0(%rcx), %r9
+ mov -16(%rsi,%rcx,8), %rax
+ xor %r10d, %r10d
+ mul %r13
+ mov %rax, %r15
+ mov -16(%rsi,%rcx,8), %rax
+ mov %rdx, %rbp
+ lea .Laddmul_outer_1(%rip), %r8
+ jmp .Laddmul_entry_3
+
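+	# addmul_2 inner loop: accumulate four limbs of up[] times the current
+	# limb pair into rp[] per iteration.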
+ .align 16, 0x90
+.Laddmul_top:
+ add %r10, -32(%rdi,%r9,8)
+ adc %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %r15
+ xor %ebp, %ebp
+ mul %r13
+ add %rax, %rbx
+ mov -24(%rsi,%r9,8), %rax
+ adc %rdx, %r15
+ adc %ebp, %ebp
+ mul %r14
+ xor %r10d, %r10d
+ add %rbx, -24(%rdi,%r9,8)
+ adc %rax, %r15
+ mov -16(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ mul %r13
+ add %rax, %r15
+ mov -16(%rsi,%r9,8), %rax
+ adc %rdx, %rbp
+ adc $0, %r10d
+.Laddmul_entry_3:
+ mul %r14
+ add %r15, -16(%rdi,%r9,8)
+ adc %rax, %rbp
+ mov -8(%rsi,%r9,8), %rax
+ adc %rdx, %r10
+ mul %r13
+ xor %ebx, %ebx
+ add %rax, %rbp
+ adc %rdx, %r10
+ mov $0, %r15d
+ mov -8(%rsi,%r9,8), %rax
+ adc %r15d, %ebx
+ mul %r14
+ add %rbp, -8(%rdi,%r9,8)
+ adc %rax, %r10
+ adc %rdx, %rbx
+ mov (%rsi,%r9,8), %rax
+ mul %r13
+ add %rax, %r10
+ mov (%rsi,%r9,8), %rax
+ adc %rdx, %rbx
+ adc $0, %r15d
+.Laddmul_entry_1:
+ mul %r14
+ add $4, %r9
+ js .Laddmul_top
+
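+	# Wind-down of this addmul_2 pass; the truncated column takes only the
+	# low half of its last cross product (imul).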
+ add %r10, -32(%rdi)
+ adc %rax, %rbx
+
+ imul -24(%rsi), %r13
+ add %r13, %rbx
+ add %rbx, -24(%rdi)
+
+ add $2, %rcx
+ jns .Lret
+
+ lea 16(%r11), %r11
+
+ mov (%r11), %r13
+ mov 8(%r11), %r14
+
+ lea -16(%rsi), %rsi
+
+ jmp *%r8
+
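+	# Restore callee-saved registers and return.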
+.Lret: pop %r15
+ pop %r14
+ pop %r13
+ pop %rbp
+ pop %rbx
+
+ ret
+ .size __gmpn_mullo_basecase,.-__gmpn_mullo_basecase