path: root/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s
Diffstat (limited to 'vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s')
-rw-r--r--   vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s   520
1 file changed, 520 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s
new file mode 100644
index 0000000..fea3649
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s
@@ -0,0 +1,520 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ .text
+ .align 5, 0x90
+ .globl ___gmpn_sqr_basecase
+
+
+___gmpn_sqr_basecase:
+
+
+
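+# SysV AMD64 arguments: %rdi = rp, %rsi = up, %rdx = n (n >= 1); on return
+# rp[0 .. 2n-1] holds the square of the n-limb operand at up.
+# The ".byte 0xc4,..." sequences are hand-encoded BMI2 mulx instructions,
+# presumably kept in byte form so that assemblers without mulx support
+# still accept this generated file; each is annotated with its decoding.
+
+# n == 1: rp[0..1] = up[0]^2.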
+ cmp $2, %rdx
+ jae Lgt1
+
+ mov (%rsi), %rdx
+ .byte 0xc4,226,251,0xf6,210  # mulx %rdx, %rax, %rdx
+ mov %rax, (%rdi)
+ mov %rdx, 8(%rdi)
+
+ ret
+
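+# Lgt1: n >= 2.  If n != 2, branch on to Lgt2; otherwise square the
+# two-limb operand: form u1*u0, u0^2 and u1^2 with mulx, then double the
+# cross product and fold it in between the two squares, giving rp[0..3].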
+Lgt1: jne Lgt2
+
+ mov (%rsi), %rdx
+ mov 8(%rsi), %rcx
+ .byte 0xc4,98,179,0xf6,209  # mulx %rcx, %r9, %r10
+ .byte 0xc4,98,251,0xf6,194  # mulx %rdx, %rax, %r8
+ mov %rcx, %rdx
+ .byte 0xc4,226,163,0xf6,210  # mulx %rdx, %r11, %rdx
+ add %r9, %r9
+ adc %r10, %r10
+ adc $0, %rdx
+ add %r9, %r8
+ adc %r11, %r10
+ adc $0, %rdx
+ mov %rax, (%rdi)
+ mov %r8, 8(%rdi)
+ mov %r10, 16(%rdi)
+ mov %rdx, 24(%rdi)
+
+ ret
+
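+# Lgt2: n >= 3.  For n >= 4 branch to the general case; otherwise square
+# the three-limb operand: form the cross products u1*u0, u2*u0 and u2*u1
+# once, then double them and add the diagonal squares u0^2, u1^2, u2^2.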
+Lgt2: cmp $4, %rdx
+ jae Lgt3
+
+
+
+
+
+ mov (%rsi), %r8
+ mov 8(%rsi), %rdx
+ mov %rdx, %r9
+ .byte 0xc4,194,163,0xf6,192  # mulx %r8, %r11, %rax
+ mov 16(%rsi), %rdx
+ .byte 0xc4,194,171,0xf6,200  # mulx %r8, %r10, %rcx
+ mov %r11, %r8
+ add %rax, %r10
+ adc $0, %rcx
+ .byte 0xc4,194,235,0xf6,193  # mulx %r9, %rdx, %rax
+ add %rcx, %rdx
+ mov %rdx, 24(%rdi)
+ adc $0, %rax
+ mov %rax, 32(%rdi)
+ xor %ecx, %ecx
+ mov (%rsi), %rdx
+ .byte 0xc4,98,251,0xf6,218  # mulx %rdx, %rax, %r11
+ mov %rax, (%rdi)
+ add %r8, %r8
+ adc %r10, %r10
+ setc %cl
+ mov 8(%rsi), %rdx
+ .byte 0xc4,226,251,0xf6,210  # mulx %rdx, %rax, %rdx
+ add %r11, %r8
+ adc %rax, %r10
+ mov %r8, 8(%rdi)
+ mov %r10, 16(%rdi)
+ mov 24(%rdi), %r8
+ mov 32(%rdi), %r10
+ lea (%rdx,%rcx), %r11
+ adc %r8, %r8
+ adc %r10, %r10
+ setc %cl
+ mov 16(%rsi), %rdx
+ .byte 0xc4,226,251,0xf6,210  # mulx %rdx, %rax, %rdx
+ add %r11, %r8
+ adc %rax, %r10
+ mov %r8, 24(%rdi)
+ mov %r10, 32(%rdi)
+ adc %rcx, %rdx
+ mov %rdx, 40(%rdi)
+
+ ret
+
+Lgt3:
+
+
+
+
+
+
+
+
+
+
+
+
+
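+# General case, n >= 4.  The cross products sum_{i<j} u[i]*u[j]*B^(i+j)
+# are first accumulated, each exactly once, starting at rp+1: an initial
+# mul_2 pass multiplies the tail of the operand by the pair (u[0], u[1]),
+# the addmul_2 passes below add the rows for each following limb pair, and
+# Lcorner finishes the last one or two rows.  A final pass
+# (Lsqr_diag_addlsh1) doubles that sum and adds the squares u[i]^2.
+# %r12 carries a negated limb count; the loops are four-way unrolled and
+# dispatch on the count mod 4.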
+Ldo_mul_2:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ mov $0, %r12d
+ sub %rdx, %r12
+ push %r12
+ mov (%rsi), %r8
+ mov 8(%rsi), %rdx
+ lea 2(%r12), %rcx
+ sar $2, %rcx
+ inc %r12
+ mov %rdx, %r9
+
+ test $1, %r12b
+ jnz Lmx1
+
+Lmx0: .byte 0xc4,66,227,0xf6,216  # mulx %r8, %rbx, %r11
+ mov 16(%rsi), %rdx
+ mov %rbx, 8(%rdi)
+ xor %rbx, %rbx
+ .byte 0xc4,194,171,0xf6,232  # mulx %r8, %r10, %rbp
+ test $2, %r12b
+ jz Lm00
+
+Lm10: lea -8(%rdi), %rdi
+ lea -8(%rsi), %rsi
+ jmp Lmlo2
+
+Lm00: lea 8(%rsi), %rsi
+ lea 8(%rdi), %rdi
+ jmp Lmlo0
+
+Lmx1: .byte 0xc4,194,171,0xf6,232  # mulx %r8, %r10, %rbp
+ mov 16(%rsi), %rdx
+ mov %r10, 8(%rdi)
+ xor %r10, %r10
+ .byte 0xc4,66,227,0xf6,216  # mulx %r8, %rbx, %r11
+ test $2, %r12b
+ jz Lmlo3
+
+Lm01: lea 16(%rdi), %rdi
+ lea 16(%rsi), %rsi
+ jmp Lmlo1
+
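+# Four-way unrolled mul_2 inner loop: each pass consumes four operand
+# limbs, multiplies each by u[0] (%r8) and u[1] (%r9), and stores the
+# running sum through %rdi.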
+ .align 5, 0x90
+Lmtop:.byte 0xc4,66,251,0xf6,209  # mulx %r9, %rax, %r10
+ add %rax, %rbx
+ mov (%rsi), %rdx
+ .byte 0xc4,66,251,0xf6,216  # mulx %r8, %rax, %r11
+ adc $0, %r10
+ add %rax, %rbx
+Lmlo1:adc $0, %r11
+ add %rbp, %rbx
+ mov %rbx, (%rdi)
+ adc $0, %r11
+ .byte 0xc4,194,251,0xf6,217  # mulx %r9, %rax, %rbx
+ add %rax, %r10
+ mov 8(%rsi), %rdx
+ adc $0, %rbx
+ .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ add %rax, %r10
+ adc $0, %rbp
+Lmlo0:add %r11, %r10
+ mov %r10, 8(%rdi)
+ adc $0, %rbp
+ .byte 0xc4,66,251,0xf6,209  # mulx %r9, %rax, %r10
+ add %rax, %rbx
+ mov 16(%rsi), %rdx
+ .byte 0xc4,66,251,0xf6,216  # mulx %r8, %rax, %r11
+ adc $0, %r10
+ add %rax, %rbx
+ adc $0, %r11
+Lmlo3:add %rbp, %rbx
+ mov %rbx, 16(%rdi)
+ adc $0, %r11
+ .byte 0xc4,194,251,0xf6,217  # mulx %r9, %rax, %rbx
+ add %rax, %r10
+ mov 24(%rsi), %rdx
+ adc $0, %rbx
+ .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ add %rax, %r10
+ adc $0, %rbp
+Lmlo2:add %r11, %r10
+ lea 32(%rsi), %rsi
+ mov %r10, 24(%rdi)
+ adc $0, %rbp
+ inc %rcx
+ lea 32(%rdi), %rdi
+ jnz Lmtop
+
+Lmend:.byte 0xc4,194,235,0xf6,193  # mulx %r9, %rdx, %rax
+ add %rdx, %rbx
+ adc $0, %rax
+ add %rbp, %rbx
+ mov %rbx, (%rdi)
+ adc $0, %rax
+ mov %rax, 8(%rdi)
+
+ lea 16(%rsi), %rsi
+ lea -16(%rdi), %rdi
+
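+# addmul_2 outer loop: each pass loads the next multiplier pair v0, v1
+# from the operand and adds the corresponding two rows of cross products
+# into the partial sum already stored at rp; %r12 (negative) tracks the
+# remaining limbs.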
+Ldo_addmul_2:
+Louter:
+ lea (%rsi,%r12,8), %rsi
+ lea 48(%rdi,%r12,8), %rdi
+
+ mov -8(%rsi), %r8
+
+ add $2, %r12
+ cmp $-2, %r12
+ jge Lcorner
+
+ mov (%rsi), %r9
+
+ lea 1(%r12), %rcx
+ sar $2, %rcx
+
+ mov %r9, %rdx
+ test $1, %r12b
+ jnz Lbx1
+
+Lbx0: mov (%rdi), %r13
+ mov 8(%rdi), %r14
+ .byte 0xc4,66,251,0xf6,216  # mulx %r8, %rax, %r11
+ add %rax, %r13
+ adc $0, %r11
+ mov %r13, (%rdi)
+ xor %rbx, %rbx
+ test $2, %r12b
+ jnz Lb10
+
+Lb00: mov 8(%rsi), %rdx
+ lea 16(%rdi), %rdi
+ lea 16(%rsi), %rsi
+ jmp Llo0
+
+Lb10: mov 8(%rsi), %rdx
+ mov 16(%rdi), %r13
+ lea 32(%rsi), %rsi
+ inc %rcx
+ .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ jz Lex
+ jmp Llo2
+
+Lbx1: mov (%rdi), %r14
+ mov 8(%rdi), %r13
+ .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ mov 8(%rsi), %rdx
+ add %rax, %r14
+ adc $0, %rbp
+ xor %r10, %r10
+ mov %r14, (%rdi)
+ .byte 0xc4,66,251,0xf6,216  # mulx %r8, %rax, %r11
+ test $2, %r12b
+ jz Lb11
+
+Lb01: mov 16(%rdi), %r14
+ lea 24(%rdi), %rdi
+ lea 24(%rsi), %rsi
+ jmp Llo1
+
+Lb11: lea 8(%rdi), %rdi
+ lea 8(%rsi), %rsi
+ jmp Llo3
+
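+# Four-way unrolled addmul_2 inner loop: each pass multiplies four operand
+# limbs by v0 (%r8) and v1 (%r9) and accumulates into the result limbs
+# already at %rdi.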
+ .align 5, 0x90
+Ltop: .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ add %r10, %r14
+ adc $0, %rbx
+Llo2: add %rax, %r14
+ adc $0, %rbp
+ .byte 0xc4,66,251,0xf6,209  # mulx %r9, %rax, %r10
+ add %rax, %r13
+ adc $0, %r10
+ lea 32(%rdi), %rdi
+ add %r11, %r14
+ mov -16(%rsi), %rdx
+ mov %r14, -24(%rdi)
+ adc $0, %rbp
+ add %rbx, %r13
+ mov -8(%rdi), %r14
+ .byte 0xc4,66,251,0xf6,216  # mulx %r8, %rax, %r11
+ adc $0, %r10
+Llo1: add %rax, %r13
+ .byte 0xc4,194,251,0xf6,217  # mulx %r9, %rax, %rbx
+ adc $0, %r11
+ add %rbp, %r13
+ mov %r13, -16(%rdi)
+ adc $0, %r11
+ add %rax, %r14
+ adc $0, %rbx
+ add %r10, %r14
+ mov -8(%rsi), %rdx
+ adc $0, %rbx
+Llo0: .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ add %rax, %r14
+ adc $0, %rbp
+ mov (%rdi), %r13
+ .byte 0xc4,66,251,0xf6,209  # mulx %r9, %rax, %r10
+ add %rax, %r13
+ adc $0, %r10
+ add %r11, %r14
+ mov %r14, -8(%rdi)
+ adc $0, %rbp
+ mov (%rsi), %rdx
+ add %rbx, %r13
+ .byte 0xc4,66,251,0xf6,216  # mulx %r8, %rax, %r11
+ adc $0, %r10
+Llo3: add %rax, %r13
+ adc $0, %r11
+ .byte 0xc4,194,251,0xf6,217  # mulx %r9, %rax, %rbx
+ add %rbp, %r13
+ mov 8(%rdi), %r14
+ mov %r13, (%rdi)
+ mov 16(%rdi), %r13
+ adc $0, %r11
+ add %rax, %r14
+ adc $0, %rbx
+ mov 8(%rsi), %rdx
+ lea 32(%rsi), %rsi
+ inc %rcx
+ jnz Ltop
+
+Lend: .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ add %r10, %r14
+ adc $0, %rbx
+Lex: add %rax, %r14
+ adc $0, %rbp
+ .byte 0xc4,194,235,0xf6,193  # mulx %r9, %rdx, %rax
+ add %r11, %r14
+ mov %r14, 8(%rdi)
+ adc $0, %rbp
+ add %rbx, %rdx
+ adc $0, %rax
+ add %rdx, %rbp
+ mov %rbp, 16(%rdi)
+ adc $0, %rax
+ mov %rax, 24(%rdi)
+
+ jmp Louter
+
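+# Lcorner: only one or two multiplier limbs remain.  The two-limb case is
+# finished in-line; Lsmall_corner handles the case with a single cross
+# product left.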
+Lcorner:
+ pop %r12
+ mov (%rsi), %rdx
+ jg Lsmall_corner
+
+ mov %rdx, %r9
+ mov (%rdi), %r13
+ mov %rax, %r14
+ .byte 0xc4,66,251,0xf6,216  # mulx %r8, %rax, %r11
+ add %rax, %r13
+ adc $0, %r11
+ mov %r13, (%rdi)
+ mov 8(%rsi), %rdx
+ .byte 0xc4,194,251,0xf6,232  # mulx %r8, %rax, %rbp
+ add %rax, %r14
+ adc $0, %rbp
+ .byte 0xc4,194,235,0xf6,193  # mulx %r9, %rdx, %rax
+ add %r11, %r14
+ mov %r14, 8(%rdi)
+ adc $0, %rbp
+ add %rbp, %rdx
+ mov %rdx, 16(%rdi)
+ adc $0, %rax
+ mov %rax, 24(%rdi)
+ lea 32(%rdi), %rdi
+ lea 16(%rsi), %rsi
+ jmp Lcom
+
+Lsmall_corner:
+ .byte 0xc4,194,139,0xf6,232  # mulx %r8, %r14, %rbp
+ add %rax, %r14
+ adc $0, %rbp
+ mov %r14, (%rdi)
+ mov %rbp, 8(%rdi)
+ lea 16(%rdi), %rdi
+ lea 8(%rsi), %rsi
+
+Lcom:
+
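+# Final pass: walk the result two limbs at a time, doubling the
+# accumulated cross-product sum (the adc reg,reg pairs, with the shifted-
+# out bit kept in %bl) and adding the diagonal squares u[i]^2 computed by
+# mulx with %rdx = u[i].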
+Lsqr_diag_addlsh1:
+ lea 8(%rsi,%r12,8), %rsi
+ lea (%rdi,%r12,8), %rdi
+ lea (%rdi,%r12,8), %rdi
+ inc %r12
+
+ mov -8(%rsi), %rdx
+ xor %ebx, %ebx
+ .byte 0xc4,98,251,0xf6,210  # mulx %rdx, %rax, %r10
+ mov %rax, 8(%rdi)
+ mov 16(%rdi), %r8
+ mov 24(%rdi), %r9
+ jmp Ldm
+
+ .align 4, 0x90
+Ldtop:mov 32(%rdi), %r8
+ mov 40(%rdi), %r9
+ lea 16(%rdi), %rdi
+ lea (%rdx,%rbx), %r10
+Ldm: adc %r8, %r8
+ adc %r9, %r9
+ setc %bl
+ mov (%rsi), %rdx
+ lea 8(%rsi), %rsi
+ .byte 0xc4,226,251,0xf6,210  # mulx %rdx, %rax, %rdx
+ add %r10, %r8
+ adc %rax, %r9
+ mov %r8, 16(%rdi)
+ mov %r9, 24(%rdi)
+ inc %r12
+ jnz Ldtop
+
+Ldend:adc %rbx, %rdx
+ mov %rdx, 32(%rdi)
+
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+
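For reference, here is a minimal C sketch of what ___gmpn_sqr_basecase computes and of the decomposition the assembly above uses: the off-diagonal products are accumulated once, then doubled and combined with the diagonal squares. It is an illustrative sketch, not the GMP implementation; it assumes a 64-bit limb, relies on the GCC/Clang unsigned __int128 extension, and the name sqr_basecase_ref is hypothetical.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;          /* one 64-bit limb, base B = 2^64 */

/* rp[0 .. 2n-1] = square of the n-limb number up[0 .. n-1], n >= 1. */
static void sqr_basecase_ref(limb *rp, const limb *up, size_t n)
{
    size_t i, j;

    /* Phase 1: accumulate each off-diagonal product u[i]*u[j] (i < j)
       exactly once, at limb position i + j. */
    for (i = 0; i < 2 * n; i++)
        rp[i] = 0;
    for (i = 0; i < n; i++) {
        limb carry = 0;
        for (j = i + 1; j < n; j++) {
            unsigned __int128 t = (unsigned __int128)up[i] * up[j]
                                  + rp[i + j] + carry;
            rp[i + j] = (limb)t;
            carry = (limb)(t >> 64);
        }
        rp[i + n] = carry;      /* this limb is still zero at this point */
    }

    /* Phase 2: rp = 2*rp + sum u[i]^2 * B^(2i), two limbs per step,
       mirroring the Lsqr_diag_addlsh1 loop above. */
    limb cy = 0;
    for (i = 0; i < n; i++) {
        unsigned __int128 sq = (unsigned __int128)up[i] * up[i];
        unsigned __int128 lo = (unsigned __int128)rp[2 * i] * 2
                               + (limb)sq + cy;
        rp[2 * i] = (limb)lo;
        unsigned __int128 hi = (unsigned __int128)rp[2 * i + 1] * 2
                               + (limb)(sq >> 64) + (limb)(lo >> 64);
        rp[2 * i + 1] = (limb)hi;
        cy = (limb)(hi >> 64);  /* zero after the last step: u^2 < B^(2n) */
    }
}

Forming each cross product only once and doubling afterwards needs roughly n*(n+1)/2 multiplications instead of the n^2 a plain basecase product would use, which is the point of a dedicated sqr_basecase.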