author    | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700
committer | polwex <polwex@sortug.com> | 2025-10-05 21:56:51 +0700
commit    | fcedfddf00b3f994e4f4e40332ac7fc192c63244 (patch)
tree      | 51d38e62c7bdfcc5f9a5e9435fe820c93cfc9a3d /vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s
claude is gud
Diffstat (limited to 'vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s')
-rw-r--r-- | vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s | 520
1 file changed, 520 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s
new file mode 100644
index 0000000..fea3649
--- /dev/null
+++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s
@@ -0,0 +1,520 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	.text
+	.align 5, 0x90
+	.globl ___gmpn_sqr_basecase
+
+
+___gmpn_sqr_basecase:
+
+
+
+	cmp $2, %rdx
+	jae Lgt1
+
+	mov (%rsi), %rdx
+	.byte 0xc4,226,251,0xf6,210
+	mov %rax, (%rdi)
+	mov %rdx, 8(%rdi)
+
+	ret
+
+Lgt1:	jne Lgt2
+
+	mov (%rsi), %rdx
+	mov 8(%rsi), %rcx
+	.byte 0xc4,98,179,0xf6,209
+	.byte 0xc4,98,251,0xf6,194
+	mov %rcx, %rdx
+	.byte 0xc4,226,163,0xf6,210
+	add %r9, %r9
+	adc %r10, %r10
+	adc $0, %rdx
+	add %r9, %r8
+	adc %r11, %r10
+	adc $0, %rdx
+	mov %rax, (%rdi)
+	mov %r8, 8(%rdi)
+	mov %r10, 16(%rdi)
+	mov %rdx, 24(%rdi)
+
+	ret
+
+Lgt2:	cmp $4, %rdx
+	jae Lgt3
+
+
+
+
+
+	mov (%rsi), %r8
+	mov 8(%rsi), %rdx
+	mov %rdx, %r9
+	.byte 0xc4,194,163,0xf6,192
+	mov 16(%rsi), %rdx
+	.byte 0xc4,194,171,0xf6,200
+	mov %r11, %r8
+	add %rax, %r10
+	adc $0, %rcx
+	.byte 0xc4,194,235,0xf6,193
+	add %rcx, %rdx
+	mov %rdx, 24(%rdi)
+	adc $0, %rax
+	mov %rax, 32(%rdi)
+	xor %ecx, %ecx
+	mov (%rsi), %rdx
+	.byte 0xc4,98,251,0xf6,218
+	mov %rax, (%rdi)
+	add %r8, %r8
+	adc %r10, %r10
+	setc %cl
+	mov 8(%rsi), %rdx
+	.byte 0xc4,226,251,0xf6,210
+	add %r11, %r8
+	adc %rax, %r10
+	mov %r8, 8(%rdi)
+	mov %r10, 16(%rdi)
+	mov 24(%rdi), %r8
+	mov 32(%rdi), %r10
+	lea (%rdx,%rcx), %r11
+	adc %r8, %r8
+	adc %r10, %r10
+	setc %cl
+	mov 16(%rsi), %rdx
+	.byte 0xc4,226,251,0xf6,210
+	add %r11, %r8
+	adc %rax, %r10
+	mov %r8, 24(%rdi)
+	mov %r10, 32(%rdi)
+	adc %rcx, %rdx
+	mov %rdx, 40(%rdi)
+
+	ret
+
+Lgt3:
+
+
+
+
+
+
+
+
+
+
+
+
+
+Ldo_mul_2:
+	push %rbx
+	push %rbp
+	push %r12
+	push %r13
+	push %r14
+	mov $0, %r12d
+	sub %rdx, %r12
+	push %r12
+	mov (%rsi), %r8
+	mov 8(%rsi), %rdx
+	lea 2(%r12), %rcx
+	sar $2, %rcx
+	inc %r12
+	mov %rdx, %r9
+
+	test $1, %r12b
+	jnz Lmx1
+
+Lmx0:	.byte 0xc4,66,227,0xf6,216
+	mov 16(%rsi), %rdx
+	mov %rbx, 8(%rdi)
+	xor %rbx, %rbx
+	.byte 0xc4,194,171,0xf6,232
+	test $2, %r12b
+	jz Lm00
+
+Lm10:	lea -8(%rdi), %rdi
+	lea -8(%rsi), %rsi
+	jmp Lmlo2
+
+Lm00:	lea 8(%rsi), %rsi
+	lea 8(%rdi), %rdi
+	jmp Lmlo0
+
+Lmx1:	.byte 0xc4,194,171,0xf6,232
+	mov 16(%rsi), %rdx
+	mov %r10, 8(%rdi)
+	xor %r10, %r10
+	.byte 0xc4,66,227,0xf6,216
+	test $2, %r12b
+	jz Lmlo3
+
+Lm01:	lea 16(%rdi), %rdi
+	lea 16(%rsi), %rsi
+	jmp Lmlo1
+
+	.align 5, 0x90
+Lmtop:	.byte 0xc4,66,251,0xf6,209
+	add %rax, %rbx
+	mov (%rsi), %rdx
+	.byte 0xc4,66,251,0xf6,216
+	adc $0, %r10
+	add %rax, %rbx
+Lmlo1:	adc $0, %r11
+	add %rbp, %rbx
+	mov %rbx, (%rdi)
+	adc $0, %r11
+	.byte 0xc4,194,251,0xf6,217
+	add %rax, %r10
+	mov 8(%rsi), %rdx
+	adc $0, %rbx
+	.byte 0xc4,194,251,0xf6,232
+	add %rax, %r10
+	adc $0, %rbp
+Lmlo0:	add %r11, %r10
+	mov %r10, 8(%rdi)
+	adc $0, %rbp
+	.byte 0xc4,66,251,0xf6,209
+	add %rax, %rbx
+	mov 16(%rsi), %rdx
+	.byte 0xc4,66,251,0xf6,216
+	adc $0, %r10
+	add %rax, %rbx
+	adc $0, %r11
+Lmlo3:	add %rbp, %rbx
+	mov %rbx, 16(%rdi)
+	adc $0, %r11
+	.byte 0xc4,194,251,0xf6,217
+	add %rax, %r10
+	mov 24(%rsi), %rdx
+	adc $0, %rbx
+	.byte 0xc4,194,251,0xf6,232
+	add %rax, %r10
+	adc $0, %rbp
+Lmlo2:	add %r11, %r10
+	lea 32(%rsi), %rsi
+	mov %r10, 24(%rdi)
+	adc $0, %rbp
+	inc %rcx
+	lea 32(%rdi), %rdi
+	jnz Lmtop
+
+Lmend:	.byte 0xc4,194,235,0xf6,193
+	add %rdx, %rbx
+	adc $0, %rax
+	add %rbp, %rbx
+	mov %rbx, (%rdi)
+	adc $0, %rax
+	mov %rax, 8(%rdi)
+
+	lea 16(%rsi), %rsi
+	lea -16(%rdi), %rdi
+
+Ldo_addmul_2:
+Louter:
+	lea (%rsi,%r12,8), %rsi
+	lea 48(%rdi,%r12,8), %rdi
+
+	mov -8(%rsi), %r8
+
+	add $2, %r12
+	cmp $-2, %r12
+	jge Lcorner
+
+	mov (%rsi), %r9
+
+	lea 1(%r12), %rcx
+	sar $2, %rcx
+
+	mov %r9, %rdx
+	test $1, %r12b
+	jnz Lbx1
+
+Lbx0:	mov (%rdi), %r13
+	mov 8(%rdi), %r14
+	.byte 0xc4,66,251,0xf6,216
+	add %rax, %r13
+	adc $0, %r11
+	mov %r13, (%rdi)
+	xor %rbx, %rbx
+	test $2, %r12b
+	jnz Lb10
+
+Lb00:	mov 8(%rsi), %rdx
+	lea 16(%rdi), %rdi
+	lea 16(%rsi), %rsi
+	jmp Llo0
+
+Lb10:	mov 8(%rsi), %rdx
+	mov 16(%rdi), %r13
+	lea 32(%rsi), %rsi
+	inc %rcx
+	.byte 0xc4,194,251,0xf6,232
+	jz Lex
+	jmp Llo2
+
+Lbx1:	mov (%rdi), %r14
+	mov 8(%rdi), %r13
+	.byte 0xc4,194,251,0xf6,232
+	mov 8(%rsi), %rdx
+	add %rax, %r14
+	adc $0, %rbp
+	xor %r10, %r10
+	mov %r14, (%rdi)
+	.byte 0xc4,66,251,0xf6,216
+	test $2, %r12b
+	jz Lb11
+
+Lb01:	mov 16(%rdi), %r14
+	lea 24(%rdi), %rdi
+	lea 24(%rsi), %rsi
+	jmp Llo1
+
+Lb11:	lea 8(%rdi), %rdi
+	lea 8(%rsi), %rsi
+	jmp Llo3
+
+	.align 5, 0x90
+Ltop:	.byte 0xc4,194,251,0xf6,232
+	add %r10, %r14
+	adc $0, %rbx
+Llo2:	add %rax, %r14
+	adc $0, %rbp
+	.byte 0xc4,66,251,0xf6,209
+	add %rax, %r13
+	adc $0, %r10
+	lea 32(%rdi), %rdi
+	add %r11, %r14
+	mov -16(%rsi), %rdx
+	mov %r14, -24(%rdi)
+	adc $0, %rbp
+	add %rbx, %r13
+	mov -8(%rdi), %r14
+	.byte 0xc4,66,251,0xf6,216
+	adc $0, %r10
+Llo1:	add %rax, %r13
+	.byte 0xc4,194,251,0xf6,217
+	adc $0, %r11
+	add %rbp, %r13
+	mov %r13, -16(%rdi)
+	adc $0, %r11
+	add %rax, %r14
+	adc $0, %rbx
+	add %r10, %r14
+	mov -8(%rsi), %rdx
+	adc $0, %rbx
+Llo0:	.byte 0xc4,194,251,0xf6,232
+	add %rax, %r14
+	adc $0, %rbp
+	mov (%rdi), %r13
+	.byte 0xc4,66,251,0xf6,209
+	add %rax, %r13
+	adc $0, %r10
+	add %r11, %r14
+	mov %r14, -8(%rdi)
+	adc $0, %rbp
+	mov (%rsi), %rdx
+	add %rbx, %r13
+	.byte 0xc4,66,251,0xf6,216
+	adc $0, %r10
+Llo3:	add %rax, %r13
+	adc $0, %r11
+	.byte 0xc4,194,251,0xf6,217
+	add %rbp, %r13
+	mov 8(%rdi), %r14
+	mov %r13, (%rdi)
+	mov 16(%rdi), %r13
+	adc $0, %r11
+	add %rax, %r14
+	adc $0, %rbx
+	mov 8(%rsi), %rdx
+	lea 32(%rsi), %rsi
+	inc %rcx
+	jnz Ltop
+
+Lend:	.byte 0xc4,194,251,0xf6,232
+	add %r10, %r14
+	adc $0, %rbx
+Lex:	add %rax, %r14
+	adc $0, %rbp
+	.byte 0xc4,194,235,0xf6,193
+	add %r11, %r14
+	mov %r14, 8(%rdi)
+	adc $0, %rbp
+	add %rbx, %rdx
+	adc $0, %rax
+	add %rdx, %rbp
+	mov %rbp, 16(%rdi)
+	adc $0, %rax
+	mov %rax, 24(%rdi)
+
+	jmp Louter
+
+Lcorner:
+	pop %r12
+	mov (%rsi), %rdx
+	jg Lsmall_corner
+
+	mov %rdx, %r9
+	mov (%rdi), %r13
+	mov %rax, %r14
+	.byte 0xc4,66,251,0xf6,216
+	add %rax, %r13
+	adc $0, %r11
+	mov %r13, (%rdi)
+	mov 8(%rsi), %rdx
+	.byte 0xc4,194,251,0xf6,232
+	add %rax, %r14
+	adc $0, %rbp
+	.byte 0xc4,194,235,0xf6,193
+	add %r11, %r14
+	mov %r14, 8(%rdi)
+	adc $0, %rbp
+	add %rbp, %rdx
+	mov %rdx, 16(%rdi)
+	adc $0, %rax
+	mov %rax, 24(%rdi)
+	lea 32(%rdi), %rdi
+	lea 16(%rsi), %rsi
+	jmp Lcom
+
+Lsmall_corner:
+	.byte 0xc4,194,139,0xf6,232
+	add %rax, %r14
+	adc $0, %rbp
+	mov %r14, (%rdi)
+	mov %rbp, 8(%rdi)
+	lea 16(%rdi), %rdi
+	lea 8(%rsi), %rsi
+
+Lcom:
+
+Lsqr_diag_addlsh1:
+	lea 8(%rsi,%r12,8), %rsi
+	lea (%rdi,%r12,8), %rdi
+	lea (%rdi,%r12,8), %rdi
+	inc %r12
+
+	mov -8(%rsi), %rdx
+	xor %ebx, %ebx
+	.byte 0xc4,98,251,0xf6,210
+	mov %rax, 8(%rdi)
+	mov 16(%rdi), %r8
+	mov 24(%rdi), %r9
+	jmp Ldm
+
+	.align 4, 0x90
+Ldtop:	mov 32(%rdi), %r8
+	mov 40(%rdi), %r9
+	lea 16(%rdi), %rdi
+	lea (%rdx,%rbx), %r10
+Ldm:	adc %r8, %r8
+	adc %r9, %r9
+	setc %bl
+	mov (%rsi), %rdx
+	lea 8(%rsi), %rsi
+	.byte 0xc4,226,251,0xf6,210
+	add %r10, %r8
+	adc %rax, %r9
+	mov %r8, 16(%rdi)
+	mov %r9, 24(%rdi)
+	inc %r12
+	jnz Ldtop
+
+Ldend:	adc %rbx, %rdx
+	mov %rdx, 32(%rdi)
+
+	pop %r14
+	pop %r13
+	pop %r12
+	pop %rbp
+	pop %rbx
+
+	ret
+
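Note on the added file: it is the x86_64 Darwin build of GMP's internal basecase squaring routine, __gmpn_sqr_basecase (the extra leading underscore in ___gmpn_sqr_basecase is the Mach-O symbol prefix), and the .byte directives encode BMI2 mulx instructions for assemblers that do not accept the mnemonic. The routine is not called directly; for operands below the Toom-2 squaring threshold, GMP's public mpn_sqr entry point dispatches to it. A minimal sketch of that path, assuming a 64-bit-limb build of the vendored GMP (the example is illustrative and not part of this commit):

    /* sqr_example.c -- illustrative only, not part of the commit.
       Squares a small multi-limb operand through the public mpn_sqr
       entry point, which uses the basecase squaring code for operand
       sizes below GMP's Toom-2 threshold. */
    #include <stdio.h>
    #include <gmp.h>

    int main(void)
    {
        mp_limb_t up[3] = { 3, 2, 1 };   /* 3-limb operand, least significant limb first */
        mp_limb_t rp[6];                 /* the square of an n-limb number needs 2*n limbs */

        mpn_sqr(rp, up, 3);

        for (int i = 5; i >= 0; i--)
            printf("%016llx ", (unsigned long long) rp[i]);
        printf("\n");
        return 0;
    }

Link against the vendored library or a system -lgmp; with a small operand size such as n = 3, the call is expected to land in the basecase path added here.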