Diffstat (limited to 'vere/ext/gmp/gen/x86_64-macos/mpn')
68 files changed, 14971 insertions, 0 deletions
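The *.s files below are machine-generated (m4-expanded) GMP assembly for x86_64 Darwin. Each implements one low-level mpn primitive over arrays of 64-bit limbs (add_n, addmul_1, copyi/copyd, cnd_add_n, divrem_1, and so on), and the long runs of empty "+" lines at the head of each file are where the generator stripped the license-header comments. As a hedged reference sketch, not GMP's actual source: the carry-propagating addition that add_n.s computes (via an eight-way unrolled loop entered through the Ltab jump table) corresponds to the following portable C, where the type and function names are illustrative only:

    /* Reference semantics of add_n: rp[] = up[] + vp[] over n 64-bit
     * limbs, returning the carry out (0 or 1).  Illustrative names;
     * mp_limb_t is assumed to be a 64-bit limb, matching this build. */
    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    mp_limb_t ref_add_n(mp_limb_t *rp, const mp_limb_t *up,
                        const mp_limb_t *vp, size_t n)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            mp_limb_t u = up[i];
            mp_limb_t s = u + vp[i];    /* may wrap: detect with s < u */
            mp_limb_t r = s + cy;
            rp[i] = r;
            cy = (s < u) | (r < s);     /* carry out of this limb */
        }
        return cy;
    }

The assembly versions earn their keep by holding the running carry in the CPU carry flag across the whole unrolled loop (adc), which this C cannot express directly.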
diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/add_err1_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/add_err1_n.s new file mode 100644 index 0000000..4bb4f97 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/add_err1_n.s @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_add_err1_n + + +___gmpn_add_err1_n: + + mov 8(%rsp), %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (%rsi,%r9,8), %rsi + lea (%rdx,%r9,8), %rdx + lea (%rdi,%r9,8), %rdi + + mov %r9d, %r10d + and $3, %r10d + jz L0mod4 + cmp $2, %r10d + jc L1mod4 + jz L2mod4 +L3mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + xor %r11d, %r11d + lea -24(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + adc (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 16(%r8), %rbx + adc 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc 8(%r8), %r10 + mov 16(%rsi,%r9,8), %r14 + adc 16(%rdx,%r9,8), %r14 + mov %r14, 16(%rdi,%r9,8) + cmovc (%r8), %r11 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + + add $3, %r9 + jnz Lloop + jmp Lend + + .align 4, 0x90 +L0mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea (%r8,%r9,8), %r8 + neg %r9 + jmp Lloop + + .align 4, 0x90 +L1mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea -8(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + adc (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc (%r8), %rbx + setc %al + + add $1, %r9 + jnz Lloop + jmp Lend + + .align 4, 0x90 +L2mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + lea -16(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + adc (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 8(%r8), %rbx + adc 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc (%r8), %r10 + setc %al + add %r10, %rbx + adc $0, %rbp + + add $2, %r9 + jnz Lloop + jmp Lend + + .align 5, 0x90 +Lloop: + mov (%rsi,%r9,8), %r14 + shr $1, %al + mov -8(%r8), %r10 + mov $0, %r13d + adc (%rdx,%r9,8), %r14 + cmovnc %r13, %r10 + mov %r14, (%rdi,%r9,8) + mov 8(%rsi,%r9,8), %r15 + mov 16(%rsi,%r9,8), %r14 + adc 8(%rdx,%r9,8), %r15 + mov -16(%r8), %r11 + cmovnc %r13, %r11 + mov -24(%r8), %r12 + mov %r15, 8(%rdi,%r9,8) + adc 16(%rdx,%r9,8), %r14 + cmovnc %r13, %r12 + mov 24(%rsi,%r9,8), %r15 + adc 24(%rdx,%r9,8), %r15 + cmovc -32(%r8), %r13 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + add %r12, %rbx + adc $0, %rbp + lea -32(%r8), %r8 + mov %r14, 16(%rdi,%r9,8) + add %r13, %rbx + adc $0, %rbp + add $4, %r9 + mov %r15, -8(%rdi,%r9,8) + jnz Lloop + +Lend: + mov %rbx, (%rcx) + mov %rbp, 8(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/add_err2_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/add_err2_n.s new file mode 100644 index 0000000..ba4fb6d --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/add_err2_n.s @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_add_err2_n + + +___gmpn_add_err2_n: + + mov 16(%rsp), %rax + mov 8(%rsp), %r10 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + + 
sub %r8, %r9 + + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + + test $1, %r10 + jnz Lodd + + lea -8(%r8,%r10,8), %r8 + neg %r10 + jmp Ltop + + .align 4, 0x90 +Lodd: + lea -16(%r8,%r10,8), %r8 + neg %r10 + shr $1, %rax + mov (%rsi,%r10,8), %rbx + adc (%rdx,%r10,8), %rbx + cmovc 8(%r8), %rbp + cmovc 8(%r8,%r9), %r12 + mov %rbx, (%rdi,%r10,8) + sbb %rax, %rax + inc %r10 + jz Lend + + .align 4, 0x90 +Ltop: + mov (%rsi,%r10,8), %rbx + shr $1, %rax + adc (%rdx,%r10,8), %rbx + mov %rbx, (%rdi,%r10,8) + sbb %r14, %r14 + + mov 8(%rsi,%r10,8), %rbx + adc 8(%rdx,%r10,8), %rbx + mov %rbx, 8(%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %r14, %rbx + add %rbx, %rbp + adc $0, %r11 + + and (%r8,%r9), %r14 + add %r14, %r12 + adc $0, %r13 + + mov -8(%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov -8(%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + add $2, %r10 + lea -16(%r8), %r8 + jnz Ltop +Lend: + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + + and $1, %eax + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/add_err3_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/add_err3_n.s new file mode 100644 index 0000000..3e0e39d --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/add_err3_n.s @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_add_err3_n + + +___gmpn_add_err3_n: + + mov 24(%rsp), %rax + mov 16(%rsp), %r10 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + push %rcx + mov 64(%rsp), %rcx + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + + sub %r8, %r9 + sub %r8, %rcx + + lea -8(%r8,%r10,8), %r8 + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + neg %r10 + + .align 4, 0x90 +Ltop: + shr $1, %rax + mov (%rsi,%r10,8), %rax + adc (%rdx,%r10,8), %rax + mov %rax, (%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov (%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + mov (%r8,%rcx), %rbx + and %rax, %rbx + add %rbx, %r14 + adc $0, %r15 + + lea -8(%r8), %r8 + inc %r10 + jnz Ltop + +Lend: + and $1, %eax + pop %rcx + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + mov %r14, 32(%rcx) + mov %r15, 40(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/add_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/add_n.s new file mode 100644 index 0000000..84fc0f3 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/add_n.s @@ -0,0 +1,289 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_add_nc + + +___gmpn_add_nc: + + + + + mov %ecx, %eax + shr $3, %rcx + and $7, %eax + + lea Ltab(%rip), %r9 + neg %r8 + + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax + jmp *%rax + + + + .align 4, 0x90 + .globl ___gmpn_add_n + + +___gmpn_add_n: + + + + mov %ecx, %eax + shr $3, %rcx + and $7, %eax + + lea Ltab(%rip), %r9 + + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax + jmp *%rax + + +L0: mov (%rsi), %r8 + mov 8(%rsi), %r9 + adc 
(%rdx), %r8 + jmp Le0 + +L4: mov (%rsi), %r8 + mov 8(%rsi), %r9 + adc (%rdx), %r8 + lea -32(%rsi), %rsi + lea -32(%rdx), %rdx + lea -32(%rdi), %rdi + inc %rcx + jmp Le4 + +L5: mov (%rsi), %r11 + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + adc (%rdx), %r11 + lea -24(%rsi), %rsi + lea -24(%rdx), %rdx + lea -24(%rdi), %rdi + inc %rcx + jmp Le5 + +L6: mov (%rsi), %r10 + adc (%rdx), %r10 + mov 8(%rsi), %r11 + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + inc %rcx + jmp Le6 + +L7: mov (%rsi), %r9 + mov 8(%rsi), %r10 + adc (%rdx), %r9 + adc 8(%rdx), %r10 + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + inc %rcx + jmp Le7 + + .align 4, 0x90 +Ltop: +Le3: mov %r9, 40(%rdi) +Le2: mov %r10, 48(%rdi) +Le1: mov (%rsi), %r8 + mov 8(%rsi), %r9 + adc (%rdx), %r8 + mov %r11, 56(%rdi) + lea 64(%rdi), %rdi +Le0: mov 16(%rsi), %r10 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + mov %r8, (%rdi) +Le7: mov 24(%rsi), %r11 + mov %r9, 8(%rdi) +Le6: mov 32(%rsi), %r8 + mov 40(%rsi), %r9 + adc 24(%rdx), %r11 + mov %r10, 16(%rdi) +Le5: adc 32(%rdx), %r8 + mov %r11, 24(%rdi) +Le4: mov 48(%rsi), %r10 + mov 56(%rsi), %r11 + mov %r8, 32(%rdi) + lea 64(%rsi), %rsi + adc 40(%rdx), %r9 + adc 48(%rdx), %r10 + adc 56(%rdx), %r11 + lea 64(%rdx), %rdx + dec %rcx + jnz Ltop + +Lend: mov %r9, 40(%rdi) + mov %r10, 48(%rdi) + mov %r11, 56(%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .align 4, 0x90 +L3: mov (%rsi), %r9 + mov 8(%rsi), %r10 + mov 16(%rsi), %r11 + adc (%rdx), %r9 + adc 8(%rdx), %r10 + adc 16(%rdx), %r11 + jrcxz Lx3 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea -40(%rdi), %rdi + jmp Le3 +Lx3: mov %r9, (%rdi) + mov %r10, 8(%rdi) + mov %r11, 16(%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .align 4, 0x90 +L1: mov (%rsi), %r11 + adc (%rdx), %r11 + jrcxz Lx1 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea -56(%rdi), %rdi + jmp Le1 +Lx1: mov %r11, (%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .align 4, 0x90 +L2: mov (%rsi), %r10 + mov 8(%rsi), %r11 + adc (%rdx), %r10 + adc 8(%rdx), %r11 + jrcxz Lx2 + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea -48(%rdi), %rdi + jmp Le2 +Lx2: mov %r10, (%rdi) + mov %r11, 8(%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .text + .align 3, 0x90 +Ltab: .set L0_tmp, L0-Ltab + .long L0_tmp + + .set L1_tmp, L1-Ltab + .long L1_tmp + + .set L2_tmp, L2-Ltab + .long L2_tmp + + .set L3_tmp, L3-Ltab + .long L3_tmp + + .set L4_tmp, L4-Ltab + .long L4_tmp + + .set L5_tmp, L5-Ltab + .long L5_tmp + + .set L6_tmp, L6-Ltab + .long L6_tmp + + .set L7_tmp, L7-Ltab + .long L7_tmp + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh1_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh1_n.s new file mode 100644 index 0000000..90fca0b --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh1_n.s @@ -0,0 +1,212 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_addlsh1_nc + + +___gmpn_addlsh1_nc: + + + + push %rbp + mov %r8, %rax + neg %rax + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $63, %r8, %rbp + mov %ecx, %r9d + and $3, %r9d + je Lb00 + cmp $2, %r9d + jc Lb01 + je Lb10 + jmp Lb11 + + + .align 4, 0x90 + .globl ___gmpn_addlsh1_n + + +___gmpn_addlsh1_n: + + + push %rbp + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $63, %r8, %rbp + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: mov 8(%rdx), %r9 + shrd $63, %r9, %r8 + mov 16(%rdx), %r10 + shrd $63, %r10, %r9 + add %eax, 
%eax + adc (%rsi), %rbp + adc 8(%rsi), %r8 + adc 16(%rsi), %r9 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, %rbp + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + sbb %eax, %eax + sub $3, %rcx + ja Ltop + jmp Lend + +Lb01: add %eax, %eax + adc (%rsi), %rbp + mov %rbp, (%rdi) + mov %r8, %rbp + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + sbb %eax, %eax + sub $1, %rcx + ja Ltop + jmp Lend + +Lb10: mov 8(%rdx), %r9 + shrd $63, %r9, %r8 + add %eax, %eax + adc (%rsi), %rbp + adc 8(%rsi), %r8 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, %rbp + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + sbb %eax, %eax + sub $2, %rcx + ja Ltop + jmp Lend + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 + shrd $63, %r8, %rbp +Lb00: mov 8(%rdx), %r9 + shrd $63, %r9, %r8 + mov 16(%rdx), %r10 + shrd $63, %r10, %r9 + mov 24(%rdx), %r11 + shrd $63, %r11, %r10 + lea 32(%rdx), %rdx + add %eax, %eax + adc (%rsi), %rbp + adc 8(%rsi), %r8 + adc 16(%rsi), %r9 + adc 24(%rsi), %r10 + lea 32(%rsi), %rsi + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, 24(%rdi) + mov %r11, %rbp + lea 32(%rdi), %rdi + sbb %eax, %eax + sub $4, %rcx + jnz Ltop + +Lend: shr $63, %rbp + add %eax, %eax + adc $0, %rbp + mov %rbp, %rax + pop %rbp + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh2_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh2_n.s new file mode 100644 index 0000000..5d61f82 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh2_n.s @@ -0,0 +1,214 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_addlsh2_nc + + +___gmpn_addlsh2_nc: + + + + push %rbp + mov %r8, %rax + neg %rax + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $62, %r8, %rbp + mov %ecx, %r9d + and $3, %r9d + je Lb00 + cmp $2, %r9d + jc Lb01 + je Lb10 + jmp Lb11 + + + .align 4, 0x90 + .globl ___gmpn_addlsh2_n + + +___gmpn_addlsh2_n: + + + push %rbp + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $62, %r8, %rbp + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: mov 8(%rdx), %r9 + shrd $62, %r9, %r8 + mov 16(%rdx), %r10 + shrd $62, %r10, %r9 + add %eax, %eax + adc (%rsi), %rbp + adc 8(%rsi), %r8 + adc 16(%rsi), %r9 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, %rbp + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + sbb %eax, %eax + sub $3, %rcx + ja Ltop + jmp Lend + +Lb01: add %eax, %eax + adc (%rsi), %rbp + mov %rbp, (%rdi) + mov %r8, %rbp + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + sbb %eax, %eax + sub $1, %rcx + ja Ltop + jmp Lend + +Lb10: mov 8(%rdx), %r9 + shrd $62, %r9, %r8 + add %eax, %eax + adc (%rsi), %rbp + adc 8(%rsi), %r8 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, %rbp + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + sbb %eax, %eax + sub $2, %rcx + ja Ltop + jmp Lend + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 + shrd $62, %r8, %rbp +Lb00: mov 8(%rdx), %r9 + shrd $62, %r9, %r8 + mov 16(%rdx), %r10 + shrd $62, %r10, %r9 + mov 24(%rdx), %r11 + shrd $62, %r11, %r10 + lea 32(%rdx), %rdx + add %eax, %eax + adc (%rsi), %rbp + adc 8(%rsi), %r8 + adc 16(%rsi), %r9 + adc 24(%rsi), %r10 + lea 32(%rsi), %rsi + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, 24(%rdi) + mov %r11, %rbp + lea 32(%rdi), %rdi + sbb %eax, %eax + sub $4, %rcx + jnz Ltop + +Lend: shr $62, %rbp + add 
%eax, %eax + adc $0, %rbp + mov %rbp, %rax + pop %rbp + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s new file mode 100644 index 0000000..f71088e --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/addlsh_n.s @@ -0,0 +1,269 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_addlsh_n + + +___gmpn_addlsh_n: + + + + + mov (%rdx), %r10 + + mov %ecx, %eax + shr $3, %rcx + xor %r9d, %r9d + sub %r8, %r9 + and $7, %eax + + lea Ltab(%rip), %r11 + + movslq (%r11,%rax,4), %rax + add %r11, %rax + jmp *%rax + + +L0: lea 32(%rsi), %rsi + lea 32(%rdx), %rdx + lea 32(%rdi), %rdi + xor %r11d, %r11d + jmp Le0 + +L7: mov %r10, %r11 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + xor %r10d, %r10d + jmp Le7 + +L6: lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + xor %r11d, %r11d + jmp Le6 + +L5: mov %r10, %r11 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + xor %r10d, %r10d + jmp Le5 + +Lend: adc 24(%rsi), %rax + mov %rax, -40(%rdi) + .byte 0xc4,194,179,0xf7,195 + adc %rcx, %rax + + ret + + .align 5, 0x90 +Ltop: jrcxz Lend + mov -32(%rdx), %r10 + adc 24(%rsi), %rax + lea 64(%rsi), %rsi + .byte 0xc4,66,179,0xf7,219 + mov %rax, -40(%rdi) +Le0: dec %rcx + .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov -24(%rdx), %r11 + adc -32(%rsi), %rax + .byte 0xc4,66,179,0xf7,210 + mov %rax, -32(%rdi) +Le7: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + mov -16(%rdx), %r10 + adc -24(%rsi), %rax + .byte 0xc4,66,179,0xf7,219 + mov %rax, -24(%rdi) +Le6: .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov -8(%rdx), %r11 + adc -16(%rsi), %rax + .byte 0xc4,66,179,0xf7,210 + mov %rax, -16(%rdi) +Le5: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + mov (%rdx), %r10 + adc -8(%rsi), %rax + .byte 0xc4,66,179,0xf7,219 + mov %rax, -8(%rdi) +Le4: .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov 8(%rdx), %r11 + adc (%rsi), %rax + .byte 0xc4,66,179,0xf7,210 + mov %rax, (%rdi) +Le3: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + mov 16(%rdx), %r10 + adc 8(%rsi), %rax + .byte 0xc4,66,179,0xf7,219 + mov %rax, 8(%rdi) +Le2: .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov 24(%rdx), %r11 + adc 16(%rsi), %rax + lea 64(%rdx), %rdx + .byte 0xc4,66,179,0xf7,210 + mov %rax, 16(%rdi) + lea 64(%rdi), %rdi +Le1: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + jmp Ltop + +L4: xor %r11d, %r11d + jmp Le4 + +L3: mov %r10, %r11 + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + xor %r10d, %r10d + jmp Le3 + +L2: lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + xor %r11d, %r11d + jmp Le2 + +L1: mov %r10, %r11 + lea -24(%rsi), %rsi + lea 40(%rdx), %rdx + lea 40(%rdi), %rdi + xor %r10d, %r10d + jmp Le1 + + .text + .align 3, 0x90 +Ltab: .set L0_tmp, L0-Ltab + .long L0_tmp + + .set L1_tmp, L1-Ltab + .long L1_tmp + + .set L2_tmp, L2-Ltab + .long L2_tmp + + .set L3_tmp, L3-Ltab + .long L3_tmp + + .set L4_tmp, L4-Ltab + .long L4_tmp + + .set L5_tmp, L5-Ltab + .long L5_tmp + + .set L6_tmp, L6-Ltab + .long L6_tmp + + .set L7_tmp, L7-Ltab + .long L7_tmp + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s new file mode 100644 index 0000000..a53c8bf --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/addmul_1.s 
@@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_addmul_1 + + +___gmpn_addmul_1: + + + push %rbx + push %rbp + push %r12 + push %r13 + + mov %rdx, %rbp + mov %rcx, %rdx + + test $1, %bpl + jnz Lbx1 + +Lbx0: shr $2, %rbp + jc Lb10 + +Lb00: .byte 0xc4,98,147,0xf6,38 + .byte 0xc4,226,227,0xf6,70,8 + add %r12, %rbx + adc $0, %rax + mov (%rdi), %r12 + mov 8(%rdi), %rcx + .byte 0xc4,98,179,0xf6,70,16 + lea -16(%rdi), %rdi + lea 16(%rsi), %rsi + add %r13, %r12 + jmp Llo0 + +Lbx1: shr $2, %rbp + jc Lb11 + +Lb01: .byte 0xc4,98,163,0xf6,22 + jnz Lgt1 +Ln1: add %r11, (%rdi) + mov $0, %eax + adc %r10, %rax + jmp Lret + +Lgt1: .byte 0xc4,98,147,0xf6,102,8 + .byte 0xc4,226,227,0xf6,70,16 + lea 24(%rsi), %rsi + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (%rdi), %r10 + mov 8(%rdi), %r12 + mov 16(%rdi), %rcx + lea -8(%rdi), %rdi + add %r11, %r10 + jmp Llo1 + +Lb11: .byte 0xc4,226,227,0xf6,6 + mov (%rdi), %rcx + .byte 0xc4,98,179,0xf6,70,8 + lea 8(%rsi), %rsi + lea -24(%rdi), %rdi + inc %rbp + add %rbx, %rcx + jmp Llo3 + +Lb10: .byte 0xc4,98,179,0xf6,6 + .byte 0xc4,98,163,0xf6,86,8 + lea -32(%rdi), %rdi + mov $0, %eax + clc + jz Lend + + .align 4, 0x90 +Ltop: adc %rax, %r9 + lea 32(%rdi), %rdi + adc %r8, %r11 + .byte 0xc4,98,147,0xf6,102,16 + mov (%rdi), %r8 + .byte 0xc4,226,227,0xf6,70,24 + lea 32(%rsi), %rsi + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 8(%rdi), %r10 + mov 16(%rdi), %r12 + add %r9, %r8 + mov 24(%rdi), %rcx + mov %r8, (%rdi) + adc %r11, %r10 +Llo1: .byte 0xc4,98,179,0xf6,6 + mov %r10, 8(%rdi) + adc %r13, %r12 +Llo0: mov %r12, 16(%rdi) + adc %rbx, %rcx +Llo3: .byte 0xc4,98,163,0xf6,86,8 + mov %rcx, 24(%rdi) + dec %rbp + jnz Ltop + +Lend: adc %rax, %r9 + adc %r8, %r11 + mov 32(%rdi), %r8 + mov %r10, %rax + adc $0, %rax + mov 40(%rdi), %r10 + add %r9, %r8 + mov %r8, 32(%rdi) + adc %r11, %r10 + mov %r10, 40(%rdi) + adc $0, %rax + +Lret: pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/addmul_2.s b/vere/ext/gmp/gen/x86_64-macos/mpn/addmul_2.s new file mode 100644 index 0000000..d2b04a9 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/addmul_2.s @@ -0,0 +1,255 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_addmul_2 + + +___gmpn_addmul_2: + + + push %rbx + push %rbp + push %r12 + push %r13 + + mov (%rcx), %r8 + mov 8(%rcx), %r9 + + mov %rdx, %r11 + shr $2, %r11 + + test $1, %dl + jnz Lbx1 + +Lbx0: mov (%rdi), %r12 + mov 8(%rdi), %r13 + test $2, %dl + jnz Lb10 + +Lb00: mov (%rsi), %rdx + lea 16(%rsi), %rsi + .byte 0xc4,194,251,0xf6,200 + add %rax, %r12 + .byte 0xc4,194,251,0xf6,233 + adc $0, %rcx + mov %r12, (%rdi) + add %rax, %r13 + adc $0, %rbp + mov -8(%rsi), %rdx + lea 16(%rdi), %rdi + jmp Llo0 + +Lb10: mov (%rsi), %rdx + inc %r11 + .byte 0xc4,194,251,0xf6,200 + add %rax, %r12 + adc $0, %rcx + .byte 0xc4,194,251,0xf6,233 + mov %r12, (%rdi) + mov 16(%rdi), %r12 + add %rax, %r13 + adc $0, %rbp + xor %rbx, %rbx + jmp Llo2 + +Lbx1: mov (%rdi), %r13 + mov 8(%rdi), %r12 + test $2, %dl + jnz Lb11 + +Lb01: mov (%rsi), %rdx + .byte 0xc4,66,251,0xf6,208 + add %rax, %r13 + adc $0, %r10 + .byte 0xc4,194,251,0xf6,217 + add %rax, %r12 + adc $0, %rbx + mov 8(%rsi), %rdx + mov 
%r13, (%rdi) + mov 16(%rdi), %r13 + .byte 0xc4,194,251,0xf6,200 + lea 24(%rdi), %rdi + lea 24(%rsi), %rsi + jmp Llo1 + +Lb11: mov (%rsi), %rdx + inc %r11 + .byte 0xc4,66,251,0xf6,208 + add %rax, %r13 + adc $0, %r10 + .byte 0xc4,194,251,0xf6,217 + add %rax, %r12 + adc $0, %rbx + mov %r13, (%rdi) + mov 8(%rsi), %rdx + .byte 0xc4,194,251,0xf6,200 + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + jmp Llo3 + + .align 4, 0x90 +Ltop: .byte 0xc4,66,251,0xf6,208 + add %rbx, %r13 + adc $0, %rbp + add %rax, %r13 + adc $0, %r10 + .byte 0xc4,194,251,0xf6,217 + add %rax, %r12 + adc $0, %rbx + lea 32(%rdi), %rdi + add %rcx, %r13 + mov -16(%rsi), %rdx + mov %r13, -24(%rdi) + adc $0, %r10 + add %rbp, %r12 + mov -8(%rdi), %r13 + .byte 0xc4,194,251,0xf6,200 + adc $0, %rbx +Llo1: add %rax, %r12 + .byte 0xc4,194,251,0xf6,233 + adc $0, %rcx + add %r10, %r12 + mov %r12, -16(%rdi) + adc $0, %rcx + add %rax, %r13 + adc $0, %rbp + add %rbx, %r13 + mov -8(%rsi), %rdx + adc $0, %rbp +Llo0: .byte 0xc4,66,251,0xf6,208 + add %rax, %r13 + adc $0, %r10 + mov (%rdi), %r12 + .byte 0xc4,194,251,0xf6,217 + add %rax, %r12 + adc $0, %rbx + add %rcx, %r13 + mov %r13, -8(%rdi) + adc $0, %r10 + mov (%rsi), %rdx + add %rbp, %r12 + .byte 0xc4,194,251,0xf6,200 + adc $0, %rbx +Llo3: add %rax, %r12 + adc $0, %rcx + .byte 0xc4,194,251,0xf6,233 + add %r10, %r12 + mov 8(%rdi), %r13 + mov %r12, (%rdi) + mov 16(%rdi), %r12 + adc $0, %rcx + add %rax, %r13 + adc $0, %rbp +Llo2: mov 8(%rsi), %rdx + lea 32(%rsi), %rsi + dec %r11 + jnz Ltop + +Lend: .byte 0xc4,66,251,0xf6,208 + add %rbx, %r13 + adc $0, %rbp + add %rax, %r13 + adc $0, %r10 + .byte 0xc4,194,235,0xf6,193 + add %rcx, %r13 + mov %r13, 8(%rdi) + adc $0, %r10 + add %rbp, %rdx + adc $0, %rax + add %r10, %rdx + mov %rdx, 16(%rdi) + adc $0, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/and_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/and_n.s new file mode 100644 index 0000000..bb75a2c --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/and_n.s @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_and_n + + +___gmpn_and_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: and (%rsi), %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: and (%rsi), %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 +Lb00: mov 8(%rdx), %r9 + and (%rsi), %r8 + and 8(%rsi), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 +Le10: mov 24(%rdx), %r9 + lea 32(%rdx), %rdx + and 16(%rsi), %r8 + and 24(%rsi), %r9 + lea 32(%rsi), %rsi + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + + + + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/andn_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/andn_n.s new file mode 100644 index 0000000..704eaa3 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/andn_n.s @@ -0,0 +1,163 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + .text + .align 5, 0x90 + .globl ___gmpn_andn_n + + +___gmpn_andn_n: + + + mov (%rdx), %r8 + not %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: and (%rsi), %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: and (%rsi), %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 + not %r8 +Lb00: mov 8(%rdx), %r9 + not %r9 + and (%rsi), %r8 + and 8(%rsi), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 + not %r8 +Le10: mov 24(%rdx), %r9 + not %r9 + lea 32(%rdx), %rdx + and 16(%rsi), %r8 + and 24(%rsi), %r9 + lea 32(%rsi), %rsi + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/bdiv_dbm1c.s b/vere/ext/gmp/gen/x86_64-macos/mpn/bdiv_dbm1c.s new file mode 100644 index 0000000..b4e7295 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/bdiv_dbm1c.s @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_bdiv_dbm1c + + +___gmpn_bdiv_dbm1c: + + + + mov (%rsi), %rax + mov %rdx, %r9 + mov %edx, %r11d + mul %rcx + lea (%rsi,%r9,8), %rsi + lea (%rdi,%r9,8), %rdi + neg %r9 + and $3, %r11d + jz Llo0 + lea -4(%r9,%r11), %r9 + cmp $2, %r11d + jc Llo1 + jz Llo2 + jmp Llo3 + + .align 4, 0x90 +Ltop: mov (%rsi,%r9,8), %rax + mul %rcx +Llo0: sub %rax, %r8 + mov %r8, (%rdi,%r9,8) + sbb %rdx, %r8 + mov 8(%rsi,%r9,8), %rax + mul %rcx +Llo3: sub %rax, %r8 + mov %r8, 8(%rdi,%r9,8) + sbb %rdx, %r8 + mov 16(%rsi,%r9,8), %rax + mul %rcx +Llo2: sub %rax, %r8 + mov %r8, 16(%rdi,%r9,8) + sbb %rdx, %r8 + mov 24(%rsi,%r9,8), %rax + mul %rcx +Llo1: sub %rax, %r8 + mov %r8, 24(%rdi,%r9,8) + sbb %rdx, %r8 + add $4, %r9 + jnz Ltop + + mov %r8, %rax + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/bdiv_q_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/bdiv_q_1.s new file mode 100644 index 0000000..c21f024 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/bdiv_q_1.s @@ -0,0 +1,215 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_bdiv_q_1 + + +___gmpn_bdiv_q_1: + + + push %rbx + + mov %rcx, %rax + xor %ecx, %ecx + mov %rdx, %r10 + + bt $0, %eax + jnc Levn + +Lodd: mov %rax, %rbx + shr %eax + and $127, %eax + + lea ___gmp_binvert_limb_table(%rip), %rdx + + + + movzbl (%rdx,%rax), %eax + + mov %rbx, %r11 + + lea (%rax,%rax), %edx + imul %eax, %eax + imul %ebx, %eax + sub %eax, %edx + + lea (%rdx,%rdx), %eax + imul %edx, %edx + imul %ebx, %edx + sub %edx, %eax + + lea (%rax,%rax), %r8 + imul %rax, %rax + imul %rbx, %rax + sub %rax, %r8 + + jmp Lpi1 + +Levn: bsf %rax, %rcx + shr %cl, %rax + jmp Lodd + + + .globl ___gmpn_pi1_bdiv_q_1 + + +___gmpn_pi1_bdiv_q_1: + + + + + push %rbx + + mov %rcx, %r11 + mov %rdx, %r10 + mov %r9, %rcx + +Lpi1: mov (%rsi), %rax + + dec %r10 + jz Lone + + lea 8(%rsi,%r10,8), %rsi + lea (%rdi,%r10,8), %rdi + neg %r10 + + test %ecx, %ecx + jnz Lunorm + xor %ebx, %ebx + jmp Lnent + + .align 3, 0x90 +Lntop:mul %r11 + mov -8(%rsi,%r10,8), %rax + sub %rbx, %rax + setc %bl + sub %rdx, %rax + adc $0, %ebx 
+Lnent:imul %r8, %rax + mov %rax, (%rdi,%r10,8) + inc %r10 + jnz Lntop + + mov -8(%rsi), %r9 + jmp Lcom + +Lunorm: + mov (%rsi,%r10,8), %r9 + shr %cl, %rax + neg %ecx + shl %cl, %r9 + neg %ecx + or %r9, %rax + xor %ebx, %ebx + jmp Luent + + .align 3, 0x90 +Lutop:mul %r11 + mov (%rsi,%r10,8), %rax + shl %cl, %rax + neg %ecx + or %r9, %rax + sub %rbx, %rax + setc %bl + sub %rdx, %rax + adc $0, %ebx +Luent:imul %r8, %rax + mov (%rsi,%r10,8), %r9 + shr %cl, %r9 + neg %ecx + mov %rax, (%rdi,%r10,8) + inc %r10 + jnz Lutop + +Lcom: mul %r11 + sub %rbx, %r9 + sub %rdx, %r9 + imul %r8, %r9 + mov %r9, (%rdi) + pop %rbx + + ret + +Lone: shr %cl, %rax + imul %r8, %rax + mov %rax, (%rdi) + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/cnd_add_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/cnd_add_n.s new file mode 100644 index 0000000..274a59f --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/cnd_add_n.s @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_cnd_add_n + + +___gmpn_cnd_add_n: + + + + push %rbx + + neg %rdi + sbb %rbx, %rbx + + test $1, %r8b + jz Lx0 +Lx1: test $2, %r8b + jz Lb1 + +Lb3: mov (%rcx), %rdi + mov 8(%rcx), %r9 + mov 16(%rcx), %r10 + and %rbx, %rdi + and %rbx, %r9 + and %rbx, %r10 + add (%rdx), %rdi + mov %rdi, (%rsi) + adc 8(%rdx), %r9 + mov %r9, 8(%rsi) + adc 16(%rdx), %r10 + mov %r10, 16(%rsi) + sbb %eax, %eax + lea 24(%rdx), %rdx + lea 24(%rcx), %rcx + lea 24(%rsi), %rsi + sub $3, %r8 + jnz Ltop + jmp Lend + +Lx0: xor %eax, %eax + test $2, %r8b + jz Ltop + +Lb2: mov (%rcx), %rdi + mov 8(%rcx), %r9 + and %rbx, %rdi + and %rbx, %r9 + add (%rdx), %rdi + mov %rdi, (%rsi) + adc 8(%rdx), %r9 + mov %r9, 8(%rsi) + sbb %eax, %eax + lea 16(%rdx), %rdx + lea 16(%rcx), %rcx + lea 16(%rsi), %rsi + sub $2, %r8 + jnz Ltop + jmp Lend + +Lb1: mov (%rcx), %rdi + and %rbx, %rdi + add (%rdx), %rdi + mov %rdi, (%rsi) + sbb %eax, %eax + lea 8(%rdx), %rdx + lea 8(%rcx), %rcx + lea 8(%rsi), %rsi + dec %r8 + jz Lend + + .align 4, 0x90 +Ltop: mov (%rcx), %rdi + mov 8(%rcx), %r9 + mov 16(%rcx), %r10 + mov 24(%rcx), %r11 + lea 32(%rcx), %rcx + and %rbx, %rdi + and %rbx, %r9 + and %rbx, %r10 + and %rbx, %r11 + add %eax, %eax + adc (%rdx), %rdi + mov %rdi, (%rsi) + adc 8(%rdx), %r9 + mov %r9, 8(%rsi) + adc 16(%rdx), %r10 + mov %r10, 16(%rsi) + adc 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r11, 24(%rsi) + lea 32(%rsi), %rsi + sbb %eax, %eax + sub $4, %r8 + jnz Ltop + +Lend: neg %eax + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/cnd_sub_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/cnd_sub_n.s new file mode 100644 index 0000000..79ac5c3 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/cnd_sub_n.s @@ -0,0 +1,207 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_cnd_sub_n + + +___gmpn_cnd_sub_n: + + + + push %rbx + push %rbp + push %r12 + push %r13 + + neg %rdi + sbb %rbx, %rbx + + test $1, %r8b + jz Lx0 +Lx1: test $2, %r8b + jz Lb1 + +Lb3: mov (%rcx), %rdi + mov 8(%rcx), %r9 + mov 16(%rcx), %r10 + and %rbx, %rdi + mov (%rdx), %r12 + and %rbx, %r9 + mov 8(%rdx), %r13 + and %rbx, %r10 + mov 16(%rdx), %rbp + sub %rdi, %r12 + mov %r12, (%rsi) + sbb %r9, %r13 + mov %r13, 8(%rsi) + sbb %r10, %rbp + mov %rbp, 16(%rsi) + sbb %eax, 
%eax + lea 24(%rdx), %rdx + lea 24(%rcx), %rcx + lea 24(%rsi), %rsi + sub $3, %r8 + jnz Ltop + jmp Lend + +Lx0: xor %eax, %eax + test $2, %r8b + jz Ltop + +Lb2: mov (%rcx), %rdi + mov 8(%rcx), %r9 + mov (%rdx), %r12 + and %rbx, %rdi + mov 8(%rdx), %r13 + and %rbx, %r9 + sub %rdi, %r12 + mov %r12, (%rsi) + sbb %r9, %r13 + mov %r13, 8(%rsi) + sbb %eax, %eax + lea 16(%rdx), %rdx + lea 16(%rcx), %rcx + lea 16(%rsi), %rsi + sub $2, %r8 + jnz Ltop + jmp Lend + +Lb1: mov (%rcx), %rdi + mov (%rdx), %r12 + and %rbx, %rdi + sub %rdi, %r12 + mov %r12, (%rsi) + sbb %eax, %eax + lea 8(%rdx), %rdx + lea 8(%rcx), %rcx + lea 8(%rsi), %rsi + dec %r8 + jz Lend + + .align 4, 0x90 +Ltop: mov (%rcx), %rdi + mov 8(%rcx), %r9 + mov 16(%rcx), %r10 + mov 24(%rcx), %r11 + lea 32(%rcx), %rcx + and %rbx, %rdi + mov (%rdx), %r12 + and %rbx, %r9 + mov 8(%rdx), %r13 + and %rbx, %r10 + mov 16(%rdx), %rbp + and %rbx, %r11 + add %eax, %eax + mov 24(%rdx), %rax + lea 32(%rdx), %rdx + sbb %rdi, %r12 + mov %r12, (%rsi) + sbb %r9, %r13 + mov %r13, 8(%rsi) + sbb %r10, %rbp + mov %rbp, 16(%rsi) + sbb %r11, %rax + mov %rax, 24(%rsi) + lea 32(%rsi), %rsi + sbb %eax, %eax + sub $4, %r8 + jnz Ltop + +Lend: neg %eax + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/com.s b/vere/ext/gmp/gen/x86_64-macos/mpn/com.s new file mode 100644 index 0000000..bfac7e2 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/com.s @@ -0,0 +1,335 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_com + + +___gmpn_com: + + + + cmp $7, %rdx + jbe Lbc + + pcmpeqb %xmm5, %xmm5 + + test $8, %dil + jz Lrp_aligned + + mov (%rsi), %r8 + lea 8(%rsi), %rsi + not %r8 + mov %r8, (%rdi) + lea 8(%rdi), %rdi + dec %rdx + +Lrp_aligned: + test $8, %sil + jnz Luent + + jmp Lam + + .align 4, 0x90 +Latop:movaps 0(%rsi), %xmm0 + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + lea 64(%rsi), %rsi + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm3 + movaps %xmm0, (%rdi) + movaps %xmm1, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm3, 48(%rdi) + lea 64(%rdi), %rdi +Lam: sub $8, %rdx + jnc Latop + + test $4, %dl + jz 1f + movaps (%rsi), %xmm0 + movaps 16(%rsi), %xmm1 + lea 32(%rsi), %rsi + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + movaps %xmm0, (%rdi) + movaps %xmm1, 16(%rdi) + lea 32(%rdi), %rdi + +1: test $2, %dl + jz 1f + movaps (%rsi), %xmm0 + lea 16(%rsi), %rsi + pxor %xmm5, %xmm0 + movaps %xmm0, (%rdi) + lea 16(%rdi), %rdi + +1: test $1, %dl + jz 1f + mov (%rsi), %r8 + not %r8 + mov %r8, (%rdi) + +1: + ret + +Luent: + + + + + lea -40(%rsi), %rax + sub %rdi, %rax + cmp $80, %rax + jbe Lbc + + sub $16, %rdx + jc Luend + + movaps 120(%rsi), %xmm3 + + sub $16, %rdx + jmp Lum + + .align 4, 0x90 +Lutop:movaps 120(%rsi), %xmm3 + pxor %xmm5, %xmm0 + movaps %xmm0, -128(%rdi) + sub $16, %rdx +Lum: movaps 104(%rsi), %xmm2 + .byte 0x66,0x0f,0x3a,0x0f,218,8 + movaps 88(%rsi), %xmm1 + pxor %xmm5, %xmm3 + movaps %xmm3, 112(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,209,8 + movaps 72(%rsi), %xmm0 + pxor %xmm5, %xmm2 + movaps %xmm2, 96(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps 56(%rsi), %xmm3 + pxor %xmm5, %xmm1 + movaps %xmm1, 80(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,195,8 + movaps 40(%rsi), %xmm2 + pxor %xmm5, %xmm0 + movaps %xmm0, 64(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,218,8 + movaps 24(%rsi), 
%xmm1 + pxor %xmm5, %xmm3 + movaps %xmm3, 48(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,209,8 + movaps 8(%rsi), %xmm0 + pxor %xmm5, %xmm2 + movaps %xmm2, 32(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps -8(%rsi), %xmm3 + pxor %xmm5, %xmm1 + movaps %xmm1, 16(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,195,8 + lea 128(%rsi), %rsi + lea 128(%rdi), %rdi + jnc Lutop + + pxor %xmm5, %xmm0 + movaps %xmm0, -128(%rdi) + +Luend:test $8, %dl + jz 1f + movaps 56(%rsi), %xmm3 + movaps 40(%rsi), %xmm2 + .byte 0x66,0x0f,0x3a,0x0f,218,8 + movaps 24(%rsi), %xmm1 + pxor %xmm5, %xmm3 + movaps %xmm3, 48(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,209,8 + movaps 8(%rsi), %xmm0 + pxor %xmm5, %xmm2 + movaps %xmm2, 32(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps -8(%rsi), %xmm3 + pxor %xmm5, %xmm1 + movaps %xmm1, 16(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,195,8 + lea 64(%rsi), %rsi + pxor %xmm5, %xmm0 + movaps %xmm0, (%rdi) + lea 64(%rdi), %rdi + +1: test $4, %dl + jz 1f + movaps 24(%rsi), %xmm1 + movaps 8(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps -8(%rsi), %xmm3 + pxor %xmm5, %xmm1 + movaps %xmm1, 16(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,195,8 + lea 32(%rsi), %rsi + pxor %xmm5, %xmm0 + movaps %xmm0, (%rdi) + lea 32(%rdi), %rdi + +1: test $2, %dl + jz 1f + movaps 8(%rsi), %xmm0 + movaps -8(%rsi), %xmm3 + .byte 0x66,0x0f,0x3a,0x0f,195,8 + lea 16(%rsi), %rsi + pxor %xmm5, %xmm0 + movaps %xmm0, (%rdi) + lea 16(%rdi), %rdi + +1: test $1, %dl + jz 1f + mov (%rsi), %r8 + not %r8 + mov %r8, (%rdi) + +1: + ret + + + + +Lbc: lea -8(%rdi), %rdi + sub $4, %edx + jc Lend + + .align 4, 0x90 +Ltop: mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi + mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + lea 32(%rsi), %rsi + not %r8 + not %r9 + not %r10 + not %r11 + mov %r8, -24(%rdi) + mov %r9, -16(%rdi) + sub $4, %edx + mov %r10, -8(%rdi) + mov %r11, (%rdi) + jnc Ltop + +Lend: test $1, %dl + jz 1f + mov (%rsi), %r8 + not %r8 + mov %r8, 8(%rdi) + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi +1: test $2, %dl + jz 1f + mov (%rsi), %r8 + mov 8(%rsi), %r9 + not %r8 + not %r9 + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) +1: + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/copyd.s b/vere/ext/gmp/gen/x86_64-macos/mpn/copyd.s new file mode 100644 index 0000000..eced825 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/copyd.s @@ -0,0 +1,279 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_copyd + + +___gmpn_copyd: + + + + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + + cmp $7, %rdx + jbe Lbc + + test $8, %dil + jnz Lrp_aligned + + mov (%rsi), %rax + mov %rax, (%rdi) + lea -8(%rsi), %rsi + lea -8(%rdi), %rdi + dec %rdx + +Lrp_aligned: + test $8, %sil + jz Luent + + jmp Lam + + .align 4, 0x90 +Latop:movaps -8(%rsi), %xmm0 + movaps -24(%rsi), %xmm1 + movaps -40(%rsi), %xmm2 + movaps -56(%rsi), %xmm3 + lea -64(%rsi), %rsi + movaps %xmm0, -8(%rdi) + movaps %xmm1, -24(%rdi) + movaps %xmm2, -40(%rdi) + movaps %xmm3, -56(%rdi) + lea -64(%rdi), %rdi +Lam: sub $8, %rdx + jnc Latop + + test $4, %dl + jz 1f + movaps -8(%rsi), %xmm0 + movaps -24(%rsi), %xmm1 + lea -32(%rsi), %rsi + movaps %xmm0, -8(%rdi) + movaps %xmm1, -24(%rdi) + lea -32(%rdi), %rdi + +1: test $2, %dl + jz 1f + movaps -8(%rsi), %xmm0 + lea -16(%rsi), %rsi + movaps %xmm0, -8(%rdi) + lea -16(%rdi), %rdi + +1: test $1, %dl + jz 1f + mov (%rsi), %r8 + mov %r8, (%rdi) + +1: + ret + +Luent:sub $16, %rdx + movaps 
(%rsi), %xmm0 + jc Luend + + .align 4, 0x90 +Lutop:sub $16, %rdx + movaps -16(%rsi), %xmm1 + .byte 0x66,0x0f,0x3a,0x0f,193,8 + movaps %xmm0, -8(%rdi) + movaps -32(%rsi), %xmm2 + .byte 0x66,0x0f,0x3a,0x0f,202,8 + movaps %xmm1, -24(%rdi) + movaps -48(%rsi), %xmm3 + .byte 0x66,0x0f,0x3a,0x0f,211,8 + movaps %xmm2, -40(%rdi) + movaps -64(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,216,8 + movaps %xmm3, -56(%rdi) + movaps -80(%rsi), %xmm1 + .byte 0x66,0x0f,0x3a,0x0f,193,8 + movaps %xmm0, -72(%rdi) + movaps -96(%rsi), %xmm2 + .byte 0x66,0x0f,0x3a,0x0f,202,8 + movaps %xmm1, -88(%rdi) + movaps -112(%rsi), %xmm3 + .byte 0x66,0x0f,0x3a,0x0f,211,8 + movaps %xmm2, -104(%rdi) + movaps -128(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,216,8 + movaps %xmm3, -120(%rdi) + lea -128(%rsi), %rsi + lea -128(%rdi), %rdi + jnc Lutop + +Luend:test $8, %dl + jz 1f + movaps -16(%rsi), %xmm1 + .byte 0x66,0x0f,0x3a,0x0f,193,8 + movaps %xmm0, -8(%rdi) + movaps -32(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps %xmm1, -24(%rdi) + movaps -48(%rsi), %xmm1 + .byte 0x66,0x0f,0x3a,0x0f,193,8 + movaps %xmm0, -40(%rdi) + movaps -64(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps %xmm1, -56(%rdi) + lea -64(%rsi), %rsi + lea -64(%rdi), %rdi + +1: test $4, %dl + jz 1f + movaps -16(%rsi), %xmm1 + .byte 0x66,0x0f,0x3a,0x0f,193,8 + movaps %xmm0, -8(%rdi) + movaps -32(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps %xmm1, -24(%rdi) + lea -32(%rsi), %rsi + lea -32(%rdi), %rdi + +1: test $2, %dl + jz 1f + movaps -16(%rsi), %xmm1 + .byte 0x66,0x0f,0x3a,0x0f,193,8 + movaps %xmm0, -8(%rdi) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + +1: test $1, %dl + jz 1f + mov (%rsi), %r8 + mov %r8, (%rdi) + +1: + ret + + + + +Lbc: sub $4, %edx + jc Lend + + .align 4, 0x90 +Ltop: mov (%rsi), %r8 + mov -8(%rsi), %r9 + lea -32(%rdi), %rdi + mov -16(%rsi), %r10 + mov -24(%rsi), %r11 + lea -32(%rsi), %rsi + mov %r8, 32(%rdi) + mov %r9, 24(%rdi) + + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + +Lend: test $1, %dl + jz 1f + mov (%rsi), %r8 + mov %r8, (%rdi) + lea -8(%rdi), %rdi + lea -8(%rsi), %rsi +1: test $2, %dl + jz 1f + mov (%rsi), %r8 + mov -8(%rsi), %r9 + mov %r8, (%rdi) + mov %r9, -8(%rdi) +1: + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/copyi.s b/vere/ext/gmp/gen/x86_64-macos/mpn/copyi.s new file mode 100644 index 0000000..9f77e50 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/copyi.s @@ -0,0 +1,324 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_copyi + + +___gmpn_copyi: + + + + cmp $7, %rdx + jbe Lbc + + test $8, %dil + jz Lrp_aligned + + movsq + dec %rdx + +Lrp_aligned: + test $8, %sil + jnz Luent + + jmp Lam + + .align 4, 0x90 +Latop:movdqa 0(%rsi), %xmm0 + movdqa 16(%rsi), %xmm1 + movdqa 32(%rsi), %xmm2 + movdqa 48(%rsi), %xmm3 + lea 64(%rsi), %rsi + movdqa %xmm0, (%rdi) + movdqa %xmm1, 16(%rdi) + movdqa %xmm2, 32(%rdi) + movdqa %xmm3, 48(%rdi) + lea 64(%rdi), %rdi +Lam: sub $8, %rdx + jnc Latop + + test $4, %dl + jz 1f + movdqa (%rsi), %xmm0 + movdqa 16(%rsi), %xmm1 + lea 32(%rsi), %rsi + movdqa %xmm0, (%rdi) + movdqa %xmm1, 16(%rdi) + lea 32(%rdi), %rdi + +1: test $2, %dl + jz 1f + movdqa (%rsi), %xmm0 + lea 16(%rsi), %rsi + movdqa %xmm0, (%rdi) + lea 16(%rdi), %rdi + +1: test $1, %dl + jz 1f + mov (%rsi), %r8 + mov %r8, (%rdi) + +1: + ret + +Luent: + + + cmp $16, %rdx + jc Lued0 + + + + + + + movaps 120(%rsi), 
%xmm7 + movaps 104(%rsi), %xmm6 + movaps 88(%rsi), %xmm5 + movaps 72(%rsi), %xmm4 + movaps 56(%rsi), %xmm3 + movaps 40(%rsi), %xmm2 + lea 128(%rsi), %rsi + sub $32, %rdx + jc Lued1 + + .align 4, 0x90 +Lutop:movaps -104(%rsi), %xmm1 + sub $16, %rdx + movaps -120(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,254,8 + movaps -136(%rsi), %xmm8 + movdqa %xmm7, 112(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,245,8 + movaps 120(%rsi), %xmm7 + movdqa %xmm6, 96(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,236,8 + movaps 104(%rsi), %xmm6 + movdqa %xmm5, 80(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,227,8 + movaps 88(%rsi), %xmm5 + movdqa %xmm4, 64(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,218,8 + movaps 72(%rsi), %xmm4 + movdqa %xmm3, 48(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,209,8 + movaps 56(%rsi), %xmm3 + movdqa %xmm2, 32(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps 40(%rsi), %xmm2 + movdqa %xmm1, 16(%rdi) + .byte 0x66,65,0x0f,0x3a,0x0f,192,8 + lea 128(%rsi), %rsi + movdqa %xmm0, (%rdi) + lea 128(%rdi), %rdi + jnc Lutop + +Lued1:movaps -104(%rsi), %xmm1 + movaps -120(%rsi), %xmm0 + movaps -136(%rsi), %xmm8 + .byte 0x66,0x0f,0x3a,0x0f,254,8 + movdqa %xmm7, 112(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,245,8 + movdqa %xmm6, 96(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,236,8 + movdqa %xmm5, 80(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,227,8 + movdqa %xmm4, 64(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,218,8 + movdqa %xmm3, 48(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,209,8 + movdqa %xmm2, 32(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movdqa %xmm1, 16(%rdi) + .byte 0x66,65,0x0f,0x3a,0x0f,192,8 + movdqa %xmm0, (%rdi) + lea 128(%rdi), %rdi + + + + + + +Lued0:test $8, %dl + jz 1f + movaps 56(%rsi), %xmm3 + movaps 40(%rsi), %xmm2 + movaps 24(%rsi), %xmm1 + movaps 8(%rsi), %xmm0 + movaps -8(%rsi), %xmm4 + .byte 0x66,0x0f,0x3a,0x0f,218,8 + movdqa %xmm3, 48(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,209,8 + movdqa %xmm2, 32(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movdqa %xmm1, 16(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,196,8 + lea 64(%rsi), %rsi + movdqa %xmm0, (%rdi) + lea 64(%rdi), %rdi + +1: test $4, %dl + jz 1f + movaps 24(%rsi), %xmm1 + movaps 8(%rsi), %xmm0 + .byte 0x66,0x0f,0x3a,0x0f,200,8 + movaps -8(%rsi), %xmm3 + movdqa %xmm1, 16(%rdi) + .byte 0x66,0x0f,0x3a,0x0f,195,8 + lea 32(%rsi), %rsi + movdqa %xmm0, (%rdi) + lea 32(%rdi), %rdi + +1: test $2, %dl + jz 1f + movdqa 8(%rsi), %xmm0 + movdqa -8(%rsi), %xmm3 + .byte 0x66,0x0f,0x3a,0x0f,195,8 + lea 16(%rsi), %rsi + movdqa %xmm0, (%rdi) + lea 16(%rdi), %rdi + +1: test $1, %dl + jz 1f + mov (%rsi), %r8 + mov %r8, (%rdi) + +1: + ret + + + + +Lbc: lea -8(%rdi), %rdi + sub $4, %edx + jc Lend + + .align 4, 0x90 +Ltop: mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi + mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + lea 32(%rsi), %rsi + mov %r8, -24(%rdi) + mov %r9, -16(%rdi) + + mov %r10, -8(%rdi) + mov %r11, (%rdi) + + +Lend: test $1, %dl + jz 1f + mov (%rsi), %r8 + mov %r8, 8(%rdi) + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi +1: test $2, %dl + jz 1f + mov (%rsi), %r8 + mov 8(%rsi), %r9 + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) +1: + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_1n_pi1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_1n_pi1.s new file mode 100644 index 0000000..5ca7107 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_1n_pi1.s @@ -0,0 +1,259 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_div_qr_1n_pi1 + + +___gmpn_div_qr_1n_pi1: + + + + + dec %rdx 
+ jnz Lfirst + + + + lea 1(%rcx), %r10 + mov %rcx, %rax + mul %r9 + mov (%rsi), %r11 + add %r11, %rax + adc %r10, %rdx + mov %rdx, %r10 + imul %r8, %rdx + sub %rdx, %r11 + cmp %r11, %rax + lea (%r11, %r8), %rax + cmovnc %r11, %rax + sbb $0, %r10 + cmp %r8, %rax + jc Lsingle_div_done + sub %r8, %rax + add $1, %r10 +Lsingle_div_done: + mov %r10, (%rdi) + + ret +Lfirst: + + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + + mov %r8, %rbp + imul %r9, %rbp + neg %rbp + mov %rbp, %rbx + sub %r8, %rbx + + + push %r8 + mov %rdx, %r8 + + mov %r9, %rax + mul %rcx + mov %rax, %r13 + add %rcx, %rdx + mov %rdx, %r10 + + mov %rbp, %rax + mul %rcx + mov -8(%rsi, %r8, 8), %r11 + mov (%rsi, %r8, 8), %rcx + mov %r10, (%rdi, %r8, 8) + add %rax, %r11 + adc %rdx, %rcx + sbb %r12, %r12 + dec %r8 + mov %rcx, %rax + jz Lfinal + + .align 4, 0x90 + + + +Lloop: + + + mov %r9, %r14 + mov %r12, %r15 + and %r12, %r14 + neg %r15 + mul %r9 + add %rdx, %r14 + adc $0, %r15 + add %r13, %r14 + mov %rax, %r13 + mov %rbp, %rax + lea (%rbx, %r11), %r10 + adc $0, %r15 + + + mul %rcx + and %rbp, %r12 + add %r12, %r11 + cmovnc %r11, %r10 + + + adc %rcx, %r14 + mov -8(%rsi, %r8, 8), %r11 + adc %r15, 8(%rdi, %r8, 8) + jc Lq_incr +Lq_incr_done: + add %rax, %r11 + mov %r10, %rax + adc %rdx, %rax + mov %r14, (%rdi, %r8, 8) + sbb %r12, %r12 + dec %r8 + mov %rax, %rcx + jnz Lloop + +Lfinal: + pop %r8 + + mov %r12, %r14 + and %r8, %r12 + sub %r12, %rax + neg %r14 + + mov %rax, %rcx + sub %r8, %rax + cmovc %rcx, %rax + sbb $-1, %r14 + + lea 1(%rax), %r10 + mul %r9 + add %r11, %rax + adc %r10, %rdx + mov %rdx, %r10 + imul %r8, %rdx + sub %rdx, %r11 + cmp %r11, %rax + lea (%r11, %r8), %rax + cmovnc %r11, %rax + sbb $0, %r10 + cmp %r8, %rax + jc Ldiv_done + sub %r8, %rax + add $1, %r10 +Ldiv_done: + add %r10, %r13 + mov %r13, (%rdi) + adc %r14, 8(%rdi) + jnc Ldone +Lfinal_q_incr: + addq $1, 16(%rdi) + lea 8(%rdi), %rdi + jc Lfinal_q_incr + +Ldone: + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +Lq_incr: + + lea 16(%rdi, %r8, 8), %rcx +Lq_incr_loop: + addq $1, (%rcx) + jnc Lq_incr_done + lea 8(%rcx), %rcx + jmp Lq_incr_loop + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_2n_pi1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_2n_pi1.s new file mode 100644 index 0000000..1334c1e --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_2n_pi1.s @@ -0,0 +1,171 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_div_qr_2n_pi1 + + +___gmpn_div_qr_2n_pi1: + + + + + + mov 8(%rsp), %r10 + mov %rdx, %r11 + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + + mov -16(%r11, %rcx, 8), %r12 + mov -8(%r11, %rcx, 8), %rbx + + mov %r12, %r14 + mov %rbx, %r13 + sub %r9, %r14 + sbb %r8, %r13 + cmovnc %r14, %r12 + cmovnc %r13, %rbx + + sbb %rax, %rax + inc %rax + push %rax + lea -2(%rcx), %rcx + mov %r8, %r15 + neg %r15 + + jmp Lnext + + .align 4, 0x90 +Lloop: + + + + mov %r10, %rax + mul %rbx + mov %r12, %r14 + add %rax, %r14 + adc %rbx, %rdx + mov %rdx, %r13 + imul %r15, %rdx + mov %r9, %rax + lea (%rdx, %r12), %rbx + mul %r13 + mov (%r11, %rcx, 8), %r12 + sub %r9, %r12 + sbb %r8, %rbx + sub %rax, %r12 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %r14, %rbx + cmovnc %r9, %rax + cmovnc %r8, %rdx + adc $0, %r13 + nop + add %rax, %r12 + adc %rdx, %rbx + cmp %r8, %rbx + jae Lfix +Lbck: + mov %r13, (%rdi, %rcx, 8) +Lnext: + sub $1, %rcx + jnc Lloop 
+Lend: + mov %rbx, 8(%rsi) + mov %r12, (%rsi) + + + pop %rax + + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +Lfix: + seta %dl + cmp %r9, %r12 + setae %al + orb %dl, %al + je Lbck + inc %r13 + sub %r9, %r12 + sbb %r8, %rbx + jmp Lbck + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_2u_pi1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_2u_pi1.s new file mode 100644 index 0000000..62fb8b7 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/div_qr_2u_pi1.s @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + + .globl ___gmpn_div_qr_2u_pi1 + + +___gmpn_div_qr_2u_pi1: + + mov 0+16(%rsp), %r10 + mov %rdx, %r11 + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + push %rsi + + lea -2(%rcx), %rbp + mov %r8, %r15 + neg %r15 + + + movl 56+8(%rsp), %ecx + + + + xor %ebx, %ebx + mov 8(%r11, %rbp, 8), %r12 + shld %cl, %r12, %rbx + + + mov %r10, %rax + mul %rbx + mov (%r11, %rbp, 8), %rsi + shld %cl, %rsi, %r12 + mov %r12, %r14 + add %rax, %r14 + adc %rbx, %rdx + mov %rdx, %r13 + imul %r15, %rdx + mov %r9, %rax + lea (%rdx, %r12), %rbx + mul %r13 + mov %rsi, %r12 + shl %cl, %r12 + sub %r9, %r12 + sbb %r8, %rbx + sub %rax, %r12 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %r14, %rbx + cmovnc %r9, %rax + cmovnc %r8, %rdx + adc $0, %r13 + nop + add %rax, %r12 + adc %rdx, %rbx + cmp %r8, %rbx + jae Lfix_qh +Lbck_qh: + push %r13 + + jmp Lnext + + .align 4, 0x90 +Lloop: + + + + mov %r10, %rax + mul %rbx + mov (%r11, %rbp, 8), %rsi + xor %r13d, %r13d + shld %cl, %rsi, %r13 + or %r13, %r12 + mov %r12, %r14 + add %rax, %r14 + adc %rbx, %rdx + mov %rdx, %r13 + imul %r15, %rdx + mov %r9, %rax + lea (%rdx, %r12), %rbx + mul %r13 + mov %rsi, %r12 + shl %cl, %r12 + sub %r9, %r12 + sbb %r8, %rbx + sub %rax, %r12 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %r14, %rbx + cmovnc %r9, %rax + cmovnc %r8, %rdx + adc $0, %r13 + nop + add %rax, %r12 + adc %rdx, %rbx + cmp %r8, %rbx + jae Lfix +Lbck: + mov %r13, (%rdi, %rbp, 8) +Lnext: + sub $1, %rbp + jnc Lloop +Lend: + + pop %rax + pop %rsi + shrd %cl, %rbx, %r12 + shr %cl, %rbx + mov %rbx, 8(%rsi) + mov %r12, (%rsi) + + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + ret + +Lfix: + seta %dl + cmp %r9, %r12 + setae %al + orb %dl, %al + je Lbck + inc %r13 + sub %r9, %r12 + sbb %r8, %rbx + jmp Lbck + + +Lfix_qh: + seta %dl + cmp %r9, %r12 + setae %al + orb %dl, %al + je Lbck_qh + inc %r13 + sub %r9, %r12 + sbb %r8, %rbx + jmp Lbck_qh + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/dive_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/dive_1.s new file mode 100644 index 0000000..4d45de0 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/dive_1.s @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_divexact_1 + + +___gmpn_divexact_1: + + + push %rbx + + mov %rcx, %rax + xor %ecx, %ecx + mov %rdx, %r8 + + bt $0, %eax + jnc Levn + +Lodd: mov %rax, %rbx + shr %eax + and $127, %eax + + lea ___gmp_binvert_limb_table(%rip), %rdx + + + + movzbl (%rdx,%rax), %eax + + mov %rbx, %r11 + + lea (%rax,%rax), %edx + imul %eax, %eax + imul %ebx, %eax + sub %eax, %edx + + lea (%rdx,%rdx), %eax + imul %edx, %edx + imul %ebx, %edx + sub %edx, %eax + + lea (%rax,%rax), %r10 + imul %rax, %rax + imul %rbx, %rax + sub %rax, %r10 + + lea (%rsi,%r8,8), %rsi + lea 
-8(%rdi,%r8,8), %rdi + neg %r8 + + mov (%rsi,%r8,8), %rax + + inc %r8 + jz Lone + + mov (%rsi,%r8,8), %rdx + + shrd %cl, %rdx, %rax + + xor %ebx, %ebx + jmp Lent + +Levn: bsf %rax, %rcx + shr %cl, %rax + jmp Lodd + + .align 3, 0x90 +Ltop: + + + + + + + + + + + mul %r11 + mov -8(%rsi,%r8,8), %rax + mov (%rsi,%r8,8), %r9 + shrd %cl, %r9, %rax + nop + sub %rbx, %rax + setc %bl + sub %rdx, %rax + adc $0, %rbx +Lent: imul %r10, %rax + mov %rax, (%rdi,%r8,8) + inc %r8 + jnz Ltop + + mul %r11 + mov -8(%rsi), %rax + shr %cl, %rax + sub %rbx, %rax + sub %rdx, %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + + ret + +Lone: shr %cl, %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/divrem_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/divrem_1.s new file mode 100644 index 0000000..c0d5b59 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/divrem_1.s @@ -0,0 +1,348 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_preinv_divrem_1 + + +___gmpn_preinv_divrem_1: + + + + + xor %eax, %eax + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rsi, %r12 + mov %rcx, %rbx + add %rsi, %rcx + mov %rdx, %rsi + + lea -8(%rdi,%rcx,8), %rdi + + test %r8, %r8 + js Lnent + + mov 40(%rsp), %cl + shl %cl, %r8 + jmp Luent + + + .align 4, 0x90 + .globl ___gmpn_divrem_1 + + +___gmpn_divrem_1: + + + + xor %eax, %eax + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rsi, %r12 + mov %rcx, %rbx + add %rsi, %rcx + mov %rdx, %rsi + je Lret + + lea -8(%rdi,%rcx,8), %rdi + xor %ebp, %ebp + + test %r8, %r8 + jns Lunnormalized + +Lnormalized: + test %rbx, %rbx + je L8 + mov -8(%rsi,%rbx,8), %rbp + dec %rbx + mov %rbp, %rax + sub %r8, %rbp + cmovc %rax, %rbp + sbb %eax, %eax + inc %eax + mov %rax, (%rdi) + lea -8(%rdi), %rdi +L8: + push %rdi + push %rsi + push %r8 + mov %r8, %rdi + + + + call ___gmpn_invert_limb + + pop %r8 + pop %rsi + pop %rdi + + mov %rax, %r9 + mov %rbp, %rax + jmp Lnent + + .align 4, 0x90 +Lntop:mov (%rsi,%rbx,8), %r10 + mul %r9 + add %r10, %rax + adc %rbp, %rdx + mov %rax, %rbp + mov %rdx, %r13 + imul %r8, %rdx + sub %rdx, %r10 + mov %r8, %rax + add %r10, %rax + cmp %rbp, %r10 + cmovc %r10, %rax + adc $-1, %r13 + cmp %r8, %rax + jae Lnfx +Lnok: mov %r13, (%rdi) + sub $8, %rdi +Lnent:lea 1(%rax), %rbp + dec %rbx + jns Lntop + + xor %ecx, %ecx + jmp Lfrac + +Lnfx: sub %r8, %rax + inc %r13 + jmp Lnok + +Lunnormalized: + test %rbx, %rbx + je L44 + mov -8(%rsi,%rbx,8), %rax + cmp %r8, %rax + jae L44 + mov %rbp, (%rdi) + mov %rax, %rbp + lea -8(%rdi), %rdi + je Lret + dec %rbx +L44: + bsr %r8, %rcx + not %ecx + shl %cl, %r8 + shl %cl, %rbp + + push %rcx + push %rdi + push %rsi + push %r8 + sub $8, %rsp + mov %r8, %rdi + + + + call ___gmpn_invert_limb + add $8, %rsp + + pop %r8 + pop %rsi + pop %rdi + pop %rcx + + mov %rax, %r9 + mov %rbp, %rax + test %rbx, %rbx + je Lfrac + +Luent:dec %rbx + mov (%rsi,%rbx,8), %rbp + neg %ecx + shr %cl, %rbp + neg %ecx + or %rbp, %rax + jmp Lent + + .align 4, 0x90 +Lutop:mov (%rsi,%rbx,8), %r10 + shl %cl, %rbp + neg %ecx + shr %cl, %r10 + neg %ecx + or %r10, %rbp + mul %r9 + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul %r8, %rdx + sub %rdx, %rbp + mov %r8, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp %r8, %rax + jae Lufx +Luok: mov %r13, (%rdi) + sub $8, 
%rdi +Lent: mov (%rsi,%rbx,8), %rbp + dec %rbx + lea 1(%rax), %r11 + jns Lutop + +Luend:shl %cl, %rbp + mul %r9 + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul %r8, %rdx + sub %rdx, %rbp + mov %r8, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp %r8, %rax + jae Lefx +Leok: mov %r13, (%rdi) + sub $8, %rdi + jmp Lfrac + +Lufx: sub %r8, %rax + inc %r13 + jmp Luok +Lefx: sub %r8, %rax + inc %r13 + jmp Leok + +Lfrac:mov %r8, %rbp + neg %rbp + jmp Lfent + + .align 4, 0x90 +Lftop:mul %r9 + add %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul %rbp, %rdx + mov %r8, %rax + add %rdx, %rax + cmp %r11, %rdx + cmovc %rdx, %rax + adc $-1, %r13 + mov %r13, (%rdi) + sub $8, %rdi +Lfent:lea 1(%rax), %r11 + dec %r12 + jns Lftop + + shr %cl, %rax +Lret: pop %rbx + pop %rbp + pop %r12 + pop %r13 + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/divrem_2.s b/vere/ext/gmp/gen/x86_64-macos/mpn/divrem_2.s new file mode 100644 index 0000000..a1c614e --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/divrem_2.s @@ -0,0 +1,207 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_divrem_2 + + +___gmpn_divrem_2: + + + + push %r15 + push %r14 + push %r13 + push %r12 + lea -24(%rdx,%rcx,8), %r12 + mov %rsi, %r13 + push %rbp + mov %rdi, %rbp + push %rbx + mov 8(%r8), %r11 + mov 16(%r12), %rbx + mov (%r8), %r8 + mov 8(%r12), %r10 + + xor %r15d, %r15d + cmp %rbx, %r11 + ja L2 + setb %dl + cmp %r10, %r8 + setbe %al + orb %al, %dl + je L2 + inc %r15d + sub %r8, %r10 + sbb %r11, %rbx +L2: + lea -3(%rcx,%r13), %r14 + test %r14, %r14 + js Lend + + push %r8 + push %r10 + push %r11 + mov %r11, %rdi + + + + call ___gmpn_invert_limb + + pop %r11 + pop %r10 + pop %r8 + + mov %r11, %rdx + mov %rax, %rdi + imul %rax, %rdx + mov %rdx, %r9 + mul %r8 + xor %ecx, %ecx + add %r8, %r9 + adc $-1, %rcx + add %rdx, %r9 + adc $0, %rcx + js 2f +1: dec %rdi + sub %r11, %r9 + sbb $0, %rcx + jns 1b +2: + + lea (%rbp,%r14,8), %rbp + mov %r11, %rsi + neg %rsi + + + + + .align 4, 0x90 +Ltop: mov %rdi, %rax + mul %rbx + mov %r10, %rcx + add %rax, %rcx + adc %rbx, %rdx + mov %rdx, %r9 + imul %rsi, %rdx + mov %r8, %rax + lea (%rdx, %r10), %rbx + xor %r10d, %r10d + mul %r9 + cmp %r14, %r13 + jg L19 + mov (%r12), %r10 + sub $8, %r12 +L19: sub %r8, %r10 + sbb %r11, %rbx + sub %rax, %r10 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %rcx, %rbx + cmovnc %r8, %rax + cmovnc %r11, %rdx + adc $0, %r9 + nop + add %rax, %r10 + adc %rdx, %rbx + cmp %r11, %rbx + jae Lfix +Lbck: mov %r9, (%rbp) + sub $8, %rbp + dec %r14 + jns Ltop + +Lend: mov %r10, 8(%r12) + mov %rbx, 16(%r12) + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + mov %r15, %rax + pop %r15 + + ret + +Lfix: seta %dl + cmp %r8, %r10 + setae %al + orb %dl, %al + je Lbck + inc %r9 + sub %r8, %r10 + sbb %r11, %rbx + jmp Lbck + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/fib_table.c b/vere/ext/gmp/gen/x86_64-macos/mpn/fib_table.c new file mode 100644 index 0000000..a830475 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/fib_table.c @@ -0,0 +1,107 @@ +/* This file generated by gen-fib.c - DO NOT EDIT. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" + +#if GMP_NUMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +const mp_limb_t +__gmp_fib_table[FIB_TABLE_LIMIT+2] = { + CNST_LIMB (0x1), /* -1 */ + CNST_LIMB (0x0), /* 0 */ + CNST_LIMB (0x1), /* 1 */ + CNST_LIMB (0x1), /* 2 */ + CNST_LIMB (0x2), /* 3 */ + CNST_LIMB (0x3), /* 4 */ + CNST_LIMB (0x5), /* 5 */ + CNST_LIMB (0x8), /* 6 */ + CNST_LIMB (0xd), /* 7 */ + CNST_LIMB (0x15), /* 8 */ + CNST_LIMB (0x22), /* 9 */ + CNST_LIMB (0x37), /* 10 */ + CNST_LIMB (0x59), /* 11 */ + CNST_LIMB (0x90), /* 12 */ + CNST_LIMB (0xe9), /* 13 */ + CNST_LIMB (0x179), /* 14 */ + CNST_LIMB (0x262), /* 15 */ + CNST_LIMB (0x3db), /* 16 */ + CNST_LIMB (0x63d), /* 17 */ + CNST_LIMB (0xa18), /* 18 */ + CNST_LIMB (0x1055), /* 19 */ + CNST_LIMB (0x1a6d), /* 20 */ + CNST_LIMB (0x2ac2), /* 21 */ + CNST_LIMB (0x452f), /* 22 */ + CNST_LIMB (0x6ff1), /* 23 */ + CNST_LIMB (0xb520), /* 24 */ + CNST_LIMB (0x12511), /* 25 */ + CNST_LIMB (0x1da31), /* 26 */ + CNST_LIMB (0x2ff42), /* 27 */ + CNST_LIMB (0x4d973), /* 28 */ + CNST_LIMB (0x7d8b5), /* 29 */ + CNST_LIMB (0xcb228), /* 30 */ + CNST_LIMB (0x148add), /* 31 */ + CNST_LIMB (0x213d05), /* 32 */ + CNST_LIMB (0x35c7e2), /* 33 */ + CNST_LIMB (0x5704e7), /* 34 */ + CNST_LIMB (0x8cccc9), /* 35 */ + CNST_LIMB (0xe3d1b0), /* 36 */ + CNST_LIMB (0x1709e79), /* 37 */ + CNST_LIMB (0x2547029), /* 38 */ + CNST_LIMB (0x3c50ea2), /* 39 */ + CNST_LIMB (0x6197ecb), /* 40 */ + CNST_LIMB (0x9de8d6d), /* 41 */ + CNST_LIMB (0xff80c38), /* 42 */ + CNST_LIMB (0x19d699a5), /* 43 */ + CNST_LIMB (0x29cea5dd), /* 44 */ + CNST_LIMB (0x43a53f82), /* 45 */ + CNST_LIMB (0x6d73e55f), /* 46 */ + CNST_LIMB (0xb11924e1), /* 47 */ + CNST_LIMB (0x11e8d0a40), /* 48 */ + CNST_LIMB (0x1cfa62f21), /* 49 */ + CNST_LIMB (0x2ee333961), /* 50 */ + CNST_LIMB (0x4bdd96882), /* 51 */ + CNST_LIMB (0x7ac0ca1e3), /* 52 */ + CNST_LIMB (0xc69e60a65), /* 53 */ + CNST_LIMB (0x1415f2ac48), /* 54 */ + CNST_LIMB (0x207fd8b6ad), /* 55 */ + CNST_LIMB (0x3495cb62f5), /* 56 */ + CNST_LIMB (0x5515a419a2), /* 57 */ + CNST_LIMB (0x89ab6f7c97), /* 58 */ + CNST_LIMB (0xdec1139639), /* 59 */ + CNST_LIMB (0x1686c8312d0), /* 60 */ + CNST_LIMB (0x2472d96a909), /* 61 */ + CNST_LIMB (0x3af9a19bbd9), /* 62 */ + CNST_LIMB (0x5f6c7b064e2), /* 63 */ + CNST_LIMB (0x9a661ca20bb), /* 64 */ + CNST_LIMB (0xf9d297a859d), /* 65 */ + CNST_LIMB (0x19438b44a658), /* 66 */ + CNST_LIMB (0x28e0b4bf2bf5), /* 67 */ + CNST_LIMB (0x42244003d24d), /* 68 */ + CNST_LIMB (0x6b04f4c2fe42), /* 69 */ + CNST_LIMB (0xad2934c6d08f), /* 70 */ + CNST_LIMB (0x1182e2989ced1), /* 71 */ + CNST_LIMB (0x1c5575e509f60), /* 72 */ + CNST_LIMB (0x2dd8587da6e31), /* 73 */ + CNST_LIMB (0x4a2dce62b0d91), /* 74 */ + CNST_LIMB (0x780626e057bc2), /* 75 */ + CNST_LIMB (0xc233f54308953), /* 76 */ + CNST_LIMB (0x13a3a1c2360515), /* 77 */ + CNST_LIMB (0x1fc6e116668e68), /* 78 */ + CNST_LIMB (0x336a82d89c937d), /* 79 */ + CNST_LIMB (0x533163ef0321e5), /* 80 */ + CNST_LIMB (0x869be6c79fb562), /* 81 */ + CNST_LIMB (0xd9cd4ab6a2d747), /* 82 */ + CNST_LIMB (0x16069317e428ca9), /* 83 */ + CNST_LIMB (0x23a367c34e563f0), /* 84 */ + CNST_LIMB (0x39a9fadb327f099), /* 85 */ + CNST_LIMB (0x5d4d629e80d5489), /* 86 */ + CNST_LIMB (0x96f75d79b354522), /* 87 */ + CNST_LIMB (0xf444c01834299ab), /* 88 */ + CNST_LIMB (0x18b3c1d91e77decd), /* 89 */ + CNST_LIMB (0x27f80ddaa1ba7878), /* 90 */ + CNST_LIMB (0x40abcfb3c0325745), /* 91 */ + CNST_LIMB (0x68a3dd8e61eccfbd), /* 92 */ + CNST_LIMB (0xa94fad42221f2702), /* 93 */ +}; diff --git 
a/vere/ext/gmp/gen/x86_64-macos/mpn/gcd_11.s b/vere/ext/gmp/gen/x86_64-macos/mpn/gcd_11.s new file mode 100644 index 0000000..700ac46 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/gcd_11.s @@ -0,0 +1,120 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_gcd_11 + + +___gmpn_gcd_11: + + + jmp Lodd + + .align 4, 0x90 +Ltop: cmovc %rdx, %rdi + cmovc %rax, %rsi + shr %cl, %rdi +Lodd: mov %rsi, %rdx + sub %rdi, %rdx + bsf %rdx, %rcx + mov %rdi, %rax + sub %rsi, %rdi + jnz Ltop + +Lend: + + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/gcd_22.s b/vere/ext/gmp/gen/x86_64-macos/mpn/gcd_22.s new file mode 100644 index 0000000..02be83c --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/gcd_22.s @@ -0,0 +1,152 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_gcd_22 + + +___gmpn_gcd_22: + + + + .align 4, 0x90 +Ltop: mov %rcx, %r10 + sub %rsi, %r10 + jz Llowz + mov %rdx, %r11 + sbb %rdi, %r11 + + rep;bsf %r10, %rax + + mov %rsi, %r8 + sub %rcx, %rsi + mov %rdi, %r9 + sbb %rdx, %rdi + +Lbck: cmovc %r10, %rsi + cmovc %r11, %rdi + cmovc %r8, %rcx + cmovc %r9, %rdx + + xor %r10d, %r10d + sub %rax, %r10 + .byte 0xc4,98,169,0xf7,207 + .byte 0xc4,226,251,0xf7,246 + .byte 0xc4,226,251,0xf7,255 + or %r9, %rsi + + test %rdx, %rdx + jnz Ltop + test %rdi, %rdi + jnz Ltop + +Lgcd_11: + mov %rcx, %rdi + + jmp ___gmpn_gcd_11 + +Llowz: + + + mov %rdx, %r10 + sub %rdi, %r10 + je Lend + + xor %r11, %r11 + mov %rsi, %r8 + mov %rdi, %r9 + rep;bsf %r10, %rax + mov %rdi, %rsi + xor %rdi, %rdi + sub %rdx, %rsi + jmp Lbck + +Lend: mov %rcx, %rax + +Lret: + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/hamdist.s b/vere/ext/gmp/gen/x86_64-macos/mpn/hamdist.s new file mode 100644 index 0000000..32c21c6 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/hamdist.s @@ -0,0 +1,217 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_hamdist + + +___gmpn_hamdist: + + + push %rbx + push %rbp + + mov (%rdi), %r10 + xor (%rsi), %r10 + + mov %edx, %r8d + and $3, %r8d + + xor %ecx, %ecx + .byte 0xf3,0x49,0x0f,0xb8,0xc2 + + lea Ltab(%rip), %r9 + + movslq (%r9,%r8,4), %r8 + add %r9, %r8 + jmp *%r8 + + +L3: mov 8(%rdi), %r10 + mov 16(%rdi), %r11 + xor 8(%rsi), %r10 + xor 16(%rsi), %r11 + xor %ebp, %ebp + sub $4, %rdx + jle Lx3 + mov 24(%rdi), %r8 + mov 32(%rdi), %r9 + add $24, %rdi + add $24, %rsi + jmp Le3 + +L0: mov 8(%rdi), %r9 + xor 8(%rsi), %r9 + mov 16(%rdi), %r10 + mov 24(%rdi), %r11 + xor %ebx, %ebx + xor 16(%rsi), %r10 + xor 24(%rsi), %r11 + add $32, %rdi + add $32, %rsi + sub $4, %rdx + jle Lx4 + + .align 4, 0x90 +Ltop: +Le0: .byte 0xf3,0x49,0x0f,0xb8,0xe9 + mov (%rdi), %r8 + mov 8(%rdi), %r9 + add %rbx, %rax +Le3: .byte 0xf3,0x49,0x0f,0xb8,0xda + xor (%rsi), %r8 + xor 8(%rsi), %r9 + add %rbp, %rcx +Le2: .byte 0xf3,0x49,0x0f,0xb8,0xeb + mov 16(%rdi), %r10 + mov 24(%rdi), %r11 + add $32, %rdi + add %rbx, %rax +Le1: .byte 0xf3,0x49,0x0f,0xb8,0xd8 + xor 16(%rsi), %r10 + xor 24(%rsi), %r11 + add $32, %rsi + add %rbp, %rcx + sub $4, %rdx + jg Ltop + +Lx4: .byte 0xf3,0x49,0x0f,0xb8,0xe9 + add %rbx, %rax 
+Lx3: .byte 0xf3,0x49,0x0f,0xb8,0xda + add %rbp, %rcx + .byte 0xf3,0x49,0x0f,0xb8,0xeb + add %rbx, %rax + add %rbp, %rcx +Lx2: add %rcx, %rax +Lx1: pop %rbp + pop %rbx + + ret + +L2: mov 8(%rdi), %r11 + xor 8(%rsi), %r11 + sub $2, %rdx + jle Ln2 + mov 16(%rdi), %r8 + mov 24(%rdi), %r9 + xor %ebx, %ebx + xor 16(%rsi), %r8 + xor 24(%rsi), %r9 + add $16, %rdi + add $16, %rsi + jmp Le2 +Ln2: .byte 0xf3,0x49,0x0f,0xb8,0xcb + jmp Lx2 + +L1: dec %rdx + jle Lx1 + mov 8(%rdi), %r8 + mov 16(%rdi), %r9 + xor 8(%rsi), %r8 + xor 16(%rsi), %r9 + xor %ebp, %ebp + mov 24(%rdi), %r10 + mov 32(%rdi), %r11 + add $40, %rdi + add $8, %rsi + jmp Le1 + + + .text + .align 3, 0x90 +Ltab: .set L0_tmp, L0-Ltab + .long L0_tmp + + .set L1_tmp, L1-Ltab + .long L1_tmp + + .set L2_tmp, L2-Ltab + .long L2_tmp + + .set L3_tmp, L3-Ltab + .long L3_tmp + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/invert_limb.s b/vere/ext/gmp/gen/x86_64-macos/mpn/invert_limb.s new file mode 100644 index 0000000..a5f251b --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/invert_limb.s @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.private_extern __gmpn_invert_limb_table + + + .text + .align 4, 0x90 + .globl ___gmpn_invert_limb + + +___gmpn_invert_limb: + + + mov %rdi, %rax + shr $55, %rax + + lea __gmpn_invert_limb_table(%rip), %r8 + add $-512, %r8 + + movzwl (%r8,%rax,2), %ecx + + + mov %rdi, %rsi + mov %ecx, %eax + imul %ecx, %ecx + shr $24, %rsi + inc %rsi + imul %rsi, %rcx + shr $40, %rcx + sal $11, %eax + dec %eax + sub %ecx, %eax + + + mov $0x1000000000000000, %rcx + imul %rax, %rsi + sub %rsi, %rcx + imul %rax, %rcx + sal $13, %rax + shr $47, %rcx + add %rax, %rcx + + + mov %rdi, %rsi + shr %rsi + sbb %rax, %rax + sub %rax, %rsi + imul %rcx, %rsi + and %rcx, %rax + shr %rax + sub %rsi, %rax + mul %rcx + sal $31, %rcx + shr %rdx + add %rdx, %rcx + + mov %rdi, %rax + mul %rcx + add %rdi, %rax + mov %rcx, %rax + adc %rdi, %rdx + sub %rdx, %rax + + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/invert_limb_table.s b/vere/ext/gmp/gen/x86_64-macos/mpn/invert_limb_table.s new file mode 100644 index 0000000..b937cd0 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/invert_limb_table.s @@ -0,0 +1,313 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.private_extern __gmpn_invert_limb_table + + + + + .section __TEXT,__const + .align 1, 0x90 + .globl __gmpn_invert_limb_table +__gmpn_invert_limb_table: + .value 2045 + .value 2037 + .value 2029 + .value 2021 + .value 2013 + .value 2005 + .value 1998 + .value 1990 + .value 1983 + .value 1975 + .value 1968 + .value 1960 + .value 1953 + .value 1946 + .value 1938 + .value 1931 + .value 1924 + .value 1917 + .value 1910 + .value 1903 + .value 1896 + .value 1889 + .value 1883 + .value 1876 + .value 1869 + .value 1863 + .value 1856 + .value 1849 + .value 1843 + .value 1836 + .value 1830 + .value 1824 + .value 1817 + .value 1811 + .value 1805 + .value 1799 + .value 1792 + .value 1786 + .value 1780 + .value 1774 + .value 1768 + .value 1762 + .value 1756 + .value 1750 + .value 1745 + .value 1739 + .value 1733 + .value 1727 + .value 1722 + .value 1716 + .value 1710 + .value 1705 + .value 1699 + .value 1694 + .value 1688 + .value 1683 + .value 1677 + .value 1672 + .value 1667 + .value 1661 + .value 1656 + .value 1651 + .value 1646 + .value 1641 + .value 1636 + .value 1630 + .value 1625 + .value 1620 + .value 1615 + .value 1610 + .value 1605 + .value 1600 + 
.value 1596 + .value 1591 + .value 1586 + .value 1581 + .value 1576 + .value 1572 + .value 1567 + .value 1562 + .value 1558 + .value 1553 + .value 1548 + .value 1544 + .value 1539 + .value 1535 + .value 1530 + .value 1526 + .value 1521 + .value 1517 + .value 1513 + .value 1508 + .value 1504 + .value 1500 + .value 1495 + .value 1491 + .value 1487 + .value 1483 + .value 1478 + .value 1474 + .value 1470 + .value 1466 + .value 1462 + .value 1458 + .value 1454 + .value 1450 + .value 1446 + .value 1442 + .value 1438 + .value 1434 + .value 1430 + .value 1426 + .value 1422 + .value 1418 + .value 1414 + .value 1411 + .value 1407 + .value 1403 + .value 1399 + .value 1396 + .value 1392 + .value 1388 + .value 1384 + .value 1381 + .value 1377 + .value 1374 + .value 1370 + .value 1366 + .value 1363 + .value 1359 + .value 1356 + .value 1352 + .value 1349 + .value 1345 + .value 1342 + .value 1338 + .value 1335 + .value 1332 + .value 1328 + .value 1325 + .value 1322 + .value 1318 + .value 1315 + .value 1312 + .value 1308 + .value 1305 + .value 1302 + .value 1299 + .value 1295 + .value 1292 + .value 1289 + .value 1286 + .value 1283 + .value 1280 + .value 1276 + .value 1273 + .value 1270 + .value 1267 + .value 1264 + .value 1261 + .value 1258 + .value 1255 + .value 1252 + .value 1249 + .value 1246 + .value 1243 + .value 1240 + .value 1237 + .value 1234 + .value 1231 + .value 1228 + .value 1226 + .value 1223 + .value 1220 + .value 1217 + .value 1214 + .value 1211 + .value 1209 + .value 1206 + .value 1203 + .value 1200 + .value 1197 + .value 1195 + .value 1192 + .value 1189 + .value 1187 + .value 1184 + .value 1181 + .value 1179 + .value 1176 + .value 1173 + .value 1171 + .value 1168 + .value 1165 + .value 1163 + .value 1160 + .value 1158 + .value 1155 + .value 1153 + .value 1150 + .value 1148 + .value 1145 + .value 1143 + .value 1140 + .value 1138 + .value 1135 + .value 1133 + .value 1130 + .value 1128 + .value 1125 + .value 1123 + .value 1121 + .value 1118 + .value 1116 + .value 1113 + .value 1111 + .value 1109 + .value 1106 + .value 1104 + .value 1102 + .value 1099 + .value 1097 + .value 1095 + .value 1092 + .value 1090 + .value 1088 + .value 1086 + .value 1083 + .value 1081 + .value 1079 + .value 1077 + .value 1074 + .value 1072 + .value 1070 + .value 1068 + .value 1066 + .value 1064 + .value 1061 + .value 1059 + .value 1057 + .value 1055 + .value 1053 + .value 1051 + .value 1049 + .value 1047 + .value 1044 + .value 1042 + .value 1040 + .value 1038 + .value 1036 + .value 1034 + .value 1032 + .value 1030 + .value 1028 + .value 1026 + .value 1024 + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/ior_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/ior_n.s new file mode 100644 index 0000000..7a15d18 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/ior_n.s @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_ior_n + + +___gmpn_ior_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: or (%rsi), %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: or (%rsi), %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 +Lb00: mov 
8(%rdx), %r9 + or (%rsi), %r8 + or 8(%rsi), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 +Le10: mov 24(%rdx), %r9 + lea 32(%rdx), %rdx + or 16(%rsi), %r8 + or 24(%rsi), %r9 + lea 32(%rsi), %rsi + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + + + + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/iorn_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/iorn_n.s new file mode 100644 index 0000000..b14be40 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/iorn_n.s @@ -0,0 +1,163 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_iorn_n + + +___gmpn_iorn_n: + + + mov (%rdx), %r8 + not %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: or (%rsi), %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: or (%rsi), %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 + not %r8 +Lb00: mov 8(%rdx), %r9 + not %r9 + or (%rsi), %r8 + or 8(%rsi), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 + not %r8 +Le10: mov 24(%rdx), %r9 + not %r9 + lea 32(%rdx), %rdx + or 16(%rsi), %r8 + or 24(%rsi), %r9 + lea 32(%rsi), %rsi + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/jacobitab.h b/vere/ext/gmp/gen/x86_64-macos/mpn/jacobitab.h new file mode 100644 index 0000000..4bdbfcc --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/jacobitab.h @@ -0,0 +1,13 @@ + 0, 0, 0, 0, 0,12, 8, 4, 1, 1, 1, 1, 1,13, 9, 5, + 2, 2, 2, 2, 2, 6,10,14, 3, 3, 3, 3, 3, 7,11,15, + 4,16, 6,18, 4, 0,12, 8, 5,17, 7,19, 5, 1,13, 9, + 6,18, 4,16, 6,10,14, 2, 7,19, 5,17, 7,11,15, 3, + 8,10, 9,11, 8, 4, 0,12, 9,11, 8,10, 9, 5, 1,13, +10, 9,11, 8,10,14, 2, 6,11, 8,10, 9,11,15, 3, 7, +12,22,24,20,12, 8, 4, 0,13,23,25,21,13, 9, 5, 1, +25,21,13,23,14, 2, 6,10,24,20,12,22,15, 3, 7,11, +16, 6,18, 4,16,16,16,16,17, 7,19, 5,17,17,17,17, +18, 4,16, 6,18,22,19,23,19, 5,17, 7,19,23,18,22, +20,12,22,24,20,20,20,20,21,13,23,25,21,21,21,21, +22,24,20,12,22,19,23,18,23,25,21,13,23,18,22,19, +24,20,12,22,15, 3, 7,11,25,21,13,23,14, 2, 6,10, diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/lshift.s b/vere/ext/gmp/gen/x86_64-macos/mpn/lshift.s new file mode 100644 index 0000000..6463cbe --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/lshift.s @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_lshift + + +___gmpn_lshift: + + + movd %ecx, %xmm4 + mov $64, %eax + sub %ecx, %eax + movd %eax, %xmm5 + + neg %ecx + mov -8(%rsi,%rdx,8), %rax + shr %cl, %rax + + cmp $3, %rdx + jle Lbc + + lea (%rdi,%rdx,8), %ecx + test $8, %cl + jz Lrp_aligned + + + movq -8(%rsi,%rdx,8), %xmm0 + movq -16(%rsi,%rdx,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, -8(%rdi,%rdx,8) + dec %rdx + +Lrp_aligned: + lea 1(%rdx), %r8d + + and $6, %r8d + jz Lba0 + cmp $4, %r8d + jz Lba4 + jc Lba2 +Lba6: add $-4, 
%rdx + jmp Li56 +Lba0: add $-6, %rdx + jmp Li70 +Lba4: add $-2, %rdx + jmp Li34 +Lba2: add $-8, %rdx + jle Lend + + .align 4, 0x90 +Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 + movdqu 48(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 48(%rdi,%rdx,8) +Li70: + movdqu 24(%rsi,%rdx,8), %xmm1 + movdqu 32(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 32(%rdi,%rdx,8) +Li56: + movdqu 8(%rsi,%rdx,8), %xmm1 + movdqu 16(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 16(%rdi,%rdx,8) +Li34: + movdqu -8(%rsi,%rdx,8), %xmm1 + movdqu (%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, (%rdi,%rdx,8) + sub $8, %rdx + jg Ltop + +Lend: test $1, %dl + jnz Lend8 + + movdqu (%rsi), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (%rdi) + + ret + + + .align 4, 0x90 +Lbc: dec %edx + jz Lend8 + + movq (%rsi,%rdx,8), %xmm1 + movq -8(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (%rdi,%rdx,8) + sub $2, %edx + jl Lend8 + movq 8(%rsi), %xmm1 + movq (%rsi), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(%rdi) + +Lend8:movq (%rsi), %xmm0 + psllq %xmm4, %xmm0 + movq %xmm0, (%rdi) + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/lshiftc.s b/vere/ext/gmp/gen/x86_64-macos/mpn/lshiftc.s new file mode 100644 index 0000000..ffd7e35 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/lshiftc.s @@ -0,0 +1,222 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_lshiftc + + +___gmpn_lshiftc: + + + movd %ecx, %xmm4 + mov $64, %eax + sub %ecx, %eax + movd %eax, %xmm5 + + neg %ecx + mov -8(%rsi,%rdx,8), %rax + shr %cl, %rax + + pcmpeqb %xmm3, %xmm3 + + cmp $3, %rdx + jle Lbc + + lea (%rdi,%rdx,8), %ecx + test $8, %cl + jz Lrp_aligned + + + movq -8(%rsi,%rdx,8), %xmm0 + movq -16(%rsi,%rdx,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, -8(%rdi,%rdx,8) + dec %rdx + +Lrp_aligned: + lea 1(%rdx), %r8d + + and $6, %r8d + jz Lba0 + cmp $4, %r8d + jz Lba4 + jc Lba2 +Lba6: add $-4, %rdx + jmp Li56 +Lba0: add $-6, %rdx + jmp Li70 +Lba4: add $-2, %rdx + jmp Li34 +Lba2: add $-8, %rdx + jle Lend + + .align 4, 0x90 +Ltop: movdqu 40(%rsi,%rdx,8), %xmm1 + movdqu 48(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 48(%rdi,%rdx,8) +Li70: + movdqu 24(%rsi,%rdx,8), %xmm1 + movdqu 32(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 32(%rdi,%rdx,8) +Li56: + movdqu 8(%rsi,%rdx,8), %xmm1 + movdqu 16(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 16(%rdi,%rdx,8) +Li34: + movdqu -8(%rsi,%rdx,8), %xmm1 + movdqu (%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (%rdi,%rdx,8) + sub $8, %rdx + jg Ltop + +Lend: test $1, %dl + jnz Lend8 + + movdqu (%rsi), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (%rdi) + 
+ ret + + + .align 4, 0x90 +Lbc: dec %edx + jz Lend8 + + movq (%rsi,%rdx,8), %xmm1 + movq -8(%rsi,%rdx,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (%rdi,%rdx,8) + sub $2, %edx + jl Lend8 + movq 8(%rsi), %xmm1 + movq (%rsi), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, 8(%rdi) + +Lend8:movq (%rsi), %xmm0 + psllq %xmm4, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (%rdi) + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_1.s new file mode 100644 index 0000000..a9b4801 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_1.s @@ -0,0 +1,240 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_mod_1_1p + + +___gmpn_mod_1_1p: + + + push %rbp + push %rbx + mov %rdx, %rbx + mov %rcx, %r8 + + mov -8(%rdi, %rsi, 8), %rax + cmp $3, %rsi + jnc Lfirst + mov -16(%rdi, %rsi, 8), %rbp + jmp Lreduce_two + +Lfirst: + + mov 24(%r8), %r11 + mul %r11 + mov -24(%rdi, %rsi, 8), %rbp + add %rax, %rbp + mov -16(%rdi, %rsi, 8), %rax + adc %rdx, %rax + sbb %rcx, %rcx + sub $4, %rsi + jc Lreduce_three + + mov %r11, %r10 + sub %rbx, %r10 + + .align 4, 0x90 +Ltop: and %r11, %rcx + lea (%r10, %rbp), %r9 + mul %r11 + add %rbp, %rcx + mov (%rdi, %rsi, 8), %rbp + cmovc %r9, %rcx + add %rax, %rbp + mov %rcx, %rax + adc %rdx, %rax + sbb %rcx, %rcx + sub $1, %rsi + jnc Ltop + +Lreduce_three: + + and %rbx, %rcx + sub %rcx, %rax + +Lreduce_two: + mov 8(%r8), %ecx + test %ecx, %ecx + jz Lnormalized + + + mulq 16(%r8) + xor %r9, %r9 + add %rax, %rbp + adc %rdx, %r9 + mov %r9, %rax + + + + shld %cl, %rbp, %rax + + shl %cl, %rbp + jmp Ludiv + +Lnormalized: + mov %rax, %r9 + sub %rbx, %r9 + cmovnc %r9, %rax + +Ludiv: + lea 1(%rax), %r9 + mulq (%r8) + add %rbp, %rax + adc %r9, %rdx + imul %rbx, %rdx + sub %rdx, %rbp + cmp %rbp, %rax + lea (%rbx, %rbp), %rax + cmovnc %rbp, %rax + cmp %rbx, %rax + jnc Lfix +Lok: shr %cl, %rax + + pop %rbx + pop %rbp + + ret +Lfix: sub %rbx, %rax + jmp Lok + + + .align 4, 0x90 + .globl ___gmpn_mod_1_1p_cps + + +___gmpn_mod_1_1p_cps: + + + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, %ecx + mov %rsi, %r12 + mov %ecx, %ebp + sal %cl, %r12 + mov %r12, %rdi + + + + call ___gmpn_invert_limb + + neg %r12 + mov %r12, %r8 + mov %rax, (%rbx) + mov %rbp, 8(%rbx) + imul %rax, %r12 + mov %r12, 24(%rbx) + mov %ebp, %ecx + test %ecx, %ecx + jz Lz + + mov $1, %edx + + shld %cl, %rax, %rdx + + imul %rdx, %r8 + shr %cl, %r8 + mov %r8, 16(%rbx) +Lz: + pop %r12 + pop %rbx + pop %rbp + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_2.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_2.s new file mode 100644 index 0000000..9feb233 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_2.s @@ -0,0 +1,251 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_mod_1s_2p + + +___gmpn_mod_1s_2p: + + + push %r14 + test $1, %sil + mov %rdx, %r14 + push %r13 + mov %rcx, %r13 + push %r12 + push %rbp + push %rbx + mov 16(%rcx), %r10 + mov 24(%rcx), %rbx + mov 32(%rcx), %rbp + je Lb0 + dec %rsi + je Lone + mov -8(%rdi,%rsi,8), %rax + mul %r10 + mov %rax, %r9 + mov %rdx, %r8 + mov (%rdi,%rsi,8), %rax + add -16(%rdi,%rsi,8), %r9 + 
adc $0, %r8 + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + jmp L11 + +Lb0: mov -8(%rdi,%rsi,8), %r8 + mov -16(%rdi,%rsi,8), %r9 + +L11: sub $4, %rsi + jb Led2 + lea 40(%rdi,%rsi,8), %rdi + mov -40(%rdi), %r11 + mov -32(%rdi), %rax + jmp Lm0 + + .align 4, 0x90 +Ltop: mov -24(%rdi), %r9 + add %rax, %r11 + mov -16(%rdi), %rax + adc %rdx, %r12 + mul %r10 + add %rax, %r9 + mov %r11, %rax + mov %rdx, %r8 + adc $0, %r8 + mul %rbx + add %rax, %r9 + mov %r12, %rax + adc %rdx, %r8 + mul %rbp + sub $2, %rsi + jb Led1 + mov -40(%rdi), %r11 + add %rax, %r9 + mov -32(%rdi), %rax + adc %rdx, %r8 +Lm0: mul %r10 + add %rax, %r11 + mov %r9, %rax + mov %rdx, %r12 + adc $0, %r12 + mul %rbx + add %rax, %r11 + lea -32(%rdi), %rdi + mov %r8, %rax + adc %rdx, %r12 + mul %rbp + sub $2, %rsi + jae Ltop + +Led0: mov %r11, %r9 + mov %r12, %r8 +Led1: add %rax, %r9 + adc %rdx, %r8 +Led2: mov 8(%r13), %edi + mov %r8, %rax + mov %r9, %r8 + mul %r10 + add %rax, %r8 + adc $0, %rdx +L1: xor %ecx, %ecx + mov %r8, %r9 + sub %edi, %ecx + shr %cl, %r9 + mov %edi, %ecx + sal %cl, %rdx + or %rdx, %r9 + sal %cl, %r8 + mov %r9, %rax + mulq (%r13) + mov %rax, %rsi + inc %r9 + add %r8, %rsi + adc %r9, %rdx + imul %r14, %rdx + sub %rdx, %r8 + lea (%r8,%r14), %rax + cmp %r8, %rsi + cmovc %rax, %r8 + mov %r8, %rax + sub %r14, %rax + cmovc %r8, %rax + mov %edi, %ecx + shr %cl, %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + + ret +Lone: + mov (%rdi), %r8 + mov 8(%rcx), %edi + xor %rdx, %rdx + jmp L1 + + + .align 4, 0x90 + .globl ___gmpn_mod_1s_2p_cps + + +___gmpn_mod_1s_2p_cps: + + + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, %ecx + mov %rsi, %r12 + mov %ecx, %ebp + sal %cl, %r12 + mov %r12, %rdi + + + + call ___gmpn_invert_limb + + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) + mov %rbp, 8(%rbx) + neg %r8 + mov %ebp, %ecx + mov $1, %esi + + shld %cl, %rax, %rsi + + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 16(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 24(%rbx) + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr %cl, %r12 + mov %r12, 32(%rbx) + + pop %r12 + pop %rbx + pop %rbp + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_4.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_4.s new file mode 100644 index 0000000..675a4eb --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_1_4.s @@ -0,0 +1,282 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_mod_1s_4p + + +___gmpn_mod_1s_4p: + + + push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rdx, %r15 + mov %rcx, %r14 + mov 16(%rcx), %r11 + mov 24(%rcx), %rbx + mov 32(%rcx), %rbp + mov 40(%rcx), %r13 + mov 48(%rcx), %r12 + xor %r8d, %r8d + mov %esi, %edx + and $3, %edx + je Lb0 + cmp $2, %edx + jc Lb1 + je Lb2 + +Lb3: lea -24(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + jmp Lm0 + + .align 3, 0x90 +Lb0: lea -32(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + mov 24(%rdi), %rax + mul %rbp + jmp Lm0 + + .align 3, 0x90 +Lb1: lea -8(%rdi,%rsi,8), %rdi + mov (%rdi), %r9 + jmp Lm1 + + .align 3, 0x90 
+Lb2: lea -16(%rdi,%rsi,8), %rdi + mov 8(%rdi), %r8 + mov (%rdi), %r9 + jmp Lm1 + + .align 4, 0x90 +Ltop: mov -24(%rdi), %rax + mov -32(%rdi), %r10 + mul %r11 + add %rax, %r10 + mov -16(%rdi), %rax + mov $0, %ecx + adc %rdx, %rcx + mul %rbx + add %rax, %r10 + mov -8(%rdi), %rax + adc %rdx, %rcx + sub $32, %rdi + mul %rbp + add %rax, %r10 + mov %r13, %rax + adc %rdx, %rcx + mul %r9 + add %rax, %r10 + mov %r12, %rax + adc %rdx, %rcx + mul %r8 + mov %r10, %r9 + mov %rcx, %r8 +Lm0: add %rax, %r9 + adc %rdx, %r8 +Lm1: sub $4, %rsi + ja Ltop + +Lend: mov 8(%r14), %esi + mov %r8, %rax + mul %r11 + mov %rax, %r8 + add %r9, %r8 + adc $0, %rdx + xor %ecx, %ecx + sub %esi, %ecx + mov %r8, %rdi + shr %cl, %rdi + mov %esi, %ecx + sal %cl, %rdx + or %rdx, %rdi + mov %rdi, %rax + mulq (%r14) + mov %r15, %rbx + mov %rax, %r9 + sal %cl, %r8 + inc %rdi + add %r8, %r9 + adc %rdi, %rdx + imul %rbx, %rdx + sub %rdx, %r8 + lea (%r8,%rbx), %rax + cmp %r8, %r9 + cmovc %rax, %r8 + mov %r8, %rax + sub %rbx, %rax + cmovc %r8, %rax + shr %cl, %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + + + .align 4, 0x90 + .globl ___gmpn_mod_1s_4p_cps + + +___gmpn_mod_1s_4p_cps: + + + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, %ecx + mov %rsi, %r12 + mov %ecx, %ebp + sal %cl, %r12 + mov %r12, %rdi + + + + call ___gmpn_invert_limb + + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) + mov %rbp, 8(%rbx) + neg %r8 + mov %ebp, %ecx + mov $1, %esi + + shld %cl, %rax, %rsi + + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 16(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 24(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 32(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 40(%rbx) + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr %cl, %r12 + mov %r12, 48(%rbx) + + pop %r12 + pop %rbx + pop %rbp + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mod_34lsub1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_34lsub1.s new file mode 100644 index 0000000..141c810 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mod_34lsub1.s @@ -0,0 +1,246 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_mod_34lsub1 + + +___gmpn_mod_34lsub1: + + + + mov $0x0000FFFFFFFFFFFF, %r11 + + mov (%rdi), %rax + + cmp $2, %rsi + ja Lgt2 + + jb Lone + + mov 8(%rdi), %rsi + mov %rax, %rdx + shr $48, %rax + + and %r11, %rdx + add %rdx, %rax + mov %esi, %edx + + shr $32, %rsi + add %rsi, %rax + + shl $16, %rdx + add %rdx, %rax +Lone: + ret + + + + + +Lgt2: mov 8(%rdi), %rcx + mov 16(%rdi), %rdx + xor %r9, %r9 + add $24, %rdi + sub $12, %rsi + jc Lend + .align 4, 0x90 +Ltop: + add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add 24(%rdi), %rax + adc 32(%rdi), %rcx + adc 40(%rdi), %rdx + adc $0, %r9 + add 48(%rdi), %rax + adc 56(%rdi), %rcx + adc 64(%rdi), %rdx + adc $0, %r9 + add $72, %rdi + sub $9, %rsi + jnc Ltop + +Lend: + lea Ltab(%rip), %r8 + movslq 36(%r8,%rsi,4), %r10 + add %r10, 
%r8 + jmp *%r8 + + .text + .align 3, 0x90 +Ltab: .set L0_tmp, L0-Ltab + .long L0_tmp + + .set L1_tmp, L1-Ltab + .long L1_tmp + + .set L2_tmp, L2-Ltab + .long L2_tmp + + .set L3_tmp, L3-Ltab + .long L3_tmp + + .set L4_tmp, L4-Ltab + .long L4_tmp + + .set L5_tmp, L5-Ltab + .long L5_tmp + + .set L6_tmp, L6-Ltab + .long L6_tmp + + .set L7_tmp, L7-Ltab + .long L7_tmp + + .set L8_tmp, L8-Ltab + .long L8_tmp + + .text + +L6: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +L3: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + jmp Lcj1 + +L7: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +L4: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +L1: add (%rdi), %rax + adc $0, %rcx + jmp Lcj2 + +L8: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +L5: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +L2: add (%rdi), %rax + adc 8(%rdi), %rcx + +Lcj2: adc $0, %rdx +Lcj1: adc $0, %r9 +L0: add %r9, %rax + adc $0, %rcx + adc $0, %rdx + adc $0, %rax + + mov %rax, %rdi + shr $48, %rax + + and %r11, %rdi + mov %ecx, %r10d + + shr $32, %rcx + + add %rdi, %rax + movzwl %dx, %edi + shl $16, %r10 + + add %rcx, %rax + shr $16, %rdx + + add %r10, %rax + shl $32, %rdi + + add %rdx, %rax + add %rdi, %rax + + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mode1o.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mode1o.s new file mode 100644 index 0000000..c715f7c --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mode1o.s @@ -0,0 +1,189 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_modexact_1_odd + + +___gmpn_modexact_1_odd: + + + mov $0, %ecx + + + .globl ___gmpn_modexact_1c_odd + + +___gmpn_modexact_1c_odd: + + +Lent: + + + + + + mov %rdx, %r8 + shr %edx + + lea ___gmp_binvert_limb_table(%rip), %r9 + + + + and $127, %edx + mov %rcx, %r10 + + movzbl (%r9,%rdx), %edx + + mov (%rdi), %rax + lea (%rdi,%rsi,8), %r11 + mov %r8, %rdi + + lea (%rdx,%rdx), %ecx + imul %edx, %edx + + neg %rsi + + imul %edi, %edx + + sub %edx, %ecx + + lea (%rcx,%rcx), %edx + imul %ecx, %ecx + + imul %edi, %ecx + + sub %ecx, %edx + xor %ecx, %ecx + + lea (%rdx,%rdx), %r9 + imul %rdx, %rdx + + imul %r8, %rdx + + sub %rdx, %r9 + mov %r10, %rdx + + + + inc %rsi + jz Lone + + + .align 4, 0x90 +Ltop: + + + + + + + + + + sub %rdx, %rax + + adc $0, %rcx + imul %r9, %rax + + mul %r8 + + mov (%r11,%rsi,8), %rax + sub %rcx, %rax + setc %cl + + inc %rsi + jnz Ltop + + +Lone: + sub %rdx, %rax + + adc $0, %rcx + imul %r9, %rax + + mul %r8 + + lea (%rcx,%rdx), %rax + + ret + + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mp_bases.c b/vere/ext/gmp/gen/x86_64-macos/mpn/mp_bases.c new file mode 100644 index 0000000..c72c531 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mp_bases.c @@ -0,0 +1,268 @@ +/* This file generated by gen-bases.c - DO NOT EDIT. 
*/ + +#include "gmp-impl.h" + +#if GMP_NUMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +const struct bases mp_bases[257] = +{ + /* 0 */ { 0, 0, 0, 0, 0 }, + /* 1 */ { 0, 0, 0, 0, 0 }, + /* 2 */ { 64, CNST_LIMB(0xffffffffffffffff), CNST_LIMB(0x1fffffffffffffff), CNST_LIMB(0x1), CNST_LIMB(0x0) }, + /* 3 */ { 40, CNST_LIMB(0xa1849cc1a9a9e94e), CNST_LIMB(0x32b803473f7ad0f3), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 4 */ { 32, CNST_LIMB(0x7fffffffffffffff), CNST_LIMB(0x3fffffffffffffff), CNST_LIMB(0x2), CNST_LIMB(0x0) }, + /* 5 */ { 27, CNST_LIMB(0x6e40d1a4143dcb94), CNST_LIMB(0x4a4d3c25e68dc57f), CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90) }, + /* 6 */ { 24, CNST_LIMB(0x6308c91b702a7cf4), CNST_LIMB(0x52b803473f7ad0f3), CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295) }, + /* 7 */ { 22, CNST_LIMB(0x5b3064eb3aa6d388), CNST_LIMB(0x59d5d9fd5010b366), CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b) }, + /* 8 */ { 21, CNST_LIMB(0x5555555555555555), CNST_LIMB(0x5fffffffffffffff), CNST_LIMB(0x3), CNST_LIMB(0x0) }, + /* 9 */ { 20, CNST_LIMB(0x50c24e60d4d4f4a7), CNST_LIMB(0x6570068e7ef5a1e7), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 10 */ { 19, CNST_LIMB(0x4d104d427de7fbcc), CNST_LIMB(0x6a4d3c25e68dc57f), CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a) }, + /* 11 */ { 18, CNST_LIMB(0x4a00270775914e88), CNST_LIMB(0x6eb3a9f01975077f), CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b) }, + /* 12 */ { 17, CNST_LIMB(0x4768ce0d05818e12), CNST_LIMB(0x72b803473f7ad0f3), CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f) }, + /* 13 */ { 17, CNST_LIMB(0x452e53e365907bda), CNST_LIMB(0x766a008e4788cbcd), CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3) }, + /* 14 */ { 16, CNST_LIMB(0x433cfffb4b5aae55), CNST_LIMB(0x79d5d9fd5010b366), CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86) }, + /* 15 */ { 16, CNST_LIMB(0x41867711b4f85355), CNST_LIMB(0x7d053f6d26089673), CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48) }, + /* 16 */ { 16, CNST_LIMB(0x3fffffffffffffff), CNST_LIMB(0x7fffffffffffffff), CNST_LIMB(0x4), CNST_LIMB(0x0) }, + /* 17 */ { 15, CNST_LIMB(0x3ea16afd58b10966), CNST_LIMB(0x82cc7edf592262cf), CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323) }, + /* 18 */ { 15, CNST_LIMB(0x3d64598d154dc4de), CNST_LIMB(0x8570068e7ef5a1e7), CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c) }, + /* 19 */ { 15, CNST_LIMB(0x3c43c23018bb5563), CNST_LIMB(0x87ef05ae409a0288), CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec) }, + /* 20 */ { 14, CNST_LIMB(0x3b3b9a42873069c7), CNST_LIMB(0x8a4d3c25e68dc57f), CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e) }, + /* 21 */ { 14, CNST_LIMB(0x3a4898f06cf41ac9), CNST_LIMB(0x8c8ddd448f8b845a), CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239) }, + /* 22 */ { 14, CNST_LIMB(0x39680b13582e7c18), CNST_LIMB(0x8eb3a9f01975077f), CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08) }, + /* 23 */ { 14, CNST_LIMB(0x3897b2b751ae561a), CNST_LIMB(0x90c10500d63aa658), CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38) }, + /* 24 */ { 13, CNST_LIMB(0x37d5aed131f19c98), CNST_LIMB(0x92b803473f7ad0f3), CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1) }, + /* 25 */ { 13, CNST_LIMB(0x372068d20a1ee5ca), CNST_LIMB(0x949a784bcd1b8afe), CNST_LIMB(0x14adf4b7320334b9), CNST_LIMB(0x8c240c4aecb13bb5) }, + /* 26 */ { 13, CNST_LIMB(0x3676867e5d60de29), 
CNST_LIMB(0x966a008e4788cbcd), CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9) }, + /* 27 */ { 13, CNST_LIMB(0x35d6deeb388df86f), CNST_LIMB(0x982809d5be7072db), CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae) }, + /* 28 */ { 13, CNST_LIMB(0x354071d61c77fa2e), CNST_LIMB(0x99d5d9fd5010b366), CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636) }, + /* 29 */ { 13, CNST_LIMB(0x34b260c5671b18ac), CNST_LIMB(0x9b74948f5532da4b), CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d) }, + /* 30 */ { 13, CNST_LIMB(0x342be986572b45cc), CNST_LIMB(0x9d053f6d26089673), CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6) }, + /* 31 */ { 12, CNST_LIMB(0x33ac61b998fbbdf2), CNST_LIMB(0x9e88c6b3626a72aa), CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5) }, + /* 32 */ { 12, CNST_LIMB(0x3333333333333333), CNST_LIMB(0x9fffffffffffffff), CNST_LIMB(0x5), CNST_LIMB(0x0) }, + /* 33 */ { 12, CNST_LIMB(0x32bfd90114c12861), CNST_LIMB(0xa16bad3758efd873), CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f) }, + /* 34 */ { 12, CNST_LIMB(0x3251dcf6169e45f2), CNST_LIMB(0xa2cc7edf592262cf), CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec) }, + /* 35 */ { 12, CNST_LIMB(0x31e8d59f180dc630), CNST_LIMB(0xa4231623369e78e5), CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509) }, + /* 36 */ { 12, CNST_LIMB(0x3184648db8153e7a), CNST_LIMB(0xa570068e7ef5a1e7), CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295) }, + /* 37 */ { 12, CNST_LIMB(0x312434e89c35dacd), CNST_LIMB(0xa6b3d78b6d3b24fb), CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f) }, + /* 38 */ { 12, CNST_LIMB(0x30c7fa349460a541), CNST_LIMB(0xa7ef05ae409a0288), CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544) }, + /* 39 */ { 12, CNST_LIMB(0x306f6f4c8432bc6d), CNST_LIMB(0xa92203d587039cc1), CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c) }, + /* 40 */ { 12, CNST_LIMB(0x301a557ffbfdd252), CNST_LIMB(0xaa4d3c25e68dc57f), CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197) }, + /* 41 */ { 11, CNST_LIMB(0x2fc873d1fda55f3b), CNST_LIMB(0xab7110e6ce866f2b), CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae) }, + /* 42 */ { 11, CNST_LIMB(0x2f799652a4e6dc49), CNST_LIMB(0xac8ddd448f8b845a), CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7) }, + /* 43 */ { 11, CNST_LIMB(0x2f2d8d8f64460aad), CNST_LIMB(0xada3f5fb9c415052), CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94) }, + /* 44 */ { 11, CNST_LIMB(0x2ee42e164e8f53a4), CNST_LIMB(0xaeb3a9f01975077f), CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a) }, + /* 45 */ { 11, CNST_LIMB(0x2e9d500984041dbd), CNST_LIMB(0xafbd42b465836767), CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46) }, + /* 46 */ { 11, CNST_LIMB(0x2e58cec05a6a8144), CNST_LIMB(0xb0c10500d63aa658), CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083) }, + /* 47 */ { 11, CNST_LIMB(0x2e1688743ef9104c), CNST_LIMB(0xb1bf311e95d00de3), CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988) }, + /* 48 */ { 11, CNST_LIMB(0x2dd65df7a583598f), CNST_LIMB(0xb2b803473f7ad0f3), CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167) }, + /* 49 */ { 11, CNST_LIMB(0x2d9832759d5369c4), CNST_LIMB(0xb3abb3faa02166cc), CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b) }, + /* 50 */ { 11, CNST_LIMB(0x2d5beb38dcd1394c), CNST_LIMB(0xb49a784bcd1b8afe), CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962) }, + /* 51 */ { 11, CNST_LIMB(0x2d216f7943e2ba6a), 
CNST_LIMB(0xb5848226989d33c3), CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e) }, + /* 52 */ { 11, CNST_LIMB(0x2ce8a82efbb3ff2c), CNST_LIMB(0xb66a008e4788cbcd), CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d) }, + /* 53 */ { 11, CNST_LIMB(0x2cb17fea7ad7e332), CNST_LIMB(0xb74b1fd64e0753c6), CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25) }, + /* 54 */ { 11, CNST_LIMB(0x2c7be2b0cfa1ba50), CNST_LIMB(0xb82809d5be7072db), CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a) }, + /* 55 */ { 11, CNST_LIMB(0x2c47bddba92d7463), CNST_LIMB(0xb900e6160002ccfe), CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d) }, + /* 56 */ { 11, CNST_LIMB(0x2c14fffcaa8b131e), CNST_LIMB(0xb9d5d9fd5010b366), CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1) }, + /* 57 */ { 10, CNST_LIMB(0x2be398c3a38be053), CNST_LIMB(0xbaa708f58014d37c), CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1) }, + /* 58 */ { 10, CNST_LIMB(0x2bb378e758451068), CNST_LIMB(0xbb74948f5532da4b), CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c) }, + /* 59 */ { 10, CNST_LIMB(0x2b8492108be5e5f7), CNST_LIMB(0xbc3e9ca2e1a05533), CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492) }, + /* 60 */ { 10, CNST_LIMB(0x2b56d6c70d55481b), CNST_LIMB(0xbd053f6d26089673), CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5) }, + /* 61 */ { 10, CNST_LIMB(0x2b2a3a608c72ddd5), CNST_LIMB(0xbdc899ab3ff56c5e), CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416) }, + /* 62 */ { 10, CNST_LIMB(0x2afeb0f1060c7e41), CNST_LIMB(0xbe88c6b3626a72aa), CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5) }, + /* 63 */ { 10, CNST_LIMB(0x2ad42f3c9aca595c), CNST_LIMB(0xbf45e08bcf06554e), CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641) }, + /* 64 */ { 10, CNST_LIMB(0x2aaaaaaaaaaaaaaa), CNST_LIMB(0xbfffffffffffffff), CNST_LIMB(0x6), CNST_LIMB(0x0) }, + /* 65 */ { 10, CNST_LIMB(0x2a82193a13425883), CNST_LIMB(0xc0b73cb42e16914c), CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1) }, + /* 66 */ { 10, CNST_LIMB(0x2a5a717672f66450), CNST_LIMB(0xc16bad3758efd873), CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33) }, + /* 67 */ { 10, CNST_LIMB(0x2a33aa6e56d9c71c), CNST_LIMB(0xc21d6713f453f356), CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0) }, + /* 68 */ { 10, CNST_LIMB(0x2a0dbbaa3bdfcea4), CNST_LIMB(0xc2cc7edf592262cf), CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409) }, + /* 69 */ { 10, CNST_LIMB(0x29e89d244eb4bfaf), CNST_LIMB(0xc379084815b5774c), CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f) }, + /* 70 */ { 10, CNST_LIMB(0x29c44740d7db51e6), CNST_LIMB(0xc4231623369e78e5), CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62) }, + /* 71 */ { 10, CNST_LIMB(0x29a0b2c743b14d74), CNST_LIMB(0xc4caba789e2b8687), CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2) }, + /* 72 */ { 10, CNST_LIMB(0x297dd8dbb7c22a2d), CNST_LIMB(0xc570068e7ef5a1e7), CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2) }, + /* 73 */ { 10, CNST_LIMB(0x295bb2f9285c8c1b), CNST_LIMB(0xc6130af40bc0ecbf), CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b) }, + /* 74 */ { 10, CNST_LIMB(0x293a3aebe2be1c92), CNST_LIMB(0xc6b3d78b6d3b24fb), CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5) }, + /* 75 */ { 10, CNST_LIMB(0x29196acc815ebd9f), CNST_LIMB(0xc7527b930c965bf2), CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e) }, + /* 76 */ { 10, CNST_LIMB(0x28f93cfb40f5c22a), 
CNST_LIMB(0xc7ef05ae409a0288), CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd) }, + /* 77 */ { 10, CNST_LIMB(0x28d9ac1badc64117), CNST_LIMB(0xc88983ed6985bae5), CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec) }, + /* 78 */ { 10, CNST_LIMB(0x28bab310a196b478), CNST_LIMB(0xc92203d587039cc1), CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505) }, + /* 79 */ { 10, CNST_LIMB(0x289c4cf88b774469), CNST_LIMB(0xc9b892675266f66c), CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac) }, + /* 80 */ { 10, CNST_LIMB(0x287e7529fb244e91), CNST_LIMB(0xca4d3c25e68dc57f), CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d) }, + /* 81 */ { 10, CNST_LIMB(0x286127306a6a7a53), CNST_LIMB(0xcae00d1cfdeb43cf), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 82 */ { 10, CNST_LIMB(0x28445ec93f792b1e), CNST_LIMB(0xcb7110e6ce866f2b), CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f) }, + /* 83 */ { 10, CNST_LIMB(0x282817e1038950fa), CNST_LIMB(0xcc0052b18b0e2a19), CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47) }, + /* 84 */ { 10, CNST_LIMB(0x280c4e90c9ab1f45), CNST_LIMB(0xcc8ddd448f8b845a), CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a) }, + /* 85 */ { 9, CNST_LIMB(0x27f0ff1bc1ee87cd), CNST_LIMB(0xcd19bb053fb0284e), CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4) }, + /* 86 */ { 9, CNST_LIMB(0x27d625ecf571c340), CNST_LIMB(0xcda3f5fb9c415052), CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e) }, + /* 87 */ { 9, CNST_LIMB(0x27bbbf95282fcd45), CNST_LIMB(0xce2c97d694adab3f), CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89) }, + /* 88 */ { 9, CNST_LIMB(0x27a1c8c8ddaf84da), CNST_LIMB(0xceb3a9f01975077f), CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa) }, + /* 89 */ { 9, CNST_LIMB(0x27883e5e7df3f518), CNST_LIMB(0xcf393550f3aa6906), CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383) }, + /* 90 */ { 9, CNST_LIMB(0x276f1d4c9847e90e), CNST_LIMB(0xcfbd42b465836767), CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10) }, + /* 91 */ { 9, CNST_LIMB(0x275662a841b30191), CNST_LIMB(0xd03fda8b97997f33), CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43) }, + /* 92 */ { 9, CNST_LIMB(0x273e0ba38d15a47b), CNST_LIMB(0xd0c10500d63aa658), CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447) }, + /* 93 */ { 9, CNST_LIMB(0x2726158c1b13cf03), CNST_LIMB(0xd140c9faa1e5439e), CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0) }, + /* 94 */ { 9, CNST_LIMB(0x270e7dc9c01d8e9b), CNST_LIMB(0xd1bf311e95d00de3), CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4) }, + /* 95 */ { 9, CNST_LIMB(0x26f741dd3f070d61), CNST_LIMB(0xd23c41d42727c808), CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f) }, + /* 96 */ { 9, CNST_LIMB(0x26e05f5f16c2159e), CNST_LIMB(0xd2b803473f7ad0f3), CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794) }, + /* 97 */ { 9, CNST_LIMB(0x26c9d3fe61e80598), CNST_LIMB(0xd3327c6ab49ca6c8), CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc) }, + /* 98 */ { 9, CNST_LIMB(0x26b39d7fc6ddab08), CNST_LIMB(0xd3abb3faa02166cc), CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3) }, + /* 99 */ { 9, CNST_LIMB(0x269db9bc7772a5cc), CNST_LIMB(0xd423b07e986aa967), CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518) }, + /* 100 */ { 9, CNST_LIMB(0x268826a13ef3fde6), CNST_LIMB(0xd49a784bcd1b8afe), CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e) }, + /* 101 */ { 9, CNST_LIMB(0x2672e22d9dbdbd9f), 
CNST_LIMB(0xd510118708a8f8dd), CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f) }, + /* 102 */ { 9, CNST_LIMB(0x265dea72f169cc99), CNST_LIMB(0xd5848226989d33c3), CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1) }, + /* 103 */ { 9, CNST_LIMB(0x26493d93a8cb2514), CNST_LIMB(0xd5f7cff41e09aeb8), CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328) }, + /* 104 */ { 9, CNST_LIMB(0x2634d9c282f3ef82), CNST_LIMB(0xd66a008e4788cbcd), CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c) }, + /* 105 */ { 9, CNST_LIMB(0x2620bd41d8933adc), CNST_LIMB(0xd6db196a761949d9), CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179) }, + /* 106 */ { 9, CNST_LIMB(0x260ce662ef04088a), CNST_LIMB(0xd74b1fd64e0753c6), CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf) }, + /* 107 */ { 9, CNST_LIMB(0x25f95385547353fd), CNST_LIMB(0xd7ba18f93502e409), CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32) }, + /* 108 */ { 9, CNST_LIMB(0x25e60316448db8e1), CNST_LIMB(0xd82809d5be7072db), CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188) }, + /* 109 */ { 9, CNST_LIMB(0x25d2f390152f74f5), CNST_LIMB(0xd894f74b06ef8b40), CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c) }, + /* 110 */ { 9, CNST_LIMB(0x25c02379aa9ad043), CNST_LIMB(0xd900e6160002ccfe), CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f) }, + /* 111 */ { 9, CNST_LIMB(0x25ad9165f2c18907), CNST_LIMB(0xd96bdad2acb5f5ef), CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0) }, + /* 112 */ { 9, CNST_LIMB(0x259b3bf36735c90c), CNST_LIMB(0xd9d5d9fd5010b366), CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8) }, + /* 113 */ { 9, CNST_LIMB(0x258921cb955e7693), CNST_LIMB(0xda3ee7f38e181ed0), CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82) }, + /* 114 */ { 9, CNST_LIMB(0x257741a2ac9170af), CNST_LIMB(0xdaa708f58014d37c), CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a) }, + /* 115 */ { 9, CNST_LIMB(0x25659a3711bc827d), CNST_LIMB(0xdb0e4126bcc86bd7), CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5) }, + /* 116 */ { 9, CNST_LIMB(0x25542a50f84b9c39), CNST_LIMB(0xdb74948f5532da4b), CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610) }, + /* 117 */ { 9, CNST_LIMB(0x2542f0c20000377d), CNST_LIMB(0xdbda071cc67e6db5), CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c) }, + /* 118 */ { 9, CNST_LIMB(0x2531ec64d772bd64), CNST_LIMB(0xdc3e9ca2e1a05533), CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977) }, + /* 119 */ { 9, CNST_LIMB(0x25211c1ce2fb5a6e), CNST_LIMB(0xdca258dca9331635), CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e) }, + /* 120 */ { 9, CNST_LIMB(0x25107ed5e7c3ec3b), CNST_LIMB(0xdd053f6d26089673), CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5) }, + /* 121 */ { 9, CNST_LIMB(0x25001383bac8a744), CNST_LIMB(0xdd6753e032ea0efe), CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b) }, + /* 122 */ { 9, CNST_LIMB(0x24efd921f390bce3), CNST_LIMB(0xddc899ab3ff56c5e), CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5) }, + /* 123 */ { 9, CNST_LIMB(0x24dfceb3a26bb203), CNST_LIMB(0xde29142e0e01401f), CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b) }, + /* 124 */ { 9, CNST_LIMB(0x24cff3430a0341a7), CNST_LIMB(0xde88c6b3626a72aa), CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10) }, + /* 125 */ { 9, CNST_LIMB(0x24c045e15c149931), CNST_LIMB(0xdee7b471b3a9507d), CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90) }, + /* 126 */ { 9, 
CNST_LIMB(0x24b0c5a679267ae2), CNST_LIMB(0xdf45e08bcf06554e), CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8) }, + /* 127 */ { 9, CNST_LIMB(0x24a171b0b31461c8), CNST_LIMB(0xdfa34e1177c23362), CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1) }, + /* 128 */ { 9, CNST_LIMB(0x2492492492492492), CNST_LIMB(0xdfffffffffffffff), CNST_LIMB(0x7), CNST_LIMB(0x0) }, + /* 129 */ { 9, CNST_LIMB(0x24834b2c9d85cdfe), CNST_LIMB(0xe05bf942dbbc2145), CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d) }, + /* 130 */ { 9, CNST_LIMB(0x247476f924137501), CNST_LIMB(0xe0b73cb42e16914c), CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0) }, + /* 131 */ { 9, CNST_LIMB(0x2465cbc00a40cec0), CNST_LIMB(0xe111cd1d5133412e), CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085) }, + /* 132 */ { 9, CNST_LIMB(0x245748bc980e0427), CNST_LIMB(0xe16bad3758efd873), CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d) }, + /* 133 */ { 9, CNST_LIMB(0x2448ed2f49eb0633), CNST_LIMB(0xe1c4dfab90aab5ef), CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25) }, + /* 134 */ { 9, CNST_LIMB(0x243ab85da36e3167), CNST_LIMB(0xe21d6713f453f356), CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d) }, + /* 135 */ { 9, CNST_LIMB(0x242ca99203ea8c18), CNST_LIMB(0xe27545fba4fe385a), CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914) }, + /* 136 */ { 9, CNST_LIMB(0x241ec01b7cce4ea0), CNST_LIMB(0xe2cc7edf592262cf), CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849) }, + /* 137 */ { 9, CNST_LIMB(0x2410fb4da9b3b0fc), CNST_LIMB(0xe323142dc8c66b55), CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9) }, + /* 138 */ { 9, CNST_LIMB(0x24035a808a0f315e), CNST_LIMB(0xe379084815b5774c), CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0) }, + /* 139 */ { 8, CNST_LIMB(0x23f5dd105c67ab9d), CNST_LIMB(0xe3ce5d822ff4b643), CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf) }, + /* 140 */ { 8, CNST_LIMB(0x23e8825d7b05abb1), CNST_LIMB(0xe4231623369e78e5), CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05) }, + /* 141 */ { 8, CNST_LIMB(0x23db49cc3a0866fe), CNST_LIMB(0xe4773465d54aded7), CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19) }, + /* 142 */ { 8, CNST_LIMB(0x23ce32c4c6cfb9f5), CNST_LIMB(0xe4caba789e2b8687), CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe) }, + /* 143 */ { 8, CNST_LIMB(0x23c13cb308ab6ab7), CNST_LIMB(0xe51daa7e60fdd34c), CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7) }, + /* 144 */ { 8, CNST_LIMB(0x23b4670682c0c709), CNST_LIMB(0xe570068e7ef5a1e7), CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757) }, + /* 145 */ { 8, CNST_LIMB(0x23a7b13237187c8b), CNST_LIMB(0xe5c1d0b53bc09fca), CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae) }, + /* 146 */ { 8, CNST_LIMB(0x239b1aac8ac74728), CNST_LIMB(0xe6130af40bc0ecbf), CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40) }, + /* 147 */ { 8, CNST_LIMB(0x238ea2ef2b24c379), CNST_LIMB(0xe663b741df9c37c0), CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f) }, + /* 148 */ { 8, CNST_LIMB(0x23824976f4045a26), CNST_LIMB(0xe6b3d78b6d3b24fb), CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d) }, + /* 149 */ { 8, CNST_LIMB(0x23760dc3d6e4d729), CNST_LIMB(0xe7036db376537b90), CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1) }, + /* 150 */ { 8, CNST_LIMB(0x2369ef58c30bd43e), CNST_LIMB(0xe7527b930c965bf2), CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe) }, + /* 151 */ { 8, 
CNST_LIMB(0x235dedbb8e82aa1c), CNST_LIMB(0xe7a102f9d39a9331), CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e) }, + /* 152 */ { 8, CNST_LIMB(0x23520874dfeb1ffd), CNST_LIMB(0xe7ef05ae409a0288), CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc) }, + /* 153 */ { 8, CNST_LIMB(0x23463f1019228dd7), CNST_LIMB(0xe83c856dd81804b7), CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910) }, + /* 154 */ { 8, CNST_LIMB(0x233a911b42aa9b3c), CNST_LIMB(0xe88983ed6985bae5), CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614) }, + /* 155 */ { 8, CNST_LIMB(0x232efe26f7cf33f9), CNST_LIMB(0xe8d602d948f83829), CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2) }, + /* 156 */ { 8, CNST_LIMB(0x232385c65381b485), CNST_LIMB(0xe92203d587039cc1), CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d) }, + /* 157 */ { 8, CNST_LIMB(0x2318278edde1b39b), CNST_LIMB(0xe96d887e26cd57b7), CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782) }, + /* 158 */ { 8, CNST_LIMB(0x230ce3187a6c2be9), CNST_LIMB(0xe9b892675266f66c), CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b) }, + /* 159 */ { 8, CNST_LIMB(0x2301b7fd56ca21bb), CNST_LIMB(0xea03231d8d8224ba), CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd) }, + /* 160 */ { 8, CNST_LIMB(0x22f6a5d9da38341c), CNST_LIMB(0xea4d3c25e68dc57f), CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9) }, + /* 161 */ { 8, CNST_LIMB(0x22ebac4c9580d89f), CNST_LIMB(0xea96defe264b59be), CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753) }, + /* 162 */ { 8, CNST_LIMB(0x22e0caf633834beb), CNST_LIMB(0xeae00d1cfdeb43cf), CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7) }, + /* 163 */ { 8, CNST_LIMB(0x22d601796a418886), CNST_LIMB(0xeb28c7f233bdd372), CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb) }, + /* 164 */ { 8, CNST_LIMB(0x22cb4f7aec6fd8b4), CNST_LIMB(0xeb7110e6ce866f2b), CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac) }, + /* 165 */ { 8, CNST_LIMB(0x22c0b4a15b80d83e), CNST_LIMB(0xebb8e95d3f7d9df2), CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e) }, + /* 166 */ { 8, CNST_LIMB(0x22b630953a28f77a), CNST_LIMB(0xec0052b18b0e2a19), CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004) }, + /* 167 */ { 8, CNST_LIMB(0x22abc300df54ca7c), CNST_LIMB(0xec474e39705912d2), CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8) }, + /* 168 */ { 8, CNST_LIMB(0x22a16b90698da5d2), CNST_LIMB(0xec8ddd448f8b845a), CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5) }, + /* 169 */ { 8, CNST_LIMB(0x229729f1b2c83ded), CNST_LIMB(0xecd4011c8f11979a), CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe) }, + /* 170 */ { 8, CNST_LIMB(0x228cfdd444992f78), CNST_LIMB(0xed19bb053fb0284e), CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532) }, + /* 171 */ { 8, CNST_LIMB(0x2282e6e94ccb8588), CNST_LIMB(0xed5f0c3cbf8fa470), CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02) }, + /* 172 */ { 8, CNST_LIMB(0x2278e4e392557ecf), CNST_LIMB(0xeda3f5fb9c415052), CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317) }, + /* 173 */ { 8, CNST_LIMB(0x226ef7776aa7fd29), CNST_LIMB(0xede87974f3c81855), CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c) }, + /* 174 */ { 8, CNST_LIMB(0x22651e5aaf5532d0), CNST_LIMB(0xee2c97d694adab3f), CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e) }, + /* 175 */ { 8, CNST_LIMB(0x225b5944b40b4694), CNST_LIMB(0xee7052491d2c3e64), CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286) }, + /* 176 */ { 8, 
CNST_LIMB(0x2251a7ee3cdfcca5), CNST_LIMB(0xeeb3a9f01975077f), CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb) }, + /* 177 */ { 8, CNST_LIMB(0x22480a1174e913d9), CNST_LIMB(0xeef69fea211b2627), CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b) }, + /* 178 */ { 8, CNST_LIMB(0x223e7f69e522683c), CNST_LIMB(0xef393550f3aa6906), CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1) }, + /* 179 */ { 8, CNST_LIMB(0x223507b46b988abe), CNST_LIMB(0xef7b6b399471103e), CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef) }, + /* 180 */ { 8, CNST_LIMB(0x222ba2af32dbbb9e), CNST_LIMB(0xefbd42b465836767), CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3) }, + /* 181 */ { 8, CNST_LIMB(0x22225019a9b4d16c), CNST_LIMB(0xeffebccd41ffcd5c), CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb) }, + /* 182 */ { 8, CNST_LIMB(0x22190fb47b1af172), CNST_LIMB(0xf03fda8b97997f33), CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8) }, + /* 183 */ { 8, CNST_LIMB(0x220fe14186679801), CNST_LIMB(0xf0809cf27f703d52), CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005) }, + /* 184 */ { 8, CNST_LIMB(0x2206c483d7c6b786), CNST_LIMB(0xf0c10500d63aa658), CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926) }, + /* 185 */ { 8, CNST_LIMB(0x21fdb93fa0e0ccc5), CNST_LIMB(0xf10113b153c8ea7b), CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f) }, + /* 186 */ { 8, CNST_LIMB(0x21f4bf3a31bcdcaa), CNST_LIMB(0xf140c9faa1e5439e), CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6) }, + /* 187 */ { 8, CNST_LIMB(0x21ebd639f1d86584), CNST_LIMB(0xf18028cf72976a4e), CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497) }, + /* 188 */ { 8, CNST_LIMB(0x21e2fe06597361a6), CNST_LIMB(0xf1bf311e95d00de3), CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67) }, + /* 189 */ { 8, CNST_LIMB(0x21da3667eb0e8ccb), CNST_LIMB(0xf1fde3d30e812642), CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b) }, + /* 190 */ { 8, CNST_LIMB(0x21d17f282d1a300e), CNST_LIMB(0xf23c41d42727c808), CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56) }, + /* 191 */ { 8, CNST_LIMB(0x21c8d811a3d3c9e1), CNST_LIMB(0xf27a4c0585cbf805), CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0) }, + /* 192 */ { 8, CNST_LIMB(0x21c040efcb50f858), CNST_LIMB(0xf2b803473f7ad0f3), CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf) }, + /* 193 */ { 8, CNST_LIMB(0x21b7b98f11b61c1a), CNST_LIMB(0xf2f56875eb3f2614), CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a) }, + /* 194 */ { 8, CNST_LIMB(0x21af41bcd19739ba), CNST_LIMB(0xf3327c6ab49ca6c8), CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962) }, + /* 195 */ { 8, CNST_LIMB(0x21a6d9474c81adf0), CNST_LIMB(0xf36f3ffb6d916240), CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e) }, + /* 196 */ { 8, CNST_LIMB(0x219e7ffda5ad572a), CNST_LIMB(0xf3abb3faa02166cc), CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86) }, + /* 197 */ { 8, CNST_LIMB(0x219635afdcd3e46d), CNST_LIMB(0xf3e7d9379f70166a), CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca) }, + /* 198 */ { 8, CNST_LIMB(0x218dfa2ec92d0643), CNST_LIMB(0xf423b07e986aa967), CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2) }, + /* 199 */ { 8, CNST_LIMB(0x2185cd4c148e4ae2), CNST_LIMB(0xf45f3a98a20738a4), CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9) }, + /* 200 */ { 8, CNST_LIMB(0x217daeda36ad7a5c), CNST_LIMB(0xf49a784bcd1b8afe), CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6) }, + /* 
201 */ { 8, CNST_LIMB(0x21759eac708452fe), CNST_LIMB(0xf4d56a5b33cec44a), CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22) }, + /* 202 */ { 8, CNST_LIMB(0x216d9c96c7d490d4), CNST_LIMB(0xf510118708a8f8dd), CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20) }, + /* 203 */ { 8, CNST_LIMB(0x2165a86e02cb358c), CNST_LIMB(0xf54a6e8ca5438db1), CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf) }, + /* 204 */ { 8, CNST_LIMB(0x215dc207a3c20fdf), CNST_LIMB(0xf5848226989d33c3), CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9) }, + /* 205 */ { 8, CNST_LIMB(0x2155e939e51e8b37), CNST_LIMB(0xf5be4d0cb51434aa), CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff) }, + /* 206 */ { 8, CNST_LIMB(0x214e1ddbb54cd933), CNST_LIMB(0xf5f7cff41e09aeb8), CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a) }, + /* 207 */ { 8, CNST_LIMB(0x21465fc4b2d68f98), CNST_LIMB(0xf6310b8f55304840), CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd) }, + /* 208 */ { 8, CNST_LIMB(0x213eaecd2893dd60), CNST_LIMB(0xf66a008e4788cbcd), CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985) }, + /* 209 */ { 8, CNST_LIMB(0x21370ace09f681c6), CNST_LIMB(0xf6a2af9e5a0f0a08), CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7) }, + /* 210 */ { 8, CNST_LIMB(0x212f73a0ef6db7cb), CNST_LIMB(0xf6db196a761949d9), CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175) }, + /* 211 */ { 8, CNST_LIMB(0x2127e92012e25004), CNST_LIMB(0xf7133e9b156c7be5), CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e) }, + /* 212 */ { 8, CNST_LIMB(0x21206b264c4a39a7), CNST_LIMB(0xf74b1fd64e0753c6), CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef) }, + /* 213 */ { 8, CNST_LIMB(0x2118f98f0e52c28f), CNST_LIMB(0xf782bdbfdda6577b), CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9) }, + /* 214 */ { 8, CNST_LIMB(0x211194366320dc66), CNST_LIMB(0xf7ba18f93502e409), CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc) }, + /* 215 */ { 8, CNST_LIMB(0x210a3af8e926bb78), CNST_LIMB(0xf7f1322182cf15d1), CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b) }, + /* 216 */ { 8, CNST_LIMB(0x2102edb3d00e29a6), CNST_LIMB(0xf82809d5be7072db), CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295) }, + /* 217 */ { 8, CNST_LIMB(0x20fbac44d5b6edc2), CNST_LIMB(0xf85ea0b0b27b2610), CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2) }, + /* 218 */ { 8, CNST_LIMB(0x20f4768a4348ad08), CNST_LIMB(0xf894f74b06ef8b40), CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049) }, + /* 219 */ { 8, CNST_LIMB(0x20ed4c62ea57b1f0), CNST_LIMB(0xf8cb0e3b4b3bbdb3), CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada) }, + /* 220 */ { 8, CNST_LIMB(0x20e62dae221c087a), CNST_LIMB(0xf900e6160002ccfe), CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933) }, + /* 221 */ { 8, CNST_LIMB(0x20df1a4bc4ba6525), CNST_LIMB(0xf9367f6da0ab2e9c), CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d) }, + /* 222 */ { 8, CNST_LIMB(0x20d8121c2c9e506e), CNST_LIMB(0xf96bdad2acb5f5ef), CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b) }, + /* 223 */ { 8, CNST_LIMB(0x20d1150031e51549), CNST_LIMB(0xf9a0f8d3b0e04fde), CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4) }, + /* 224 */ { 8, CNST_LIMB(0x20ca22d927d8f54d), CNST_LIMB(0xf9d5d9fd5010b366), CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053) }, + /* 225 */ { 8, CNST_LIMB(0x20c33b88da7c29aa), CNST_LIMB(0xfa0a7eda4c112ce6), CNST_LIMB(0x5b27ac993df97701), 
CNST_LIMB(0x6779c7f90dc42f48) }, + /* 226 */ { 8, CNST_LIMB(0x20bc5ef18c233bdf), CNST_LIMB(0xfa3ee7f38e181ed0), CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9) }, + /* 227 */ { 8, CNST_LIMB(0x20b58cf5f31e4526), CNST_LIMB(0xfa7315d02f20c7bd), CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e) }, + /* 228 */ { 8, CNST_LIMB(0x20aec5793770a74d), CNST_LIMB(0xfaa708f58014d37c), CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b) }, + /* 229 */ { 8, CNST_LIMB(0x20a8085ef096d530), CNST_LIMB(0xfadac1e711c832d1), CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1) }, + /* 230 */ { 8, CNST_LIMB(0x20a1558b2359c4b1), CNST_LIMB(0xfb0e4126bcc86bd7), CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762) }, + /* 231 */ { 8, CNST_LIMB(0x209aace23fafa72e), CNST_LIMB(0xfb418734a9008bd9), CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555) }, + /* 232 */ { 8, CNST_LIMB(0x20940e491ea988d7), CNST_LIMB(0xfb74948f5532da4b), CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe) }, + /* 233 */ { 8, CNST_LIMB(0x208d79a5006d7a47), CNST_LIMB(0xfba769b39e49640e), CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f) }, + /* 234 */ { 8, CNST_LIMB(0x2086eedb8a3cead3), CNST_LIMB(0xfbda071cc67e6db5), CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3) }, + /* 235 */ { 8, CNST_LIMB(0x20806dd2c486dcc6), CNST_LIMB(0xfc0c6d447c5dd362), CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c) }, + /* 236 */ { 8, CNST_LIMB(0x2079f67119059fae), CNST_LIMB(0xfc3e9ca2e1a05533), CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773) }, + /* 237 */ { 8, CNST_LIMB(0x2073889d50e7bf63), CNST_LIMB(0xfc7095ae91e1c760), CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5) }, + /* 238 */ { 8, CNST_LIMB(0x206d243e9303d929), CNST_LIMB(0xfca258dca9331635), CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f) }, + /* 239 */ { 8, CNST_LIMB(0x2066c93c62170aa8), CNST_LIMB(0xfcd3e6a0ca8906c2), CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c) }, + /* 240 */ { 8, CNST_LIMB(0x2060777e9b0db0f6), CNST_LIMB(0xfd053f6d26089673), CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529) }, + /* 241 */ { 8, CNST_LIMB(0x205a2eed73563032), CNST_LIMB(0xfd3663b27f31d529), CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1) }, + /* 242 */ { 8, CNST_LIMB(0x2053ef71773d7e6a), CNST_LIMB(0xfd6753e032ea0efe), CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2) }, + /* 243 */ { 8, CNST_LIMB(0x204db8f388552ea9), CNST_LIMB(0xfd9810643d6614c3), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 244 */ { 8, CNST_LIMB(0x20478b5cdbe2bb2f), CNST_LIMB(0xfdc899ab3ff56c5e), CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6) }, + /* 245 */ { 8, CNST_LIMB(0x20416696f957cfbf), CNST_LIMB(0xfdf8f02086af2c4b), CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2) }, + /* 246 */ { 8, CNST_LIMB(0x203b4a8bb8d356e7), CNST_LIMB(0xfe29142e0e01401f), CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47) }, + /* 247 */ { 8, CNST_LIMB(0x2035372541ab0f0d), CNST_LIMB(0xfe59063c8822ce56), CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016) }, + /* 248 */ { 8, CNST_LIMB(0x202f2c4e08fd6dcc), CNST_LIMB(0xfe88c6b3626a72aa), CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18) }, + /* 249 */ { 8, CNST_LIMB(0x202929f0d04b99e9), CNST_LIMB(0xfeb855f8ca88fb0d), CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d) }, + /* 250 */ { 8, CNST_LIMB(0x20232ff8a41b45eb), CNST_LIMB(0xfee7b471b3a9507d), 
CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5) }, + /* 251 */ { 8, CNST_LIMB(0x201d3e50daa036db), CNST_LIMB(0xff16e281db76303b), CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3) }, + /* 252 */ { 8, CNST_LIMB(0x201754e5126d446d), CNST_LIMB(0xff45e08bcf06554e), CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d) }, + /* 253 */ { 8, CNST_LIMB(0x201173a1312ca135), CNST_LIMB(0xff74aef0efafadd7), CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac) }, + /* 254 */ { 8, CNST_LIMB(0x200b9a71625f3b13), CNST_LIMB(0xffa34e1177c23362), CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93) }, + /* 255 */ { 8, CNST_LIMB(0x2005c94216230568), CNST_LIMB(0xffd1be4c7f2af942), CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f) }, + /* 256 */ { 8, CNST_LIMB(0x1fffffffffffffff), CNST_LIMB(0xffffffffffffffff), CNST_LIMB(0x8), CNST_LIMB(0x0) }, +}; diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mul_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mul_1.s new file mode 100644 index 0000000..cfa791d --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mul_1.s @@ -0,0 +1,173 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_mul_1 + + +___gmpn_mul_1: + + + push %rbx + push %rbp + push %r12 + + mov %rdx, %rbp + shr $2, %rbp + + test $1, %dl + jnz Lbx1 + +Lbx0: test $2, %dl + mov %rcx, %rdx + jnz Lb10 + +Lb00: .byte 0xc4,98,179,0xf6,6 + .byte 0xc4,98,163,0xf6,86,8 + .byte 0xc4,98,243,0xf6,102,16 + lea -32(%rdi), %rdi + jmp Llo0 + +Lb10: .byte 0xc4,98,243,0xf6,38 + .byte 0xc4,226,227,0xf6,70,8 + lea -16(%rdi), %rdi + test %rbp, %rbp + jz Lcj2 + .byte 0xc4,98,179,0xf6,70,16 + lea 16(%rsi), %rsi + jmp Llo2 + +Lbx1: test $2, %dl + mov %rcx, %rdx + jnz Lb11 + +Lb01: .byte 0xc4,226,227,0xf6,6 + lea -24(%rdi), %rdi + test %rbp, %rbp + jz Lcj1 + .byte 0xc4,98,179,0xf6,70,8 + lea 8(%rsi), %rsi + jmp Llo1 + +Lb11: .byte 0xc4,98,163,0xf6,22 + .byte 0xc4,98,243,0xf6,102,8 + .byte 0xc4,226,227,0xf6,70,16 + lea -8(%rdi), %rdi + test %rbp, %rbp + jz Lcj3 + lea 24(%rsi), %rsi + jmp Llo3 + + .align 5, 0x90 +Ltop: lea 32(%rdi), %rdi + mov %r9, (%rdi) + adc %r8, %r11 +Llo3: .byte 0xc4,98,179,0xf6,6 + mov %r11, 8(%rdi) + adc %r10, %rcx +Llo2: mov %rcx, 16(%rdi) + adc %r12, %rbx +Llo1: .byte 0xc4,98,163,0xf6,86,8 + adc %rax, %r9 + .byte 0xc4,98,243,0xf6,102,16 + mov %rbx, 24(%rdi) +Llo0: .byte 0xc4,226,227,0xf6,70,24 + lea 32(%rsi), %rsi + dec %rbp + jnz Ltop + +Lend: lea 32(%rdi), %rdi + mov %r9, (%rdi) + adc %r8, %r11 +Lcj3: mov %r11, 8(%rdi) + adc %r10, %rcx +Lcj2: mov %rcx, 16(%rdi) + adc %r12, %rbx +Lcj1: mov %rbx, 24(%rdi) + adc $0, %rax + + pop %r12 + pop %rbp + pop %rbx + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mul_2.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mul_2.s new file mode 100644 index 0000000..132a72b --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mul_2.s @@ -0,0 +1,190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_mul_2 + + +___gmpn_mul_2: + + + push %rbx + push %rbp + + mov (%rcx), %r8 + mov 8(%rcx), %r9 + + lea 3(%rdx), %r11 + shr $2, %r11 + + test $1, %dl + jnz Lbx1 + +Lbx0: xor %rbx, %rbx + test $2, %dl + mov (%rsi), %rdx + .byte 0xc4,194,211,0xf6,200 + jz Llo0 + +Lb10: lea -16(%rdi), %rdi + lea -16(%rsi), %rsi + jmp Llo2 + 
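+# (annotation, inferred from the code) The .byte runs in this file are
+# VEX-encoded mulx instructions, spelled out as raw bytes so that
+# assemblers without BMI2 support can still build it.  mulx forms a
+# 64x64->128-bit product of %rdx and its source without touching the
+# flags, which is what lets it sit inside the adc carry chains here;
+# e.g. mulx %r9, %rbx, %rcx computes %rcx:%rbx = %rdx * %r9.
+# Lbx1 below is the odd-n entry: it seeds the two-limb recurrence and
+# joins the 4-way unrolled loop at Llo1 or Llo3, mirroring how
+# Lbx0/Lb10 above route the even cases to Llo0/Llo2.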
+Lbx1: xor %rbp, %rbp + test $2, %dl + mov (%rsi), %rdx + .byte 0xc4,66,227,0xf6,208 + jnz Lb11 + +Lb01: lea -24(%rdi), %rdi + lea 8(%rsi), %rsi + jmp Llo1 + +Lb11: lea -8(%rdi), %rdi + lea -8(%rsi), %rsi + jmp Llo3 + + .align 4, 0x90 +Ltop: .byte 0xc4,194,251,0xf6,217 + add %rax, %rbp + mov (%rsi), %rdx + .byte 0xc4,194,251,0xf6,200 + adc $0, %rbx + add %rax, %rbp + adc $0, %rcx + add %r10, %rbp +Llo0: mov %rbp, (%rdi) + adc $0, %rcx + .byte 0xc4,194,251,0xf6,233 + add %rax, %rbx + mov 8(%rsi), %rdx + adc $0, %rbp + .byte 0xc4,66,251,0xf6,208 + add %rax, %rbx + adc $0, %r10 + add %rcx, %rbx +Llo3: mov %rbx, 8(%rdi) + adc $0, %r10 + .byte 0xc4,194,251,0xf6,217 + add %rax, %rbp + mov 16(%rsi), %rdx + .byte 0xc4,194,251,0xf6,200 + adc $0, %rbx + add %rax, %rbp + adc $0, %rcx + add %r10, %rbp +Llo2: mov %rbp, 16(%rdi) + adc $0, %rcx + .byte 0xc4,194,251,0xf6,233 + add %rax, %rbx + mov 24(%rsi), %rdx + adc $0, %rbp + .byte 0xc4,66,251,0xf6,208 + add %rax, %rbx + adc $0, %r10 + add %rcx, %rbx + lea 32(%rsi), %rsi +Llo1: mov %rbx, 24(%rdi) + adc $0, %r10 + dec %r11 + lea 32(%rdi), %rdi + jnz Ltop + +Lend: .byte 0xc4,194,235,0xf6,193 + add %rdx, %rbp + adc $0, %rax + add %r10, %rbp + mov %rbp, (%rdi) + adc $0, %rax + + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s new file mode 100644 index 0000000..b5439c0 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mul_basecase.s @@ -0,0 +1,455 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_mul_basecase + + +___gmpn_mul_basecase: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + mov %rdx, %rbx + neg %rbx + + mov %rdx, %rbp + sar $2, %rbp + + test $1, %r8b + jz Ldo_mul_2 + + + + + mov (%rcx), %rdx + +Ldo_mul_1: + test $1, %bl + jnz Lm1x1 + +Lm1x0:test $2, %bl + jnz Lm110 + +Lm100: + .byte 0xc4,98,139,0xf6,38 + .byte 0xc4,98,163,0xf6,110,8 + lea -24(%rdi), %rdi + jmp Lm1l0 + +Lm110: + .byte 0xc4,98,147,0xf6,14 + .byte 0xc4,98,163,0xf6,118,8 + lea -8(%rdi), %rdi + test %rbp, %rbp + jz Lcj2 + .byte 0xc4,98,171,0xf6,102,16 + lea 16(%rsi), %rsi + jmp Lm1l2 + +Lm1x1:test $2, %bl + jz Lm111 + +Lm101: + .byte 0xc4,98,179,0xf6,54 + lea -16(%rdi), %rdi + test %rbp, %rbp + jz Lcj1 + .byte 0xc4,98,171,0xf6,102,8 + lea 8(%rsi), %rsi + jmp Lm1l1 + +Lm111: + .byte 0xc4,98,155,0xf6,46 + .byte 0xc4,98,171,0xf6,78,8 + .byte 0xc4,98,163,0xf6,118,16 + lea 24(%rsi), %rsi + test %rbp, %rbp + jnz Lgt3 + add %r10, %r13 + jmp Lcj3 +Lgt3: add %r10, %r13 + jmp Lm1l3 + + .align 5, 0x90 +Lm1tp:lea 32(%rdi), %rdi +Lm1l3:mov %r12, (%rdi) + .byte 0xc4,98,171,0xf6,38 +Lm1l2:mov %r13, 8(%rdi) + adc %r11, %r9 +Lm1l1:adc %r10, %r14 + mov %r9, 16(%rdi) + .byte 0xc4,98,163,0xf6,110,8 +Lm1l0:mov %r14, 24(%rdi) + .byte 0xc4,98,171,0xf6,78,16 + adc %r11, %r12 + .byte 0xc4,98,163,0xf6,118,24 + adc %r10, %r13 + lea 32(%rsi), %rsi + dec %rbp + jnz Lm1tp + +Lm1ed:lea 32(%rdi), %rdi +Lcj3: mov %r12, (%rdi) +Lcj2: mov %r13, 8(%rdi) + adc %r11, %r9 +Lcj1: mov %r9, 16(%rdi) + adc $0, %r14 + mov %r14, 24(%rdi) + + dec %r8d + jz Lret5 + + lea 8(%rcx), %rcx + lea 32(%rdi), %rdi + + + + jmp Ldo_addmul + +Ldo_mul_2: + + + + + + mov (%rcx), %r9 + mov 8(%rcx), %r14 + + lea (%rbx), %rbp + sar $2, %rbp + + test $1, %bl + jnz Lm2x1 + +Lm2x0:xor %r10, %r10 + test $2, %bl + mov (%rsi), %rdx + .byte 0xc4,66,155,0xf6,217 + jz 
Lm2l0 + +Lm210:lea -16(%rdi), %rdi + lea -16(%rsi), %rsi + jmp Lm2l2 + +Lm2x1:xor %r12, %r12 + test $2, %bl + mov (%rsi), %rdx + .byte 0xc4,66,171,0xf6,233 + jz Lm211 + +Lm201:lea -24(%rdi), %rdi + lea 8(%rsi), %rsi + jmp Lm2l1 + +Lm211:lea -8(%rdi), %rdi + lea -8(%rsi), %rsi + jmp Lm2l3 + + .align 4, 0x90 +Lm2tp:.byte 0xc4,66,251,0xf6,214 + add %rax, %r12 + mov (%rsi), %rdx + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 + add %rax, %r12 + adc $0, %r11 + add %r13, %r12 +Lm2l0:mov %r12, (%rdi) + adc $0, %r11 + .byte 0xc4,66,251,0xf6,230 + add %rax, %r10 + mov 8(%rsi), %rdx + adc $0, %r12 + .byte 0xc4,66,251,0xf6,233 + add %rax, %r10 + adc $0, %r13 + add %r11, %r10 +Lm2l3:mov %r10, 8(%rdi) + adc $0, %r13 + .byte 0xc4,66,251,0xf6,214 + add %rax, %r12 + mov 16(%rsi), %rdx + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 + add %rax, %r12 + adc $0, %r11 + add %r13, %r12 +Lm2l2:mov %r12, 16(%rdi) + adc $0, %r11 + .byte 0xc4,66,251,0xf6,230 + add %rax, %r10 + mov 24(%rsi), %rdx + adc $0, %r12 + .byte 0xc4,66,251,0xf6,233 + add %rax, %r10 + adc $0, %r13 + add %r11, %r10 + lea 32(%rsi), %rsi +Lm2l1:mov %r10, 24(%rdi) + adc $0, %r13 + inc %rbp + lea 32(%rdi), %rdi + jnz Lm2tp + +Lm2ed:.byte 0xc4,194,235,0xf6,198 + add %rdx, %r12 + adc $0, %rax + add %r13, %r12 + mov %r12, (%rdi) + adc $0, %rax + mov %rax, 8(%rdi) + + add $-2, %r8d + jz Lret5 + lea 16(%rcx), %rcx + lea 16(%rdi), %rdi + + +Ldo_addmul: + push %r15 + push %r8 + + + + + + lea (%rdi,%rbx,8), %rdi + lea (%rsi,%rbx,8), %rsi + +Louter: + mov (%rcx), %r9 + mov 8(%rcx), %r8 + + lea 2(%rbx), %rbp + sar $2, %rbp + + mov (%rsi), %rdx + test $1, %bl + jnz Lbx1 + +Lbx0: mov (%rdi), %r14 + mov 8(%rdi), %r15 + .byte 0xc4,66,251,0xf6,217 + add %rax, %r14 + .byte 0xc4,66,251,0xf6,224 + adc $0, %r11 + mov %r14, (%rdi) + add %rax, %r15 + adc $0, %r12 + mov 8(%rsi), %rdx + test $2, %bl + jnz Lb10 + +Lb00: lea 16(%rsi), %rsi + lea 16(%rdi), %rdi + jmp Llo0 + +Lb10: mov 16(%rdi), %r14 + lea 32(%rsi), %rsi + .byte 0xc4,66,251,0xf6,233 + jmp Llo2 + +Lbx1: mov (%rdi), %r15 + mov 8(%rdi), %r14 + .byte 0xc4,66,251,0xf6,233 + add %rax, %r15 + adc $0, %r13 + .byte 0xc4,66,251,0xf6,208 + add %rax, %r14 + adc $0, %r10 + mov 8(%rsi), %rdx + mov %r15, (%rdi) + .byte 0xc4,66,251,0xf6,217 + test $2, %bl + jz Lb11 + +Lb01: mov 16(%rdi), %r15 + lea 24(%rdi), %rdi + lea 24(%rsi), %rsi + jmp Llo1 + +Lb11: lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + jmp Llo3 + + .align 4, 0x90 +Ltop: .byte 0xc4,66,251,0xf6,233 + add %r10, %r15 + adc $0, %r12 +Llo2: add %rax, %r15 + adc $0, %r13 + .byte 0xc4,66,251,0xf6,208 + add %rax, %r14 + adc $0, %r10 + lea 32(%rdi), %rdi + add %r11, %r15 + mov -16(%rsi), %rdx + mov %r15, -24(%rdi) + adc $0, %r13 + add %r12, %r14 + mov -8(%rdi), %r15 + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 +Llo1: add %rax, %r14 + .byte 0xc4,66,251,0xf6,224 + adc $0, %r11 + add %r13, %r14 + mov %r14, -16(%rdi) + adc $0, %r11 + add %rax, %r15 + adc $0, %r12 + add %r10, %r15 + mov -8(%rsi), %rdx + adc $0, %r12 +Llo0: .byte 0xc4,66,251,0xf6,233 + add %rax, %r15 + adc $0, %r13 + mov (%rdi), %r14 + .byte 0xc4,66,251,0xf6,208 + add %rax, %r14 + adc $0, %r10 + add %r11, %r15 + mov %r15, -8(%rdi) + adc $0, %r13 + mov (%rsi), %rdx + add %r12, %r14 + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 +Llo3: add %rax, %r14 + adc $0, %r11 + .byte 0xc4,66,251,0xf6,224 + add %r13, %r14 + mov 8(%rdi), %r15 + mov %r14, (%rdi) + mov 16(%rdi), %r14 + adc $0, %r11 + add %rax, %r15 + adc $0, %r12 + mov 8(%rsi), %rdx + lea 32(%rsi), %rsi + inc %rbp + jnz Ltop + +Lend: .byte 0xc4,66,251,0xf6,233 + add %r10, %r15 
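+# (annotation, inferred from the code) Outer-loop tail: the final two
+# mulx products are folded into rp through these add/adc carry chains
+# and the top limbs stored at 8..24(%rdi); the multiplier count saved
+# on the stack is then lowered by 2 (each Louter pass consumes two vp
+# limbs) and up/rp are rewound via the negated size kept in %rbx.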
+ adc $0, %r12 + add %rax, %r15 + adc $0, %r13 + .byte 0xc4,194,235,0xf6,192 + add %r11, %r15 + mov %r15, 8(%rdi) + adc $0, %r13 + add %r12, %rdx + adc $0, %rax + add %r13, %rdx + mov %rdx, 16(%rdi) + adc $0, %rax + mov %rax, 24(%rdi) + + addl $-2, (%rsp) + lea 16(%rcx), %rcx + lea -16(%rsi,%rbx,8), %rsi + lea 32(%rdi,%rbx,8), %rdi + jnz Louter + + pop %rax + pop %r15 +Lret5:pop %r14 +Lret4:pop %r13 +Lret3:pop %r12 +Lret2:pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/mullo_basecase.s b/vere/ext/gmp/gen/x86_64-macos/mpn/mullo_basecase.s new file mode 100644 index 0000000..127dd3f --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/mullo_basecase.s @@ -0,0 +1,436 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_mullo_basecase + + +___gmpn_mullo_basecase: + + + + mov %rdx, %r8 + mov (%rsi), %rdx + + cmp $4, %rcx + jb Lsmall + + push %rbx + push %rbp + push %r12 + push %r13 + + mov (%r8), %r9 + mov 8(%r8), %rbx + + lea 2(%rcx), %rbp + shr $2, %rbp + neg %rcx + add $2, %rcx + + push %rsi + + test $1, %cl + jnz Lm2x1 + +Lm2x0:.byte 0xc4,66,171,0xf6,233 + xor %r12d, %r12d + test $2, %cl + jz Lm2b2 + +Lm2b0:lea -8(%rdi), %rdi + lea -8(%rsi), %rsi + jmp Lm2e0 + +Lm2b2:lea -24(%rdi), %rdi + lea 8(%rsi), %rsi + jmp Lm2e2 + +Lm2x1:.byte 0xc4,66,155,0xf6,217 + xor %r10d, %r10d + test $2, %cl + jnz Lm2b3 + +Lm2b1:jmp Lm2e1 + +Lm2b3:lea -16(%rdi), %rdi + lea -16(%rsi), %rsi + jmp Lm2e3 + + .align 4, 0x90 +Lm2tp:.byte 0xc4,98,251,0xf6,211 + add %rax, %r12 + mov (%rsi), %rdx + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 + add %rax, %r12 + adc $0, %r11 + add %r13, %r12 +Lm2e1:mov %r12, (%rdi) + adc $0, %r11 + .byte 0xc4,98,251,0xf6,227 + add %rax, %r10 + mov 8(%rsi), %rdx + adc $0, %r12 + .byte 0xc4,66,251,0xf6,233 + add %rax, %r10 + adc $0, %r13 + add %r11, %r10 +Lm2e0:mov %r10, 8(%rdi) + adc $0, %r13 + .byte 0xc4,98,251,0xf6,211 + add %rax, %r12 + mov 16(%rsi), %rdx + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 + add %rax, %r12 + adc $0, %r11 + add %r13, %r12 +Lm2e3:mov %r12, 16(%rdi) + adc $0, %r11 + .byte 0xc4,98,251,0xf6,227 + add %rax, %r10 + mov 24(%rsi), %rdx + adc $0, %r12 + .byte 0xc4,66,251,0xf6,233 + add %rax, %r10 + adc $0, %r13 + add %r11, %r10 + lea 32(%rsi), %rsi +Lm2e2:mov %r10, 24(%rdi) + adc $0, %r13 + dec %rbp + lea 32(%rdi), %rdi + jnz Lm2tp + +Lm2ed:.byte 0xc4,98,251,0xf6,211 + add %rax, %r12 + mov (%rsi), %rdx + .byte 0xc4,66,251,0xf6,217 + add %r12, %rax + add %r13, %rax + mov %rax, (%rdi) + + mov (%rsp), %rsi + lea 16(%r8), %r8 + lea 8(%rdi,%rcx,8), %rdi + add $2, %rcx + jge Lcor1 + + push %r14 + push %r15 + +Louter: + mov (%r8), %r9 + mov 8(%r8), %rbx + + lea (%rcx), %rbp + sar $2, %rbp + + mov (%rsi), %rdx + test $1, %cl + jnz Lbx1 + +Lbx0: mov (%rdi), %r15 + mov 8(%rdi), %r14 + .byte 0xc4,66,251,0xf6,233 + add %rax, %r15 + adc $0, %r13 + .byte 0xc4,98,251,0xf6,211 + add %rax, %r14 + adc $0, %r10 + mov 8(%rsi), %rdx + mov %r15, (%rdi) + .byte 0xc4,66,251,0xf6,217 + test $2, %cl + jz Lb2 + +Lb0: lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + jmp Llo0 + +Lb2: mov 16(%rdi), %r15 + lea 24(%rdi), %rdi + lea 24(%rsi), %rsi + jmp Llo2 + +Lbx1: mov (%rdi), %r14 + mov 8(%rdi), %r15 + .byte 0xc4,66,251,0xf6,217 + add %rax, %r14 + .byte 0xc4,98,251,0xf6,227 + adc $0, %r11 + mov %r14, (%rdi) + add %rax, %r15 + adc $0, %r12 + mov 8(%rsi), %rdx + test $2, %cl + jnz Lb3 + +Lb1: lea 16(%rsi), %rsi 
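+# (annotation, inferred from the code) Lb1/Lb3 are entry stubs for the
+# outer addmul loop: they advance the operand pointers as n mod 4
+# requires and join the unrolled loop at the matching Llo entry point.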
+ lea 16(%rdi), %rdi + jmp Llo1 + +Lb3: mov 16(%rdi), %r14 + lea 32(%rsi), %rsi + .byte 0xc4,66,251,0xf6,233 + inc %rbp + jz Lcj3 + jmp Llo3 + + .align 4, 0x90 +Ltop: .byte 0xc4,66,251,0xf6,233 + add %r10, %r15 + adc $0, %r12 +Llo3: add %rax, %r15 + adc $0, %r13 + .byte 0xc4,98,251,0xf6,211 + add %rax, %r14 + adc $0, %r10 + lea 32(%rdi), %rdi + add %r11, %r15 + mov -16(%rsi), %rdx + mov %r15, -24(%rdi) + adc $0, %r13 + add %r12, %r14 + mov -8(%rdi), %r15 + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 +Llo2: add %rax, %r14 + .byte 0xc4,98,251,0xf6,227 + adc $0, %r11 + add %r13, %r14 + mov %r14, -16(%rdi) + adc $0, %r11 + add %rax, %r15 + adc $0, %r12 + add %r10, %r15 + mov -8(%rsi), %rdx + adc $0, %r12 +Llo1: .byte 0xc4,66,251,0xf6,233 + add %rax, %r15 + adc $0, %r13 + mov (%rdi), %r14 + .byte 0xc4,98,251,0xf6,211 + add %rax, %r14 + adc $0, %r10 + add %r11, %r15 + mov %r15, -8(%rdi) + adc $0, %r13 + mov (%rsi), %rdx + add %r12, %r14 + .byte 0xc4,66,251,0xf6,217 + adc $0, %r10 +Llo0: add %rax, %r14 + adc $0, %r11 + .byte 0xc4,98,251,0xf6,227 + add %r13, %r14 + mov 8(%rdi), %r15 + mov %r14, (%rdi) + mov 16(%rdi), %r14 + adc $0, %r11 + add %rax, %r15 + adc $0, %r12 + mov 8(%rsi), %rdx + lea 32(%rsi), %rsi + inc %rbp + jnz Ltop + +Lend: .byte 0xc4,66,251,0xf6,233 + add %r10, %r15 + adc $0, %r12 +Lcj3: add %rax, %r15 + adc $0, %r13 + .byte 0xc4,98,251,0xf6,211 + add %rax, %r14 + add %r11, %r15 + mov -16(%rsi), %rdx + mov %r15, 8(%rdi) + adc $0, %r13 + add %r12, %r14 + .byte 0xc4,66,251,0xf6,217 + add %r14, %rax + add %r13, %rax + mov %rax, 16(%rdi) + + mov 16(%rsp), %rsi + lea 16(%r8), %r8 + lea 24(%rdi,%rcx,8), %rdi + add $2, %rcx + jl Louter + + pop %r15 + pop %r14 + + jnz Lcor0 + +Lcor1:mov (%r8), %r9 + mov 8(%r8), %rbx + mov (%rsi), %rdx + .byte 0xc4,194,155,0xf6,233 + add (%rdi), %r12 + adc %rax, %rbp + mov 8(%rsi), %r10 + imul %r9, %r10 + imul %rbx, %rdx + mov %r12, (%rdi) + add %r10, %rdx + add %rbp, %rdx + mov %rdx, 8(%rdi) + pop %rax + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + +Lcor0:mov (%r8), %r11 + imul (%rsi), %r11 + add %rax, %r11 + mov %r11, (%rdi) + pop %rax + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + + .align 4, 0x90 +Lsmall: + cmp $2, %rcx + jae Lgt1 +Ln1: imul (%r8), %rdx + mov %rdx, (%rdi) + + ret +Lgt1: ja Lgt2 +Ln2: mov (%r8), %r9 + .byte 0xc4,194,251,0xf6,209 + mov %rax, (%rdi) + mov 8(%rsi), %rax + imul %r9, %rax + add %rax, %rdx + mov 8(%r8), %r9 + mov (%rsi), %rcx + imul %r9, %rcx + add %rcx, %rdx + mov %rdx, 8(%rdi) + + ret +Lgt2: +Ln3: mov (%r8), %r9 + .byte 0xc4,66,251,0xf6,209 + mov %rax, (%rdi) + mov 8(%rsi), %rdx + .byte 0xc4,194,251,0xf6,209 + imul 16(%rsi), %r9 + add %rax, %r10 + adc %rdx, %r9 + mov 8(%r8), %r11 + mov (%rsi), %rdx + .byte 0xc4,194,251,0xf6,211 + add %rax, %r10 + adc %rdx, %r9 + imul 8(%rsi), %r11 + add %r11, %r9 + mov %r10, 8(%rdi) + mov 16(%r8), %r10 + mov (%rsi), %rax + imul %rax, %r10 + add %r10, %r9 + mov %r9, 16(%rdi) + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/nand_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/nand_n.s new file mode 100644 index 0000000..79898b8 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/nand_n.s @@ -0,0 +1,164 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_nand_n + + +___gmpn_nand_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: and 
(%rsi), %r8 + not %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: and (%rsi), %r8 + not %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 +Lb00: mov 8(%rdx), %r9 + and (%rsi), %r8 + not %r8 + and 8(%rsi), %r9 + not %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 +Le10: mov 24(%rdx), %r9 + lea 32(%rdx), %rdx + and 16(%rsi), %r8 + not %r8 + and 24(%rsi), %r9 + lea 32(%rsi), %rsi + not %r9 + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/nior_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/nior_n.s new file mode 100644 index 0000000..b8d0008 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/nior_n.s @@ -0,0 +1,164 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_nior_n + + +___gmpn_nior_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: or (%rsi), %r8 + not %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: or (%rsi), %r8 + not %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 +Lb00: mov 8(%rdx), %r9 + or (%rsi), %r8 + not %r8 + or 8(%rsi), %r9 + not %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 +Le10: mov 24(%rdx), %r9 + lea 32(%rdx), %rdx + or 16(%rsi), %r8 + not %r8 + or 24(%rsi), %r9 + lea 32(%rsi), %rsi + not %r9 + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/perfsqr.h b/vere/ext/gmp/gen/x86_64-macos/mpn/perfsqr.h new file mode 100644 index 0000000..80c5eb7 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/perfsqr.h @@ -0,0 +1,46 @@ +/* This file generated by gen-psqr.c - DO NOT EDIT. */ + +#if GMP_LIMB_BITS != 64 || GMP_NAIL_BITS != 0 +Error, error, this data is for 64 bit limb and 0 bit nail +#endif + +/* Non-zero bit indicates a quadratic residue mod 0x100. + This test identifies 82.81% as non-squares (212/256). */ +static const mp_limb_t +sq_res_0x100[4] = { + CNST_LIMB(0x202021202030213), + CNST_LIMB(0x202021202020213), + CNST_LIMB(0x202021202030212), + CNST_LIMB(0x202021202020212), +}; + +/* 2^48-1 = 3^2 * 5 * 7 * 13 * 17 * 97 ... */ +#define PERFSQR_MOD_BITS 49 + +/* This test identifies 97.81% as non-squares. 
*/ +#define PERFSQR_MOD_TEST(up, usize) \ + do { \ + mp_limb_t r; \ + PERFSQR_MOD_34 (r, up, usize); \ + \ + /* 69.23% */ \ + PERFSQR_MOD_2 (r, CNST_LIMB(91), CNST_LIMB(0xfd2fd2fd2fd3), \ + CNST_LIMB(0x2191240), CNST_LIMB(0x8850a206953820e1)); \ + \ + /* 68.24% */ \ + PERFSQR_MOD_2 (r, CNST_LIMB(85), CNST_LIMB(0xfcfcfcfcfcfd), \ + CNST_LIMB(0x82158), CNST_LIMB(0x10b48c4b4206a105)); \ + \ + /* 55.56% */ \ + PERFSQR_MOD_1 (r, CNST_LIMB( 9), CNST_LIMB(0xe38e38e38e39), \ + CNST_LIMB(0x93)); \ + \ + /* 49.48% */ \ + PERFSQR_MOD_2 (r, CNST_LIMB(97), CNST_LIMB(0xfd5c5f02a3a1), \ + CNST_LIMB(0x1eb628b47), CNST_LIMB(0x6067981b8b451b5f)); \ + } while (0) + +/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, 99.62% non-squares. */ + +/* helper for tests/mpz/t-perfsqr.c */ +#define PERFSQR_DIVISORS { 256, 91, 85, 9, 97, } diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/popcount.s b/vere/ext/gmp/gen/x86_64-macos/mpn/popcount.s new file mode 100644 index 0000000..c7695bb --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/popcount.s @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_popcount + + +___gmpn_popcount: + + + + mov %esi, %r8d + and $7, %r8d + + .byte 0xf3,0x48,0x0f,0xb8,0x07 + xor %ecx, %ecx + + lea Ltab(%rip), %r9 + + movslq (%r9,%r8,4), %r8 + add %r9, %r8 + jmp *%r8 + + +L3: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 + add $24, %rdi + sub $8, %rsi + jg Le34 + add %r10, %rax + add %r11, %rax +Ls1: + ret + +L1: sub $8, %rsi + jle Ls1 + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 + add $8, %rdi + jmp Le12 + +L7: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x08 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x10 + add $-8, %rdi + jmp Le07 + +L0: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 + .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 + jmp Le07 + +L4: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 + .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 + add $32, %rdi + sub $8, %rsi + jle Lx4 + + .align 4, 0x90 +Ltop: +Le34: .byte 0xf3,0x4c,0x0f,0xb8,0x07 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x08 + add %r10, %rcx + add %r11, %rax +Le12: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x10 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x18 + add %r8, %rcx + add %r9, %rax +Le07: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x20 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x28 + add %r10, %rcx + add %r11, %rax +Le56: .byte 0xf3,0x4c,0x0f,0xb8,0x57,0x30 + .byte 0xf3,0x4c,0x0f,0xb8,0x5f,0x38 + add $64, %rdi + add %r8, %rcx + add %r9, %rax + sub $8, %rsi + jg Ltop + +Lx4: add %r10, %rcx + add %r11, %rax +Lx2: add %rcx, %rax + + + ret + +L2: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 + sub $8, %rsi + jle Lx2 + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 + add $16, %rdi + jmp Le12 + +L5: .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x08 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x10 + add $-24, %rdi + jmp Le56 + +L6: .byte 0xf3,0x48,0x0f,0xb8,0x4f,0x08 + .byte 0xf3,0x4c,0x0f,0xb8,0x47,0x10 + .byte 0xf3,0x4c,0x0f,0xb8,0x4f,0x18 + add $-16, %rdi + jmp Le56 + + .text + .align 3, 0x90 +Ltab: .set L0_tmp, L0-Ltab + .long L0_tmp + + .set L1_tmp, L1-Ltab + .long L1_tmp + + .set L2_tmp, L2-Ltab + .long L2_tmp + + .set L3_tmp, L3-Ltab + .long L3_tmp + + .set L4_tmp, L4-Ltab + .long L4_tmp + + .set L5_tmp, L5-Ltab + .long L5_tmp + + .set L6_tmp, L6-Ltab + .long L6_tmp + + .set L7_tmp, L7-Ltab + .long L7_tmp + diff --git 
a/vere/ext/gmp/gen/x86_64-macos/mpn/redc_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/redc_1.s new file mode 100644 index 0000000..55ef11f --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/redc_1.s @@ -0,0 +1,446 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_redc_1 + + +___gmpn_redc_1: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi + mov %rdx, %rdi + mov (%rsi), %rdx + + neg %rcx + push %r8 + imul %r8, %rdx + mov %rcx, %r15 + + test $1, %cl + jnz Lbx1 + +Lbx0: test $2, %cl + jz Lo0b + + cmp $-2, %ecx + jnz Lo2 + + + mov 8(%rsp), %rbx + lea 16(%rsp), %rsp + .byte 0xc4,98,179,0xf6,39 + .byte 0xc4,98,163,0xf6,87,8 + add %r12, %r11 + adc $0, %r10 + add (%rsi), %r9 + adc 8(%rsi), %r11 + adc $0, %r10 + mov %r11, %rdx + imul %r8, %rdx + .byte 0xc4,98,147,0xf6,39 + .byte 0xc4,98,139,0xf6,127,8 + xor %eax, %eax + add %r12, %r14 + adc $0, %r15 + add %r11, %r13 + adc 16(%rsi), %r14 + adc $0, %r15 + add %r14, %r10 + adc 24(%rsi), %r15 + mov %r10, (%rbx) + mov %r15, 8(%rbx) + setc %al + jmp Lret + +Lo2: lea 2(%rcx), %r14 + .byte 0xc4,98,179,0xf6,7 + .byte 0xc4,98,163,0xf6,87,8 + sar $2, %r14 + add %r8, %r11 + jmp Llo2 + + .align 4, 0x90 +Ltp2: adc %rax, %r9 + lea 32(%rsi), %rsi + adc %r8, %r11 +Llo2: .byte 0xc4,98,147,0xf6,103,16 + mov (%rsi), %r8 + .byte 0xc4,226,227,0xf6,71,24 + lea 32(%rdi), %rdi + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 8(%rsi), %r10 + mov 16(%rsi), %r12 + add %r9, %r8 + mov 24(%rsi), %rbp + mov %r8, (%rsi) + adc %r11, %r10 + .byte 0xc4,98,179,0xf6,7 + mov %r10, 8(%rsi) + adc %r13, %r12 + mov %r12, 16(%rsi) + adc %rbx, %rbp + .byte 0xc4,98,163,0xf6,87,8 + mov %rbp, 24(%rsi) + inc %r14 + jnz Ltp2 + +Led2: mov 56(%rsi,%rcx,8), %rdx + lea 16(%rdi,%rcx,8), %rdi + adc %rax, %r9 + adc %r8, %r11 + mov 32(%rsi), %r8 + adc $0, %r10 + imul (%rsp), %rdx + mov 40(%rsi), %rax + add %r9, %r8 + mov %r8, 32(%rsi) + adc %r11, %rax + mov %rax, 40(%rsi) + lea 56(%rsi,%rcx,8), %rsi + adc $0, %r10 + mov %r10, -8(%rsi) + inc %r15 + jnz Lo2 + + jmp Lcj + + +Lbx1: test $2, %cl + jz Lo3a + +Lo1a: cmp $-1, %ecx + jnz Lo1b + + + mov 8(%rsp), %rbx + lea 16(%rsp), %rsp + .byte 0xc4,98,163,0xf6,23 + add (%rsi), %r11 + adc 8(%rsi), %r10 + mov %r10, (%rbx) + mov $0, %eax + setc %al + jmp Lret + +Lo1b: lea 24(%rdi), %rdi +Lo1: lea 1(%rcx), %r14 + .byte 0xc4,98,163,0xf6,87,232 + .byte 0xc4,98,147,0xf6,103,240 + .byte 0xc4,226,227,0xf6,71,248 + sar $2, %r14 + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (%rsi), %r10 + mov 8(%rsi), %r12 + mov 16(%rsi), %rbp + add %r11, %r10 + jmp Llo1 + + .align 4, 0x90 +Ltp1: adc %rax, %r9 + lea 32(%rsi), %rsi + adc %r8, %r11 + .byte 0xc4,98,147,0xf6,103,16 + mov -8(%rsi), %r8 + .byte 0xc4,226,227,0xf6,71,24 + lea 32(%rdi), %rdi + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (%rsi), %r10 + mov 8(%rsi), %r12 + add %r9, %r8 + mov 16(%rsi), %rbp + mov %r8, -8(%rsi) + adc %r11, %r10 +Llo1: .byte 0xc4,98,179,0xf6,7 + mov %r10, (%rsi) + adc %r13, %r12 + mov %r12, 8(%rsi) + adc %rbx, %rbp + .byte 0xc4,98,163,0xf6,87,8 + mov %rbp, 16(%rsi) + inc %r14 + jnz Ltp1 + +Led1: mov 48(%rsi,%rcx,8), %rdx + lea 40(%rdi,%rcx,8), %rdi + adc %rax, %r9 + adc %r8, %r11 + mov 24(%rsi), %r8 + adc $0, %r10 + imul (%rsp), %rdx + mov 32(%rsi), %rax + add %r9, %r8 + mov %r8, 24(%rsi) + adc %r11, %rax + mov %rax, 32(%rsi) + lea 48(%rsi,%rcx,8), %rsi + adc $0, %r10 + mov %r10, 
-8(%rsi) + inc %r15 + jnz Lo1 + + jmp Lcj + +Lo3a: cmp $-3, %ecx + jnz Lo3b + + +Ln3: .byte 0xc4,226,227,0xf6,7 + .byte 0xc4,98,179,0xf6,119,8 + add (%rsi), %rbx + .byte 0xc4,98,163,0xf6,87,16 + adc %rax, %r9 + adc %r14, %r11 + mov 8(%rsi), %r14 + mov %r8, %rdx + adc $0, %r10 + mov 16(%rsi), %rax + add %r9, %r14 + mov %r14, 8(%rsi) + .byte 0xc4,66,235,0xf6,238 + adc %r11, %rax + mov %rax, 16(%rsi) + adc $0, %r10 + mov %r10, (%rsi) + lea 8(%rsi), %rsi + inc %r15 + jnz Ln3 + + jmp Lcj + +Lo3b: lea 8(%rdi), %rdi +Lo3: lea 4(%rcx), %r14 + .byte 0xc4,226,227,0xf6,71,248 + .byte 0xc4,98,179,0xf6,7 + mov (%rsi), %rbp + .byte 0xc4,98,163,0xf6,87,8 + sar $2, %r14 + add %rbx, %rbp + nop + adc %rax, %r9 + jmp Llo3 + + .align 4, 0x90 +Ltp3: adc %rax, %r9 + lea 32(%rsi), %rsi +Llo3: adc %r8, %r11 + .byte 0xc4,98,147,0xf6,103,16 + mov 8(%rsi), %r8 + .byte 0xc4,226,227,0xf6,71,24 + lea 32(%rdi), %rdi + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 16(%rsi), %r10 + mov 24(%rsi), %r12 + add %r9, %r8 + mov 32(%rsi), %rbp + mov %r8, 8(%rsi) + adc %r11, %r10 + .byte 0xc4,98,179,0xf6,7 + mov %r10, 16(%rsi) + adc %r13, %r12 + mov %r12, 24(%rsi) + adc %rbx, %rbp + .byte 0xc4,98,163,0xf6,87,8 + mov %rbp, 32(%rsi) + inc %r14 + jnz Ltp3 + +Led3: mov 64(%rsi,%rcx,8), %rdx + lea 24(%rdi,%rcx,8), %rdi + adc %rax, %r9 + adc %r8, %r11 + mov 40(%rsi), %r8 + adc $0, %r10 + imul (%rsp), %rdx + mov 48(%rsi), %rax + add %r9, %r8 + mov %r8, 40(%rsi) + adc %r11, %rax + mov %rax, 48(%rsi) + lea 64(%rsi,%rcx,8), %rsi + adc $0, %r10 + mov %r10, -8(%rsi) + inc %r15 + jnz Lo3 + + jmp Lcj + +Lo0b: lea 16(%rdi), %rdi +Lo0: mov %rcx, %r14 + .byte 0xc4,98,147,0xf6,103,240 + .byte 0xc4,226,227,0xf6,71,248 + sar $2, %r14 + add %r12, %rbx + adc $0, %rax + mov (%rsi), %r12 + mov 8(%rsi), %rbp + .byte 0xc4,98,179,0xf6,7 + add %r13, %r12 + jmp Llo0 + + .align 4, 0x90 +Ltp0: adc %rax, %r9 + lea 32(%rsi), %rsi + adc %r8, %r11 + .byte 0xc4,98,147,0xf6,103,16 + mov -16(%rsi), %r8 + .byte 0xc4,226,227,0xf6,71,24 + lea 32(%rdi), %rdi + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov -8(%rsi), %r10 + mov (%rsi), %r12 + add %r9, %r8 + mov 8(%rsi), %rbp + mov %r8, -16(%rsi) + adc %r11, %r10 + .byte 0xc4,98,179,0xf6,7 + mov %r10, -8(%rsi) + adc %r13, %r12 + mov %r12, (%rsi) +Llo0: adc %rbx, %rbp + .byte 0xc4,98,163,0xf6,87,8 + mov %rbp, 8(%rsi) + inc %r14 + jnz Ltp0 + +Led0: mov 40(%rsi,%rcx,8), %rdx + lea 32(%rdi,%rcx,8), %rdi + adc %rax, %r9 + adc %r8, %r11 + mov 16(%rsi), %r8 + adc $0, %r10 + imul (%rsp), %rdx + mov 24(%rsi), %rax + add %r9, %r8 + mov %r8, 16(%rsi) + adc %r11, %rax + mov %rax, 24(%rsi) + lea 40(%rsi,%rcx,8), %rsi + adc $0, %r10 + mov %r10, -8(%rsi) + inc %r15 + jnz Lo0 + +Lcj: + mov 8(%rsp), %rdi + lea 16-8(%rsp), %rsp + lea (%rsi,%rcx,8), %rdx + neg %ecx + + + + + call ___gmpn_add_n + + lea 8(%rsp), %rsp + + +Lret: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh1_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh1_n.s new file mode 100644 index 0000000..824e7af --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh1_n.s @@ -0,0 +1,212 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_rsblsh1_nc + + +___gmpn_rsblsh1_nc: + + + + push %rbp + mov %r8, %rax + neg %rax + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $63, %r8, %rbp + mov %ecx, %r9d + and $3, %r9d + je Lb00 + cmp $2, 
%r9d + jc Lb01 + je Lb10 + jmp Lb11 + + + .align 4, 0x90 + .globl ___gmpn_rsblsh1_n + + +___gmpn_rsblsh1_n: + + + push %rbp + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $63, %r8, %rbp + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: mov 8(%rdx), %r9 + shrd $63, %r9, %r8 + mov 16(%rdx), %r10 + shrd $63, %r10, %r9 + add %eax, %eax + sbb (%rsi), %rbp + sbb 8(%rsi), %r8 + sbb 16(%rsi), %r9 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, %rbp + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + sbb %eax, %eax + sub $3, %rcx + ja Ltop + jmp Lend + +Lb01: add %eax, %eax + sbb (%rsi), %rbp + mov %rbp, (%rdi) + mov %r8, %rbp + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + sbb %eax, %eax + sub $1, %rcx + ja Ltop + jmp Lend + +Lb10: mov 8(%rdx), %r9 + shrd $63, %r9, %r8 + add %eax, %eax + sbb (%rsi), %rbp + sbb 8(%rsi), %r8 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, %rbp + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + sbb %eax, %eax + sub $2, %rcx + ja Ltop + jmp Lend + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 + shrd $63, %r8, %rbp +Lb00: mov 8(%rdx), %r9 + shrd $63, %r9, %r8 + mov 16(%rdx), %r10 + shrd $63, %r10, %r9 + mov 24(%rdx), %r11 + shrd $63, %r11, %r10 + lea 32(%rdx), %rdx + add %eax, %eax + sbb (%rsi), %rbp + sbb 8(%rsi), %r8 + sbb 16(%rsi), %r9 + sbb 24(%rsi), %r10 + lea 32(%rsi), %rsi + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, 24(%rdi) + mov %r11, %rbp + lea 32(%rdi), %rdi + sbb %eax, %eax + sub $4, %rcx + jnz Ltop + +Lend: shr $63, %rbp + add %eax, %eax + sbb $0, %rbp + mov %rbp, %rax + pop %rbp + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh2_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh2_n.s new file mode 100644 index 0000000..77e0be2 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh2_n.s @@ -0,0 +1,214 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_rsblsh2_nc + + +___gmpn_rsblsh2_nc: + + + + push %rbp + mov %r8, %rax + neg %rax + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $62, %r8, %rbp + mov %ecx, %r9d + and $3, %r9d + je Lb00 + cmp $2, %r9d + jc Lb01 + je Lb10 + jmp Lb11 + + + .align 4, 0x90 + .globl ___gmpn_rsblsh2_n + + +___gmpn_rsblsh2_n: + + + push %rbp + xor %ebp, %ebp + mov (%rdx), %r8 + shrd $62, %r8, %rbp + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: mov 8(%rdx), %r9 + shrd $62, %r9, %r8 + mov 16(%rdx), %r10 + shrd $62, %r10, %r9 + add %eax, %eax + sbb (%rsi), %rbp + sbb 8(%rsi), %r8 + sbb 16(%rsi), %r9 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, %rbp + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + sbb %eax, %eax + sub $3, %rcx + ja Ltop + jmp Lend + +Lb01: add %eax, %eax + sbb (%rsi), %rbp + mov %rbp, (%rdi) + mov %r8, %rbp + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + sbb %eax, %eax + sub $1, %rcx + ja Ltop + jmp Lend + +Lb10: mov 8(%rdx), %r9 + shrd $62, %r9, %r8 + add %eax, %eax + sbb (%rsi), %rbp + sbb 8(%rsi), %r8 + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, %rbp + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + sbb %eax, %eax + sub $2, %rcx + ja Ltop + jmp Lend + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 + shrd $62, %r8, %rbp +Lb00: mov 8(%rdx), %r9 + shrd $62, %r9, %r8 + mov 16(%rdx), %r10 + shrd $62, %r10, %r9 + mov 
24(%rdx), %r11 + shrd $62, %r11, %r10 + lea 32(%rdx), %rdx + add %eax, %eax + sbb (%rsi), %rbp + sbb 8(%rsi), %r8 + sbb 16(%rsi), %r9 + sbb 24(%rsi), %r10 + lea 32(%rsi), %rsi + mov %rbp, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, 24(%rdi) + mov %r11, %rbp + lea 32(%rdi), %rdi + sbb %eax, %eax + sub $4, %rcx + jnz Ltop + +Lend: shr $62, %rbp + add %eax, %eax + sbb $0, %rbp + mov %rbp, %rax + pop %rbp + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s new file mode 100644 index 0000000..329c600 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/rsblsh_n.s @@ -0,0 +1,269 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_rsblsh_n + + +___gmpn_rsblsh_n: + + + + + mov (%rdx), %r10 + + mov %ecx, %eax + shr $3, %rcx + xor %r9d, %r9d + sub %r8, %r9 + and $7, %eax + + lea Ltab(%rip), %r11 + + movslq (%r11,%rax,4), %rax + add %r11, %rax + jmp *%rax + + +L0: lea 32(%rsi), %rsi + lea 32(%rdx), %rdx + lea 32(%rdi), %rdi + xor %r11d, %r11d + jmp Le0 + +L7: mov %r10, %r11 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + xor %r10d, %r10d + jmp Le7 + +L6: lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + xor %r11d, %r11d + jmp Le6 + +L5: mov %r10, %r11 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + xor %r10d, %r10d + jmp Le5 + +Lend: sbb 24(%rsi), %rax + mov %rax, -40(%rdi) + .byte 0xc4,194,179,0xf7,195 + sbb %rcx, %rax + + ret + + .align 5, 0x90 +Ltop: jrcxz Lend + mov -32(%rdx), %r10 + sbb 24(%rsi), %rax + lea 64(%rsi), %rsi + .byte 0xc4,66,179,0xf7,219 + mov %rax, -40(%rdi) +Le0: dec %rcx + .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov -24(%rdx), %r11 + sbb -32(%rsi), %rax + .byte 0xc4,66,179,0xf7,210 + mov %rax, -32(%rdi) +Le7: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + mov -16(%rdx), %r10 + sbb -24(%rsi), %rax + .byte 0xc4,66,179,0xf7,219 + mov %rax, -24(%rdi) +Le6: .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov -8(%rdx), %r11 + sbb -16(%rsi), %rax + .byte 0xc4,66,179,0xf7,210 + mov %rax, -16(%rdi) +Le5: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + mov (%rdx), %r10 + sbb -8(%rsi), %rax + .byte 0xc4,66,179,0xf7,219 + mov %rax, -8(%rdi) +Le4: .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov 8(%rdx), %r11 + sbb (%rsi), %rax + .byte 0xc4,66,179,0xf7,210 + mov %rax, (%rdi) +Le3: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + mov 16(%rdx), %r10 + sbb 8(%rsi), %rax + .byte 0xc4,66,179,0xf7,219 + mov %rax, 8(%rdi) +Le2: .byte 0xc4,194,185,0xf7,194 + lea (%r11,%rax), %rax + mov 24(%rdx), %r11 + sbb 16(%rsi), %rax + lea 64(%rdx), %rdx + .byte 0xc4,66,179,0xf7,210 + mov %rax, 16(%rdi) + lea 64(%rdi), %rdi +Le1: .byte 0xc4,194,185,0xf7,195 + lea (%r10,%rax), %rax + jmp Ltop + +L4: xor %r11d, %r11d + jmp Le4 + +L3: mov %r10, %r11 + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + xor %r10d, %r10d + jmp Le3 + +L2: lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + xor %r11d, %r11d + jmp Le2 + +L1: mov %r10, %r11 + lea -24(%rsi), %rsi + lea 40(%rdx), %rdx + lea 40(%rdi), %rdi + xor %r10d, %r10d + jmp Le1 + + .text + .align 3, 0x90 +Ltab: .set L0_tmp, L0-Ltab + .long L0_tmp + + .set L1_tmp, L1-Ltab + .long L1_tmp + + .set L2_tmp, L2-Ltab + .long L2_tmp + + .set L3_tmp, L3-Ltab + .long L3_tmp + + .set 
L4_tmp, L4-Ltab + .long L4_tmp + + .set L5_tmp, L5-Ltab + .long L5_tmp + + .set L6_tmp, L6-Ltab + .long L6_tmp + + .set L7_tmp, L7-Ltab + .long L7_tmp + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/rsh1add_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/rsh1add_n.s new file mode 100644 index 0000000..96749ec --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/rsh1add_n.s @@ -0,0 +1,208 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + + .align 4, 0x90 + .globl ___gmpn_rsh1add_nc + + +___gmpn_rsh1add_nc: + + + + push %rbx + push %rbp + + neg %r8 + mov (%rsi), %rbp + adc (%rdx), %rbp + + jmp Lent + + + .align 4, 0x90 + .globl ___gmpn_rsh1add_n + + +___gmpn_rsh1add_n: + + + push %rbx + push %rbp + + mov (%rsi), %rbp + add (%rdx), %rbp +Lent: + sbb %ebx, %ebx + mov %ebp, %eax + and $1, %eax + + mov %ecx, %r11d + and $3, %r11d + + cmp $1, %r11d + je Ldo + +Ln1: cmp $2, %r11d + jne Ln2 + add %ebx, %ebx + mov 8(%rsi), %r10 + adc 8(%rdx), %r10 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + sbb %ebx, %ebx + + shrd $1, %r10, %rbp + mov %rbp, -8(%rdi) + jmp Lcj1 + +Ln2: cmp $3, %r11d + jne Ln3 + add %ebx, %ebx + mov 8(%rsi), %r9 + mov 16(%rsi), %r10 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + sbb %ebx, %ebx + + shrd $1, %r9, %rbp + mov %rbp, -16(%rdi) + jmp Lcj2 + +Ln3: dec %rcx + add %ebx, %ebx + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + adc 8(%rdx), %r8 + adc 16(%rdx), %r9 + mov 24(%rsi), %r10 + adc 24(%rdx), %r10 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + sbb %ebx, %ebx + + shrd $1, %r8, %rbp + mov %rbp, -24(%rdi) + shrd $1, %r9, %r8 + mov %r8, -16(%rdi) +Lcj2: shrd $1, %r10, %r9 + mov %r9, -8(%rdi) +Lcj1: mov %r10, %rbp + +Ldo: + shr $2, %rcx + je Lend + .align 4, 0x90 +Ltop: add %ebx, %ebx + + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + adc 8(%rdx), %r8 + adc 16(%rdx), %r9 + mov 24(%rsi), %r10 + mov 32(%rsi), %r11 + adc 24(%rdx), %r10 + adc 32(%rdx), %r11 + + lea 32(%rsi), %rsi + lea 32(%rdx), %rdx + + sbb %ebx, %ebx + + shrd $1, %r8, %rbp + mov %rbp, (%rdi) + shrd $1, %r9, %r8 + mov %r8, 8(%rdi) + shrd $1, %r10, %r9 + mov %r9, 16(%rdi) + shrd $1, %r11, %r10 + mov %r10, 24(%rdi) + + dec %rcx + mov %r11, %rbp + lea 32(%rdi), %rdi + jne Ltop + +Lend: shrd $1, %rbx, %rbp + mov %rbp, (%rdi) + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/rsh1sub_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/rsh1sub_n.s new file mode 100644 index 0000000..ca201b3 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/rsh1sub_n.s @@ -0,0 +1,208 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + + .align 4, 0x90 + .globl ___gmpn_rsh1sub_nc + + +___gmpn_rsh1sub_nc: + + + + push %rbx + push %rbp + + neg %r8 + mov (%rsi), %rbp + sbb (%rdx), %rbp + + jmp Lent + + + .align 4, 0x90 + .globl ___gmpn_rsh1sub_n + + +___gmpn_rsh1sub_n: + + + push %rbx + push %rbp + + mov (%rsi), %rbp + sub (%rdx), %rbp +Lent: + sbb %ebx, %ebx + mov %ebp, %eax + and $1, %eax + + mov %ecx, %r11d + and $3, %r11d + + cmp $1, %r11d + je Ldo + +Ln1: cmp $2, %r11d + jne Ln2 + add %ebx, %ebx + mov 8(%rsi), %r10 + sbb 8(%rdx), %r10 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + sbb %ebx, %ebx + + shrd $1, %r10, %rbp + mov %rbp, -8(%rdi) + jmp Lcj1 + +Ln2: cmp $3, %r11d + jne Ln3 + add %ebx, %ebx + mov 8(%rsi), %r9 + 
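+# (annotation, inferred from the code) Pre-loop step for n mod 4 == 3:
+# `add %ebx, %ebx` turns the saved borrow mask back into CF, two limb
+# pairs are subtracted with sbb, the new borrow is re-saved, and each
+# shrd shifts a result limb right one bit, taking its top bit from the
+# low bit of the limb above it.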
mov 16(%rsi), %r10 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + sbb %ebx, %ebx + + shrd $1, %r9, %rbp + mov %rbp, -16(%rdi) + jmp Lcj2 + +Ln3: dec %rcx + add %ebx, %ebx + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + sbb 8(%rdx), %r8 + sbb 16(%rdx), %r9 + mov 24(%rsi), %r10 + sbb 24(%rdx), %r10 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + sbb %ebx, %ebx + + shrd $1, %r8, %rbp + mov %rbp, -24(%rdi) + shrd $1, %r9, %r8 + mov %r8, -16(%rdi) +Lcj2: shrd $1, %r10, %r9 + mov %r9, -8(%rdi) +Lcj1: mov %r10, %rbp + +Ldo: + shr $2, %rcx + je Lend + .align 4, 0x90 +Ltop: add %ebx, %ebx + + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + sbb 8(%rdx), %r8 + sbb 16(%rdx), %r9 + mov 24(%rsi), %r10 + mov 32(%rsi), %r11 + sbb 24(%rdx), %r10 + sbb 32(%rdx), %r11 + + lea 32(%rsi), %rsi + lea 32(%rdx), %rdx + + sbb %ebx, %ebx + + shrd $1, %r8, %rbp + mov %rbp, (%rdi) + shrd $1, %r9, %r8 + mov %r8, 8(%rdi) + shrd $1, %r10, %r9 + mov %r9, 16(%rdi) + shrd $1, %r11, %r10 + mov %r10, 24(%rdi) + + dec %rcx + mov %r11, %rbp + lea 32(%rdi), %rdi + jne Ltop + +Lend: shrd $1, %rbx, %rbp + mov %rbp, (%rdi) + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s b/vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s new file mode 100644 index 0000000..7528e27 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/rshift.s @@ -0,0 +1,230 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 6, 0x90 + .globl ___gmpn_rshift + + +___gmpn_rshift: + + + movd %ecx, %xmm4 + mov $64, %eax + sub %ecx, %eax + movd %eax, %xmm5 + + neg %ecx + mov (%rsi), %rax + shl %cl, %rax + + cmp $3, %rdx + jle Lbc + + test $8, %dil + jz Lrp_aligned + + + movq (%rsi), %xmm0 + movq 8(%rsi), %xmm1 + psrlq %xmm4, %xmm0 + psllq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, (%rdi) + lea 8(%rsi), %rsi + lea 8(%rdi), %rdi + dec %rdx + +Lrp_aligned: + lea 1(%rdx), %r8d + lea (%rsi,%rdx,8), %rsi + lea (%rdi,%rdx,8), %rdi + neg %rdx + + and $6, %r8d + jz Lbu0 + cmp $4, %r8d + jz Lbu4 + jc Lbu2 +Lbu6: add $4, %rdx + jmp Li56 +Lbu0: add $6, %rdx + jmp Li70 +Lbu4: add $2, %rdx + jmp Li34 +Lbu2: add $8, %rdx + jge Lend + + .align 4, 0x90 +Ltop: movdqu -64(%rsi,%rdx,8), %xmm1 + movdqu -56(%rsi,%rdx,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -64(%rdi,%rdx,8) +Li70: + movdqu -48(%rsi,%rdx,8), %xmm1 + movdqu -40(%rsi,%rdx,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -48(%rdi,%rdx,8) +Li56: + movdqu -32(%rsi,%rdx,8), %xmm1 + movdqu -24(%rsi,%rdx,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -32(%rdi,%rdx,8) +Li34: + movdqu -16(%rsi,%rdx,8), %xmm1 + movdqu -8(%rsi,%rdx,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -16(%rdi,%rdx,8) + add $8, %rdx + jl Ltop + +Lend: test $1, %dl + jnz Le1 + + movdqu -16(%rsi), %xmm1 + movq -8(%rsi), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, -16(%rdi) + + ret + +Le1: movq -8(%rsi), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, -8(%rdi) + + ret + + + .align 4, 0x90 +Lbc: dec %edx + jnz 1f + movq (%rsi), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, (%rdi) + + ret + +1: movq (%rsi), %xmm1 + movq 8(%rsi), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (%rdi) + dec %edx + jnz 1f + movq 
8(%rsi), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 8(%rdi) + + ret + +1: movq 8(%rsi), %xmm1 + movq 16(%rsi), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(%rdi) + movq 16(%rsi), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 16(%rdi) + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sec_tabselect.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sec_tabselect.s new file mode 100644 index 0000000..46c87ed --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sec_tabselect.s @@ -0,0 +1,233 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_sec_tabselect + + +___gmpn_sec_tabselect: + + + + + + + + + + + movd %r8, %xmm8 + pshufd $0, %xmm8, %xmm8 + mov $1, %eax + movd %rax, %xmm9 + pshufd $0, %xmm9, %xmm9 + + mov %rdx, %r9 + add $-8, %r9 + js Louter_end + +Louter_top: + mov %rcx, %r10 + mov %rsi, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + .align 4, 0x90 +Ltop: movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(%rsi), %xmm2 + movdqu 16(%rsi), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + lea (%rsi,%rdx,8), %rsi + add $-1, %r10 + jne Ltop + + movdqu %xmm4, 0(%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) + movdqu %xmm7, 48(%rdi) + + lea 64(%r11), %rsi + lea 64(%rdi), %rdi + add $-8, %r9 + jns Louter_top +Louter_end: + + test $4, %dl + je Lb0xx +Lb1xx:mov %rcx, %r10 + mov %rsi, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + .align 4, 0x90 +Ltp4: movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(%rsi), %xmm2 + movdqu 16(%rsi), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + lea (%rsi,%rdx,8), %rsi + add $-1, %r10 + jne Ltp4 + movdqu %xmm4, 0(%rdi) + movdqu %xmm5, 16(%rdi) + lea 32(%r11), %rsi + lea 32(%rdi), %rdi + +Lb0xx:test $2, %dl + je Lb00x +Lb01x:mov %rcx, %r10 + mov %rsi, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + .align 4, 0x90 +Ltp2: movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(%rsi), %xmm2 + pand %xmm0, %xmm2 + por %xmm2, %xmm4 + lea (%rsi,%rdx,8), %rsi + add $-1, %r10 + jne Ltp2 + movdqu %xmm4, 0(%rdi) + lea 16(%r11), %rsi + lea 16(%rdi), %rdi + +Lb00x:test $1, %dl + je Lb000 +Lb001:mov %rcx, %r10 + mov %rsi, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + .align 4, 0x90 +Ltp1: movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movq 0(%rsi), %xmm2 + pand %xmm0, %xmm2 + por %xmm2, %xmm4 + lea (%rsi,%rdx,8), %rsi + add $-1, %r10 + jne Ltp1 + movq %xmm4, 0(%rdi) + +Lb000: + + + + + + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s new file mode 100644 index 0000000..fea3649 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_basecase.s @@ -0,0 +1,520 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_sqr_basecase + + +___gmpn_sqr_basecase: + + + + cmp $2, %rdx + jae Lgt1 + + mov (%rsi), %rdx + .byte 0xc4,226,251,0xf6,210 + mov %rax, (%rdi) + mov 
%rdx, 8(%rdi) + + ret + +Lgt1: jne Lgt2 + + mov (%rsi), %rdx + mov 8(%rsi), %rcx + .byte 0xc4,98,179,0xf6,209 + .byte 0xc4,98,251,0xf6,194 + mov %rcx, %rdx + .byte 0xc4,226,163,0xf6,210 + add %r9, %r9 + adc %r10, %r10 + adc $0, %rdx + add %r9, %r8 + adc %r11, %r10 + adc $0, %rdx + mov %rax, (%rdi) + mov %r8, 8(%rdi) + mov %r10, 16(%rdi) + mov %rdx, 24(%rdi) + + ret + +Lgt2: cmp $4, %rdx + jae Lgt3 + + + + + + mov (%rsi), %r8 + mov 8(%rsi), %rdx + mov %rdx, %r9 + .byte 0xc4,194,163,0xf6,192 + mov 16(%rsi), %rdx + .byte 0xc4,194,171,0xf6,200 + mov %r11, %r8 + add %rax, %r10 + adc $0, %rcx + .byte 0xc4,194,235,0xf6,193 + add %rcx, %rdx + mov %rdx, 24(%rdi) + adc $0, %rax + mov %rax, 32(%rdi) + xor %ecx, %ecx + mov (%rsi), %rdx + .byte 0xc4,98,251,0xf6,218 + mov %rax, (%rdi) + add %r8, %r8 + adc %r10, %r10 + setc %cl + mov 8(%rsi), %rdx + .byte 0xc4,226,251,0xf6,210 + add %r11, %r8 + adc %rax, %r10 + mov %r8, 8(%rdi) + mov %r10, 16(%rdi) + mov 24(%rdi), %r8 + mov 32(%rdi), %r10 + lea (%rdx,%rcx), %r11 + adc %r8, %r8 + adc %r10, %r10 + setc %cl + mov 16(%rsi), %rdx + .byte 0xc4,226,251,0xf6,210 + add %r11, %r8 + adc %rax, %r10 + mov %r8, 24(%rdi) + mov %r10, 32(%rdi) + adc %rcx, %rdx + mov %rdx, 40(%rdi) + + ret + +Lgt3: + + + + + + + + + + + + + +Ldo_mul_2: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + mov $0, %r12d + sub %rdx, %r12 + push %r12 + mov (%rsi), %r8 + mov 8(%rsi), %rdx + lea 2(%r12), %rcx + sar $2, %rcx + inc %r12 + mov %rdx, %r9 + + test $1, %r12b + jnz Lmx1 + +Lmx0: .byte 0xc4,66,227,0xf6,216 + mov 16(%rsi), %rdx + mov %rbx, 8(%rdi) + xor %rbx, %rbx + .byte 0xc4,194,171,0xf6,232 + test $2, %r12b + jz Lm00 + +Lm10: lea -8(%rdi), %rdi + lea -8(%rsi), %rsi + jmp Lmlo2 + +Lm00: lea 8(%rsi), %rsi + lea 8(%rdi), %rdi + jmp Lmlo0 + +Lmx1: .byte 0xc4,194,171,0xf6,232 + mov 16(%rsi), %rdx + mov %r10, 8(%rdi) + xor %r10, %r10 + .byte 0xc4,66,227,0xf6,216 + test $2, %r12b + jz Lmlo3 + +Lm01: lea 16(%rdi), %rdi + lea 16(%rsi), %rsi + jmp Lmlo1 + + .align 5, 0x90 +Lmtop:.byte 0xc4,66,251,0xf6,209 + add %rax, %rbx + mov (%rsi), %rdx + .byte 0xc4,66,251,0xf6,216 + adc $0, %r10 + add %rax, %rbx +Lmlo1:adc $0, %r11 + add %rbp, %rbx + mov %rbx, (%rdi) + adc $0, %r11 + .byte 0xc4,194,251,0xf6,217 + add %rax, %r10 + mov 8(%rsi), %rdx + adc $0, %rbx + .byte 0xc4,194,251,0xf6,232 + add %rax, %r10 + adc $0, %rbp +Lmlo0:add %r11, %r10 + mov %r10, 8(%rdi) + adc $0, %rbp + .byte 0xc4,66,251,0xf6,209 + add %rax, %rbx + mov 16(%rsi), %rdx + .byte 0xc4,66,251,0xf6,216 + adc $0, %r10 + add %rax, %rbx + adc $0, %r11 +Lmlo3:add %rbp, %rbx + mov %rbx, 16(%rdi) + adc $0, %r11 + .byte 0xc4,194,251,0xf6,217 + add %rax, %r10 + mov 24(%rsi), %rdx + adc $0, %rbx + .byte 0xc4,194,251,0xf6,232 + add %rax, %r10 + adc $0, %rbp +Lmlo2:add %r11, %r10 + lea 32(%rsi), %rsi + mov %r10, 24(%rdi) + adc $0, %rbp + inc %rcx + lea 32(%rdi), %rdi + jnz Lmtop + +Lmend:.byte 0xc4,194,235,0xf6,193 + add %rdx, %rbx + adc $0, %rax + add %rbp, %rbx + mov %rbx, (%rdi) + adc $0, %rax + mov %rax, 8(%rdi) + + lea 16(%rsi), %rsi + lea -16(%rdi), %rdi + +Ldo_addmul_2: +Louter: + lea (%rsi,%r12,8), %rsi + lea 48(%rdi,%r12,8), %rdi + + mov -8(%rsi), %r8 + + add $2, %r12 + cmp $-2, %r12 + jge Lcorner + + mov (%rsi), %r9 + + lea 1(%r12), %rcx + sar $2, %rcx + + mov %r9, %rdx + test $1, %r12b + jnz Lbx1 + +Lbx0: mov (%rdi), %r13 + mov 8(%rdi), %r14 + .byte 0xc4,66,251,0xf6,216 + add %rax, %r13 + adc $0, %r11 + mov %r13, (%rdi) + xor %rbx, %rbx + test $2, %r12b + jnz Lb10 + +Lb00: mov 8(%rsi), %rdx + lea 16(%rdi), %rdi + lea 
16(%rsi), %rsi + jmp Llo0 + +Lb10: mov 8(%rsi), %rdx + mov 16(%rdi), %r13 + lea 32(%rsi), %rsi + inc %rcx + .byte 0xc4,194,251,0xf6,232 + jz Lex + jmp Llo2 + +Lbx1: mov (%rdi), %r14 + mov 8(%rdi), %r13 + .byte 0xc4,194,251,0xf6,232 + mov 8(%rsi), %rdx + add %rax, %r14 + adc $0, %rbp + xor %r10, %r10 + mov %r14, (%rdi) + .byte 0xc4,66,251,0xf6,216 + test $2, %r12b + jz Lb11 + +Lb01: mov 16(%rdi), %r14 + lea 24(%rdi), %rdi + lea 24(%rsi), %rsi + jmp Llo1 + +Lb11: lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + jmp Llo3 + + .align 5, 0x90 +Ltop: .byte 0xc4,194,251,0xf6,232 + add %r10, %r14 + adc $0, %rbx +Llo2: add %rax, %r14 + adc $0, %rbp + .byte 0xc4,66,251,0xf6,209 + add %rax, %r13 + adc $0, %r10 + lea 32(%rdi), %rdi + add %r11, %r14 + mov -16(%rsi), %rdx + mov %r14, -24(%rdi) + adc $0, %rbp + add %rbx, %r13 + mov -8(%rdi), %r14 + .byte 0xc4,66,251,0xf6,216 + adc $0, %r10 +Llo1: add %rax, %r13 + .byte 0xc4,194,251,0xf6,217 + adc $0, %r11 + add %rbp, %r13 + mov %r13, -16(%rdi) + adc $0, %r11 + add %rax, %r14 + adc $0, %rbx + add %r10, %r14 + mov -8(%rsi), %rdx + adc $0, %rbx +Llo0: .byte 0xc4,194,251,0xf6,232 + add %rax, %r14 + adc $0, %rbp + mov (%rdi), %r13 + .byte 0xc4,66,251,0xf6,209 + add %rax, %r13 + adc $0, %r10 + add %r11, %r14 + mov %r14, -8(%rdi) + adc $0, %rbp + mov (%rsi), %rdx + add %rbx, %r13 + .byte 0xc4,66,251,0xf6,216 + adc $0, %r10 +Llo3: add %rax, %r13 + adc $0, %r11 + .byte 0xc4,194,251,0xf6,217 + add %rbp, %r13 + mov 8(%rdi), %r14 + mov %r13, (%rdi) + mov 16(%rdi), %r13 + adc $0, %r11 + add %rax, %r14 + adc $0, %rbx + mov 8(%rsi), %rdx + lea 32(%rsi), %rsi + inc %rcx + jnz Ltop + +Lend: .byte 0xc4,194,251,0xf6,232 + add %r10, %r14 + adc $0, %rbx +Lex: add %rax, %r14 + adc $0, %rbp + .byte 0xc4,194,235,0xf6,193 + add %r11, %r14 + mov %r14, 8(%rdi) + adc $0, %rbp + add %rbx, %rdx + adc $0, %rax + add %rdx, %rbp + mov %rbp, 16(%rdi) + adc $0, %rax + mov %rax, 24(%rdi) + + jmp Louter + +Lcorner: + pop %r12 + mov (%rsi), %rdx + jg Lsmall_corner + + mov %rdx, %r9 + mov (%rdi), %r13 + mov %rax, %r14 + .byte 0xc4,66,251,0xf6,216 + add %rax, %r13 + adc $0, %r11 + mov %r13, (%rdi) + mov 8(%rsi), %rdx + .byte 0xc4,194,251,0xf6,232 + add %rax, %r14 + adc $0, %rbp + .byte 0xc4,194,235,0xf6,193 + add %r11, %r14 + mov %r14, 8(%rdi) + adc $0, %rbp + add %rbp, %rdx + mov %rdx, 16(%rdi) + adc $0, %rax + mov %rax, 24(%rdi) + lea 32(%rdi), %rdi + lea 16(%rsi), %rsi + jmp Lcom + +Lsmall_corner: + .byte 0xc4,194,139,0xf6,232 + add %rax, %r14 + adc $0, %rbp + mov %r14, (%rdi) + mov %rbp, 8(%rdi) + lea 16(%rdi), %rdi + lea 8(%rsi), %rsi + +Lcom: + +Lsqr_diag_addlsh1: + lea 8(%rsi,%r12,8), %rsi + lea (%rdi,%r12,8), %rdi + lea (%rdi,%r12,8), %rdi + inc %r12 + + mov -8(%rsi), %rdx + xor %ebx, %ebx + .byte 0xc4,98,251,0xf6,210 + mov %rax, 8(%rdi) + mov 16(%rdi), %r8 + mov 24(%rdi), %r9 + jmp Ldm + + .align 4, 0x90 +Ldtop:mov 32(%rdi), %r8 + mov 40(%rdi), %r9 + lea 16(%rdi), %rdi + lea (%rdx,%rbx), %r10 +Ldm: adc %r8, %r8 + adc %r9, %r9 + setc %bl + mov (%rsi), %rdx + lea 8(%rsi), %rsi + .byte 0xc4,226,251,0xf6,210 + add %r10, %r8 + adc %rax, %r9 + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + inc %r12 + jnz Ldtop + +Ldend:adc %rbx, %rdx + mov %rdx, 32(%rdi) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_diag_addlsh1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_diag_addlsh1.s new file mode 100644 index 0000000..f82bd03 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sqr_diag_addlsh1.s @@ -0,0 +1,130 @@ + + + + + + + + + + + + + + + + + + 
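
A note on ___gmpn_sqr_basecase above: the .byte runs (0xc4,...,0xf6,...) appear to be hand-encoded mulx instructions, emitted as raw bytes for assemblers that predate the BMI2 mnemonics, and the routine accumulates only the off-diagonal products before folding in the doubled diagonal (the Lsqr_diag_addlsh1 section visible near its end). As a contract it is plain schoolbook squaring; the sketch below is a hypothetical C reference model of that contract, not a transcription of the asm's mul_2/addmul_2 schedule. The names ref_sqr_basecase and mp_limb_t, and the unsigned __int128 arithmetic, are illustrative assumptions, not GMP's API.

    /* Hypothetical reference model: 64-bit limbs, a compiler with
       unsigned __int128 (e.g. clang on x86_64). */
    typedef unsigned long long mp_limb_t;

    static void ref_sqr_basecase(mp_limb_t *rp, const mp_limb_t *up, int n)
    {
        /* rp[0 .. 2n-1] = up[0 .. n-1] squared, schoolbook O(n^2). */
        for (int i = 0; i < 2 * n; i++)
            rp[i] = 0;
        for (int i = 0; i < n; i++) {
            mp_limb_t cy = 0;
            for (int j = 0; j < n; j++) {
                unsigned __int128 t = (unsigned __int128)up[i] * up[j]
                                    + rp[i + j] + cy;
                rp[i + j] = (mp_limb_t)t;   /* low 64 bits stay in place */
                cy = (mp_limb_t)(t >> 64);  /* high 64 bits carry onward */
            }
            rp[i + n] = cy;                 /* fresh slot, still zero here */
        }
    }

The standalone sqr_diag_addlsh1 entry that follows is the helper such a split needs: it doubles the accumulated cross products and adds in the squared diagonal limbs in a single pass.
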
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_sqr_diag_addlsh1 + + +___gmpn_sqr_diag_addlsh1: + + + push %rbx + + dec %rcx + shl %rcx + + mov (%rdx), %rax + + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,4), %r11 + neg %rcx + + mul %rax + mov %rax, (%rdi,%rcx,8) + + xor %ebx, %ebx + jmp Lmid + + .align 4, 0x90 +Ltop: add %r10, %r8 + adc %rax, %r9 + mov %r8, -8(%rdi,%rcx,8) + mov %r9, (%rdi,%rcx,8) +Lmid: mov 8(%r11,%rcx,4), %rax + mov (%rsi,%rcx,8), %r8 + mov 8(%rsi,%rcx,8), %r9 + adc %r8, %r8 + adc %r9, %r9 + lea (%rdx,%rbx), %r10 + setc %bl + mul %rax + add $2, %rcx + js Ltop + +Lend: add %r10, %r8 + adc %rax, %r9 + mov %r8, -8(%rdi) + mov %r9, (%rdi) + adc %rbx, %rdx + mov %rdx, 8(%rdi) + + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err1_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err1_n.s new file mode 100644 index 0000000..68dcb90 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err1_n.s @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_sub_err1_n + + +___gmpn_sub_err1_n: + + mov 8(%rsp), %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (%rsi,%r9,8), %rsi + lea (%rdx,%r9,8), %rdx + lea (%rdi,%r9,8), %rdi + + mov %r9d, %r10d + and $3, %r10d + jz L0mod4 + cmp $2, %r10d + jc L1mod4 + jz L2mod4 +L3mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + xor %r11d, %r11d + lea -24(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + sbb (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 16(%r8), %rbx + sbb 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc 8(%r8), %r10 + mov 16(%rsi,%r9,8), %r14 + sbb 16(%rdx,%r9,8), %r14 + mov %r14, 16(%rdi,%r9,8) + cmovc (%r8), %r11 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + + add $3, %r9 + jnz Lloop + jmp Lend + + .align 4, 0x90 +L0mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea (%r8,%r9,8), %r8 + neg %r9 + jmp Lloop + + .align 4, 0x90 +L1mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea -8(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + sbb (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc (%r8), %rbx + setc %al + + add $1, %r9 + jnz Lloop + jmp Lend + + .align 4, 0x90 +L2mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + lea -16(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + sbb (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 8(%r8), %rbx + sbb 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc (%r8), %r10 + setc %al + add %r10, %rbx + adc $0, %rbp + + add $2, %r9 + jnz Lloop + jmp Lend + + .align 5, 0x90 +Lloop: + mov (%rsi,%r9,8), %r14 + shr $1, %al + mov -8(%r8), %r10 + mov $0, %r13d + sbb (%rdx,%r9,8), %r14 + cmovnc %r13, %r10 + mov %r14, (%rdi,%r9,8) + mov 8(%rsi,%r9,8), %r15 + mov 16(%rsi,%r9,8), %r14 + sbb 8(%rdx,%r9,8), %r15 + mov -16(%r8), %r11 + cmovnc %r13, %r11 + mov -24(%r8), %r12 + mov %r15, 8(%rdi,%r9,8) + sbb 16(%rdx,%r9,8), %r14 + cmovnc %r13, %r12 + mov 24(%rsi,%r9,8), %r15 + sbb 24(%rdx,%r9,8), %r15 + cmovc -32(%r8), %r13 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + add %r12, %rbx + adc $0, %rbp + lea -32(%r8), %r8 + mov %r14, 16(%rdi,%r9,8) + add %r13, %rbx + adc 
$0, %rbp + add $4, %r9 + mov %r15, -8(%rdi,%r9,8) + jnz Lloop + +Lend: + mov %rbx, (%rcx) + mov %rbp, 8(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err2_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err2_n.s new file mode 100644 index 0000000..bfa02d4 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err2_n.s @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_sub_err2_n + + +___gmpn_sub_err2_n: + + mov 16(%rsp), %rax + mov 8(%rsp), %r10 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + + sub %r8, %r9 + + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + + test $1, %r10 + jnz Lodd + + lea -8(%r8,%r10,8), %r8 + neg %r10 + jmp Ltop + + .align 4, 0x90 +Lodd: + lea -16(%r8,%r10,8), %r8 + neg %r10 + shr $1, %rax + mov (%rsi,%r10,8), %rbx + sbb (%rdx,%r10,8), %rbx + cmovc 8(%r8), %rbp + cmovc 8(%r8,%r9), %r12 + mov %rbx, (%rdi,%r10,8) + sbb %rax, %rax + inc %r10 + jz Lend + + .align 4, 0x90 +Ltop: + mov (%rsi,%r10,8), %rbx + shr $1, %rax + sbb (%rdx,%r10,8), %rbx + mov %rbx, (%rdi,%r10,8) + sbb %r14, %r14 + + mov 8(%rsi,%r10,8), %rbx + sbb 8(%rdx,%r10,8), %rbx + mov %rbx, 8(%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %r14, %rbx + add %rbx, %rbp + adc $0, %r11 + + and (%r8,%r9), %r14 + add %r14, %r12 + adc $0, %r13 + + mov -8(%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov -8(%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + add $2, %r10 + lea -16(%r8), %r8 + jnz Ltop +Lend: + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + + and $1, %eax + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err3_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err3_n.s new file mode 100644 index 0000000..fcccfe3 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_err3_n.s @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_sub_err3_n + + +___gmpn_sub_err3_n: + + mov 24(%rsp), %rax + mov 16(%rsp), %r10 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + push %rcx + mov 64(%rsp), %rcx + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + + sub %r8, %r9 + sub %r8, %rcx + + lea -8(%r8,%r10,8), %r8 + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + neg %r10 + + .align 4, 0x90 +Ltop: + shr $1, %rax + mov (%rsi,%r10,8), %rax + sbb (%rdx,%r10,8), %rax + mov %rax, (%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov (%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + mov (%r8,%rcx), %rbx + and %rax, %rbx + add %rbx, %r14 + adc $0, %r15 + + lea -8(%r8), %r8 + inc %r10 + jnz Ltop + +Lend: + and $1, %eax + pop %rcx + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + mov %r14, 32(%rcx) + mov %r15, 40(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + diff --git 
a/vere/ext/gmp/gen/x86_64-macos/mpn/sub_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_n.s new file mode 100644 index 0000000..f9868bb --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sub_n.s @@ -0,0 +1,289 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_sub_nc + + +___gmpn_sub_nc: + + + + + mov %ecx, %eax + shr $3, %rcx + and $7, %eax + + lea Ltab(%rip), %r9 + neg %r8 + + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax + jmp *%rax + + + + .align 4, 0x90 + .globl ___gmpn_sub_n + + +___gmpn_sub_n: + + + + mov %ecx, %eax + shr $3, %rcx + and $7, %eax + + lea Ltab(%rip), %r9 + + movslq (%r9,%rax,4), %rax + lea (%r9,%rax), %rax + jmp *%rax + + +L0: mov (%rsi), %r8 + mov 8(%rsi), %r9 + sbb (%rdx), %r8 + jmp Le0 + +L4: mov (%rsi), %r8 + mov 8(%rsi), %r9 + sbb (%rdx), %r8 + lea -32(%rsi), %rsi + lea -32(%rdx), %rdx + lea -32(%rdi), %rdi + inc %rcx + jmp Le4 + +L5: mov (%rsi), %r11 + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + sbb (%rdx), %r11 + lea -24(%rsi), %rsi + lea -24(%rdx), %rdx + lea -24(%rdi), %rdi + inc %rcx + jmp Le5 + +L6: mov (%rsi), %r10 + sbb (%rdx), %r10 + mov 8(%rsi), %r11 + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + inc %rcx + jmp Le6 + +L7: mov (%rsi), %r9 + mov 8(%rsi), %r10 + sbb (%rdx), %r9 + sbb 8(%rdx), %r10 + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + inc %rcx + jmp Le7 + + .align 4, 0x90 +Ltop: +Le3: mov %r9, 40(%rdi) +Le2: mov %r10, 48(%rdi) +Le1: mov (%rsi), %r8 + mov 8(%rsi), %r9 + sbb (%rdx), %r8 + mov %r11, 56(%rdi) + lea 64(%rdi), %rdi +Le0: mov 16(%rsi), %r10 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + mov %r8, (%rdi) +Le7: mov 24(%rsi), %r11 + mov %r9, 8(%rdi) +Le6: mov 32(%rsi), %r8 + mov 40(%rsi), %r9 + sbb 24(%rdx), %r11 + mov %r10, 16(%rdi) +Le5: sbb 32(%rdx), %r8 + mov %r11, 24(%rdi) +Le4: mov 48(%rsi), %r10 + mov 56(%rsi), %r11 + mov %r8, 32(%rdi) + lea 64(%rsi), %rsi + sbb 40(%rdx), %r9 + sbb 48(%rdx), %r10 + sbb 56(%rdx), %r11 + lea 64(%rdx), %rdx + dec %rcx + jnz Ltop + +Lend: mov %r9, 40(%rdi) + mov %r10, 48(%rdi) + mov %r11, 56(%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .align 4, 0x90 +L3: mov (%rsi), %r9 + mov 8(%rsi), %r10 + mov 16(%rsi), %r11 + sbb (%rdx), %r9 + sbb 8(%rdx), %r10 + sbb 16(%rdx), %r11 + jrcxz Lx3 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea -40(%rdi), %rdi + jmp Le3 +Lx3: mov %r9, (%rdi) + mov %r10, 8(%rdi) + mov %r11, 16(%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .align 4, 0x90 +L1: mov (%rsi), %r11 + sbb (%rdx), %r11 + jrcxz Lx1 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea -56(%rdi), %rdi + jmp Le1 +Lx1: mov %r11, (%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .align 4, 0x90 +L2: mov (%rsi), %r10 + mov 8(%rsi), %r11 + sbb (%rdx), %r10 + sbb 8(%rdx), %r11 + jrcxz Lx2 + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea -48(%rdi), %rdi + jmp Le2 +Lx2: mov %r10, (%rdi) + mov %r11, 8(%rdi) + mov %ecx, %eax + adc %ecx, %eax + + ret + + .text + .align 3, 0x90 +Ltab: .set L0_tmp, L0-Ltab + .long L0_tmp + + .set L1_tmp, L1-Ltab + .long L1_tmp + + .set L2_tmp, L2-Ltab + .long L2_tmp + + .set L3_tmp, L3-Ltab + .long L3_tmp + + .set L4_tmp, L4-Ltab + .long L4_tmp + + .set L5_tmp, L5-Ltab + .long L5_tmp + + .set L6_tmp, L6-Ltab + .long L6_tmp + + .set L7_tmp, L7-Ltab + .long L7_tmp + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sublsh1_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sublsh1_n.s new file mode 100644 index 
0000000..40b2b48 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sublsh1_n.s @@ -0,0 +1,190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 3, 0x90 + .globl ___gmpn_sublsh1_n + + +___gmpn_sublsh1_n: + + + push %rbx + push %r12 + + mov %ecx, %eax + lea 24(%rsi,%rcx,8), %rsi + lea 24(%rdx,%rcx,8), %rdx + lea 24(%rdi,%rcx,8), %rdi + neg %rcx + + xor %r11d, %r11d + + mov -24(%rdx,%rcx,8), %r8 + shrd $63, %r8, %r11 + + and $3, %eax + je Lb0 + cmp $2, %eax + jc Lb1 + je Lb2 + +Lb3: mov -16(%rdx,%rcx,8), %r9 + shrd $63, %r9, %r8 + mov -8(%rdx,%rcx,8), %r10 + shrd $63, %r10, %r9 + mov -24(%rsi,%rcx,8), %r12 + sub %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + mov -16(%rsi,%rcx,8), %r12 + sbb %r8, %r12 + mov %r12, -16(%rdi,%rcx,8) + mov -8(%rsi,%rcx,8), %r12 + sbb %r9, %r12 + mov %r12, -8(%rdi,%rcx,8) + mov %r10, %r11 + sbb %eax, %eax + add $3, %rcx + js Ltop + jmp Lend + +Lb1: mov -24(%rsi,%rcx,8), %r12 + sub %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + mov %r8, %r11 + sbb %eax, %eax + inc %rcx + js Ltop + jmp Lend + +Lb2: mov -16(%rdx,%rcx,8), %r9 + shrd $63, %r9, %r8 + mov -24(%rsi,%rcx,8), %r12 + sub %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + mov -16(%rsi,%rcx,8), %r12 + sbb %r8, %r12 + mov %r12, -16(%rdi,%rcx,8) + mov %r9, %r11 + sbb %eax, %eax + add $2, %rcx + js Ltop + jmp Lend + + .align 4, 0x90 +Ltop: mov -24(%rdx,%rcx,8), %r8 + shrd $63, %r8, %r11 +Lb0: mov -16(%rdx,%rcx,8), %r9 + shrd $63, %r9, %r8 + mov -8(%rdx,%rcx,8), %r10 + shrd $63, %r10, %r9 + mov (%rdx,%rcx,8), %rbx + shrd $63, %rbx, %r10 + + add %eax, %eax + + mov -24(%rsi,%rcx,8), %r12 + sbb %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + + mov -16(%rsi,%rcx,8), %r12 + sbb %r8, %r12 + mov %r12, -16(%rdi,%rcx,8) + + mov -8(%rsi,%rcx,8), %r12 + sbb %r9, %r12 + mov %r12, -8(%rdi,%rcx,8) + + mov (%rsi,%rcx,8), %r12 + sbb %r10, %r12 + mov %r12, (%rdi,%rcx,8) + + mov %rbx, %r11 + sbb %eax, %eax + + add $4, %rcx + js Ltop + +Lend: shr $63, %r11 + pop %r12 + pop %rbx + sub %r11d, %eax + neg %eax + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/sublsh2_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/sublsh2_n.s new file mode 100644 index 0000000..8e62cfe --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/sublsh2_n.s @@ -0,0 +1,190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 3, 0x90 + .globl ___gmpn_sublsh2_n + + +___gmpn_sublsh2_n: + + + push %rbx + push %r12 + + mov %ecx, %eax + lea 24(%rsi,%rcx,8), %rsi + lea 24(%rdx,%rcx,8), %rdx + lea 24(%rdi,%rcx,8), %rdi + neg %rcx + + xor %r11d, %r11d + + mov -24(%rdx,%rcx,8), %r8 + shrd $62, %r8, %r11 + + and $3, %eax + je Lb0 + cmp $2, %eax + jc Lb1 + je Lb2 + +Lb3: mov -16(%rdx,%rcx,8), %r9 + shrd $62, %r9, %r8 + mov -8(%rdx,%rcx,8), %r10 + shrd $62, %r10, %r9 + mov -24(%rsi,%rcx,8), %r12 + sub %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + mov -16(%rsi,%rcx,8), %r12 + sbb %r8, %r12 + mov %r12, -16(%rdi,%rcx,8) + mov -8(%rsi,%rcx,8), %r12 + sbb %r9, %r12 + mov %r12, -8(%rdi,%rcx,8) + mov %r10, %r11 + sbb %eax, %eax + add $3, %rcx + js Ltop + jmp Lend + +Lb1: mov -24(%rsi,%rcx,8), %r12 + sub %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + mov %r8, %r11 + sbb %eax, %eax + inc %rcx + js Ltop + jmp Lend + +Lb2: mov -16(%rdx,%rcx,8), %r9 + shrd $62, %r9, %r8 + mov -24(%rsi,%rcx,8), %r12 + sub %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + mov -16(%rsi,%rcx,8), %r12 + 
sbb %r8, %r12 + mov %r12, -16(%rdi,%rcx,8) + mov %r9, %r11 + sbb %eax, %eax + add $2, %rcx + js Ltop + jmp Lend + + .align 4, 0x90 +Ltop: mov -24(%rdx,%rcx,8), %r8 + shrd $62, %r8, %r11 +Lb0: mov -16(%rdx,%rcx,8), %r9 + shrd $62, %r9, %r8 + mov -8(%rdx,%rcx,8), %r10 + shrd $62, %r10, %r9 + mov (%rdx,%rcx,8), %rbx + shrd $62, %rbx, %r10 + + add %eax, %eax + + mov -24(%rsi,%rcx,8), %r12 + sbb %r11, %r12 + mov %r12, -24(%rdi,%rcx,8) + + mov -16(%rsi,%rcx,8), %r12 + sbb %r8, %r12 + mov %r12, -16(%rdi,%rcx,8) + + mov -8(%rsi,%rcx,8), %r12 + sbb %r9, %r12 + mov %r12, -8(%rdi,%rcx,8) + + mov (%rsi,%rcx,8), %r12 + sbb %r10, %r12 + mov %r12, (%rdi,%rcx,8) + + mov %rbx, %r11 + sbb %eax, %eax + + add $4, %rcx + js Ltop + +Lend: shr $62, %r11 + pop %r12 + pop %rbx + sub %r11d, %eax + neg %eax + + ret + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/submul_1.s b/vere/ext/gmp/gen/x86_64-macos/mpn/submul_1.s new file mode 100644 index 0000000..37fcb54 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/submul_1.s @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 4, 0x90 + .globl ___gmpn_submul_1 + + +___gmpn_submul_1: + + + push %rbx + push %rbp + push %r12 + push %r13 + + mov %rdx, %rbp + mov %rcx, %rdx + + test $1, %bpl + jnz Lbx1 + +Lbx0: shr $2, %rbp + jc Lb10 + +Lb00: .byte 0xc4,98,147,0xf6,38 + .byte 0xc4,226,227,0xf6,70,8 + add %r12, %rbx + adc $0, %rax + mov (%rdi), %r12 + mov 8(%rdi), %rcx + .byte 0xc4,98,179,0xf6,70,16 + lea -16(%rdi), %rdi + lea 16(%rsi), %rsi + sub %r13, %r12 + jmp Llo0 + +Lbx1: shr $2, %rbp + jc Lb11 + +Lb01: .byte 0xc4,98,163,0xf6,22 + jnz Lgt1 +Ln1: sub %r11, (%rdi) + mov $0, %eax + adc %r10, %rax + jmp Lret + +Lgt1: .byte 0xc4,98,147,0xf6,102,8 + .byte 0xc4,226,227,0xf6,70,16 + lea 24(%rsi), %rsi + add %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov (%rdi), %r10 + mov 8(%rdi), %r12 + mov 16(%rdi), %rcx + lea -8(%rdi), %rdi + sub %r11, %r10 + jmp Llo1 + +Lb11: .byte 0xc4,226,227,0xf6,6 + mov (%rdi), %rcx + .byte 0xc4,98,179,0xf6,70,8 + lea 8(%rsi), %rsi + lea -24(%rdi), %rdi + inc %rbp + sub %rbx, %rcx + jmp Llo3 + +Lb10: .byte 0xc4,98,179,0xf6,6 + .byte 0xc4,98,163,0xf6,86,8 + lea -32(%rdi), %rdi + mov $0, %eax + clc + jz Lend + + .align 4, 0x90 +Ltop: adc %rax, %r9 + lea 32(%rdi), %rdi + adc %r8, %r11 + .byte 0xc4,98,147,0xf6,102,16 + mov (%rdi), %r8 + .byte 0xc4,226,227,0xf6,70,24 + lea 32(%rsi), %rsi + adc %r10, %r13 + adc %r12, %rbx + adc $0, %rax + mov 8(%rdi), %r10 + mov 16(%rdi), %r12 + sub %r9, %r8 + mov 24(%rdi), %rcx + mov %r8, (%rdi) + sbb %r11, %r10 +Llo1: .byte 0xc4,98,179,0xf6,6 + mov %r10, 8(%rdi) + sbb %r13, %r12 +Llo0: mov %r12, 16(%rdi) + sbb %rbx, %rcx +Llo3: .byte 0xc4,98,163,0xf6,86,8 + mov %rcx, 24(%rdi) + dec %rbp + jnz Ltop + +Lend: adc %rax, %r9 + adc %r8, %r11 + mov 32(%rdi), %r8 + mov %r10, %rax + adc $0, %rax + mov 40(%rdi), %r10 + sub %r9, %r8 + mov %r8, 32(%rdi) + sbb %r11, %r10 + mov %r10, 40(%rdi) + adc $0, %rax + +Lret: pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/xnor_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/xnor_n.s new file mode 100644 index 0000000..fb72dba --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/xnor_n.s @@ -0,0 +1,163 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
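
___gmpn_submul_1 just above has the densest carry handling in this batch: each product limb must be subtracted from the destination while a combined high-limb-plus-borrow value rides along in one register. A minimal C sketch of the same contract, again assuming 64-bit limbs and unsigned __int128; the ref_submul_1 name and loop shape are mine, not GMP's generic code.

    typedef unsigned long long mp_limb_t;

    /* rp[0..n-1] -= up[0..n-1] * v; returns the high limb that could not
       be subtracted (product high word plus the propagated borrow). */
    static mp_limb_t ref_submul_1(mp_limb_t *rp, const mp_limb_t *up,
                                  long n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (long i = 0; i < n; i++) {
            unsigned __int128 p = (unsigned __int128)up[i] * v;
            mp_limb_t plo = (mp_limb_t)p + cy;                  /* low + carry-in */
            mp_limb_t phi = (mp_limb_t)(p >> 64) + (plo < cy);  /* carry out */
            mp_limb_t r = rp[i];
            rp[i] = r - plo;
            cy = phi + (r < plo);                               /* fold in borrow */
        }
        return cy;
    }

The asm amortizes this by unrolling four limbs per iteration and keeping two mulx results (the .byte sequences) in flight, which is why its entry code dispatches on n mod 4 through Lb00/Lb01/Lb10/Lb11.
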
+ + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_xnor_n + + +___gmpn_xnor_n: + + + mov (%rdx), %r8 + not %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: xor (%rsi), %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: xor (%rsi), %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 + not %r8 +Lb00: mov 8(%rdx), %r9 + not %r9 + xor (%rsi), %r8 + xor 8(%rsi), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 + not %r8 +Le10: mov 24(%rdx), %r9 + not %r9 + lea 32(%rdx), %rdx + xor 16(%rsi), %r8 + xor 24(%rsi), %r9 + lea 32(%rsi), %rsi + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + + + diff --git a/vere/ext/gmp/gen/x86_64-macos/mpn/xor_n.s b/vere/ext/gmp/gen/x86_64-macos/mpn/xor_n.s new file mode 100644 index 0000000..e6ec0c5 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-macos/mpn/xor_n.s @@ -0,0 +1,158 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 5, 0x90 + .globl ___gmpn_xor_n + + +___gmpn_xor_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + and $3, %eax + je Lb00 + cmp $2, %eax + jc Lb01 + je Lb10 + +Lb11: xor (%rsi), %r8 + mov %r8, (%rdi) + inc %rcx + lea -8(%rsi), %rsi + lea -8(%rdx), %rdx + lea -8(%rdi), %rdi + jmp Le11 +Lb10: add $2, %rcx + lea -16(%rsi), %rsi + lea -16(%rdx), %rdx + lea -16(%rdi), %rdi + jmp Le10 +Lb01: xor (%rsi), %r8 + mov %r8, (%rdi) + dec %rcx + jz Lret + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + + .align 4, 0x90 +Ltop: mov (%rdx), %r8 +Lb00: mov 8(%rdx), %r9 + xor (%rsi), %r8 + xor 8(%rsi), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) +Le11: mov 16(%rdx), %r8 +Le10: mov 24(%rdx), %r9 + lea 32(%rdx), %rdx + xor 16(%rsi), %r8 + xor 24(%rsi), %r9 + lea 32(%rsi), %rsi + mov %r8, 16(%rdi) + mov %r9, 24(%rdi) + lea 32(%rdi), %rdi + sub $4, %rcx + jnz Ltop + +Lret: + ret + + + + + + |
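
The two bitwise routines closing this batch, ___gmpn_xor_n and ___gmpn_xnor_n, have the simplest contracts in the directory. (The extra leading underscore is the Mach-O symbol prefix on the __gmpn_ names, and the L-prefixed labels are assembler-local, which is why these macOS .s files carry no visible local-symbol dots.) A C model of what they compute, with illustrative names:

    typedef unsigned long long mp_limb_t;

    /* rp[i] = up[i] ^ vp[i], limbwise. */
    static void ref_xor_n(mp_limb_t *rp, const mp_limb_t *up,
                          const mp_limb_t *vp, long n)
    {
        for (long i = 0; i < n; i++)
            rp[i] = up[i] ^ vp[i];
    }

    /* rp[i] = ~(up[i] ^ vp[i]), limbwise. */
    static void ref_xnor_n(mp_limb_t *rp, const mp_limb_t *up,
                           const mp_limb_t *vp, long n)
    {
        for (long i = 0; i < n; i++)
            rp[i] = ~(up[i] ^ vp[i]);
    }

Both entries peel n mod 4 limbs up front (the Lb01/Lb10/Lb11 paths, with Lb00 jumping straight into the loop) before the 4-limb unrolled Ltop loop; xnor_n differs from xor_n only by the not instructions interleaved on the %r8/%r9 words.
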