Diffstat (limited to 'vere/ext/gmp/gen/x86_64-linux/mpn')
69 files changed, 15072 insertions, 0 deletions
diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s new file mode 100644 index 0000000..2cbba6a --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/add_err1_n.s @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_add_err1_n + .type __gmpn_add_err1_n,@function + +__gmpn_add_err1_n: + + mov 8(%rsp), %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (%rsi,%r9,8), %rsi + lea (%rdx,%r9,8), %rdx + lea (%rdi,%r9,8), %rdi + + mov %r9d, %r10d + and $3, %r10d + jz .L0mod4 + cmp $2, %r10d + jc .L1mod4 + jz .L2mod4 +.L3mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + xor %r11d, %r11d + lea -24(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + adc (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 16(%r8), %rbx + adc 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc 8(%r8), %r10 + mov 16(%rsi,%r9,8), %r14 + adc 16(%rdx,%r9,8), %r14 + mov %r14, 16(%rdi,%r9,8) + cmovc (%r8), %r11 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + + add $3, %r9 + jnz .Lloop + jmp .Lend + + .align 16, 0x90 +.L0mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea (%r8,%r9,8), %r8 + neg %r9 + jmp .Lloop + + .align 16, 0x90 +.L1mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea -8(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + adc (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc (%r8), %rbx + setc %al + + add $1, %r9 + jnz .Lloop + jmp .Lend + + .align 16, 0x90 +.L2mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + lea -16(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + adc (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 8(%r8), %rbx + adc 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc (%r8), %r10 + setc %al + add %r10, %rbx + adc $0, %rbp + + add $2, %r9 + jnz .Lloop + jmp .Lend + + .align 32, 0x90 +.Lloop: + shr $1, %al + mov -8(%r8), %r10 + mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + adc (%rdx,%r9,8), %r14 + cmovnc %r13, %r10 + adc 8(%rdx,%r9,8), %r15 + mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) + cmovnc %r13, %r11 + mov -24(%r8), %r12 + adc 16(%rdx,%r9,8), %r14 + cmovnc %r13, %r12 + mov 24(%rsi,%r9,8), %r15 + adc 24(%rdx,%r9,8), %r15 + cmovc -32(%r8), %r13 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + add %r12, %rbx + adc $0, %rbp + mov %r14, 16(%rdi,%r9,8) + add %r13, %rbx + lea -32(%r8), %r8 + adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) + add $4, %r9 + jnz .Lloop + +.Lend: + mov %rbx, (%rcx) + mov %rbp, 8(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + .size __gmpn_add_err1_n,.-__gmpn_add_err1_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/add_err2_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/add_err2_n.s new file mode 100644 index 0000000..1008479 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/add_err2_n.s @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_add_err2_n + .type __gmpn_add_err2_n,@function + +__gmpn_add_err2_n: + + mov 16(%rsp), %rax + mov 8(%rsp), %r10 + + push 
%rbx + push %rbp + push %r12 + push %r13 + push %r14 + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + + sub %r8, %r9 + + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + + test $1, %r10 + jnz .Lodd + + lea -8(%r8,%r10,8), %r8 + neg %r10 + jmp .Ltop + + .align 16, 0x90 +.Lodd: + lea -16(%r8,%r10,8), %r8 + neg %r10 + shr $1, %rax + mov (%rsi,%r10,8), %rbx + adc (%rdx,%r10,8), %rbx + cmovc 8(%r8), %rbp + cmovc 8(%r8,%r9), %r12 + mov %rbx, (%rdi,%r10,8) + sbb %rax, %rax + inc %r10 + jz .Lend + + .align 16, 0x90 +.Ltop: + mov (%rsi,%r10,8), %rbx + shr $1, %rax + adc (%rdx,%r10,8), %rbx + mov %rbx, (%rdi,%r10,8) + sbb %r14, %r14 + + mov 8(%rsi,%r10,8), %rbx + adc 8(%rdx,%r10,8), %rbx + mov %rbx, 8(%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %r14, %rbx + add %rbx, %rbp + adc $0, %r11 + + and (%r8,%r9), %r14 + add %r14, %r12 + adc $0, %r13 + + mov -8(%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov -8(%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + add $2, %r10 + lea -16(%r8), %r8 + jnz .Ltop +.Lend: + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + + and $1, %eax + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + .size __gmpn_add_err2_n,.-__gmpn_add_err2_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/add_err3_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/add_err3_n.s new file mode 100644 index 0000000..cf99415 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/add_err3_n.s @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_add_err3_n + .type __gmpn_add_err3_n,@function + +__gmpn_add_err3_n: + + mov 24(%rsp), %rax + mov 16(%rsp), %r10 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + push %rcx + mov 64(%rsp), %rcx + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + + sub %r8, %r9 + sub %r8, %rcx + + lea -8(%r8,%r10,8), %r8 + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + neg %r10 + + .align 16, 0x90 +.Ltop: + shr $1, %rax + mov (%rsi,%r10,8), %rax + adc (%rdx,%r10,8), %rax + mov %rax, (%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov (%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + mov (%r8,%rcx), %rbx + and %rax, %rbx + add %rbx, %r14 + adc $0, %r15 + + lea -8(%r8), %r8 + inc %r10 + jnz .Ltop + +.Lend: + and $1, %eax + pop %rcx + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + mov %r14, 32(%rcx) + mov %r15, 40(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + .size __gmpn_add_err3_n,.-__gmpn_add_err3_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/add_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/add_n.s new file mode 100644 index 0000000..14cc32b --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/add_n.s @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_add_nc + .type __gmpn_add_nc,@function + +__gmpn_add_nc: + + + + mov %ecx, %eax + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 + + mov (%rsi), %r8 + mov 8(%rsi), 
%r9 + dec %rcx + jmp .Lmid + + .size __gmpn_add_nc,.-__gmpn_add_nc + .align 16, 0x90 + .globl __gmpn_add_n + .type __gmpn_add_n,@function + +__gmpn_add_n: + + + mov %ecx, %eax + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 + + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + +.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 + adc (%rdx), %r8 + mov %r8, (%rdi) + adc %eax, %eax + + ret + +.L2: dec %eax + mov 8(%rsi), %r9 + jnz .L3 + adc (%rdx), %r8 + adc 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret + +.L3: mov 16(%rsi), %r10 + adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + setc %al + + ret + + .align 16, 0x90 +.Ltop: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop + +.Lend: lea 32(%rsi), %rsi + adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi + + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax + + ret + .size __gmpn_add_n,.-__gmpn_add_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/addaddmul_1msb0.s b/vere/ext/gmp/gen/x86_64-linux/mpn/addaddmul_1msb0.s new file mode 100644 index 0000000..c821f7b --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/addaddmul_1msb0.s @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_addaddmul_1msb0 + .type __gmpn_addaddmul_1msb0,@function + +__gmpn_addaddmul_1msb0: + + push %r12 + push %rbp + + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rbp + lea (%rdi,%rcx,8), %rdi + neg %rcx + + mov (%rsi,%rcx,8), %rax + mul %r8 + mov %rax, %r12 + mov (%rbp,%rcx,8), %rax + mov %rdx, %r10 + add $3, %rcx + jns .Lend + + .align 16, 0x90 +.Ltop: mul %r9 + add %rax, %r12 + mov -16(%rsi,%rcx,8), %rax + adc %rdx, %r10 + mov %r12, -24(%rdi,%rcx,8) + mul %r8 + add %rax, %r10 + mov -16(%rbp,%rcx,8), %rax + mov $0, %r11d + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + mov -8(%rsi,%rcx,8), %rax + adc %rdx, %r11 + mov %r10, -16(%rdi,%rcx,8) + mul %r8 + add %rax, %r11 + mov -8(%rbp,%rcx,8), %rax + mov $0, %r12d + adc %rdx, %r12 + mul %r9 + add %rax, %r11 + adc %rdx, %r12 + mov (%rsi,%rcx,8), %rax + mul %r8 + add %rax, %r12 + mov %r11, -8(%rdi,%rcx,8) + mov (%rbp,%rcx,8), %rax + mov $0, %r10d + adc %rdx, %r10 + add $3, %rcx + js .Ltop + +.Lend: cmp $1, %ecx + ja 2f + jz 1f + + mul %r9 + add %rax, %r12 + mov -16(%rsi), %rax + adc %rdx, %r10 + mov %r12, -24(%rdi) + mul %r8 + add %rax, %r10 + mov -16(%rbp), %rax + mov $0, %r11d + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + mov -8(%rsi), %rax + adc %rdx, %r11 + mov %r10, -16(%rdi) + mul %r8 + add %rax, %r11 + mov -8(%rbp), %rax + mov $0, %r12d + adc %rdx, %r12 + mul %r9 + add %rax, %r11 + adc %rdx, %r12 + mov %r11, -8(%rdi) + mov %r12, %rax + pop %rbp + pop %r12 + ret + +1: mul %r9 + add %rax, %r12 + mov -8(%rsi), %rax + adc %rdx, %r10 + mov %r12, -16(%rdi) + mul %r8 + add %rax, %r10 + mov -8(%rbp), %rax + mov $0, %r11d + adc %rdx, %r11 + mul %r9 + add %rax, %r10 + adc %rdx, %r11 + mov %r10, -8(%rdi) + mov %r11, %rax + pop %rbp + pop %r12 + ret + +2: mul %r9 + add %rax, %r12 + 
mov %r12, -8(%rdi) + adc %rdx, %r10 + mov %r10, %rax + pop %rbp + pop %r12 + ret + .size __gmpn_addaddmul_1msb0,.-__gmpn_addaddmul_1msb0 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s new file mode 100644 index 0000000..e3d3aae --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh1_n.s @@ -0,0 +1,179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_addlsh1_n + .type __gmpn_addlsh1_n,@function + +__gmpn_addlsh1_n: + + + push %rbp + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + sbb %eax, %eax + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + sbb %eax, %eax + add (%rsi,%rcx,8), %r8 + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 + sbb %eax, %eax + add (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend + + .align 16, 0x90 +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + + sbb %eax, %eax + add %ebp, %ebp + + adc (%rsi,%rcx,8), %r8 + nop + adc 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + adc 16(%rsi,%rcx,8), %r10 + adc 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + add %ebp, %eax + neg %eax + + + pop %rbp + + ret + .size __gmpn_addlsh1_n,.-__gmpn_addlsh1_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s new file mode 100644 index 0000000..00e2090 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh2_n.s @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_addlsh2_n + .type __gmpn_addlsh2_n,@function + +__gmpn_addlsh2_n: + + + push %r12 + push %r13 + push %r14 + push %r15 + + mov (%rdx), %r8 + lea (,%r8,4), %r12 + shr $62, %r8 + + mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al + je .Lb00 + cmp $2, %al + jc .Lb01 + je .Lb10 + +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r14 + adc 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop + jmp .Lend + +.Lb01: mov %r8, %r11 + add (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop + jmp .Lend + +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + 
shr $62, %r11 + add (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop + jmp .Lend + +.Lb00: mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + + .align 16, 0x90 +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + adc (%rsi,%rcx,8), %r12 + adc 8(%rsi,%rcx,8), %r13 + adc 16(%rsi,%rcx,8), %r14 + adc 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + sub %r11d, %eax + neg %eax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + ret + .size __gmpn_addlsh2_n,.-__gmpn_addlsh2_n + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s new file mode 100644 index 0000000..2d261d5 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/addlsh_n.s @@ -0,0 +1,228 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_addlsh_n + .type __gmpn_addlsh_n,@function + +__gmpn_addlsh_n: + + + + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + add 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + add 8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend + + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + adc 16(%rsi,%rbp,8), %r10 + adc 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + 
mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + adc (%rsi,%rbp,8), %rbx + adc 8(%rsi,%rbp,8), %r9 + mov %rbx, (%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + adc $0, %rax + pop %rbx + pop %rbp + pop %r12 + + ret + .size __gmpn_addlsh_n,.-__gmpn_addlsh_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s new file mode 100644 index 0000000..8daf1ac --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/addmul_1.s @@ -0,0 +1,196 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_addmul_1 + .type __gmpn_addmul_1,@function + +__gmpn_addmul_1: + + + + + + + mov (%rsi), %rax + push %rbx + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + add %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: add %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + add %r9, 8(%rdi,%r11,8) + adc %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + add %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + add %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + add %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + add %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + + pop %rbx + + + ret + .size __gmpn_addmul_1,.-__gmpn_addmul_1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s b/vere/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s new file mode 100644 index 0000000..5883dab --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/addmul_2.s @@ -0,0 +1,209 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_addmul_2 + .type __gmpn_addmul_2,@function + +__gmpn_addmul_2: + + + mov %rdx, %r11 + push %rbx + push %rbp + + mov 0(%rcx), %r8 + mov 8(%rcx), %r9 + + mov %edx, %ebx + mov (%rsi), %rax + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + mul %r8 + neg %r11 + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %rcx + mov %rdx, %rbp + xor %r10d, %r10d + mov 8(%rsi,%r11,8), %rax + dec %r11 + jmp .Llo3 + +.Lb2: mov %rax, %rbp + mov 8(%rsi,%r11,8), %rax + mov %rdx, %r10 + xor %ebx, %ebx + add $-2, %r11 + jmp .Llo2 + +.Lb1: mov %rax, %r10 + mov 8(%rsi,%r11,8), %rax + mov %rdx, %rbx + xor %ecx, %ecx + inc %r11 + jmp .Llo1 + +.Lb0: mov $0, %r10d + mov %rax, %rbx + mov 
8(%rsi,%r11,8), %rax + mov %rdx, %rcx + xor %ebp, %ebp + jmp .Llo0 + + .align 32, 0x90 +.Ltop: mov $0, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx +.Llo1: mul %r9 + add %r10, (%rdi,%r11,8) + mov $0, %r10d + adc %rax, %rbx + mov $0, %ebp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + mul %r8 + add %rax, %rbx + mov 8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp +.Llo0: mul %r9 + add %rbx, 8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %rbp + adc $0, %r10d + mov 16(%rsi,%r11,8), %rax +.Llo3: mul %r9 + add %rcx, 16(%rdi,%r11,8) + adc %rax, %rbp + adc %rdx, %r10 + xor %ebx, %ebx + mov 24(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx +.Llo2: mul %r9 + add %rbp, 24(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax + add $4, %r11 + js .Ltop + +.Lend: xor %ecx, %ecx + mul %r8 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + adc %ecx, %ecx + mul %r9 + add %r10, (%rdi) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax + + pop %rbp + pop %rbx + + ret + .size __gmpn_addmul_2,.-__gmpn_addmul_2 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/and_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/and_n.s new file mode 100644 index 0000000..946906e --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/and_n.s @@ -0,0 +1,149 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_and_n + .type __gmpn_and_n,@function + +__gmpn_and_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + ret + .size __gmpn_and_n,.-__gmpn_and_n + + + + + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/andn_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/andn_n.s new file mode 100644 index 0000000..aee1df4 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/andn_n.s @@ -0,0 +1,154 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_andn_n + .type __gmpn_andn_n,@function + +__gmpn_andn_n: + + + mov (%rdx), %r8 + not %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov 
(%rdx,%rcx,8), %r8 + not %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + not %r9 + and (%rsi,%rcx,8), %r8 + and 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 + not %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + not %r9 + and 16(%rsi,%rcx,8), %r8 + and 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + ret + .size __gmpn_andn_n,.-__gmpn_andn_n + + + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/bdiv_dbm1c.s b/vere/ext/gmp/gen/x86_64-linux/mpn/bdiv_dbm1c.s new file mode 100644 index 0000000..2fda4a0 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/bdiv_dbm1c.s @@ -0,0 +1,121 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_bdiv_dbm1c + .type __gmpn_bdiv_dbm1c,@function + +__gmpn_bdiv_dbm1c: + + + + mov (%rsi), %rax + mov %rdx, %r9 + mov %edx, %r11d + mul %rcx + lea (%rsi,%r9,8), %rsi + lea (%rdi,%r9,8), %rdi + neg %r9 + and $3, %r11d + jz .Llo0 + lea -4(%r9,%r11), %r9 + cmp $2, %r11d + jc .Llo1 + jz .Llo2 + jmp .Llo3 + + .align 16, 0x90 +.Ltop: mov (%rsi,%r9,8), %rax + mul %rcx +.Llo0: sub %rax, %r8 + mov %r8, (%rdi,%r9,8) + sbb %rdx, %r8 + mov 8(%rsi,%r9,8), %rax + mul %rcx +.Llo3: sub %rax, %r8 + mov %r8, 8(%rdi,%r9,8) + sbb %rdx, %r8 + mov 16(%rsi,%r9,8), %rax + mul %rcx +.Llo2: sub %rax, %r8 + mov %r8, 16(%rdi,%r9,8) + sbb %rdx, %r8 + mov 24(%rsi,%r9,8), %rax + mul %rcx +.Llo1: sub %rax, %r8 + mov %r8, 24(%rdi,%r9,8) + sbb %rdx, %r8 + add $4, %r9 + jnz .Ltop + + mov %r8, %rax + + ret + .size __gmpn_bdiv_dbm1c,.-__gmpn_bdiv_dbm1c diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s new file mode 100644 index 0000000..4f58778 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/bdiv_q_1.s @@ -0,0 +1,198 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_bdiv_q_1 + .type __gmpn_bdiv_q_1,@function + +__gmpn_bdiv_q_1: + + + push %rbx + + mov %rcx, %rax + xor %ecx, %ecx + mov %rdx, %r10 + + bt $0, %eax + jnc .Levn + +.Lodd: mov %rax, %rbx + shr %eax + and $127, %eax + + mov __gmp_binvert_limb_table@GOTPCREL(%rip), %rdx + + + + movzbl (%rdx,%rax), %eax + + mov %rbx, %r11 + + lea (%rax,%rax), %edx + imul %eax, %eax + imul %ebx, %eax + sub %eax, %edx + + lea (%rdx,%rdx), %eax + imul %edx, %edx + imul %ebx, %edx + sub %edx, %eax + + lea (%rax,%rax), %r8 + imul %rax, %rax + imul %rbx, %rax + sub %rax, %r8 + + jmp .Lpi1 + +.Levn: bsf %rax, %rcx + shr %cl, %rax + jmp .Lodd + .size __gmpn_bdiv_q_1,.-__gmpn_bdiv_q_1 + + .globl __gmpn_pi1_bdiv_q_1 + .type __gmpn_pi1_bdiv_q_1,@function + +__gmpn_pi1_bdiv_q_1: + + + + + push %rbx + + mov %rcx, %r11 + mov %rdx, %r10 + mov %r9, %rcx + +.Lpi1: mov (%rsi), %rax + + dec %r10 + jz .Lone + + mov 8(%rsi), %rdx + lea (%rsi,%r10,8), %rsi + lea (%rdi,%r10,8), %rdi + neg %r10 + + shrd %cl, %rdx, %rax + + xor %ebx, %ebx + jmp .Lent + + .align 8, 0x90 +.Ltop: + + + + + + + + mul %r11 + mov (%rsi,%r10,8), %rax + mov 8(%rsi,%r10,8), %r9 + shrd %cl, %r9, %rax + nop + sub %rbx, %rax + setc %bl + sub %rdx, %rax + adc $0, %ebx +.Lent: imul %r8, %rax + mov %rax, (%rdi,%r10,8) + inc %r10 + jnz .Ltop + + mul %r11 + mov (%rsi), %rax + shr %cl, %rax + sub %rbx, %rax + sub %rdx, %rax + imul %r8, %rax + mov %rax, (%rdi) + pop %rbx + + ret + 
+.Lone: shr %cl, %rax + imul %r8, %rax + mov %rax, (%rdi) + pop %rbx + + ret + .size __gmpn_pi1_bdiv_q_1,.-__gmpn_pi1_bdiv_q_1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s new file mode 100644 index 0000000..b046e36 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/cnd_add_n.s @@ -0,0 +1,190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_cnd_add_n + .type __gmpn_cnd_add_n,@function + +__gmpn_cnd_add_n: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + neg %rdi + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + sbb %eax, %eax + add $3, %r8 + js .Ltop + jmp .Lend + +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %eax, %eax + add $2, %r8 + js .Ltop + jmp .Lend + +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + add %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %eax, %eax + add $1, %r8 + jns .Lend + + .align 16, 0x90 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 + add %eax, %eax + adc %r12, %r10 + mov %r10, (%rsi,%r8,8) + adc %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + adc %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + adc %r11, %r9 + mov %r9, 24(%rsi,%r8,8) + sbb %eax, %eax + add $4, %r8 + js .Ltop + +.Lend: neg %eax + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_cnd_add_n,.-__gmpn_cnd_add_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s new file mode 100644 index 0000000..596dd8f --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/cnd_sub_n.s @@ -0,0 +1,190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_cnd_sub_n + .type __gmpn_cnd_sub_n,@function + +__gmpn_cnd_sub_n: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + neg %rdi + sbb %rdi, %rdi + + lea (%rcx,%r8,8), %rcx + lea (%rdx,%r8,8), %rdx + lea (%rsi,%r8,8), %rsi + + mov %r8d, %eax + neg %r8 + and $3, %eax + jz .Ltop + cmp $2, %eax + jc .Lb1 + jz .Lb2 + +.Lb3: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + sbb %eax, %eax + add $3, %r8 + js .Ltop + jmp 
.Lend + +.Lb2: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r13 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %eax, %eax + add $2, %r8 + js .Ltop + jmp .Lend + +.Lb1: mov (%rcx,%r8,8), %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r12 + sub %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %eax, %eax + add $1, %r8 + jns .Lend + + .align 16, 0x90 +.Ltop: mov (%rcx,%r8,8), %r12 + mov 8(%rcx,%r8,8), %r13 + mov 16(%rcx,%r8,8), %r14 + mov 24(%rcx,%r8,8), %r11 + and %rdi, %r12 + mov (%rdx,%r8,8), %r10 + and %rdi, %r13 + mov 8(%rdx,%r8,8), %rbx + and %rdi, %r14 + mov 16(%rdx,%r8,8), %rbp + and %rdi, %r11 + mov 24(%rdx,%r8,8), %r9 + add %eax, %eax + sbb %r12, %r10 + mov %r10, (%rsi,%r8,8) + sbb %r13, %rbx + mov %rbx, 8(%rsi,%r8,8) + sbb %r14, %rbp + mov %rbp, 16(%rsi,%r8,8) + sbb %r11, %r9 + mov %r9, 24(%rsi,%r8,8) + sbb %eax, %eax + add $4, %r8 + js .Ltop + +.Lend: neg %eax + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_cnd_sub_n,.-__gmpn_cnd_sub_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/com.s b/vere/ext/gmp/gen/x86_64-linux/mpn/com.s new file mode 100644 index 0000000..ff14001 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/com.s @@ -0,0 +1,110 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_com + .type __gmpn_com,@function + +__gmpn_com: + + + movq (%rsi), %r8 + movl %edx, %eax + leaq (%rsi,%rdx,8), %rsi + leaq (%rdi,%rdx,8), %rdi + negq %rdx + andl $3, %eax + je .Lb00 + cmpl $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: notq %r8 + movq %r8, (%rdi,%rdx,8) + decq %rdx + jmp .Le11 +.Lb10: addq $-2, %rdx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: notq %r8 + movq %r8, (%rdi,%rdx,8) + incq %rdx + jz .Lret + +.Loop: movq (%rsi,%rdx,8), %r8 +.Lb00: movq 8(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, (%rdi,%rdx,8) + movq %r9, 8(%rdi,%rdx,8) +.Le11: movq 16(%rsi,%rdx,8), %r8 +.Le10: movq 24(%rsi,%rdx,8), %r9 + notq %r8 + notq %r9 + movq %r8, 16(%rdi,%rdx,8) + movq %r9, 24(%rdi,%rdx,8) + addq $4, %rdx + jnc .Loop +.Lret: + ret + .size __gmpn_com,.-__gmpn_com diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/copyd.s b/vere/ext/gmp/gen/x86_64-linux/mpn/copyd.s new file mode 100644 index 0000000..f375481 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/copyd.s @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 64, 0x90 + .globl __gmpn_copyd + .type __gmpn_copyd,@function + +__gmpn_copyd: + + lea -8(%rsi,%rdx,8), %rsi + lea (%rdi,%rdx,8), %rdi + sub $4, %rdx + jc .Lend + nop + +.Ltop: mov (%rsi), %rax + mov -8(%rsi), %r9 + lea -32(%rdi), %rdi + mov -16(%rsi), %r10 + mov -24(%rsi), %r11 + lea -32(%rsi), %rsi + mov %rax, 24(%rdi) + mov %r9, 16(%rdi) + sub $4, %rdx + mov %r10, 8(%rdi) + mov %r11, (%rdi) + jnc .Ltop + +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + mov %rax, -8(%rdi) + lea -8(%rdi), %rdi + lea -8(%rsi), %rsi +1: shr %edx + jnc 1f + mov (%rsi), %rax + mov -8(%rsi), %r9 + mov %rax, -8(%rdi) + mov %r9, -16(%rdi) +1: ret + .size __gmpn_copyd,.-__gmpn_copyd diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/copyi.s b/vere/ext/gmp/gen/x86_64-linux/mpn/copyi.s new file mode 100644 index 0000000..dc746b2 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/copyi.s @@ -0,0 +1,107 @@ + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 64, 0x90 + .byte 0,0,0,0,0,0 + .globl __gmpn_copyi + .type __gmpn_copyi,@function + +__gmpn_copyi: + + lea -8(%rdi), %rdi + sub $4, %rdx + jc .Lend + +.Ltop: mov (%rsi), %rax + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi + mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + lea 32(%rsi), %rsi + mov %rax, -24(%rdi) + mov %r9, -16(%rdi) + sub $4, %rdx + mov %r10, -8(%rdi) + mov %r11, (%rdi) + jnc .Ltop + +.Lend: shr %edx + jnc 1f + mov (%rsi), %rax + mov %rax, 8(%rdi) + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi +1: shr %edx + jnc 1f + mov (%rsi), %rax + mov 8(%rsi), %r9 + mov %rax, 8(%rdi) + mov %r9, 16(%rdi) +1: ret + .size __gmpn_copyi,.-__gmpn_copyi diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s new file mode 100644 index 0000000..fd8ce8e --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_1n_pi1.s @@ -0,0 +1,261 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_div_qr_1n_pi1 + .type __gmpn_div_qr_1n_pi1,@function + +__gmpn_div_qr_1n_pi1: + + + + + dec %rdx + jnz .Lfirst + + + + lea 1(%rcx), %r10 + mov %rcx, %rax + mul %r9 + mov (%rsi), %r11 + add %r11, %rax + adc %r10, %rdx + mov %rdx, %r10 + imul %r8, %rdx + sub %rdx, %r11 + cmp %r11, %rax + lea (%r11, %r8), %rax + cmovnc %r11, %rax + sbb $0, %r10 + cmp %r8, %rax + jc .Lsingle_div_done + sub %r8, %rax + add $1, %r10 +.Lsingle_div_done: + mov %r10, (%rdi) + + ret +.Lfirst: + + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + + mov %r8, %rbp + imul %r9, %rbp + neg %rbp + mov %rbp, %rbx + sub %r8, %rbx + + + push %r8 + mov %rdx, %r8 + + mov %r9, %rax + mul %rcx + mov %rax, %r13 + add %rcx, %rdx + mov %rdx, %r10 + + mov %rbp, %rax + mul %rcx + mov -8(%rsi, %r8, 8), %r11 + mov (%rsi, %r8, 8), %rcx + mov %r10, (%rdi, %r8, 8) + add %rax, %r11 + adc %rdx, %rcx + sbb %r12, %r12 + dec %r8 + mov %rcx, %rax + jz .Lfinal + mov $0, %r14d + + .align 16, 0x90 + + + + +.Lloop: + + + cmovc %r9, %r14 + mov %r12, %r15 + neg %r15 + mul %r9 + add %rdx, %r14 + adc $0, %r15 + add %r13, %r14 + mov %rax, %r13 + mov %rbp, %rax + lea (%rbx, %r11), %r10 + adc $0, %r15 + + + mul %rcx + and %rbp, %r12 + add %r12, %r11 + cmovnc %r11, %r10 + + + adc %rcx, %r14 + mov -8(%rsi, %r8, 8), %r11 + adc %r15, 8(%rdi, %r8, 8) + jc .Lq_incr +.Lq_incr_done: + add %rax, %r11 + mov %r10, %rax + adc %rdx, %rax + mov %r14, (%rdi, %r8, 8) + mov $0, %r14d + sbb %r12, %r12 + dec %r8 + mov %rax, %rcx + jnz .Lloop + +.Lfinal: + pop %r8 + + mov %r12, %r14 + and %r8, %r12 + sub %r12, %rax + neg %r14 + + mov %rax, %rcx + sub %r8, %rax + cmovc %rcx, %rax + sbb $-1, %r14 + + lea 1(%rax), %r10 + mul %r9 + add %r11, %rax + adc %r10, %rdx + mov %rdx, %r10 + imul %r8, %rdx + sub %rdx, %r11 + cmp %r11, %rax + lea (%r11, %r8), %rax + cmovnc %r11, %rax + sbb $0, %r10 + cmp %r8, %rax + jc .Ldiv_done + sub %r8, %rax + add $1, %r10 +.Ldiv_done: + add %r10, %r13 + mov %r13, (%rdi) + adc %r14, 8(%rdi) + jnc .Ldone +.Lfinal_q_incr: + addq $1, 16(%rdi) + lea 8(%rdi), %rdi + jc .Lfinal_q_incr + +.Ldone: + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +.Lq_incr: + + lea 16(%rdi, %r8, 8), %rcx +.Lq_incr_loop: + addq $1, (%rcx) + jnc .Lq_incr_done + lea 8(%rcx), %rcx + jmp .Lq_incr_loop + .size 
__gmpn_div_qr_1n_pi1,.-__gmpn_div_qr_1n_pi1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_2n_pi1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_2n_pi1.s new file mode 100644 index 0000000..67618f7 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_2n_pi1.s @@ -0,0 +1,171 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_div_qr_2n_pi1 + .type __gmpn_div_qr_2n_pi1,@function + +__gmpn_div_qr_2n_pi1: + + + + + + mov 8(%rsp), %r10 + mov %rdx, %r11 + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + + mov -16(%r11, %rcx, 8), %r12 + mov -8(%r11, %rcx, 8), %rbx + + mov %r12, %r14 + mov %rbx, %r13 + sub %r9, %r14 + sbb %r8, %r13 + cmovnc %r14, %r12 + cmovnc %r13, %rbx + + sbb %rax, %rax + inc %rax + push %rax + lea -2(%rcx), %rcx + mov %r8, %r15 + neg %r15 + + jmp .Lnext + + .align 16, 0x90 +.Lloop: + + + + mov %r10, %rax + mul %rbx + mov %r12, %r14 + add %rax, %r14 + adc %rbx, %rdx + mov %rdx, %r13 + imul %r15, %rdx + mov %r9, %rax + lea (%rdx, %r12), %rbx + mul %r13 + mov (%r11, %rcx, 8), %r12 + sub %r9, %r12 + sbb %r8, %rbx + sub %rax, %r12 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %r14, %rbx + cmovnc %r9, %rax + cmovnc %r8, %rdx + adc $0, %r13 + nop + add %rax, %r12 + adc %rdx, %rbx + cmp %r8, %rbx + jae .Lfix +.Lbck: + mov %r13, (%rdi, %rcx, 8) +.Lnext: + sub $1, %rcx + jnc .Lloop +.Lend: + mov %rbx, 8(%rsi) + mov %r12, (%rsi) + + + pop %rax + + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + +.Lfix: + seta %dl + cmp %r9, %r12 + setae %al + orb %dl, %al + je .Lbck + inc %r13 + sub %r9, %r12 + sbb %r8, %rbx + jmp .Lbck + .size __gmpn_div_qr_2n_pi1,.-__gmpn_div_qr_2n_pi1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_2u_pi1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_2u_pi1.s new file mode 100644 index 0000000..a11a847 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/div_qr_2u_pi1.s @@ -0,0 +1,211 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + + .globl __gmpn_div_qr_2u_pi1 + .type __gmpn_div_qr_2u_pi1,@function + +__gmpn_div_qr_2u_pi1: + + mov 0+16(%rsp), %r10 + mov %rdx, %r11 + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + push %rbp + push %rsi + + lea -2(%rcx), %rbp + mov %r8, %r15 + neg %r15 + + + movl 56+8(%rsp), %ecx + + + + xor %ebx, %ebx + mov 8(%r11, %rbp, 8), %r12 + shld %cl, %r12, %rbx + + + mov %r10, %rax + mul %rbx + mov (%r11, %rbp, 8), %rsi + shld %cl, %rsi, %r12 + mov %r12, %r14 + add %rax, %r14 + adc %rbx, %rdx + mov %rdx, %r13 + imul %r15, %rdx + mov %r9, %rax + lea (%rdx, %r12), %rbx + mul %r13 + mov %rsi, %r12 + shl %cl, %r12 + sub %r9, %r12 + sbb %r8, %rbx + sub %rax, %r12 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %r14, %rbx + cmovnc %r9, %rax + cmovnc %r8, %rdx + adc $0, %r13 + nop + add %rax, %r12 + adc %rdx, %rbx + cmp %r8, %rbx + jae .Lfix_qh +.Lbck_qh: + push %r13 + + jmp .Lnext + + .align 16, 0x90 +.Lloop: + + + + mov %r10, %rax + mul %rbx + mov (%r11, %rbp, 8), %rsi + xor %r13d, %r13d + shld %cl, %rsi, %r13 + or %r13, %r12 + mov %r12, %r14 + add %rax, %r14 + adc %rbx, %rdx + mov %rdx, %r13 + imul %r15, %rdx + mov %r9, %rax + lea (%rdx, %r12), %rbx + mul %r13 + mov %rsi, %r12 + shl %cl, %r12 + sub %r9, %r12 + sbb %r8, %rbx + sub %rax, %r12 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %r14, %rbx + cmovnc %r9, %rax + 
cmovnc %r8, %rdx + adc $0, %r13 + nop + add %rax, %r12 + adc %rdx, %rbx + cmp %r8, %rbx + jae .Lfix +.Lbck: + mov %r13, (%rdi, %rbp, 8) +.Lnext: + sub $1, %rbp + jnc .Lloop +.Lend: + + pop %rax + pop %rsi + shrd %cl, %rbx, %r12 + shr %cl, %rbx + mov %rbx, 8(%rsi) + mov %r12, (%rsi) + + pop %rbp + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + ret + +.Lfix: + seta %dl + cmp %r9, %r12 + setae %al + orb %dl, %al + je .Lbck + inc %r13 + sub %r9, %r12 + sbb %r8, %rbx + jmp .Lbck + + +.Lfix_qh: + seta %dl + cmp %r9, %r12 + setae %al + orb %dl, %al + je .Lbck_qh + inc %r13 + sub %r9, %r12 + sbb %r8, %rbx + jmp .Lbck_qh + .size __gmpn_div_qr_2u_pi1,.-__gmpn_div_qr_2u_pi1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/dive_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/dive_1.s new file mode 100644 index 0000000..23a35c8 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/dive_1.s @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_divexact_1 + .type __gmpn_divexact_1,@function + +__gmpn_divexact_1: + + + push %rbx + + mov %rcx, %rax + xor %ecx, %ecx + mov %rdx, %r8 + + bt $0, %eax + jnc .Levn + +.Lodd: mov %rax, %rbx + shr %eax + and $127, %eax + + mov __gmp_binvert_limb_table@GOTPCREL(%rip), %rdx + + + + movzbl (%rdx,%rax), %eax + + mov %rbx, %r11 + + lea (%rax,%rax), %edx + imul %eax, %eax + imul %ebx, %eax + sub %eax, %edx + + lea (%rdx,%rdx), %eax + imul %edx, %edx + imul %ebx, %edx + sub %edx, %eax + + lea (%rax,%rax), %r10 + imul %rax, %rax + imul %rbx, %rax + sub %rax, %r10 + + lea (%rsi,%r8,8), %rsi + lea -8(%rdi,%r8,8), %rdi + neg %r8 + + mov (%rsi,%r8,8), %rax + + inc %r8 + jz .Lone + + mov (%rsi,%r8,8), %rdx + + shrd %cl, %rdx, %rax + + xor %ebx, %ebx + jmp .Lent + +.Levn: bsf %rax, %rcx + shr %cl, %rax + jmp .Lodd + + .align 8, 0x90 +.Ltop: + + + + + + + + + + + mul %r11 + mov -8(%rsi,%r8,8), %rax + mov (%rsi,%r8,8), %r9 + shrd %cl, %r9, %rax + nop + sub %rbx, %rax + setc %bl + sub %rdx, %rax + adc $0, %rbx +.Lent: imul %r10, %rax + mov %rax, (%rdi,%r8,8) + inc %r8 + jnz .Ltop + + mul %r11 + mov -8(%rsi), %rax + shr %cl, %rax + sub %rbx, %rax + sub %rdx, %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + + ret + +.Lone: shr %cl, %rax + imul %r10, %rax + mov %rax, (%rdi) + pop %rbx + + ret + + .size __gmpn_divexact_1,.-__gmpn_divexact_1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s new file mode 100644 index 0000000..e689bd2 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/divrem_1.s @@ -0,0 +1,335 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_preinv_divrem_1 + .type __gmpn_preinv_divrem_1,@function + +__gmpn_preinv_divrem_1: + + + + + xor %eax, %eax + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rsi, %r12 + mov %rcx, %rbx + add %rsi, %rcx + mov %rdx, %rsi + + lea -8(%rdi,%rcx,8), %rdi + + test %r8, %r8 + js .Lnent + + mov 40(%rsp), %cl + shl %cl, %r8 + jmp .Luent + .size __gmpn_preinv_divrem_1,.-__gmpn_preinv_divrem_1 + + .align 16, 0x90 + .globl __gmpn_divrem_1 + .type __gmpn_divrem_1,@function + +__gmpn_divrem_1: + + + + xor %eax, %eax + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rsi, %r12 + mov %rcx, %rbx + add %rsi, %rcx + mov %rdx, %rsi + je .Lret + + lea -8(%rdi,%rcx,8), %rdi + xor 
%ebp, %ebp + + test %r8, %r8 + jns .Lunnormalized + +.Lnormalized: + test %rbx, %rbx + je .L8 + mov -8(%rsi,%rbx,8), %rbp + dec %rbx + mov %rbp, %rax + sub %r8, %rbp + cmovc %rax, %rbp + sbb %eax, %eax + inc %eax + mov %rax, (%rdi) + lea -8(%rdi), %rdi +.L8: + push %rdi + push %rsi + push %r8 + mov %r8, %rdi + + + + call __gmpn_invert_limb@PLT + + + pop %r8 + pop %rsi + pop %rdi + + mov %rax, %r9 + mov %rbp, %rax + jmp .Lnent + + .align 16, 0x90 +.Lntop:mov (%rsi,%rbx,8), %r10 + mul %r9 + add %r10, %rax + adc %rbp, %rdx + mov %rax, %rbp + mov %rdx, %r13 + imul %r8, %rdx + sub %rdx, %r10 + mov %r8, %rax + add %r10, %rax + cmp %rbp, %r10 + cmovc %r10, %rax + adc $-1, %r13 + cmp %r8, %rax + jae .Lnfx +.Lnok: mov %r13, (%rdi) + sub $8, %rdi +.Lnent:lea 1(%rax), %rbp + dec %rbx + jns .Lntop + + xor %ecx, %ecx + jmp .Lfrac + +.Lnfx: sub %r8, %rax + inc %r13 + jmp .Lnok + +.Lunnormalized: + test %rbx, %rbx + je .L44 + mov -8(%rsi,%rbx,8), %rax + cmp %r8, %rax + jae .L44 + mov %rbp, (%rdi) + mov %rax, %rbp + lea -8(%rdi), %rdi + je .Lret + dec %rbx +.L44: + bsr %r8, %rcx + not %ecx + shl %cl, %r8 + shl %cl, %rbp + + push %rcx + push %rdi + push %rsi + push %r8 + sub $8, %rsp + mov %r8, %rdi + + + + call __gmpn_invert_limb@PLT + + add $8, %rsp + + pop %r8 + pop %rsi + pop %rdi + pop %rcx + + mov %rax, %r9 + mov %rbp, %rax + test %rbx, %rbx + je .Lfrac + +.Luent:dec %rbx + mov (%rsi,%rbx,8), %rbp + neg %ecx + shr %cl, %rbp + neg %ecx + or %rbp, %rax + jmp .Lent + + .align 16, 0x90 +.Lutop:mov (%rsi,%rbx,8), %r10 + shl %cl, %rbp + neg %ecx + shr %cl, %r10 + neg %ecx + or %r10, %rbp + mul %r9 + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul %r8, %rdx + sub %rdx, %rbp + mov %r8, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp %r8, %rax + jae .Lufx +.Luok: mov %r13, (%rdi) + sub $8, %rdi +.Lent: mov (%rsi,%rbx,8), %rbp + dec %rbx + lea 1(%rax), %r11 + jns .Lutop + +.Luend:shl %cl, %rbp + mul %r9 + add %rbp, %rax + adc %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul %r8, %rdx + sub %rdx, %rbp + mov %r8, %rax + add %rbp, %rax + cmp %r11, %rbp + cmovc %rbp, %rax + adc $-1, %r13 + cmp %r8, %rax + jae .Lefx +.Leok: mov %r13, (%rdi) + sub $8, %rdi + jmp .Lfrac + +.Lufx: sub %r8, %rax + inc %r13 + jmp .Luok +.Lefx: sub %r8, %rax + inc %r13 + jmp .Leok + +.Lfrac:mov %r8, %rbp + neg %rbp + jmp .Lfent + + .align 16, 0x90 +.Lftop:mul %r9 + add %r11, %rdx + mov %rax, %r11 + mov %rdx, %r13 + imul %rbp, %rdx + mov %r8, %rax + add %rdx, %rax + cmp %r11, %rdx + cmovc %rdx, %rax + adc $-1, %r13 + mov %r13, (%rdi) + sub $8, %rdi +.Lfent:lea 1(%rax), %r11 + dec %r12 + jns .Lftop + + shr %cl, %rax +.Lret: pop %rbx + pop %rbp + pop %r12 + pop %r13 + + ret + .size __gmpn_divrem_1,.-__gmpn_divrem_1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/divrem_2.s b/vere/ext/gmp/gen/x86_64-linux/mpn/divrem_2.s new file mode 100644 index 0000000..b1c0d5b --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/divrem_2.s @@ -0,0 +1,208 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_divrem_2 + .type __gmpn_divrem_2,@function + +__gmpn_divrem_2: + + + + push %r15 + push %r14 + push %r13 + push %r12 + lea -24(%rdx,%rcx,8), %r12 + mov %rsi, %r13 + push %rbp + mov %rdi, %rbp + push %rbx + mov 8(%r8), %r11 + mov 16(%r12), %rbx + mov (%r8), %r8 + mov 8(%r12), %r10 + + xor %r15d, %r15d + cmp %rbx, %r11 + ja .L2 + setb %dl + cmp %r10, %r8 + 
setbe %al + orb %al, %dl + je .L2 + inc %r15d + sub %r8, %r10 + sbb %r11, %rbx +.L2: + lea -3(%rcx,%r13), %r14 + test %r14, %r14 + js .Lend + + push %r8 + push %r10 + push %r11 + mov %r11, %rdi + + + + call __gmpn_invert_limb@PLT + + + pop %r11 + pop %r10 + pop %r8 + + mov %r11, %rdx + mov %rax, %rdi + imul %rax, %rdx + mov %rdx, %r9 + mul %r8 + xor %ecx, %ecx + add %r8, %r9 + adc $-1, %rcx + add %rdx, %r9 + adc $0, %rcx + js 2f +1: dec %rdi + sub %r11, %r9 + sbb $0, %rcx + jns 1b +2: + + lea (%rbp,%r14,8), %rbp + mov %r11, %rsi + neg %rsi + + + + + .align 16, 0x90 +.Ltop: mov %rdi, %rax + mul %rbx + mov %r10, %rcx + add %rax, %rcx + adc %rbx, %rdx + mov %rdx, %r9 + imul %rsi, %rdx + mov %r8, %rax + lea (%rdx, %r10), %rbx + xor %r10d, %r10d + mul %r9 + cmp %r14, %r13 + jg .L19 + mov (%r12), %r10 + sub $8, %r12 +.L19: sub %r8, %r10 + sbb %r11, %rbx + sub %rax, %r10 + sbb %rdx, %rbx + xor %eax, %eax + xor %edx, %edx + cmp %rcx, %rbx + cmovnc %r8, %rax + cmovnc %r11, %rdx + adc $0, %r9 + nop + add %rax, %r10 + adc %rdx, %rbx + cmp %r11, %rbx + jae .Lfix +.Lbck: mov %r9, (%rbp) + sub $8, %rbp + dec %r14 + jns .Ltop + +.Lend: mov %r10, 8(%r12) + mov %rbx, 16(%r12) + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + mov %r15, %rax + pop %r15 + + ret + +.Lfix: seta %dl + cmp %r8, %r10 + setae %al + orb %dl, %al + je .Lbck + inc %r9 + sub %r8, %r10 + sbb %r11, %rbx + jmp .Lbck + .size __gmpn_divrem_2,.-__gmpn_divrem_2 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/fib_table.c b/vere/ext/gmp/gen/x86_64-linux/mpn/fib_table.c new file mode 100644 index 0000000..a830475 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/fib_table.c @@ -0,0 +1,107 @@ +/* This file generated by gen-fib.c - DO NOT EDIT. */ + +#include "gmp.h" +#include "gmp-impl.h" + +#if GMP_NUMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +const mp_limb_t +__gmp_fib_table[FIB_TABLE_LIMIT+2] = { + CNST_LIMB (0x1), /* -1 */ + CNST_LIMB (0x0), /* 0 */ + CNST_LIMB (0x1), /* 1 */ + CNST_LIMB (0x1), /* 2 */ + CNST_LIMB (0x2), /* 3 */ + CNST_LIMB (0x3), /* 4 */ + CNST_LIMB (0x5), /* 5 */ + CNST_LIMB (0x8), /* 6 */ + CNST_LIMB (0xd), /* 7 */ + CNST_LIMB (0x15), /* 8 */ + CNST_LIMB (0x22), /* 9 */ + CNST_LIMB (0x37), /* 10 */ + CNST_LIMB (0x59), /* 11 */ + CNST_LIMB (0x90), /* 12 */ + CNST_LIMB (0xe9), /* 13 */ + CNST_LIMB (0x179), /* 14 */ + CNST_LIMB (0x262), /* 15 */ + CNST_LIMB (0x3db), /* 16 */ + CNST_LIMB (0x63d), /* 17 */ + CNST_LIMB (0xa18), /* 18 */ + CNST_LIMB (0x1055), /* 19 */ + CNST_LIMB (0x1a6d), /* 20 */ + CNST_LIMB (0x2ac2), /* 21 */ + CNST_LIMB (0x452f), /* 22 */ + CNST_LIMB (0x6ff1), /* 23 */ + CNST_LIMB (0xb520), /* 24 */ + CNST_LIMB (0x12511), /* 25 */ + CNST_LIMB (0x1da31), /* 26 */ + CNST_LIMB (0x2ff42), /* 27 */ + CNST_LIMB (0x4d973), /* 28 */ + CNST_LIMB (0x7d8b5), /* 29 */ + CNST_LIMB (0xcb228), /* 30 */ + CNST_LIMB (0x148add), /* 31 */ + CNST_LIMB (0x213d05), /* 32 */ + CNST_LIMB (0x35c7e2), /* 33 */ + CNST_LIMB (0x5704e7), /* 34 */ + CNST_LIMB (0x8cccc9), /* 35 */ + CNST_LIMB (0xe3d1b0), /* 36 */ + CNST_LIMB (0x1709e79), /* 37 */ + CNST_LIMB (0x2547029), /* 38 */ + CNST_LIMB (0x3c50ea2), /* 39 */ + CNST_LIMB (0x6197ecb), /* 40 */ + CNST_LIMB (0x9de8d6d), /* 41 */ + CNST_LIMB (0xff80c38), /* 42 */ + CNST_LIMB (0x19d699a5), /* 43 */ + CNST_LIMB (0x29cea5dd), /* 44 */ + CNST_LIMB (0x43a53f82), /* 45 */ + CNST_LIMB (0x6d73e55f), /* 46 */ + CNST_LIMB (0xb11924e1), /* 47 */ + CNST_LIMB (0x11e8d0a40), /* 48 */ + CNST_LIMB (0x1cfa62f21), /* 49 */ + CNST_LIMB (0x2ee333961), /* 50 */ + CNST_LIMB 
(0x4bdd96882), /* 51 */ + CNST_LIMB (0x7ac0ca1e3), /* 52 */ + CNST_LIMB (0xc69e60a65), /* 53 */ + CNST_LIMB (0x1415f2ac48), /* 54 */ + CNST_LIMB (0x207fd8b6ad), /* 55 */ + CNST_LIMB (0x3495cb62f5), /* 56 */ + CNST_LIMB (0x5515a419a2), /* 57 */ + CNST_LIMB (0x89ab6f7c97), /* 58 */ + CNST_LIMB (0xdec1139639), /* 59 */ + CNST_LIMB (0x1686c8312d0), /* 60 */ + CNST_LIMB (0x2472d96a909), /* 61 */ + CNST_LIMB (0x3af9a19bbd9), /* 62 */ + CNST_LIMB (0x5f6c7b064e2), /* 63 */ + CNST_LIMB (0x9a661ca20bb), /* 64 */ + CNST_LIMB (0xf9d297a859d), /* 65 */ + CNST_LIMB (0x19438b44a658), /* 66 */ + CNST_LIMB (0x28e0b4bf2bf5), /* 67 */ + CNST_LIMB (0x42244003d24d), /* 68 */ + CNST_LIMB (0x6b04f4c2fe42), /* 69 */ + CNST_LIMB (0xad2934c6d08f), /* 70 */ + CNST_LIMB (0x1182e2989ced1), /* 71 */ + CNST_LIMB (0x1c5575e509f60), /* 72 */ + CNST_LIMB (0x2dd8587da6e31), /* 73 */ + CNST_LIMB (0x4a2dce62b0d91), /* 74 */ + CNST_LIMB (0x780626e057bc2), /* 75 */ + CNST_LIMB (0xc233f54308953), /* 76 */ + CNST_LIMB (0x13a3a1c2360515), /* 77 */ + CNST_LIMB (0x1fc6e116668e68), /* 78 */ + CNST_LIMB (0x336a82d89c937d), /* 79 */ + CNST_LIMB (0x533163ef0321e5), /* 80 */ + CNST_LIMB (0x869be6c79fb562), /* 81 */ + CNST_LIMB (0xd9cd4ab6a2d747), /* 82 */ + CNST_LIMB (0x16069317e428ca9), /* 83 */ + CNST_LIMB (0x23a367c34e563f0), /* 84 */ + CNST_LIMB (0x39a9fadb327f099), /* 85 */ + CNST_LIMB (0x5d4d629e80d5489), /* 86 */ + CNST_LIMB (0x96f75d79b354522), /* 87 */ + CNST_LIMB (0xf444c01834299ab), /* 88 */ + CNST_LIMB (0x18b3c1d91e77decd), /* 89 */ + CNST_LIMB (0x27f80ddaa1ba7878), /* 90 */ + CNST_LIMB (0x40abcfb3c0325745), /* 91 */ + CNST_LIMB (0x68a3dd8e61eccfbd), /* 92 */ + CNST_LIMB (0xa94fad42221f2702), /* 93 */ +}; diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s b/vere/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s new file mode 100644 index 0000000..cf35d25 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/gcd_11.s @@ -0,0 +1,256 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table + + + + + + + + + .text + .align 64, 0x90 + .globl __gmpn_gcd_11 + .type __gmpn_gcd_11,@function + 
+__gmpn_gcd_11: + + + mov ctz_table@GOTPCREL(%rip), %r8 + + + jmp .Lent + + .align 16, 0x90 +.Ltop: cmovc %rdx, %rdi + cmovc %rax, %rsi +.Lmid: and $127, %edx + movzbl (%r8,%rdx), %ecx + jz .Lshift_alot + shr %cl, %rdi +.Lent: mov %rdi, %rax + mov %rsi, %rdx + sub %rdi, %rdx + sub %rsi, %rdi + jnz .Ltop + +.Lend: + + + ret + +.Lshift_alot: + shr $7, %rdi + mov %rdi, %rdx + jmp .Lmid + .size __gmpn_gcd_11,.-__gmpn_gcd_11 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s b/vere/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s new file mode 100644 index 0000000..60f4c71 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/gcd_22.s @@ -0,0 +1,434 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .section .rodata + .align 64, 0x90 +ctz_table: + + .byte 8 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 7 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 6 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 5 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 4 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + .byte 3 + .byte 0 + .byte 1 + .byte 0 + .byte 2 + .byte 0 + .byte 1 + .byte 0 + + .size ctz_table,.-ctz_table + + + + + + + + + + + + + + + + + + .text + .align 64, 0x90 + .globl __gmpn_gcd_22 + .type 
__gmpn_gcd_22,@function + +__gmpn_gcd_22: + + + mov %rcx, %rax + + mov ctz_table@GOTPCREL(%rip), %r10 + + + + .align 16, 0x90 +.Ltop: mov %rax, %rcx + sub %rsi, %rcx + jz .Llowz + mov %rdx, %r11 + sbb %rdi, %r11 + + mov %rsi, %r8 + mov %rdi, %r9 + + sub %rax, %rsi + sbb %rdx, %rdi + +.Lbck: cmovc %rcx, %rsi + cmovc %r11, %rdi + cmovc %r8, %rax + cmovc %r9, %rdx + + and $255, %ecx + movzbl (%r10,%rcx), %ecx + jz .Lcount_better + +.Lshr: shr %cl, %rsi + mov %rdi, %r11 + shr %cl, %rdi + neg %rcx + shl %cl, %r11 + or %r11, %rsi + + test %rdx, %rdx + jnz .Ltop + test %rdi, %rdi + jnz .Ltop + +.Lgcd_11: + mov %rax, %rdi + + jmp __gmpn_gcd_11@PLT + + +.Lcount_better: + rep;bsf %rsi, %rcx + jmp .Lshr + +.Llowz: + + + mov %rdx, %rcx + sub %rdi, %rcx + je .Lend + + xor %r11, %r11 + mov %rsi, %r8 + mov %rdi, %r9 + mov %rdi, %rsi + xor %rdi, %rdi + sub %rdx, %rsi + jmp .Lbck + +.Lend: + + + ret + .size __gmpn_gcd_22,.-__gmpn_gcd_22 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/hamdist.s b/vere/ext/gmp/gen/x86_64-linux/mpn/hamdist.s new file mode 100644 index 0000000..1ab3a8c --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/hamdist.s @@ -0,0 +1,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_hamdist + .type __gmpn_hamdist,@function + +__gmpn_hamdist: + + + push %rbx + mov $0x5555555555555555, %r10 + push %rbp + mov $0x3333333333333333, %r11 + push %r12 + lea (%rdi,%rdx,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + lea (%rsi,%rdx,8), %rsi + neg %rdx + mov $0x0101010101010101, %r12 + xor %eax, %eax + test $1, %dl + jz .Ltop + + mov (%rdi,%rdx,8), %r8 + xor (%rsi,%rdx,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add %r8, %r9 + + dec %rdx + jmp .Lmid + + .align 16, 0x90 +.Ltop: mov (%rdi,%rdx,8), %r8 + mov 8(%rdi,%rdx,8), %rbx + xor (%rsi,%rdx,8), %r8 + xor 8(%rsi,%rdx,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov %r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %r12, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rdx + jnc .Ltop + +.Lend: + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_hamdist,.-__gmpn_hamdist diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/invert_limb.s b/vere/ext/gmp/gen/x86_64-linux/mpn/invert_limb.s new file mode 100644 index 0000000..d7352e7 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/invert_limb.s @@ -0,0 +1,123 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.hidden __gmpn_invert_limb_table + + + .text + .align 16, 0x90 + .globl __gmpn_invert_limb + .type __gmpn_invert_limb,@function + +__gmpn_invert_limb: + + + mov %rdi, %rax + shr $55, %rax + + lea -512+__gmpn_invert_limb_table(%rip), %r8 + + movzwl (%r8,%rax,2), %ecx + + + mov %rdi, %rsi + mov %ecx, %eax + imul %ecx, %ecx + shr $24, %rsi + inc %rsi + imul %rsi, %rcx + shr $40, %rcx + sal $11, %eax + dec %eax + sub %ecx, %eax + + + mov $0x1000000000000000, %rcx + imul %rax, %rsi + sub %rsi, %rcx + imul %rax, %rcx + sal $13, %rax + shr $47, %rcx + add %rax, %rcx + + + mov %rdi, %rsi + shr %rsi + 
sbb %rax, %rax + sub %rax, %rsi + imul %rcx, %rsi + and %rcx, %rax + shr %rax + sub %rsi, %rax + mul %rcx + sal $31, %rcx + shr %rdx + add %rdx, %rcx + + mov %rdi, %rax + mul %rcx + add %rdi, %rax + mov %rcx, %rax + adc %rdi, %rdx + sub %rdx, %rax + + + ret + .size __gmpn_invert_limb,.-__gmpn_invert_limb + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/invert_limb_table.s b/vere/ext/gmp/gen/x86_64-linux/mpn/invert_limb_table.s new file mode 100644 index 0000000..a990458 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/invert_limb_table.s @@ -0,0 +1,313 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.hidden __gmpn_invert_limb_table + + + + + .section .rodata + .align 2, 0x90 + .globl __gmpn_invert_limb_table +__gmpn_invert_limb_table: + .value 2045 + .value 2037 + .value 2029 + .value 2021 + .value 2013 + .value 2005 + .value 1998 + .value 1990 + .value 1983 + .value 1975 + .value 1968 + .value 1960 + .value 1953 + .value 1946 + .value 1938 + .value 1931 + .value 1924 + .value 1917 + .value 1910 + .value 1903 + .value 1896 + .value 1889 + .value 1883 + .value 1876 + .value 1869 + .value 1863 + .value 1856 + .value 1849 + .value 1843 + .value 1836 + .value 1830 + .value 1824 + .value 1817 + .value 1811 + .value 1805 + .value 1799 + .value 1792 + .value 1786 + .value 1780 + .value 1774 + .value 1768 + .value 1762 + .value 1756 + .value 1750 + .value 1745 + .value 1739 + .value 1733 + .value 1727 + .value 1722 + .value 1716 + .value 1710 + .value 1705 + .value 1699 + .value 1694 + .value 1688 + .value 1683 + .value 1677 + .value 1672 + .value 1667 + .value 1661 + .value 1656 + .value 1651 + .value 1646 + .value 1641 + .value 1636 + .value 1630 + .value 1625 + .value 1620 + .value 1615 + .value 1610 + .value 1605 + .value 1600 + .value 1596 + .value 1591 + .value 1586 + .value 1581 + .value 1576 + .value 1572 + .value 1567 + .value 1562 + .value 1558 + .value 1553 + .value 1548 + .value 1544 + .value 1539 + .value 1535 + .value 1530 + .value 1526 + .value 1521 + .value 1517 + .value 1513 + .value 1508 + .value 1504 + .value 1500 + .value 1495 + .value 1491 + .value 1487 + .value 1483 + .value 1478 + .value 1474 + .value 1470 + .value 1466 + .value 1462 + .value 1458 + .value 1454 + .value 1450 + .value 1446 + .value 1442 + .value 1438 + .value 1434 + .value 1430 + .value 1426 + .value 1422 + .value 1418 + .value 1414 + .value 1411 + .value 1407 + .value 1403 + .value 1399 + .value 1396 + .value 1392 + .value 1388 + .value 1384 + .value 1381 + .value 1377 + .value 1374 + .value 1370 + .value 1366 + .value 1363 + .value 1359 + .value 1356 + .value 1352 + .value 1349 + .value 1345 + .value 1342 + .value 1338 + .value 1335 + .value 1332 + .value 1328 + .value 1325 + .value 1322 + .value 1318 + .value 1315 + .value 1312 + .value 1308 + .value 1305 + .value 1302 + .value 1299 + .value 1295 + .value 1292 + .value 1289 + .value 1286 + .value 1283 + .value 1280 + .value 1276 + .value 1273 + .value 1270 + .value 1267 + .value 1264 + .value 1261 + .value 1258 + .value 1255 + .value 1252 + .value 1249 + .value 1246 + .value 1243 + .value 1240 + .value 1237 + .value 1234 + .value 1231 + .value 1228 + .value 1226 + .value 1223 + .value 1220 + .value 1217 + .value 1214 + .value 1211 + .value 1209 + .value 1206 + .value 1203 + .value 1200 + .value 1197 + .value 1195 + .value 1192 + .value 1189 + .value 1187 + .value 1184 + .value 1181 + .value 1179 + .value 1176 + .value 1173 + .value 1171 + .value 1168 + .value 1165 + .value 1163 + .value 1160 + .value 
1158 + .value 1155 + .value 1153 + .value 1150 + .value 1148 + .value 1145 + .value 1143 + .value 1140 + .value 1138 + .value 1135 + .value 1133 + .value 1130 + .value 1128 + .value 1125 + .value 1123 + .value 1121 + .value 1118 + .value 1116 + .value 1113 + .value 1111 + .value 1109 + .value 1106 + .value 1104 + .value 1102 + .value 1099 + .value 1097 + .value 1095 + .value 1092 + .value 1090 + .value 1088 + .value 1086 + .value 1083 + .value 1081 + .value 1079 + .value 1077 + .value 1074 + .value 1072 + .value 1070 + .value 1068 + .value 1066 + .value 1064 + .value 1061 + .value 1059 + .value 1057 + .value 1055 + .value 1053 + .value 1051 + .value 1049 + .value 1047 + .value 1044 + .value 1042 + .value 1040 + .value 1038 + .value 1036 + .value 1034 + .value 1032 + .value 1030 + .value 1028 + .value 1026 + .value 1024 + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/ior_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/ior_n.s new file mode 100644 index 0000000..6509f28 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/ior_n.s @@ -0,0 +1,149 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_ior_n + .type __gmpn_ior_n,@function + +__gmpn_ior_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + ret + .size __gmpn_ior_n,.-__gmpn_ior_n + + + + + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s new file mode 100644 index 0000000..b199ca3 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/iorn_n.s @@ -0,0 +1,154 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_iorn_n + .type __gmpn_iorn_n,@function + +__gmpn_iorn_n: + + + mov (%rdx), %r8 + not %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov (%rdx,%rcx,8), %r8 + not %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + not %r9 + or (%rsi,%rcx,8), %r8 + or 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 + not %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + not %r9 + or 16(%rsi,%rcx,8), %r8 + or 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + 
ret + .size __gmpn_iorn_n,.-__gmpn_iorn_n + + + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/jacobitab.h b/vere/ext/gmp/gen/x86_64-linux/mpn/jacobitab.h new file mode 100644 index 0000000..4bdbfcc --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/jacobitab.h @@ -0,0 +1,13 @@ + 0, 0, 0, 0, 0,12, 8, 4, 1, 1, 1, 1, 1,13, 9, 5, + 2, 2, 2, 2, 2, 6,10,14, 3, 3, 3, 3, 3, 7,11,15, + 4,16, 6,18, 4, 0,12, 8, 5,17, 7,19, 5, 1,13, 9, + 6,18, 4,16, 6,10,14, 2, 7,19, 5,17, 7,11,15, 3, + 8,10, 9,11, 8, 4, 0,12, 9,11, 8,10, 9, 5, 1,13, +10, 9,11, 8,10,14, 2, 6,11, 8,10, 9,11,15, 3, 7, +12,22,24,20,12, 8, 4, 0,13,23,25,21,13, 9, 5, 1, +25,21,13,23,14, 2, 6,10,24,20,12,22,15, 3, 7,11, +16, 6,18, 4,16,16,16,16,17, 7,19, 5,17,17,17,17, +18, 4,16, 6,18,22,19,23,19, 5,17, 7,19,23,18,22, +20,12,22,24,20,20,20,20,21,13,23,25,21,21,21,21, +22,24,20,12,22,19,23,18,23,25,21,13,23,18,22,19, +24,20,12,22,15, 3, 7,11,25,21,13,23,14, 2, 6,10, diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/lshift.s b/vere/ext/gmp/gen/x86_64-linux/mpn/lshift.s new file mode 100644 index 0000000..89e9566 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/lshift.s @@ -0,0 +1,186 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_lshift + .type __gmpn_lshift,@function + +__gmpn_lshift: + + + neg %ecx + mov -8(%rsi,%rdx,8), %rax + shr %cl, %rax + + neg %ecx + lea 1(%rdx), %r8d + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend + .align 16, 0x90 +.Ltop: + + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 + + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + mov %r10, (%rdi) + + ret + .size __gmpn_lshift,.-__gmpn_lshift diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s b/vere/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s new file mode 100644 index 0000000..6809940 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/lshiftc.s @@ -0,0 +1,197 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_lshiftc + .type __gmpn_lshiftc,@function + +__gmpn_lshiftc: + + + neg %ecx + mov -8(%rsi,%rdx,8), %rax + shr %cl, %rax + + neg %ecx + lea 1(%rdx), %r8d + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + shr %cl, %r8 + or %r8, %r10 + not %r10 + mov %r10, -8(%rdi,%rdx,8) + dec %rdx + neg %ecx +.L1x: + cmp $1, %rdx + je .Last + mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -24(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, -8(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + sub $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov -8(%rsi,%rdx,8), %r10 + shl %cl, %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r11 + + sub $4, %rdx + jb .Lend + .align 16, 0x90 +.Ltop: + + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + or %r8, %r10 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 24(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + + mov 0(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shr %cl, %r8 + shr %cl, %r9 + + + neg %ecx + mov 8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shl %cl, %r10 + or %r10, %r8 + shl %cl, %r11 + or %r11, %r9 + not %r8 + not %r9 + mov %r8, 8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov -8(%rsi,%rdx,8), %r10 + mov -16(%rsi,%rdx,8), %r11 + shl %cl, %r10 + shl %cl, %r11 + + sub $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov 8(%rsi), %r8 + shr %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shr %cl, %r9 + or %r9, %r11 + not %r10 + not %r11 + mov %r10, 16(%rdi) + mov %r11, 8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shl %cl, %r10 + not %r10 + mov %r10, (%rdi) + + ret + .size __gmpn_lshiftc,.-__gmpn_lshiftc diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_1.s new file mode 100644 index 0000000..a8e3198 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_1.s @@ -0,0 +1,241 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mod_1_1p + .type __gmpn_mod_1_1p,@function + +__gmpn_mod_1_1p: + + + push %rbp + push %rbx + mov %rdx, %rbx + mov %rcx, %r8 + + mov -8(%rdi, %rsi, 8), %rax + cmp $3, %rsi + jnc .Lfirst + mov -16(%rdi, %rsi, 8), %rbp + jmp .Lreduce_two + +.Lfirst: + + mov 24(%r8), %r11 + mul %r11 + mov -24(%rdi, %rsi, 8), %rbp + add %rax, %rbp + mov -16(%rdi, %rsi, 8), %rax + adc %rdx, %rax + sbb %rcx, %rcx + sub $4, %rsi + jc .Lreduce_three + + mov %r11, %r10 + sub %rbx, %r10 + + .align 16, 0x90 +.Ltop: and %r11, %rcx + lea (%r10, %rbp), %r9 + mul %r11 + add %rbp, %rcx + mov (%rdi, %rsi, 8), %rbp + cmovc %r9, %rcx + add %rax, %rbp + mov %rcx, %rax + adc %rdx, %rax + sbb %rcx, %rcx + sub $1, %rsi + jnc .Ltop + +.Lreduce_three: + + and %rbx, %rcx + sub %rcx, %rax + +.Lreduce_two: + mov 8(%r8), %ecx + test %ecx, %ecx + jz .Lnormalized + + + mulq 16(%r8) + xor %r9, %r9 + add %rax, %rbp + adc %rdx, %r9 + mov %r9, %rax + + + + shld %cl, %rbp, %rax + + shl %cl, %rbp + jmp .Ludiv + +.Lnormalized: + mov %rax, %r9 + sub %rbx, 
%r9 + cmovnc %r9, %rax + +.Ludiv: + lea 1(%rax), %r9 + mulq (%r8) + add %rbp, %rax + adc %r9, %rdx + imul %rbx, %rdx + sub %rdx, %rbp + cmp %rbp, %rax + lea (%rbx, %rbp), %rax + cmovnc %rbp, %rax + cmp %rbx, %rax + jnc .Lfix +.Lok: shr %cl, %rax + + pop %rbx + pop %rbp + + ret +.Lfix: sub %rbx, %rax + jmp .Lok + .size __gmpn_mod_1_1p,.-__gmpn_mod_1_1p + + .align 16, 0x90 + .globl __gmpn_mod_1_1p_cps + .type __gmpn_mod_1_1p_cps,@function + +__gmpn_mod_1_1p_cps: + + + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, %ecx + mov %rsi, %r12 + mov %ecx, %ebp + sal %cl, %r12 + mov %r12, %rdi + + + + call __gmpn_invert_limb@PLT + + + neg %r12 + mov %r12, %r8 + mov %rax, (%rbx) + mov %rbp, 8(%rbx) + imul %rax, %r12 + mov %r12, 24(%rbx) + mov %ebp, %ecx + test %ecx, %ecx + jz .Lz + + mov $1, %edx + + shld %cl, %rax, %rdx + + imul %rdx, %r8 + shr %cl, %r8 + mov %r8, 16(%rbx) +.Lz: + pop %r12 + pop %rbx + pop %rbp + + ret + .size __gmpn_mod_1_1p_cps,.-__gmpn_mod_1_1p_cps + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_2.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_2.s new file mode 100644 index 0000000..1a19107 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_2.s @@ -0,0 +1,252 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mod_1s_2p + .type __gmpn_mod_1s_2p,@function + +__gmpn_mod_1s_2p: + + + push %r14 + test $1, %sil + mov %rdx, %r14 + push %r13 + mov %rcx, %r13 + push %r12 + push %rbp + push %rbx + mov 16(%rcx), %r10 + mov 24(%rcx), %rbx + mov 32(%rcx), %rbp + je .Lb0 + dec %rsi + je .Lone + mov -8(%rdi,%rsi,8), %rax + mul %r10 + mov %rax, %r9 + mov %rdx, %r8 + mov (%rdi,%rsi,8), %rax + add -16(%rdi,%rsi,8), %r9 + adc $0, %r8 + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + jmp .L11 + +.Lb0: mov -8(%rdi,%rsi,8), %r8 + mov -16(%rdi,%rsi,8), %r9 + +.L11: sub $4, %rsi + jb .Led2 + lea 40(%rdi,%rsi,8), %rdi + mov -40(%rdi), %r11 + mov -32(%rdi), %rax + jmp .Lm0 + + .align 16, 0x90 +.Ltop: mov -24(%rdi), %r9 + add %rax, %r11 + mov -16(%rdi), %rax + adc %rdx, %r12 + mul %r10 + add %rax, %r9 + mov %r11, %rax + mov %rdx, %r8 + adc $0, %r8 + mul %rbx + add %rax, %r9 + mov %r12, %rax + adc %rdx, %r8 + mul %rbp + sub $2, %rsi + jb .Led1 + mov -40(%rdi), %r11 + add %rax, %r9 + mov -32(%rdi), %rax + adc %rdx, %r8 +.Lm0: mul %r10 + add %rax, %r11 + mov %r9, %rax + mov %rdx, %r12 + adc $0, %r12 + mul %rbx + add %rax, %r11 + lea -32(%rdi), %rdi + mov %r8, %rax + adc %rdx, %r12 + mul %rbp + sub $2, %rsi + jae .Ltop + +.Led0: mov %r11, %r9 + mov %r12, %r8 +.Led1: add %rax, %r9 + adc %rdx, %r8 +.Led2: mov 8(%r13), %edi + mov %r8, %rax + mov %r9, %r8 + mul %r10 + add %rax, %r8 + adc $0, %rdx +.L1: xor %ecx, %ecx + mov %r8, %r9 + sub %edi, %ecx + shr %cl, %r9 + mov %edi, %ecx + sal %cl, %rdx + or %rdx, %r9 + sal %cl, %r8 + mov %r9, %rax + mulq (%r13) + mov %rax, %rsi + inc %r9 + add %r8, %rsi + adc %r9, %rdx + imul %r14, %rdx + sub %rdx, %r8 + lea (%r8,%r14), %rax + cmp %r8, %rsi + cmovc %rax, %r8 + mov %r8, %rax + sub %r14, %rax + cmovc %r8, %rax + mov %edi, %ecx + shr %cl, %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + + ret +.Lone: + mov (%rdi), %r8 + mov 8(%rcx), %edi + xor %rdx, %rdx + jmp .L1 + .size __gmpn_mod_1s_2p,.-__gmpn_mod_1s_2p + + .align 16, 0x90 + .globl __gmpn_mod_1s_2p_cps + .type __gmpn_mod_1s_2p_cps,@function + +__gmpn_mod_1s_2p_cps: + + + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, %ecx + mov 
%rsi, %r12 + mov %ecx, %ebp + sal %cl, %r12 + mov %r12, %rdi + + + + call __gmpn_invert_limb@PLT + + + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) + mov %rbp, 8(%rbx) + neg %r8 + mov %ebp, %ecx + mov $1, %esi + + shld %cl, %rax, %rsi + + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 16(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 24(%rbx) + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr %cl, %r12 + mov %r12, 32(%rbx) + + pop %r12 + pop %rbx + pop %rbp + + ret + .size __gmpn_mod_1s_2p_cps,.-__gmpn_mod_1s_2p_cps diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_4.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_4.s new file mode 100644 index 0000000..491753d --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_1_4.s @@ -0,0 +1,283 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mod_1s_4p + .type __gmpn_mod_1s_4p,@function + +__gmpn_mod_1s_4p: + + + push %r15 + push %r14 + push %r13 + push %r12 + push %rbp + push %rbx + + mov %rdx, %r15 + mov %rcx, %r14 + mov 16(%rcx), %r11 + mov 24(%rcx), %rbx + mov 32(%rcx), %rbp + mov 40(%rcx), %r13 + mov 48(%rcx), %r12 + xor %r8d, %r8d + mov %esi, %edx + and $3, %edx + je .Lb0 + cmp $2, %edx + jc .Lb1 + je .Lb2 + +.Lb3: lea -24(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + jmp .Lm0 + + .align 8, 0x90 +.Lb0: lea -32(%rdi,%rsi,8), %rdi + mov 8(%rdi), %rax + mul %r11 + mov (%rdi), %r9 + add %rax, %r9 + adc %rdx, %r8 + mov 16(%rdi), %rax + mul %rbx + add %rax, %r9 + adc %rdx, %r8 + mov 24(%rdi), %rax + mul %rbp + jmp .Lm0 + + .align 8, 0x90 +.Lb1: lea -8(%rdi,%rsi,8), %rdi + mov (%rdi), %r9 + jmp .Lm1 + + .align 8, 0x90 +.Lb2: lea -16(%rdi,%rsi,8), %rdi + mov 8(%rdi), %r8 + mov (%rdi), %r9 + jmp .Lm1 + + .align 16, 0x90 +.Ltop: mov -24(%rdi), %rax + mov -32(%rdi), %r10 + mul %r11 + add %rax, %r10 + mov -16(%rdi), %rax + mov $0, %ecx + adc %rdx, %rcx + mul %rbx + add %rax, %r10 + mov -8(%rdi), %rax + adc %rdx, %rcx + sub $32, %rdi + mul %rbp + add %rax, %r10 + mov %r13, %rax + adc %rdx, %rcx + mul %r9 + add %rax, %r10 + mov %r12, %rax + adc %rdx, %rcx + mul %r8 + mov %r10, %r9 + mov %rcx, %r8 +.Lm0: add %rax, %r9 + adc %rdx, %r8 +.Lm1: sub $4, %rsi + ja .Ltop + +.Lend: mov 8(%r14), %esi + mov %r8, %rax + mul %r11 + mov %rax, %r8 + add %r9, %r8 + adc $0, %rdx + xor %ecx, %ecx + sub %esi, %ecx + mov %r8, %rdi + shr %cl, %rdi + mov %esi, %ecx + sal %cl, %rdx + or %rdx, %rdi + mov %rdi, %rax + mulq (%r14) + mov %r15, %rbx + mov %rax, %r9 + sal %cl, %r8 + inc %rdi + add %r8, %r9 + adc %rdi, %rdx + imul %rbx, %rdx + sub %rdx, %r8 + lea (%r8,%rbx), %rax + cmp %r8, %r9 + cmovc %rax, %r8 + mov %r8, %rax + sub %rbx, %rax + cmovc %r8, %rax + shr %cl, %rax + pop %rbx + pop %rbp + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + ret + .size __gmpn_mod_1s_4p,.-__gmpn_mod_1s_4p + + .align 16, 0x90 + .globl __gmpn_mod_1s_4p_cps + .type __gmpn_mod_1s_4p_cps,@function + +__gmpn_mod_1s_4p_cps: + + + push %rbp + bsr %rsi, %rcx + push %rbx + mov %rdi, %rbx + push %r12 + xor $63, %ecx + mov %rsi, %r12 + mov %ecx, %ebp + sal %cl, %r12 + mov %r12, %rdi + + + + call __gmpn_invert_limb@PLT + + + mov %r12, %r8 + mov %rax, %r11 + mov %rax, (%rbx) + mov %rbp, 8(%rbx) + neg %r8 + mov %ebp, 
%ecx + mov $1, %esi + + shld %cl, %rax, %rsi + + imul %r8, %rsi + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 16(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 24(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 32(%rbx) + + not %rdx + imul %r12, %rdx + lea (%rdx,%r12), %rsi + cmp %rdx, %rax + cmovnc %rdx, %rsi + mov %r11, %rax + mul %rsi + + add %rsi, %rdx + shr %cl, %rsi + mov %rsi, 40(%rbx) + + not %rdx + imul %r12, %rdx + add %rdx, %r12 + cmp %rdx, %rax + cmovnc %rdx, %r12 + + shr %cl, %r12 + mov %r12, 48(%rbx) + + pop %r12 + pop %rbx + pop %rbp + + ret + .size __gmpn_mod_1s_4p_cps,.-__gmpn_mod_1s_4p_cps diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mod_34lsub1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_34lsub1.s new file mode 100644 index 0000000..e2a2ebb --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mod_34lsub1.s @@ -0,0 +1,228 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_mod_34lsub1 + .type __gmpn_mod_34lsub1,@function + +__gmpn_mod_34lsub1: + + + + mov $0x0000FFFFFFFFFFFF, %r11 + + mov (%rdi), %rax + + cmp $2, %rsi + ja .Lgt2 + + jb .Lone + + mov 8(%rdi), %rsi + mov %rax, %rdx + shr $48, %rax + + and %r11, %rdx + add %rdx, %rax + mov %esi, %edx + + shr $32, %rsi + add %rsi, %rax + + shl $16, %rdx + add %rdx, %rax +.Lone: + ret + + + + + +.Lgt2: mov 8(%rdi), %rcx + mov 16(%rdi), %rdx + xor %r9, %r9 + add $24, %rdi + sub $12, %rsi + jc .Lend + .align 16, 0x90 +.Ltop: + add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add 24(%rdi), %rax + adc 32(%rdi), %rcx + adc 40(%rdi), %rdx + adc $0, %r9 + add 48(%rdi), %rax + adc 56(%rdi), %rcx + adc 64(%rdi), %rdx + adc $0, %r9 + add $72, %rdi + sub $9, %rsi + jnc .Ltop + +.Lend: + lea .Ltab(%rip), %r8 + movslq 36(%r8,%rsi,4), %r10 + add %r10, %r8 + jmp *%r8 + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L0-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L4-.Ltab + .long .L5-.Ltab + .long .L6-.Ltab + .long .L7-.Ltab + .long .L8-.Ltab + .text + +.L6: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +.L3: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + jmp .Lcj1 + +.L7: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +.L4: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +.L1: add (%rdi), %rax + adc $0, %rcx + jmp .Lcj2 + +.L8: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +.L5: add (%rdi), %rax + adc 8(%rdi), %rcx + adc 16(%rdi), %rdx + adc $0, %r9 + add $24, %rdi +.L2: add (%rdi), %rax + adc 8(%rdi), %rcx + +.Lcj2: adc $0, %rdx +.Lcj1: adc $0, %r9 +.L0: add %r9, %rax + adc $0, %rcx + adc $0, %rdx + adc $0, %rax + + mov %rax, %rdi + shr $48, %rax + + and %r11, %rdi + mov %ecx, %r10d + + shr $32, %rcx + + add %rdi, %rax + movzwl %dx, %edi + shl $16, %r10 + + add %rcx, %rax + shr $16, %rdx + + add %r10, %rax + shl $32, %rdi + + add %rdx, %rax + add %rdi, %rax + + + ret + .size __gmpn_mod_34lsub1,.-__gmpn_mod_34lsub1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mode1o.s 
b/vere/ext/gmp/gen/x86_64-linux/mpn/mode1o.s new file mode 100644 index 0000000..bff06a3 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mode1o.s @@ -0,0 +1,189 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_modexact_1_odd + .type __gmpn_modexact_1_odd,@function + +__gmpn_modexact_1_odd: + + + mov $0, %ecx + + + .globl __gmpn_modexact_1c_odd + .type __gmpn_modexact_1c_odd,@function + +__gmpn_modexact_1c_odd: + + +.Lent: + + + + + + mov %rdx, %r8 + shr %edx + + mov __gmp_binvert_limb_table@GOTPCREL(%rip), %r9 + + + + and $127, %edx + mov %rcx, %r10 + + movzbl (%r9,%rdx), %edx + + mov (%rdi), %rax + lea (%rdi,%rsi,8), %r11 + mov %r8, %rdi + + lea (%rdx,%rdx), %ecx + imul %edx, %edx + + neg %rsi + + imul %edi, %edx + + sub %edx, %ecx + + lea (%rcx,%rcx), %edx + imul %ecx, %ecx + + imul %edi, %ecx + + sub %ecx, %edx + xor %ecx, %ecx + + lea (%rdx,%rdx), %r9 + imul %rdx, %rdx + + imul %r8, %rdx + + sub %rdx, %r9 + mov %r10, %rdx + + + + inc %rsi + jz .Lone + + + .align 16, 0x90 +.Ltop: + + + + + + + + + + sub %rdx, %rax + + adc $0, %rcx + imul %r9, %rax + + mul %r8 + + mov (%r11,%rsi,8), %rax + sub %rcx, %rax + setc %cl + + inc %rsi + jnz .Ltop + + +.Lone: + sub %rdx, %rax + + adc $0, %rcx + imul %r9, %rax + + mul %r8 + + lea (%rcx,%rdx), %rax + + ret + + .size __gmpn_modexact_1c_odd,.-__gmpn_modexact_1c_odd + .size __gmpn_modexact_1_odd,.-__gmpn_modexact_1_odd diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mp_bases.c b/vere/ext/gmp/gen/x86_64-linux/mpn/mp_bases.c new file mode 100644 index 0000000..c72c531 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mp_bases.c @@ -0,0 +1,268 @@ +/* This file generated by gen-bases.c - DO NOT EDIT. 
*/ + +#include "gmp-impl.h" + +#if GMP_NUMB_BITS != 64 +Error, error, this data is for 64 bits +#endif + +const struct bases mp_bases[257] = +{ + /* 0 */ { 0, 0, 0, 0, 0 }, + /* 1 */ { 0, 0, 0, 0, 0 }, + /* 2 */ { 64, CNST_LIMB(0xffffffffffffffff), CNST_LIMB(0x1fffffffffffffff), CNST_LIMB(0x1), CNST_LIMB(0x0) }, + /* 3 */ { 40, CNST_LIMB(0xa1849cc1a9a9e94e), CNST_LIMB(0x32b803473f7ad0f3), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 4 */ { 32, CNST_LIMB(0x7fffffffffffffff), CNST_LIMB(0x3fffffffffffffff), CNST_LIMB(0x2), CNST_LIMB(0x0) }, + /* 5 */ { 27, CNST_LIMB(0x6e40d1a4143dcb94), CNST_LIMB(0x4a4d3c25e68dc57f), CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90) }, + /* 6 */ { 24, CNST_LIMB(0x6308c91b702a7cf4), CNST_LIMB(0x52b803473f7ad0f3), CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295) }, + /* 7 */ { 22, CNST_LIMB(0x5b3064eb3aa6d388), CNST_LIMB(0x59d5d9fd5010b366), CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b) }, + /* 8 */ { 21, CNST_LIMB(0x5555555555555555), CNST_LIMB(0x5fffffffffffffff), CNST_LIMB(0x3), CNST_LIMB(0x0) }, + /* 9 */ { 20, CNST_LIMB(0x50c24e60d4d4f4a7), CNST_LIMB(0x6570068e7ef5a1e7), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 10 */ { 19, CNST_LIMB(0x4d104d427de7fbcc), CNST_LIMB(0x6a4d3c25e68dc57f), CNST_LIMB(0x8ac7230489e80000), CNST_LIMB(0xd83c94fb6d2ac34a) }, + /* 11 */ { 18, CNST_LIMB(0x4a00270775914e88), CNST_LIMB(0x6eb3a9f01975077f), CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b) }, + /* 12 */ { 17, CNST_LIMB(0x4768ce0d05818e12), CNST_LIMB(0x72b803473f7ad0f3), CNST_LIMB(0x1eca170c00000000), CNST_LIMB(0xa10c2bec5da8f8f) }, + /* 13 */ { 17, CNST_LIMB(0x452e53e365907bda), CNST_LIMB(0x766a008e4788cbcd), CNST_LIMB(0x780c7372621bd74d), CNST_LIMB(0x10f4becafe412ec3) }, + /* 14 */ { 16, CNST_LIMB(0x433cfffb4b5aae55), CNST_LIMB(0x79d5d9fd5010b366), CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86) }, + /* 15 */ { 16, CNST_LIMB(0x41867711b4f85355), CNST_LIMB(0x7d053f6d26089673), CNST_LIMB(0x5b27ac993df97701), CNST_LIMB(0x6779c7f90dc42f48) }, + /* 16 */ { 16, CNST_LIMB(0x3fffffffffffffff), CNST_LIMB(0x7fffffffffffffff), CNST_LIMB(0x4), CNST_LIMB(0x0) }, + /* 17 */ { 15, CNST_LIMB(0x3ea16afd58b10966), CNST_LIMB(0x82cc7edf592262cf), CNST_LIMB(0x27b95e997e21d9f1), CNST_LIMB(0x9c71e11bab279323) }, + /* 18 */ { 15, CNST_LIMB(0x3d64598d154dc4de), CNST_LIMB(0x8570068e7ef5a1e7), CNST_LIMB(0x5da0e1e53c5c8000), CNST_LIMB(0x5dfaa697ec6f6a1c) }, + /* 19 */ { 15, CNST_LIMB(0x3c43c23018bb5563), CNST_LIMB(0x87ef05ae409a0288), CNST_LIMB(0xd2ae3299c1c4aedb), CNST_LIMB(0x3711783f6be7e9ec) }, + /* 20 */ { 14, CNST_LIMB(0x3b3b9a42873069c7), CNST_LIMB(0x8a4d3c25e68dc57f), CNST_LIMB(0x16bcc41e90000000), CNST_LIMB(0x6849b86a12b9b01e) }, + /* 21 */ { 14, CNST_LIMB(0x3a4898f06cf41ac9), CNST_LIMB(0x8c8ddd448f8b845a), CNST_LIMB(0x2d04b7fdd9c0ef49), CNST_LIMB(0x6bf097ba5ca5e239) }, + /* 22 */ { 14, CNST_LIMB(0x39680b13582e7c18), CNST_LIMB(0x8eb3a9f01975077f), CNST_LIMB(0x5658597bcaa24000), CNST_LIMB(0x7b8015c8d7af8f08) }, + /* 23 */ { 14, CNST_LIMB(0x3897b2b751ae561a), CNST_LIMB(0x90c10500d63aa658), CNST_LIMB(0xa0e2073737609371), CNST_LIMB(0x975a24b3a3151b38) }, + /* 24 */ { 13, CNST_LIMB(0x37d5aed131f19c98), CNST_LIMB(0x92b803473f7ad0f3), CNST_LIMB(0xc29e98000000000), CNST_LIMB(0x50bd367972689db1) }, + /* 25 */ { 13, CNST_LIMB(0x372068d20a1ee5ca), CNST_LIMB(0x949a784bcd1b8afe), CNST_LIMB(0x14adf4b7320334b9), CNST_LIMB(0x8c240c4aecb13bb5) }, + /* 26 */ { 13, CNST_LIMB(0x3676867e5d60de29), 
CNST_LIMB(0x966a008e4788cbcd), CNST_LIMB(0x226ed36478bfa000), CNST_LIMB(0xdbd2e56854e118c9) }, + /* 27 */ { 13, CNST_LIMB(0x35d6deeb388df86f), CNST_LIMB(0x982809d5be7072db), CNST_LIMB(0x383d9170b85ff80b), CNST_LIMB(0x2351ffcaa9c7c4ae) }, + /* 28 */ { 13, CNST_LIMB(0x354071d61c77fa2e), CNST_LIMB(0x99d5d9fd5010b366), CNST_LIMB(0x5a3c23e39c000000), CNST_LIMB(0x6b24188ca33b0636) }, + /* 29 */ { 13, CNST_LIMB(0x34b260c5671b18ac), CNST_LIMB(0x9b74948f5532da4b), CNST_LIMB(0x8e65137388122bcd), CNST_LIMB(0xcc3dceaf2b8ba99d) }, + /* 30 */ { 13, CNST_LIMB(0x342be986572b45cc), CNST_LIMB(0x9d053f6d26089673), CNST_LIMB(0xdd41bb36d259e000), CNST_LIMB(0x2832e835c6c7d6b6) }, + /* 31 */ { 12, CNST_LIMB(0x33ac61b998fbbdf2), CNST_LIMB(0x9e88c6b3626a72aa), CNST_LIMB(0xaee5720ee830681), CNST_LIMB(0x76b6aa272e1873c5) }, + /* 32 */ { 12, CNST_LIMB(0x3333333333333333), CNST_LIMB(0x9fffffffffffffff), CNST_LIMB(0x5), CNST_LIMB(0x0) }, + /* 33 */ { 12, CNST_LIMB(0x32bfd90114c12861), CNST_LIMB(0xa16bad3758efd873), CNST_LIMB(0x172588ad4f5f0981), CNST_LIMB(0x61eaf5d402c7bf4f) }, + /* 34 */ { 12, CNST_LIMB(0x3251dcf6169e45f2), CNST_LIMB(0xa2cc7edf592262cf), CNST_LIMB(0x211e44f7d02c1000), CNST_LIMB(0xeeb658123ffb27ec) }, + /* 35 */ { 12, CNST_LIMB(0x31e8d59f180dc630), CNST_LIMB(0xa4231623369e78e5), CNST_LIMB(0x2ee56725f06e5c71), CNST_LIMB(0x5d5e3762e6fdf509) }, + /* 36 */ { 12, CNST_LIMB(0x3184648db8153e7a), CNST_LIMB(0xa570068e7ef5a1e7), CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295) }, + /* 37 */ { 12, CNST_LIMB(0x312434e89c35dacd), CNST_LIMB(0xa6b3d78b6d3b24fb), CNST_LIMB(0x5b5b57f8a98a5dd1), CNST_LIMB(0x66ae7831762efb6f) }, + /* 38 */ { 12, CNST_LIMB(0x30c7fa349460a541), CNST_LIMB(0xa7ef05ae409a0288), CNST_LIMB(0x7dcff8986ea31000), CNST_LIMB(0x47388865a00f544) }, + /* 39 */ { 12, CNST_LIMB(0x306f6f4c8432bc6d), CNST_LIMB(0xa92203d587039cc1), CNST_LIMB(0xabd4211662a6b2a1), CNST_LIMB(0x7d673c33a123b54c) }, + /* 40 */ { 12, CNST_LIMB(0x301a557ffbfdd252), CNST_LIMB(0xaa4d3c25e68dc57f), CNST_LIMB(0xe8d4a51000000000), CNST_LIMB(0x19799812dea11197) }, + /* 41 */ { 11, CNST_LIMB(0x2fc873d1fda55f3b), CNST_LIMB(0xab7110e6ce866f2b), CNST_LIMB(0x7a32956ad081b79), CNST_LIMB(0xc27e62e0686feae) }, + /* 42 */ { 11, CNST_LIMB(0x2f799652a4e6dc49), CNST_LIMB(0xac8ddd448f8b845a), CNST_LIMB(0x9f49aaff0e86800), CNST_LIMB(0x9b6e7507064ce7c7) }, + /* 43 */ { 11, CNST_LIMB(0x2f2d8d8f64460aad), CNST_LIMB(0xada3f5fb9c415052), CNST_LIMB(0xce583bb812d37b3), CNST_LIMB(0x3d9ac2bf66cfed94) }, + /* 44 */ { 11, CNST_LIMB(0x2ee42e164e8f53a4), CNST_LIMB(0xaeb3a9f01975077f), CNST_LIMB(0x109b79a654c00000), CNST_LIMB(0xed46bc50ce59712a) }, + /* 45 */ { 11, CNST_LIMB(0x2e9d500984041dbd), CNST_LIMB(0xafbd42b465836767), CNST_LIMB(0x1543beff214c8b95), CNST_LIMB(0x813d97e2c89b8d46) }, + /* 46 */ { 11, CNST_LIMB(0x2e58cec05a6a8144), CNST_LIMB(0xb0c10500d63aa658), CNST_LIMB(0x1b149a79459a3800), CNST_LIMB(0x2e81751956af8083) }, + /* 47 */ { 11, CNST_LIMB(0x2e1688743ef9104c), CNST_LIMB(0xb1bf311e95d00de3), CNST_LIMB(0x224edfb5434a830f), CNST_LIMB(0xdd8e0a95e30c0988) }, + /* 48 */ { 11, CNST_LIMB(0x2dd65df7a583598f), CNST_LIMB(0xb2b803473f7ad0f3), CNST_LIMB(0x2b3fb00000000000), CNST_LIMB(0x7ad4dd48a0b5b167) }, + /* 49 */ { 11, CNST_LIMB(0x2d9832759d5369c4), CNST_LIMB(0xb3abb3faa02166cc), CNST_LIMB(0x3642798750226111), CNST_LIMB(0x2df495ccaa57147b) }, + /* 50 */ { 11, CNST_LIMB(0x2d5beb38dcd1394c), CNST_LIMB(0xb49a784bcd1b8afe), CNST_LIMB(0x43c33c1937564800), CNST_LIMB(0xe392010175ee5962) }, + /* 51 */ { 11, CNST_LIMB(0x2d216f7943e2ba6a), 
CNST_LIMB(0xb5848226989d33c3), CNST_LIMB(0x54411b2441c3cd8b), CNST_LIMB(0x84eaf11b2fe7738e) }, + /* 52 */ { 11, CNST_LIMB(0x2ce8a82efbb3ff2c), CNST_LIMB(0xb66a008e4788cbcd), CNST_LIMB(0x6851455acd400000), CNST_LIMB(0x3a1e3971e008995d) }, + /* 53 */ { 11, CNST_LIMB(0x2cb17fea7ad7e332), CNST_LIMB(0xb74b1fd64e0753c6), CNST_LIMB(0x80a23b117c8feb6d), CNST_LIMB(0xfd7a462344ffce25) }, + /* 54 */ { 11, CNST_LIMB(0x2c7be2b0cfa1ba50), CNST_LIMB(0xb82809d5be7072db), CNST_LIMB(0x9dff7d32d5dc1800), CNST_LIMB(0x9eca40b40ebcef8a) }, + /* 55 */ { 11, CNST_LIMB(0x2c47bddba92d7463), CNST_LIMB(0xb900e6160002ccfe), CNST_LIMB(0xc155af6faeffe6a7), CNST_LIMB(0x52fa161a4a48e43d) }, + /* 56 */ { 11, CNST_LIMB(0x2c14fffcaa8b131e), CNST_LIMB(0xb9d5d9fd5010b366), CNST_LIMB(0xebb7392e00000000), CNST_LIMB(0x1607a2cbacf930c1) }, + /* 57 */ { 10, CNST_LIMB(0x2be398c3a38be053), CNST_LIMB(0xbaa708f58014d37c), CNST_LIMB(0x50633659656d971), CNST_LIMB(0x97a014f8e3be55f1) }, + /* 58 */ { 10, CNST_LIMB(0x2bb378e758451068), CNST_LIMB(0xbb74948f5532da4b), CNST_LIMB(0x5fa8624c7fba400), CNST_LIMB(0x568df8b76cbf212c) }, + /* 59 */ { 10, CNST_LIMB(0x2b8492108be5e5f7), CNST_LIMB(0xbc3e9ca2e1a05533), CNST_LIMB(0x717d9faa73c5679), CNST_LIMB(0x20ba7c4b4e6ef492) }, + /* 60 */ { 10, CNST_LIMB(0x2b56d6c70d55481b), CNST_LIMB(0xbd053f6d26089673), CNST_LIMB(0x86430aac6100000), CNST_LIMB(0xe81ee46b9ef492f5) }, + /* 61 */ { 10, CNST_LIMB(0x2b2a3a608c72ddd5), CNST_LIMB(0xbdc899ab3ff56c5e), CNST_LIMB(0x9e64d9944b57f29), CNST_LIMB(0x9dc0d10d51940416) }, + /* 62 */ { 10, CNST_LIMB(0x2afeb0f1060c7e41), CNST_LIMB(0xbe88c6b3626a72aa), CNST_LIMB(0xba5ca5392cb0400), CNST_LIMB(0x5fa8ed2f450272a5) }, + /* 63 */ { 10, CNST_LIMB(0x2ad42f3c9aca595c), CNST_LIMB(0xbf45e08bcf06554e), CNST_LIMB(0xdab2ce1d022cd81), CNST_LIMB(0x2ba9eb8c5e04e641) }, + /* 64 */ { 10, CNST_LIMB(0x2aaaaaaaaaaaaaaa), CNST_LIMB(0xbfffffffffffffff), CNST_LIMB(0x6), CNST_LIMB(0x0) }, + /* 65 */ { 10, CNST_LIMB(0x2a82193a13425883), CNST_LIMB(0xc0b73cb42e16914c), CNST_LIMB(0x12aeed5fd3e2d281), CNST_LIMB(0xb67759cc00287bf1) }, + /* 66 */ { 10, CNST_LIMB(0x2a5a717672f66450), CNST_LIMB(0xc16bad3758efd873), CNST_LIMB(0x15c3da1572d50400), CNST_LIMB(0x78621feeb7f4ed33) }, + /* 67 */ { 10, CNST_LIMB(0x2a33aa6e56d9c71c), CNST_LIMB(0xc21d6713f453f356), CNST_LIMB(0x194c05534f75ee29), CNST_LIMB(0x43d55b5f72943bc0) }, + /* 68 */ { 10, CNST_LIMB(0x2a0dbbaa3bdfcea4), CNST_LIMB(0xc2cc7edf592262cf), CNST_LIMB(0x1d56299ada100000), CNST_LIMB(0x173decb64d1d4409) }, + /* 69 */ { 10, CNST_LIMB(0x29e89d244eb4bfaf), CNST_LIMB(0xc379084815b5774c), CNST_LIMB(0x21f2a089a4ff4f79), CNST_LIMB(0xe29fb54fd6b6074f) }, + /* 70 */ { 10, CNST_LIMB(0x29c44740d7db51e6), CNST_LIMB(0xc4231623369e78e5), CNST_LIMB(0x2733896c68d9a400), CNST_LIMB(0xa1f1f5c210d54e62) }, + /* 71 */ { 10, CNST_LIMB(0x29a0b2c743b14d74), CNST_LIMB(0xc4caba789e2b8687), CNST_LIMB(0x2d2cf2c33b533c71), CNST_LIMB(0x6aac7f9bfafd57b2) }, + /* 72 */ { 10, CNST_LIMB(0x297dd8dbb7c22a2d), CNST_LIMB(0xc570068e7ef5a1e7), CNST_LIMB(0x33f506e440000000), CNST_LIMB(0x3b563c2478b72ee2) }, + /* 73 */ { 10, CNST_LIMB(0x295bb2f9285c8c1b), CNST_LIMB(0xc6130af40bc0ecbf), CNST_LIMB(0x3ba43bec1d062211), CNST_LIMB(0x12b536b574e92d1b) }, + /* 74 */ { 10, CNST_LIMB(0x293a3aebe2be1c92), CNST_LIMB(0xc6b3d78b6d3b24fb), CNST_LIMB(0x4455872d8fd4e400), CNST_LIMB(0xdf86c03020404fa5) }, + /* 75 */ { 10, CNST_LIMB(0x29196acc815ebd9f), CNST_LIMB(0xc7527b930c965bf2), CNST_LIMB(0x4e2694539f2f6c59), CNST_LIMB(0xa34adf02234eea8e) }, + /* 76 */ { 10, CNST_LIMB(0x28f93cfb40f5c22a), 
CNST_LIMB(0xc7ef05ae409a0288), CNST_LIMB(0x5938006c18900000), CNST_LIMB(0x6f46eb8574eb59dd) }, + /* 77 */ { 10, CNST_LIMB(0x28d9ac1badc64117), CNST_LIMB(0xc88983ed6985bae5), CNST_LIMB(0x65ad9912474aa649), CNST_LIMB(0x42459b481df47cec) }, + /* 78 */ { 10, CNST_LIMB(0x28bab310a196b478), CNST_LIMB(0xc92203d587039cc1), CNST_LIMB(0x73ae9ff4241ec400), CNST_LIMB(0x1b424b95d80ca505) }, + /* 79 */ { 10, CNST_LIMB(0x289c4cf88b774469), CNST_LIMB(0xc9b892675266f66c), CNST_LIMB(0x836612ee9c4ce1e1), CNST_LIMB(0xf2c1b982203a0dac) }, + /* 80 */ { 10, CNST_LIMB(0x287e7529fb244e91), CNST_LIMB(0xca4d3c25e68dc57f), CNST_LIMB(0x9502f90000000000), CNST_LIMB(0xb7cdfd9d7bdbab7d) }, + /* 81 */ { 10, CNST_LIMB(0x286127306a6a7a53), CNST_LIMB(0xcae00d1cfdeb43cf), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 82 */ { 10, CNST_LIMB(0x28445ec93f792b1e), CNST_LIMB(0xcb7110e6ce866f2b), CNST_LIMB(0xbebf59a07dab4400), CNST_LIMB(0x57931eeaf85cf64f) }, + /* 83 */ { 10, CNST_LIMB(0x282817e1038950fa), CNST_LIMB(0xcc0052b18b0e2a19), CNST_LIMB(0xd7540d4093bc3109), CNST_LIMB(0x305a944507c82f47) }, + /* 84 */ { 10, CNST_LIMB(0x280c4e90c9ab1f45), CNST_LIMB(0xcc8ddd448f8b845a), CNST_LIMB(0xf2b96616f1900000), CNST_LIMB(0xe007ccc9c22781a) }, + /* 85 */ { 9, CNST_LIMB(0x27f0ff1bc1ee87cd), CNST_LIMB(0xcd19bb053fb0284e), CNST_LIMB(0x336de62af2bca35), CNST_LIMB(0x3e92c42e000eeed4) }, + /* 86 */ { 9, CNST_LIMB(0x27d625ecf571c340), CNST_LIMB(0xcda3f5fb9c415052), CNST_LIMB(0x39235ec33d49600), CNST_LIMB(0x1ebe59130db2795e) }, + /* 87 */ { 9, CNST_LIMB(0x27bbbf95282fcd45), CNST_LIMB(0xce2c97d694adab3f), CNST_LIMB(0x3f674e539585a17), CNST_LIMB(0x268859e90f51b89) }, + /* 88 */ { 9, CNST_LIMB(0x27a1c8c8ddaf84da), CNST_LIMB(0xceb3a9f01975077f), CNST_LIMB(0x4645b6958000000), CNST_LIMB(0xd24cde0463108cfa) }, + /* 89 */ { 9, CNST_LIMB(0x27883e5e7df3f518), CNST_LIMB(0xcf393550f3aa6906), CNST_LIMB(0x4dcb74afbc49c19), CNST_LIMB(0xa536009f37adc383) }, + /* 90 */ { 9, CNST_LIMB(0x276f1d4c9847e90e), CNST_LIMB(0xcfbd42b465836767), CNST_LIMB(0x56064e1d18d9a00), CNST_LIMB(0x7cea06ce1c9ace10) }, + /* 91 */ { 9, CNST_LIMB(0x275662a841b30191), CNST_LIMB(0xd03fda8b97997f33), CNST_LIMB(0x5f04fe2cd8a39fb), CNST_LIMB(0x58db032e72e8ba43) }, + /* 92 */ { 9, CNST_LIMB(0x273e0ba38d15a47b), CNST_LIMB(0xd0c10500d63aa658), CNST_LIMB(0x68d74421f5c0000), CNST_LIMB(0x388cc17cae105447) }, + /* 93 */ { 9, CNST_LIMB(0x2726158c1b13cf03), CNST_LIMB(0xd140c9faa1e5439e), CNST_LIMB(0x738df1f6ab4827d), CNST_LIMB(0x1b92672857620ce0) }, + /* 94 */ { 9, CNST_LIMB(0x270e7dc9c01d8e9b), CNST_LIMB(0xd1bf311e95d00de3), CNST_LIMB(0x7f3afbc9cfb5e00), CNST_LIMB(0x18c6a9575c2ade4) }, + /* 95 */ { 9, CNST_LIMB(0x26f741dd3f070d61), CNST_LIMB(0xd23c41d42727c808), CNST_LIMB(0x8bf187fba88f35f), CNST_LIMB(0xd44da7da8e44b24f) }, + /* 96 */ { 9, CNST_LIMB(0x26e05f5f16c2159e), CNST_LIMB(0xd2b803473f7ad0f3), CNST_LIMB(0x99c600000000000), CNST_LIMB(0xaa2f78f1b4cc6794) }, + /* 97 */ { 9, CNST_LIMB(0x26c9d3fe61e80598), CNST_LIMB(0xd3327c6ab49ca6c8), CNST_LIMB(0xa8ce21eb6531361), CNST_LIMB(0x843c067d091ee4cc) }, + /* 98 */ { 9, CNST_LIMB(0x26b39d7fc6ddab08), CNST_LIMB(0xd3abb3faa02166cc), CNST_LIMB(0xb92112c1a0b6200), CNST_LIMB(0x62005e1e913356e3) }, + /* 99 */ { 9, CNST_LIMB(0x269db9bc7772a5cc), CNST_LIMB(0xd423b07e986aa967), CNST_LIMB(0xcad7718b8747c43), CNST_LIMB(0x4316eed01dedd518) }, + /* 100 */ { 9, CNST_LIMB(0x268826a13ef3fde6), CNST_LIMB(0xd49a784bcd1b8afe), CNST_LIMB(0xde0b6b3a7640000), CNST_LIMB(0x2725dd1d243aba0e) }, + /* 101 */ { 9, CNST_LIMB(0x2672e22d9dbdbd9f), 
CNST_LIMB(0xd510118708a8f8dd), CNST_LIMB(0xf2d8cf5fe6d74c5), CNST_LIMB(0xddd9057c24cb54f) }, + /* 102 */ { 9, CNST_LIMB(0x265dea72f169cc99), CNST_LIMB(0xd5848226989d33c3), CNST_LIMB(0x1095d25bfa712600), CNST_LIMB(0xedeee175a736d2a1) }, + /* 103 */ { 9, CNST_LIMB(0x26493d93a8cb2514), CNST_LIMB(0xd5f7cff41e09aeb8), CNST_LIMB(0x121b7c4c3698faa7), CNST_LIMB(0xc4699f3df8b6b328) }, + /* 104 */ { 9, CNST_LIMB(0x2634d9c282f3ef82), CNST_LIMB(0xd66a008e4788cbcd), CNST_LIMB(0x13c09e8d68000000), CNST_LIMB(0x9ebbe7d859cb5a7c) }, + /* 105 */ { 9, CNST_LIMB(0x2620bd41d8933adc), CNST_LIMB(0xd6db196a761949d9), CNST_LIMB(0x15876ccb0b709ca9), CNST_LIMB(0x7c828b9887eb2179) }, + /* 106 */ { 9, CNST_LIMB(0x260ce662ef04088a), CNST_LIMB(0xd74b1fd64e0753c6), CNST_LIMB(0x17723c2976da2a00), CNST_LIMB(0x5d652ab99001adcf) }, + /* 107 */ { 9, CNST_LIMB(0x25f95385547353fd), CNST_LIMB(0xd7ba18f93502e409), CNST_LIMB(0x198384e9c259048b), CNST_LIMB(0x4114f1754e5d7b32) }, + /* 108 */ { 9, CNST_LIMB(0x25e60316448db8e1), CNST_LIMB(0xd82809d5be7072db), CNST_LIMB(0x1bbde41dfeec0000), CNST_LIMB(0x274b7c902f7e0188) }, + /* 109 */ { 9, CNST_LIMB(0x25d2f390152f74f5), CNST_LIMB(0xd894f74b06ef8b40), CNST_LIMB(0x1e241d6e3337910d), CNST_LIMB(0xfc9e0fbb32e210c) }, + /* 110 */ { 9, CNST_LIMB(0x25c02379aa9ad043), CNST_LIMB(0xd900e6160002ccfe), CNST_LIMB(0x20b91cee9901ee00), CNST_LIMB(0xf4afa3e594f8ea1f) }, + /* 111 */ { 9, CNST_LIMB(0x25ad9165f2c18907), CNST_LIMB(0xd96bdad2acb5f5ef), CNST_LIMB(0x237ff9079863dfef), CNST_LIMB(0xcd85c32e9e4437b0) }, + /* 112 */ { 9, CNST_LIMB(0x259b3bf36735c90c), CNST_LIMB(0xd9d5d9fd5010b366), CNST_LIMB(0x267bf47000000000), CNST_LIMB(0xa9bbb147e0dd92a8) }, + /* 113 */ { 9, CNST_LIMB(0x258921cb955e7693), CNST_LIMB(0xda3ee7f38e181ed0), CNST_LIMB(0x29b08039fbeda7f1), CNST_LIMB(0x8900447b70e8eb82) }, + /* 114 */ { 9, CNST_LIMB(0x257741a2ac9170af), CNST_LIMB(0xdaa708f58014d37c), CNST_LIMB(0x2d213df34f65f200), CNST_LIMB(0x6b0a92adaad5848a) }, + /* 115 */ { 9, CNST_LIMB(0x25659a3711bc827d), CNST_LIMB(0xdb0e4126bcc86bd7), CNST_LIMB(0x30d201d957a7c2d3), CNST_LIMB(0x4f990ad8740f0ee5) }, + /* 116 */ { 9, CNST_LIMB(0x25542a50f84b9c39), CNST_LIMB(0xdb74948f5532da4b), CNST_LIMB(0x34c6d52160f40000), CNST_LIMB(0x3670a9663a8d3610) }, + /* 117 */ { 9, CNST_LIMB(0x2542f0c20000377d), CNST_LIMB(0xdbda071cc67e6db5), CNST_LIMB(0x3903f855d8f4c755), CNST_LIMB(0x1f5c44188057be3c) }, + /* 118 */ { 9, CNST_LIMB(0x2531ec64d772bd64), CNST_LIMB(0xdc3e9ca2e1a05533), CNST_LIMB(0x3d8de5c8ec59b600), CNST_LIMB(0xa2bea956c4e4977) }, + /* 119 */ { 9, CNST_LIMB(0x25211c1ce2fb5a6e), CNST_LIMB(0xdca258dca9331635), CNST_LIMB(0x4269541d1ff01337), CNST_LIMB(0xed68b23033c3637e) }, + /* 120 */ { 9, CNST_LIMB(0x25107ed5e7c3ec3b), CNST_LIMB(0xdd053f6d26089673), CNST_LIMB(0x479b38e478000000), CNST_LIMB(0xc99cf624e50549c5) }, + /* 121 */ { 9, CNST_LIMB(0x25001383bac8a744), CNST_LIMB(0xdd6753e032ea0efe), CNST_LIMB(0x4d28cb56c33fa539), CNST_LIMB(0xa8adf7ae45e7577b) }, + /* 122 */ { 9, CNST_LIMB(0x24efd921f390bce3), CNST_LIMB(0xddc899ab3ff56c5e), CNST_LIMB(0x5317871fa13aba00), CNST_LIMB(0x8a5bc740b1c113e5) }, + /* 123 */ { 9, CNST_LIMB(0x24dfceb3a26bb203), CNST_LIMB(0xde29142e0e01401f), CNST_LIMB(0x596d2f44de9fa71b), CNST_LIMB(0x6e6c7efb81cfbb9b) }, + /* 124 */ { 9, CNST_LIMB(0x24cff3430a0341a7), CNST_LIMB(0xde88c6b3626a72aa), CNST_LIMB(0x602fd125c47c0000), CNST_LIMB(0x54aba5c5cada5f10) }, + /* 125 */ { 9, CNST_LIMB(0x24c045e15c149931), CNST_LIMB(0xdee7b471b3a9507d), CNST_LIMB(0x6765c793fa10079d), CNST_LIMB(0x3ce9a36f23c0fc90) }, + /* 126 */ { 9, 
CNST_LIMB(0x24b0c5a679267ae2), CNST_LIMB(0xdf45e08bcf06554e), CNST_LIMB(0x6f15be069b847e00), CNST_LIMB(0x26fb43de2c8cd2a8) }, + /* 127 */ { 9, CNST_LIMB(0x24a171b0b31461c8), CNST_LIMB(0xdfa34e1177c23362), CNST_LIMB(0x7746b3e82a77047f), CNST_LIMB(0x12b94793db8486a1) }, + /* 128 */ { 9, CNST_LIMB(0x2492492492492492), CNST_LIMB(0xdfffffffffffffff), CNST_LIMB(0x7), CNST_LIMB(0x0) }, + /* 129 */ { 9, CNST_LIMB(0x24834b2c9d85cdfe), CNST_LIMB(0xe05bf942dbbc2145), CNST_LIMB(0x894953f7ea890481), CNST_LIMB(0xdd5deca404c0156d) }, + /* 130 */ { 9, CNST_LIMB(0x247476f924137501), CNST_LIMB(0xe0b73cb42e16914c), CNST_LIMB(0x932abffea4848200), CNST_LIMB(0xbd51373330291de0) }, + /* 131 */ { 9, CNST_LIMB(0x2465cbc00a40cec0), CNST_LIMB(0xe111cd1d5133412e), CNST_LIMB(0x9dacb687d3d6a163), CNST_LIMB(0x9fa4025d66f23085) }, + /* 132 */ { 9, CNST_LIMB(0x245748bc980e0427), CNST_LIMB(0xe16bad3758efd873), CNST_LIMB(0xa8d8102a44840000), CNST_LIMB(0x842530ee2db4949d) }, + /* 133 */ { 9, CNST_LIMB(0x2448ed2f49eb0633), CNST_LIMB(0xe1c4dfab90aab5ef), CNST_LIMB(0xb4b60f9d140541e5), CNST_LIMB(0x6aa7f2766b03dc25) }, + /* 134 */ { 9, CNST_LIMB(0x243ab85da36e3167), CNST_LIMB(0xe21d6713f453f356), CNST_LIMB(0xc15065d4856e4600), CNST_LIMB(0x53035ba7ebf32e8d) }, + /* 135 */ { 9, CNST_LIMB(0x242ca99203ea8c18), CNST_LIMB(0xe27545fba4fe385a), CNST_LIMB(0xceb1363f396d23c7), CNST_LIMB(0x3d12091fc9fb4914) }, + /* 136 */ { 9, CNST_LIMB(0x241ec01b7cce4ea0), CNST_LIMB(0xe2cc7edf592262cf), CNST_LIMB(0xdce31b2488000000), CNST_LIMB(0x28b1cb81b1ef1849) }, + /* 137 */ { 9, CNST_LIMB(0x2410fb4da9b3b0fc), CNST_LIMB(0xe323142dc8c66b55), CNST_LIMB(0xebf12a24bca135c9), CNST_LIMB(0x15c35be67ae3e2c9) }, + /* 138 */ { 9, CNST_LIMB(0x24035a808a0f315e), CNST_LIMB(0xe379084815b5774c), CNST_LIMB(0xfbe6f8dbf88f4a00), CNST_LIMB(0x42a17bd09be1ff0) }, + /* 139 */ { 8, CNST_LIMB(0x23f5dd105c67ab9d), CNST_LIMB(0xe3ce5d822ff4b643), CNST_LIMB(0x1ef156c084ce761), CNST_LIMB(0x8bf461f03cf0bbf) }, + /* 140 */ { 8, CNST_LIMB(0x23e8825d7b05abb1), CNST_LIMB(0xe4231623369e78e5), CNST_LIMB(0x20c4e3b94a10000), CNST_LIMB(0xf3fbb43f68a32d05) }, + /* 141 */ { 8, CNST_LIMB(0x23db49cc3a0866fe), CNST_LIMB(0xe4773465d54aded7), CNST_LIMB(0x22b0695a08ba421), CNST_LIMB(0xd84f44c48564dc19) }, + /* 142 */ { 8, CNST_LIMB(0x23ce32c4c6cfb9f5), CNST_LIMB(0xe4caba789e2b8687), CNST_LIMB(0x24b4f35d7a4c100), CNST_LIMB(0xbe58ebcce7956abe) }, + /* 143 */ { 8, CNST_LIMB(0x23c13cb308ab6ab7), CNST_LIMB(0xe51daa7e60fdd34c), CNST_LIMB(0x26d397284975781), CNST_LIMB(0xa5fac463c7c134b7) }, + /* 144 */ { 8, CNST_LIMB(0x23b4670682c0c709), CNST_LIMB(0xe570068e7ef5a1e7), CNST_LIMB(0x290d74100000000), CNST_LIMB(0x8f19241e28c7d757) }, + /* 145 */ { 8, CNST_LIMB(0x23a7b13237187c8b), CNST_LIMB(0xe5c1d0b53bc09fca), CNST_LIMB(0x2b63b3a37866081), CNST_LIMB(0x799a6d046c0ae1ae) }, + /* 146 */ { 8, CNST_LIMB(0x239b1aac8ac74728), CNST_LIMB(0xe6130af40bc0ecbf), CNST_LIMB(0x2dd789f4d894100), CNST_LIMB(0x6566e37d746a9e40) }, + /* 147 */ { 8, CNST_LIMB(0x238ea2ef2b24c379), CNST_LIMB(0xe663b741df9c37c0), CNST_LIMB(0x306a35e51b58721), CNST_LIMB(0x526887dbfb5f788f) }, + /* 148 */ { 8, CNST_LIMB(0x23824976f4045a26), CNST_LIMB(0xe6b3d78b6d3b24fb), CNST_LIMB(0x331d01712e10000), CNST_LIMB(0x408af3382b8efd3d) }, + /* 149 */ { 8, CNST_LIMB(0x23760dc3d6e4d729), CNST_LIMB(0xe7036db376537b90), CNST_LIMB(0x35f14200a827c61), CNST_LIMB(0x2fbb374806ec05f1) }, + /* 150 */ { 8, CNST_LIMB(0x2369ef58c30bd43e), CNST_LIMB(0xe7527b930c965bf2), CNST_LIMB(0x38e858b62216100), CNST_LIMB(0x1fe7c0f0afce87fe) }, + /* 151 */ { 8, 
CNST_LIMB(0x235dedbb8e82aa1c), CNST_LIMB(0xe7a102f9d39a9331), CNST_LIMB(0x3c03b2c13176a41), CNST_LIMB(0x11003d517540d32e) }, + /* 152 */ { 8, CNST_LIMB(0x23520874dfeb1ffd), CNST_LIMB(0xe7ef05ae409a0288), CNST_LIMB(0x3f44c9b21000000), CNST_LIMB(0x2f5810f98eff0dc) }, + /* 153 */ { 8, CNST_LIMB(0x23463f1019228dd7), CNST_LIMB(0xe83c856dd81804b7), CNST_LIMB(0x42ad23cef3113c1), CNST_LIMB(0xeb72e35e7840d910) }, + /* 154 */ { 8, CNST_LIMB(0x233a911b42aa9b3c), CNST_LIMB(0xe88983ed6985bae5), CNST_LIMB(0x463e546b19a2100), CNST_LIMB(0xd27de19593dc3614) }, + /* 155 */ { 8, CNST_LIMB(0x232efe26f7cf33f9), CNST_LIMB(0xe8d602d948f83829), CNST_LIMB(0x49f9fc3f96684e1), CNST_LIMB(0xbaf391fd3e5e6fc2) }, + /* 156 */ { 8, CNST_LIMB(0x232385c65381b485), CNST_LIMB(0xe92203d587039cc1), CNST_LIMB(0x4de1c9c5dc10000), CNST_LIMB(0xa4bd38c55228c81d) }, + /* 157 */ { 8, CNST_LIMB(0x2318278edde1b39b), CNST_LIMB(0xe96d887e26cd57b7), CNST_LIMB(0x51f77994116d2a1), CNST_LIMB(0x8fc5a8de8e1de782) }, + /* 158 */ { 8, CNST_LIMB(0x230ce3187a6c2be9), CNST_LIMB(0xe9b892675266f66c), CNST_LIMB(0x563cd6bb3398100), CNST_LIMB(0x7bf9265bea9d3a3b) }, + /* 159 */ { 8, CNST_LIMB(0x2301b7fd56ca21bb), CNST_LIMB(0xea03231d8d8224ba), CNST_LIMB(0x5ab3bb270beeb01), CNST_LIMB(0x69454b325983dccd) }, + /* 160 */ { 8, CNST_LIMB(0x22f6a5d9da38341c), CNST_LIMB(0xea4d3c25e68dc57f), CNST_LIMB(0x5f5e10000000000), CNST_LIMB(0x5798ee2308c39df9) }, + /* 161 */ { 8, CNST_LIMB(0x22ebac4c9580d89f), CNST_LIMB(0xea96defe264b59be), CNST_LIMB(0x643dce0ec16f501), CNST_LIMB(0x46e40ba0fa66a753) }, + /* 162 */ { 8, CNST_LIMB(0x22e0caf633834beb), CNST_LIMB(0xeae00d1cfdeb43cf), CNST_LIMB(0x6954fe21e3e8100), CNST_LIMB(0x3717b0870b0db3a7) }, + /* 163 */ { 8, CNST_LIMB(0x22d601796a418886), CNST_LIMB(0xeb28c7f233bdd372), CNST_LIMB(0x6ea5b9755f440a1), CNST_LIMB(0x2825e6775d11cdeb) }, + /* 164 */ { 8, CNST_LIMB(0x22cb4f7aec6fd8b4), CNST_LIMB(0xeb7110e6ce866f2b), CNST_LIMB(0x74322a1c0410000), CNST_LIMB(0x1a01a1c09d1b4dac) }, + /* 165 */ { 8, CNST_LIMB(0x22c0b4a15b80d83e), CNST_LIMB(0xebb8e95d3f7d9df2), CNST_LIMB(0x79fc8b6ae8a46e1), CNST_LIMB(0xc9eb0a8bebc8f3e) }, + /* 166 */ { 8, CNST_LIMB(0x22b630953a28f77a), CNST_LIMB(0xec0052b18b0e2a19), CNST_LIMB(0x80072a66d512100), CNST_LIMB(0xffe357ff59e6a004) }, + /* 167 */ { 8, CNST_LIMB(0x22abc300df54ca7c), CNST_LIMB(0xec474e39705912d2), CNST_LIMB(0x86546633b42b9c1), CNST_LIMB(0xe7dfd1be05fa61a8) }, + /* 168 */ { 8, CNST_LIMB(0x22a16b90698da5d2), CNST_LIMB(0xec8ddd448f8b845a), CNST_LIMB(0x8ce6b0861000000), CNST_LIMB(0xd11ed6fc78f760e5) }, + /* 169 */ { 8, CNST_LIMB(0x229729f1b2c83ded), CNST_LIMB(0xecd4011c8f11979a), CNST_LIMB(0x93c08e16a022441), CNST_LIMB(0xbb8db609dd29ebfe) }, + /* 170 */ { 8, CNST_LIMB(0x228cfdd444992f78), CNST_LIMB(0xed19bb053fb0284e), CNST_LIMB(0x9ae49717f026100), CNST_LIMB(0xa71aec8d1813d532) }, + /* 171 */ { 8, CNST_LIMB(0x2282e6e94ccb8588), CNST_LIMB(0xed5f0c3cbf8fa470), CNST_LIMB(0xa25577ae24c1a61), CNST_LIMB(0x93b612a9f20fbc02) }, + /* 172 */ { 8, CNST_LIMB(0x2278e4e392557ecf), CNST_LIMB(0xeda3f5fb9c415052), CNST_LIMB(0xaa15f068e610000), CNST_LIMB(0x814fc7b19a67d317) }, + /* 173 */ { 8, CNST_LIMB(0x226ef7776aa7fd29), CNST_LIMB(0xede87974f3c81855), CNST_LIMB(0xb228d6bf7577921), CNST_LIMB(0x6fd9a03f2e0a4b7c) }, + /* 174 */ { 8, CNST_LIMB(0x22651e5aaf5532d0), CNST_LIMB(0xee2c97d694adab3f), CNST_LIMB(0xba91158ef5c4100), CNST_LIMB(0x5f4615a38d0d316e) }, + /* 175 */ { 8, CNST_LIMB(0x225b5944b40b4694), CNST_LIMB(0xee7052491d2c3e64), CNST_LIMB(0xc351ad9aec0b681), CNST_LIMB(0x4f8876863479a286) }, + /* 176 */ { 8, 
CNST_LIMB(0x2251a7ee3cdfcca5), CNST_LIMB(0xeeb3a9f01975077f), CNST_LIMB(0xcc6db6100000000), CNST_LIMB(0x4094d8a3041b60eb) }, + /* 177 */ { 8, CNST_LIMB(0x22480a1174e913d9), CNST_LIMB(0xeef69fea211b2627), CNST_LIMB(0xd5e85d09025c181), CNST_LIMB(0x32600b8ed883a09b) }, + /* 178 */ { 8, CNST_LIMB(0x223e7f69e522683c), CNST_LIMB(0xef393550f3aa6906), CNST_LIMB(0xdfc4e816401c100), CNST_LIMB(0x24df8c6eb4b6d1f1) }, + /* 179 */ { 8, CNST_LIMB(0x223507b46b988abe), CNST_LIMB(0xef7b6b399471103e), CNST_LIMB(0xea06b4c72947221), CNST_LIMB(0x18097a8ee151acef) }, + /* 180 */ { 8, CNST_LIMB(0x222ba2af32dbbb9e), CNST_LIMB(0xefbd42b465836767), CNST_LIMB(0xf4b139365210000), CNST_LIMB(0xbd48cc8ec1cd8e3) }, + /* 181 */ { 8, CNST_LIMB(0x22225019a9b4d16c), CNST_LIMB(0xeffebccd41ffcd5c), CNST_LIMB(0xffc80497d520961), CNST_LIMB(0x3807a8d67485fb) }, + /* 182 */ { 8, CNST_LIMB(0x22190fb47b1af172), CNST_LIMB(0xf03fda8b97997f33), CNST_LIMB(0x10b4ebfca1dee100), CNST_LIMB(0xea5768860b62e8d8) }, + /* 183 */ { 8, CNST_LIMB(0x220fe14186679801), CNST_LIMB(0xf0809cf27f703d52), CNST_LIMB(0x117492de921fc141), CNST_LIMB(0xd54faf5b635c5005) }, + /* 184 */ { 8, CNST_LIMB(0x2206c483d7c6b786), CNST_LIMB(0xf0c10500d63aa658), CNST_LIMB(0x123bb2ce41000000), CNST_LIMB(0xc14a56233a377926) }, + /* 185 */ { 8, CNST_LIMB(0x21fdb93fa0e0ccc5), CNST_LIMB(0xf10113b153c8ea7b), CNST_LIMB(0x130a8b6157bdecc1), CNST_LIMB(0xae39a88db7cd329f) }, + /* 186 */ { 8, CNST_LIMB(0x21f4bf3a31bcdcaa), CNST_LIMB(0xf140c9faa1e5439e), CNST_LIMB(0x13e15dede0e8a100), CNST_LIMB(0x9c10bde69efa7ab6) }, + /* 187 */ { 8, CNST_LIMB(0x21ebd639f1d86584), CNST_LIMB(0xf18028cf72976a4e), CNST_LIMB(0x14c06d941c0ca7e1), CNST_LIMB(0x8ac36c42a2836497) }, + /* 188 */ { 8, CNST_LIMB(0x21e2fe06597361a6), CNST_LIMB(0xf1bf311e95d00de3), CNST_LIMB(0x15a7ff487a810000), CNST_LIMB(0x7a463c8b84f5ef67) }, + /* 189 */ { 8, CNST_LIMB(0x21da3667eb0e8ccb), CNST_LIMB(0xf1fde3d30e812642), CNST_LIMB(0x169859ddc5c697a1), CNST_LIMB(0x6a8e5f5ad090fd4b) }, + /* 190 */ { 8, CNST_LIMB(0x21d17f282d1a300e), CNST_LIMB(0xf23c41d42727c808), CNST_LIMB(0x1791c60f6fed0100), CNST_LIMB(0x5b91a2943596fc56) }, + /* 191 */ { 8, CNST_LIMB(0x21c8d811a3d3c9e1), CNST_LIMB(0xf27a4c0585cbf805), CNST_LIMB(0x18948e8c0e6fba01), CNST_LIMB(0x4d4667b1c468e8f0) }, + /* 192 */ { 8, CNST_LIMB(0x21c040efcb50f858), CNST_LIMB(0xf2b803473f7ad0f3), CNST_LIMB(0x19a1000000000000), CNST_LIMB(0x3fa39ab547994daf) }, + /* 193 */ { 8, CNST_LIMB(0x21b7b98f11b61c1a), CNST_LIMB(0xf2f56875eb3f2614), CNST_LIMB(0x1ab769203dafc601), CNST_LIMB(0x32a0a9b2faee1e2a) }, + /* 194 */ { 8, CNST_LIMB(0x21af41bcd19739ba), CNST_LIMB(0xf3327c6ab49ca6c8), CNST_LIMB(0x1bd81ab557f30100), CNST_LIMB(0x26357ceac0e96962) }, + /* 195 */ { 8, CNST_LIMB(0x21a6d9474c81adf0), CNST_LIMB(0xf36f3ffb6d916240), CNST_LIMB(0x1d0367a69fed1ba1), CNST_LIMB(0x1a5a6f65caa5859e) }, + /* 196 */ { 8, CNST_LIMB(0x219e7ffda5ad572a), CNST_LIMB(0xf3abb3faa02166cc), CNST_LIMB(0x1e39a5057d810000), CNST_LIMB(0xf08480f672b4e86) }, + /* 197 */ { 8, CNST_LIMB(0x219635afdcd3e46d), CNST_LIMB(0xf3e7d9379f70166a), CNST_LIMB(0x1f7b2a18f29ac3e1), CNST_LIMB(0x4383340615612ca) }, + /* 198 */ { 8, CNST_LIMB(0x218dfa2ec92d0643), CNST_LIMB(0xf423b07e986aa967), CNST_LIMB(0x20c850694c2aa100), CNST_LIMB(0xf3c77969ee4be5a2) }, + /* 199 */ { 8, CNST_LIMB(0x2185cd4c148e4ae2), CNST_LIMB(0xf45f3a98a20738a4), CNST_LIMB(0x222173cc014980c1), CNST_LIMB(0xe00993cc187c5ec9) }, + /* 200 */ { 8, CNST_LIMB(0x217daeda36ad7a5c), CNST_LIMB(0xf49a784bcd1b8afe), CNST_LIMB(0x2386f26fc1000000), CNST_LIMB(0xcd2b297d889bc2b6) }, + /* 
201 */ { 8, CNST_LIMB(0x21759eac708452fe), CNST_LIMB(0xf4d56a5b33cec44a), CNST_LIMB(0x24f92ce8af296d41), CNST_LIMB(0xbb214d5064862b22) }, + /* 202 */ { 8, CNST_LIMB(0x216d9c96c7d490d4), CNST_LIMB(0xf510118708a8f8dd), CNST_LIMB(0x2678863cd0ece100), CNST_LIMB(0xa9e1a7ca7ea10e20) }, + /* 203 */ { 8, CNST_LIMB(0x2165a86e02cb358c), CNST_LIMB(0xf54a6e8ca5438db1), CNST_LIMB(0x280563f0a9472d61), CNST_LIMB(0x99626e72b39ea0cf) }, + /* 204 */ { 8, CNST_LIMB(0x215dc207a3c20fdf), CNST_LIMB(0xf5848226989d33c3), CNST_LIMB(0x29a02e1406210000), CNST_LIMB(0x899a5ba9c13fafd9) }, + /* 205 */ { 8, CNST_LIMB(0x2155e939e51e8b37), CNST_LIMB(0xf5be4d0cb51434aa), CNST_LIMB(0x2b494f4efe6d2e21), CNST_LIMB(0x7a80a705391e96ff) }, + /* 206 */ { 8, CNST_LIMB(0x214e1ddbb54cd933), CNST_LIMB(0xf5f7cff41e09aeb8), CNST_LIMB(0x2d0134ef21cbc100), CNST_LIMB(0x6c0cfe23de23042a) }, + /* 207 */ { 8, CNST_LIMB(0x21465fc4b2d68f98), CNST_LIMB(0xf6310b8f55304840), CNST_LIMB(0x2ec84ef4da2ef581), CNST_LIMB(0x5e377df359c944dd) }, + /* 208 */ { 8, CNST_LIMB(0x213eaecd2893dd60), CNST_LIMB(0xf66a008e4788cbcd), CNST_LIMB(0x309f102100000000), CNST_LIMB(0x50f8ac5fc8f53985) }, + /* 209 */ { 8, CNST_LIMB(0x21370ace09f681c6), CNST_LIMB(0xf6a2af9e5a0f0a08), CNST_LIMB(0x3285ee02a1420281), CNST_LIMB(0x44497266278e35b7) }, + /* 210 */ { 8, CNST_LIMB(0x212f73a0ef6db7cb), CNST_LIMB(0xf6db196a761949d9), CNST_LIMB(0x347d6104fc324100), CNST_LIMB(0x382316831f7ee175) }, + /* 211 */ { 8, CNST_LIMB(0x2127e92012e25004), CNST_LIMB(0xf7133e9b156c7be5), CNST_LIMB(0x3685e47dade53d21), CNST_LIMB(0x2c7f377833b8946e) }, + /* 212 */ { 8, CNST_LIMB(0x21206b264c4a39a7), CNST_LIMB(0xf74b1fd64e0753c6), CNST_LIMB(0x389ff6bb15610000), CNST_LIMB(0x2157c761ab4163ef) }, + /* 213 */ { 8, CNST_LIMB(0x2118f98f0e52c28f), CNST_LIMB(0xf782bdbfdda6577b), CNST_LIMB(0x3acc1912ebb57661), CNST_LIMB(0x16a7071803cc49a9) }, + /* 214 */ { 8, CNST_LIMB(0x211194366320dc66), CNST_LIMB(0xf7ba18f93502e409), CNST_LIMB(0x3d0acff111946100), CNST_LIMB(0xc6781d80f8224fc) }, + /* 215 */ { 8, CNST_LIMB(0x210a3af8e926bb78), CNST_LIMB(0xf7f1322182cf15d1), CNST_LIMB(0x3f5ca2e692eaf841), CNST_LIMB(0x294092d370a900b) }, + /* 216 */ { 8, CNST_LIMB(0x2102edb3d00e29a6), CNST_LIMB(0xf82809d5be7072db), CNST_LIMB(0x41c21cb8e1000000), CNST_LIMB(0xf24f62335024a295) }, + /* 217 */ { 8, CNST_LIMB(0x20fbac44d5b6edc2), CNST_LIMB(0xf85ea0b0b27b2610), CNST_LIMB(0x443bcb714399a5c1), CNST_LIMB(0xe03b98f103fad6d2) }, + /* 218 */ { 8, CNST_LIMB(0x20f4768a4348ad08), CNST_LIMB(0xf894f74b06ef8b40), CNST_LIMB(0x46ca406c81af2100), CNST_LIMB(0xcee3d32cad2a9049) }, + /* 219 */ { 8, CNST_LIMB(0x20ed4c62ea57b1f0), CNST_LIMB(0xf8cb0e3b4b3bbdb3), CNST_LIMB(0x496e106ac22aaae1), CNST_LIMB(0xbe3f9df9277fdada) }, + /* 220 */ { 8, CNST_LIMB(0x20e62dae221c087a), CNST_LIMB(0xf900e6160002ccfe), CNST_LIMB(0x4c27d39fa5410000), CNST_LIMB(0xae46f0d94c05e933) }, + /* 221 */ { 8, CNST_LIMB(0x20df1a4bc4ba6525), CNST_LIMB(0xf9367f6da0ab2e9c), CNST_LIMB(0x4ef825c296e43ca1), CNST_LIMB(0x9ef2280fb437a33d) }, + /* 222 */ { 8, CNST_LIMB(0x20d8121c2c9e506e), CNST_LIMB(0xf96bdad2acb5f5ef), CNST_LIMB(0x51dfa61f5ad88100), CNST_LIMB(0x9039ff426d3f284b) }, + /* 223 */ { 8, CNST_LIMB(0x20d1150031e51549), CNST_LIMB(0xf9a0f8d3b0e04fde), CNST_LIMB(0x54def7a6d2f16901), CNST_LIMB(0x82178c6d6b51f8f4) }, + /* 224 */ { 8, CNST_LIMB(0x20ca22d927d8f54d), CNST_LIMB(0xf9d5d9fd5010b366), CNST_LIMB(0x57f6c10000000000), CNST_LIMB(0x74843b1ee4c1e053) }, + /* 225 */ { 8, CNST_LIMB(0x20c33b88da7c29aa), CNST_LIMB(0xfa0a7eda4c112ce6), CNST_LIMB(0x5b27ac993df97701), 
CNST_LIMB(0x6779c7f90dc42f48) }, + /* 226 */ { 8, CNST_LIMB(0x20bc5ef18c233bdf), CNST_LIMB(0xfa3ee7f38e181ed0), CNST_LIMB(0x5e7268b9bbdf8100), CNST_LIMB(0x5af23c74f9ad9fe9) }, + /* 227 */ { 8, CNST_LIMB(0x20b58cf5f31e4526), CNST_LIMB(0xfa7315d02f20c7bd), CNST_LIMB(0x61d7a7932ff3d6a1), CNST_LIMB(0x4ee7eae2acdc617e) }, + /* 228 */ { 8, CNST_LIMB(0x20aec5793770a74d), CNST_LIMB(0xfaa708f58014d37c), CNST_LIMB(0x65581f53c8c10000), CNST_LIMB(0x43556aa2ac262a0b) }, + /* 229 */ { 8, CNST_LIMB(0x20a8085ef096d530), CNST_LIMB(0xfadac1e711c832d1), CNST_LIMB(0x68f48a385b8320e1), CNST_LIMB(0x3835949593b8ddd1) }, + /* 230 */ { 8, CNST_LIMB(0x20a1558b2359c4b1), CNST_LIMB(0xfb0e4126bcc86bd7), CNST_LIMB(0x6cada69ed07c2100), CNST_LIMB(0x2d837fbe78458762) }, + /* 231 */ { 8, CNST_LIMB(0x209aace23fafa72e), CNST_LIMB(0xfb418734a9008bd9), CNST_LIMB(0x70843718cdbf27c1), CNST_LIMB(0x233a7e150a54a555) }, + /* 232 */ { 8, CNST_LIMB(0x20940e491ea988d7), CNST_LIMB(0xfb74948f5532da4b), CNST_LIMB(0x7479027ea1000000), CNST_LIMB(0x19561984a50ff8fe) }, + /* 233 */ { 8, CNST_LIMB(0x208d79a5006d7a47), CNST_LIMB(0xfba769b39e49640e), CNST_LIMB(0x788cd40268f39641), CNST_LIMB(0xfd211159fe3490f) }, + /* 234 */ { 8, CNST_LIMB(0x2086eedb8a3cead3), CNST_LIMB(0xfbda071cc67e6db5), CNST_LIMB(0x7cc07b437ecf6100), CNST_LIMB(0x6aa563e655033e3) }, + /* 235 */ { 8, CNST_LIMB(0x20806dd2c486dcc6), CNST_LIMB(0xfc0c6d447c5dd362), CNST_LIMB(0x8114cc6220762061), CNST_LIMB(0xfbb614b3f2d3b14c) }, + /* 236 */ { 8, CNST_LIMB(0x2079f67119059fae), CNST_LIMB(0xfc3e9ca2e1a05533), CNST_LIMB(0x858aa0135be10000), CNST_LIMB(0xeac0f8837fb05773) }, + /* 237 */ { 8, CNST_LIMB(0x2073889d50e7bf63), CNST_LIMB(0xfc7095ae91e1c760), CNST_LIMB(0x8a22d3b53c54c321), CNST_LIMB(0xda6e4c10e8615ca5) }, + /* 238 */ { 8, CNST_LIMB(0x206d243e9303d929), CNST_LIMB(0xfca258dca9331635), CNST_LIMB(0x8ede496339f34100), CNST_LIMB(0xcab755a8d01fa67f) }, + /* 239 */ { 8, CNST_LIMB(0x2066c93c62170aa8), CNST_LIMB(0xfcd3e6a0ca8906c2), CNST_LIMB(0x93bde80aec3a1481), CNST_LIMB(0xbb95a9ae71aa3e0c) }, + /* 240 */ { 8, CNST_LIMB(0x2060777e9b0db0f6), CNST_LIMB(0xfd053f6d26089673), CNST_LIMB(0x98c29b8100000000), CNST_LIMB(0xad0326c296b4f529) }, + /* 241 */ { 8, CNST_LIMB(0x205a2eed73563032), CNST_LIMB(0xfd3663b27f31d529), CNST_LIMB(0x9ded549671832381), CNST_LIMB(0x9ef9f21eed31b7c1) }, + /* 242 */ { 8, CNST_LIMB(0x2053ef71773d7e6a), CNST_LIMB(0xfd6753e032ea0efe), CNST_LIMB(0xa33f092e0b1ac100), CNST_LIMB(0x91747422be14b0b2) }, + /* 243 */ { 8, CNST_LIMB(0x204db8f388552ea9), CNST_LIMB(0xfd9810643d6614c3), CNST_LIMB(0xa8b8b452291fe821), CNST_LIMB(0x846d550e37b5063d) }, + /* 244 */ { 8, CNST_LIMB(0x20478b5cdbe2bb2f), CNST_LIMB(0xfdc899ab3ff56c5e), CNST_LIMB(0xae5b564ac3a10000), CNST_LIMB(0x77df79e9a96c06f6) }, + /* 245 */ { 8, CNST_LIMB(0x20416696f957cfbf), CNST_LIMB(0xfdf8f02086af2c4b), CNST_LIMB(0xb427f4b3be74c361), CNST_LIMB(0x6bc6019636c7d0c2) }, + /* 246 */ { 8, CNST_LIMB(0x203b4a8bb8d356e7), CNST_LIMB(0xfe29142e0e01401f), CNST_LIMB(0xba1f9a938041e100), CNST_LIMB(0x601c4205aebd9e47) }, + /* 247 */ { 8, CNST_LIMB(0x2035372541ab0f0d), CNST_LIMB(0xfe59063c8822ce56), CNST_LIMB(0xc0435871d1110f41), CNST_LIMB(0x54ddc59756f05016) }, + /* 248 */ { 8, CNST_LIMB(0x202f2c4e08fd6dcc), CNST_LIMB(0xfe88c6b3626a72aa), CNST_LIMB(0xc694446f01000000), CNST_LIMB(0x4a0648979c838c18) }, + /* 249 */ { 8, CNST_LIMB(0x202929f0d04b99e9), CNST_LIMB(0xfeb855f8ca88fb0d), CNST_LIMB(0xcd137a5b57ac3ec1), CNST_LIMB(0x3f91b6e0bb3a053d) }, + /* 250 */ { 8, CNST_LIMB(0x20232ff8a41b45eb), CNST_LIMB(0xfee7b471b3a9507d), 
CNST_LIMB(0xd3c21bcecceda100), CNST_LIMB(0x357c299a88ea76a5) }, + /* 251 */ { 8, CNST_LIMB(0x201d3e50daa036db), CNST_LIMB(0xff16e281db76303b), CNST_LIMB(0xdaa150410b788de1), CNST_LIMB(0x2bc1e517aecc56e3) }, + /* 252 */ { 8, CNST_LIMB(0x201754e5126d446d), CNST_LIMB(0xff45e08bcf06554e), CNST_LIMB(0xe1b24521be010000), CNST_LIMB(0x225f56ceb3da9f5d) }, + /* 253 */ { 8, CNST_LIMB(0x201173a1312ca135), CNST_LIMB(0xff74aef0efafadd7), CNST_LIMB(0xe8f62df12777c1a1), CNST_LIMB(0x1951136d53ad63ac) }, + /* 254 */ { 8, CNST_LIMB(0x200b9a71625f3b13), CNST_LIMB(0xffa34e1177c23362), CNST_LIMB(0xf06e445906fc0100), CNST_LIMB(0x1093d504b3cd7d93) }, + /* 255 */ { 8, CNST_LIMB(0x2005c94216230568), CNST_LIMB(0xffd1be4c7f2af942), CNST_LIMB(0xf81bc845c81bf801), CNST_LIMB(0x824794d1ec1814f) }, + /* 256 */ { 8, CNST_LIMB(0x1fffffffffffffff), CNST_LIMB(0xffffffffffffffff), CNST_LIMB(0x8), CNST_LIMB(0x0) }, +}; diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mul_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mul_1.s new file mode 100644 index 0000000..1644074 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mul_1.s @@ -0,0 +1,205 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mul_1c + .type __gmpn_mul_1c,@function + +__gmpn_mul_1c: + + + + + push %rbx + mov %r8, %r10 + + jmp .Lcommon + .size __gmpn_mul_1c,.-__gmpn_mul_1c + + .globl __gmpn_mul_1 + .type __gmpn_mul_1,@function + +__gmpn_mul_1: + + + + + + push %rbx + xor %r10, %r10 +.Lcommon: + mov (%rsi), %rax + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + add %r10, %rax + adc $0, %rdx + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + mov %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: mov %r10, (%rdi,%r11,8) + add %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + mov %r9, 8(%rdi,%r11,8) + add %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + mov %r8, 16(%rdi,%r11,8) + add %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + mov %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + add %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + mov %r10, (%rdi,%r11,8) + add %rax, %r9 + adc %r8, %rdx + mov %r9, 8(%rdi,%r11,8) + add %r8, %rdx +.Lret: mov %rdx, %rax + + pop %rbx + + + ret + .size __gmpn_mul_1,.-__gmpn_mul_1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mul_2.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mul_2.s new file mode 100644 index 0000000..0c3310d --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mul_2.s @@ -0,0 +1,218 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + 
.align 16, 0x90 + .globl __gmpn_mul_2 + .type __gmpn_mul_2,@function + +__gmpn_mul_2: + + + push %rbx + push %rbp + + mov (%rcx), %r8 + mov 8(%rcx), %r9 + + mov (%rsi), %rax + + mov %rdx, %r11 + neg %r11 + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + + and $3, %edx + jz .Lm2p0 + cmp $2, %edx + jc .Lm2p1 + jz .Lm2p2 +.Lm2p3: + mul %r8 + xor %r10d, %r10d + mov %rax, %rcx + mov %rdx, %rbp + mov 8(%rsi,%r11,8), %rax + add $-1, %r11 + mul %r9 + add %rax, %rbp + jmp .Lm23 +.Lm2p0: + mul %r8 + xor %ebp, %ebp + mov %rax, %rbx + mov %rdx, %rcx + jmp .Lm20 +.Lm2p1: + mul %r8 + xor %r10d, %r10d + xor %ebx, %ebx + xor %ecx, %ecx + add $1, %r11 + jmp .Lm2top +.Lm2p2: + mul %r8 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, %rbp + mov %rdx, %r10 + mov 8(%rsi,%r11,8), %rax + add $-2, %r11 + jmp .Lm22 + + + .align 32, 0x90 +.Lm2top: + add %rax, %r10 + adc %rdx, %rbx + mov 0(%rsi,%r11,8), %rax + adc $0, %ecx + mov $0, %ebp + mul %r9 + add %rax, %rbx + mov %r10, 0(%rdi,%r11,8) + adc %rdx, %rcx + mov 8(%rsi,%r11,8), %rax + mul %r8 + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp +.Lm20: mov 8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r11,8), %rax + mov $0, %r10d + mul %r8 + add %rax, %rcx + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rax, %rbp + mov %rbx, 8(%rdi,%r11,8) +.Lm23: adc %rdx, %r10 + mov 24(%rsi,%r11,8), %rax + mul %r8 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov %rcx, 16(%rdi,%r11,8) + mov 24(%rsi,%r11,8), %rax + mov $0, %ecx + adc $0, %ebx +.Lm22: mul %r9 + add %rax, %r10 + mov %rbp, 24(%rdi,%r11,8) + adc %rdx, %rbx + mov 32(%rsi,%r11,8), %rax + mul %r8 + add $4, %r11 + js .Lm2top + + + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + mov (%rsi), %rax + mul %r9 + mov %r10, (%rdi) + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%rdi) + mov %rcx, %rax + + pop %rbp + pop %rbx + + ret + .size __gmpn_mul_2,.-__gmpn_mul_2 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s new file mode 100644 index 0000000..2cfb7aa --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mul_basecase.s @@ -0,0 +1,483 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mul_basecase + .type __gmpn_mul_basecase,@function + +__gmpn_mul_basecase: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + xor %r13d, %r13d + mov (%rsi), %rax + mov (%rcx), %r12 + + sub %rdx, %r13 + mov %r13, %r11 + mov %edx, %ebx + + lea (%rdi,%rdx,8), %rdi + lea (%rsi,%rdx,8), %rsi + + mul %r12 + + test $1, %r8b + jz .Lmul_2 + + + + +.Lmul_1: + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 + +.Lmul_1_prologue_3: + add $-1, %r11 + lea .Laddmul_outer_3(%rip), %r14 + mov %rax, %r10 + mov %rdx, %rbx + jmp .Lmul_1_entry_3 + +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_1_entry_0 + +.Lmul_1_prologue_1: + cmp $-1, %r13 + jne 2f + mov %rax, -8(%rdi) + mov %rdx, (%rdi) + jmp .Lret +2: add $1, %r11 + lea .Laddmul_outer_1(%rip), %r14 + mov %rax, %r15 + mov %rdx, %rbp + xor %r10d, %r10d + mov (%rsi,%r11,8), %rax + jmp .Lmul_1_entry_1 + +.Lmul_1_prologue_2: + add $-2, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov %rax, %rbx + mov %rdx, %r15 + mov 24(%rsi,%r11,8), %rax + xor %ebp, %ebp + xor %r10d, %r10d + jmp 
.Lmul_1_entry_2 + + + + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %r15 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + xor %ebx, %ebx + mul %r12 + mov %r15, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + add $-1, %r8 + jz .Lret + + mov 8(%rcx), %r12 + mov 16(%rcx), %r9 + + lea 8(%rcx), %rcx + lea 8(%rdi), %rdi + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov 8(%rcx), %r9 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jz .Lmul_2_prologue_2 + jc .Lmul_2_prologue_1 + +.Lmul_2_prologue_3: + lea .Laddmul_outer_3(%rip), %r14 + add $2, %r11 + mov %rax, -16(%rdi,%r11,8) + mov %rdx, %rbp + xor %r10d, %r10d + xor %ebx, %ebx + mov -16(%rsi,%r11,8), %rax + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + add $3, %r11 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, %ebp + mov -24(%rsi,%r11,8), %rax + lea .Laddmul_outer_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + add $1, %r11 + lea .Laddmul_outer_2(%rip), %r14 + mov $0, %ebx + mov $0, %r15d + mov %rax, %rbp + mov -8(%rsi,%r11,8), %rax + mov %rdx, %r10 + jmp .Lmul_2_entry_2 + + + + .align 16, 0x90 +.Lmul_2_top: + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r11,8), %rax + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc $0, %ebp +.Lmul_2_entry_0: + mul %r9 + add %rax, %r15 + mov %rbx, -24(%rdi,%r11,8) + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r11,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %r15, -16(%rdi,%r11,8) +.Lmul_2_entry_3: + mul %r9 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r12 + add %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx +.Lmul_2_entry_2: + mul %r9 + add %rax, %r10 + mov %rbp, -8(%rdi,%r11,8) + adc %rdx, %rbx + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r11 + mov %r10, -32(%rdi,%r11,8) + js .Lmul_2_top + + mov -32(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + mov 16(%rcx), %r12 + mov 24(%rcx), %r9 + + lea 16(%rcx), %rcx + lea 16(%rdi), %rdi + + jmp *%r14 + + + + + + + + +.Laddmul_outer_0: + add $3, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -24(%rsi,%r13,8), %rax + mul %r12 + mov %rax, %rbx + mov -24(%rsi,%r13,8), %rax + mov %rdx, %r15 + xor %ebp, %ebp + jmp .Laddmul_entry_0 + +.Laddmul_outer_1: + mov %r13, %r11 + mov (%rsi,%r13,8), %rax + mul %r12 + mov %rax, %r10 + mov (%rsi,%r13,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + jmp .Laddmul_entry_1 + +.Laddmul_outer_2: + add $1, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -8(%rsi,%r13,8), %rax + mul %r12 + xor %ebx, %ebx + mov %rax, %rbp + xor %r15d, %r15d 
+ mov %rdx, %r10 + mov -8(%rsi,%r13,8), %rax + jmp .Laddmul_entry_2 + +.Laddmul_outer_3: + add $2, %r13 + lea 0(%rip), %r14 + + mov %r13, %r11 + mov -16(%rsi,%r13,8), %rax + xor %r10d, %r10d + mul %r12 + mov %rax, %r15 + mov -16(%rsi,%r13,8), %rax + mov %rdx, %rbp + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r11,8) + adc %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r12 + add %rax, %rbx + mov -24(%rsi,%r11,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp +.Laddmul_entry_0: + mul %r9 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r11,8) + adc %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + mul %r12 + add %rax, %r15 + mov -16(%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d +.Laddmul_entry_3: + mul %r9 + add %r15, -16(%rdi,%r11,8) + adc %rax, %rbp + mov -8(%rsi,%r11,8), %rax + adc %rdx, %r10 + mul %r12 + xor %ebx, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r11,8), %rax + adc %r15d, %ebx +.Laddmul_entry_2: + mul %r9 + add %rbp, -8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov (%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r9 + add $4, %r11 + js .Laddmul_top + + add %r10, -8(%rdi) + adc %rax, %rbx + mov %rbx, (%rdi) + adc %rdx, %r15 + mov %r15, 8(%rdi) + + add $-2, %r8 + jz .Lret + + lea 16(%rdi), %rdi + lea 16(%rcx), %rcx + + mov (%rcx), %r12 + mov 8(%rcx), %r9 + + jmp *%r14 + + .align 16, 0x90 +.Lret: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + + .size __gmpn_mul_basecase,.-__gmpn_mul_basecase diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s new file mode 100644 index 0000000..d76272c --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mullo_basecase.s @@ -0,0 +1,439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mullo_basecase + .type __gmpn_mullo_basecase,@function + +__gmpn_mullo_basecase: + + + cmp $4, %rcx + jge .Lgen + mov (%rsi), %rax + mov (%rdx), %r8 + + lea .Ltab(%rip), %r9 + movslq (%r9,%rcx,4), %r10 + add %r10, %r9 + jmp *%r9 + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .Ltab-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .text + +.L1: imul %r8, %rax + mov %rax, (%rdi) + + ret + +.L2: mov 8(%rdx), %r11 + imul %rax, %r11 + mul %r8 + mov %rax, (%rdi) + imul 8(%rsi), %r8 + lea (%r11, %rdx), %rax + add %r8, %rax + mov %rax, 8(%rdi) + + ret + +.L3: mov 8(%rdx), %r9 + mov 16(%rdx), %r11 + mul %r8 + mov %rax, (%rdi) + mov (%rsi), %rax + mov %rdx, %rcx + mul %r9 + imul 8(%rsi), %r9 + mov 16(%rsi), %r10 + imul %r8, %r10 + add %rax, %rcx + adc %rdx, %r9 + add %r10, %r9 + mov 8(%rsi), %rax + mul %r8 + add %rax, %rcx + adc %rdx, %r9 + mov %r11, %rax + imul (%rsi), %rax + add %rax, %r9 + mov %rcx, 8(%rdi) + mov %r9, 16(%rdi) + + ret + +.L0m4: +.L1m4: +.L2m4: +.L3m4: +.Lgen: push %rbx + push %rbp + push %r13 + push %r14 + push %r15 + + mov (%rsi), %rax + mov (%rdx), %r13 + mov %rdx, %r11 + + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + neg %rcx + + mul %r13 + + test $1, %cl + jz .Lmul_2 + +.Lmul_1: + lea -8(%rdi), %rdi + lea -8(%rsi), %rsi + test $2, %cl + jnz .Lmul_1_prologue_3 + +.Lmul_1_prologue_2: + lea -1(%rcx), %r9 + lea .Laddmul_outer_1(%rip), %r8 + mov %rax, %rbx + mov %rdx, %r15 + xor %ebp, 
%ebp + xor %r10d, %r10d + mov 16(%rsi,%rcx,8), %rax + jmp .Lmul_1_entry_2 + +.Lmul_1_prologue_3: + lea 1(%rcx), %r9 + lea .Laddmul_outer_3(%rip), %r8 + mov %rax, %rbp + mov %rdx, %r10 + xor %ebx, %ebx + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r9,8) + add %rax, %r15 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbp + xor %ebx, %ebx + mul %r13 + mov %r15, -8(%rdi,%r9,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r9,8), %rax + mul %r13 + mov %rbp, (%rdi,%r9,8) + add %rax, %r10 + adc %rdx, %rbx + mov 16(%rsi,%r9,8), %rax + mul %r13 + mov %r10, 8(%rdi,%r9,8) + xor %ebp, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r9,8), %rax + mov %rbp, %r15 + adc %rdx, %r15 +.Lmul_1_entry_2: + mul %r13 + add $4, %r9 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %r15 + mov %r15, -8(%rdi) + adc %rdx, %rbp + + imul (%rsi), %r13 + add %r13, %rbp + mov %rbp, (%rdi) + + add $1, %rcx + jz .Lret + + mov 8(%r11), %r13 + mov 16(%r11), %r14 + + lea 16(%rsi), %rsi + lea 8(%r11), %r11 + lea 24(%rdi), %rdi + + jmp *%r8 + + +.Lmul_2: + mov 8(%r11), %r14 + test $2, %cl + jz .Lmul_2_prologue_3 + + .align 16, 0x90 +.Lmul_2_prologue_1: + lea 0(%rcx), %r9 + mov %rax, %r10 + mov %rdx, %rbx + xor %r15d, %r15d + mov (%rsi,%rcx,8), %rax + lea .Laddmul_outer_3(%rip), %r8 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_3: + lea 2(%rcx), %r9 + mov $0, %r10d + mov %rax, %r15 + mov (%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_top: + mov -32(%rsi,%r9,8), %rax + mul %r14 + add %rax, %rbx + adc %rdx, %r15 + mov -24(%rsi,%r9,8), %rax + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc $0, %ebp + mul %r14 + add %rax, %r15 + mov %rbx, -24(%rdi,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %r15 + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d +.Lmul_2_entry_3: + mov $0, %ebx + mov %r15, -16(%rdi,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %r15d + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc %r15d, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%rdi,%r9,8) + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %r15d +.Lmul_2_entry_1: + add $4, %r9 + mov %r10, -32(%rdi,%r9,8) + js .Lmul_2_top + + imul -16(%rsi), %r14 + add %r14, %rbx + imul -8(%rsi), %r13 + add %r13, %rbx + mov %rbx, -8(%rdi) + + add $2, %rcx + jz .Lret + + mov 16(%r11), %r13 + mov 24(%r11), %r14 + + lea 16(%r11), %r11 + lea 16(%rdi), %rdi + + jmp *%r8 + + +.Laddmul_outer_1: + lea -2(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + mul %r13 + mov %rax, %r10 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbx + xor %r15d, %r15d + lea .Laddmul_outer_3(%rip), %r8 + jmp .Laddmul_entry_1 + +.Laddmul_outer_3: + lea 0(%rcx), %r9 + mov -16(%rsi,%rcx,8), %rax + xor %r10d, %r10d + mul %r13 + mov %rax, %r15 + mov -16(%rsi,%rcx,8), %rax + mov %rdx, %rbp + lea .Laddmul_outer_1(%rip), %r8 + jmp .Laddmul_entry_3 + + .align 16, 0x90 +.Laddmul_top: + add %r10, -32(%rdi,%r9,8) + adc %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %r15 + adc %ebp, %ebp + mul %r14 + xor %r10d, %r10d + add %rbx, -24(%rdi,%r9,8) + adc %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + mul %r13 + add %rax, %r15 + mov -16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, 
%r10d +.Laddmul_entry_3: + mul %r14 + add %r15, -16(%rdi,%r9,8) + adc %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + xor %ebx, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %r15d + mov -8(%rsi,%r9,8), %rax + adc %r15d, %ebx + mul %r14 + add %rbp, -8(%rdi,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx + mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov (%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %r15d +.Laddmul_entry_1: + mul %r14 + add $4, %r9 + js .Laddmul_top + + add %r10, -32(%rdi) + adc %rax, %rbx + + imul -24(%rsi), %r13 + add %r13, %rbx + add %rbx, -24(%rdi) + + add $2, %rcx + jns .Lret + + lea 16(%r11), %r11 + + mov (%r11), %r13 + mov 8(%r11), %r14 + + lea -16(%rsi), %rsi + + jmp *%r8 + +.Lret: pop %r15 + pop %r14 + pop %r13 + pop %rbp + pop %rbx + + ret + .size __gmpn_mullo_basecase,.-__gmpn_mullo_basecase diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s b/vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s new file mode 100644 index 0000000..b607e84 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/mulmid_basecase.s @@ -0,0 +1,573 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_mulmid_basecase + .type __gmpn_mulmid_basecase,@function + +__gmpn_mulmid_basecase: + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rcx, %r15 + + + lea 1(%rdx), %r13 + sub %r8, %r13 + + lea (%rdi,%r13,8), %rdi + + cmp $4, %r13 + jc .Ldiagonal + + lea (%rsi,%rdx,8), %rsi + + test $1, %r8 + jz .Lmul_2 + + + + +.Lmul_1: + mov %r13d, %ebx + + neg %r13 + mov (%rsi,%r13,8), %rax + mov (%r15), %r12 + mul %r12 + + and $-4, %r13 + mov %r13, %r11 + + and $3, %ebx + jz .Lmul_1_prologue_0 + cmp $2, %ebx + jc .Lmul_1_prologue_1 + jz .Lmul_1_prologue_2 + +.Lmul_1_prologue_3: + mov %rax, %r10 + mov %rdx, %rbx + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_1_entry_3 + + .align 16, 0x90 +.Lmul_1_prologue_0: + mov %rax, %rbp + mov %rdx, %r10 + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_1_entry_0 + + .align 16, 0x90 +.Lmul_1_prologue_1: + add $4, %r11 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + mov (%rsi,%r11,8), %rax + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_1_entry_1 + + .align 16, 0x90 +.Lmul_1_prologue_2: + mov %rax, %rbx + mov %rdx, %rcx + mov 24(%rsi,%r11,8), %rax + mov $0, %ebp + mov $0, %r10d + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_1_entry_2 + + + + + .align 16, 0x90 +.Lmul_1_top: + mov %rbx, -16(%rdi,%r11,8) + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp +.Lmul_1_entry_1: + mov $0, %ebx + mul %r12 + mov %rcx, -8(%rdi,%r11,8) + add %rax, %rbp + adc %rdx, %r10 +.Lmul_1_entry_0: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov %rbp, (%rdi,%r11,8) + add %rax, %r10 + adc %rdx, %rbx +.Lmul_1_entry_3: + mov 16(%rsi,%r11,8), %rax + mul %r12 + mov %r10, 8(%rdi,%r11,8) + mov $0, %ebp + mov %rbp, %r10 + add %rax, %rbx + mov 24(%rsi,%r11,8), %rax + mov %rbp, %rcx + adc %rdx, %rcx +.Lmul_1_entry_2: + mul %r12 + add $4, %r11 + js .Lmul_1_top + + mov %rbx, -16(%rdi) + add %rax, %rcx + mov %rcx, -8(%rdi) + mov %rbp, 8(%rdi) + adc %rdx, %rbp + mov %rbp, (%rdi) + + dec %r8 + jz .Lret + + lea -8(%rsi), %rsi + lea 8(%r15), %r15 + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Lmul_2: + mov %r13d, %ebx + + neg %r13 + mov -8(%rsi,%r13,8), %rax + mov (%r15), %r12 + mov 8(%r15), %r9 + mul %r9 + + and $-4, %r13 + mov 
%r13, %r11 + + and $3, %ebx + jz .Lmul_2_prologue_0 + cmp $2, %ebx + jc .Lmul_2_prologue_1 + jz .Lmul_2_prologue_2 + +.Lmul_2_prologue_3: + mov %rax, %rcx + mov %rdx, %rbp + lea .Laddmul_prologue_3(%rip), %r14 + jmp .Lmul_2_entry_3 + + .align 16, 0x90 +.Lmul_2_prologue_0: + mov %rax, %rbx + mov %rdx, %rcx + lea .Laddmul_prologue_0(%rip), %r14 + jmp .Lmul_2_entry_0 + + .align 16, 0x90 +.Lmul_2_prologue_1: + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + lea .Laddmul_prologue_1(%rip), %r14 + jmp .Lmul_2_entry_1 + + .align 16, 0x90 +.Lmul_2_prologue_2: + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov 16(%rsi,%r11,8), %rax + lea .Laddmul_prologue_2(%rip), %r14 + jmp .Lmul_2_entry_2 + + + + + .align 16, 0x90 +.Lmul_2_top: + mov -8(%rsi,%r11,8), %rax + mul %r9 + add %rax, %rbx + adc %rdx, %rcx +.Lmul_2_entry_0: + mov $0, %ebp + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbx + mov (%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rax, %rcx + mov %rbx, (%rdi,%r11,8) + adc %rdx, %rbp +.Lmul_2_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov $0, %ebx + adc $0, %r10d + mov 8(%rsi,%r11,8), %rax + mov %rcx, 8(%rdi,%r11,8) + mul %r9 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 +.Lmul_2_entry_2: + mov $0, %ecx + mul %r12 + add %rax, %rbp + mov 16(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rax, %r10 + mov %rbp, 16(%rdi,%r11,8) + adc %rdx, %rbx +.Lmul_2_entry_1: + mov 24(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r11 + mov %r10, -8(%rdi,%r11,8) + jnz .Lmul_2_top + + mov %rbx, (%rdi) + mov %rcx, 8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Laddmul_prologue_0: + mov -8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rcx + mov %rdx, %rbp + mov $0, %r10d + jmp .Laddmul_entry_0 + + .align 16, 0x90 +.Laddmul_prologue_1: + mov 16(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbx + mov %rdx, %rcx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + jmp .Laddmul_entry_1 + + .align 16, 0x90 +.Laddmul_prologue_2: + mov 8(%rsi,%r11,8), %rax + mul %r9 + mov %rax, %r10 + mov %rdx, %rbx + mov $0, %ecx + jmp .Laddmul_entry_2 + + .align 16, 0x90 +.Laddmul_prologue_3: + mov (%rsi,%r11,8), %rax + mul %r9 + mov %rax, %rbp + mov %rdx, %r10 + mov $0, %ebx + mov $0, %ecx + jmp .Laddmul_entry_3 + + + + .align 16, 0x90 +.Laddmul_top: + mov $0, %r10d + add %rax, %rbx + mov -8(%rsi,%r11,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r9 + add %rbx, -8(%rdi,%r11,8) + adc %rax, %rcx + adc %rdx, %rbp +.Laddmul_entry_0: + mov (%rsi,%r11,8), %rax + mul %r12 + add %rax, %rcx + mov (%rsi,%r11,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r9 + add %rcx, (%rdi,%r11,8) + mov $0, %ecx + adc %rax, %rbp + mov $0, %ebx + adc %rdx, %r10 +.Laddmul_entry_3: + mov 8(%rsi,%r11,8), %rax + mul %r12 + add %rax, %rbp + mov 8(%rsi,%r11,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r9 + add %rbp, 8(%rdi,%r11,8) + adc %rax, %r10 + adc %rdx, %rbx +.Laddmul_entry_2: + mov 16(%rsi,%r11,8), %rax + mul %r12 + add %rax, %r10 + mov 16(%rsi,%r11,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r9 + add %r10, 16(%rdi,%r11,8) + nop + adc %rax, %rbx + mov $0, %ebp + mov 24(%rsi,%r11,8), %rax + adc %rdx, %rcx +.Laddmul_entry_1: + mul %r12 + add $4, %r11 + jnz .Laddmul_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %ebp + + add %rbx, -8(%rdi) + adc %rcx, (%rdi) + adc %rbp, 
8(%rdi) + + sub $2, %r8 + jz .Lret + + lea 16(%r15), %r15 + lea -16(%rsi), %rsi + + mov %r13, %r11 + mov (%r15), %r12 + mov 8(%r15), %r9 + + jmp *%r14 + + + + + .align 16, 0x90 +.Ldiagonal: + xor %ebx, %ebx + xor %ecx, %ecx + xor %ebp, %ebp + + neg %r13 + + mov %r8d, %eax + and $3, %eax + jz .Ldiag_prologue_0 + cmp $2, %eax + jc .Ldiag_prologue_1 + jz .Ldiag_prologue_2 + +.Ldiag_prologue_3: + lea -8(%r15), %r15 + mov %r15, %r10 + add $1, %r8 + mov %r8, %r11 + lea .Ldiag_entry_3(%rip), %r14 + jmp .Ldiag_entry_3 + +.Ldiag_prologue_0: + mov %r15, %r10 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%rsi,%r11,8), %rax + jmp .Ldiag_entry_0 + +.Ldiag_prologue_1: + lea 8(%r15), %r15 + mov %r15, %r10 + add $3, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov -8(%r10), %rax + jmp .Ldiag_entry_1 + +.Ldiag_prologue_2: + lea -16(%r15), %r15 + mov %r15, %r10 + add $2, %r8 + mov %r8, %r11 + lea 0(%rip), %r14 + mov 16(%r10), %rax + jmp .Ldiag_entry_2 + + + + + .align 16, 0x90 +.Ldiag_top: + add %rax, %rbx + adc %rdx, %rcx + mov -8(%rsi,%r11,8), %rax + adc $0, %rbp +.Ldiag_entry_0: + mulq (%r10) + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_3: + mov -16(%rsi,%r11,8), %rax + mulq 8(%r10) + add %rax, %rbx + mov 16(%r10), %rax + adc %rdx, %rcx + adc $0, %rbp +.Ldiag_entry_2: + mulq -24(%rsi,%r11,8) + add %rax, %rbx + mov 24(%r10), %rax + adc %rdx, %rcx + lea 32(%r10), %r10 + adc $0, %rbp +.Ldiag_entry_1: + mulq -32(%rsi,%r11,8) + sub $4, %r11 + jnz .Ldiag_top + + add %rax, %rbx + adc %rdx, %rcx + adc $0, %rbp + + mov %rbx, (%rdi,%r13,8) + + inc %r13 + jz .Ldiag_end + + mov %r8, %r11 + mov %r15, %r10 + + lea 8(%rsi), %rsi + mov %rcx, %rbx + mov %rbp, %rcx + xor %ebp, %ebp + + jmp *%r14 + +.Ldiag_end: + mov %rcx, (%rdi) + mov %rbp, 8(%rdi) + +.Lret: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_mulmid_basecase,.-__gmpn_mulmid_basecase diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/nand_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/nand_n.s new file mode 100644 index 0000000..04593b9 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/nand_n.s @@ -0,0 +1,155 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_nand_n + .type __gmpn_nand_n,@function + +__gmpn_nand_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: and (%rsi,%rcx,8), %r8 + not %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: and (%rsi,%rcx,8), %r8 + not %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + and (%rsi,%rcx,8), %r8 + not %r8 + and 8(%rsi,%rcx,8), %r9 + not %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + and 16(%rsi,%rcx,8), %r8 + not %r8 + and 24(%rsi,%rcx,8), %r9 + not %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + ret + .size __gmpn_nand_n,.-__gmpn_nand_n + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/nior_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/nior_n.s new file mode 100644 index 0000000..8ea0437 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/nior_n.s 
@@ -0,0 +1,155 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_nior_n + .type __gmpn_nior_n,@function + +__gmpn_nior_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: or (%rsi,%rcx,8), %r8 + not %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: or (%rsi,%rcx,8), %r8 + not %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + or (%rsi,%rcx,8), %r8 + not %r8 + or 8(%rsi,%rcx,8), %r9 + not %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + or 16(%rsi,%rcx,8), %r8 + not %r8 + or 24(%rsi,%rcx,8), %r9 + not %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + ret + .size __gmpn_nior_n,.-__gmpn_nior_n + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/perfsqr.h b/vere/ext/gmp/gen/x86_64-linux/mpn/perfsqr.h new file mode 100644 index 0000000..80c5eb7 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/perfsqr.h @@ -0,0 +1,46 @@ +/* This file generated by gen-psqr.c - DO NOT EDIT. */ + +#if GMP_LIMB_BITS != 64 || GMP_NAIL_BITS != 0 +Error, error, this data is for 64 bit limb and 0 bit nail +#endif + +/* Non-zero bit indicates a quadratic residue mod 0x100. + This test identifies 82.81% as non-squares (212/256). */ +static const mp_limb_t +sq_res_0x100[4] = { + CNST_LIMB(0x202021202030213), + CNST_LIMB(0x202021202020213), + CNST_LIMB(0x202021202030212), + CNST_LIMB(0x202021202020212), +}; + +/* 2^48-1 = 3^2 * 5 * 7 * 13 * 17 * 97 ... */ +#define PERFSQR_MOD_BITS 49 + +/* This test identifies 97.81% as non-squares. */ +#define PERFSQR_MOD_TEST(up, usize) \ + do { \ + mp_limb_t r; \ + PERFSQR_MOD_34 (r, up, usize); \ + \ + /* 69.23% */ \ + PERFSQR_MOD_2 (r, CNST_LIMB(91), CNST_LIMB(0xfd2fd2fd2fd3), \ + CNST_LIMB(0x2191240), CNST_LIMB(0x8850a206953820e1)); \ + \ + /* 68.24% */ \ + PERFSQR_MOD_2 (r, CNST_LIMB(85), CNST_LIMB(0xfcfcfcfcfcfd), \ + CNST_LIMB(0x82158), CNST_LIMB(0x10b48c4b4206a105)); \ + \ + /* 55.56% */ \ + PERFSQR_MOD_1 (r, CNST_LIMB( 9), CNST_LIMB(0xe38e38e38e39), \ + CNST_LIMB(0x93)); \ + \ + /* 49.48% */ \ + PERFSQR_MOD_2 (r, CNST_LIMB(97), CNST_LIMB(0xfd5c5f02a3a1), \ + CNST_LIMB(0x1eb628b47), CNST_LIMB(0x6067981b8b451b5f)); \ + } while (0) + +/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, 99.62% non-squares. 
*/ + +/* helper for tests/mpz/t-perfsqr.c */ +#define PERFSQR_DIVISORS { 256, 91, 85, 9, 97, } diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/popcount.s b/vere/ext/gmp/gen/x86_64-linux/mpn/popcount.s new file mode 100644 index 0000000..243219e --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/popcount.s @@ -0,0 +1,160 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_popcount + .type __gmpn_popcount,@function + +__gmpn_popcount: + + + push %rbx + mov $0x5555555555555555, %r10 + push %rbp + mov $0x3333333333333333, %r11 + lea (%rdi,%rsi,8), %rdi + mov $0x0f0f0f0f0f0f0f0f, %rcx + neg %rsi + mov $0x0101010101010101, %rdx + xor %eax, %eax + test $1, %sil + jz .Ltop + + mov (%rdi,%rsi,8), %r8 + + mov %r8, %r9 + shr %r8 + and %r10, %r8 + sub %r8, %r9 + + mov %r9, %r8 + shr $2, %r9 + and %r11, %r8 + and %r11, %r9 + add %r8, %r9 + + dec %rsi + jmp .Lmid + + .align 16, 0x90 +.Ltop: mov (%rdi,%rsi,8), %r8 + mov 8(%rdi,%rsi,8), %rbx + + mov %r8, %r9 + mov %rbx, %rbp + shr %r8 + shr %rbx + and %r10, %r8 + and %r10, %rbx + sub %r8, %r9 + sub %rbx, %rbp + + mov %r9, %r8 + mov %rbp, %rbx + shr $2, %r9 + shr $2, %rbp + and %r11, %r8 + and %r11, %r9 + and %r11, %rbx + and %r11, %rbp + add %r8, %r9 + add %rbx, %rbp + + add %rbp, %r9 +.Lmid: mov %r9, %r8 + shr $4, %r9 + and %rcx, %r8 + and %rcx, %r9 + add %r8, %r9 + + imul %rdx, %r9 + shr $56, %r9 + + add %r9, %rax + add $2, %rsi + jnc .Ltop + +.Lend: + pop %rbp + pop %rbx + + ret + .size __gmpn_popcount,.-__gmpn_popcount diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/redc_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/redc_1.s new file mode 100644 index 0000000..da7fd88 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/redc_1.s @@ -0,0 +1,603 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_redc_1 + .type __gmpn_redc_1,@function + +__gmpn_redc_1: + + + + push %rbp + mov (%rsi), %rbp + push %rbx + imul %r8, %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rcx, %r12 + neg %r12 + lea (%rdx,%rcx,8), %r13 + lea -16(%rsi,%rcx,8), %rsi + + mov %ecx, %eax + and $3, %eax + lea 4(%rax), %r9 + cmp $4, %ecx + cmovg %r9, %rax + lea .Ltab(%rip), %r9 + + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax + + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L0-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text + + .align 16, 0x90 +.L1: mov (%rdx), %rax + mul %rbp + add 8(%rsi), %rax + adc 16(%rsi), %rdx + mov %rdx, (%rdi) + mov $0, %eax + adc %eax, %eax + jmp .Lret + + + .align 16, 0x90 +.L2: mov (%rdx), %rax + mul %rbp + xor %r14d, %r14d + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r9 + mul %rbp + add (%rsi), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 8(%rsi), %r9 + adc $0, %r14 + mov %r9, %rbp + imul %r8, %rbp + mov -16(%r13), %rax + mul %rbp + xor %ebx, %ebx + mov %rax, %r10 + mov -8(%r13), %rax + mov %rdx, %r11 + mul %rbp + add %r9, %r10 + adc %rax, %r11 + adc %rdx, %rbx + add 16(%rsi), %r11 + adc $0, %rbx + xor %eax, %eax + add %r11, %r14 + adc 24(%rsi), %rbx + mov %r14, (%rdi) + mov %rbx, 8(%rdi) + adc %eax, %eax + jmp .Lret + + +.L3: mov (%rdx), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 
+ mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add -8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add (%rsi), %r10 + mov %r10, (%rsi) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 8(%rsi) + adc $0, %r14 + mov %r14, -8(%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add (%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 8(%rsi), %r10 + mov %r10, 8(%rsi) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, %rbp + imul %r8, %rbp + add %r9, 16(%rsi) + adc $0, %r14 + mov %r14, (%rsi) + + mov -24(%r13), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov -16(%r13), %rax + mul %rbp + xor %r9d, %r9d + xor %r14d, %r14d + add 8(%rsi), %rbx + adc %rax, %r10 + mov -8(%r13), %rax + adc %rdx, %r9 + mul %rbp + add 16(%rsi), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 24(%rsi), %r9 + adc $0, %r14 + + xor %eax, %eax + add -8(%rsi), %r10 + adc (%rsi), %r9 + adc 32(%rsi), %r14 + mov %r10, (%rdi) + mov %r9, 8(%rdi) + mov %r14, 16(%rdi) + adc %eax, %eax + jmp .Lret + + + .align 16, 0x90 +.L2m4: +.Llo2: mov (%r13,%r12,8), %rax + mul %rbp + xor %r14d, %r14d + xor %ebx, %ebx + mov %rax, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r9 + mul %rbp + add 16(%rsi,%r12,8), %r10 + adc %rax, %r9 + mov 16(%r13,%r12,8), %rax + adc %rdx, %r14 + mul %rbp + mov $0, %r10d + lea 2(%r12), %r11 + add %r9, %r15 + imul %r8, %r15 + jmp .Le2 + + .align 16, 0x90 +.Lli2: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp +.Le2: add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli2 + +.Lle2: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo2 + + mov %r12, %rcx + sar $2, %rcx + lea 32(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + + mov -16(%rsi), %r8 + mov -8(%rsi), %r9 + add -16(%rdx), %r8 + adc -8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + lea 16(%rdi), %rdi + jmp .Laddx + + + .align 16, 0x90 +.L1m4: +.Llo1: mov (%r13,%r12,8), %rax + xor %r9, %r9 + xor %ebx, %ebx + mul %rbp + mov %rax, %r9 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mov %rdx, %r14 + mov $0, %r10d + mul %rbp + add 16(%rsi,%r12,8), %r9 + adc %rax, %r14 + adc %rdx, %rbx + mov 16(%r13,%r12,8), %rax + mul %rbp + lea 1(%r12), %r11 + add %r14, %r15 + imul %r8, %r15 + jmp .Le1 + + .align 16, 0x90 +.Lli1: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp +.Le1: add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli1 + +.Lle1: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov 
%r15, %rbp + dec %rcx + jnz .Llo1 + + mov %r12, %rcx + sar $2, %rcx + lea 24(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + + mov -8(%rsi), %r8 + add -8(%rdx), %r8 + mov %r8, (%rdi) + lea 8(%rdi), %rdi + jmp .Laddx + + + .align 16, 0x90 +.L0: +.L0m4: +.Llo0: mov (%r13,%r12,8), %rax + mov %r12, %r11 + mul %rbp + xor %r10d, %r10d + mov %rax, %r14 + mov %rdx, %rbx + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %r14 + adc %rax, %rbx + adc %rdx, %r10 + add %rbx, %r15 + imul %r8, %r15 + jmp .Le0 + + .align 16, 0x90 +.Lli0: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.Le0: mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli0 + +.Lle0: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + add $8, %rsi + mov %r15, %rbp + dec %rcx + jnz .Llo0 + + mov %r12, %rcx + sar $2, %rcx + clc + lea 16(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + jmp .Laddy + + + .align 16, 0x90 +.L3m4: +.Llo3: mov (%r13,%r12,8), %rax + mul %rbp + mov %rax, %rbx + mov %rdx, %r10 + mov 8(%r13,%r12,8), %rax + mov 24(%rsi,%r12,8), %r15 + mul %rbp + add 16(%rsi,%r12,8), %rbx + mov $0, %ebx + mov %rbx, %r14 + adc %rax, %r10 + mov 16(%r13,%r12,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + add %r10, %r15 + mul %rbp + lea 3(%r12), %r11 + imul %r8, %r15 + + + .align 16, 0x90 +.Lli3: add %r10, (%rsi,%r11,8) + adc %rax, %r9 + mov (%r13,%r11,8), %rax + adc %rdx, %r14 + xor %r10d, %r10d + mul %rbp + add %r9, 8(%rsi,%r11,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(%r13,%r11,8), %rax + mul %rbp + add %r14, 16(%rsi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(%r13,%r11,8), %rax + mul %rbp + add %rbx, 24(%rsi,%r11,8) + mov $0, %r14d + mov %r14, %rbx + adc %rax, %r10 + mov 24(%r13,%r11,8), %rax + mov %r14, %r9 + adc %rdx, %r9 + mul %rbp + add $4, %r11 + js .Lli3 + +.Lle3: add %r10, (%rsi) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(%rsi) + adc $0, %rdx + mov %rdx, 16(%rsi,%r12,8) + mov %r15, %rbp + lea 8(%rsi), %rsi + dec %rcx + jnz .Llo3 + + + + mov %r12, %rcx + sar $2, %rcx + lea 40(%rsi,%r12,8), %rsi + lea (%rsi,%r12,8), %rdx + + mov -24(%rsi), %r8 + mov -16(%rsi), %r9 + mov -8(%rsi), %r10 + add -24(%rdx), %r8 + adc -16(%rdx), %r9 + adc -8(%rdx), %r10 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + lea 24(%rdi), %rdi + +.Laddx:inc %rcx + jz .Lad3 + +.Laddy:mov (%rsi), %r8 + mov 8(%rsi), %r9 + inc %rcx + jmp .Lmid + + +.Lal3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + inc %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Lal3 + +.Lae3: adc (%rdx), %r8 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + adc 24(%rdx), %r11 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + +.Lad3: mov %ecx, %eax + adc %eax, %eax + +.Lret: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + + ret + .size __gmpn_redc_1,.-__gmpn_redc_1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s 
b/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s new file mode 100644 index 0000000..ac1323b --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh1_n.s @@ -0,0 +1,179 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_rsblsh1_n + .type __gmpn_rsblsh1_n,@function + +__gmpn_rsblsh1_n: + + + push %rbp + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + sbb %eax, %eax + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + mov %r10, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + sbb %eax, %eax + sub (%rsi,%rcx,8), %r8 + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 + sbb %eax, %eax + sub (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend + + .align 16, 0x90 +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + + sbb %eax, %eax + add %ebp, %ebp + + sbb (%rsi,%rcx,8), %r8 + nop + sbb 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) + sbb 16(%rsi,%rcx,8), %r10 + sbb 24(%rsi,%rcx,8), %r11 + mov %r10, 16(%rdi,%rcx,8) + mov %r11, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: + + + sub %eax, %ebp + movslq %ebp, %rax + + pop %rbp + + ret + .size __gmpn_rsblsh1_n,.-__gmpn_rsblsh1_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s new file mode 100644 index 0000000..e9f079a --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh2_n.s @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_rsblsh2_n + .type __gmpn_rsblsh2_n,@function + +__gmpn_rsblsh2_n: + + + push %r12 + push %r13 + push %r14 + push %r15 + + mov (%rdx), %r8 + lea (,%r8,4), %r12 + shr $62, %r8 + + mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + and $3, %al + je .Lb00 + cmp $2, %al + jc .Lb01 + je .Lb10 + +.Lb11: mov 8(%rdx,%rcx,8), %r10 + lea (%r8,%r10,4), %r14 + shr $62, %r10 + mov 16(%rdx,%rcx,8), %r11 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r14 + sbb 16(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r14, 8(%rdi,%rcx,8) + mov %r15, 16(%rdi,%rcx,8) + add $3, %rcx + js .Ltop + jmp .Lend + +.Lb01: mov %r8, %r11 + sub (%rsi,%rcx,8), %r12 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + add $1, %rcx + js .Ltop + jmp .Lend + +.Lb10: mov 8(%rdx,%rcx,8), %r11 + lea (%r8,%r11,4), %r15 + shr $62, %r11 + sub (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r15 + sbb %eax, %eax + mov %r12, (%rdi,%rcx,8) + mov %r15, 8(%rdi,%rcx,8) + add $2, %rcx + js .Ltop + jmp .Lend + +.Lb00: 
mov 8(%rdx,%rcx,8), %r9 + mov 16(%rdx,%rcx,8), %r10 + jmp .Le00 + + .align 16, 0x90 +.Ltop: mov 16(%rdx,%rcx,8), %r10 + mov (%rdx,%rcx,8), %r8 + mov 8(%rdx,%rcx,8), %r9 + lea (%r11,%r8,4), %r12 + shr $62, %r8 +.Le00: lea (%r8,%r9,4), %r13 + shr $62, %r9 + mov 24(%rdx,%rcx,8), %r11 + lea (%r9,%r10,4), %r14 + shr $62, %r10 + lea (%r10,%r11,4), %r15 + shr $62, %r11 + add %eax, %eax + sbb (%rsi,%rcx,8), %r12 + sbb 8(%rsi,%rcx,8), %r13 + sbb 16(%rsi,%rcx,8), %r14 + sbb 24(%rsi,%rcx,8), %r15 + mov %r12, (%rdi,%rcx,8) + mov %r13, 8(%rdi,%rcx,8) + mov %r14, 16(%rdi,%rcx,8) + sbb %eax, %eax + mov %r15, 24(%rdi,%rcx,8) + add $4, %rcx + js .Ltop +.Lend: + + + add %r11d, %eax + movslq %eax, %rax + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + ret + .size __gmpn_rsblsh2_n,.-__gmpn_rsblsh2_n + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s new file mode 100644 index 0000000..d439217 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/rsblsh_n.s @@ -0,0 +1,228 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_rsblsh_n + .type __gmpn_rsblsh_n,@function + +__gmpn_rsblsh_n: + + + + push %r12 + push %rbp + push %rbx + + mov (%rdx), %rax + + mov $0, %ebp + sub %rcx, %rbp + + lea -16(%rsi,%rcx,8), %rsi + lea -16(%rdi,%rcx,8), %rdi + lea 16(%rdx,%rcx,8), %r12 + + mov %rcx, %r9 + + mov %r8, %rcx + mov $1, %r8d + shl %cl, %r8 + + mul %r8 + + and $3, %r9d + jz .Lb0 + cmp $2, %r9d + jc .Lb1 + jz .Lb2 + +.Lb3: mov %rax, %r11 + sub 16(%rsi,%rbp,8), %r11 + mov -8(%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov (%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $3, %rbp + jnz .Llo3 + jmp .Lcj3 + +.Lb2: mov %rax, %rbx + mov -8(%r12,%rbp,8), %rax + mov %rdx, %r9 + mul %r8 + or %rax, %r9 + add $2, %rbp + jz .Lcj2 + mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + xor %ecx, %ecx + jmp .Llo2 + +.Lb1: mov %rax, %r9 + mov %rdx, %r10 + add $1, %rbp + jnz .Lgt1 + sub 8(%rsi,%rbp,8), %r9 + jmp .Lcj1 +.Lgt1: mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + jmp .Llo1 + +.Lb0: mov %rax, %r10 + mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + sub 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jz .Lend + + .align 8, 0x90 +.Ltop: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Llo3: mov %rdx, %r10 + mov -16(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r10 + mov %r11, -8(%rdi,%rbp,8) +.Llo2: mov %rdx, %r11 + mov -8(%r12,%rbp,8), %rax + mul %r8 + or %rax, %r11 + add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + sbb 16(%rsi,%rbp,8), %r10 + sbb 24(%rsi,%rbp,8), %r11 + mov (%r12,%rbp,8), %rax + sbb %ecx, %ecx + mov %rbx, (%rdi,%rbp,8) +.Llo1: mov %rdx, %rbx + mul %r8 + or %rax, %rbx + mov %r9, 8(%rdi,%rbp,8) +.Llo0: mov 8(%r12,%rbp,8), %rax + add $4, %rbp + jnz .Ltop + +.Lend: mov %rdx, %r9 + mul %r8 + or %rax, %r9 + mov %r10, -16(%rdi,%rbp,8) +.Lcj3: mov %r11, -8(%rdi,%rbp,8) +.Lcj2: add %ecx, %ecx + sbb (%rsi,%rbp,8), %rbx + sbb 8(%rsi,%rbp,8), %r9 + mov %rbx, 
(%rdi,%rbp,8) +.Lcj1: mov %r9, 8(%rdi,%rbp,8) + mov %rdx, %rax + sbb $0, %rax + pop %rbx + pop %rbp + pop %r12 + + ret + .size __gmpn_rsblsh_n,.-__gmpn_rsblsh_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s new file mode 100644 index 0000000..8554f6f --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/rsh1add_n.s @@ -0,0 +1,203 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_rsh1add_nc + .type __gmpn_rsh1add_nc,@function + +__gmpn_rsh1add_nc: + + + + push %rbx + + xor %eax, %eax + neg %r8 + mov (%rsi), %rbx + adc (%rdx), %rbx + jmp .Lent + .size __gmpn_rsh1add_nc,.-__gmpn_rsh1add_nc + + .align 16, 0x90 + .globl __gmpn_rsh1add_n + .type __gmpn_rsh1add_n,@function + +__gmpn_rsh1add_n: + + + push %rbx + + xor %eax, %eax + mov (%rsi), %rbx + add (%rdx), %rbx +.Lent: + rcr %rbx + adc %eax, %eax + + mov %ecx, %r11d + and $3, %r11d + + cmp $1, %r11d + je .Ldo + +.Ln1: cmp $2, %r11d + jne .Ln2 + add %rbx, %rbx + mov 8(%rsi), %r10 + adc 8(%rdx), %r10 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + rcr %r10 + rcr %rbx + mov %rbx, -8(%rdi) + jmp .Lcj1 + +.Ln2: cmp $3, %r11d + jne .Ln3 + add %rbx, %rbx + mov 8(%rsi), %r9 + mov 16(%rsi), %r10 + adc 8(%rdx), %r9 + adc 16(%rdx), %r10 + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) + jmp .Lcj2 + +.Ln3: dec %rcx + add %rbx, %rbx + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + adc 8(%rdx), %r8 + adc 16(%rdx), %r9 + mov 24(%rsi), %r10 + adc 24(%rdx), %r10 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) + mov %r8, -16(%rdi) +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx + +.Ldo: + shr $2, %rcx + je .Lend + .align 16, 0x90 +.Ltop: add %rbx, %rbx + + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + adc 8(%rdx), %r8 + adc 16(%rdx), %r9 + mov 24(%rsi), %r10 + mov 32(%rsi), %r11 + adc 24(%rdx), %r10 + adc 32(%rdx), %r11 + + lea 32(%rsi), %rsi + lea 32(%rdx), %rdx + + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 + + rcr %rbx + mov %rbx, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, 24(%rdi) + mov %r11, %rbx + + lea 32(%rdi), %rdi + dec %rcx + jne .Ltop + +.Lend: mov %rbx, (%rdi) + pop %rbx + + ret + .size __gmpn_rsh1add_n,.-__gmpn_rsh1add_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s new file mode 100644 index 0000000..ff06ece --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/rsh1sub_n.s @@ -0,0 +1,203 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_rsh1sub_nc + .type __gmpn_rsh1sub_nc,@function + +__gmpn_rsh1sub_nc: + + + + push %rbx + + xor %eax, %eax + neg %r8 + mov (%rsi), %rbx + sbb (%rdx), %rbx + jmp .Lent + .size __gmpn_rsh1sub_nc,.-__gmpn_rsh1sub_nc + + .align 16, 0x90 + .globl __gmpn_rsh1sub_n + .type __gmpn_rsh1sub_n,@function + +__gmpn_rsh1sub_n: + + + push %rbx + + xor %eax, %eax + mov (%rsi), %rbx + sub (%rdx), %rbx +.Lent: + rcr %rbx + adc %eax, %eax + + mov %ecx, %r11d + and $3, %r11d + + cmp $1, %r11d + je .Ldo + +.Ln1: cmp $2, %r11d + jne .Ln2 + add %rbx, %rbx + mov 8(%rsi), %r10 + sbb 8(%rdx), %r10 + lea 8(%rsi), %rsi + lea 8(%rdx), %rdx + lea 8(%rdi), %rdi + rcr %r10 + rcr %rbx + 
mov %rbx, -8(%rdi) + jmp .Lcj1 + +.Ln2: cmp $3, %r11d + jne .Ln3 + add %rbx, %rbx + mov 8(%rsi), %r9 + mov 16(%rsi), %r10 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + lea 16(%rsi), %rsi + lea 16(%rdx), %rdx + lea 16(%rdi), %rdi + rcr %r10 + rcr %r9 + rcr %rbx + mov %rbx, -16(%rdi) + jmp .Lcj2 + +.Ln3: dec %rcx + add %rbx, %rbx + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + sbb 8(%rdx), %r8 + sbb 16(%rdx), %r9 + mov 24(%rsi), %r10 + sbb 24(%rdx), %r10 + lea 24(%rsi), %rsi + lea 24(%rdx), %rdx + lea 24(%rdi), %rdi + rcr %r10 + rcr %r9 + rcr %r8 + rcr %rbx + mov %rbx, -24(%rdi) + mov %r8, -16(%rdi) +.Lcj2: mov %r9, -8(%rdi) +.Lcj1: mov %r10, %rbx + +.Ldo: + shr $2, %rcx + je .Lend + .align 16, 0x90 +.Ltop: add %rbx, %rbx + + mov 8(%rsi), %r8 + mov 16(%rsi), %r9 + sbb 8(%rdx), %r8 + sbb 16(%rdx), %r9 + mov 24(%rsi), %r10 + mov 32(%rsi), %r11 + sbb 24(%rdx), %r10 + sbb 32(%rdx), %r11 + + lea 32(%rsi), %rsi + lea 32(%rdx), %rdx + + rcr %r11 + rcr %r10 + rcr %r9 + rcr %r8 + + rcr %rbx + mov %rbx, (%rdi) + mov %r8, 8(%rdi) + mov %r9, 16(%rdi) + mov %r10, 24(%rdi) + mov %r11, %rbx + + lea 32(%rdi), %rdi + dec %rcx + jne .Ltop + +.Lend: mov %rbx, (%rdi) + pop %rbx + + ret + .size __gmpn_rsh1sub_n,.-__gmpn_rsh1sub_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/rshift.s b/vere/ext/gmp/gen/x86_64-linux/mpn/rshift.s new file mode 100644 index 0000000..8ddd7b5 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/rshift.s @@ -0,0 +1,191 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_rshift + .type __gmpn_rshift,@function + +__gmpn_rshift: + + + neg %ecx + mov (%rsi), %rax + shl %cl, %rax + neg %ecx + + lea 1(%rdx), %r8d + + lea -8(%rsi,%rdx,8), %rsi + lea -8(%rdi,%rdx,8), %rdi + neg %rdx + + and $3, %r8d + je .Lrlx + + dec %r8d + jne .L1 + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + jmp .Lrll + +.L1: dec %r8d + je .L1x + + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + shl %cl, %r8 + or %r8, %r10 + mov %r10, 8(%rdi,%rdx,8) + inc %rdx + neg %ecx +.L1x: + cmp $-1, %rdx + je .Last + mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + neg %ecx + mov 16(%rsi,%rdx,8), %r8 + mov 24(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, 8(%rdi,%rdx,8) + mov %r11, 16(%rdi,%rdx,8) + add $2, %rdx + +.Lrll: neg %ecx +.Lrlx: mov 8(%rsi,%rdx,8), %r10 + shr %cl, %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r11 + + add $4, %rdx + jb .Lend + .align 16, 0x90 +.Ltop: + + neg %ecx + mov -16(%rsi,%rdx,8), %r8 + mov -8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + or %r8, %r10 + shl %cl, %r9 + or %r9, %r11 + mov %r10, -24(%rdi,%rdx,8) + mov %r11, -16(%rdi,%rdx,8) + + mov (%rsi,%rdx,8), %r8 + mov 8(%rsi,%rdx,8), %r9 + shl %cl, %r8 + shl %cl, %r9 + + + neg %ecx + mov -8(%rsi,%rdx,8), %r10 + mov 0(%rsi,%rdx,8), %r11 + shr %cl, %r10 + or %r10, %r8 + shr %cl, %r11 + or %r11, %r9 + mov %r8, -8(%rdi,%rdx,8) + mov %r9, 0(%rdi,%rdx,8) + + mov 8(%rsi,%rdx,8), %r10 + mov 16(%rsi,%rdx,8), %r11 + shr %cl, %r10 + shr %cl, %r11 + + add $4, %rdx + jae .Ltop +.Lend: + neg %ecx + mov -8(%rsi), %r8 + shl %cl, %r8 + or %r8, %r10 + mov (%rsi), %r9 + shl %cl, %r9 + or %r9, %r11 + mov %r10, -16(%rdi) + mov %r11, -8(%rdi) + + neg %ecx +.Last: mov (%rsi), %r10 + shr %cl, %r10 + mov %r10, (%rdi) + + ret + .size 
__gmpn_rshift,.-__gmpn_rshift diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s new file mode 100644 index 0000000..7a50a70 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sec_tabselect.s @@ -0,0 +1,190 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_sec_tabselect + .type __gmpn_sec_tabselect,@function + +__gmpn_sec_tabselect: + + + + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rdx, %r9 + add $-4, %r9 + js .Louter_end + +.Louter_top: + mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + mov %r8, %rbx + + .align 16, 0x90 +.Ltop: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r14 + or %r11, %r15 + lea (%rsi,%rdx,8), %rsi + add $-1, %rbp + jne .Ltop + + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + mov %r14, 16(%rdi) + mov %r15, 24(%rdi) + pop %rsi + lea 32(%rsi), %rsi + lea 32(%rdi), %rdi + add $-4, %r9 + jns .Louter_top +.Louter_end: + + test $2, %dl + jz .Lb0x +.Lb1x: mov %rcx, %rbp + push %rsi + xor %r12d, %r12d + xor %r13d, %r13d + mov %r8, %rbx + .align 16, 0x90 +.Ltp2: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + mov 8(%rsi), %r11 + and %rax, %r10 + and %rax, %r11 + or %r10, %r12 + or %r11, %r13 + lea (%rsi,%rdx,8), %rsi + add $-1, %rbp + jne .Ltp2 + mov %r12, 0(%rdi) + mov %r13, 8(%rdi) + pop %rsi + lea 16(%rsi), %rsi + lea 16(%rdi), %rdi + +.Lb0x: test $1, %dl + jz .Lb00 +.Lb01: mov %rcx, %rbp + xor %r12d, %r12d + mov %r8, %rbx + .align 16, 0x90 +.Ltp1: sub $1, %rbx + sbb %rax, %rax + mov 0(%rsi), %r10 + and %rax, %r10 + or %r10, %r12 + lea (%rsi,%rdx,8), %rsi + add $-1, %rbp + jne .Ltp1 + mov %r12, 0(%rdi) + +.Lb00: pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_sec_tabselect,.-__gmpn_sec_tabselect diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s new file mode 100644 index 0000000..eb24851 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sqr_basecase.s @@ -0,0 +1,818 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_sqr_basecase + .type __gmpn_sqr_basecase,@function + +__gmpn_sqr_basecase: + + + mov %edx, %ecx + mov %edx, %r11d + + add $-40, %rsp + + and $3, %ecx + cmp $4, %edx + lea 4(%rcx), %r8 + + mov %rbx, 32(%rsp) + mov %rbp, 24(%rsp) + mov %r12, 16(%rsp) + mov %r13, 8(%rsp) + mov %r14, (%rsp) + + cmovg %r8, %rcx + + lea .Ltab(%rip), %rax + movslq (%rax,%rcx,4), %r10 + add %r10, %rax + jmp *%rax + + .section .data.rel.ro.local,"a",@progbits + .align 8, 0x90 +.Ltab: .long .L4-.Ltab + .long .L1-.Ltab + .long .L2-.Ltab + .long .L3-.Ltab + .long .L0m4-.Ltab + .long .L1m4-.Ltab + .long .L2m4-.Ltab + .long .L3m4-.Ltab + .text + +.L1: mov (%rsi), %rax + mul %rax + add $40, %rsp + mov %rax, (%rdi) + mov %rdx, 8(%rdi) + + ret + +.L2: mov (%rsi), %rax + mov %rax, %r8 + mul %rax + mov 8(%rsi), %r11 + mov %rax, (%rdi) + mov %r11, %rax + mov %rdx, %r9 + mul %rax + add $40, %rsp + mov %rax, %r10 + mov 
%r11, %rax + mov %rdx, %r11 + mul %r8 + xor %r8, %r8 + add %rax, %r9 + adc %rdx, %r10 + adc %r8, %r11 + add %rax, %r9 + mov %r9, 8(%rdi) + adc %rdx, %r10 + mov %r10, 16(%rdi) + adc %r8, %r11 + mov %r11, 24(%rdi) + + ret + +.L3: mov (%rsi), %rax + mov %rax, %r10 + mul %rax + mov 8(%rsi), %r11 + mov %rax, (%rdi) + mov %r11, %rax + mov %rdx, 8(%rdi) + mul %rax + mov 16(%rsi), %rcx + mov %rax, 16(%rdi) + mov %rcx, %rax + mov %rdx, 24(%rdi) + mul %rax + mov %rax, 32(%rdi) + mov %rdx, 40(%rdi) + + mov %r11, %rax + mul %r10 + mov %rax, %r8 + mov %rcx, %rax + mov %rdx, %r9 + mul %r10 + xor %r10, %r10 + add %rax, %r9 + mov %r11, %rax + mov %r10, %r11 + adc %rdx, %r10 + + mul %rcx + add $40, %rsp + add %rax, %r10 + adc %r11, %rdx + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %rdx, %rdx + adc %r11, %r11 + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %rdx, 32(%rdi) + adc %r11, 40(%rdi) + + ret + +.L4: mov (%rsi), %rax + mov %rax, %r11 + mul %rax + mov 8(%rsi), %rbx + mov %rax, (%rdi) + mov %rbx, %rax + mov %rdx, 8(%rdi) + mul %rax + mov %rax, 16(%rdi) + mov %rdx, 24(%rdi) + mov 16(%rsi), %rax + mul %rax + mov %rax, 32(%rdi) + mov %rdx, 40(%rdi) + mov 24(%rsi), %rax + mul %rax + mov %rax, 48(%rdi) + mov %rbx, %rax + mov %rdx, 56(%rdi) + + mul %r11 + add $32, %rsp + mov %rax, %r8 + mov %rdx, %r9 + mov 16(%rsi), %rax + mul %r11 + xor %r10, %r10 + add %rax, %r9 + adc %rdx, %r10 + mov 24(%rsi), %rax + mul %r11 + xor %r11, %r11 + add %rax, %r10 + adc %rdx, %r11 + mov 16(%rsi), %rax + mul %rbx + xor %rcx, %rcx + add %rax, %r10 + adc %rdx, %r11 + adc $0, %rcx + mov 24(%rsi), %rax + mul %rbx + pop %rbx + add %rax, %r11 + adc %rdx, %rcx + mov 16(%rsi), %rdx + mov 24(%rsi), %rax + mul %rdx + add %rax, %rcx + adc $0, %rdx + + add %r8, %r8 + adc %r9, %r9 + adc %r10, %r10 + adc %r11, %r11 + adc %rcx, %rcx + mov $0, %eax + adc %rdx, %rdx + + adc %rax, %rax + add %r8, 8(%rdi) + adc %r9, 16(%rdi) + adc %r10, 24(%rdi) + adc %r11, 32(%rdi) + adc %rcx, 40(%rdi) + adc %rdx, 48(%rdi) + adc %rax, 56(%rdi) + + ret + + +.L0m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi + + lea -4(%r11), %r8 + + xor %r9d, %r9d + sub %r11, %r9 + + mul %r13 + xor %ebp, %ebp + mov %rax, %rbx + mov 16(%rsi,%r9,8), %rax + mov %rdx, %r10 + jmp .LL3 + + .align 16, 0x90 +.Lmul_1_m3_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx + xor %ebx, %ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 +.LL3: xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m3_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile + + +.L1m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -3(%r11), %r8 + + lea -3(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %rcx + xor %ebp, %ebp + mov %rax, 8(%rdi) + jmp .Lm0 + + .align 16, 0x90 +.Lmul_2_m0_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp +.Lm0: mov 
-16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2x: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m0_top + + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + lea 0(%r12), %r12 + jmp .Ldowhile_end + + +.L2m4: + lea -16(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea (%rsi,%r11,8), %rsi + + lea -4(%r11), %r8 + + lea -2(%r11), %r9 + neg %r9 + + mul %r13 + mov %rax, %rbp + mov (%rsi,%r9,8), %rax + mov %rdx, %rcx + jmp .LL1 + + .align 16, 0x90 +.Lmul_1_m1_top: + add %rax, %rbp + mov %r10, (%r12,%r9,8) + mov (%rsi,%r9,8), %rax + adc %rdx, %rcx +.LL1: xor %ebx, %ebx + mul %r13 + xor %r10d, %r10d + mov %rbp, 8(%r12,%r9,8) + add %rax, %rcx + adc %rdx, %rbx + mov 8(%rsi,%r9,8), %rax + mov %rcx, 16(%r12,%r9,8) + xor %ebp, %ebp + mul %r13 + add %rax, %rbx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %r10 + xor %ecx, %ecx + mul %r13 + add %rax, %r10 + mov 24(%rsi,%r9,8), %rax + adc %rdx, %rbp + mov %rbx, 24(%r12,%r9,8) + mul %r13 + add $4, %r9 + js .Lmul_1_m1_top + + add %rax, %rbp + mov %r10, (%r12) + adc %rdx, %rcx + mov %rbp, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + lea -8(%rsi), %rsi + jmp .Ldowhile_mid + + +.L3m4: + lea 8(%rdi,%r11,8), %r12 + mov (%rsi), %r13 + mov 8(%rsi), %rax + lea 8(%rsi,%r11,8), %rsi + + lea -5(%r11), %r8 + + lea -1(%r11), %r9 + neg %r9 + + mov %rax, %r14 + mul %r13 + mov %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov %rax, 8(%rdi) + jmp .Lm2 + + .align 16, 0x90 +.Lmul_2_m2_top: + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov -24(%rsi,%r9,8), %rax + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov -24(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rax, %rcx + mov %rbx, -24(%r12,%r9,8) + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + mul %r13 + mov $0, %r10d + add %rax, %rcx + adc %rdx, %rbp + mov -16(%rsi,%r9,8), %rax + adc $0, %r10d + mov $0, %ebx + mov %rcx, -16(%r12,%r9,8) + mul %r14 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + mov $0, %ecx + mul %r13 + add %rax, %rbp + mov -8(%rsi,%r9,8), %rax + adc %rdx, %r10 + adc $0, %ebx + mul %r14 + add %rax, %r10 + mov %rbp, -8(%r12,%r9,8) + adc %rdx, %rbx +.Lm2: mov (%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + adc %rdx, %rbx + adc $0, %ecx + add $4, %r9 + mov -32(%rsi,%r9,8), %rax + mov %r10, -32(%r12,%r9,8) + js .Lmul_2_m2_top + + mul %r14 + add %rax, %rbx + adc %rdx, %rcx + mov %rbx, -8(%r12) + mov %rcx, (%r12) + + lea -16(%rsi), %rsi + jmp .Ldowhile_mid + +.Ldowhile: + + lea 4(%r8), %r9 + neg %r9 + + mov 16(%rsi,%r9,8), %r13 + mov 24(%rsi,%r9,8), %r14 + mov 24(%rsi,%r9,8), %rax + mul %r13 + xor %r10d, %r10d + add %rax, 24(%r12,%r9,8) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + jmp .Lam2 + + .align 16, 0x90 +.Laddmul_2_m2_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp + mov 16(%rsi,%r9,8), %rax + mov 
$0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx +.Lam2: mov 32(%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js .Laddmul_2_m2_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 + + add $-2, %r8d + +.Ldowhile_mid: + + lea 2(%r8), %r9 + neg %r9 + + mov (%rsi,%r9,8), %r13 + mov 8(%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %rax + mul %r13 + xor %ecx, %ecx + add %rax, 8(%r12,%r9,8) + adc %rdx, %rcx + xor %ebp, %ebp + jmp .L20 + + .align 16, 0x90 +.Laddmul_2_m0_top: + add %r10, (%r12,%r9,8) + adc %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + mov $0, %ebp + mul %r13 + add %rax, %rbx + mov 8(%rsi,%r9,8), %rax + adc %rdx, %rcx + adc $0, %ebp + mul %r14 + add %rbx, 8(%r12,%r9,8) + adc %rax, %rcx + adc %rdx, %rbp +.L20: mov 16(%rsi,%r9,8), %rax + mov $0, %r10d + mul %r13 + add %rax, %rcx + mov 16(%rsi,%r9,8), %rax + adc %rdx, %rbp + adc $0, %r10d + mul %r14 + add %rcx, 16(%r12,%r9,8) + adc %rax, %rbp + mov 24(%rsi,%r9,8), %rax + adc %rdx, %r10 + mul %r13 + mov $0, %ebx + add %rax, %rbp + adc %rdx, %r10 + mov $0, %ecx + mov 24(%rsi,%r9,8), %rax + adc $0, %ebx + mul %r14 + add %rbp, 24(%r12,%r9,8) + adc %rax, %r10 + adc %rdx, %rbx + mov 32(%rsi,%r9,8), %rax + mul %r13 + add %rax, %r10 + mov 32(%rsi,%r9,8), %rax + adc %rdx, %rbx + adc $0, %ecx + mul %r14 + add $4, %r9 + js .Laddmul_2_m0_top + + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + lea 16(%r12), %r12 +.Ldowhile_end: + + add $-2, %r8d + jne .Ldowhile + + + mov -16(%rsi), %r13 + mov -8(%rsi), %r14 + mov -8(%rsi), %rax + mul %r13 + xor %r10d, %r10d + add %rax, -8(%r12) + adc %rdx, %r10 + xor %ebx, %ebx + xor %ecx, %ecx + mov (%rsi), %rax + mul %r13 + add %rax, %r10 + mov (%rsi), %rax + adc %rdx, %rbx + mul %r14 + add %r10, (%r12) + adc %rax, %rbx + adc %rdx, %rcx + mov %rbx, 8(%r12) + mov %rcx, 16(%r12) + + + lea -4(%r11,%r11), %r9 + + mov 8(%rdi), %r11 + lea -8(%rsi), %rsi + lea (%rdi,%r9,8), %rdi + neg %r9 + mov (%rsi,%r9,4), %rax + mul %rax + test $2, %r9b + jnz .Lodd + +.Levn: add %r11, %r11 + sbb %ebx, %ebx + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + jmp .Ld0 + +.Lodd: add %r11, %r11 + sbb %ebp, %ebp + add %rdx, %r11 + mov %rax, (%rdi,%r9,8) + lea -2(%r9), %r9 + jmp .Ld1 + + .align 16, 0x90 +.Ltop: mov (%rsi,%r9,4), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (%rdi,%r9,8) +.Ld0: mov %r11, 8(%rdi,%r9,8) + mov 16(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 24(%rdi,%r9,8), %r11 + adc %r11, %r11 + nop + sbb %ebp, %ebp + mov 8(%rsi,%r9,4), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, 16(%rdi,%r9,8) +.Ld1: mov %r11, 24(%rdi,%r9,8) + mov 32(%rdi,%r9,8), %r10 + adc %r10, %r10 + mov 40(%rdi,%r9,8), %r11 + adc %r11, %r11 + sbb %ebx, %ebx + add $4, %r9 + js .Ltop + + mov (%rsi), %rax + mul %rax + add %ebp, %ebp + adc %rax, %r10 + adc %rdx, %r11 + mov %r10, (%rdi) + mov %r11, 8(%rdi) + mov 16(%rdi), %r10 + adc %r10, %r10 + sbb %ebp, %ebp + neg %ebp + mov 8(%rsi), %rax + mul %rax + add %ebx, %ebx + adc %rax, %r10 + adc %rbp, %rdx + mov %r10, 16(%rdi) 
+ mov %rdx, 24(%rdi) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + .size __gmpn_sqr_basecase,.-__gmpn_sqr_basecase diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sqr_diag_addlsh1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sqr_diag_addlsh1.s new file mode 100644 index 0000000..7203603 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sqr_diag_addlsh1.s @@ -0,0 +1,130 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_sqr_diag_addlsh1 + .type __gmpn_sqr_diag_addlsh1,@function + +__gmpn_sqr_diag_addlsh1: + + + push %rbx + + dec %rcx + shl %rcx + + mov (%rdx), %rax + + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,4), %r11 + neg %rcx + + mul %rax + mov %rax, (%rdi,%rcx,8) + + xor %ebx, %ebx + jmp .Lmid + + .align 16, 0x90 +.Ltop: add %r10, %r8 + adc %rax, %r9 + mov %r8, -8(%rdi,%rcx,8) + mov %r9, (%rdi,%rcx,8) +.Lmid: mov 8(%r11,%rcx,4), %rax + mov (%rsi,%rcx,8), %r8 + mov 8(%rsi,%rcx,8), %r9 + adc %r8, %r8 + adc %r9, %r9 + lea (%rdx,%rbx), %r10 + setc %bl + mul %rax + add $2, %rcx + js .Ltop + +.Lend: add %r10, %r8 + adc %rax, %r9 + mov %r8, -8(%rdi) + mov %r9, (%rdi) + adc %rbx, %rdx + mov %rdx, 8(%rdi) + + pop %rbx + + ret + .size __gmpn_sqr_diag_addlsh1,.-__gmpn_sqr_diag_addlsh1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s new file mode 100644 index 0000000..cbef8af --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err1_n.s @@ -0,0 +1,237 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_sub_err1_n + .type __gmpn_sub_err1_n,@function + +__gmpn_sub_err1_n: + + mov 8(%rsp), %rax + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + lea (%rsi,%r9,8), %rsi + lea (%rdx,%r9,8), %rdx + lea (%rdi,%r9,8), %rdi + + mov %r9d, %r10d + and $3, %r10d + jz .L0mod4 + cmp $2, %r10d + jc .L1mod4 + jz .L2mod4 +.L3mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + xor %r11d, %r11d + lea -24(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + sbb (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 16(%r8), %rbx + sbb 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc 8(%r8), %r10 + mov 16(%rsi,%r9,8), %r14 + sbb 16(%rdx,%r9,8), %r14 + mov %r14, 16(%rdi,%r9,8) + cmovc (%r8), %r11 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + + add $3, %r9 + jnz .Lloop + jmp .Lend + + .align 16, 0x90 +.L0mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea (%r8,%r9,8), %r8 + neg %r9 + jmp .Lloop + + .align 16, 0x90 +.L1mod4: + xor %ebx, %ebx + xor %ebp, %ebp + lea -8(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + sbb (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc (%r8), %rbx + setc %al + + add $1, %r9 + jnz .Lloop + jmp .Lend + + .align 16, 0x90 +.L2mod4: + xor %ebx, %ebx + xor %ebp, %ebp + xor %r10d, %r10d + lea -16(%r8,%r9,8), %r8 + neg %r9 + + shr $1, %al + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + sbb (%rdx,%r9,8), %r14 + mov %r14, (%rdi,%r9,8) + cmovc 8(%r8), %rbx + sbb 8(%rdx,%r9,8), %r15 + mov %r15, 8(%rdi,%r9,8) + cmovc (%r8), %r10 + setc %al + add %r10, %rbx + adc $0, %rbp + + add $2, %r9 + jnz .Lloop + jmp .Lend + + .align 32, 0x90 +.Lloop: + shr $1, %al + mov -8(%r8), 
%r10 + mov $0, %r13d + mov (%rsi,%r9,8), %r14 + mov 8(%rsi,%r9,8), %r15 + sbb (%rdx,%r9,8), %r14 + cmovnc %r13, %r10 + sbb 8(%rdx,%r9,8), %r15 + mov -16(%r8), %r11 + mov %r14, (%rdi,%r9,8) + mov 16(%rsi,%r9,8), %r14 + mov %r15, 8(%rdi,%r9,8) + cmovnc %r13, %r11 + mov -24(%r8), %r12 + sbb 16(%rdx,%r9,8), %r14 + cmovnc %r13, %r12 + mov 24(%rsi,%r9,8), %r15 + sbb 24(%rdx,%r9,8), %r15 + cmovc -32(%r8), %r13 + setc %al + add %r10, %rbx + adc $0, %rbp + add %r11, %rbx + adc $0, %rbp + add %r12, %rbx + adc $0, %rbp + mov %r14, 16(%rdi,%r9,8) + add %r13, %rbx + lea -32(%r8), %r8 + adc $0, %rbp + mov %r15, 24(%rdi,%r9,8) + add $4, %r9 + jnz .Lloop + +.Lend: + mov %rbx, (%rcx) + mov %rbp, 8(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + .size __gmpn_sub_err1_n,.-__gmpn_sub_err1_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err2_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err2_n.s new file mode 100644 index 0000000..77ebcb7 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err2_n.s @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_sub_err2_n + .type __gmpn_sub_err2_n,@function + +__gmpn_sub_err2_n: + + mov 16(%rsp), %rax + mov 8(%rsp), %r10 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + + sub %r8, %r9 + + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + + test $1, %r10 + jnz .Lodd + + lea -8(%r8,%r10,8), %r8 + neg %r10 + jmp .Ltop + + .align 16, 0x90 +.Lodd: + lea -16(%r8,%r10,8), %r8 + neg %r10 + shr $1, %rax + mov (%rsi,%r10,8), %rbx + sbb (%rdx,%r10,8), %rbx + cmovc 8(%r8), %rbp + cmovc 8(%r8,%r9), %r12 + mov %rbx, (%rdi,%r10,8) + sbb %rax, %rax + inc %r10 + jz .Lend + + .align 16, 0x90 +.Ltop: + mov (%rsi,%r10,8), %rbx + shr $1, %rax + sbb (%rdx,%r10,8), %rbx + mov %rbx, (%rdi,%r10,8) + sbb %r14, %r14 + + mov 8(%rsi,%r10,8), %rbx + sbb 8(%rdx,%r10,8), %rbx + mov %rbx, 8(%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %r14, %rbx + add %rbx, %rbp + adc $0, %r11 + + and (%r8,%r9), %r14 + add %r14, %r12 + adc $0, %r13 + + mov -8(%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov -8(%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + add $2, %r10 + lea -16(%r8), %r8 + jnz .Ltop +.Lend: + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + + and $1, %eax + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + .size __gmpn_sub_err2_n,.-__gmpn_sub_err2_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err3_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err3_n.s new file mode 100644 index 0000000..b995ec7 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_err3_n.s @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_sub_err3_n + .type __gmpn_sub_err3_n,@function + +__gmpn_sub_err3_n: + + mov 24(%rsp), %rax + mov 16(%rsp), %r10 + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + push %rcx + mov 64(%rsp), %rcx + + xor %ebp, %ebp + xor %r11d, %r11d + xor %r12d, %r12d + xor %r13d, %r13d + xor %r14d, %r14d + xor %r15d, %r15d + + sub %r8, %r9 + sub %r8, %rcx + + lea 
-8(%r8,%r10,8), %r8 + lea (%rdi,%r10,8), %rdi + lea (%rsi,%r10,8), %rsi + lea (%rdx,%r10,8), %rdx + neg %r10 + + .align 16, 0x90 +.Ltop: + shr $1, %rax + mov (%rsi,%r10,8), %rax + sbb (%rdx,%r10,8), %rax + mov %rax, (%rdi,%r10,8) + sbb %rax, %rax + + mov (%r8), %rbx + and %rax, %rbx + add %rbx, %rbp + adc $0, %r11 + + mov (%r8,%r9), %rbx + and %rax, %rbx + add %rbx, %r12 + adc $0, %r13 + + mov (%r8,%rcx), %rbx + and %rax, %rbx + add %rbx, %r14 + adc $0, %r15 + + lea -8(%r8), %r8 + inc %r10 + jnz .Ltop + +.Lend: + and $1, %eax + pop %rcx + + mov %rbp, (%rcx) + mov %r11, 8(%rcx) + mov %r12, 16(%rcx) + mov %r13, 24(%rcx) + mov %r14, 32(%rcx) + mov %r15, 40(%rcx) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + ret + .size __gmpn_sub_err3_n,.-__gmpn_sub_err3_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sub_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_n.s new file mode 100644 index 0000000..8c1db0a --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sub_n.s @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_sub_nc + .type __gmpn_sub_nc,@function + +__gmpn_sub_nc: + + + + mov %ecx, %eax + shr $2, %rcx + and $3, %eax + bt $0, %r8 + jrcxz .Llt4 + + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + + .size __gmpn_sub_nc,.-__gmpn_sub_nc + .align 16, 0x90 + .globl __gmpn_sub_n + .type __gmpn_sub_n,@function + +__gmpn_sub_n: + + + mov %ecx, %eax + shr $2, %rcx + and $3, %eax + jrcxz .Llt4 + + mov (%rsi), %r8 + mov 8(%rsi), %r9 + dec %rcx + jmp .Lmid + +.Llt4: dec %eax + mov (%rsi), %r8 + jnz .L2 + sbb (%rdx), %r8 + mov %r8, (%rdi) + adc %eax, %eax + + ret + +.L2: dec %eax + mov 8(%rsi), %r9 + jnz .L3 + sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + adc %eax, %eax + + ret + +.L3: mov 16(%rsi), %r10 + sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + setc %al + + ret + + .align 16, 0x90 +.Ltop: sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + mov %r8, (%rdi) + lea 32(%rsi), %rsi + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + dec %rcx + mov %r11, 24(%rdi) + lea 32(%rdx), %rdx + mov (%rsi), %r8 + mov 8(%rsi), %r9 + lea 32(%rdi), %rdi +.Lmid: mov 16(%rsi), %r10 + mov 24(%rsi), %r11 + jnz .Ltop + +.Lend: lea 32(%rsi), %rsi + sbb (%rdx), %r8 + sbb 8(%rdx), %r9 + sbb 16(%rdx), %r10 + sbb 24(%rdx), %r11 + lea 32(%rdx), %rdx + mov %r8, (%rdi) + mov %r9, 8(%rdi) + mov %r10, 16(%rdi) + mov %r11, 24(%rdi) + lea 32(%rdi), %rdi + + inc %eax + dec %eax + jnz .Llt4 + adc %eax, %eax + + ret + .size __gmpn_sub_n,.-__gmpn_sub_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s new file mode 100644 index 0000000..d257a05 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/sublsh1_n.s @@ -0,0 +1,175 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_sublsh1_n + .type __gmpn_sublsh1_n,@function + +__gmpn_sublsh1_n: + + + push %rbx + push %rbp + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdi,%rcx,8), %rdi + lea (%rsi,%rcx,8), %rsi + lea (%rdx,%rcx,8), %rdx + neg %rcx + xor %ebp, %ebp + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, 
%r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + sbb %r10, %rbp + mov %rbp, 16(%rdi,%rcx,8) + sbb %ebp, %ebp + add $3, %rcx + jmp .Lent + +.Lb10: add %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sub %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + sbb %ebp, %ebp + add $2, %rcx + jmp .Lent + +.Lb01: add %r8, %r8 + sbb %eax, %eax + mov (%rsi,%rcx,8), %rbp + sub %r8, %rbp + mov %rbp, (%rdi,%rcx,8) + sbb %ebp, %ebp + inc %rcx +.Lent: jns .Lend + + .align 16, 0x90 +.Ltop: add %eax, %eax + + mov (%rdx,%rcx,8), %r8 +.Lb00: adc %r8, %r8 + mov 8(%rdx,%rcx,8), %r9 + adc %r9, %r9 + mov 16(%rdx,%rcx,8), %r10 + adc %r10, %r10 + mov 24(%rdx,%rcx,8), %r11 + adc %r11, %r11 + + sbb %eax, %eax + add %ebp, %ebp + + mov (%rsi,%rcx,8), %rbp + mov 8(%rsi,%rcx,8), %rbx + sbb %r8, %rbp + sbb %r9, %rbx + mov %rbp, (%rdi,%rcx,8) + mov %rbx, 8(%rdi,%rcx,8) + mov 16(%rsi,%rcx,8), %rbp + mov 24(%rsi,%rcx,8), %rbx + sbb %r10, %rbp + sbb %r11, %rbx + mov %rbp, 16(%rdi,%rcx,8) + mov %rbx, 24(%rdi,%rcx,8) + + sbb %ebp, %ebp + add $4, %rcx + js .Ltop + +.Lend: add %ebp, %eax + neg %eax + + pop %rbp + pop %rbx + + ret + .size __gmpn_sublsh1_n,.-__gmpn_sublsh1_n diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/submul_1.s b/vere/ext/gmp/gen/x86_64-linux/mpn/submul_1.s new file mode 100644 index 0000000..5e34932 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/submul_1.s @@ -0,0 +1,196 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 16, 0x90 + .globl __gmpn_submul_1 + .type __gmpn_submul_1,@function + +__gmpn_submul_1: + + + + + + + mov (%rsi), %rax + push %rbx + mov %rdx, %rbx + + mul %rcx + mov %rbx, %r11 + + and $3, %ebx + jz .Lb0 + cmp $2, %ebx + jz .Lb2 + jg .Lb3 + +.Lb1: dec %r11 + jne .Lgt1 + sub %rax, (%rdi) + jmp .Lret +.Lgt1: lea 8(%rsi,%r11,8), %rsi + lea -8(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + xor %ebx, %ebx + mov %rax, %r9 + mov (%rsi,%r11,8), %rax + mov %rdx, %r8 + jmp .LL1 + +.Lb0: lea (%rsi,%r11,8), %rsi + lea -16(%rdi,%r11,8), %rdi + neg %r11 + xor %r10, %r10 + mov %rax, %r8 + mov %rdx, %rbx + jmp .LL0 + +.Lb3: lea -8(%rsi,%r11,8), %rsi + lea -24(%rdi,%r11,8), %rdi + neg %r11 + mov %rax, %rbx + mov %rdx, %r10 + jmp .LL3 + +.Lb2: lea -16(%rsi,%r11,8), %rsi + lea -32(%rdi,%r11,8), %rdi + neg %r11 + xor %r8, %r8 + xor %ebx, %ebx + mov %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %rdx, %r9 + jmp .LL2 + + .align 16, 0x90 +.Ltop: sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + mov (%rsi,%r11,8), %rax + adc %rdx, %r8 + mov $0, %r10d +.LL1: mul %rcx + sub %r9, 8(%rdi,%r11,8) + adc %rax, %r8 + adc %rdx, %rbx +.LL0: mov 8(%rsi,%r11,8), %rax + mul %rcx + sub %r8, 16(%rdi,%r11,8) + adc %rax, %rbx + adc %rdx, %r10 +.LL3: mov 16(%rsi,%r11,8), %rax + mul %rcx + sub %rbx, 24(%rdi,%r11,8) + mov $0, %r8d + mov %r8, %rbx + adc %rax, %r10 + mov 24(%rsi,%r11,8), %rax + mov %r8, %r9 + adc %rdx, %r9 +.LL2: mul %rcx + add $4, %r11 + js .Ltop + + sub %r10, (%rdi,%r11,8) + adc %rax, %r9 + adc %r8, %rdx + sub %r9, 8(%rdi,%r11,8) +.Lret: adc $0, %rdx + mov %rdx, %rax + + pop %rbx + + + ret + .size __gmpn_submul_1,.-__gmpn_submul_1 diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s 
b/vere/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s new file mode 100644 index 0000000..4db0497 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/xnor_n.s @@ -0,0 +1,154 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_xnor_n + .type __gmpn_xnor_n,@function + +__gmpn_xnor_n: + + + mov (%rdx), %r8 + not %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 + .byte 0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90 +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov (%rdx,%rcx,8), %r8 + not %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + not %r9 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 + not %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + not %r9 + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + ret + .size __gmpn_xnor_n,.-__gmpn_xnor_n + + + diff --git a/vere/ext/gmp/gen/x86_64-linux/mpn/xor_n.s b/vere/ext/gmp/gen/x86_64-linux/mpn/xor_n.s new file mode 100644 index 0000000..8ef14d0 --- /dev/null +++ b/vere/ext/gmp/gen/x86_64-linux/mpn/xor_n.s @@ -0,0 +1,149 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + .text + .align 32, 0x90 + .globl __gmpn_xor_n + .type __gmpn_xor_n,@function + +__gmpn_xor_n: + + + mov (%rdx), %r8 + mov %ecx, %eax + lea (%rdx,%rcx,8), %rdx + lea (%rsi,%rcx,8), %rsi + lea (%rdi,%rcx,8), %rdi + neg %rcx + and $3, %eax + je .Lb00 + cmp $2, %eax + jc .Lb01 + je .Lb10 + +.Lb11: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + dec %rcx + jmp .Le11 +.Lb10: add $-2, %rcx + jmp .Le10 +.Lb01: xor (%rsi,%rcx,8), %r8 + mov %r8, (%rdi,%rcx,8) + inc %rcx + jz .Lret + +.Ltop: mov (%rdx,%rcx,8), %r8 +.Lb00: mov 8(%rdx,%rcx,8), %r9 + xor (%rsi,%rcx,8), %r8 + xor 8(%rsi,%rcx,8), %r9 + nop + mov %r8, (%rdi,%rcx,8) + mov %r9, 8(%rdi,%rcx,8) +.Le11: mov 16(%rdx,%rcx,8), %r8 +.Le10: mov 24(%rdx,%rcx,8), %r9 + xor 16(%rsi,%rcx,8), %r8 + xor 24(%rsi,%rcx,8), %r9 + mov %r8, 16(%rdi,%rcx,8) + mov %r9, 24(%rdi,%rcx,8) + add $4, %rcx + jnc .Ltop + +.Lret: + ret + .size __gmpn_xor_n,.-__gmpn_xor_n + + + + + |
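Note (not part of the diff above): for orientation, here is a minimal C sketch of the word-level semantics of three of the routines whose hand-written x86-64 implementations are added in this change set: __gmpn_xor_n, __gmpn_xnor_n and __gmpn_sec_tabselect. This is illustrative reference code, not the GMP sources; the ref_* names are hypothetical, the prototypes follow the public mpn conventions, and the mask computation in the last loop stands in for the branch-free dec/sbb sequence the assembly uses.

#include <gmp.h>

/* {rp,n} = {up,n} ^ {vp,n} -- what __gmpn_xor_n computes, one limb at a time. */
static void
ref_xor_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, mp_size_t n)
{
  for (mp_size_t i = 0; i < n; i++)
    rp[i] = up[i] ^ vp[i];
}

/* {rp,n} = ~({up,n} ^ {vp,n}) -- what __gmpn_xnor_n computes. */
static void
ref_xnor_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp, mp_size_t n)
{
  for (mp_size_t i = 0; i < n; i++)
    rp[i] = ~(up[i] ^ vp[i]);
}

/* Constant-time table select, as in __gmpn_sec_tabselect: copy entry `which'
   of `nents' entries (each n limbs) from tab to {rp,n}, reading every entry
   so the memory access pattern does not depend on `which'. */
static void
ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tab,
                   mp_size_t n, mp_size_t nents, mp_size_t which)
{
  for (mp_size_t i = 0; i < n; i++)
    rp[i] = 0;
  for (mp_size_t k = 0; k < nents; k++)
    {
      /* All-ones when k == which, zero otherwise; the assembly derives this
         mask branch-free from the borrow of a decrementing counter.  */
      mp_limb_t mask = -(mp_limb_t) (k == which);
      for (mp_size_t i = 0; i < n; i++)
        rp[i] |= tab[k * n + i] & mask;
    }
}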