diff --git a/dev/gen_mulhigh_basecase.jl b/dev/gen_mulhigh_basecase.jl new file mode 100644 index 0000000000..f1f5194650 --- /dev/null +++ b/dev/gen_mulhigh_basecase.jl @@ -0,0 +1,403 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +_regs = ["%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rax"] +__regs = ["%rbx", "%rbp", "%r12", "%r13", "%r14", "%r15"] + +function R8(reg::String) + if reg == "%rax" + return "%al" + elseif reg == "%rbx" + return "%bl" + elseif reg == "%rcx" + return "%cl" + elseif reg == "%rdx" + return "%dl" + elseif reg == "%rsp" + return "%spl" + elseif reg == "%rbp" + return "%bpl" + elseif reg == "%rsi" + return "%sil" + elseif reg == "%rdi" + return "%dil" + elseif reg == "%r8" + return "%r8b" + elseif reg == "%r9" + return "%r9b" + elseif reg == "%r10" + return "%r10b" + elseif reg == "%r11" + return "%r11b" + elseif reg == "%r12" + return "%r12b" + elseif reg == "%r13" + return "%r13b" + elseif reg == "%r14" + return "%r14b" + elseif reg == "%r15" + return "%r15b" + else + return "hejhoppgummi" + end +end + +function R32(reg::String) + if reg == "%rax" + return "%eax" + elseif reg == "%rbx" + return "%ebx" + elseif reg == "%rcx" + return "%ecx" + elseif reg == "%rdx" + return "%edx" + elseif reg == "%rsp" + return "%esp" + elseif reg == "%rbp" + return "%ebp" + elseif reg == "%rsi" + return "%esi" + elseif reg == "%rdi" + return "%edi" + elseif reg == "%r8" + return "%r8d" + elseif reg == "%r9" + return "%r9d" + elseif reg == "%r10" + return "%r10d" + elseif reg == "%r11" + return "%r11d" + elseif reg == "%r12" + return "%r12d" + elseif reg == "%r13" + return "%r13d" + elseif reg == "%r14" + return "%r14d" + elseif reg == "%r15" + return "%r15d" + else + return "hejhoppgummi" + end +end + +############################################################################### +# Preamble +############################################################################### + +copyright = "# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +#\n" + +preamble = "include(`config.m4')dnl\ndnl\n.text\n" + +function function_pre_post(funname::String) + pre = ".global\tFUNC($funname) +.p2align\t4, 0x90 +TYPE($funname) + +FUNC($funname): +\t.cfi_startproc\n" + + post = ".$(funname)_end: +SIZE($(funname), .$(funname)_end) +.cfi_endproc\n" + + return (pre, post) +end + +############################################################################### +# mulhigh +############################################################################### + +function mulhigh_1() + r0 = _regs[9] # Important that r0 is rax + r1 = _regs[4] + + res = _regs[1] + ap = _regs[2] + bp = _regs[3] + + body = "" + + body *= "\tmov\t0*8($bp), %rdx\n" + body *= "\tmulx\t0*8($ap), $r0, $r1\n" + body *= "\tmov\t$r1, 0*8($res)\n" + + return body * "\n\tret\n" +end + +function mulhigh_2() + r0 = _regs[9] + r1 = _regs[4] + r2 = _regs[5] + + sc = _regs[6] + zr = _regs[7] + + res = _regs[1] + ap = _regs[2] + bp_old = _regs[3] + b1 = _regs[8] + + body = "" + + body *= "\tmov\t1*8($bp_old), $b1\n" + body *= "\tmov\t0*8($bp_old), %rdx\n" + body *= "\txor\t$(R32(zr)), $(R32(zr))\n" + body *= "\tmulx\t0*8($ap), $sc, $r0\n" + body *= "\tmulx\t1*8($ap), $sc, $r1\n" + body *= "\tadcx\t$sc, $r0\n" + body *= "\tadcx\t$zr, $r1\n" + + body *= "\tmov\t$b1, %rdx\n" + body *= "\tmulx\t0*8($ap), $sc, $r2\n" + body *= "\tadcx\t$sc, $r0\n" + body *= "\tadcx\t$r2, $r1\n" + body *= "\tmulx\t1*8($ap), $sc, $r2\n" + body *= "\tadox\t$sc, $r1\n" + body *= "\tadox\t$zr, $r2\n" + body *= "\tadcx\t$zr, $r2\n" + body *= "\tmov\t$r1, 0*8($res)\n" + body *= "\tmov\t$r2, 1*8($res)\n" + + return body * "\n\tret\n" +end + +# When n = 9, push res to stack. +# When n = 10, push res and rsp to xmm register. +# When n = 11, do it like n = 10, and also use bp as r(11) as r(11) is only +# really needed in the last step. +# When n = 12, do it like n = 11 but do not use a zero register. +function mulhigh(n::Int; debug::Bool = false) + if n < 1 + error() + elseif n == 1 + return mulhigh_1() + elseif n == 2 + return mulhigh_2() + elseif n <= 12 + # Continue + else + error() + end + + if debug + res = "res" + ap = "ap" + bp_old = "bp_old" + bp = "bp" + sc = "sc" + zr = "zr" + else + res = _regs[1] + ap = _regs[2] + bp_old = _regs[3] # rdx + bp = _regs[4] # rdx is used by mulx, so we need to switch register for bp + + if n != 12 + sc = _regs[5] # scrap register + zr = (n < 10) ? _regs[6] : "%rsp" # zero + else + sc = "%rsp" + zr = sc + end + + if n < 9 + _r = [_regs[9]; _regs[7:8]; __regs[1:n - 2]] + elseif n == 9 + _r = [_regs[9]; _regs[7:8]; __regs[1:6]; res] + elseif n == 10 + _r = [_regs[9]; _regs[6:8]; __regs[1:6]; res] + elseif n == 11 + _r = [_regs[9]; _regs[6:8]; __regs[1:6]; res; bp] + elseif n == 12 + _r = [_regs[9]; _regs[5:8]; __regs[1:6]; res; bp] + end + end + + r(ix::Int) = debug ? "r$ix" : _r[ix + 1] + + # Push + body = "" + for ix in 1:min(n - 2, 6) + body *= "\tpush\t$(__regs[ix])\n" + end + if n == 9 + body *= "\tpush\t$res\n" + elseif n >= 10 + body *= "\tvmovq\t%rsp, %xmm0\n" + body *= "\tvmovq\t$res, %xmm1\n" + end + body *= "\n" + + # Prepare + body *= "\tmov\t$bp_old, $bp\n" + body *= "\tmov\t0*8($bp_old), %rdx\n" + if n != 12 + body *= "\txor\t$(R32(zr)), $(R32(zr))\n" + end + body *= "\n" + + # First multiplication chain + body *= "\tmulx\t$(n - 2)*8($ap), $sc, $(r(0))\n" + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(1))\n" + if n != 12 + body *= "\tadcx\t$sc, $(r(0))\n" + body *= "\tadcx\t$zr, $(r(1))\n" + else + body *= "\tadd\t$sc, $(r(0))\n" + body *= "\tadc\t\$0, $(r(1))\n" + body *= "\ttest\t%al, %al\n" + end + body *= "\n" + + # Intermediate multiplication chains + for ix in 1:min(n - 2, (n != 12) ? 8 : 9) + body *= "\tmov\t$ix*8($bp), %rdx\n" + + body *= "\tmulx\t$(n - 2 - ix)*8($ap), $sc, $(r(ix + 2))\n" + body *= "\tmulx\t$(n - 1 - ix)*8($ap), $sc, $(r(ix + 1))\n" + body *= "\tadcx\t$(r(ix + 2)), $(r(0))\n" + body *= "\tadox\t$sc, $(r(0))\n" + body *= "\tadcx\t$(r(ix + 1)), $(r(1))\n" + + for jx in 1:ix - 1 + body *= "\tmulx\t$(n - 1 - ix + jx)*8($ap), $sc, $(r(ix + 1))\n" + body *= "\tadox\t$sc, $(r(jx + 0))\n" + body *= "\tadcx\t$(r(ix + 1)), $(r(jx + 1))\n" + end + + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(ix + 1))\n" + body *= "\tadox\t$sc, $(r(ix + 0))\n" + if n == 12 + body *= "\tmov\t\$0, $(R32(zr))\n" + end + body *= "\tadcx\t$zr, $(r(ix + 1))\n" + body *= "\tadox\t$zr, $(r(ix + 1))\n" + + body *= "\n" + end + + if n >= 11 + N = n - 1 + body *= "\tmov\t$(n - 2)*8($bp), %rdx\n" + for ix in 0:n - 2 + body *= "\tmulx\t$ix*8($ap), $sc, $(r(N))\n" + if ix == 0 + body *= "\tadcx\t$(r(N)), $(r(ix + 0))\n" + else + body *= "\tadox\t$sc, $(r(ix - 1))\n" + body *= "\tadcx\t$(r(N)), $(r(ix + 0))\n" + end + end + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(N))\n" + body *= "\tadox\t$sc, $(r(N - 1))\n" + if n == 12 + body *= "\tmov\t\$0, $(R32(zr))\n" + end + body *= "\tadcx\t$zr, $(r(N))\n" + body *= "\tadox\t$zr, $(r(N))\n" + body *= "\n" + end + + # Last multiplication chain + body *= "\tmov\t$(n - 1)*8($bp), %rdx\n" + for ix in 0:n - 2 + body *= "\tmulx\t$ix*8($ap), $sc, $(r(n))\n" + if ix % 2 == 0 + body *= "\tadcx\t$sc, $(r(ix + 0))\n" + body *= "\tadcx\t$(r(n)), $(r(ix + 1))\n" + else + body *= "\tadox\t$sc, $(r(ix + 0))\n" + body *= "\tadox\t$(r(n)), $(r(ix + 1))\n" + end + end + body *= "\tmulx\t$(n - 1)*8($ap), $sc, $(r(n))\n" + if (n - 1) % 2 == 0 + body *= "\tadcx\t$sc, $(r(n - 1))\n" + else + body *= "\tadox\t$sc, $(r(n - 1))\n" + end + if n == 12 + body *= "\tmov\t\$0, $(R32(zr))\n" + end + body *= "\tadcx\t$zr, $(r(n))\n" + if n == 9 + # Use scrap register for storing pointer to res + res = sc + body *= "\tpop\t$res\n" + end + body *= "\tadox\t$zr, $(r(n))\n" + body *= "\n" + + if n == 10 || n == 11 + res, zr = sc, "error zr" + body *= "\tvmovq\t%xmm1, $res\n" + body *= "\tvmovq\t%xmm0, %rsp\n" + elseif n == 12 + res = sc + body *= "\tvmovq\t%xmm1, $res\n" + end + + # Store result + for ix in 1:n + body *= "\tmov\t$(r(ix)), $(ix - 1)*8($res)\n" + end + body *= "\n" + + # Pop + if n == 12 + body *= "\tvmovq\t%xmm0, %rsp\n" + end + for ix in min(n - 2, 6):-1:1 + body *= "\tpop\t$(__regs[ix])\n" + end + body *= "\n" + + if debug + print(body * "\tret\n") + else + return body * "\tret\n" + end +end + +############################################################################### +# Generate file +############################################################################### + +function gen_mulhigh(m::Int, nofile::Bool = false) + (pre, post) = function_pre_post("flint_mpn_mulhigh_$m") + functionbody = mulhigh(m) + + str = "$copyright\n$preamble\n$pre$functionbody$post" + + if nofile + print(str) + else + path = String(@__DIR__) * "/../src/mpn_extras/broadwell/mulhigh_$m.asm" + file = open(path, "w") + write(file, str) + close(file) + end +end + +function gen_all() + for m in 1:12 + gen_mulhigh(m) + end +end diff --git a/doc/source/mpn_extras.rst b/doc/source/mpn_extras.rst index 2b031c3a7d..dfc61a9b5a 100644 --- a/doc/source/mpn_extras.rst +++ b/doc/source/mpn_extras.rst @@ -69,6 +69,17 @@ Multiplication Return the normalized length of `y` if `y \ge 0` and `y` fits into `n` limbs. Otherwise, return `-1`. `y` may alias `x_1` but is not allowed to alias `x_2`. +.. function:: mp_limb_t flint_mpn_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) + + Sets *{rp, n}* to the "high part" of *{xp, n}* times *{yp, n}* and returns + the *n*-th least significant limb, in the sense that the multiplication of + the *n - 1* lower limbs are never accounted for. Hence, the multiplication + is typically non-exact for sizes larger than one. The highest error + is *n + 2* ULP in the returned limb. + + .. note:: This function may not exist on processors not supporting the ADX + instruction set. + Divisibility -------------------------------------------------------------------------------- diff --git a/src/mpn_extras.h b/src/mpn_extras.h index cf9ffe6f8a..7594cbe6b2 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -229,6 +229,27 @@ flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n) flint_mpn_mul((_z), (_y), (_yn), (_x), (_xn)); \ } +#if FLINT_HAVE_ADX +# define FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH 12 +#else +# define FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH 0 +#endif + +#define FLINT_HAVE_MULHIGH_N_FUNC(n) ((n) <= FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH) + +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_mulhigh_n_func_tab[]; + +/* NOTE: Aliasing is allowed! */ +/* FIXME: How do we proceed for bigger n? */ +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + FLINT_ASSERT(FLINT_HAVE_MULHIGH_N_FUNC(n)); + + return flint_mpn_mulhigh_n_func_tab[n - 1](rp, xp, yp); +} + /* return the high limb of a two limb left shift by n < GMP_LIMB_BITS bits. Note: if GMP_NAIL_BITS != 0, the rest of flint is already broken anyways. diff --git a/src/mpn_extras/broadwell/mulhigh_1.asm b/src/mpn_extras/broadwell/mulhigh_1.asm new file mode 100644 index 0000000000..4bd024e3c7 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_1.asm @@ -0,0 +1,29 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_1) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_1) + +FUNC(flint_mpn_mulhigh_1): + .cfi_startproc + mov 0*8(%rdx), %rdx + mulx 0*8(%rsi), %rax, %rcx + mov %rcx, 0*8(%rdi) + + ret +.flint_mpn_mulhigh_1_end: +SIZE(flint_mpn_mulhigh_1, .flint_mpn_mulhigh_1_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_10.asm b/src/mpn_extras/broadwell/mulhigh_10.asm new file mode 100644 index 0000000000..d92c3d0699 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_10.asm @@ -0,0 +1,268 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_10) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_10) + +FUNC(flint_mpn_mulhigh_10): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + vmovq %rsp, %xmm0 + vmovq %rdi, %xmm1 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %esp, %esp + + mulx 8*8(%rsi), %r8, %rax + mulx 9*8(%rsi), %r8, %r9 + adcx %r8, %rax + adcx %rsp, %r9 + + mov 1*8(%rcx), %rdx + mulx 7*8(%rsi), %r8, %r11 + mulx 8*8(%rsi), %r8, %r10 + adcx %r11, %rax + adox %r8, %rax + adcx %r10, %r9 + mulx 9*8(%rsi), %r8, %r10 + adox %r8, %r9 + adcx %rsp, %r10 + adox %rsp, %r10 + + mov 2*8(%rcx), %rdx + mulx 6*8(%rsi), %r8, %rbx + mulx 7*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r9 + mulx 8*8(%rsi), %r8, %r11 + adox %r8, %r9 + adcx %r11, %r10 + mulx 9*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %rsp, %r11 + adox %rsp, %r11 + + mov 3*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %rbp + mulx 6*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r9 + mulx 7*8(%rsi), %r8, %rbx + adox %r8, %r9 + adcx %rbx, %r10 + mulx 8*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 9*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %rsp, %rbx + adox %rsp, %rbx + + mov 4*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %r12 + mulx 5*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r9 + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %r9 + adcx %rbp, %r10 + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 8*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 9*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %rsp, %rbp + adox %rsp, %rbp + + mov 5*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r13 + mulx 4*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r9 + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %r9 + adcx %r12, %r10 + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 8*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 9*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %rsp, %r12 + adox %rsp, %r12 + + mov 6*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r14 + mulx 3*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r9 + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %r9 + adcx %r13, %r10 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 8*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 9*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %rsp, %r13 + adox %rsp, %r13 + + mov 7*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r15 + mulx 2*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r9 + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %r9 + adcx %r14, %r10 + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 8*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 9*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %rsp, %r14 + adox %rsp, %r14 + + mov 8*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + mulx 1*8(%rsi), %r8, %r15 + adcx %rdi, %rax + adox %r8, %rax + adcx %r15, %r9 + mulx 2*8(%rsi), %r8, %r15 + adox %r8, %r9 + adcx %r15, %r10 + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %r10 + adcx %r15, %r11 + mulx 4*8(%rsi), %r8, %r15 + adox %r8, %r11 + adcx %r15, %rbx + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %rbx + adcx %r15, %rbp + mulx 6*8(%rsi), %r8, %r15 + adox %r8, %rbp + adcx %r15, %r12 + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %r12 + adcx %r15, %r13 + mulx 8*8(%rsi), %r8, %r15 + adox %r8, %r13 + adcx %r15, %r14 + mulx 9*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %rsp, %r15 + adox %rsp, %r15 + + mov 9*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + adcx %r8, %rax + adcx %rdi, %r9 + mulx 1*8(%rsi), %r8, %rdi + adox %r8, %r9 + adox %rdi, %r10 + mulx 2*8(%rsi), %r8, %rdi + adcx %r8, %r10 + adcx %rdi, %r11 + mulx 3*8(%rsi), %r8, %rdi + adox %r8, %r11 + adox %rdi, %rbx + mulx 4*8(%rsi), %r8, %rdi + adcx %r8, %rbx + adcx %rdi, %rbp + mulx 5*8(%rsi), %r8, %rdi + adox %r8, %rbp + adox %rdi, %r12 + mulx 6*8(%rsi), %r8, %rdi + adcx %r8, %r12 + adcx %rdi, %r13 + mulx 7*8(%rsi), %r8, %rdi + adox %r8, %r13 + adox %rdi, %r14 + mulx 8*8(%rsi), %r8, %rdi + adcx %r8, %r14 + adcx %rdi, %r15 + mulx 9*8(%rsi), %r8, %rdi + adox %r8, %r15 + adcx %rsp, %rdi + adox %rsp, %rdi + + vmovq %xmm1, %r8 + vmovq %xmm0, %rsp + mov %r9, 0*8(%r8) + mov %r10, 1*8(%r8) + mov %r11, 2*8(%r8) + mov %rbx, 3*8(%r8) + mov %rbp, 4*8(%r8) + mov %r12, 5*8(%r8) + mov %r13, 6*8(%r8) + mov %r14, 7*8(%r8) + mov %r15, 8*8(%r8) + mov %rdi, 9*8(%r8) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_10_end: +SIZE(flint_mpn_mulhigh_10, .flint_mpn_mulhigh_10_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_11.asm b/src/mpn_extras/broadwell/mulhigh_11.asm new file mode 100644 index 0000000000..aced9621d6 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_11.asm @@ -0,0 +1,307 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_11) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_11) + +FUNC(flint_mpn_mulhigh_11): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + vmovq %rsp, %xmm0 + vmovq %rdi, %xmm1 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %esp, %esp + + mulx 9*8(%rsi), %r8, %rax + mulx 10*8(%rsi), %r8, %r9 + adcx %r8, %rax + adcx %rsp, %r9 + + mov 1*8(%rcx), %rdx + mulx 8*8(%rsi), %r8, %r11 + mulx 9*8(%rsi), %r8, %r10 + adcx %r11, %rax + adox %r8, %rax + adcx %r10, %r9 + mulx 10*8(%rsi), %r8, %r10 + adox %r8, %r9 + adcx %rsp, %r10 + adox %rsp, %r10 + + mov 2*8(%rcx), %rdx + mulx 7*8(%rsi), %r8, %rbx + mulx 8*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r9 + mulx 9*8(%rsi), %r8, %r11 + adox %r8, %r9 + adcx %r11, %r10 + mulx 10*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %rsp, %r11 + adox %rsp, %r11 + + mov 3*8(%rcx), %rdx + mulx 6*8(%rsi), %r8, %rbp + mulx 7*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r9 + mulx 8*8(%rsi), %r8, %rbx + adox %r8, %r9 + adcx %rbx, %r10 + mulx 9*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 10*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %rsp, %rbx + adox %rsp, %rbx + + mov 4*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %r12 + mulx 6*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r9 + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %r9 + adcx %rbp, %r10 + mulx 8*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 9*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 10*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %rsp, %rbp + adox %rsp, %rbp + + mov 5*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %r13 + mulx 5*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r9 + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %r9 + adcx %r12, %r10 + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 8*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 9*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 10*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %rsp, %r12 + adox %rsp, %r12 + + mov 6*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r14 + mulx 4*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r9 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r9 + adcx %r13, %r10 + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 8*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 9*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 10*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %rsp, %r13 + adox %rsp, %r13 + + mov 7*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r15 + mulx 3*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r9 + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %r9 + adcx %r14, %r10 + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 8*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 9*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 10*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %rsp, %r14 + adox %rsp, %r14 + + mov 8*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %rdi + mulx 2*8(%rsi), %r8, %r15 + adcx %rdi, %rax + adox %r8, %rax + adcx %r15, %r9 + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %r9 + adcx %r15, %r10 + mulx 4*8(%rsi), %r8, %r15 + adox %r8, %r10 + adcx %r15, %r11 + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %r11 + adcx %r15, %rbx + mulx 6*8(%rsi), %r8, %r15 + adox %r8, %rbx + adcx %r15, %rbp + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %rbp + adcx %r15, %r12 + mulx 8*8(%rsi), %r8, %r15 + adox %r8, %r12 + adcx %r15, %r13 + mulx 9*8(%rsi), %r8, %r15 + adox %r8, %r13 + adcx %r15, %r14 + mulx 10*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %rsp, %r15 + adox %rsp, %r15 + + mov 9*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + adcx %rdi, %rax + mulx 1*8(%rsi), %r8, %rdi + adox %r8, %rax + adcx %rdi, %r9 + mulx 2*8(%rsi), %r8, %rdi + adox %r8, %r9 + adcx %rdi, %r10 + mulx 3*8(%rsi), %r8, %rdi + adox %r8, %r10 + adcx %rdi, %r11 + mulx 4*8(%rsi), %r8, %rdi + adox %r8, %r11 + adcx %rdi, %rbx + mulx 5*8(%rsi), %r8, %rdi + adox %r8, %rbx + adcx %rdi, %rbp + mulx 6*8(%rsi), %r8, %rdi + adox %r8, %rbp + adcx %rdi, %r12 + mulx 7*8(%rsi), %r8, %rdi + adox %r8, %r12 + adcx %rdi, %r13 + mulx 8*8(%rsi), %r8, %rdi + adox %r8, %r13 + adcx %rdi, %r14 + mulx 9*8(%rsi), %r8, %rdi + adox %r8, %r14 + adcx %rdi, %r15 + mulx 10*8(%rsi), %r8, %rdi + adox %r8, %r15 + adcx %rsp, %rdi + adox %rsp, %rdi + + mov 10*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rcx + adcx %r8, %rax + adcx %rcx, %r9 + mulx 1*8(%rsi), %r8, %rcx + adox %r8, %r9 + adox %rcx, %r10 + mulx 2*8(%rsi), %r8, %rcx + adcx %r8, %r10 + adcx %rcx, %r11 + mulx 3*8(%rsi), %r8, %rcx + adox %r8, %r11 + adox %rcx, %rbx + mulx 4*8(%rsi), %r8, %rcx + adcx %r8, %rbx + adcx %rcx, %rbp + mulx 5*8(%rsi), %r8, %rcx + adox %r8, %rbp + adox %rcx, %r12 + mulx 6*8(%rsi), %r8, %rcx + adcx %r8, %r12 + adcx %rcx, %r13 + mulx 7*8(%rsi), %r8, %rcx + adox %r8, %r13 + adox %rcx, %r14 + mulx 8*8(%rsi), %r8, %rcx + adcx %r8, %r14 + adcx %rcx, %r15 + mulx 9*8(%rsi), %r8, %rcx + adox %r8, %r15 + adox %rcx, %rdi + mulx 10*8(%rsi), %r8, %rcx + adcx %r8, %rdi + adcx %rsp, %rcx + adox %rsp, %rcx + + vmovq %xmm1, %r8 + vmovq %xmm0, %rsp + mov %r9, 0*8(%r8) + mov %r10, 1*8(%r8) + mov %r11, 2*8(%r8) + mov %rbx, 3*8(%r8) + mov %rbp, 4*8(%r8) + mov %r12, 5*8(%r8) + mov %r13, 6*8(%r8) + mov %r14, 7*8(%r8) + mov %r15, 8*8(%r8) + mov %rdi, 9*8(%r8) + mov %rcx, 10*8(%r8) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_11_end: +SIZE(flint_mpn_mulhigh_11, .flint_mpn_mulhigh_11_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_12.asm b/src/mpn_extras/broadwell/mulhigh_12.asm new file mode 100644 index 0000000000..bc99365cbb --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_12.asm @@ -0,0 +1,360 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_12) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_12) + +FUNC(flint_mpn_mulhigh_12): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + vmovq %rsp, %xmm0 + vmovq %rdi, %xmm1 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + + mulx 10*8(%rsi), %rsp, %rax + mulx 11*8(%rsi), %rsp, %r8 + add %rsp, %rax + adc $0, %r8 + test %al, %al + + mov 1*8(%rcx), %rdx + mulx 9*8(%rsi), %rsp, %r10 + mulx 10*8(%rsi), %rsp, %r9 + adcx %r10, %rax + adox %rsp, %rax + adcx %r9, %r8 + mulx 11*8(%rsi), %rsp, %r9 + adox %rsp, %r8 + mov $0, %esp + adcx %rsp, %r9 + adox %rsp, %r9 + + mov 2*8(%rcx), %rdx + mulx 8*8(%rsi), %rsp, %r11 + mulx 9*8(%rsi), %rsp, %r10 + adcx %r11, %rax + adox %rsp, %rax + adcx %r10, %r8 + mulx 10*8(%rsi), %rsp, %r10 + adox %rsp, %r8 + adcx %r10, %r9 + mulx 11*8(%rsi), %rsp, %r10 + adox %rsp, %r9 + mov $0, %esp + adcx %rsp, %r10 + adox %rsp, %r10 + + mov 3*8(%rcx), %rdx + mulx 7*8(%rsi), %rsp, %rbx + mulx 8*8(%rsi), %rsp, %r11 + adcx %rbx, %rax + adox %rsp, %rax + adcx %r11, %r8 + mulx 9*8(%rsi), %rsp, %r11 + adox %rsp, %r8 + adcx %r11, %r9 + mulx 10*8(%rsi), %rsp, %r11 + adox %rsp, %r9 + adcx %r11, %r10 + mulx 11*8(%rsi), %rsp, %r11 + adox %rsp, %r10 + mov $0, %esp + adcx %rsp, %r11 + adox %rsp, %r11 + + mov 4*8(%rcx), %rdx + mulx 6*8(%rsi), %rsp, %rbp + mulx 7*8(%rsi), %rsp, %rbx + adcx %rbp, %rax + adox %rsp, %rax + adcx %rbx, %r8 + mulx 8*8(%rsi), %rsp, %rbx + adox %rsp, %r8 + adcx %rbx, %r9 + mulx 9*8(%rsi), %rsp, %rbx + adox %rsp, %r9 + adcx %rbx, %r10 + mulx 10*8(%rsi), %rsp, %rbx + adox %rsp, %r10 + adcx %rbx, %r11 + mulx 11*8(%rsi), %rsp, %rbx + adox %rsp, %r11 + mov $0, %esp + adcx %rsp, %rbx + adox %rsp, %rbx + + mov 5*8(%rcx), %rdx + mulx 5*8(%rsi), %rsp, %r12 + mulx 6*8(%rsi), %rsp, %rbp + adcx %r12, %rax + adox %rsp, %rax + adcx %rbp, %r8 + mulx 7*8(%rsi), %rsp, %rbp + adox %rsp, %r8 + adcx %rbp, %r9 + mulx 8*8(%rsi), %rsp, %rbp + adox %rsp, %r9 + adcx %rbp, %r10 + mulx 9*8(%rsi), %rsp, %rbp + adox %rsp, %r10 + adcx %rbp, %r11 + mulx 10*8(%rsi), %rsp, %rbp + adox %rsp, %r11 + adcx %rbp, %rbx + mulx 11*8(%rsi), %rsp, %rbp + adox %rsp, %rbx + mov $0, %esp + adcx %rsp, %rbp + adox %rsp, %rbp + + mov 6*8(%rcx), %rdx + mulx 4*8(%rsi), %rsp, %r13 + mulx 5*8(%rsi), %rsp, %r12 + adcx %r13, %rax + adox %rsp, %rax + adcx %r12, %r8 + mulx 6*8(%rsi), %rsp, %r12 + adox %rsp, %r8 + adcx %r12, %r9 + mulx 7*8(%rsi), %rsp, %r12 + adox %rsp, %r9 + adcx %r12, %r10 + mulx 8*8(%rsi), %rsp, %r12 + adox %rsp, %r10 + adcx %r12, %r11 + mulx 9*8(%rsi), %rsp, %r12 + adox %rsp, %r11 + adcx %r12, %rbx + mulx 10*8(%rsi), %rsp, %r12 + adox %rsp, %rbx + adcx %r12, %rbp + mulx 11*8(%rsi), %rsp, %r12 + adox %rsp, %rbp + mov $0, %esp + adcx %rsp, %r12 + adox %rsp, %r12 + + mov 7*8(%rcx), %rdx + mulx 3*8(%rsi), %rsp, %r14 + mulx 4*8(%rsi), %rsp, %r13 + adcx %r14, %rax + adox %rsp, %rax + adcx %r13, %r8 + mulx 5*8(%rsi), %rsp, %r13 + adox %rsp, %r8 + adcx %r13, %r9 + mulx 6*8(%rsi), %rsp, %r13 + adox %rsp, %r9 + adcx %r13, %r10 + mulx 7*8(%rsi), %rsp, %r13 + adox %rsp, %r10 + adcx %r13, %r11 + mulx 8*8(%rsi), %rsp, %r13 + adox %rsp, %r11 + adcx %r13, %rbx + mulx 9*8(%rsi), %rsp, %r13 + adox %rsp, %rbx + adcx %r13, %rbp + mulx 10*8(%rsi), %rsp, %r13 + adox %rsp, %rbp + adcx %r13, %r12 + mulx 11*8(%rsi), %rsp, %r13 + adox %rsp, %r12 + mov $0, %esp + adcx %rsp, %r13 + adox %rsp, %r13 + + mov 8*8(%rcx), %rdx + mulx 2*8(%rsi), %rsp, %r15 + mulx 3*8(%rsi), %rsp, %r14 + adcx %r15, %rax + adox %rsp, %rax + adcx %r14, %r8 + mulx 4*8(%rsi), %rsp, %r14 + adox %rsp, %r8 + adcx %r14, %r9 + mulx 5*8(%rsi), %rsp, %r14 + adox %rsp, %r9 + adcx %r14, %r10 + mulx 6*8(%rsi), %rsp, %r14 + adox %rsp, %r10 + adcx %r14, %r11 + mulx 7*8(%rsi), %rsp, %r14 + adox %rsp, %r11 + adcx %r14, %rbx + mulx 8*8(%rsi), %rsp, %r14 + adox %rsp, %rbx + adcx %r14, %rbp + mulx 9*8(%rsi), %rsp, %r14 + adox %rsp, %rbp + adcx %r14, %r12 + mulx 10*8(%rsi), %rsp, %r14 + adox %rsp, %r12 + adcx %r14, %r13 + mulx 11*8(%rsi), %rsp, %r14 + adox %rsp, %r13 + mov $0, %esp + adcx %rsp, %r14 + adox %rsp, %r14 + + mov 9*8(%rcx), %rdx + mulx 1*8(%rsi), %rsp, %rdi + mulx 2*8(%rsi), %rsp, %r15 + adcx %rdi, %rax + adox %rsp, %rax + adcx %r15, %r8 + mulx 3*8(%rsi), %rsp, %r15 + adox %rsp, %r8 + adcx %r15, %r9 + mulx 4*8(%rsi), %rsp, %r15 + adox %rsp, %r9 + adcx %r15, %r10 + mulx 5*8(%rsi), %rsp, %r15 + adox %rsp, %r10 + adcx %r15, %r11 + mulx 6*8(%rsi), %rsp, %r15 + adox %rsp, %r11 + adcx %r15, %rbx + mulx 7*8(%rsi), %rsp, %r15 + adox %rsp, %rbx + adcx %r15, %rbp + mulx 8*8(%rsi), %rsp, %r15 + adox %rsp, %rbp + adcx %r15, %r12 + mulx 9*8(%rsi), %rsp, %r15 + adox %rsp, %r12 + adcx %r15, %r13 + mulx 10*8(%rsi), %rsp, %r15 + adox %rsp, %r13 + adcx %r15, %r14 + mulx 11*8(%rsi), %rsp, %r15 + adox %rsp, %r14 + mov $0, %esp + adcx %rsp, %r15 + adox %rsp, %r15 + + mov 10*8(%rcx), %rdx + mulx 0*8(%rsi), %rsp, %rdi + adcx %rdi, %rax + mulx 1*8(%rsi), %rsp, %rdi + adox %rsp, %rax + adcx %rdi, %r8 + mulx 2*8(%rsi), %rsp, %rdi + adox %rsp, %r8 + adcx %rdi, %r9 + mulx 3*8(%rsi), %rsp, %rdi + adox %rsp, %r9 + adcx %rdi, %r10 + mulx 4*8(%rsi), %rsp, %rdi + adox %rsp, %r10 + adcx %rdi, %r11 + mulx 5*8(%rsi), %rsp, %rdi + adox %rsp, %r11 + adcx %rdi, %rbx + mulx 6*8(%rsi), %rsp, %rdi + adox %rsp, %rbx + adcx %rdi, %rbp + mulx 7*8(%rsi), %rsp, %rdi + adox %rsp, %rbp + adcx %rdi, %r12 + mulx 8*8(%rsi), %rsp, %rdi + adox %rsp, %r12 + adcx %rdi, %r13 + mulx 9*8(%rsi), %rsp, %rdi + adox %rsp, %r13 + adcx %rdi, %r14 + mulx 10*8(%rsi), %rsp, %rdi + adox %rsp, %r14 + adcx %rdi, %r15 + mulx 11*8(%rsi), %rsp, %rdi + adox %rsp, %r15 + mov $0, %esp + adcx %rsp, %rdi + adox %rsp, %rdi + + mov 11*8(%rcx), %rdx + mulx 0*8(%rsi), %rsp, %rcx + adcx %rsp, %rax + adcx %rcx, %r8 + mulx 1*8(%rsi), %rsp, %rcx + adox %rsp, %r8 + adox %rcx, %r9 + mulx 2*8(%rsi), %rsp, %rcx + adcx %rsp, %r9 + adcx %rcx, %r10 + mulx 3*8(%rsi), %rsp, %rcx + adox %rsp, %r10 + adox %rcx, %r11 + mulx 4*8(%rsi), %rsp, %rcx + adcx %rsp, %r11 + adcx %rcx, %rbx + mulx 5*8(%rsi), %rsp, %rcx + adox %rsp, %rbx + adox %rcx, %rbp + mulx 6*8(%rsi), %rsp, %rcx + adcx %rsp, %rbp + adcx %rcx, %r12 + mulx 7*8(%rsi), %rsp, %rcx + adox %rsp, %r12 + adox %rcx, %r13 + mulx 8*8(%rsi), %rsp, %rcx + adcx %rsp, %r13 + adcx %rcx, %r14 + mulx 9*8(%rsi), %rsp, %rcx + adox %rsp, %r14 + adox %rcx, %r15 + mulx 10*8(%rsi), %rsp, %rcx + adcx %rsp, %r15 + adcx %rcx, %rdi + mulx 11*8(%rsi), %rsp, %rcx + adox %rsp, %rdi + mov $0, %esp + adcx %rsp, %rcx + adox %rsp, %rcx + + vmovq %xmm1, %rsp + mov %r8, 0*8(%rsp) + mov %r9, 1*8(%rsp) + mov %r10, 2*8(%rsp) + mov %r11, 3*8(%rsp) + mov %rbx, 4*8(%rsp) + mov %rbp, 5*8(%rsp) + mov %r12, 6*8(%rsp) + mov %r13, 7*8(%rsp) + mov %r14, 8*8(%rsp) + mov %r15, 9*8(%rsp) + mov %rdi, 10*8(%rsp) + mov %rcx, 11*8(%rsp) + + vmovq %xmm0, %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_12_end: +SIZE(flint_mpn_mulhigh_12, .flint_mpn_mulhigh_12_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_2.asm b/src/mpn_extras/broadwell/mulhigh_2.asm new file mode 100644 index 0000000000..2ebbfa0d35 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_2.asm @@ -0,0 +1,43 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_2) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_2) + +FUNC(flint_mpn_mulhigh_2): + .cfi_startproc + mov 1*8(%rdx), %r11 + mov 0*8(%rdx), %rdx + xor %r10d, %r10d + mulx 0*8(%rsi), %r9, %rax + mulx 1*8(%rsi), %r9, %rcx + adcx %r9, %rax + adcx %r10, %rcx + mov %r11, %rdx + mulx 0*8(%rsi), %r9, %r8 + adcx %r9, %rax + adcx %r8, %rcx + mulx 1*8(%rsi), %r9, %r8 + adox %r9, %rcx + adox %r10, %r8 + adcx %r10, %r8 + mov %rcx, 0*8(%rdi) + mov %r8, 1*8(%rdi) + + ret +.flint_mpn_mulhigh_2_end: +SIZE(flint_mpn_mulhigh_2, .flint_mpn_mulhigh_2_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_3.asm b/src/mpn_extras/broadwell/mulhigh_3.asm new file mode 100644 index 0000000000..826d15b348 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_3.asm @@ -0,0 +1,65 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_3) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_3) + +FUNC(flint_mpn_mulhigh_3): + .cfi_startproc + push %rbx + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 1*8(%rsi), %r8, %rax + mulx 2*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbx + mulx 1*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 2*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbx + adcx %r8, %rax + adcx %rbx, %r10 + mulx 1*8(%rsi), %r8, %rbx + adox %r8, %r10 + adox %rbx, %r11 + mulx 2*8(%rsi), %r8, %rbx + adcx %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + + pop %rbx + + ret +.flint_mpn_mulhigh_3_end: +SIZE(flint_mpn_mulhigh_3, .flint_mpn_mulhigh_3_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_4.asm b/src/mpn_extras/broadwell/mulhigh_4.asm new file mode 100644 index 0000000000..746b2fc915 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_4.asm @@ -0,0 +1,85 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_4) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_4) + +FUNC(flint_mpn_mulhigh_4): + .cfi_startproc + push %rbx + push %rbp + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 2*8(%rsi), %r8, %rax + mulx 3*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %rbx + mulx 2*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 3*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbp + mulx 1*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 2*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 3*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rbp + adcx %r8, %rax + adcx %rbp, %r10 + mulx 1*8(%rsi), %r8, %rbp + adox %r8, %r10 + adox %rbp, %r11 + mulx 2*8(%rsi), %r8, %rbp + adcx %r8, %r11 + adcx %rbp, %rbx + mulx 3*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_4_end: +SIZE(flint_mpn_mulhigh_4, .flint_mpn_mulhigh_4_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_5.asm b/src/mpn_extras/broadwell/mulhigh_5.asm new file mode 100644 index 0000000000..c77c9c08aa --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_5.asm @@ -0,0 +1,108 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_5) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_5) + +FUNC(flint_mpn_mulhigh_5): + .cfi_startproc + push %rbx + push %rbp + push %r12 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 3*8(%rsi), %r8, %rax + mulx 4*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %rbx + mulx 3*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 4*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %rbp + mulx 2*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 3*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 4*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r12 + mulx 1*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 2*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 3*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 4*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r12 + adcx %r8, %rax + adcx %r12, %r10 + mulx 1*8(%rsi), %r8, %r12 + adox %r8, %r10 + adox %r12, %r11 + mulx 2*8(%rsi), %r8, %r12 + adcx %r8, %r11 + adcx %r12, %rbx + mulx 3*8(%rsi), %r8, %r12 + adox %r8, %rbx + adox %r12, %rbp + mulx 4*8(%rsi), %r8, %r12 + adcx %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_5_end: +SIZE(flint_mpn_mulhigh_5, .flint_mpn_mulhigh_5_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_6.asm b/src/mpn_extras/broadwell/mulhigh_6.asm new file mode 100644 index 0000000000..686a744d46 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_6.asm @@ -0,0 +1,134 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_6) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_6) + +FUNC(flint_mpn_mulhigh_6): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 4*8(%rsi), %r8, %rax + mulx 5*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %rbx + mulx 4*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 5*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %rbp + mulx 3*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 4*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 5*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r12 + mulx 2*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 3*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 4*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 5*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r13 + mulx 1*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 2*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 3*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 4*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r13 + adcx %r8, %rax + adcx %r13, %r10 + mulx 1*8(%rsi), %r8, %r13 + adox %r8, %r10 + adox %r13, %r11 + mulx 2*8(%rsi), %r8, %r13 + adcx %r8, %r11 + adcx %r13, %rbx + mulx 3*8(%rsi), %r8, %r13 + adox %r8, %rbx + adox %r13, %rbp + mulx 4*8(%rsi), %r8, %r13 + adcx %r8, %rbp + adcx %r13, %r12 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + mov %r13, 5*8(%rdi) + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_6_end: +SIZE(flint_mpn_mulhigh_6, .flint_mpn_mulhigh_6_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_7.asm b/src/mpn_extras/broadwell/mulhigh_7.asm new file mode 100644 index 0000000000..7528aae87b --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_7.asm @@ -0,0 +1,163 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_7) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_7) + +FUNC(flint_mpn_mulhigh_7): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 5*8(%rsi), %r8, %rax + mulx 6*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %rbx + mulx 5*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 6*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %rbp + mulx 4*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 5*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 6*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r12 + mulx 3*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 4*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 5*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r13 + mulx 2*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 3*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 4*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r14 + mulx 1*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r10 + mulx 2*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 3*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov 6*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r14 + adcx %r8, %rax + adcx %r14, %r10 + mulx 1*8(%rsi), %r8, %r14 + adox %r8, %r10 + adox %r14, %r11 + mulx 2*8(%rsi), %r8, %r14 + adcx %r8, %r11 + adcx %r14, %rbx + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %rbx + adox %r14, %rbp + mulx 4*8(%rsi), %r8, %r14 + adcx %r8, %rbp + adcx %r14, %r12 + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %r12 + adox %r14, %r13 + mulx 6*8(%rsi), %r8, %r14 + adcx %r8, %r13 + adcx %r9, %r14 + adox %r9, %r14 + + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + mov %r13, 5*8(%rdi) + mov %r14, 6*8(%rdi) + + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_7_end: +SIZE(flint_mpn_mulhigh_7, .flint_mpn_mulhigh_7_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_8.asm b/src/mpn_extras/broadwell/mulhigh_8.asm new file mode 100644 index 0000000000..2c2983fe34 --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_8.asm @@ -0,0 +1,195 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_8) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_8) + +FUNC(flint_mpn_mulhigh_8): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 6*8(%rsi), %r8, %rax + mulx 7*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %rbx + mulx 6*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 7*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %rbp + mulx 5*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 6*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 7*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r12 + mulx 4*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 5*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r13 + mulx 3*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 4*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r14 + mulx 2*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r10 + mulx 3*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov 6*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r15 + mulx 1*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r10 + mulx 2*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %r9, %r14 + adox %r9, %r14 + + mov 7*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %r15 + adcx %r8, %rax + adcx %r15, %r10 + mulx 1*8(%rsi), %r8, %r15 + adox %r8, %r10 + adox %r15, %r11 + mulx 2*8(%rsi), %r8, %r15 + adcx %r8, %r11 + adcx %r15, %rbx + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %rbx + adox %r15, %rbp + mulx 4*8(%rsi), %r8, %r15 + adcx %r8, %rbp + adcx %r15, %r12 + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %r12 + adox %r15, %r13 + mulx 6*8(%rsi), %r8, %r15 + adcx %r8, %r13 + adcx %r15, %r14 + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %r9, %r15 + adox %r9, %r15 + + mov %r10, 0*8(%rdi) + mov %r11, 1*8(%rdi) + mov %rbx, 2*8(%rdi) + mov %rbp, 3*8(%rdi) + mov %r12, 4*8(%rdi) + mov %r13, 5*8(%rdi) + mov %r14, 6*8(%rdi) + mov %r15, 7*8(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_8_end: +SIZE(flint_mpn_mulhigh_8, .flint_mpn_mulhigh_8_end) +.cfi_endproc diff --git a/src/mpn_extras/broadwell/mulhigh_9.asm b/src/mpn_extras/broadwell/mulhigh_9.asm new file mode 100644 index 0000000000..ba4438eb0b --- /dev/null +++ b/src/mpn_extras/broadwell/mulhigh_9.asm @@ -0,0 +1,230 @@ +# +# Copyright (C) 2024 Albin Ahlbäck +# +# This file is part of FLINT. +# +# FLINT is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License (LGPL) as published +# by the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. See . +# + +include(`config.m4')dnl +dnl +.text + +.global FUNC(flint_mpn_mulhigh_9) +.p2align 4, 0x90 +TYPE(flint_mpn_mulhigh_9) + +FUNC(flint_mpn_mulhigh_9): + .cfi_startproc + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi + + mov %rdx, %rcx + mov 0*8(%rdx), %rdx + xor %r9d, %r9d + + mulx 7*8(%rsi), %r8, %rax + mulx 8*8(%rsi), %r8, %r10 + adcx %r8, %rax + adcx %r9, %r10 + + mov 1*8(%rcx), %rdx + mulx 6*8(%rsi), %r8, %rbx + mulx 7*8(%rsi), %r8, %r11 + adcx %rbx, %rax + adox %r8, %rax + adcx %r11, %r10 + mulx 8*8(%rsi), %r8, %r11 + adox %r8, %r10 + adcx %r9, %r11 + adox %r9, %r11 + + mov 2*8(%rcx), %rdx + mulx 5*8(%rsi), %r8, %rbp + mulx 6*8(%rsi), %r8, %rbx + adcx %rbp, %rax + adox %r8, %rax + adcx %rbx, %r10 + mulx 7*8(%rsi), %r8, %rbx + adox %r8, %r10 + adcx %rbx, %r11 + mulx 8*8(%rsi), %r8, %rbx + adox %r8, %r11 + adcx %r9, %rbx + adox %r9, %rbx + + mov 3*8(%rcx), %rdx + mulx 4*8(%rsi), %r8, %r12 + mulx 5*8(%rsi), %r8, %rbp + adcx %r12, %rax + adox %r8, %rax + adcx %rbp, %r10 + mulx 6*8(%rsi), %r8, %rbp + adox %r8, %r10 + adcx %rbp, %r11 + mulx 7*8(%rsi), %r8, %rbp + adox %r8, %r11 + adcx %rbp, %rbx + mulx 8*8(%rsi), %r8, %rbp + adox %r8, %rbx + adcx %r9, %rbp + adox %r9, %rbp + + mov 4*8(%rcx), %rdx + mulx 3*8(%rsi), %r8, %r13 + mulx 4*8(%rsi), %r8, %r12 + adcx %r13, %rax + adox %r8, %rax + adcx %r12, %r10 + mulx 5*8(%rsi), %r8, %r12 + adox %r8, %r10 + adcx %r12, %r11 + mulx 6*8(%rsi), %r8, %r12 + adox %r8, %r11 + adcx %r12, %rbx + mulx 7*8(%rsi), %r8, %r12 + adox %r8, %rbx + adcx %r12, %rbp + mulx 8*8(%rsi), %r8, %r12 + adox %r8, %rbp + adcx %r9, %r12 + adox %r9, %r12 + + mov 5*8(%rcx), %rdx + mulx 2*8(%rsi), %r8, %r14 + mulx 3*8(%rsi), %r8, %r13 + adcx %r14, %rax + adox %r8, %rax + adcx %r13, %r10 + mulx 4*8(%rsi), %r8, %r13 + adox %r8, %r10 + adcx %r13, %r11 + mulx 5*8(%rsi), %r8, %r13 + adox %r8, %r11 + adcx %r13, %rbx + mulx 6*8(%rsi), %r8, %r13 + adox %r8, %rbx + adcx %r13, %rbp + mulx 7*8(%rsi), %r8, %r13 + adox %r8, %rbp + adcx %r13, %r12 + mulx 8*8(%rsi), %r8, %r13 + adox %r8, %r12 + adcx %r9, %r13 + adox %r9, %r13 + + mov 6*8(%rcx), %rdx + mulx 1*8(%rsi), %r8, %r15 + mulx 2*8(%rsi), %r8, %r14 + adcx %r15, %rax + adox %r8, %rax + adcx %r14, %r10 + mulx 3*8(%rsi), %r8, %r14 + adox %r8, %r10 + adcx %r14, %r11 + mulx 4*8(%rsi), %r8, %r14 + adox %r8, %r11 + adcx %r14, %rbx + mulx 5*8(%rsi), %r8, %r14 + adox %r8, %rbx + adcx %r14, %rbp + mulx 6*8(%rsi), %r8, %r14 + adox %r8, %rbp + adcx %r14, %r12 + mulx 7*8(%rsi), %r8, %r14 + adox %r8, %r12 + adcx %r14, %r13 + mulx 8*8(%rsi), %r8, %r14 + adox %r8, %r13 + adcx %r9, %r14 + adox %r9, %r14 + + mov 7*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + mulx 1*8(%rsi), %r8, %r15 + adcx %rdi, %rax + adox %r8, %rax + adcx %r15, %r10 + mulx 2*8(%rsi), %r8, %r15 + adox %r8, %r10 + adcx %r15, %r11 + mulx 3*8(%rsi), %r8, %r15 + adox %r8, %r11 + adcx %r15, %rbx + mulx 4*8(%rsi), %r8, %r15 + adox %r8, %rbx + adcx %r15, %rbp + mulx 5*8(%rsi), %r8, %r15 + adox %r8, %rbp + adcx %r15, %r12 + mulx 6*8(%rsi), %r8, %r15 + adox %r8, %r12 + adcx %r15, %r13 + mulx 7*8(%rsi), %r8, %r15 + adox %r8, %r13 + adcx %r15, %r14 + mulx 8*8(%rsi), %r8, %r15 + adox %r8, %r14 + adcx %r9, %r15 + adox %r9, %r15 + + mov 8*8(%rcx), %rdx + mulx 0*8(%rsi), %r8, %rdi + adcx %r8, %rax + adcx %rdi, %r10 + mulx 1*8(%rsi), %r8, %rdi + adox %r8, %r10 + adox %rdi, %r11 + mulx 2*8(%rsi), %r8, %rdi + adcx %r8, %r11 + adcx %rdi, %rbx + mulx 3*8(%rsi), %r8, %rdi + adox %r8, %rbx + adox %rdi, %rbp + mulx 4*8(%rsi), %r8, %rdi + adcx %r8, %rbp + adcx %rdi, %r12 + mulx 5*8(%rsi), %r8, %rdi + adox %r8, %r12 + adox %rdi, %r13 + mulx 6*8(%rsi), %r8, %rdi + adcx %r8, %r13 + adcx %rdi, %r14 + mulx 7*8(%rsi), %r8, %rdi + adox %r8, %r14 + adox %rdi, %r15 + mulx 8*8(%rsi), %r8, %rdi + adcx %r8, %r15 + adcx %r9, %rdi + pop %r8 + adox %r9, %rdi + + mov %r10, 0*8(%r8) + mov %r11, 1*8(%r8) + mov %rbx, 2*8(%r8) + mov %rbp, 3*8(%r8) + mov %r12, 4*8(%r8) + mov %r13, 5*8(%r8) + mov %r14, 6*8(%r8) + mov %r15, 7*8(%r8) + mov %rdi, 8*8(%r8) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +.flint_mpn_mulhigh_9_end: +SIZE(flint_mpn_mulhigh_9, .flint_mpn_mulhigh_9_end) +.cfi_endproc diff --git a/src/mpn_extras/mulhigh.c b/src/mpn_extras/mulhigh.c new file mode 100644 index 0000000000..57365b5cfd --- /dev/null +++ b/src/mpn_extras/mulhigh.c @@ -0,0 +1,45 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "mpn_extras.h" + +#if FLINT_HAVE_ADX +mp_limb_t flint_mpn_mulhigh_1(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_2(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_3(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_4(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_5(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_6(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_7(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_8(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_9(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_10(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_11(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_12(mp_ptr, mp_srcptr, mp_srcptr); + +const flint_mpn_mul_func_t flint_mpn_mulhigh_n_func_tab[] = +{ + flint_mpn_mulhigh_1, + flint_mpn_mulhigh_2, + flint_mpn_mulhigh_3, + flint_mpn_mulhigh_4, + flint_mpn_mulhigh_5, + flint_mpn_mulhigh_6, + flint_mpn_mulhigh_7, + flint_mpn_mulhigh_8, + flint_mpn_mulhigh_9, + flint_mpn_mulhigh_10, + flint_mpn_mulhigh_11, + flint_mpn_mulhigh_12 +}; +#else +typedef int this_file_is_empty; +#endif diff --git a/src/mpn_extras/profile/p-mulhigh_basecase.c b/src/mpn_extras/profile/p-mulhigh_basecase.c new file mode 100644 index 0000000000..e26068f1f7 --- /dev/null +++ b/src/mpn_extras/profile/p-mulhigh_basecase.c @@ -0,0 +1,57 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include "mpn_extras.h" +#include "profiler.h" + +#define mpn_mul_basecase __gmpn_mul_basecase +void mpn_mul_basecase(mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t); +void mpfr_mulhigh_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); + +#define N_MAX FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH + +int main(void) +{ + mp_limb_t rf[N_MAX]; + mp_limb_t rg[2 * N_MAX]; + mp_limb_t rm[2 * N_MAX]; + mp_limb_t xp[N_MAX]; + mp_limb_t yp[N_MAX]; + mp_size_t n; + + flint_printf(" GMP MPFR\n"); + for (n = 1; n <= N_MAX; n++) + { + double t1, t2, t3, __attribute__((unused)) __; + flint_printf("n = %2wd:", n); + + mpn_random2(xp, n); + mpn_random2(yp, n); + + TIMEIT_START + flint_mpn_mulhigh_n(rf, xp, yp, n); + TIMEIT_STOP_VALUES(__, t1) + + TIMEIT_START + mpn_mul_basecase(rg, xp, n, yp, n); + TIMEIT_STOP_VALUES(__, t2) + + TIMEIT_START + mpfr_mulhigh_n(rm, xp, yp, n); + TIMEIT_STOP_VALUES(__, t3) + + flint_printf("%7.2fx %7.2fx\n", t2 / t1, t3 / t1); + } + + flint_cleanup_master(); + + return 0; +} diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index 7cb72d0a28..fb7a0307d4 100644 --- a/src/mpn_extras/test/main.c +++ b/src/mpn_extras/test/main.c @@ -23,6 +23,7 @@ #include "t-mul.c" #include "t-mul_n.c" #include "t-mul_basecase.c" +#include "t-mulhigh_n.c" #include "t-mulmod_2expp1.c" #include "t-mulmod_preinv1.c" #include "t-mulmod_preinvn.c" @@ -43,6 +44,7 @@ test_struct tests[] = TEST_FUNCTION(flint_mpn_mul), TEST_FUNCTION(flint_mpn_mul_n), TEST_FUNCTION(flint_mpn_mul_basecase), + TEST_FUNCTION(flint_mpn_mulhigh_n), TEST_FUNCTION(flint_mpn_mulmod_2expp1), TEST_FUNCTION(flint_mpn_mulmod_preinv1), TEST_FUNCTION(flint_mpn_mulmod_preinvn), diff --git a/src/mpn_extras/test/t-mulhigh_n.c b/src/mpn_extras/test/t-mulhigh_n.c new file mode 100644 index 0000000000..7484079a9a --- /dev/null +++ b/src/mpn_extras/test/t-mulhigh_n.c @@ -0,0 +1,119 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 2.1 of the License, or + (at your option) any later version. See . +*/ + +#include +#include "test_helpers.h" +#include "mpn_extras.h" + +#if FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH != 0 + +# define TEST_MPFR 0 +# define N_MAX FLINT_MPN_MULHIGH_N_FUNC_TAB_WIDTH + +void mpfr_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n); + +TEST_FUNCTION_START(flint_mpn_mulhigh_n, state) +{ + slong ix; + int result; + + for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++) + { + mp_limb_t rp[N_MAX + 1] = {UWORD(0)}; + mp_limb_t rp_upperbound[2 * N_MAX] = {UWORD(0)}; + mp_limb_t rp_lowerbound[2 * N_MAX] = {UWORD(0)}; +#if TEST_MPFR + mp_limb_t rp_mpfr[2 * N_MAX] = {UWORD(0)}; +#endif + mp_limb_t borrow; + mp_limb_t xp[N_MAX]; + mp_limb_t yp[N_MAX]; + mp_size_t n; + mp_limb_t lb; + + n = 1 + n_randint(state, N_MAX); + + mpn_random2(xp, n); + mpn_random2(yp, n); + + rp[0] = flint_mpn_mulhigh_n(rp + 1, xp, yp, n); + + /* Check upper bound */ + flint_mpn_mul_n(rp_upperbound, xp, yp, n); + + result = (mpn_cmp(rp, rp_upperbound + n - 1, n + 1) <= 0); + if (!result) + TEST_FUNCTION_FAIL( + "rp > rp_upperbound\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "rp = %{ulong*}\n" + "rp_upperbound = %{ulong*}\n", + ix, n, xp, n, yp, n, rp, n + 1, rp_upperbound + n - 1, n + 1); + + /* Check lower bound */ + memcpy(rp_lowerbound, rp_upperbound, 2 * n * sizeof(mp_limb_t)); + + if (n <= 2) + lb = 0; /* Multiplication should be "exact" */ + else if (n == 3) + lb = 2; + else if (n == 4) + lb = 4; + else if (n < 6) + lb = n + 1; + else + lb = n + 2; + + borrow = mpn_sub_1(rp_lowerbound + n - 1, rp_lowerbound + n - 1, n + 1, lb); + if (borrow) + mpn_zero(rp_lowerbound, 2 * n); + + result = (mpn_cmp(rp, rp_lowerbound + n - 1, n + 1) >= 0); + if (!result) + TEST_FUNCTION_FAIL( + "rp < rp_lowerbound\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "rp = %{ulong*}\n" + "rp_lowerbound = %{ulong*}\n", + ix, n, xp, n, yp, n, rp, n + 1, rp_lowerbound + n - 1, n + 1); + +#if TEST_MPFR + /* Check against MPFR */ + mpfr_mulhigh_n(rp_mpfr, xp, yp, n); + + result = (mpn_cmp(rp + n - 1, rp_mpfr + n - 1, n + 1) == 0); + if (!result) + TEST_FUNCTION_FAIL( + "flint_mpn_mulhigh_n does not agree with mpfr_mulhigh_n\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "rp = %{ulong*}\n" + "rp_mpfr = %{ulong*}\n", + ix, n, xp, n, yp, n, rp + n - 1, n + 1, rp_mpfr + n - 1, n + 1); +#endif + } + + TEST_FUNCTION_END(state); +} +#else +TEST_FUNCTION_START(flint_mpn_mulhigh_n, state) +{ + TEST_FUNCTION_END_SKIPPED(state); +} +#endif