diff --git a/src/mpn_extras/sqrhigh.c b/src/mpn_extras/sqrhigh.c index d1578e3c5d..ece3476eae 100644 --- a/src/mpn_extras/sqrhigh.c +++ b/src/mpn_extras/sqrhigh.c @@ -15,6 +15,7 @@ (at your option) any later version. See . */ +#include #include "mpn_extras.h" /* Generated by tune-sqrhigh.c */ @@ -128,25 +129,35 @@ _flint_mpn_sqrhigh_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_size_t n) /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */ cy = mpn_lshift(rp + l - 1, rp + l - 1, l + 1, 1); cy += mpn_add_n(rp + n - 1, rp + n - 1, rp + l - 1, l + 1); - mpn_add_1(rp + n + l, rp + n + l, k, cy); /* propagate carry */ + MPN_INCR_U(rp + n + l, k, cy); /* propagate carry */ + + /* Corrections for precise high product (not needed for the + original Mulders). */ + { + mp_limb_t hi, lo; + + /* Note: if we relax the condition on k, we will need + a branch for k == l here to avoid double counting. */ + FLINT_ASSERT(k != l); + + umul_ppmm(hi, lo, np[k - 1], np[l - 1]); + MPN_INCR_U(rp + n - 1, n + 1, hi); + MPN_INCR_U(rp + n - 1, n + 1, hi); + } } mp_limb_t _flint_mpn_sqrhigh_mulders(mp_ptr res, mp_srcptr u, mp_size_t n) { - mp_ptr tmp, tr, tu; + mp_ptr tmp; mp_limb_t bot; TMP_INIT; TMP_START; - tmp = TMP_ALLOC(sizeof(mp_limb_t) * (3 * (n + 1))); - tu = tmp; - tr = tu + (n + 1); - tu[0] = 0; - flint_mpn_copyi(tu + 1, u, n); - _flint_mpn_sqrhigh_mulders_recursive(tr, tu, n + 1); - flint_mpn_copyi(res, tr + n + 2, n); - bot = tr[n + 1]; + tmp = TMP_ALLOC(sizeof(mp_limb_t) * (2 * n)); + _flint_mpn_sqrhigh_mulders_recursive(tmp, u, n); + memcpy(res, tmp + n, sizeof(mp_limb_t) * n); + bot = tmp[n - 1]; TMP_END; return bot; @@ -159,7 +170,7 @@ _flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n) mp_limb_t bot; tmp = flint_malloc(sizeof(mp_limb_t) * (2 * n)); flint_mpn_sqr(tmp, u, n); - flint_mpn_copyi(res, tmp + n, n); + memcpy(res, tmp + n, sizeof(mp_limb_t) * n); bot = tmp[n - 1]; flint_free(tmp); return bot;