diff --git a/src/mpn_extras/sqrhigh.c b/src/mpn_extras/sqrhigh.c
index d1578e3c5d..ece3476eae 100644
--- a/src/mpn_extras/sqrhigh.c
+++ b/src/mpn_extras/sqrhigh.c
@@ -15,6 +15,7 @@
(at your option) any later version. See .
*/
+#include
#include "mpn_extras.h"
/* Generated by tune-sqrhigh.c */
@@ -128,25 +129,35 @@ _flint_mpn_sqrhigh_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_size_t n)
/* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
cy = mpn_lshift(rp + l - 1, rp + l - 1, l + 1, 1);
cy += mpn_add_n(rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
- mpn_add_1(rp + n + l, rp + n + l, k, cy); /* propagate carry */
+ MPN_INCR_U(rp + n + l, k, cy); /* propagate carry */
+
+ /* Corrections for precise high product (not needed for the
+ original Mulders). */
+ {
+ mp_limb_t hi, lo;
+
+ /* Note: if we relax the condition on k, we will need
+ a branch for k == l here to avoid double counting. */
+ FLINT_ASSERT(k != l);
+
+ umul_ppmm(hi, lo, np[k - 1], np[l - 1]);
+ MPN_INCR_U(rp + n - 1, n + 1, hi);
+ MPN_INCR_U(rp + n - 1, n + 1, hi);
+ }
}
mp_limb_t
_flint_mpn_sqrhigh_mulders(mp_ptr res, mp_srcptr u, mp_size_t n)
{
- mp_ptr tmp, tr, tu;
+ mp_ptr tmp;
mp_limb_t bot;
TMP_INIT;
TMP_START;
- tmp = TMP_ALLOC(sizeof(mp_limb_t) * (3 * (n + 1)));
- tu = tmp;
- tr = tu + (n + 1);
- tu[0] = 0;
- flint_mpn_copyi(tu + 1, u, n);
- _flint_mpn_sqrhigh_mulders_recursive(tr, tu, n + 1);
- flint_mpn_copyi(res, tr + n + 2, n);
- bot = tr[n + 1];
+ tmp = TMP_ALLOC(sizeof(mp_limb_t) * (2 * n));
+ _flint_mpn_sqrhigh_mulders_recursive(tmp, u, n);
+ memcpy(res, tmp + n, sizeof(mp_limb_t) * n);
+ bot = tmp[n - 1];
TMP_END;
return bot;
@@ -159,7 +170,7 @@ _flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n)
mp_limb_t bot;
tmp = flint_malloc(sizeof(mp_limb_t) * (2 * n));
flint_mpn_sqr(tmp, u, n);
- flint_mpn_copyi(res, tmp + n, n);
+ memcpy(res, tmp + n, sizeof(mp_limb_t) * n);
bot = tmp[n - 1];
flint_free(tmp);
return bot;