From 1209dc21c6dfd7adaa5e504761352655db4b6d60 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Sat, 11 Jan 2025 00:10:22 +0100 Subject: [PATCH] tweaks; restore old tuning tab for now --- src/mpn_extras.h | 4 +++ src/mpn_extras/mulhigh.c | 55 +++++++++++++++--------------- src/mpn_extras/mulhigh_basecase.c | 31 +++++++++++++++++ src/mpn_extras/mulhigh_recursive.c | 13 +++---- 4 files changed, 68 insertions(+), 35 deletions(-) diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 709f0a3ef0..0a373f5ba3 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -619,6 +619,8 @@ FLINT_DLL extern const flint_mpn_sqrhigh_normalised_func_t flint_mpn_sqrhigh_nor #if FLINT_HAVE_ASSEMBLY_x86_64_adx # define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 8 # define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 13 +/* n with best effective cycles/limb (and current largest assembly case) -- used by mulhigh_recursive */ +# define FLINT_MPN_MULHIGH_BEST_TAB_N 9 # define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8 # define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 9 # define FLINT_MPN_SQRHIGH_NORMALISED_FUNC_TAB_WIDTH 8 @@ -632,6 +634,7 @@ FLINT_DLL extern const flint_mpn_sqrhigh_normalised_func_t flint_mpn_sqrhigh_nor #elif FLINT_HAVE_ASSEMBLY_armv8 # define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 0 # define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 8 +# define FLINT_MPN_MULHIGH_BEST_TAB_N 8 # define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8 # define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 0 # define FLINT_MPN_SQRHIGH_NORMALISED_FUNC_TAB_WIDTH 0 @@ -643,6 +646,7 @@ FLINT_DLL extern const flint_mpn_sqrhigh_normalised_func_t flint_mpn_sqrhigh_nor /* TODO: generic hardcoded mullows */ # define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 0 # define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 16 +# define FLINT_MPN_MULHIGH_BEST_TAB_N 16 # define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 2 # define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 0 # define FLINT_MPN_SQRHIGH_NORMALISED_FUNC_TAB_WIDTH 0 diff --git a/src/mpn_extras/mulhigh.c b/src/mpn_extras/mulhigh.c index 828831da15..11dda002ac 100644 --- a/src/mpn_extras/mulhigh.c +++ b/src/mpn_extras/mulhigh.c @@ -19,34 +19,35 @@ #include "mpn_extras.h" /* Generated by tune-mulhigh.c */ + const signed short flint_mpn_mulhigh_k_tab[FLINT_MPN_MULHIGH_K_TAB_SIZE] = -{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 12, 13, 13, 14, 16, 16, 17, 18, 19, -18, 20, 20, 20, 24, 20, 24, 20, 21, 22, 22, 24, 24, 26, 26, 24, 25, 30, 30, 31, 32, 30, 32, 32, 36, 35, 36, 32, 40, 38, -39, 40, 40, 37, 39, 40, 40, 40, 40, 40, 48, 48, 44, 48, 52, 52, 40, 40, 44, 52, 44, 44, 44, 52, 64, 44, 48, 48, 48, 64, -64, 52, 56, 64, 52, 60, 56, 56, 56, 56, 64, 64, 64, 60, 64, 64, 68, 64, 60, 64, 64, 64, 64, 64, 72, 68, 64, 76, 76, 68, -68, 72, 80, 72, 72, 76, 72, 72, 80, 72, 72, 80, 72, 72, 80, 80, 80, 88, 76, 80, 80, 76, 80, 84, 80, 80, 88, 88, 96, 96, -96, 92, 84, 96, 96, 92, 88, 96, 96, 100, 96, 92, 96, 96, 100, 96, 96, 120, 112, 120, 104, 112, 116, 120, 120, 112, 112, 124, 112, 120, -112, 120, 120, 116, 120, 124, 120, 128, 120, 116, 116, 124, 120, 120, 128, 120, 124, 120, 128, 140, 140, 128, 128, 128, 140, 144, 140, 140, 144, 144, -140, 128, 140, 140, 144, 156, 156, 140, 160, 140, 156, 156, 148, 144, 160, 160, 156, 144, 160, 156, 156, 156, 156, 156, 160, 144, 144, 156, 160, 152, -140, 148, 144, 160, 160, 156, 156, 152, 156, 160, 160, 156, 156, 160, 160, 160, 156, 156, 156, 152, 156, 156, 160, 156, 160, 160, 156, 160, 160, 160, -156, 168, 160, 156, 160, 172, 172, 160, 188, 188, 172, 188, 160, 192, 188, 192, 172, 192, 188, 188, 188, 188, 188, 188, 172, 188, 188, 204, 152, 152, -188, 152, 164, 156, 156, 160, 168, 172, 180, 184, 176, 172, 180, 208, 196, 200, 204, 204, 172, 172, 172, 168, 176, 184, 184, 168, 176, 204, 180, 196, -188, 192, 204, 176, 212, 180, 176, 176, 184, 180, 180, 184, 188, 188, 212, 192, 192, 232, 220, 232, 208, 228, 184, 180, 200, 192, 204, 208, 188, 212, -204, 208, 212, 220, 268, 260, 224, 228, 264, 264, 196, 196, 196, 208, 204, 212, 216, 204, 204, 252, 232, 256, 264, 284, 272, 264, 252, 276, 204, 220, -220, 224, 236, 248, 236, 244, 248, 252, 252, 268, 272, 264, 252, 208, 220, 224, 220, 228, 256, 228, 248, 252, 264, 260, 260, 300, 272, 280, 268, 292, -328, 228, 248, 260, 256, 260, 264, 260, 276, 276, 276, 296, 292, 288, 324, 300, 244, 260, 304, 264, 268, 272, 272, 284, 268, 296, 300, 292, 300, 324, -316, 272, 268, 264, 276, 288, 288, 280, 288, 324, 268, 324, 316, 284, 304, 300, 268, 300, 280, 304, 300, 292, 324, 296, 320, 304, 268, 328, 324, 284, -296, 304, 296, 296, 296, 328, 340, 372, 364, 432, 444, 436, 436, 448, 316, 456, 464, 436, 440, 480, 372, 448, 452, 468, 472, 472, 480, 376, 472, 412, -460, 436, 456, 456, 448, 464, 408, 472, 464, 464, 472, 464, 472, 472, 456, 464, 464, 464, 464, 456, 464, 480, 416, 472, 472, 480, 480, 432, 464, 440, -456, 464, 464, 480, 448, 448, 448, 472, 464, 464, 456, 480, 456, 448, 456, 472, 448, 472, 456, 472, 464, 448, 472, 480, 480, 472, 448, 472, 480, 464, -480, 472, 464, 472, 464, 456, 480, 480, 456, 480, 456, 480, 472, 456, 472, 464, 464, 464, 472, 456, 480, 480, 472, 472, 472, 472, 456, 480, 480, 480, -480, 480, 480, 464, 464, 472, 480, 544, 536, 472, 472, 480, 480, 536, 472, 472, 480, 560, 552, 480, 568, 560, 552, 568, 568, 576, 568, 568, 576, 528, -568, 576, 568, 480, 480, 576, 552, 544, 560, 552, 552, 560, 560, 576, 568, 576, 576, 568, 576, 560, 568, 576, 552, 552, 552, 576, 560, 544, 560, 560, -560, 568, 576, 568, 560, 568, 568, 568, 560, 576, 576, 544, 576, 560, 560, 576, 568, 576, 568, 568, 568, 576, 560, 568, 568, 576, 568, 576, 544, 552, -576, 576, 552, 552, 560, 576, 560, 568, 576, 568, 560, 544, 576, 576, 576, 576, 576, 560, 560, 576, 568, 568, 568, 576, 568, 568, 568, 576, 576, 576, -560, 576, 568, 576, 560, 560, 568, 576, 568, 576, 576, 568, 576, 560, 576, 576, 576, 560, 576, 568, 568, 576, 568, 576, 576, 576, 568, 576, 576, 576, -576, 568, 576, 560, 568, 576, 576, 568, 576, 576, 576, 568, 576, 568, 568, 568, 568, 576, 568, 576, 576, 568, 576, 560, 568, 560, 576, 576, 568, 560, -568, 576, 552, 776, 576, 560, 560, 576, 568, 576, 576, 576, 776, 576, 560, 576, 784, 568, 576, 776, 568, 792, 776, 576, 776, 776, 776, 776, 776, 784, +{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 14, 14, 16, 15, 15, 18, 18, +18, 19, 20, 18, 22, 22, 20, 20, 26, 22, 22, 22, 24, 24, 24, 26, 25, 26, 30, 30, 28, 30, 31, 32, 32, 30, 36, 36, 36, 36, +38, 39, 39, 38, 39, 40, 40, 40, 44, 40, 44, 44, 40, 44, 44, 48, 44, 48, 44, 48, 48, 52, 52, 52, 44, 52, 52, 52, 52, 56, +60, 60, 52, 60, 60, 52, 52, 60, 64, 72, 56, 60, 72, 60, 60, 60, 76, 64, 60, 60, 72, 60, 72, 80, 72, 72, 80, 72, 68, 76, +88, 76, 68, 76, 72, 72, 80, 88, 72, 72, 88, 72, 80, 76, 76, 80, 80, 88, 80, 88, 84, 88, 80, 96, 80, 80, 88, 80, 88, 88, +80, 88, 96, 96, 88, 96, 92, 96, 96, 92, 100, 88, 96, 104, 88, 108, 96, 104, 104, 104, 112, 112, 108, 104, 104, 112, 112, 120, 104, 112, +120, 112, 112, 120, 124, 124, 116, 124, 108, 120, 124, 116, 120, 120, 116, 120, 124, 120, 120, 140, 120, 120, 120, 120, 144, 120, 132, 144, 136, 140, +144, 144, 144, 144, 144, 144, 144, 144, 140, 156, 140, 140, 144, 144, 144, 160, 144, 144, 156, 156, 144, 160, 160, 160, 160, 152, 160, 156, 156, 156, +160, 160, 144, 160, 164, 156, 156, 156, 172, 156, 156, 160, 176, 160, 160, 164, 176, 156, 160, 160, 156, 156, 160, 160, 156, 160, 172, 160, 188, 172, +172, 172, 160, 172, 176, 160, 160, 176, 180, 176, 164, 188, 192, 176, 172, 188, 188, 188, 172, 188, 192, 188, 180, 192, 192, 188, 188, 192, 188, 188, +188, 188, 192, 160, 156, 204, 160, 164, 164, 164, 164, 176, 180, 168, 172, 184, 188, 200, 216, 188, 164, 188, 220, 188, 208, 176, 180, 188, 172, 188, +184, 188, 204, 208, 220, 196, 220, 196, 208, 212, 188, 220, 176, 176, 184, 192, 208, 184, 188, 196, 204, 244, 208, 212, 212, 228, 256, 188, 204, 196, +188, 192, 192, 192, 212, 188, 292, 212, 220, 236, 228, 248, 260, 224, 264, 196, 200, 196, 212, 208, 204, 216, 208, 228, 216, 220, 252, 220, 268, 264, +284, 268, 300, 220, 208, 212, 220, 236, 244, 224, 252, 252, 260, 264, 256, 256, 292, 272, 288, 292, 328, 224, 256, 236, 252, 268, 256, 252, 260, 272, +284, 296, 300, 280, 300, 284, 252, 236, 328, 324, 264, 264, 256, 264, 280, 268, 284, 284, 292, 304, 260, 304, 264, 256, 328, 328, 260, 276, 328, 284, +276, 296, 300, 320, 320, 304, 328, 304, 272, 268, 280, 268, 288, 292, 288, 284, 316, 288, 328, 328, 300, 328, 328, 280, 264, 328, 300, 328, 316, 324, +300, 324, 300, 324, 316, 316, 328, 348, 276, 376, 288, 296, 296, 304, 320, 316, 328, 328, 324, 328, 340, 384, 348, 376, 300, 396, 304, 300, 304, 324, +300, 324, 328, 328, 328, 440, 448, 384, 376, 456, 464, 384, 376, 472, 480, 376, 352, 328, 376, 352, 376, 392, 392, 384, 456, 456, 480, 448, 456, 456, +472, 472, 472, 352, 464, 472, 472, 472, 480, 440, 480, 480, 480, 480, 456, 472, 472, 464, 464, 464, 456, 472, 480, 472, 480, 480, 480, 480, 448, 456, +480, 448, 456, 464, 456, 464, 456, 480, 472, 464, 464, 472, 472, 472, 480, 472, 480, 480, 472, 480, 480, 480, 480, 464, 464, 464, 456, 472, 464, 480, +472, 472, 480, 472, 480, 480, 464, 464, 472, 464, 472, 472, 480, 464, 480, 472, 480, 480, 576, 576, 560, 480, 472, 480, 568, 480, 480, 464, 480, 472, +480, 576, 480, 552, 560, 560, 560, 560, 568, 560, 560, 576, 576, 560, 568, 472, 480, 480, 544, 568, 552, 544, 560, 544, 560, 568, 552, 576, 568, 560, +576, 576, 568, 576, 560, 576, 568, 536, 576, 568, 560, 544, 560, 552, 560, 568, 560, 576, 568, 560, 560, 560, 568, 576, 568, 576, 576, 576, 576, 544, +576, 576, 568, 576, 560, 576, 576, 576, 544, 552, 568, 576, 552, 560, 576, 560, 568, 560, 576, 560, 544, 576, 576, 576, 576, 568, 576, 568, 560, 576, +552, 552, 576, 560, 568, 568, 568, 576, 576, 576, 560, 552, 576, 560, 568, 560, 576, 560, 568, 560, 568, 568, 568, 576, 552, 576, 560, 576, 576, 560, +568, 576, 568, 576, 576, 576, 576, 560, 576, 568, 568, 568, 560, 560, 576, 576, 568, 568, 576, 560, 576, 576, 568, 576, 560, 576, 576, 568, 576, 568, +576, 568, 576, 576, 568, 576, 576, 576, 576, 568, 576, 576, 568, 568, 576, 576, 784, 576, 776, 576, 568, 576, 576, 576, 576, 576, 576, 576, 776, 776, 776, 776, 776, 776, 776, 784, 776, 776, 784, 776, 776, 776, 800, 776, 776, 776, 776, 776, 776, 800, 776, 808, 792, 800, 776, 792, 776, 776, 776, 776, 792, 776, 776, 784, 792, 784, 800, 776, 784, 808, 784, 776, 776, 776, 808, 784, 792, 776, 792, 832, 800, 800, 816, 792, 816, 816, 856, 808, 848, 824, 870, 832, 792, 776, 784, 784, 784, 784, 800, 792, 800, 792, 784, 800, 800, 800, 816, 824, 824, 824, 832, 816, 816, 832, 824, 824, 848, 832, 856, 856, diff --git a/src/mpn_extras/mulhigh_basecase.c b/src/mpn_extras/mulhigh_basecase.c index 89b2bc9881..9ae58f3c86 100644 --- a/src/mpn_extras/mulhigh_basecase.c +++ b/src/mpn_extras/mulhigh_basecase.c @@ -42,7 +42,18 @@ mp_limb_t flint_mpn_mulhigh_11(nn_ptr r, nn_srcptr x, nn_srcptr y) { mp_limb_t w0, w1, lo, w2, cy; +#if 0 w0 = flint_mpn_mulhigh_10(r, x + 1, y); +#else + w0 = flint_mpn_mulhigh_9(r, x + 2, y); + r[9] = mpn_addmul_1(r, x + 2, 9, y[9]); + umul_ppmm(w1, lo, x[1], y[8]); + umul_ppmm(cy, w2, x[1], y[9]); + add_ssaaaa(cy, w0, cy, w0, 0, w1); + add_ssaaaa(cy, w0, cy, w0, 0, w2); + MPN_INCR_U(r, 10, cy); +#endif + r[10] = mpn_addmul_1(r, x + 1, 10, y[10]); umul_ppmm(w1, lo, x[0], y[9]); umul_ppmm(cy, w2, x[0], y[10]); @@ -51,13 +62,33 @@ mp_limb_t flint_mpn_mulhigh_11(nn_ptr r, nn_srcptr x, nn_srcptr y) MPN_INCR_U(r, 11, cy); return w0; + + return w0; } mp_limb_t flint_mpn_mulhigh_12(nn_ptr r, nn_srcptr x, nn_srcptr y) { mp_limb_t w0, w1, lo, w2, cy; +#if 0 w0 = flint_mpn_mulhigh_11(r, x + 1, y); +#else + w0 = flint_mpn_mulhigh_9(r, x + 3, y); + r[9] = mpn_addmul_1(r, x + 3, 9, y[9]); + umul_ppmm(w1, lo, x[2], y[8]); + umul_ppmm(cy, w2, x[2], y[9]); + add_ssaaaa(cy, w0, cy, w0, 0, w1); + add_ssaaaa(cy, w0, cy, w0, 0, w2); + MPN_INCR_U(r, 10, cy); + + r[10] = mpn_addmul_1(r, x + 2, 10, y[10]); + umul_ppmm(w1, lo, x[1], y[9]); + umul_ppmm(cy, w2, x[1], y[10]); + add_ssaaaa(cy, w0, cy, w0, 0, w1); + add_ssaaaa(cy, w0, cy, w0, 0, w2); + MPN_INCR_U(r, 11, cy); +#endif + r[11] = mpn_addmul_1(r, x + 1, 11, y[11]); umul_ppmm(w1, lo, x[0], y[10]); umul_ppmm(cy, w2, x[0], y[11]); diff --git a/src/mpn_extras/mulhigh_recursive.c b/src/mpn_extras/mulhigh_recursive.c index 2cc618040b..5d31865ffa 100644 --- a/src/mpn_extras/mulhigh_recursive.c +++ b/src/mpn_extras/mulhigh_recursive.c @@ -12,9 +12,6 @@ #include "mpn_extras.h" -/* Tuned for x86-64 */ -#define BEST_BASECASE_N 9 - mp_limb_t _flint_mpn_mulhigh_n_recursive(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n) { @@ -22,15 +19,15 @@ _flint_mpn_mulhigh_n_recursive(mp_ptr r, mp_srcptr x, mp_srcptr y, mp_size_t n) { return flint_mpn_mulhigh_func_tab[n](r, x, y); } - else if (n <= 2 * BEST_BASECASE_N) + else if (n <= 2 * FLINT_MPN_MULHIGH_BEST_TAB_N) { - mp_limb_t t[2 * BEST_BASECASE_N]; + mp_limb_t t[2 * FLINT_MPN_MULHIGH_BEST_TAB_N]; - mp_size_t m1 = n - BEST_BASECASE_N; - mp_size_t m2 = BEST_BASECASE_N; + mp_size_t m1 = n - FLINT_MPN_MULHIGH_BEST_TAB_N; + mp_size_t m2 = FLINT_MPN_MULHIGH_BEST_TAB_N; mp_limb_t cy, lo, w0, w1, w2; - FLINT_ASSERT(BEST_BASECASE_N <= FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH); + FLINT_ASSERT(FLINT_MPN_MULHIGH_BEST_TAB_N <= FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH); flint_mpn_mul(r, x + m1, m2, y + m2, m1); w0 = flint_mpn_mulhigh_n(t, x + m1, y, m2);