From 96bc07288ab2948817054a4f762e40ca409d0e86 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Tue, 28 May 2024 18:41:03 +0200 Subject: [PATCH 1/2] initial nfloat_complex dot product; refactor test code --- src/gr.h | 15 + src/gr/test_ring.c | 505 ++++++++++++++++++++++++++ src/nfloat.h | 3 + src/nfloat/complex.c | 2 +- src/nfloat/dot.c | 450 +++++++++++++++-------- src/nfloat/test/t-nfloat.c | 562 +---------------------------- src/nfloat/test/t-nfloat_complex.c | 44 ++- 7 files changed, 875 insertions(+), 706 deletions(-) diff --git a/src/gr.h b/src/gr.h index 274d2b6117..728366c7ac 100644 --- a/src/gr.h +++ b/src/gr.h @@ -1475,6 +1475,21 @@ void gr_test_ring(gr_ctx_t R, slong iters, int test_flags); void gr_test_multiplicative_group(gr_ctx_t R, slong iters, int test_flags); void gr_test_floating_point(gr_ctx_t R, slong iters, int test_flags); +int gr_test_cmp_fun(gr_ctx_t R, gr_method_binary_op_get_int op, gr_ctx_t R_ref, flint_rand_t state, int test_flags); +int gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags); +int gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags); +int gr_test_approx_binary_op_type_variants(gr_ctx_t R, const char * opname, + gr_method_binary_op gr_op, + gr_method_binary_op_ui gr_op_ui, + gr_method_binary_op_si gr_op_si, + gr_method_binary_op_fmpz gr_op_fmpz, + gr_method_binary_op_fmpq gr_op_fmpq, + int fused, + int small_test_values, + gr_srcptr rel_tol, flint_rand_t state, int test_flags); +int gr_test_approx_dot(gr_ctx_t R, gr_ctx_t R_ref, slong maxlen, gr_srcptr rel_tol, flint_rand_t state, int test_flags); + + #ifdef __cplusplus } #endif diff --git a/src/gr/test_ring.c b/src/gr/test_ring.c index 7e7f5a4442..78a5c939d3 100644 --- a/src/gr/test_ring.c +++ b/src/gr/test_ring.c @@ -3642,6 +3642,511 @@ gr_test_field(gr_ctx_t R, flint_rand_t state, int test_flags) return status; } +int +gr_test_cmp_fun(gr_ctx_t R, gr_method_binary_op_get_int op, gr_ctx_t R_ref, flint_rand_t state, int test_flags) +{ + int status = GR_SUCCESS; + int cmp1, cmp2; + gr_ptr a, b, a_ref, b_ref; + + GR_TMP_INIT2(a, b, R); + GR_TMP_INIT2(a_ref, b_ref, R_ref); + + status |= gr_randtest(a, state, R); + status |= gr_randtest(b, state, R); + + status |= gr_set_other(a_ref, a, R, R_ref); + status |= gr_set_other(b_ref, b, R, R_ref); + + status |= op(&cmp1, a, b, R); + status |= op(&cmp2, a_ref, b_ref, R_ref); + + if (status == GR_SUCCESS && cmp1 != cmp2) + status = GR_TEST_FAIL; + + if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) + { + flint_printf("\n"); + gr_ctx_println(R); + gr_ctx_println(R_ref); + flint_printf("a = "); gr_println(a, R); + flint_printf("b = "); gr_println(b, R); + flint_printf("cmp1 = %d\n", cmp1); + flint_printf("cmp2 = %d\n", cmp2); + flint_printf("\n"); + } + + if (status == GR_TEST_FAIL) + flint_abort(); + + GR_TMP_CLEAR2(a, b, R); + GR_TMP_CLEAR2(a_ref, b_ref, R_ref); + + return status; +} + +int +gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags) +{ + int status = GR_SUCCESS; + int alias; + int cmp; + gr_ptr a, b, a_ref, b_ref, rel_err; + + GR_TMP_INIT2(a, b, R); + GR_TMP_INIT3(a_ref, b_ref, rel_err, R_ref); + + alias = n_randint(state, 2); + + status |= gr_randtest(a, state, R); + status |= gr_randtest(b, state, R); + + status |= gr_set_other(a_ref, a, R, R_ref); + + if (status == GR_SUCCESS) + { + if (alias == 0) + { + status |= op(b, a, R); + status |= op(b_ref, a_ref, R_ref); + } + else + { + status |= gr_set(b, a, R); + status |= op(b, b, R); + status |= op(b_ref, a_ref, R_ref); + } + + if (status == GR_SUCCESS) + { + status |= gr_set_other(rel_err, b, R, R_ref); + status |= gr_sub(rel_err, b_ref, rel_err, R_ref); + status |= gr_div(rel_err, rel_err, b_ref, R_ref); + status |= gr_abs(rel_err, rel_err, R_ref); + status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref); + + if (status == GR_SUCCESS && cmp > 0) + status = GR_TEST_FAIL; + + if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) + { + flint_printf("\n"); + gr_ctx_println(R); + gr_ctx_println(R_ref); + flint_printf("alias: %d\n", alias); + flint_printf("Computed:\n"); + flint_printf("a = "); gr_println(a, R); + flint_printf("op(a) = "); gr_println(b, R); + flint_printf("Reference:\n"); + flint_printf("a = "); gr_println(a_ref, R_ref); + flint_printf("op(a) = "); gr_println(b_ref, R_ref); + flint_printf("\nrel_err = "); gr_println(rel_err, R_ref); + flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref); + flint_printf("\n"); + } + } + } + + if (status == GR_TEST_FAIL) + flint_abort(); + + GR_TMP_CLEAR2(a, b, R); + GR_TMP_CLEAR3(a_ref, b_ref, rel_err, R_ref); + + return status; +} + +int +gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags) +{ + int status = GR_SUCCESS; + int alias; + int cmp; + gr_ptr a, b, c, a_ref, b_ref, c_ref, rel_err; + + GR_TMP_INIT3(a, b, c, R); + GR_TMP_INIT4(a_ref, b_ref, c_ref, rel_err, R_ref); + + alias = n_randint(state, 5); + + status |= gr_randtest(a, state, R); + status |= gr_randtest(b, state, R); + status |= gr_randtest(c, state, R); + + status |= gr_set_other(a_ref, a, R, R_ref); + status |= gr_set_other(b_ref, b, R, R_ref); + + if (status == GR_SUCCESS) + { + if (alias == 0) + { + status |= op(c, a, b, R); + status |= op(c_ref, a_ref, b_ref, R_ref); + } + else if (alias == 1) + { + status |= gr_set(c, a, R); + status |= op(c, c, b, R); + status |= op(c_ref, a_ref, b_ref, R_ref); + } + else if (alias == 2) + { + status |= gr_set(c, b, R); + status |= op(c, a, c, R); + status |= op(c_ref, a_ref, b_ref, R_ref); + } + else if (alias == 3) + { + status |= gr_set(b, a, R); + status |= gr_set(b_ref, a_ref, R_ref); + status |= op(c, a, a, R); + status |= op(c_ref, a_ref, a_ref, R_ref); + } + else + { + status |= gr_set(b, a, R); + status |= gr_set(c, a, R); + status |= gr_set(b_ref, a_ref, R_ref); + status |= op(c, c, c, R); + status |= op(c_ref, a_ref, a_ref, R_ref); + } + + if (status == GR_SUCCESS) + { + status |= gr_set_other(rel_err, c, R, R_ref); + + status |= gr_sub(rel_err, c_ref, rel_err, R_ref); + status |= gr_div(rel_err, rel_err, c_ref, R_ref); + status |= gr_abs(rel_err, rel_err, R_ref); + status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref); + + if (status == GR_SUCCESS && cmp > 0) + status = GR_TEST_FAIL; + + if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) + { + flint_printf("\n"); + gr_ctx_println(R); + gr_ctx_println(R_ref); + flint_printf("alias: %d\n", alias); + flint_printf("Computed:\n"); + flint_printf("a = "); gr_println(a, R); + flint_printf("b = "); gr_println(b, R); + flint_printf("a (op) b = "); gr_println(c, R); + flint_printf("Reference:\n"); + flint_printf("a = "); gr_println(a_ref, R_ref); + flint_printf("b = "); gr_println(b_ref, R_ref); + flint_printf("a (op) b = "); gr_println(c_ref, R_ref); + flint_printf("\nrel_err = "); gr_println(rel_err, R_ref); + flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref); + flint_printf("\n"); + } + } + } + + if (status == GR_TEST_FAIL) + flint_abort(); + + GR_TMP_CLEAR3(a, b, c, R); + GR_TMP_CLEAR4(a_ref, b_ref, c_ref, rel_err, R_ref); + + return status; +} + +int +gr_test_approx_binary_op_type_variants(gr_ctx_t R, + const char * opname, + gr_method_binary_op gr_op, + gr_method_binary_op_ui gr_op_ui, + gr_method_binary_op_si gr_op_si, + gr_method_binary_op_fmpz gr_op_fmpz, + gr_method_binary_op_fmpq gr_op_fmpq, + int fused, + int small_test_values, + gr_srcptr rel_tol, flint_rand_t state, int test_flags) +{ + int status, alias, which; + gr_ptr x, y, xy1, xy2, err; + ulong uy; + slong sy; + fmpz_t zy; + fmpq_t qy; + + GR_TMP_INIT5(x, y, xy1, xy2, err, R); + fmpz_init(zy); + fmpq_init(qy); + + which = 0; + + if (small_test_values) + { + uy = n_randint(state, 6); + sy = -5 + (slong) n_randint(state, 11); + fmpz_randtest(zy, state, 3); + fmpq_randtest(qy, state, 3); + } + else + { + uy = n_randtest(state); + sy = (slong) n_randtest(state); + + if (n_randint(state, 10) == 0) + { + fmpz_randtest(zy, state, 10000); + fmpq_randtest(qy, state, 200); + } + else + { + fmpz_randtest(zy, state, 100); + fmpq_randtest(qy, state, 100); + } + } + + for (which = 0; which < 4; which++) + { + status = GR_SUCCESS; + alias = n_randint(state, 2); + + GR_MUST_SUCCEED(gr_randtest(x, state, R)); + GR_MUST_SUCCEED(gr_randtest(y, state, R)); + GR_MUST_SUCCEED(gr_randtest(xy1, state, R)); + + if (fused && alias) + GR_MUST_SUCCEED(gr_set(xy2, x, R)); + else if (fused) + GR_MUST_SUCCEED(gr_set(xy2, xy1, R)); + else + GR_MUST_SUCCEED(gr_randtest(xy2, state, R)); + + if (alias) + GR_MUST_SUCCEED(gr_set(xy1, x, R)); + + if (which == 0) + { + if (alias) + status |= gr_op_ui(xy1, xy1, uy, R); + else + status |= gr_op_ui(xy1, x, uy, R); + + status |= gr_set_ui(y, uy, R); + + if (status == GR_SUCCESS) + status |= gr_op(xy2, x, y, R); + } + else if (which == 1) + { + if (alias) + status |= gr_op_si(xy1, xy1, sy, R); + else + status |= gr_op_si(xy1, x, sy, R); + status |= gr_set_si(y, sy, R); + + if (status == GR_SUCCESS) + status |= gr_op(xy2, x, y, R); + } + else if (which == 2) + { + if (alias) + status |= gr_op_fmpz(xy1, xy1, zy, R); + else + status |= gr_op_fmpz(xy1, x, zy, R); + status |= gr_set_fmpz(y, zy, R); + + if (status == GR_SUCCESS) + status |= gr_op(xy2, x, y, R); + } + else + { + if (alias) + status |= gr_op_fmpq(xy1, xy1, qy, R); + else + status |= gr_op_fmpq(xy1, x, qy, R); + status |= gr_set_fmpq(y, qy, R); + + if (status == GR_SUCCESS) + status |= gr_op(xy2, x, y, R); + } + + if (status == GR_SUCCESS) + { + int cmp; + + status |= gr_sub(err, xy1, xy2, R); + status |= gr_div(err, err, xy2, R); + status |= gr_abs(err, err, R); + status |= gr_cmp(&cmp, err, rel_tol, R); + + if (status == GR_SUCCESS && cmp > 0) + { + status = GR_TEST_FAIL; + break; + } + } + } + + if ((test_flags & GR_TEST_ALWAYS_ABLE) && (status & GR_UNABLE)) + status = GR_TEST_FAIL; + + if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) + { + flint_printf("\n"); + flint_printf("%s\n", opname); + gr_ctx_println(R); + flint_printf("which: %d\n", which); + flint_printf("alias: %d\n", alias); + flint_printf("x = "); gr_println(x, R); + flint_printf("y = "); gr_println(y, R); + flint_printf("y (op) y (1) = "); gr_println(xy1, R); + flint_printf("x (op) y (2) = "); gr_println(xy2, R); + flint_printf("err = "); gr_println(err, R); + flint_printf("tol = "); gr_println(rel_tol, R); + flint_printf("\n"); + } + + if (status == GR_TEST_FAIL) + flint_abort(); + + GR_TMP_CLEAR5(x, y, xy1, xy2, err, R); + + fmpz_clear(zy); + fmpq_clear(qy); + + return status; +} + +int +gr_test_approx_dot(gr_ctx_t R, gr_ctx_t R_ref, slong maxlen, gr_srcptr rel_tol, flint_rand_t state, int test_flags) +{ + slong i, len; + gr_ptr s0, vec1, vec2, res; + int subtract, initial, reverse; + gr_ptr avec1, avec2; + gr_ptr as0, ares, ares2, amag, err, t; + int status = GR_SUCCESS; + int cmp; + + len = n_randint(state, maxlen); + initial = n_randint(state, 2); + subtract = n_randint(state, 2); + reverse = n_randint(state, 2); + + vec1 = gr_heap_init_vec(len, R); + vec2 = gr_heap_init_vec(len, R); + s0 = gr_heap_init(R); + res = gr_heap_init(R); + + avec1 = gr_heap_init_vec(len, R_ref); + avec2 = gr_heap_init_vec(len, R_ref); + GR_TMP_INIT4(as0, ares, ares2, amag, R_ref); + GR_TMP_INIT2(t, err, R_ref); + + if (initial) + { + status |= gr_randtest(s0, state, R); + status |= gr_set_other(as0, s0, R, R_ref); + + status |= gr_set(ares, as0, R_ref); + status |= gr_abs(t, as0, R_ref); + status |= gr_add(amag, amag, t, R_ref); + } + + for (i = 0; i < len; i++) + { + if (2 * i >= len && n_randint(state, 2)) + { + status |= gr_set(GR_ENTRY(vec1, i, R->sizeof_elem), GR_ENTRY(vec1, len - 1 - i, R->sizeof_elem), R); + status |= gr_neg(GR_ENTRY(vec2, i, R->sizeof_elem), GR_ENTRY(vec2, len - 1 - i, R->sizeof_elem), R); + } + else + { + status |= gr_randtest(GR_ENTRY(vec1, i, R->sizeof_elem), state, R); + status |= gr_randtest(GR_ENTRY(vec2, i, R->sizeof_elem), state, R); + } + + status |= gr_set_other(GR_ENTRY(avec1, i, R_ref->sizeof_elem), GR_ENTRY(vec1, i, R->sizeof_elem), R, R_ref); + status |= gr_set_other(GR_ENTRY(avec2, i, R_ref->sizeof_elem), GR_ENTRY(vec2, i, R->sizeof_elem), R, R_ref); + } + + GR_MUST_SUCCEED(status); + + for (i = 0; i < len; i++) + { + status |= gr_mul(t, GR_ENTRY(avec1, i, R_ref->sizeof_elem), GR_ENTRY(avec2, reverse ? len - 1 - i : i, R_ref->sizeof_elem), R_ref); + + if (subtract) + status |= gr_sub(ares, ares, t, R_ref); + else + status |= gr_add(ares, ares, t, R_ref); + + status |= gr_abs(t, t, R_ref); + status |= gr_add(amag, amag, t, R_ref); + } + + GR_MUST_SUCCEED(status); + + if (reverse) + GR_MUST_SUCCEED(_gr_vec_dot_rev(res, initial ? s0 : NULL, subtract, vec1, vec2, len, R)); + else + GR_MUST_SUCCEED(_gr_vec_dot(res, initial ? s0 : NULL, subtract, vec1, vec2, len, R)); + + status |= gr_set_other(ares2, res, R, R_ref); + status |= gr_sub(err, ares, ares2, R_ref); + status |= gr_abs(err, err, R_ref); + status |= gr_mul(t, amag, rel_tol, R_ref); + status |= gr_abs(t, t, R_ref); + GR_MUST_SUCCEED(status); + + status |= gr_cmpabs(&cmp, err, t, R_ref); + + if (status == GR_SUCCESS && cmp > 0) + status = GR_TEST_FAIL; + + if (status == GR_TEST_FAIL) + { + flint_printf("FAIL: dot\n"); + gr_ctx_println(R); + gr_ctx_println(R_ref); + + flint_printf("reverse = %d, subtract = %d\n", reverse, subtract); + + if (initial) + { + flint_printf("\n\ninitial = "); + gr_println(as0, R_ref); + } + + flint_printf("\n\nvec1 = "); + _gr_vec_print(vec1, len, R); + flint_printf("\n\nvec2 = "); + _gr_vec_print(vec2, len, R); + flint_printf("\n\nres (ref) = \n"); + gr_println(ares, R_ref); + flint_printf("\nres (computed) = \n"); + gr_println(ares2, R_ref); + flint_printf("\nrel_tol = \n"); + gr_println(rel_tol, R_ref); + flint_printf("\nabs_tol = \n"); + gr_println(t, R_ref); + flint_printf("\nerr = \n"); + gr_println(err, R_ref); + flint_printf("\n"); + + flint_abort(); + } + + gr_heap_clear_vec(vec1, len, R); + gr_heap_clear_vec(vec2, len, R); + gr_heap_clear(s0, R); + gr_heap_clear(res, R); + + gr_heap_clear_vec(avec1, len, R_ref); + gr_heap_clear_vec(avec2, len, R_ref); + GR_TMP_CLEAR4(as0, ares, ares2, amag, R_ref); + GR_TMP_CLEAR2(t, err, R_ref); + + return status; +} + void gr_test_iter(gr_ctx_t R, flint_rand_t state, const char * descr, gr_test_function func, slong iters, int test_flags) { diff --git a/src/nfloat.h b/src/nfloat.h index 199402794c..00e41183aa 100644 --- a/src/nfloat.h +++ b/src/nfloat.h @@ -550,6 +550,9 @@ int _nfloat_complex_vec_set(nfloat_complex_ptr res, nfloat_complex_srcptr x, slo int _nfloat_complex_vec_add(nfloat_complex_ptr res, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx); int _nfloat_complex_vec_sub(nfloat_complex_ptr res, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx); +int _nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx); +int _nfloat_complex_vec_dot_rev(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx); + #ifdef __cplusplus } #endif diff --git a/src/nfloat/complex.c b/src/nfloat/complex.c index 6ec103ad2b..3d4dc0bfd5 100644 --- a/src/nfloat/complex.c +++ b/src/nfloat/complex.c @@ -1664,9 +1664,9 @@ gr_method_tab_input _nfloat_complex_methods_input[] = {GR_METHOD_VEC_MUL_SCALAR, (gr_funcptr) _nfloat_complex_vec_mul_scalar}, {GR_METHOD_VEC_ADDMUL_SCALAR, (gr_funcptr) _nfloat_complex_vec_addmul_scalar}, {GR_METHOD_VEC_SUBMUL_SCALAR, (gr_funcptr) _nfloat_complex_vec_submul_scalar}, +*/ {GR_METHOD_VEC_DOT, (gr_funcptr) _nfloat_complex_vec_dot}, {GR_METHOD_VEC_DOT_REV, (gr_funcptr) _nfloat_complex_vec_dot_rev}, -*/ /* {GR_METHOD_POLY_MULLOW, (gr_funcptr) nfloat_complex_poly_mullow}, {GR_METHOD_POLY_ROOTS_OTHER,(gr_funcptr) nfloat_complex_poly_roots_other}, diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c index 65f9b8ec6a..32c678c200 100644 --- a/src/nfloat/dot.c +++ b/src/nfloat/dot.c @@ -11,6 +11,9 @@ #include "nfloat.h" +/* TODO: don't use ctx->sizeof_elem, to allow real dot products to be + called from complex context objects */ + /* Having some dummy code in the continue block makes the loop go faster. Don't ask me why. */ #if 1 #define DUMMY_OP add_ssaaaa(s1, s0, s1, s0, 0, 0) @@ -90,6 +93,128 @@ #endif +void +_nfloat_vec_dot_set_initial(ulong * s, slong sexp, nfloat_srcptr x, int subtract, slong nlimbs) +{ + slong xexp, delta; + slong delta_limbs, delta_bits; + int xsgnbit; + + xexp = NFLOAT_EXP(x); + xsgnbit = NFLOAT_SGNBIT(x) ^ subtract; + delta = sexp - xexp; + + if (delta < FLINT_BITS) + { + FLINT_ASSERT(delta != 0); + + mpn_rshift(s + 1, NFLOAT_D(x), nlimbs, delta); + s[0] = NFLOAT_D(x)[0] << (FLINT_BITS - delta); + } + else + { + delta_limbs = delta / FLINT_BITS; + delta_bits = delta % FLINT_BITS; + + if (delta_limbs < nlimbs + 1) + { + if (delta_bits == 0) + flint_mpn_copyi(s, NFLOAT_D(x) + delta_limbs - 1, nlimbs + 1 - delta_limbs); + else + mpn_rshift(s, NFLOAT_D(x) + delta_limbs - 1, nlimbs + 1 - delta_limbs, delta_bits); + } + } + + if (xsgnbit) + mpn_neg(s, s, nlimbs + 1); +} + +#include + +void +_nfloat_vec_dot_addmul(ulong * s, slong sexp, nfloat_srcptr xi, nfloat_srcptr yi, int subtract, slong nlimbs) +{ + slong xexp, delta; + slong delta_limbs, delta_bits; + int xsgnbit; + ulong t[NFLOAT_MAX_LIMBS + 1]; + + xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi); + xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi) ^ subtract; + delta = sexp - xexp; + + if (delta < FLINT_BITS) + { + t[0] = flint_mpn_mulhigh_n(t + 1, NFLOAT_D(xi), NFLOAT_D(yi), nlimbs); + + if (nlimbs == 3) + { + if (xsgnbit) + sub_ddddmmmmssss(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0], + t[3] >> delta, + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + else + add_ssssaaaaaaaa(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0], + t[3] >> delta, + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + } + else if (nlimbs == 4) + { + if (xsgnbit) + sub_dddddmmmmmsssss(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0], + t[4] >> delta, + (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)), + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + else + add_sssssaaaaaaaaaa(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0], + t[4] >> delta, + (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)), + (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), + (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), + (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); + } + else + { + mpn_rshift(t, t, nlimbs + 1, delta); + + if (xsgnbit) + mpn_sub_n(s, s, t, nlimbs + 1); + else + mpn_add_n(s, s, t, nlimbs + 1); + } + } + else + { + delta_limbs = delta / FLINT_BITS; + delta_bits = delta % FLINT_BITS; + + /* alternative criterion: if (delta < (FLINT_BITS * nlimbs) + 2 * pad_bits) */ + if (delta_limbs < nlimbs + 1) + { + /* todo: squaring case */ + flint_mpn_mulhigh_n(t, NFLOAT_D(xi) + delta_limbs - 1, NFLOAT_D(yi) + delta_limbs - 1, nlimbs + 1 - delta_limbs); + + if (delta_bits != 0) + mpn_rshift(t, t, nlimbs + 1 - delta_limbs, delta_bits); + + if (xsgnbit) + mpn_sub(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs); + else + mpn_add(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs); + } + else + { + /* Skip term. */ + } + } +} + int __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, slong sizeof_xstep, nfloat_srcptr y, slong sizeof_ystep, slong len, gr_ctx_t ctx) { @@ -341,11 +466,12 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src { xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi); -#if 1 +#if 0 FLINT_MPN_MUL_2X2H(t3, t2, t1, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]); (void) t0; #else FLINT_MPN_MUL_2X2(t3, t2, t1, t0, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]); + (void) t0; #endif if (xsgnbit) @@ -447,12 +573,10 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src if (!(NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN))) { - slong i, xexp, delta, sexp, norm, pad_bits; + slong i, xexp, sexp, pad_bits; int xsgnbit; - ulong t[NFLOAT_MAX_LIMBS + 1]; ulong s[NFLOAT_MAX_LIMBS + 1]; nfloat_srcptr xi, yi; - slong n; slong nlimbs = NFLOAT_CTX_NLIMBS(ctx); @@ -479,182 +603,214 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src sexp += pad_bits; if (initial != NULL && !NFLOAT_IS_ZERO(initial)) + _nfloat_vec_dot_set_initial(s, sexp, initial, subtract, nlimbs); + + for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep) { - xexp = NFLOAT_EXP(initial); - xsgnbit = NFLOAT_SGNBIT(initial) ^ subtract; - delta = sexp - xexp; + if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi)) + continue; - if (delta < FLINT_BITS) - { - FLINT_ASSERT(delta != 0); + _nfloat_vec_dot_addmul(s, sexp, xi, yi, 0, nlimbs); + } - mpn_rshift(s + 1, NFLOAT_D(initial), nlimbs, delta); - s[0] = NFLOAT_D(initial)[0] << (FLINT_BITS - delta); - if (xsgnbit) - mpn_neg(s, s, nlimbs + 1); - } - else - { - slong delta_limbs, delta_bits; + if (LIMB_MSB_IS_SET(s[nlimbs])) + { + mpn_neg(s, s, nlimbs + 1); + xsgnbit = !subtract; + } + else + { + xsgnbit = subtract; + } - delta_limbs = delta / FLINT_BITS; - delta_bits = delta % FLINT_BITS; + return nfloat_set_mpn_2exp(res, s, nlimbs + 1, sexp, xsgnbit, ctx); + } - if (delta_limbs < nlimbs + 1) - { - if (delta_bits == 0) - flint_mpn_copyi(s, NFLOAT_D(initial) + delta_limbs - 1, nlimbs + 1 - delta_limbs); - else - mpn_rshift(s, NFLOAT_D(initial) + delta_limbs - 1, nlimbs + 1 - delta_limbs, delta_bits); + return GR_UNABLE; +} - if (xsgnbit) - mpn_neg(s, s, nlimbs + 1); - } - } - } +int +_nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx) +{ + return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, y, ctx->sizeof_elem, len, ctx); +} - for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep) - { - if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi)) - continue; +int +_nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx) +{ + return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, GR_ENTRY(y, len - 1, ctx->sizeof_elem), -ctx->sizeof_elem, len, ctx); +} - xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi); - xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi); - delta = sexp - xexp; - if (delta < FLINT_BITS) - { - t[0] = flint_mpn_mulhigh_n(t + 1, NFLOAT_D(xi), NFLOAT_D(yi), nlimbs); +void +print_s(ulong * s, slong sexp, slong nlimbs) +{ + ulong x = s[nlimbs - 1]; + double c = 1.0; - if (nlimbs == 3) - { - if (xsgnbit) - sub_ddddmmmmssss(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0], - t[3] >> delta, - (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), - (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), - (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); - else - add_ssssaaaaaaaa(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0], - t[3] >> delta, - (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), - (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), - (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); - } - else if (nlimbs == 4) - { - if (xsgnbit) - sub_dddddmmmmmsssss(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0], - t[4] >> delta, - (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)), - (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), - (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), - (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); - else - add_sssssaaaaaaaaaa(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0], - t[4] >> delta, - (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)), - (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)), - (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)), - (t[0] >> delta) | (t[1] << (FLINT_BITS - delta))); - } - else - { - mpn_rshift(t, t, nlimbs + 1, delta); + if (LIMB_MSB_IS_SET(x)) + { + x = -x; + c = -1.0; + } - if (xsgnbit) - mpn_sub_n(s, s, t, nlimbs + 1); - else - mpn_add_n(s, s, t, nlimbs + 1); - } - } - else - { - slong delta_limbs, delta_bits; + flint_printf("SUM %.15e\n", (double) x * ldexp(1.0, sexp - FLINT_BITS) * c); +} - delta_limbs = delta / FLINT_BITS; - delta_bits = delta % FLINT_BITS; +int +__nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, slong xstep, nfloat_complex_srcptr y, slong ystep, slong len, gr_ctx_t ctx) +{ + slong i, pad_bits; + slong re_exp, im_exp; + ulong re_s[NFLOAT_MAX_LIMBS + 1]; + ulong im_s[NFLOAT_MAX_LIMBS + 1]; + nfloat_srcptr a, b, c, d; + nfloat_srcptr re_initial, im_initial; + nfloat_ptr re_res, im_res; + slong aexp, bexp, cexp, dexp; + nfloat_srcptr xi, yi; + int status = GR_SUCCESS; + int re_sgnbit, im_sgnbit; + + slong nlimbs = NFLOAT_CTX_NLIMBS(ctx); + slong real_part_size = NFLOAT_CTX_DATA_NLIMBS(ctx); + + re_exp = WORD_MIN; + im_exp = WORD_MIN; + + re_initial = NFLOAT_COMPLEX_RE(initial, ctx); + im_initial = NFLOAT_COMPLEX_IM(initial, ctx); + + if (initial != NULL) + { + if (!NFLOAT_IS_ZERO(re_initial)) + re_exp = NFLOAT_EXP(re_initial); + if (!NFLOAT_IS_ZERO(im_initial)) + im_exp = NFLOAT_EXP(im_initial); + } - /* alternative criterion: if (delta < (FLINT_BITS * nlimbs) + 2 * pad_bits) */ - if (delta_limbs < nlimbs + 1) - { - /* todo: squaring case */ - flint_mpn_mulhigh_n(t, NFLOAT_D(xi) + delta_limbs - 1, NFLOAT_D(yi) + delta_limbs - 1, nlimbs + 1 - delta_limbs); + for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep) + { + a = xi; + b = (ulong *) xi + real_part_size; + c = yi; + d = (ulong *) yi + real_part_size; + + aexp = NFLOAT_EXP(a); + bexp = NFLOAT_EXP(b); + cexp = NFLOAT_EXP(c); + dexp = NFLOAT_EXP(d); + + if (aexp != NFLOAT_EXP_ZERO && cexp != NFLOAT_EXP_ZERO) + re_exp = FLINT_MAX(re_exp, aexp + cexp); + if (bexp != NFLOAT_EXP_ZERO && dexp != NFLOAT_EXP_ZERO) + re_exp = FLINT_MAX(re_exp, bexp + dexp); + if (aexp != NFLOAT_EXP_ZERO && dexp != NFLOAT_EXP_ZERO) + im_exp = FLINT_MAX(im_exp, aexp + dexp); + if (bexp != NFLOAT_EXP_ZERO && cexp != NFLOAT_EXP_ZERO) + im_exp = FLINT_MAX(im_exp, bexp + cexp); + } - if (delta_bits != 0) - mpn_rshift(t, t, nlimbs + 1 - delta_limbs, delta_bits); + if (re_exp == WORD_MIN && im_exp == WORD_MIN) + return nfloat_complex_zero(res, ctx); - if (xsgnbit) - mpn_sub(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs); - else - mpn_add(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs); - } - else - { - /* Skip term. */ - } - } - } + pad_bits = FLINT_BIT_COUNT(len + 1) + 2; + if (re_exp != WORD_MIN) re_exp += pad_bits; + if (im_exp != WORD_MIN) im_exp += pad_bits; - if (LIMB_MSB_IS_SET(s[nlimbs])) - { - mpn_neg(s, s, nlimbs + 1); - xsgnbit = 1; - } - else - { - xsgnbit = 0; - } + flint_mpn_zero(re_s, nlimbs + 1); + flint_mpn_zero(im_s, nlimbs + 1); - NFLOAT_SGNBIT(res) = xsgnbit ^ subtract; + if (initial != NULL) + { + if (!NFLOAT_IS_ZERO(re_initial)) + _nfloat_vec_dot_set_initial(re_s, re_exp, re_initial, subtract, nlimbs); + if (!NFLOAT_IS_ZERO(im_initial)) + _nfloat_vec_dot_set_initial(im_s, im_exp, im_initial, subtract, nlimbs); + } - n = nlimbs + 1; - MPN_NORM(s, n); + for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep) + { + a = xi; + b = (ulong *) xi + real_part_size; + c = yi; + d = (ulong *) yi + real_part_size; - xexp = sexp - (nlimbs + 1 - n) * FLINT_BITS; +/* + aexp = NFLOAT_EXP(a); + bexp = NFLOAT_EXP(b); + cexp = NFLOAT_EXP(c); + dexp = NFLOAT_EXP(d); + + asgnbit = NFLOAT_SGNBIT(a); + bsgnbit = NFLOAT_SGNBIT(b); + csgnbit = NFLOAT_SGNBIT(c); + dsgnbit = NFLOAT_SGNBIT(d); + + abexp = FLINT_MAX(aexp, bexp) + 2; + cdexp = FLINT_MAX(cexp, dexp) + 2; + + adelta = abexp - aexp; + bdelta = abexp - bexp; + cdelta = cdexp - cexp; + ddelta = cdexp - dexp; +*/ - if (n == nlimbs + 1) + if (!NFLOAT_IS_ZERO(a)) { - norm = flint_clz(s[n - 1]); - if (norm) - { - mpn_lshift(NFLOAT_D(res), s + 1, nlimbs, norm); - NFLOAT_D(res)[0] |= (s[0] >> (FLINT_BITS - norm)); - } - else - flint_mpn_copyi(NFLOAT_D(res), s + 1, nlimbs); - xexp -= norm; + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul(re_s, re_exp, a, c, 0, nlimbs); + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul(im_s, im_exp, a, d, 0, nlimbs); } - else - { - if (n == 0) - return nfloat_zero(res, ctx); - norm = flint_clz(s[n - 1]); - if (norm) - mpn_lshift(NFLOAT_D(res) + nlimbs - n, s, n, norm); - else - flint_mpn_copyi(NFLOAT_D(res) + nlimbs - n, s, n); - flint_mpn_zero(NFLOAT_D(res), nlimbs - n); - xexp -= norm; + if (!NFLOAT_IS_ZERO(b)) + { + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul(re_s, re_exp, b, d, 1, nlimbs); + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul(im_s, im_exp, b, c, 0, nlimbs); } + } - NFLOAT_EXP(res) = xexp; - NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx); - return GR_SUCCESS; + re_res = res; + im_res = (nn_ptr) res + real_part_size; + + if (re_exp == WORD_MIN) + { + status |= nfloat_zero(re_res, ctx); + } + else + { + re_sgnbit = LIMB_MSB_IS_SET(re_s[nlimbs]); + if (re_sgnbit) + mpn_neg(re_s, re_s, nlimbs + 1); + status |= nfloat_set_mpn_2exp(re_res, re_s, nlimbs + 1, re_exp, re_sgnbit ^ subtract, ctx); } - return GR_UNABLE; + if (im_exp == WORD_MIN) + { + status |= nfloat_zero(im_res, ctx); + } + else + { + im_sgnbit = LIMB_MSB_IS_SET(im_s[nlimbs]); + if (im_sgnbit) + mpn_neg(im_s, im_s, nlimbs + 1); + status |= nfloat_set_mpn_2exp(im_res, im_s, nlimbs + 1, im_exp, im_sgnbit ^ subtract, ctx); + } + + return status; } int -_nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx) +_nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx) { - return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, y, ctx->sizeof_elem, len, ctx); + return __nfloat_complex_vec_dot(res, initial, subtract, x, NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), y, NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), len, ctx); } int -_nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx) +_nfloat_complex_vec_dot_rev(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx) { - return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, GR_ENTRY(y, len - 1, ctx->sizeof_elem), -ctx->sizeof_elem, len, ctx); + return __nfloat_complex_vec_dot(res, initial, subtract, x, NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), GR_ENTRY(y, len - 1, ctx->sizeof_elem), -NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), len, ctx); } diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c index 05fa2b3977..a1d746bc15 100644 --- a/src/nfloat/test/t-nfloat.c +++ b/src/nfloat/test/t-nfloat.c @@ -13,522 +13,11 @@ #include "fmpq.h" #include "arf.h" #include "acf.h" +#include "gr.h" #include "gr_vec.h" #include "gr_special.h" #include "nfloat.h" -int -gr_test_cmp_fun(gr_ctx_t R, gr_method_binary_op_get_int op, gr_ctx_t R_ref, flint_rand_t state, int test_flags) -{ - int status = GR_SUCCESS; - int cmp1, cmp2; - gr_ptr a, b, a_ref, b_ref; - - GR_TMP_INIT2(a, b, R); - GR_TMP_INIT2(a_ref, b_ref, R_ref); - - status |= gr_randtest(a, state, R); - status |= gr_randtest(b, state, R); - - status |= gr_set_other(a_ref, a, R, R_ref); - status |= gr_set_other(b_ref, b, R, R_ref); - - status |= op(&cmp1, a, b, R); - status |= op(&cmp2, a_ref, b_ref, R_ref); - - if (status == GR_SUCCESS && cmp1 != cmp2) - status = GR_TEST_FAIL; - - if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) - { - flint_printf("\n"); - gr_ctx_println(R); - gr_ctx_println(R_ref); - flint_printf("a = "); gr_println(a, R); - flint_printf("b = "); gr_println(b, R); - flint_printf("cmp1 = %d\n", cmp1); - flint_printf("cmp2 = %d\n", cmp2); - flint_printf("\n"); - } - - if (status == GR_TEST_FAIL) - flint_abort(); - - GR_TMP_CLEAR2(a, b, R); - GR_TMP_CLEAR2(a_ref, b_ref, R_ref); - - return status; -} - -int -gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags) -{ - int status = GR_SUCCESS; - int alias; - int cmp; - gr_ptr a, b, a_ref, b_ref, rel_err; - - GR_TMP_INIT2(a, b, R); - GR_TMP_INIT3(a_ref, b_ref, rel_err, R_ref); - - alias = n_randint(state, 2); - - status |= gr_randtest(a, state, R); - status |= gr_randtest(b, state, R); - - status |= gr_set_other(a_ref, a, R, R_ref); - - if (status == GR_SUCCESS) - { - if (alias == 0) - { - status |= op(b, a, R); - status |= op(b_ref, a_ref, R_ref); - } - else - { - status |= gr_set(b, a, R); - status |= op(b, b, R); - status |= op(b_ref, a_ref, R_ref); - } - - if (status == GR_SUCCESS) - { - status |= gr_set_other(rel_err, b, R, R_ref); - status |= gr_sub(rel_err, b_ref, rel_err, R_ref); - status |= gr_div(rel_err, rel_err, b_ref, R_ref); - status |= gr_abs(rel_err, rel_err, R_ref); - status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref); - - if (status == GR_SUCCESS && cmp > 0) - status = GR_TEST_FAIL; - - if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) - { - flint_printf("\n"); - gr_ctx_println(R); - gr_ctx_println(R_ref); - flint_printf("alias: %d\n", alias); - flint_printf("Computed:\n"); - flint_printf("a = "); gr_println(a, R); - flint_printf("op(a) = "); gr_println(b, R); - flint_printf("Reference:\n"); - flint_printf("a = "); gr_println(a_ref, R_ref); - flint_printf("op(a) = "); gr_println(b_ref, R_ref); - flint_printf("\nrel_err = "); gr_println(rel_err, R_ref); - flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref); - flint_printf("\n"); - } - } - } - - if (status == GR_TEST_FAIL) - flint_abort(); - - GR_TMP_CLEAR2(a, b, R); - GR_TMP_CLEAR3(a_ref, b_ref, rel_err, R_ref); - - return status; -} - -int -gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags) -{ - int status = GR_SUCCESS; - int alias; - int cmp; - gr_ptr a, b, c, a_ref, b_ref, c_ref, rel_err; - - GR_TMP_INIT3(a, b, c, R); - GR_TMP_INIT4(a_ref, b_ref, c_ref, rel_err, R_ref); - - alias = n_randint(state, 5); - - status |= gr_randtest(a, state, R); - status |= gr_randtest(b, state, R); - status |= gr_randtest(c, state, R); - - status |= gr_set_other(a_ref, a, R, R_ref); - status |= gr_set_other(b_ref, b, R, R_ref); - - if (status == GR_SUCCESS) - { - if (alias == 0) - { - status |= op(c, a, b, R); - status |= op(c_ref, a_ref, b_ref, R_ref); - } - else if (alias == 1) - { - status |= gr_set(c, a, R); - status |= op(c, c, b, R); - status |= op(c_ref, a_ref, b_ref, R_ref); - } - else if (alias == 2) - { - status |= gr_set(c, b, R); - status |= op(c, a, c, R); - status |= op(c_ref, a_ref, b_ref, R_ref); - } - else if (alias == 3) - { - status |= gr_set(b, a, R); - status |= gr_set(b_ref, a_ref, R_ref); - status |= op(c, a, a, R); - status |= op(c_ref, a_ref, a_ref, R_ref); - } - else - { - status |= gr_set(b, a, R); - status |= gr_set(c, a, R); - status |= gr_set(b_ref, a_ref, R_ref); - status |= op(c, c, c, R); - status |= op(c_ref, a_ref, a_ref, R_ref); - } - - if (status == GR_SUCCESS) - { - status |= gr_set_other(rel_err, c, R, R_ref); - - status |= gr_sub(rel_err, c_ref, rel_err, R_ref); - status |= gr_div(rel_err, rel_err, c_ref, R_ref); - status |= gr_abs(rel_err, rel_err, R_ref); - status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref); - - if (status == GR_SUCCESS && cmp > 0) - status = GR_TEST_FAIL; - - if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) - { - flint_printf("\n"); - gr_ctx_println(R); - gr_ctx_println(R_ref); - flint_printf("alias: %d\n", alias); - flint_printf("Computed:\n"); - flint_printf("a = "); gr_println(a, R); - flint_printf("b = "); gr_println(b, R); - flint_printf("a (op) b = "); gr_println(c, R); - flint_printf("Reference:\n"); - flint_printf("a = "); gr_println(a_ref, R_ref); - flint_printf("b = "); gr_println(b_ref, R_ref); - flint_printf("a (op) b = "); gr_println(c_ref, R_ref); - flint_printf("\nrel_err = "); gr_println(rel_err, R_ref); - flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref); - flint_printf("\n"); - } - } - } - - if (status == GR_TEST_FAIL) - flint_abort(); - - GR_TMP_CLEAR3(a, b, c, R); - GR_TMP_CLEAR4(a_ref, b_ref, c_ref, rel_err, R_ref); - - return status; -} - -int -gr_test_approx_binary_op_type_variants(gr_ctx_t R, - const char * opname, - gr_method_binary_op gr_op, - gr_method_binary_op_ui gr_op_ui, - gr_method_binary_op_si gr_op_si, - gr_method_binary_op_fmpz gr_op_fmpz, - gr_method_binary_op_fmpq gr_op_fmpq, - int fused, - int small_test_values, - gr_srcptr rel_tol, flint_rand_t state, int test_flags) -{ - int status, alias, which; - gr_ptr x, y, xy1, xy2, err; - ulong uy; - slong sy; - fmpz_t zy; - fmpq_t qy; - - GR_TMP_INIT5(x, y, xy1, xy2, err, R); - fmpz_init(zy); - fmpq_init(qy); - - which = 0; - - if (small_test_values) - { - uy = n_randint(state, 6); - sy = -5 + (slong) n_randint(state, 11); - fmpz_randtest(zy, state, 3); - fmpq_randtest(qy, state, 3); - } - else - { - uy = n_randtest(state); - sy = (slong) n_randtest(state); - - if (n_randint(state, 10) == 0) - { - fmpz_randtest(zy, state, 10000); - fmpq_randtest(qy, state, 200); - } - else - { - fmpz_randtest(zy, state, 100); - fmpq_randtest(qy, state, 100); - } - } - - for (which = 0; which < 4; which++) - { - status = GR_SUCCESS; - alias = n_randint(state, 2); - - GR_MUST_SUCCEED(gr_randtest(x, state, R)); - GR_MUST_SUCCEED(gr_randtest(y, state, R)); - GR_MUST_SUCCEED(gr_randtest(xy1, state, R)); - - if (fused && alias) - GR_MUST_SUCCEED(gr_set(xy2, x, R)); - else if (fused) - GR_MUST_SUCCEED(gr_set(xy2, xy1, R)); - else - GR_MUST_SUCCEED(gr_randtest(xy2, state, R)); - - if (alias) - GR_MUST_SUCCEED(gr_set(xy1, x, R)); - - if (which == 0) - { - if (alias) - status |= gr_op_ui(xy1, xy1, uy, R); - else - status |= gr_op_ui(xy1, x, uy, R); - - status |= gr_set_ui(y, uy, R); - - if (status == GR_SUCCESS) - status |= gr_op(xy2, x, y, R); - } - else if (which == 1) - { - if (alias) - status |= gr_op_si(xy1, xy1, sy, R); - else - status |= gr_op_si(xy1, x, sy, R); - status |= gr_set_si(y, sy, R); - - if (status == GR_SUCCESS) - status |= gr_op(xy2, x, y, R); - } - else if (which == 2) - { - if (alias) - status |= gr_op_fmpz(xy1, xy1, zy, R); - else - status |= gr_op_fmpz(xy1, x, zy, R); - status |= gr_set_fmpz(y, zy, R); - - if (status == GR_SUCCESS) - status |= gr_op(xy2, x, y, R); - } - else - { - if (alias) - status |= gr_op_fmpq(xy1, xy1, qy, R); - else - status |= gr_op_fmpq(xy1, x, qy, R); - status |= gr_set_fmpq(y, qy, R); - - if (status == GR_SUCCESS) - status |= gr_op(xy2, x, y, R); - } - - if (status == GR_SUCCESS) - { - int cmp; - - status |= gr_sub(err, xy1, xy2, R); - status |= gr_div(err, err, xy2, R); - status |= gr_abs(err, err, R); - status |= gr_cmp(&cmp, err, rel_tol, R); - - if (status == GR_SUCCESS && cmp > 0) - { - status = GR_TEST_FAIL; - break; - } - } - } - - if ((test_flags & GR_TEST_ALWAYS_ABLE) && (status & GR_UNABLE)) - status = GR_TEST_FAIL; - - if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL) - { - flint_printf("\n"); - flint_printf("%s\n", opname); - gr_ctx_println(R); - flint_printf("which: %d\n", which); - flint_printf("alias: %d\n", alias); - flint_printf("x = "); gr_println(x, R); - flint_printf("y = "); gr_println(y, R); - flint_printf("y (op) y (1) = "); gr_println(xy1, R); - flint_printf("x (op) y (2) = "); gr_println(xy2, R); - flint_printf("err = "); gr_println(err, R); - flint_printf("tol = "); gr_println(rel_tol, R); - flint_printf("\n"); - } - - if (status == GR_TEST_FAIL) - flint_abort(); - - GR_TMP_CLEAR5(x, y, xy1, xy2, err, R); - - fmpz_clear(zy); - fmpq_clear(qy); - - return status; -} - -void -nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx) -{ - slong iter, i, len, prec, ebits; - gr_ptr s0, vec1, vec2, res; - int subtract, initial, reverse; - arf_ptr avec1, avec2; - arf_t as0, ares, ares2, amag, err, t; - - prec = NFLOAT_CTX_PREC(ctx); - - for (iter = 0; iter < iters; iter++) - { - len = n_randint(state, 30); - - if (n_randint(state, 2)) - ebits = 1; - else - ebits = 10; - - initial = n_randint(state, 2); - subtract = n_randint(state, 2); - reverse = n_randint(state, 2); - - vec1 = gr_heap_init_vec(len, ctx); - vec2 = gr_heap_init_vec(len, ctx); - s0 = gr_heap_init(ctx); - res = gr_heap_init(ctx); - - avec1 = _arf_vec_init(len); - avec2 = _arf_vec_init(len); - arf_init(as0); - arf_init(ares); - arf_init(ares2); - arf_init(amag); - arf_init(t); - arf_init(err); - - if (initial) - { - arf_randtest(as0, state, prec, ebits); - GR_MUST_SUCCEED(nfloat_set_arf(s0, as0, ctx)); - - arf_set(ares, as0); - arf_abs(t, as0); - arf_add(amag, amag, t, prec, ARF_RND_DOWN); - } - - for (i = 0; i < len; i++) - { - if (i > 0 && n_randint(state, 2)) - { - arf_set(avec1 + i, avec1 + len - 1 - i); - arf_neg(avec2 + i, avec2 + len - 1 - i); - } - else - { - arf_randtest(avec1 + i, state, prec, ebits); - arf_randtest(avec2 + i, state, prec, ebits); - } - } - - for (i = 0; i < len; i++) - { - arf_mul(t, avec1 + i, avec2 + (reverse ? len - 1 - i : i), 2 * prec, ARF_RND_DOWN); - - if (subtract) - arf_sub(ares, ares, t, 2 * prec, ARF_RND_DOWN); - else - arf_add(ares, ares, t, 2 * prec, ARF_RND_DOWN); - - arf_abs(t, t); - arf_add(amag, amag, t, prec, ARF_RND_DOWN); - } - - /* tolerance */ - arf_mul_2exp_si(t, amag, -prec + 3); - - for (i = 0; i < len; i++) - { - GR_MUST_SUCCEED(nfloat_set_arf(GR_ENTRY(vec1, i, ctx->sizeof_elem), avec1 + i, ctx)); - GR_MUST_SUCCEED(nfloat_set_arf(GR_ENTRY(vec2, i, ctx->sizeof_elem), avec2 + i, ctx)); - } - - if (reverse) - GR_MUST_SUCCEED(_nfloat_vec_dot_rev(res, initial ? s0 : NULL, subtract, vec1, vec2, len, ctx)); - else - GR_MUST_SUCCEED(_nfloat_vec_dot(res, initial ? s0 : NULL, subtract, vec1, vec2, len, ctx)); - - GR_MUST_SUCCEED(nfloat_get_arf(ares2, res, ctx)); - - arf_sub(err, ares, ares2, prec, ARF_RND_DOWN); - arf_abs(err, err); - - if (arf_cmpabs(err, t) > 0) - { - flint_printf("FAIL: dot\n"); - gr_ctx_println(ctx); - - flint_printf("reverse = %d, subtract = %d\n", reverse, subtract); - - if (initial) - { - flint_printf("\n\ninitial = "); - arf_printd(as0, 2 + prec / 3.33); - } - - flint_printf("\n\nvec1 = "); - _gr_vec_print(vec1, len, ctx); - flint_printf("\n\nvec2 = "); - _gr_vec_print(vec2, len, ctx); - flint_printf("\n\nares = \n"); - arf_printd(ares, 2 + prec / 3.33); - flint_printf("\n\nares2 = \n"); - arf_printd(ares2, 2 + prec / 3.33); - flint_printf("\n\ntol = \n"); - arf_printd(t, 10); - flint_printf("\n\nerr = \n"); - arf_printd(err, 10); - flint_printf("\n\n"); - - flint_abort(); - } - - gr_heap_clear_vec(vec1, len, ctx); - gr_heap_clear_vec(vec2, len, ctx); - gr_heap_clear(s0, ctx); - gr_heap_clear(res, ctx); - - _arf_vec_clear(avec1, len); - _arf_vec_clear(avec2, len); - arf_clear(as0); - arf_clear(ares); - arf_clear(ares2); - arf_clear(amag); - arf_clear(t); - arf_clear(err); - } -} - TEST_FUNCTION_START(nfloat, state) { gr_ctx_t ctx; @@ -536,53 +25,12 @@ TEST_FUNCTION_START(nfloat, state) slong prec; slong iter; - for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS) - { - nfloat_complex_ctx_init(ctx, prec, 0); - - gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0); - - { - gr_ptr tol1, tol; - slong i, reps; - - gr_ctx_init_complex_acb(ctx2, prec + 32); - - tol1 = gr_heap_init(ctx); - tol = gr_heap_init(ctx2); - - GR_IGNORE(gr_one(tol, ctx2)); - GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 3, ctx2)); - - reps = (prec <= 256 ? 10000 : 1) * flint_test_multiplier(); - - for (i = 0; i < reps; i++) - { - gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_add, ctx2, tol, state, 0); - gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_sub, ctx2, tol, state, 0); - gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_mul, ctx2, tol, state, 0); - gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_div, ctx2, tol, state, 0); - gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_neg, ctx2, tol, state, 0); - gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sqr, ctx2, tol, state, 0); - gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_inv, ctx2, tol, state, 0); - } - - gr_heap_clear(tol1, ctx); - gr_heap_clear(tol, ctx2); - gr_ctx_clear(ctx2); - } - - gr_ctx_clear(ctx); - } - for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS) { nfloat_ctx_init(ctx, prec, 0); gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0); - nfloat_test_dot(state, (prec <= 128 ? 10000 : 100) * flint_test_multiplier(), ctx); - { gr_ptr x, x2; slong i; @@ -641,7 +89,7 @@ TEST_FUNCTION_START(nfloat, state) gr_ptr tol1, tol; slong i, reps; - gr_ctx_init_real_arb(ctx2, prec + 32); + gr_ctx_init_real_arb(ctx2, prec + 64); tol1 = gr_heap_init(ctx); tol = gr_heap_init(ctx2); @@ -657,6 +105,12 @@ TEST_FUNCTION_START(nfloat, state) gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_sub, ctx2, tol, state, 0); } + reps = (prec <= 256 ? 10000 : 100) * flint_test_multiplier(); + + for (i = 0; i < reps; i++) + gr_test_approx_dot(ctx, ctx2, 30, tol, state, 0); + + reps = (prec <= 256 ? 1000 : 1) * flint_test_multiplier(); for (i = 0; i < reps; i++) diff --git a/src/nfloat/test/t-nfloat_complex.c b/src/nfloat/test/t-nfloat_complex.c index f800dbe883..320cc34f66 100644 --- a/src/nfloat/test/t-nfloat_complex.c +++ b/src/nfloat/test/t-nfloat_complex.c @@ -18,12 +18,48 @@ TEST_FUNCTION_START(nfloat_complex, state) { -/* gr_ctx_t ctx; gr_ctx_t ctx2; slong prec; - slong iter; -*/ + slong iter, reps; + gr_ptr tol1, tol; + + for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS) + { + nfloat_complex_ctx_init(ctx, prec, 0); + gr_ctx_init_complex_acb(ctx2, prec + 64); + + gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0); + + tol1 = gr_heap_init(ctx); + tol = gr_heap_init(ctx2); + + GR_IGNORE(gr_one(tol, ctx2)); + GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 3, ctx2)); + reps = (prec <= 256 ? 10000 : 1) * flint_test_multiplier(); + for (iter = 0; iter < reps; iter++) + { + gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_add, ctx2, tol, state, 0); + gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_sub, ctx2, tol, state, 0); + gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_mul, ctx2, tol, state, 0); + gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_div, ctx2, tol, state, 0); + gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_neg, ctx2, tol, state, 0); + gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sqr, ctx2, tol, state, 0); + gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_inv, ctx2, tol, state, 0); + } + + GR_IGNORE(gr_one(tol, ctx2)); + GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 4, ctx2)); + reps = (prec <= 256 ? 10000 : 100) * flint_test_multiplier(); + for (iter = 0; iter < reps; iter++) + gr_test_approx_dot(ctx, ctx2, 10, tol, state, 0); + + gr_heap_clear(tol1, ctx); + gr_heap_clear(tol, ctx2); + gr_ctx_clear(ctx2); + + gr_ctx_clear(ctx); + } TEST_FUNCTION_END(state); -} +} \ No newline at end of file From 027e9219994b392950ef9de44644b3710311d997 Mon Sep 17 00:00:00 2001 From: Fredrik Johansson Date: Wed, 29 May 2024 10:19:34 +0200 Subject: [PATCH 2/2] nfloat_complex dot products --- src/gr/test_ring.c | 2 +- src/nfloat/complex.c | 2 +- src/nfloat/dot.c | 388 ++++++++++++++++++++++++++++++++++--- src/nfloat/test/t-nfloat.c | 2 - 4 files changed, 358 insertions(+), 36 deletions(-) diff --git a/src/gr/test_ring.c b/src/gr/test_ring.c index 78a5c939d3..c1a16d7c43 100644 --- a/src/gr/test_ring.c +++ b/src/gr/test_ring.c @@ -4025,7 +4025,7 @@ gr_test_approx_dot(gr_ctx_t R, gr_ctx_t R_ref, slong maxlen, gr_srcptr rel_tol, int status = GR_SUCCESS; int cmp; - len = n_randint(state, maxlen); + len = n_randint(state, maxlen + 1); initial = n_randint(state, 2); subtract = n_randint(state, 2); reverse = n_randint(state, 2); diff --git a/src/nfloat/complex.c b/src/nfloat/complex.c index 3d4dc0bfd5..ec2a7a26ee 100644 --- a/src/nfloat/complex.c +++ b/src/nfloat/complex.c @@ -17,7 +17,7 @@ #include "acb.h" #include "nfloat.h" -int +static int _flint_mpn_signed_add_n(nn_ptr res, nn_srcptr x, int xsgnbit, nn_srcptr y, int ysgnbit, mp_size_t n) { if (xsgnbit == ysgnbit) diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c index 32c678c200..4d2a0c0970 100644 --- a/src/nfloat/dot.c +++ b/src/nfloat/dot.c @@ -129,6 +129,56 @@ _nfloat_vec_dot_set_initial(ulong * s, slong sexp, nfloat_srcptr x, int subtract mpn_neg(s, s, nlimbs + 1); } +/* n is the number of limbs of s and x */ +void +_nfloat_vec_dot_add_limbs(ulong * s, slong sexp, mp_srcptr x, slong xexp, int subtract, slong n) +{ + slong delta, delta_limbs, delta_bits; + ulong t[NFLOAT_MAX_LIMBS + 1]; + + if (sexp < xexp) + { + delta_bits = xexp - sexp; + delta_limbs = 0; + + FLINT_ASSERT(delta < FLINT_BITS); + + mpn_lshift(t, x, n, delta_bits); + + if (subtract) + mpn_sub_n(s, s, t, n); + else + mpn_add_n(s, s, t, n); + } + else + { + delta = sexp - xexp; + + delta_limbs = delta / FLINT_BITS; + delta_bits = delta % FLINT_BITS; + + if (delta_limbs < n) + { + if (delta_bits == 0) + { + if (subtract) + mpn_sub(s, s, n, x + delta_limbs, n - delta_limbs); + else + mpn_add(s, s, n, x + delta_limbs, n - delta_limbs); + } + else + { + mpn_rshift(t, x + delta_limbs, n - delta_limbs, delta_bits); + + if (subtract) + mpn_sub(s, s, n, t, n - delta_limbs); + else + mpn_add(s, s, n, t, n - delta_limbs); + } + } + } +} + #include void @@ -641,7 +691,7 @@ _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_ return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, GR_ENTRY(y, len - 1, ctx->sizeof_elem), -ctx->sizeof_elem, len, ctx); } - +/* void print_s(ulong * s, slong sexp, slong nlimbs) { @@ -656,6 +706,125 @@ print_s(ulong * s, slong sexp, slong nlimbs) flint_printf("SUM %.15e\n", (double) x * ldexp(1.0, sexp - FLINT_BITS) * c); } +*/ + +FLINT_FORCE_INLINE void +_nfloat_vec_dot_addmul_1(ulong * _s1, ulong * _s0, slong sexp, nfloat_srcptr xi, nfloat_srcptr yi, int subtract) +{ + slong xexp, delta; + ulong s0, s1, t0, t1; + int xsgnbit; + + xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi); + delta = sexp - xexp; + xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi) ^ subtract; + + s0 = *_s0; + s1 = *_s1; + + if (delta < FLINT_BITS) + { + umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]); + if (xsgnbit) + sub_ddmmss(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta))); + else + add_ssaaaa(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta))); + } + else if (delta < 2 * FLINT_BITS) + { + umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]); + if (xsgnbit) + sub_ddmmss(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS)); + else + add_ssaaaa(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS)); + } + + *_s0 = s0; + *_s1 = s1; +} + +FLINT_FORCE_INLINE void +_nfloat_vec_dot_addmul_2(ulong * _s2, ulong * _s1, ulong * _s0, slong sexp, nfloat_srcptr xi, nfloat_srcptr yi, int subtract) +{ + slong xexp, delta; + ulong s0, s1, s2, t0, t1, t2, t3; + int xsgnbit; + + s0 = *_s0; + s1 = *_s1; + s2 = *_s2; + + xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi); + xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi) ^ subtract; + delta = sexp - xexp; + FLINT_ASSERT(delta != 0); + + if (delta < FLINT_BITS) + { + FLINT_MPN_MUL_2X2(t3, t2, t1, t0, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]); + (void) t0; + + if (xsgnbit) + sub_dddmmmsss(s2, s1, s0, s2, s1, s0, t3 >> delta, (t2 >> delta) | (t3 << (FLINT_BITS - delta)), (t1 >> delta) | (t2 << (FLINT_BITS - delta))); + else + add_sssaaaaaa(s2, s1, s0, s2, s1, s0, t3 >> delta, (t2 >> delta) | (t3 << (FLINT_BITS - delta)), (t1 >> delta) | (t2 << (FLINT_BITS - delta))); + } + else if (delta < 3 * FLINT_BITS) + { + if (delta < 2 * FLINT_BITS) + { + FLINT_MPN_MUL_2X2H(t3, t2, t1, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]); + + delta -= FLINT_BITS; + + if (delta != 0) + { + t2 = (t2 >> delta) | (t3 << (FLINT_BITS - delta)); + t3 = t3 >> delta; + } + } + else + { + umul_ppmm(t3, t2, NFLOAT_D(xi)[1], NFLOAT_D(yi)[1]); + + delta -= 2 * FLINT_BITS; + + if (delta != 0) + t3 = t3 >> delta; + + t2 = t3; + t3 = 0; + } + + if (xsgnbit) + sub_dddmmmsss(s2, s1, s0, s2, s1, s0, 0, t3, t2); + else + add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t3, t2); + } + + *_s0 = s0; + *_s1 = s1; + *_s2 = s2; +} + +static int +_flint_mpn_signed_add_n(nn_ptr res, nn_srcptr x, int xsgnbit, nn_srcptr y, int ysgnbit, mp_size_t n) +{ + if (xsgnbit == ysgnbit) + mpn_add_n(res, x, y, n); + else + { + if (mpn_cmp(x, y, n) >= 0) + mpn_sub_n(res, x, y, n); + else + { + mpn_sub_n(res, y, x, n); + xsgnbit = !xsgnbit; + } + } + + return xsgnbit; +} int __nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, slong xstep, nfloat_complex_srcptr y, slong ystep, slong len, gr_ctx_t ctx) @@ -729,50 +898,205 @@ __nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, _nfloat_vec_dot_set_initial(im_s, im_exp, im_initial, subtract, nlimbs); } - for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep) + if (nlimbs == 1) { - a = xi; - b = (ulong *) xi + real_part_size; - c = yi; - d = (ulong *) yi + real_part_size; + ulong re0, re1; + ulong im0, im1; -/* - aexp = NFLOAT_EXP(a); - bexp = NFLOAT_EXP(b); - cexp = NFLOAT_EXP(c); - dexp = NFLOAT_EXP(d); + re0 = re_s[0]; + re1 = re_s[1]; + im0 = im_s[0]; + im1 = im_s[1]; - asgnbit = NFLOAT_SGNBIT(a); - bsgnbit = NFLOAT_SGNBIT(b); - csgnbit = NFLOAT_SGNBIT(c); - dsgnbit = NFLOAT_SGNBIT(d); + for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep) + { + a = xi; + b = (ulong *) xi + real_part_size; + c = yi; + d = (ulong *) yi + real_part_size; - abexp = FLINT_MAX(aexp, bexp) + 2; - cdexp = FLINT_MAX(cexp, dexp) + 2; + if (!NFLOAT_IS_ZERO(a)) + { + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul_1(&re1, &re0, re_exp, a, c, 0); + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul_1(&im1, &im0, im_exp, a, d, 0); + } - adelta = abexp - aexp; - bdelta = abexp - bexp; - cdelta = cdexp - cexp; - ddelta = cdexp - dexp; -*/ + if (!NFLOAT_IS_ZERO(b)) + { + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul_1(&re1, &re0, re_exp, b, d, 1); + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul_1(&im1, &im0, im_exp, b, c, 0); + } + } - if (!NFLOAT_IS_ZERO(a)) + re_s[0] = re0; + re_s[1] = re1; + im_s[0] = im0; + im_s[1] = im1; + } + else if (nlimbs == 2) + { + ulong re0, re1, re2; + ulong im0, im1, im2; + + re0 = re_s[0]; + re1 = re_s[1]; + re2 = re_s[2]; + im0 = im_s[0]; + im1 = im_s[1]; + im2 = im_s[2]; + + for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep) { - if (!NFLOAT_IS_ZERO(c)) - _nfloat_vec_dot_addmul(re_s, re_exp, a, c, 0, nlimbs); - if (!NFLOAT_IS_ZERO(d)) - _nfloat_vec_dot_addmul(im_s, im_exp, a, d, 0, nlimbs); + a = xi; + b = (ulong *) xi + real_part_size; + c = yi; + d = (ulong *) yi + real_part_size; + + if (!NFLOAT_IS_ZERO(a)) + { + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul_2(&re2, &re1, &re0, re_exp, a, c, 0); + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul_2(&im2, &im1, &im0, im_exp, a, d, 0); + } + + if (!NFLOAT_IS_ZERO(b)) + { + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul_2(&re2, &re1, &re0, re_exp, b, d, 1); + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul_2(&im2, &im1, &im0, im_exp, b, c, 0); + } } - if (!NFLOAT_IS_ZERO(b)) + re_s[0] = re0; + re_s[1] = re1; + re_s[2] = re2; + + im_s[0] = im0; + im_s[1] = im1; + im_s[2] = im2; + } + else + { + for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep) { - if (!NFLOAT_IS_ZERO(d)) - _nfloat_vec_dot_addmul(re_s, re_exp, b, d, 1, nlimbs); - if (!NFLOAT_IS_ZERO(c)) - _nfloat_vec_dot_addmul(im_s, im_exp, b, c, 0, nlimbs); + a = xi; + b = (ulong *) xi + real_part_size; + c = yi; + d = (ulong *) yi + real_part_size; + + /* Want Karatsuba? */ + if (nlimbs >= 12 && !NFLOAT_IS_ZERO(a) && !NFLOAT_IS_ZERO(b) && + !NFLOAT_IS_ZERO(c) && !NFLOAT_IS_ZERO(d)) + { + slong abexp, cdexp, abcdexp, adelta, bdelta, cdelta, ddelta; + + aexp = NFLOAT_EXP(a); + bexp = NFLOAT_EXP(b); + cexp = NFLOAT_EXP(c); + dexp = NFLOAT_EXP(d); + + abexp = FLINT_MAX(aexp, bexp) + 2; + cdexp = FLINT_MAX(cexp, dexp) + 2; + abcdexp = abexp + cdexp; + + adelta = abexp - aexp; + bdelta = abexp - bexp; + cdelta = cdexp - cexp; + ddelta = cdexp - dexp; + + if (adelta < FLINT_BITS && bdelta < FLINT_BITS && + cdelta < FLINT_BITS && ddelta < FLINT_BITS && + /* To do: check very carefully that the rounded sums + cannot overflow the guard bits in re_s and im_s. */ + abcdexp < re_exp + FLINT_BITS - 8 && + abcdexp < im_exp + FLINT_BITS - 8 && + /* To do: use with truncation also when this term + is small. */ + abcdexp > re_exp - 2 * FLINT_BITS && + abcdexp > im_exp - 2 * FLINT_BITS) + { + ulong aa[NFLOAT_MAX_LIMBS + 1]; + ulong bb[NFLOAT_MAX_LIMBS + 1]; + ulong cc[NFLOAT_MAX_LIMBS + 1]; + ulong dd[NFLOAT_MAX_LIMBS + 1]; + ulong s[NFLOAT_MAX_LIMBS + 1]; + ulong t[NFLOAT_MAX_LIMBS + 1]; + ulong u[NFLOAT_MAX_LIMBS + 1]; + ulong v[NFLOAT_MAX_LIMBS + 1]; + + int asgnbit, bsgnbit, csgnbit, dsgnbit; + int ssgnbit, tsgnbit, usgnbit; + + asgnbit = NFLOAT_SGNBIT(a); + bsgnbit = NFLOAT_SGNBIT(b); + csgnbit = NFLOAT_SGNBIT(c); + dsgnbit = NFLOAT_SGNBIT(d); + + /* + s = c * (a + b) + t = a * (d - c) + u = b * (c + d) + re = s - u + im = s + t + */ + + aa[0] = mpn_rshift(aa + 1, NFLOAT_D(a), nlimbs, adelta); + bb[0] = mpn_rshift(bb + 1, NFLOAT_D(b), nlimbs, bdelta); + cc[0] = mpn_rshift(cc + 1, NFLOAT_D(c), nlimbs, cdelta); + dd[0] = mpn_rshift(dd + 1, NFLOAT_D(d), nlimbs, ddelta); + + ssgnbit = csgnbit ^ _flint_mpn_signed_add_n(v, aa, asgnbit, bb, bsgnbit, nlimbs + 1); + flint_mpn_mulhigh_n(s, cc, v, nlimbs + 1); + + tsgnbit = asgnbit ^ _flint_mpn_signed_add_n(v, dd, dsgnbit, cc, !csgnbit, nlimbs + 1); + flint_mpn_mulhigh_n(t, aa, v, nlimbs + 1); + + usgnbit = bsgnbit ^ _flint_mpn_signed_add_n(v, cc, csgnbit, dd, dsgnbit, nlimbs + 1); + flint_mpn_mulhigh_n(u, bb, v, nlimbs + 1); + + usgnbit = _flint_mpn_signed_add_n(u, s, ssgnbit, u, !usgnbit, nlimbs + 1); + tsgnbit = _flint_mpn_signed_add_n(t, s, ssgnbit, t, tsgnbit, nlimbs + 1); + + /* + flint_printf("|u| = "); + print_s(u, abexp + cdexp, nlimbs + 1); + flint_printf("|t| = "); + print_s(t, abexp + cdexp, nlimbs + 1); + */ + + _nfloat_vec_dot_add_limbs(re_s, re_exp, u, abcdexp, usgnbit, nlimbs + 1); + _nfloat_vec_dot_add_limbs(im_s, im_exp, t, abcdexp, tsgnbit, nlimbs + 1); + + continue; + } + } + + if (!NFLOAT_IS_ZERO(a)) + { + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul(re_s, re_exp, a, c, 0, nlimbs); + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul(im_s, im_exp, a, d, 0, nlimbs); + } + + if (!NFLOAT_IS_ZERO(b)) + { + if (!NFLOAT_IS_ZERO(d)) + _nfloat_vec_dot_addmul(re_s, re_exp, b, d, 1, nlimbs); + if (!NFLOAT_IS_ZERO(c)) + _nfloat_vec_dot_addmul(im_s, im_exp, b, c, 0, nlimbs); + } } } + /* todo: maybe specialize for nlimbs=1, 2 */ + re_res = res; im_res = (nn_ptr) res + real_part_size; diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c index a1d746bc15..cbc027d9fd 100644 --- a/src/nfloat/test/t-nfloat.c +++ b/src/nfloat/test/t-nfloat.c @@ -106,11 +106,9 @@ TEST_FUNCTION_START(nfloat, state) } reps = (prec <= 256 ? 10000 : 100) * flint_test_multiplier(); - for (i = 0; i < reps; i++) gr_test_approx_dot(ctx, ctx2, 30, tol, state, 0); - reps = (prec <= 256 ? 1000 : 1) * flint_test_multiplier(); for (i = 0; i < reps; i++)