From 96bc07288ab2948817054a4f762e40ca409d0e86 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Tue, 28 May 2024 18:41:03 +0200
Subject: [PATCH 1/2] initial nfloat_complex dot product; refactor test code

---
 src/gr.h                           |  15 +
 src/gr/test_ring.c                 | 505 ++++++++++++++++++++++++++
 src/nfloat.h                       |   3 +
 src/nfloat/complex.c               |   2 +-
 src/nfloat/dot.c                   | 450 +++++++++++++++--------
 src/nfloat/test/t-nfloat.c         | 562 +----------------------------
 src/nfloat/test/t-nfloat_complex.c |  44 ++-
 7 files changed, 875 insertions(+), 706 deletions(-)

diff --git a/src/gr.h b/src/gr.h
index 274d2b6117..728366c7ac 100644
--- a/src/gr.h
+++ b/src/gr.h
@@ -1475,6 +1475,21 @@ void gr_test_ring(gr_ctx_t R, slong iters, int test_flags);
 void gr_test_multiplicative_group(gr_ctx_t R, slong iters, int test_flags);
 void gr_test_floating_point(gr_ctx_t R, slong iters, int test_flags);
 
+int gr_test_cmp_fun(gr_ctx_t R, gr_method_binary_op_get_int op, gr_ctx_t R_ref, flint_rand_t state, int test_flags);
+int gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags);
+int gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags);
+int gr_test_approx_binary_op_type_variants(gr_ctx_t R, const char * opname,
+    gr_method_binary_op gr_op,
+    gr_method_binary_op_ui gr_op_ui,
+    gr_method_binary_op_si gr_op_si,
+    gr_method_binary_op_fmpz gr_op_fmpz,
+    gr_method_binary_op_fmpq gr_op_fmpq,
+    int fused,
+    int small_test_values,
+    gr_srcptr rel_tol, flint_rand_t state, int test_flags);
+int gr_test_approx_dot(gr_ctx_t R, gr_ctx_t R_ref, slong maxlen, gr_srcptr rel_tol, flint_rand_t state, int test_flags);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gr/test_ring.c b/src/gr/test_ring.c
index 7e7f5a4442..78a5c939d3 100644
--- a/src/gr/test_ring.c
+++ b/src/gr/test_ring.c
@@ -3642,6 +3642,511 @@ gr_test_field(gr_ctx_t R, flint_rand_t state, int test_flags)
     return status;
 }
 
+int
+gr_test_cmp_fun(gr_ctx_t R, gr_method_binary_op_get_int op, gr_ctx_t R_ref, flint_rand_t state, int test_flags)
+{
+    int status = GR_SUCCESS;
+    int cmp1, cmp2;
+    gr_ptr a, b, a_ref, b_ref;
+
+    GR_TMP_INIT2(a, b, R);
+    GR_TMP_INIT2(a_ref, b_ref, R_ref);
+
+    status |= gr_randtest(a, state, R);
+    status |= gr_randtest(b, state, R);
+
+    status |= gr_set_other(a_ref, a, R, R_ref);
+    status |= gr_set_other(b_ref, b, R, R_ref);
+
+    status |= op(&cmp1, a, b, R);
+    status |= op(&cmp2, a_ref, b_ref, R_ref);
+
+    if (status == GR_SUCCESS && cmp1 != cmp2)
+        status = GR_TEST_FAIL;
+
+    if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+    {
+        flint_printf("\n");
+        gr_ctx_println(R);
+        gr_ctx_println(R_ref);
+        flint_printf("a = "); gr_println(a, R);
+        flint_printf("b = "); gr_println(b, R);
+        flint_printf("cmp1 = %d\n", cmp1);
+        flint_printf("cmp2 = %d\n", cmp2);
+        flint_printf("\n");
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR2(a, b, R);
+    GR_TMP_CLEAR2(a_ref, b_ref, R_ref);
+
+    return status;
+}
+
+int
+gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
+{
+    int status = GR_SUCCESS;
+    int alias;
+    int cmp;
+    gr_ptr a, b, a_ref, b_ref, rel_err;
+
+    GR_TMP_INIT2(a, b, R);
+    GR_TMP_INIT3(a_ref, b_ref, rel_err, R_ref);
+
+    alias = n_randint(state, 2);
+
+    status |= gr_randtest(a, state, R);
+    status |= gr_randtest(b, state, R);
+
+    status |= gr_set_other(a_ref, a, R, R_ref);
+
+    if (status == GR_SUCCESS)
+    {
+        if (alias == 0)
+        {
+            status |= op(b, a, R);
+            status |= op(b_ref, a_ref, R_ref);
+        }
+        else
+        {
+            status |= gr_set(b, a, R);
+            status |= op(b, b, R);
+            status |= op(b_ref, a_ref, R_ref);
+        }
+
+        if (status == GR_SUCCESS)
+        {
+            status |= gr_set_other(rel_err, b, R, R_ref);
+            status |= gr_sub(rel_err, b_ref, rel_err, R_ref);
+            status |= gr_div(rel_err, rel_err, b_ref, R_ref);
+            status |= gr_abs(rel_err, rel_err, R_ref);
+            status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref);
+
+            if (status == GR_SUCCESS && cmp > 0)
+                status = GR_TEST_FAIL;
+
+            if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+            {
+                flint_printf("\n");
+                gr_ctx_println(R);
+                gr_ctx_println(R_ref);
+                flint_printf("alias: %d\n", alias);
+                flint_printf("Computed:\n");
+                flint_printf("a = "); gr_println(a, R);
+                flint_printf("op(a) = "); gr_println(b, R);
+                flint_printf("Reference:\n");
+                flint_printf("a = "); gr_println(a_ref, R_ref);
+                flint_printf("op(a) = "); gr_println(b_ref, R_ref);
+                flint_printf("\nrel_err = "); gr_println(rel_err, R_ref);
+                flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref);
+                flint_printf("\n");
+            }
+        }
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR2(a, b, R);
+    GR_TMP_CLEAR3(a_ref, b_ref, rel_err, R_ref);
+
+    return status;
+}
+
+int
+gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
+{
+    int status = GR_SUCCESS;
+    int alias;
+    int cmp;
+    gr_ptr a, b, c, a_ref, b_ref, c_ref, rel_err;
+
+    GR_TMP_INIT3(a, b, c, R);
+    GR_TMP_INIT4(a_ref, b_ref, c_ref, rel_err, R_ref);
+
+    alias = n_randint(state, 5);
+
+    status |= gr_randtest(a, state, R);
+    status |= gr_randtest(b, state, R);
+    status |= gr_randtest(c, state, R);
+
+    status |= gr_set_other(a_ref, a, R, R_ref);
+    status |= gr_set_other(b_ref, b, R, R_ref);
+
+    if (status == GR_SUCCESS)
+    {
+        if (alias == 0)
+        {
+            status |= op(c, a, b, R);
+            status |= op(c_ref, a_ref, b_ref, R_ref);
+        }
+        else if (alias == 1)
+        {
+            status |= gr_set(c, a, R);
+            status |= op(c, c, b, R);
+            status |= op(c_ref, a_ref, b_ref, R_ref);
+        }
+        else if (alias == 2)
+        {
+            status |= gr_set(c, b, R);
+            status |= op(c, a, c, R);
+            status |= op(c_ref, a_ref, b_ref, R_ref);
+        }
+        else if (alias == 3)
+        {
+            status |= gr_set(b, a, R);
+            status |= gr_set(b_ref, a_ref, R_ref);
+            status |= op(c, a, a, R);
+            status |= op(c_ref, a_ref, a_ref, R_ref);
+        }
+        else
+        {
+            status |= gr_set(b, a, R);
+            status |= gr_set(c, a, R);
+            status |= gr_set(b_ref, a_ref, R_ref);
+            status |= op(c, c, c, R);
+            status |= op(c_ref, a_ref, a_ref, R_ref);
+        }
+
+        if (status == GR_SUCCESS)
+        {
+            status |= gr_set_other(rel_err, c, R, R_ref);
+
+            status |= gr_sub(rel_err, c_ref, rel_err, R_ref);
+            status |= gr_div(rel_err, rel_err, c_ref, R_ref);
+            status |= gr_abs(rel_err, rel_err, R_ref);
+            status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref);
+
+            if (status == GR_SUCCESS && cmp > 0)
+                status = GR_TEST_FAIL;
+
+            if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+            {
+                flint_printf("\n");
+                gr_ctx_println(R);
+                gr_ctx_println(R_ref);
+                flint_printf("alias: %d\n", alias);
+                flint_printf("Computed:\n");
+                flint_printf("a = "); gr_println(a, R);
+                flint_printf("b = "); gr_println(b, R);
+                flint_printf("a (op) b = "); gr_println(c, R);
+                flint_printf("Reference:\n");
+                flint_printf("a = "); gr_println(a_ref, R_ref);
+                flint_printf("b = "); gr_println(b_ref, R_ref);
+                flint_printf("a (op) b = "); gr_println(c_ref, R_ref);
+                flint_printf("\nrel_err = "); gr_println(rel_err, R_ref);
+                flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref);
+                flint_printf("\n");
+            }
+        }
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR3(a, b, c, R);
+    GR_TMP_CLEAR4(a_ref, b_ref, c_ref, rel_err, R_ref);
+
+    return status;
+}
+
+int
+gr_test_approx_binary_op_type_variants(gr_ctx_t R,
+    const char * opname,
+    gr_method_binary_op gr_op,
+    gr_method_binary_op_ui gr_op_ui,
+    gr_method_binary_op_si gr_op_si,
+    gr_method_binary_op_fmpz gr_op_fmpz,
+    gr_method_binary_op_fmpq gr_op_fmpq,
+    int fused,
+    int small_test_values,
+    gr_srcptr rel_tol, flint_rand_t state, int test_flags)
+{
+    int status, alias, which;
+    gr_ptr x, y, xy1, xy2, err;
+    ulong uy;
+    slong sy;
+    fmpz_t zy;
+    fmpq_t qy;
+
+    GR_TMP_INIT5(x, y, xy1, xy2, err, R);
+    fmpz_init(zy);
+    fmpq_init(qy);
+
+    which = 0;
+
+    if (small_test_values)
+    {
+        uy = n_randint(state, 6);
+        sy = -5 + (slong) n_randint(state, 11);
+        fmpz_randtest(zy, state, 3);
+        fmpq_randtest(qy, state, 3);
+    }
+    else
+    {
+        uy = n_randtest(state);
+        sy = (slong) n_randtest(state);
+
+        if (n_randint(state, 10) == 0)
+        {
+            fmpz_randtest(zy, state, 10000);
+            fmpq_randtest(qy, state, 200);
+        }
+        else
+        {
+            fmpz_randtest(zy, state, 100);
+            fmpq_randtest(qy, state, 100);
+        }
+    }
+
+    for (which = 0; which < 4; which++)
+    {
+        status = GR_SUCCESS;
+        alias = n_randint(state, 2);
+
+        GR_MUST_SUCCEED(gr_randtest(x, state, R));
+        GR_MUST_SUCCEED(gr_randtest(y, state, R));
+        GR_MUST_SUCCEED(gr_randtest(xy1, state, R));
+
+        if (fused && alias)
+            GR_MUST_SUCCEED(gr_set(xy2, x, R));
+        else if (fused)
+            GR_MUST_SUCCEED(gr_set(xy2, xy1, R));
+        else
+            GR_MUST_SUCCEED(gr_randtest(xy2, state, R));
+
+        if (alias)
+            GR_MUST_SUCCEED(gr_set(xy1, x, R));
+
+        if (which == 0)
+        {
+            if (alias)
+                status |= gr_op_ui(xy1, xy1, uy, R);
+            else
+                status |= gr_op_ui(xy1, x, uy, R);
+
+            status |= gr_set_ui(y, uy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+        else if (which == 1)
+        {
+            if (alias)
+                status |= gr_op_si(xy1, xy1, sy, R);
+            else
+                status |= gr_op_si(xy1, x, sy, R);
+            status |= gr_set_si(y, sy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+        else if (which == 2)
+        {
+            if (alias)
+                status |= gr_op_fmpz(xy1, xy1, zy, R);
+            else
+                status |= gr_op_fmpz(xy1, x, zy, R);
+            status |= gr_set_fmpz(y, zy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+        else
+        {
+            if (alias)
+                status |= gr_op_fmpq(xy1, xy1, qy, R);
+            else
+                status |= gr_op_fmpq(xy1, x, qy, R);
+            status |= gr_set_fmpq(y, qy, R);
+
+            if (status == GR_SUCCESS)
+                status |= gr_op(xy2, x, y, R);
+        }
+
+        if (status == GR_SUCCESS)
+        {
+            int cmp;
+
+            status |= gr_sub(err, xy1, xy2, R);
+            status |= gr_div(err, err, xy2, R);
+            status |= gr_abs(err, err, R);
+            status |= gr_cmp(&cmp, err, rel_tol, R);
+
+            if (status == GR_SUCCESS && cmp > 0)
+            {
+                status = GR_TEST_FAIL;
+                break;
+            }
+        }
+    }
+
+    if ((test_flags & GR_TEST_ALWAYS_ABLE) && (status & GR_UNABLE))
+        status = GR_TEST_FAIL;
+
+    if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
+    {
+        flint_printf("\n");
+        flint_printf("%s\n", opname);
+        gr_ctx_println(R);
+        flint_printf("which: %d\n", which);
+        flint_printf("alias: %d\n", alias);
+        flint_printf("x = "); gr_println(x, R);
+        flint_printf("y = "); gr_println(y, R);
+        flint_printf("y (op) y (1) = "); gr_println(xy1, R);
+        flint_printf("x (op) y (2) = "); gr_println(xy2, R);
+        flint_printf("err = "); gr_println(err, R);
+        flint_printf("tol = "); gr_println(rel_tol, R);
+        flint_printf("\n");
+    }
+
+    if (status == GR_TEST_FAIL)
+        flint_abort();
+
+    GR_TMP_CLEAR5(x, y, xy1, xy2, err, R);
+
+    fmpz_clear(zy);
+    fmpq_clear(qy);
+
+    return status;
+}
+
+int
+gr_test_approx_dot(gr_ctx_t R, gr_ctx_t R_ref, slong maxlen, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
+{
+    slong i, len;
+    gr_ptr s0, vec1, vec2, res;
+    int subtract, initial, reverse;
+    gr_ptr avec1, avec2;
+    gr_ptr as0, ares, ares2, amag, err, t;
+    int status = GR_SUCCESS;
+    int cmp;
+
+    len = n_randint(state, maxlen);
+    initial = n_randint(state, 2);
+    subtract = n_randint(state, 2);
+    reverse = n_randint(state, 2);
+
+    vec1 = gr_heap_init_vec(len, R);
+    vec2 = gr_heap_init_vec(len, R);
+    s0 = gr_heap_init(R);
+    res = gr_heap_init(R);
+
+    avec1 = gr_heap_init_vec(len, R_ref);
+    avec2 = gr_heap_init_vec(len, R_ref);
+    GR_TMP_INIT4(as0, ares, ares2, amag, R_ref);
+    GR_TMP_INIT2(t, err, R_ref);
+
+    if (initial)
+    {
+        status |= gr_randtest(s0, state, R);
+        status |= gr_set_other(as0, s0, R, R_ref);
+
+        status |= gr_set(ares, as0, R_ref);
+        status |= gr_abs(t, as0, R_ref);
+        status |= gr_add(amag, amag, t, R_ref);
+    }
+
+    for (i = 0; i < len; i++)
+    {
+        if (2 * i >= len && n_randint(state, 2))
+        {
+            status |= gr_set(GR_ENTRY(vec1, i, R->sizeof_elem), GR_ENTRY(vec1, len - 1 - i, R->sizeof_elem),  R);
+            status |= gr_neg(GR_ENTRY(vec2, i, R->sizeof_elem), GR_ENTRY(vec2, len - 1 - i, R->sizeof_elem),  R);
+        }
+        else
+        {
+            status |= gr_randtest(GR_ENTRY(vec1, i, R->sizeof_elem), state, R);
+            status |= gr_randtest(GR_ENTRY(vec2, i, R->sizeof_elem), state, R);
+        }
+
+        status |= gr_set_other(GR_ENTRY(avec1, i, R_ref->sizeof_elem), GR_ENTRY(vec1, i, R->sizeof_elem), R, R_ref);
+        status |= gr_set_other(GR_ENTRY(avec2, i, R_ref->sizeof_elem), GR_ENTRY(vec2, i, R->sizeof_elem), R, R_ref);
+    }
+
+    GR_MUST_SUCCEED(status);
+
+    for (i = 0; i < len; i++)
+    {
+        status |= gr_mul(t, GR_ENTRY(avec1, i, R_ref->sizeof_elem), GR_ENTRY(avec2, reverse ? len - 1 - i : i, R_ref->sizeof_elem), R_ref);
+
+        if (subtract)
+            status |= gr_sub(ares, ares, t, R_ref);
+        else
+            status |= gr_add(ares, ares, t, R_ref);
+
+        status |= gr_abs(t, t, R_ref);
+        status |= gr_add(amag, amag, t, R_ref);
+    }
+
+    GR_MUST_SUCCEED(status);
+
+    if (reverse)
+        GR_MUST_SUCCEED(_gr_vec_dot_rev(res, initial ? s0 : NULL, subtract, vec1, vec2, len, R));
+    else
+        GR_MUST_SUCCEED(_gr_vec_dot(res, initial ? s0 : NULL, subtract, vec1, vec2, len, R));
+
+    status |= gr_set_other(ares2, res, R, R_ref);
+    status |= gr_sub(err, ares, ares2, R_ref);
+    status |= gr_abs(err, err, R_ref);
+    status |= gr_mul(t, amag, rel_tol, R_ref);
+    status |= gr_abs(t, t, R_ref);
+    GR_MUST_SUCCEED(status);
+
+    status |= gr_cmpabs(&cmp, err, t, R_ref);
+
+    if (status == GR_SUCCESS && cmp > 0)
+        status = GR_TEST_FAIL;
+
+    if (status == GR_TEST_FAIL)
+    {
+        flint_printf("FAIL: dot\n");
+        gr_ctx_println(R);
+        gr_ctx_println(R_ref);
+
+        flint_printf("reverse = %d, subtract = %d\n", reverse, subtract);
+
+        if (initial)
+        {
+            flint_printf("\n\ninitial = ");
+            gr_println(as0, R_ref);
+        }
+
+        flint_printf("\n\nvec1 = ");
+        _gr_vec_print(vec1, len, R);
+        flint_printf("\n\nvec2 = ");
+        _gr_vec_print(vec2, len, R);
+        flint_printf("\n\nres (ref) = \n");
+        gr_println(ares, R_ref);
+        flint_printf("\nres (computed) = \n");
+        gr_println(ares2, R_ref);
+        flint_printf("\nrel_tol = \n");
+        gr_println(rel_tol, R_ref);
+        flint_printf("\nabs_tol = \n");
+        gr_println(t, R_ref);
+        flint_printf("\nerr = \n");
+        gr_println(err, R_ref);
+        flint_printf("\n");
+
+        flint_abort();
+    }
+
+    gr_heap_clear_vec(vec1, len, R);
+    gr_heap_clear_vec(vec2, len, R);
+    gr_heap_clear(s0, R);
+    gr_heap_clear(res, R);
+
+    gr_heap_clear_vec(avec1, len, R_ref);
+    gr_heap_clear_vec(avec2, len, R_ref);
+    GR_TMP_CLEAR4(as0, ares, ares2, amag, R_ref);
+    GR_TMP_CLEAR2(t, err, R_ref);
+
+    return status;
+}
+
 void
 gr_test_iter(gr_ctx_t R, flint_rand_t state, const char * descr, gr_test_function func, slong iters, int test_flags)
 {
diff --git a/src/nfloat.h b/src/nfloat.h
index 199402794c..00e41183aa 100644
--- a/src/nfloat.h
+++ b/src/nfloat.h
@@ -550,6 +550,9 @@ int _nfloat_complex_vec_set(nfloat_complex_ptr res, nfloat_complex_srcptr x, slo
 int _nfloat_complex_vec_add(nfloat_complex_ptr res, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx);
 int _nfloat_complex_vec_sub(nfloat_complex_ptr res, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx);
 
+int _nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx);
+int _nfloat_complex_vec_dot_rev(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/nfloat/complex.c b/src/nfloat/complex.c
index 6ec103ad2b..3d4dc0bfd5 100644
--- a/src/nfloat/complex.c
+++ b/src/nfloat/complex.c
@@ -1664,9 +1664,9 @@ gr_method_tab_input _nfloat_complex_methods_input[] =
     {GR_METHOD_VEC_MUL_SCALAR,          (gr_funcptr) _nfloat_complex_vec_mul_scalar},
     {GR_METHOD_VEC_ADDMUL_SCALAR,          (gr_funcptr) _nfloat_complex_vec_addmul_scalar},
     {GR_METHOD_VEC_SUBMUL_SCALAR,          (gr_funcptr) _nfloat_complex_vec_submul_scalar},
+*/
     {GR_METHOD_VEC_DOT,         (gr_funcptr) _nfloat_complex_vec_dot},
     {GR_METHOD_VEC_DOT_REV,     (gr_funcptr) _nfloat_complex_vec_dot_rev},
-*/
 /*
     {GR_METHOD_POLY_MULLOW,     (gr_funcptr) nfloat_complex_poly_mullow},
     {GR_METHOD_POLY_ROOTS_OTHER,(gr_funcptr) nfloat_complex_poly_roots_other},
diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c
index 65f9b8ec6a..32c678c200 100644
--- a/src/nfloat/dot.c
+++ b/src/nfloat/dot.c
@@ -11,6 +11,9 @@
 
 #include "nfloat.h"
 
+/* TODO: don't use ctx->sizeof_elem, to allow real dot products to be
+   called from complex context objects */
+
 /* Having some dummy code in the continue block makes the loop go faster. Don't ask me why. */
 #if 1
 #define DUMMY_OP add_ssaaaa(s1, s0, s1, s0, 0, 0)
@@ -90,6 +93,128 @@
 
 #endif
 
+void
+_nfloat_vec_dot_set_initial(ulong * s, slong sexp, nfloat_srcptr x, int subtract, slong nlimbs)
+{
+    slong xexp, delta;
+    slong delta_limbs, delta_bits;
+    int xsgnbit;
+
+    xexp = NFLOAT_EXP(x);
+    xsgnbit = NFLOAT_SGNBIT(x) ^ subtract;
+    delta = sexp - xexp;
+
+    if (delta < FLINT_BITS)
+    {
+        FLINT_ASSERT(delta != 0);
+
+        mpn_rshift(s + 1, NFLOAT_D(x), nlimbs, delta);
+        s[0] = NFLOAT_D(x)[0] << (FLINT_BITS - delta);
+    }
+    else
+    {
+        delta_limbs = delta / FLINT_BITS;
+        delta_bits = delta % FLINT_BITS;
+
+        if (delta_limbs < nlimbs + 1)
+        {
+            if (delta_bits == 0)
+                flint_mpn_copyi(s, NFLOAT_D(x) + delta_limbs - 1, nlimbs + 1 - delta_limbs);
+            else
+                mpn_rshift(s, NFLOAT_D(x) + delta_limbs - 1, nlimbs + 1 - delta_limbs, delta_bits);
+        }
+    }
+
+    if (xsgnbit)
+        mpn_neg(s, s, nlimbs + 1);
+}
+
+#include <math.h>
+
+void
+_nfloat_vec_dot_addmul(ulong * s, slong sexp, nfloat_srcptr xi, nfloat_srcptr yi, int subtract, slong nlimbs)
+{
+    slong xexp, delta;
+    slong delta_limbs, delta_bits;
+    int xsgnbit;
+    ulong t[NFLOAT_MAX_LIMBS + 1];
+
+    xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+    xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi) ^ subtract;
+    delta = sexp - xexp;
+
+    if (delta < FLINT_BITS)
+    {
+        t[0] = flint_mpn_mulhigh_n(t + 1, NFLOAT_D(xi), NFLOAT_D(yi), nlimbs);
+
+        if (nlimbs == 3)
+        {
+            if (xsgnbit)
+                sub_ddddmmmmssss(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0],
+                    t[3] >> delta,
+                    (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
+                    (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
+                    (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
+            else
+                add_ssssaaaaaaaa(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0],
+                    t[3] >> delta,
+                    (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
+                    (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
+                    (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
+        }
+        else if (nlimbs == 4)
+        {
+            if (xsgnbit)
+                sub_dddddmmmmmsssss(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0],
+                    t[4] >> delta,
+                    (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)),
+                    (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
+                    (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
+                    (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
+            else
+                add_sssssaaaaaaaaaa(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0],
+                    t[4] >> delta,
+                    (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)),
+                    (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
+                    (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
+                    (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
+        }
+        else
+        {
+            mpn_rshift(t, t, nlimbs + 1, delta);
+
+            if (xsgnbit)
+                mpn_sub_n(s, s, t, nlimbs + 1);
+            else
+                mpn_add_n(s, s, t, nlimbs + 1);
+        }
+    }
+    else
+    {
+        delta_limbs = delta / FLINT_BITS;
+        delta_bits = delta % FLINT_BITS;
+
+        /* alternative criterion: if (delta < (FLINT_BITS * nlimbs) + 2 * pad_bits) */
+        if (delta_limbs < nlimbs + 1)
+        {
+            /* todo: squaring case */
+            flint_mpn_mulhigh_n(t, NFLOAT_D(xi) + delta_limbs - 1, NFLOAT_D(yi) + delta_limbs - 1, nlimbs + 1 - delta_limbs);
+
+            if (delta_bits != 0)
+                mpn_rshift(t, t, nlimbs + 1 - delta_limbs, delta_bits);
+
+            if (xsgnbit)
+                mpn_sub(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs);
+            else
+                mpn_add(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs);
+        }
+        else
+        {
+            /* Skip term. */
+        }
+    }
+}
+
 int
 __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, slong sizeof_xstep, nfloat_srcptr y, slong sizeof_ystep, slong len, gr_ctx_t ctx)
 {
@@ -341,11 +466,12 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
             {
                 xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
 
-#if 1
+#if 0
                 FLINT_MPN_MUL_2X2H(t3, t2, t1, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]);
                 (void) t0;
 #else
                 FLINT_MPN_MUL_2X2(t3, t2, t1, t0, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]);
+                (void) t0;
 #endif
 
                 if (xsgnbit)
@@ -447,12 +573,10 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
 
     if (!(NFLOAT_CTX_FLAGS(ctx) & (NFLOAT_ALLOW_INF | NFLOAT_ALLOW_NAN)))
     {
-        slong i, xexp, delta, sexp, norm, pad_bits;
+        slong i, xexp, sexp, pad_bits;
         int xsgnbit;
-        ulong t[NFLOAT_MAX_LIMBS + 1];
         ulong s[NFLOAT_MAX_LIMBS + 1];
         nfloat_srcptr xi, yi;
-        slong n;
 
         slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
 
@@ -479,182 +603,214 @@ __nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_src
         sexp += pad_bits;
 
         if (initial != NULL && !NFLOAT_IS_ZERO(initial))
+            _nfloat_vec_dot_set_initial(s, sexp, initial, subtract, nlimbs);
+
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
         {
-            xexp = NFLOAT_EXP(initial);
-            xsgnbit = NFLOAT_SGNBIT(initial) ^ subtract;
-            delta = sexp - xexp;
+            if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
+                continue;
 
-            if (delta < FLINT_BITS)
-            {
-                FLINT_ASSERT(delta != 0);
+            _nfloat_vec_dot_addmul(s, sexp, xi, yi, 0, nlimbs);
+        }
 
-                mpn_rshift(s + 1, NFLOAT_D(initial), nlimbs, delta);
-                s[0] = NFLOAT_D(initial)[0] << (FLINT_BITS - delta);
-                if (xsgnbit)
-                    mpn_neg(s, s, nlimbs + 1);
-            }
-            else
-            {
-                slong delta_limbs, delta_bits;
+        if (LIMB_MSB_IS_SET(s[nlimbs]))
+        {
+            mpn_neg(s, s, nlimbs + 1);
+            xsgnbit = !subtract;
+        }
+        else
+        {
+            xsgnbit = subtract;
+        }
 
-                delta_limbs = delta / FLINT_BITS;
-                delta_bits = delta % FLINT_BITS;
+        return nfloat_set_mpn_2exp(res, s, nlimbs + 1, sexp, xsgnbit, ctx);
+    }
 
-                if (delta_limbs < nlimbs + 1)
-                {
-                    if (delta_bits == 0)
-                        flint_mpn_copyi(s, NFLOAT_D(initial) + delta_limbs - 1, nlimbs + 1 - delta_limbs);
-                    else
-                        mpn_rshift(s, NFLOAT_D(initial) + delta_limbs - 1, nlimbs + 1 - delta_limbs, delta_bits);
+    return GR_UNABLE;
+}
 
-                    if (xsgnbit)
-                        mpn_neg(s, s, nlimbs + 1);
-                }
-            }
-        }
+int
+_nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+{
+    return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, y, ctx->sizeof_elem, len, ctx);
+}
 
-        for (i = 0, xi = x, yi = y; i < len; i++, xi = (char *) xi + sizeof_xstep, yi = (char *) yi + sizeof_ystep)
-        {
-            if (NFLOAT_IS_ZERO(xi) || NFLOAT_IS_ZERO(yi))
-                continue;
+int
+_nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+{
+    return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, GR_ENTRY(y, len - 1, ctx->sizeof_elem), -ctx->sizeof_elem, len, ctx);
+}
 
-            xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
-            xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi);
-            delta = sexp - xexp;
 
-            if (delta < FLINT_BITS)
-            {
-                t[0] = flint_mpn_mulhigh_n(t + 1, NFLOAT_D(xi), NFLOAT_D(yi), nlimbs);
+void
+print_s(ulong * s, slong sexp, slong nlimbs)
+{
+    ulong x = s[nlimbs - 1];
+    double c = 1.0;
 
-                if (nlimbs == 3)
-                {
-                    if (xsgnbit)
-                        sub_ddddmmmmssss(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0],
-                            t[3] >> delta,
-                            (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
-                            (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
-                            (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
-                    else
-                        add_ssssaaaaaaaa(s[3], s[2], s[1], s[0], s[3], s[2], s[1], s[0],
-                            t[3] >> delta,
-                            (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
-                            (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
-                            (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
-                }
-                else if (nlimbs == 4)
-                {
-                    if (xsgnbit)
-                        sub_dddddmmmmmsssss(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0],
-                            t[4] >> delta,
-                            (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)),
-                            (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
-                            (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
-                            (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
-                    else
-                        add_sssssaaaaaaaaaa(s[4], s[3], s[2], s[1], s[0], s[4], s[3], s[2], s[1], s[0],
-                            t[4] >> delta,
-                            (t[3] >> delta) | (t[4] << (FLINT_BITS - delta)),
-                            (t[2] >> delta) | (t[3] << (FLINT_BITS - delta)),
-                            (t[1] >> delta) | (t[2] << (FLINT_BITS - delta)),
-                            (t[0] >> delta) | (t[1] << (FLINT_BITS - delta)));
-                }
-                else
-                {
-                    mpn_rshift(t, t, nlimbs + 1, delta);
+    if (LIMB_MSB_IS_SET(x))
+    {
+        x = -x;
+        c = -1.0;
+    }
 
-                    if (xsgnbit)
-                        mpn_sub_n(s, s, t, nlimbs + 1);
-                    else
-                        mpn_add_n(s, s, t, nlimbs + 1);
-                }
-            }
-            else
-            {
-                slong delta_limbs, delta_bits;
+    flint_printf("SUM %.15e\n", (double) x * ldexp(1.0, sexp - FLINT_BITS) * c);
+}
 
-                delta_limbs = delta / FLINT_BITS;
-                delta_bits = delta % FLINT_BITS;
+int
+__nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, slong xstep, nfloat_complex_srcptr y, slong ystep, slong len, gr_ctx_t ctx)
+{
+    slong i, pad_bits;
+    slong re_exp, im_exp;
+    ulong re_s[NFLOAT_MAX_LIMBS + 1];
+    ulong im_s[NFLOAT_MAX_LIMBS + 1];
+    nfloat_srcptr a, b, c, d;
+    nfloat_srcptr re_initial, im_initial;
+    nfloat_ptr re_res, im_res;
+    slong aexp, bexp, cexp, dexp;
+    nfloat_srcptr xi, yi;
+    int status = GR_SUCCESS;
+    int re_sgnbit, im_sgnbit;
+
+    slong nlimbs = NFLOAT_CTX_NLIMBS(ctx);
+    slong real_part_size = NFLOAT_CTX_DATA_NLIMBS(ctx);
+
+    re_exp = WORD_MIN;
+    im_exp = WORD_MIN;
+
+    re_initial = NFLOAT_COMPLEX_RE(initial, ctx);
+    im_initial = NFLOAT_COMPLEX_IM(initial, ctx);
+
+    if (initial != NULL)
+    {
+        if (!NFLOAT_IS_ZERO(re_initial))
+            re_exp = NFLOAT_EXP(re_initial);
+        if (!NFLOAT_IS_ZERO(im_initial))
+            im_exp = NFLOAT_EXP(im_initial);
+    }
 
-                /* alternative criterion: if (delta < (FLINT_BITS * nlimbs) + 2 * pad_bits) */
-                if (delta_limbs < nlimbs + 1)
-                {
-                    /* todo: squaring case */
-                    flint_mpn_mulhigh_n(t, NFLOAT_D(xi) + delta_limbs - 1, NFLOAT_D(yi) + delta_limbs - 1, nlimbs + 1 - delta_limbs);
+    for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep)
+    {
+        a = xi;
+        b = (ulong *) xi + real_part_size;
+        c = yi;
+        d = (ulong *) yi + real_part_size;
+
+        aexp = NFLOAT_EXP(a);
+        bexp = NFLOAT_EXP(b);
+        cexp = NFLOAT_EXP(c);
+        dexp = NFLOAT_EXP(d);
+
+        if (aexp != NFLOAT_EXP_ZERO && cexp != NFLOAT_EXP_ZERO)
+            re_exp = FLINT_MAX(re_exp, aexp + cexp);
+        if (bexp != NFLOAT_EXP_ZERO && dexp != NFLOAT_EXP_ZERO)
+            re_exp = FLINT_MAX(re_exp, bexp + dexp);
+        if (aexp != NFLOAT_EXP_ZERO && dexp != NFLOAT_EXP_ZERO)
+            im_exp = FLINT_MAX(im_exp, aexp + dexp);
+        if (bexp != NFLOAT_EXP_ZERO && cexp != NFLOAT_EXP_ZERO)
+            im_exp = FLINT_MAX(im_exp, bexp + cexp);
+    }
 
-                    if (delta_bits != 0)
-                        mpn_rshift(t, t, nlimbs + 1 - delta_limbs, delta_bits);
+    if (re_exp == WORD_MIN && im_exp == WORD_MIN)
+        return nfloat_complex_zero(res, ctx);
 
-                    if (xsgnbit)
-                        mpn_sub(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs);
-                    else
-                        mpn_add(s, s, nlimbs + 1, t, nlimbs + 1 - delta_limbs);
-                }
-                else
-                {
-                    /* Skip term. */
-                }
-            }
-        }
+    pad_bits = FLINT_BIT_COUNT(len + 1) + 2;
+    if (re_exp != WORD_MIN) re_exp += pad_bits;
+    if (im_exp != WORD_MIN) im_exp += pad_bits;
 
-        if (LIMB_MSB_IS_SET(s[nlimbs]))
-        {
-            mpn_neg(s, s, nlimbs + 1);
-            xsgnbit = 1;
-        }
-        else
-        {
-            xsgnbit = 0;
-        }
+    flint_mpn_zero(re_s, nlimbs + 1);
+    flint_mpn_zero(im_s, nlimbs + 1);
 
-        NFLOAT_SGNBIT(res) = xsgnbit ^ subtract;
+    if (initial != NULL)
+    {
+        if (!NFLOAT_IS_ZERO(re_initial))
+            _nfloat_vec_dot_set_initial(re_s, re_exp, re_initial, subtract, nlimbs);
+        if (!NFLOAT_IS_ZERO(im_initial))
+            _nfloat_vec_dot_set_initial(im_s, im_exp, im_initial, subtract, nlimbs);
+    }
 
-        n = nlimbs + 1;
-        MPN_NORM(s, n);
+    for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep)
+    {
+        a = xi;
+        b = (ulong *) xi + real_part_size;
+        c = yi;
+        d = (ulong *) yi + real_part_size;
 
-        xexp = sexp - (nlimbs + 1 - n) * FLINT_BITS;
+/*
+        aexp = NFLOAT_EXP(a);
+        bexp = NFLOAT_EXP(b);
+        cexp = NFLOAT_EXP(c);
+        dexp = NFLOAT_EXP(d);
+
+        asgnbit = NFLOAT_SGNBIT(a);
+        bsgnbit = NFLOAT_SGNBIT(b);
+        csgnbit = NFLOAT_SGNBIT(c);
+        dsgnbit = NFLOAT_SGNBIT(d);
+
+        abexp = FLINT_MAX(aexp, bexp) + 2;
+        cdexp = FLINT_MAX(cexp, dexp) + 2;
+
+        adelta = abexp - aexp;
+        bdelta = abexp - bexp;
+        cdelta = cdexp - cexp;
+        ddelta = cdexp - dexp;
+*/
 
-        if (n == nlimbs + 1)
+        if (!NFLOAT_IS_ZERO(a))
         {
-            norm = flint_clz(s[n - 1]);
-            if (norm)
-            {
-                mpn_lshift(NFLOAT_D(res), s + 1, nlimbs, norm);
-                NFLOAT_D(res)[0] |= (s[0] >> (FLINT_BITS - norm));
-            }
-            else
-                flint_mpn_copyi(NFLOAT_D(res), s + 1, nlimbs);
-            xexp -= norm;
+            if (!NFLOAT_IS_ZERO(c))
+                _nfloat_vec_dot_addmul(re_s, re_exp, a, c, 0, nlimbs);
+            if (!NFLOAT_IS_ZERO(d))
+                _nfloat_vec_dot_addmul(im_s, im_exp, a, d, 0, nlimbs);
         }
-        else
-        {
-            if (n == 0)
-                return nfloat_zero(res, ctx);
 
-            norm = flint_clz(s[n - 1]);
-            if (norm)
-                mpn_lshift(NFLOAT_D(res) + nlimbs - n, s, n, norm);
-            else
-                flint_mpn_copyi(NFLOAT_D(res) + nlimbs - n, s, n);
-            flint_mpn_zero(NFLOAT_D(res), nlimbs - n);
-            xexp -= norm;
+        if (!NFLOAT_IS_ZERO(b))
+        {
+            if (!NFLOAT_IS_ZERO(d))
+                _nfloat_vec_dot_addmul(re_s, re_exp, b, d, 1, nlimbs);
+            if (!NFLOAT_IS_ZERO(c))
+                _nfloat_vec_dot_addmul(im_s, im_exp, b, c, 0, nlimbs);
         }
+    }
 
-        NFLOAT_EXP(res) = xexp;
-        NFLOAT_HANDLE_UNDERFLOW_OVERFLOW(res, ctx);
-        return GR_SUCCESS;
+    re_res = res;
+    im_res = (nn_ptr) res + real_part_size;
+
+    if (re_exp == WORD_MIN)
+    {
+        status |= nfloat_zero(re_res, ctx);
+    }
+    else
+    {
+        re_sgnbit = LIMB_MSB_IS_SET(re_s[nlimbs]);
+        if (re_sgnbit)
+            mpn_neg(re_s, re_s, nlimbs + 1);
+        status |= nfloat_set_mpn_2exp(re_res, re_s, nlimbs + 1, re_exp, re_sgnbit ^ subtract, ctx);
     }
 
-    return GR_UNABLE;
+    if (im_exp == WORD_MIN)
+    {
+        status |= nfloat_zero(im_res, ctx);
+    }
+    else
+    {
+        im_sgnbit = LIMB_MSB_IS_SET(im_s[nlimbs]);
+        if (im_sgnbit)
+            mpn_neg(im_s, im_s, nlimbs + 1);
+        status |= nfloat_set_mpn_2exp(im_res, im_s, nlimbs + 1, im_exp, im_sgnbit ^ subtract, ctx);
+    }
+
+    return status;
 }
 
 int
-_nfloat_vec_dot(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+_nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx)
 {
-    return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, y, ctx->sizeof_elem, len, ctx);
+    return __nfloat_complex_vec_dot(res, initial, subtract, x, NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), y, NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), len, ctx);
 }
 
 int
-_nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_srcptr x, nfloat_srcptr y, slong len, gr_ctx_t ctx)
+_nfloat_complex_vec_dot_rev(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, nfloat_complex_srcptr y, slong len, gr_ctx_t ctx)
 {
-    return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, GR_ENTRY(y, len - 1, ctx->sizeof_elem), -ctx->sizeof_elem, len, ctx);
+    return __nfloat_complex_vec_dot(res, initial, subtract, x, NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), GR_ENTRY(y, len - 1, ctx->sizeof_elem), -NFLOAT_COMPLEX_CTX_DATA_NLIMBS(ctx), len, ctx);
 }
diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c
index 05fa2b3977..a1d746bc15 100644
--- a/src/nfloat/test/t-nfloat.c
+++ b/src/nfloat/test/t-nfloat.c
@@ -13,522 +13,11 @@
 #include "fmpq.h"
 #include "arf.h"
 #include "acf.h"
+#include "gr.h"
 #include "gr_vec.h"
 #include "gr_special.h"
 #include "nfloat.h"
 
-int
-gr_test_cmp_fun(gr_ctx_t R, gr_method_binary_op_get_int op, gr_ctx_t R_ref, flint_rand_t state, int test_flags)
-{
-    int status = GR_SUCCESS;
-    int cmp1, cmp2;
-    gr_ptr a, b, a_ref, b_ref;
-
-    GR_TMP_INIT2(a, b, R);
-    GR_TMP_INIT2(a_ref, b_ref, R_ref);
-
-    status |= gr_randtest(a, state, R);
-    status |= gr_randtest(b, state, R);
-
-    status |= gr_set_other(a_ref, a, R, R_ref);
-    status |= gr_set_other(b_ref, b, R, R_ref);
-
-    status |= op(&cmp1, a, b, R);
-    status |= op(&cmp2, a_ref, b_ref, R_ref);
-
-    if (status == GR_SUCCESS && cmp1 != cmp2)
-        status = GR_TEST_FAIL;
-
-    if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
-    {
-        flint_printf("\n");
-        gr_ctx_println(R);
-        gr_ctx_println(R_ref);
-        flint_printf("a = "); gr_println(a, R);
-        flint_printf("b = "); gr_println(b, R);
-        flint_printf("cmp1 = %d\n", cmp1);
-        flint_printf("cmp2 = %d\n", cmp2);
-        flint_printf("\n");
-    }
-
-    if (status == GR_TEST_FAIL)
-        flint_abort();
-
-    GR_TMP_CLEAR2(a, b, R);
-    GR_TMP_CLEAR2(a_ref, b_ref, R_ref);
-
-    return status;
-}
-
-int
-gr_test_approx_unary_op(gr_ctx_t R, gr_method_unary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
-{
-    int status = GR_SUCCESS;
-    int alias;
-    int cmp;
-    gr_ptr a, b, a_ref, b_ref, rel_err;
-
-    GR_TMP_INIT2(a, b, R);
-    GR_TMP_INIT3(a_ref, b_ref, rel_err, R_ref);
-
-    alias = n_randint(state, 2);
-
-    status |= gr_randtest(a, state, R);
-    status |= gr_randtest(b, state, R);
-
-    status |= gr_set_other(a_ref, a, R, R_ref);
-
-    if (status == GR_SUCCESS)
-    {
-        if (alias == 0)
-        {
-            status |= op(b, a, R);
-            status |= op(b_ref, a_ref, R_ref);
-        }
-        else
-        {
-            status |= gr_set(b, a, R);
-            status |= op(b, b, R);
-            status |= op(b_ref, a_ref, R_ref);
-        }
-
-        if (status == GR_SUCCESS)
-        {
-            status |= gr_set_other(rel_err, b, R, R_ref);
-            status |= gr_sub(rel_err, b_ref, rel_err, R_ref);
-            status |= gr_div(rel_err, rel_err, b_ref, R_ref);
-            status |= gr_abs(rel_err, rel_err, R_ref);
-            status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref);
-
-            if (status == GR_SUCCESS && cmp > 0)
-                status = GR_TEST_FAIL;
-
-            if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
-            {
-                flint_printf("\n");
-                gr_ctx_println(R);
-                gr_ctx_println(R_ref);
-                flint_printf("alias: %d\n", alias);
-                flint_printf("Computed:\n");
-                flint_printf("a = "); gr_println(a, R);
-                flint_printf("op(a) = "); gr_println(b, R);
-                flint_printf("Reference:\n");
-                flint_printf("a = "); gr_println(a_ref, R_ref);
-                flint_printf("op(a) = "); gr_println(b_ref, R_ref);
-                flint_printf("\nrel_err = "); gr_println(rel_err, R_ref);
-                flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref);
-                flint_printf("\n");
-            }
-        }
-    }
-
-    if (status == GR_TEST_FAIL)
-        flint_abort();
-
-    GR_TMP_CLEAR2(a, b, R);
-    GR_TMP_CLEAR3(a_ref, b_ref, rel_err, R_ref);
-
-    return status;
-}
-
-int
-gr_test_approx_binary_op(gr_ctx_t R, gr_method_binary_op op, gr_ctx_t R_ref, gr_srcptr rel_tol, flint_rand_t state, int test_flags)
-{
-    int status = GR_SUCCESS;
-    int alias;
-    int cmp;
-    gr_ptr a, b, c, a_ref, b_ref, c_ref, rel_err;
-
-    GR_TMP_INIT3(a, b, c, R);
-    GR_TMP_INIT4(a_ref, b_ref, c_ref, rel_err, R_ref);
-
-    alias = n_randint(state, 5);
-
-    status |= gr_randtest(a, state, R);
-    status |= gr_randtest(b, state, R);
-    status |= gr_randtest(c, state, R);
-
-    status |= gr_set_other(a_ref, a, R, R_ref);
-    status |= gr_set_other(b_ref, b, R, R_ref);
-
-    if (status == GR_SUCCESS)
-    {
-        if (alias == 0)
-        {
-            status |= op(c, a, b, R);
-            status |= op(c_ref, a_ref, b_ref, R_ref);
-        }
-        else if (alias == 1)
-        {
-            status |= gr_set(c, a, R);
-            status |= op(c, c, b, R);
-            status |= op(c_ref, a_ref, b_ref, R_ref);
-        }
-        else if (alias == 2)
-        {
-            status |= gr_set(c, b, R);
-            status |= op(c, a, c, R);
-            status |= op(c_ref, a_ref, b_ref, R_ref);
-        }
-        else if (alias == 3)
-        {
-            status |= gr_set(b, a, R);
-            status |= gr_set(b_ref, a_ref, R_ref);
-            status |= op(c, a, a, R);
-            status |= op(c_ref, a_ref, a_ref, R_ref);
-        }
-        else
-        {
-            status |= gr_set(b, a, R);
-            status |= gr_set(c, a, R);
-            status |= gr_set(b_ref, a_ref, R_ref);
-            status |= op(c, c, c, R);
-            status |= op(c_ref, a_ref, a_ref, R_ref);
-        }
-
-        if (status == GR_SUCCESS)
-        {
-            status |= gr_set_other(rel_err, c, R, R_ref);
-
-            status |= gr_sub(rel_err, c_ref, rel_err, R_ref);
-            status |= gr_div(rel_err, rel_err, c_ref, R_ref);
-            status |= gr_abs(rel_err, rel_err, R_ref);
-            status |= gr_cmp(&cmp, rel_err, rel_tol, R_ref);
-
-            if (status == GR_SUCCESS && cmp > 0)
-                status = GR_TEST_FAIL;
-
-            if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
-            {
-                flint_printf("\n");
-                gr_ctx_println(R);
-                gr_ctx_println(R_ref);
-                flint_printf("alias: %d\n", alias);
-                flint_printf("Computed:\n");
-                flint_printf("a = "); gr_println(a, R);
-                flint_printf("b = "); gr_println(b, R);
-                flint_printf("a (op) b = "); gr_println(c, R);
-                flint_printf("Reference:\n");
-                flint_printf("a = "); gr_println(a_ref, R_ref);
-                flint_printf("b = "); gr_println(b_ref, R_ref);
-                flint_printf("a (op) b = "); gr_println(c_ref, R_ref);
-                flint_printf("\nrel_err = "); gr_println(rel_err, R_ref);
-                flint_printf("\nrel_tol = "); gr_println(rel_tol, R_ref);
-                flint_printf("\n");
-            }
-        }
-    }
-
-    if (status == GR_TEST_FAIL)
-        flint_abort();
-
-    GR_TMP_CLEAR3(a, b, c, R);
-    GR_TMP_CLEAR4(a_ref, b_ref, c_ref, rel_err, R_ref);
-
-    return status;
-}
-
-int
-gr_test_approx_binary_op_type_variants(gr_ctx_t R,
-    const char * opname,
-    gr_method_binary_op gr_op,
-    gr_method_binary_op_ui gr_op_ui,
-    gr_method_binary_op_si gr_op_si,
-    gr_method_binary_op_fmpz gr_op_fmpz,
-    gr_method_binary_op_fmpq gr_op_fmpq,
-    int fused,
-    int small_test_values,
-    gr_srcptr rel_tol, flint_rand_t state, int test_flags)
-{
-    int status, alias, which;
-    gr_ptr x, y, xy1, xy2, err;
-    ulong uy;
-    slong sy;
-    fmpz_t zy;
-    fmpq_t qy;
-
-    GR_TMP_INIT5(x, y, xy1, xy2, err, R);
-    fmpz_init(zy);
-    fmpq_init(qy);
-
-    which = 0;
-
-    if (small_test_values)
-    {
-        uy = n_randint(state, 6);
-        sy = -5 + (slong) n_randint(state, 11);
-        fmpz_randtest(zy, state, 3);
-        fmpq_randtest(qy, state, 3);
-    }
-    else
-    {
-        uy = n_randtest(state);
-        sy = (slong) n_randtest(state);
-
-        if (n_randint(state, 10) == 0)
-        {
-            fmpz_randtest(zy, state, 10000);
-            fmpq_randtest(qy, state, 200);
-        }
-        else
-        {
-            fmpz_randtest(zy, state, 100);
-            fmpq_randtest(qy, state, 100);
-        }
-    }
-
-    for (which = 0; which < 4; which++)
-    {
-        status = GR_SUCCESS;
-        alias = n_randint(state, 2);
-
-        GR_MUST_SUCCEED(gr_randtest(x, state, R));
-        GR_MUST_SUCCEED(gr_randtest(y, state, R));
-        GR_MUST_SUCCEED(gr_randtest(xy1, state, R));
-
-        if (fused && alias)
-            GR_MUST_SUCCEED(gr_set(xy2, x, R));
-        else if (fused)
-            GR_MUST_SUCCEED(gr_set(xy2, xy1, R));
-        else
-            GR_MUST_SUCCEED(gr_randtest(xy2, state, R));
-
-        if (alias)
-            GR_MUST_SUCCEED(gr_set(xy1, x, R));
-
-        if (which == 0)
-        {
-            if (alias)
-                status |= gr_op_ui(xy1, xy1, uy, R);
-            else
-                status |= gr_op_ui(xy1, x, uy, R);
-
-            status |= gr_set_ui(y, uy, R);
-
-            if (status == GR_SUCCESS)
-                status |= gr_op(xy2, x, y, R);
-        }
-        else if (which == 1)
-        {
-            if (alias)
-                status |= gr_op_si(xy1, xy1, sy, R);
-            else
-                status |= gr_op_si(xy1, x, sy, R);
-            status |= gr_set_si(y, sy, R);
-
-            if (status == GR_SUCCESS)
-                status |= gr_op(xy2, x, y, R);
-        }
-        else if (which == 2)
-        {
-            if (alias)
-                status |= gr_op_fmpz(xy1, xy1, zy, R);
-            else
-                status |= gr_op_fmpz(xy1, x, zy, R);
-            status |= gr_set_fmpz(y, zy, R);
-
-            if (status == GR_SUCCESS)
-                status |= gr_op(xy2, x, y, R);
-        }
-        else
-        {
-            if (alias)
-                status |= gr_op_fmpq(xy1, xy1, qy, R);
-            else
-                status |= gr_op_fmpq(xy1, x, qy, R);
-            status |= gr_set_fmpq(y, qy, R);
-
-            if (status == GR_SUCCESS)
-                status |= gr_op(xy2, x, y, R);
-        }
-
-        if (status == GR_SUCCESS)
-        {
-            int cmp;
-
-            status |= gr_sub(err, xy1, xy2, R);
-            status |= gr_div(err, err, xy2, R);
-            status |= gr_abs(err, err, R);
-            status |= gr_cmp(&cmp, err, rel_tol, R);
-
-            if (status == GR_SUCCESS && cmp > 0)
-            {
-                status = GR_TEST_FAIL;
-                break;
-            }
-        }
-    }
-
-    if ((test_flags & GR_TEST_ALWAYS_ABLE) && (status & GR_UNABLE))
-        status = GR_TEST_FAIL;
-
-    if ((test_flags & GR_TEST_VERBOSE) || status == GR_TEST_FAIL)
-    {
-        flint_printf("\n");
-        flint_printf("%s\n", opname);
-        gr_ctx_println(R);
-        flint_printf("which: %d\n", which);
-        flint_printf("alias: %d\n", alias);
-        flint_printf("x = "); gr_println(x, R);
-        flint_printf("y = "); gr_println(y, R);
-        flint_printf("y (op) y (1) = "); gr_println(xy1, R);
-        flint_printf("x (op) y (2) = "); gr_println(xy2, R);
-        flint_printf("err = "); gr_println(err, R);
-        flint_printf("tol = "); gr_println(rel_tol, R);
-        flint_printf("\n");
-    }
-
-    if (status == GR_TEST_FAIL)
-        flint_abort();
-
-    GR_TMP_CLEAR5(x, y, xy1, xy2, err, R);
-
-    fmpz_clear(zy);
-    fmpq_clear(qy);
-
-    return status;
-}
-
-void
-nfloat_test_dot(flint_rand_t state, slong iters, gr_ctx_t ctx)
-{
-    slong iter, i, len, prec, ebits;
-    gr_ptr s0, vec1, vec2, res;
-    int subtract, initial, reverse;
-    arf_ptr avec1, avec2;
-    arf_t as0, ares, ares2, amag, err, t;
-
-    prec = NFLOAT_CTX_PREC(ctx);
-
-    for (iter = 0; iter < iters; iter++)
-    {
-        len = n_randint(state, 30);
-
-        if (n_randint(state, 2))
-            ebits = 1;
-        else
-            ebits = 10;
-
-        initial = n_randint(state, 2);
-        subtract = n_randint(state, 2);
-        reverse = n_randint(state, 2);
-
-        vec1 = gr_heap_init_vec(len, ctx);
-        vec2 = gr_heap_init_vec(len, ctx);
-        s0 = gr_heap_init(ctx);
-        res = gr_heap_init(ctx);
-
-        avec1 = _arf_vec_init(len);
-        avec2 = _arf_vec_init(len);
-        arf_init(as0);
-        arf_init(ares);
-        arf_init(ares2);
-        arf_init(amag);
-        arf_init(t);
-        arf_init(err);
-
-        if (initial)
-        {
-            arf_randtest(as0, state, prec, ebits);
-            GR_MUST_SUCCEED(nfloat_set_arf(s0, as0, ctx));
-
-            arf_set(ares, as0);
-            arf_abs(t, as0);
-            arf_add(amag, amag, t, prec, ARF_RND_DOWN);
-        }
-
-        for (i = 0; i < len; i++)
-        {
-            if (i > 0 && n_randint(state, 2))
-            {
-                arf_set(avec1 + i, avec1 + len - 1 - i);
-                arf_neg(avec2 + i, avec2 + len - 1 - i);
-            }
-            else
-            {
-                arf_randtest(avec1 + i, state, prec, ebits);
-                arf_randtest(avec2 + i, state, prec, ebits);
-            }
-        }
-
-        for (i = 0; i < len; i++)
-        {
-            arf_mul(t, avec1 + i, avec2 + (reverse ? len - 1 - i : i), 2 * prec, ARF_RND_DOWN);
-
-            if (subtract)
-                arf_sub(ares, ares, t, 2 * prec, ARF_RND_DOWN);
-            else
-                arf_add(ares, ares, t, 2 * prec, ARF_RND_DOWN);
-
-            arf_abs(t, t);
-            arf_add(amag, amag, t, prec, ARF_RND_DOWN);
-        }
-
-        /* tolerance */
-        arf_mul_2exp_si(t, amag, -prec + 3);
-
-        for (i = 0; i < len; i++)
-        {
-            GR_MUST_SUCCEED(nfloat_set_arf(GR_ENTRY(vec1, i, ctx->sizeof_elem), avec1 + i, ctx));
-            GR_MUST_SUCCEED(nfloat_set_arf(GR_ENTRY(vec2, i, ctx->sizeof_elem), avec2 + i, ctx));
-        }
-
-        if (reverse)
-            GR_MUST_SUCCEED(_nfloat_vec_dot_rev(res, initial ? s0 : NULL, subtract, vec1, vec2, len, ctx));
-        else
-            GR_MUST_SUCCEED(_nfloat_vec_dot(res, initial ? s0 : NULL, subtract, vec1, vec2, len, ctx));
-
-        GR_MUST_SUCCEED(nfloat_get_arf(ares2, res, ctx));
-
-        arf_sub(err, ares, ares2, prec, ARF_RND_DOWN);
-        arf_abs(err, err);
-
-        if (arf_cmpabs(err, t) > 0)
-        {
-            flint_printf("FAIL: dot\n");
-            gr_ctx_println(ctx);
-
-            flint_printf("reverse = %d, subtract = %d\n", reverse, subtract);
-
-            if (initial)
-            {
-                flint_printf("\n\ninitial = ");
-                arf_printd(as0, 2 + prec / 3.33);
-            }
-
-            flint_printf("\n\nvec1 = ");
-            _gr_vec_print(vec1, len, ctx);
-            flint_printf("\n\nvec2 = ");
-            _gr_vec_print(vec2, len, ctx);
-            flint_printf("\n\nares = \n");
-            arf_printd(ares, 2 + prec / 3.33);
-            flint_printf("\n\nares2 = \n");
-            arf_printd(ares2, 2 + prec / 3.33);
-            flint_printf("\n\ntol = \n");
-            arf_printd(t, 10);
-            flint_printf("\n\nerr = \n");
-            arf_printd(err, 10);
-            flint_printf("\n\n");
-
-            flint_abort();
-        }
-
-        gr_heap_clear_vec(vec1, len, ctx);
-        gr_heap_clear_vec(vec2, len, ctx);
-        gr_heap_clear(s0, ctx);
-        gr_heap_clear(res, ctx);
-
-        _arf_vec_clear(avec1, len);
-        _arf_vec_clear(avec2, len);
-        arf_clear(as0);
-        arf_clear(ares);
-        arf_clear(ares2);
-        arf_clear(amag);
-        arf_clear(t);
-        arf_clear(err);
-    }
-}
-
 TEST_FUNCTION_START(nfloat, state)
 {
     gr_ctx_t ctx;
@@ -536,53 +25,12 @@ TEST_FUNCTION_START(nfloat, state)
     slong prec;
     slong iter;
 
-    for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS)
-    {
-        nfloat_complex_ctx_init(ctx, prec, 0);
-
-        gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0);
-
-        {
-            gr_ptr tol1, tol;
-            slong i, reps;
-
-            gr_ctx_init_complex_acb(ctx2, prec + 32);
-
-            tol1 = gr_heap_init(ctx);
-            tol = gr_heap_init(ctx2);
-
-            GR_IGNORE(gr_one(tol, ctx2));
-            GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 3, ctx2));
-
-            reps = (prec <= 256 ? 10000 : 1) * flint_test_multiplier();
-
-            for (i = 0; i < reps; i++)
-            {
-                gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_add, ctx2, tol, state, 0);
-                gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_sub, ctx2, tol, state, 0);
-                gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_mul, ctx2, tol, state, 0);
-                gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_div, ctx2, tol, state, 0);
-                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_neg, ctx2, tol, state, 0);
-                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sqr, ctx2, tol, state, 0);
-                gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_inv, ctx2, tol, state, 0);
-            }
-
-            gr_heap_clear(tol1, ctx);
-            gr_heap_clear(tol, ctx2);
-            gr_ctx_clear(ctx2);
-        }
-
-        gr_ctx_clear(ctx);
-    }
-
     for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS)
     {
         nfloat_ctx_init(ctx, prec, 0);
 
         gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0);
 
-        nfloat_test_dot(state, (prec <= 128 ? 10000 : 100) * flint_test_multiplier(), ctx);
-
         {
             gr_ptr x, x2;
             slong i;
@@ -641,7 +89,7 @@ TEST_FUNCTION_START(nfloat, state)
             gr_ptr tol1, tol;
             slong i, reps;
 
-            gr_ctx_init_real_arb(ctx2, prec + 32);
+            gr_ctx_init_real_arb(ctx2, prec + 64);
 
             tol1 = gr_heap_init(ctx);
             tol = gr_heap_init(ctx2);
@@ -657,6 +105,12 @@ TEST_FUNCTION_START(nfloat, state)
                 gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_sub, ctx2, tol, state, 0);
             }
 
+            reps = (prec <= 256 ? 10000 : 100) * flint_test_multiplier();
+
+            for (i = 0; i < reps; i++)
+                gr_test_approx_dot(ctx, ctx2, 30, tol, state, 0);
+
+
             reps = (prec <= 256 ? 1000 : 1) * flint_test_multiplier();
 
             for (i = 0; i < reps; i++)
diff --git a/src/nfloat/test/t-nfloat_complex.c b/src/nfloat/test/t-nfloat_complex.c
index f800dbe883..320cc34f66 100644
--- a/src/nfloat/test/t-nfloat_complex.c
+++ b/src/nfloat/test/t-nfloat_complex.c
@@ -18,12 +18,48 @@
 
 TEST_FUNCTION_START(nfloat_complex, state)
 {
-/*
     gr_ctx_t ctx;
     gr_ctx_t ctx2;
     slong prec;
-    slong iter;
-*/
+    slong iter, reps;
+    gr_ptr tol1, tol;
+
+    for (prec = NFLOAT_MIN_LIMBS * FLINT_BITS; prec <= NFLOAT_MAX_LIMBS * FLINT_BITS; prec += FLINT_BITS)
+    {
+        nfloat_complex_ctx_init(ctx, prec, 0);
+        gr_ctx_init_complex_acb(ctx2, prec + 64);
+
+        gr_test_floating_point(ctx, 100 * flint_test_multiplier(), 0);
+
+        tol1 = gr_heap_init(ctx);
+        tol = gr_heap_init(ctx2);
+
+        GR_IGNORE(gr_one(tol, ctx2));
+        GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 3, ctx2));
+        reps = (prec <= 256 ? 10000 : 1) * flint_test_multiplier();
+        for (iter = 0; iter < reps; iter++)
+        {
+            gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_add, ctx2, tol, state, 0);
+            gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_sub, ctx2, tol, state, 0);
+            gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_mul, ctx2, tol, state, 0);
+            gr_test_approx_binary_op(ctx, (gr_method_binary_op) gr_div, ctx2, tol, state, 0);
+            gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_neg, ctx2, tol, state, 0);
+            gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_sqr, ctx2, tol, state, 0);
+            gr_test_approx_unary_op(ctx, (gr_method_unary_op) gr_inv, ctx2, tol, state, 0);
+        }
+
+        GR_IGNORE(gr_one(tol, ctx2));
+        GR_IGNORE(gr_mul_2exp_si(tol, tol, -prec + 4, ctx2));
+        reps = (prec <= 256 ? 10000 : 100) * flint_test_multiplier();
+        for (iter = 0; iter < reps; iter++)
+            gr_test_approx_dot(ctx, ctx2, 10, tol, state, 0);
+
+        gr_heap_clear(tol1, ctx);
+        gr_heap_clear(tol, ctx2);
+        gr_ctx_clear(ctx2);
+
+        gr_ctx_clear(ctx);
+    }
 
     TEST_FUNCTION_END(state);
-}
+}
\ No newline at end of file

From 027e9219994b392950ef9de44644b3710311d997 Mon Sep 17 00:00:00 2001
From: Fredrik Johansson <fredrik.johansson@gmail.com>
Date: Wed, 29 May 2024 10:19:34 +0200
Subject: [PATCH 2/2] nfloat_complex dot products

---
 src/gr/test_ring.c         |   2 +-
 src/nfloat/complex.c       |   2 +-
 src/nfloat/dot.c           | 388 ++++++++++++++++++++++++++++++++++---
 src/nfloat/test/t-nfloat.c |   2 -
 4 files changed, 358 insertions(+), 36 deletions(-)

diff --git a/src/gr/test_ring.c b/src/gr/test_ring.c
index 78a5c939d3..c1a16d7c43 100644
--- a/src/gr/test_ring.c
+++ b/src/gr/test_ring.c
@@ -4025,7 +4025,7 @@ gr_test_approx_dot(gr_ctx_t R, gr_ctx_t R_ref, slong maxlen, gr_srcptr rel_tol,
     int status = GR_SUCCESS;
     int cmp;
 
-    len = n_randint(state, maxlen);
+    len = n_randint(state, maxlen + 1);
     initial = n_randint(state, 2);
     subtract = n_randint(state, 2);
     reverse = n_randint(state, 2);
diff --git a/src/nfloat/complex.c b/src/nfloat/complex.c
index 3d4dc0bfd5..ec2a7a26ee 100644
--- a/src/nfloat/complex.c
+++ b/src/nfloat/complex.c
@@ -17,7 +17,7 @@
 #include "acb.h"
 #include "nfloat.h"
 
-int
+static int
 _flint_mpn_signed_add_n(nn_ptr res, nn_srcptr x, int xsgnbit, nn_srcptr y, int ysgnbit, mp_size_t n)
 {
     if (xsgnbit == ysgnbit)
diff --git a/src/nfloat/dot.c b/src/nfloat/dot.c
index 32c678c200..4d2a0c0970 100644
--- a/src/nfloat/dot.c
+++ b/src/nfloat/dot.c
@@ -129,6 +129,56 @@ _nfloat_vec_dot_set_initial(ulong * s, slong sexp, nfloat_srcptr x, int subtract
         mpn_neg(s, s, nlimbs + 1);
 }
 
+/* n is the number of limbs of s and x */
+void
+_nfloat_vec_dot_add_limbs(ulong * s, slong sexp, mp_srcptr x, slong xexp, int subtract, slong n)
+{
+    slong delta, delta_limbs, delta_bits;
+    ulong t[NFLOAT_MAX_LIMBS + 1];
+
+    if (sexp < xexp)
+    {
+        delta_bits = xexp - sexp;
+        delta_limbs = 0;
+
+        FLINT_ASSERT(delta < FLINT_BITS);
+
+        mpn_lshift(t, x, n, delta_bits);
+
+        if (subtract)
+            mpn_sub_n(s, s, t, n);
+        else
+            mpn_add_n(s, s, t, n);
+    }
+    else
+    {
+        delta = sexp - xexp;
+
+        delta_limbs = delta / FLINT_BITS;
+        delta_bits = delta % FLINT_BITS;
+
+        if (delta_limbs < n)
+        {
+            if (delta_bits == 0)
+            {
+                if (subtract)
+                    mpn_sub(s, s, n, x + delta_limbs, n - delta_limbs);
+                else
+                    mpn_add(s, s, n, x + delta_limbs, n - delta_limbs);
+            }
+            else
+            {
+                mpn_rshift(t, x + delta_limbs, n - delta_limbs, delta_bits);
+
+                if (subtract)
+                    mpn_sub(s, s, n, t, n - delta_limbs);
+                else
+                    mpn_add(s, s, n, t, n - delta_limbs);
+            }
+        }
+    }
+}
+
 #include <math.h>
 
 void
@@ -641,7 +691,7 @@ _nfloat_vec_dot_rev(nfloat_ptr res, nfloat_srcptr initial, int subtract, nfloat_
     return __nfloat_vec_dot(res, initial, subtract, x, ctx->sizeof_elem, GR_ENTRY(y, len - 1, ctx->sizeof_elem), -ctx->sizeof_elem, len, ctx);
 }
 
-
+/*
 void
 print_s(ulong * s, slong sexp, slong nlimbs)
 {
@@ -656,6 +706,125 @@ print_s(ulong * s, slong sexp, slong nlimbs)
 
     flint_printf("SUM %.15e\n", (double) x * ldexp(1.0, sexp - FLINT_BITS) * c);
 }
+*/
+
+FLINT_FORCE_INLINE void
+_nfloat_vec_dot_addmul_1(ulong * _s1, ulong * _s0, slong sexp, nfloat_srcptr xi, nfloat_srcptr yi, int subtract)
+{
+    slong xexp, delta;
+    ulong s0, s1, t0, t1;
+    int xsgnbit;
+
+    xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+    delta = sexp - xexp;
+    xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi) ^ subtract;
+
+    s0 = *_s0;
+    s1 = *_s1;
+
+    if (delta < FLINT_BITS)
+    {
+        umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]);
+        if (xsgnbit)
+            sub_ddmmss(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta)));
+        else
+            add_ssaaaa(s1, s0, s1, s0, t1 >> delta, (t0 >> delta) | (t1 << (FLINT_BITS - delta)));
+    }
+    else if (delta < 2 * FLINT_BITS)
+    {
+        umul_ppmm(t1, t0, NFLOAT_D(xi)[0], NFLOAT_D(yi)[0]);
+        if (xsgnbit)
+            sub_ddmmss(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS));
+        else
+            add_ssaaaa(s1, s0, s1, s0, 0, t1 >> (delta - FLINT_BITS));
+    }
+
+    *_s0 = s0;
+    *_s1 = s1;
+}
+
+FLINT_FORCE_INLINE void
+_nfloat_vec_dot_addmul_2(ulong * _s2, ulong * _s1, ulong * _s0, slong sexp, nfloat_srcptr xi, nfloat_srcptr yi, int subtract)
+{
+    slong xexp, delta;
+    ulong s0, s1, s2, t0, t1, t2, t3;
+    int xsgnbit;
+
+    s0 = *_s0;
+    s1 = *_s1;
+    s2 = *_s2;
+
+    xexp = NFLOAT_EXP(xi) + NFLOAT_EXP(yi);
+    xsgnbit = NFLOAT_SGNBIT(xi) ^ NFLOAT_SGNBIT(yi) ^ subtract;
+    delta = sexp - xexp;
+    FLINT_ASSERT(delta != 0);
+
+    if (delta < FLINT_BITS)
+    {
+        FLINT_MPN_MUL_2X2(t3, t2, t1, t0, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]);
+        (void) t0;
+
+        if (xsgnbit)
+            sub_dddmmmsss(s2, s1, s0, s2, s1, s0, t3 >> delta, (t2 >> delta) | (t3 << (FLINT_BITS - delta)), (t1 >> delta) | (t2 << (FLINT_BITS - delta)));
+        else
+            add_sssaaaaaa(s2, s1, s0, s2, s1, s0, t3 >> delta, (t2 >> delta) | (t3 << (FLINT_BITS - delta)), (t1 >> delta) | (t2 << (FLINT_BITS - delta)));
+    }
+    else if (delta < 3 * FLINT_BITS)
+    {
+        if (delta < 2 * FLINT_BITS)
+        {
+            FLINT_MPN_MUL_2X2H(t3, t2, t1, NFLOAT_D(xi)[1], NFLOAT_D(xi)[0], NFLOAT_D(yi)[1], NFLOAT_D(yi)[0]);
+
+            delta -= FLINT_BITS;
+
+            if (delta != 0)
+            {
+                t2 = (t2 >> delta) | (t3 << (FLINT_BITS - delta));
+                t3 = t3 >> delta;
+            }
+        }
+        else
+        {
+            umul_ppmm(t3, t2, NFLOAT_D(xi)[1], NFLOAT_D(yi)[1]);
+
+            delta -= 2 * FLINT_BITS;
+
+            if (delta != 0)
+                t3 = t3 >> delta;
+
+            t2 = t3;
+            t3 = 0;
+        }
+
+        if (xsgnbit)
+            sub_dddmmmsss(s2, s1, s0, s2, s1, s0, 0, t3, t2);
+        else
+            add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t3, t2);
+    }
+
+    *_s0 = s0;
+    *_s1 = s1;
+    *_s2 = s2;
+}
+
+static int
+_flint_mpn_signed_add_n(nn_ptr res, nn_srcptr x, int xsgnbit, nn_srcptr y, int ysgnbit, mp_size_t n)
+{
+    if (xsgnbit == ysgnbit)
+        mpn_add_n(res, x, y, n);
+    else
+    {
+        if (mpn_cmp(x, y, n) >= 0)
+            mpn_sub_n(res, x, y, n);
+        else
+        {
+            mpn_sub_n(res, y, x, n);
+            xsgnbit = !xsgnbit;
+        }
+    }
+
+    return xsgnbit;
+}
 
 int
 __nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial, int subtract, nfloat_complex_srcptr x, slong xstep, nfloat_complex_srcptr y, slong ystep, slong len, gr_ctx_t ctx)
@@ -729,50 +898,205 @@ __nfloat_complex_vec_dot(nfloat_complex_ptr res, nfloat_complex_srcptr initial,
             _nfloat_vec_dot_set_initial(im_s, im_exp, im_initial, subtract, nlimbs);
     }
 
-    for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep)
+    if (nlimbs == 1)
     {
-        a = xi;
-        b = (ulong *) xi + real_part_size;
-        c = yi;
-        d = (ulong *) yi + real_part_size;
+        ulong re0, re1;
+        ulong im0, im1;
 
-/*
-        aexp = NFLOAT_EXP(a);
-        bexp = NFLOAT_EXP(b);
-        cexp = NFLOAT_EXP(c);
-        dexp = NFLOAT_EXP(d);
+        re0 = re_s[0];
+        re1 = re_s[1];
+        im0 = im_s[0];
+        im1 = im_s[1];
 
-        asgnbit = NFLOAT_SGNBIT(a);
-        bsgnbit = NFLOAT_SGNBIT(b);
-        csgnbit = NFLOAT_SGNBIT(c);
-        dsgnbit = NFLOAT_SGNBIT(d);
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep)
+        {
+            a = xi;
+            b = (ulong *) xi + real_part_size;
+            c = yi;
+            d = (ulong *) yi + real_part_size;
 
-        abexp = FLINT_MAX(aexp, bexp) + 2;
-        cdexp = FLINT_MAX(cexp, dexp) + 2;
+            if (!NFLOAT_IS_ZERO(a))
+            {
+                if (!NFLOAT_IS_ZERO(c))
+                    _nfloat_vec_dot_addmul_1(&re1, &re0, re_exp, a, c, 0);
+                if (!NFLOAT_IS_ZERO(d))
+                    _nfloat_vec_dot_addmul_1(&im1, &im0, im_exp, a, d, 0);
+            }
 
-        adelta = abexp - aexp;
-        bdelta = abexp - bexp;
-        cdelta = cdexp - cexp;
-        ddelta = cdexp - dexp;
-*/
+            if (!NFLOAT_IS_ZERO(b))
+            {
+                if (!NFLOAT_IS_ZERO(d))
+                    _nfloat_vec_dot_addmul_1(&re1, &re0, re_exp, b, d, 1);
+                if (!NFLOAT_IS_ZERO(c))
+                    _nfloat_vec_dot_addmul_1(&im1, &im0, im_exp, b, c, 0);
+            }
+        }
 
-        if (!NFLOAT_IS_ZERO(a))
+        re_s[0] = re0;
+        re_s[1] = re1;
+        im_s[0] = im0;
+        im_s[1] = im1;
+    }
+    else if (nlimbs == 2)
+    {
+        ulong re0, re1, re2;
+        ulong im0, im1, im2;
+
+        re0 = re_s[0];
+        re1 = re_s[1];
+        re2 = re_s[2];
+        im0 = im_s[0];
+        im1 = im_s[1];
+        im2 = im_s[2];
+
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep)
         {
-            if (!NFLOAT_IS_ZERO(c))
-                _nfloat_vec_dot_addmul(re_s, re_exp, a, c, 0, nlimbs);
-            if (!NFLOAT_IS_ZERO(d))
-                _nfloat_vec_dot_addmul(im_s, im_exp, a, d, 0, nlimbs);
+            a = xi;
+            b = (ulong *) xi + real_part_size;
+            c = yi;
+            d = (ulong *) yi + real_part_size;
+
+            if (!NFLOAT_IS_ZERO(a))
+            {
+                if (!NFLOAT_IS_ZERO(c))
+                    _nfloat_vec_dot_addmul_2(&re2, &re1, &re0, re_exp, a, c, 0);
+                if (!NFLOAT_IS_ZERO(d))
+                    _nfloat_vec_dot_addmul_2(&im2, &im1, &im0, im_exp, a, d, 0);
+            }
+
+            if (!NFLOAT_IS_ZERO(b))
+            {
+                if (!NFLOAT_IS_ZERO(d))
+                    _nfloat_vec_dot_addmul_2(&re2, &re1, &re0, re_exp, b, d, 1);
+                if (!NFLOAT_IS_ZERO(c))
+                    _nfloat_vec_dot_addmul_2(&im2, &im1, &im0, im_exp, b, c, 0);
+            }
         }
 
-        if (!NFLOAT_IS_ZERO(b))
+        re_s[0] = re0;
+        re_s[1] = re1;
+        re_s[2] = re2;
+
+        im_s[0] = im0;
+        im_s[1] = im1;
+        im_s[2] = im2;
+    }
+    else
+    {
+        for (i = 0, xi = x, yi = y; i < len; i++, xi = (ulong *) xi + xstep, yi = (ulong *) yi + ystep)
         {
-            if (!NFLOAT_IS_ZERO(d))
-                _nfloat_vec_dot_addmul(re_s, re_exp, b, d, 1, nlimbs);
-            if (!NFLOAT_IS_ZERO(c))
-                _nfloat_vec_dot_addmul(im_s, im_exp, b, c, 0, nlimbs);
+            a = xi;
+            b = (ulong *) xi + real_part_size;
+            c = yi;
+            d = (ulong *) yi + real_part_size;
+
+            /* Want Karatsuba? */
+            if (nlimbs >= 12 && !NFLOAT_IS_ZERO(a) && !NFLOAT_IS_ZERO(b) &&
+                    !NFLOAT_IS_ZERO(c) && !NFLOAT_IS_ZERO(d))
+            {
+                slong abexp, cdexp, abcdexp, adelta, bdelta, cdelta, ddelta;
+
+                aexp = NFLOAT_EXP(a);
+                bexp = NFLOAT_EXP(b);
+                cexp = NFLOAT_EXP(c);
+                dexp = NFLOAT_EXP(d);
+
+                abexp = FLINT_MAX(aexp, bexp) + 2;
+                cdexp = FLINT_MAX(cexp, dexp) + 2;
+                abcdexp = abexp + cdexp;
+
+                adelta = abexp - aexp;
+                bdelta = abexp - bexp;
+                cdelta = cdexp - cexp;
+                ddelta = cdexp - dexp;
+
+                if (adelta < FLINT_BITS && bdelta < FLINT_BITS &&
+                    cdelta < FLINT_BITS && ddelta < FLINT_BITS &&
+                    /* To do: check very carefully that the rounded sums
+                       cannot overflow the guard bits in re_s and im_s. */
+                    abcdexp < re_exp + FLINT_BITS - 8 &&
+                    abcdexp < im_exp + FLINT_BITS - 8 &&
+                    /* To do: use with truncation also when this term
+                       is small. */
+                    abcdexp > re_exp - 2 * FLINT_BITS &&
+                    abcdexp > im_exp - 2 * FLINT_BITS)
+                {
+                    ulong aa[NFLOAT_MAX_LIMBS + 1];
+                    ulong bb[NFLOAT_MAX_LIMBS + 1];
+                    ulong cc[NFLOAT_MAX_LIMBS + 1];
+                    ulong dd[NFLOAT_MAX_LIMBS + 1];
+                    ulong s[NFLOAT_MAX_LIMBS + 1];
+                    ulong t[NFLOAT_MAX_LIMBS + 1];
+                    ulong u[NFLOAT_MAX_LIMBS + 1];
+                    ulong v[NFLOAT_MAX_LIMBS + 1];
+
+                    int asgnbit, bsgnbit, csgnbit, dsgnbit;
+                    int ssgnbit, tsgnbit, usgnbit;
+
+                    asgnbit = NFLOAT_SGNBIT(a);
+                    bsgnbit = NFLOAT_SGNBIT(b);
+                    csgnbit = NFLOAT_SGNBIT(c);
+                    dsgnbit = NFLOAT_SGNBIT(d);
+
+                    /*
+                        s = c * (a + b)
+                        t = a * (d - c)
+                        u = b * (c + d)
+                        re = s - u
+                        im = s + t
+                    */
+
+                    aa[0] = mpn_rshift(aa + 1, NFLOAT_D(a), nlimbs, adelta);
+                    bb[0] = mpn_rshift(bb + 1, NFLOAT_D(b), nlimbs, bdelta);
+                    cc[0] = mpn_rshift(cc + 1, NFLOAT_D(c), nlimbs, cdelta);
+                    dd[0] = mpn_rshift(dd + 1, NFLOAT_D(d), nlimbs, ddelta);
+
+                    ssgnbit = csgnbit ^ _flint_mpn_signed_add_n(v, aa, asgnbit, bb, bsgnbit, nlimbs + 1);
+                    flint_mpn_mulhigh_n(s, cc, v, nlimbs + 1);
+
+                    tsgnbit = asgnbit ^ _flint_mpn_signed_add_n(v, dd, dsgnbit, cc, !csgnbit, nlimbs + 1);
+                    flint_mpn_mulhigh_n(t, aa, v, nlimbs + 1);
+
+                    usgnbit = bsgnbit ^ _flint_mpn_signed_add_n(v, cc, csgnbit, dd, dsgnbit, nlimbs + 1);
+                    flint_mpn_mulhigh_n(u, bb, v, nlimbs + 1);
+
+                    usgnbit = _flint_mpn_signed_add_n(u, s, ssgnbit, u, !usgnbit, nlimbs + 1);
+                    tsgnbit = _flint_mpn_signed_add_n(t, s, ssgnbit, t, tsgnbit, nlimbs + 1);
+
+                    /*
+                    flint_printf("|u| = ");
+                    print_s(u, abexp + cdexp, nlimbs + 1);
+                    flint_printf("|t| = ");
+                    print_s(t, abexp + cdexp, nlimbs + 1);
+                    */
+
+                    _nfloat_vec_dot_add_limbs(re_s, re_exp, u, abcdexp, usgnbit, nlimbs + 1);
+                    _nfloat_vec_dot_add_limbs(im_s, im_exp, t, abcdexp, tsgnbit, nlimbs + 1);
+
+                    continue;
+                }
+            }
+
+            if (!NFLOAT_IS_ZERO(a))
+            {
+                if (!NFLOAT_IS_ZERO(c))
+                    _nfloat_vec_dot_addmul(re_s, re_exp, a, c, 0, nlimbs);
+                if (!NFLOAT_IS_ZERO(d))
+                    _nfloat_vec_dot_addmul(im_s, im_exp, a, d, 0, nlimbs);
+            }
+
+            if (!NFLOAT_IS_ZERO(b))
+            {
+                if (!NFLOAT_IS_ZERO(d))
+                    _nfloat_vec_dot_addmul(re_s, re_exp, b, d, 1, nlimbs);
+                if (!NFLOAT_IS_ZERO(c))
+                    _nfloat_vec_dot_addmul(im_s, im_exp, b, c, 0, nlimbs);
+            }
         }
     }
 
+    /* todo: maybe specialize for nlimbs=1, 2 */
+
     re_res = res;
     im_res = (nn_ptr) res + real_part_size;
 
diff --git a/src/nfloat/test/t-nfloat.c b/src/nfloat/test/t-nfloat.c
index a1d746bc15..cbc027d9fd 100644
--- a/src/nfloat/test/t-nfloat.c
+++ b/src/nfloat/test/t-nfloat.c
@@ -106,11 +106,9 @@ TEST_FUNCTION_START(nfloat, state)
             }
 
             reps = (prec <= 256 ? 10000 : 100) * flint_test_multiplier();
-
             for (i = 0; i < reps; i++)
                 gr_test_approx_dot(ctx, ctx2, 30, tol, state, 0);
 
-
             reps = (prec <= 256 ? 1000 : 1) * flint_test_multiplier();
 
             for (i = 0; i < reps; i++)