diff --git a/dev/gen_x86_aorsrsh.jl b/dev/gen_x86_aorsrsh.jl index daa696ae5b..c11655d8fc 100644 --- a/dev/gen_x86_aorsrsh.jl +++ b/dev/gen_x86_aorsrsh.jl @@ -76,7 +76,7 @@ function aorsrsh(n::Int; is_add::Bool = true) if !is_add push( s3) end - xor( R32(tnc), R32(tnc)) + xor( tnc, tnc) # We do not use 32 bit mode here since tnc = %r8. sub( cnt, tnc) # This is modulo 64, so -n = 64 - n. xor( R32(sx), R32(sx)) diff --git a/src/mpn_extras/aorsrsh_n.c b/src/mpn_extras/aorsrsh_n.c index 77bf7690ba..c561c695e3 100644 --- a/src/mpn_extras/aorsrsh_n.c +++ b/src/mpn_extras/aorsrsh_n.c @@ -40,8 +40,6 @@ DECL_AORSRSH(14); DECL_AORSRSH(15); DECL_AORSRSH(16); -/* TODO: Should probably rename these types so to not have two different types. - * Probably something like `mpn_binary_h_func`, where `h` is for hardcoded. */ const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = { NULL, @@ -63,7 +61,7 @@ const flint_mpn_aorssh_func_t flint_mpn_addrsh_func_tab[] = ADDRSH(16) }; -const flint_mpn_aorssh_func_t flint_mpn_subsh_func_tab[] = +const flint_mpn_aorssh_func_t flint_mpn_subrsh_func_tab[] = { NULL, SUBRSH(1), diff --git a/src/mpn_extras/test/t-aors_n.c b/src/mpn_extras/test/t-aors_n.c index b40659ee97..55c4c4ae68 100644 --- a/src/mpn_extras/test/t-aors_n.c +++ b/src/mpn_extras/test/t-aors_n.c @@ -87,7 +87,7 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" - "aliasing: %d\n" + "aliasing: %s\n" "ix = %wd\n" "n = %wd\n" "xp = %{ulong*}\n" @@ -95,7 +95,8 @@ TEST_FUNCTION_START(flint_mpn_aors_n, state) "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - aliasing, ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + aliasing == 0 ? "none" : (aliasing == 1) ? "rp = xp" : "rp = yp", + ix, n, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); flint_free(gp); diff --git a/src/mpn_extras/test/t-aorsrsh_n.c b/src/mpn_extras/test/t-aorsrsh_n.c index 040fa59912..25d9075af9 100644 --- a/src/mpn_extras/test/t-aorsrsh_n.c +++ b/src/mpn_extras/test/t-aorsrsh_n.c @@ -17,13 +17,15 @@ #define N_MAX (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH - 1) #define N_STOR (FLINT_MPN_AORSRSH_FUNC_TAB_WIDTH + 10) -static mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +static +mp_limb_t mpn_addrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { mpn_rshift(rp, yp, n, cnt); return mpn_add_n(rp, rp, xp, n); } -static mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) +static +mp_limb_t mpn_subrsh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, unsigned int cnt) { mpn_rshift(rp, yp, n, cnt); return mpn_sub_n(rp, xp, rp, n); @@ -51,7 +53,7 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) /* 0: No aliasing * 1: fp = xp * 2: fp = yp */ - aliasing = n_randint(state, 3); + aliasing = 0; /* n_randint(state, 3); */ fp = flint_malloc(sizeof(mp_limb_t) * n); gp = flint_malloc(sizeof(mp_limb_t) * n); @@ -101,16 +103,17 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) if (!result) TEST_FUNCTION_FAIL( "%s:\n" - "aliasing: %d\n" + "aliasing: %s\n" "ix = %wd\n" - "n = %u\n" - "cnt = %wd\n" + "n = %wd\n" + "cnt = %u\n" "xp = %{ulong*}\n" "yp = %{ulong*}\n" "FLINT (cy = %wu): %{ulong*}\n" "GMP (cy = %wu): %{ulong*}\n", - type == 0 ? "flint_mpn_add_n" : "flint_mpn_sub_n", - aliasing, ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n + 1); + type == 0 ? "flint_mpn_addrsh_n" : "flint_mpn_subrsh_n", + aliasing == 0 ? "none" : (aliasing == 1) ? "rp = xp" : "rp = yp", + ix, n, cnt, xp, n, yp, n, cf, fp, n, cg, gp, n); flint_free(fp); flint_free(gp); @@ -123,3 +126,6 @@ TEST_FUNCTION_START(flint_mpn_aorsrsh_n, state) TEST_FUNCTION_END_SKIPPED(state); #endif } +#undef N_MIN +#undef N_MAX +#undef N_STOR diff --git a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm index b84a4df265..ec24c45e30 100644 --- a/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm +++ b/src/mpn_extras/x86_64/broadwell/aorsrsh_hard.asm @@ -54,79 +54,79 @@ dnl rp[i] = u C mov dnl fi dnl Non-optimized version. -ifdef(blablablabla,` - ALIGN(16) -PROLOGUE(flint_mpn_aorsrsh_5) - xor R32(tnc), R32(tnc) - sub cnt, tnc - xor R32(sx), R32(sx) - - shrx cnt, 0*8(bp), s0 - mov 1*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - add 0*8(ap), s2 - mov s2, 0*8(rp) -',` - mov 0*8(ap), s0 - sub s2, s0 - mov s0, 0*8(rp) -') - - shrx cnt, s1, s0 - mov 2*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 1*8(ap), s2 - mov s2, 1*8(rp) -',` - mov 1*8(ap), s0 - sbb s2, s0 - mov s0, 1*8(rp) -') - - shrx cnt, s1, s0 - mov 3*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 2*8(ap), s2 - mov s2, 2*8(rp) -',` - mov 2*8(ap), s0 - sbb s2, s0 - mov s0, 2*8(rp) -') - - shrx cnt, s1, s0 - mov 4*8(bp), s1 - shlx tnc, s1, s2 - lea (s0, s2), s2 -ifelse(OP,`add',` - adc 3*8(ap), s2 - mov s2, 3*8(rp) -',` - mov 3*8(ap), s0 - sbb s2, s0 - mov s0, 3*8(rp) -') - - shrx cnt, s1, s0 -ifelse(OP,`add',` - adc 4*8(ap), s0 - mov s0, 4*8(rp) -',` - mov 4*8(ap), s2 - sbb s0, s2 - mov s2, 4*8(rp) -') - - setc R8(sx) - ret -EPILOGUE() -',`') +dnl ifdef(blablablabla,` +dnl ALIGN(16) +dnl PROLOGUE(flint_mpn_aorsrsh_5) +dnl xor tnc, tnc +dnl sub cnt, tnc +dnl xor R32(sx), R32(sx) +dnl +dnl shrx cnt, 0*8(bp), s0 +dnl mov 1*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl add 0*8(ap), s2 +dnl mov s2, 0*8(rp) +dnl ',` +dnl mov 0*8(ap), s0 +dnl sub s2, s0 +dnl mov s0, 0*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 2*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 1*8(ap), s2 +dnl mov s2, 1*8(rp) +dnl ',` +dnl mov 1*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 1*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 3*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 2*8(ap), s2 +dnl mov s2, 2*8(rp) +dnl ',` +dnl mov 2*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 2*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl mov 4*8(bp), s1 +dnl shlx tnc, s1, s2 +dnl lea (s0, s2), s2 +dnl ifelse(OP,`add',` +dnl adc 3*8(ap), s2 +dnl mov s2, 3*8(rp) +dnl ',` +dnl mov 3*8(ap), s0 +dnl sbb s2, s0 +dnl mov s0, 3*8(rp) +dnl ') +dnl +dnl shrx cnt, s1, s0 +dnl ifelse(OP,`add',` +dnl adc 4*8(ap), s0 +dnl mov s0, 4*8(rp) +dnl ',` +dnl mov 4*8(ap), s2 +dnl sbb s0, s2 +dnl mov s2, 4*8(rp) +dnl ') +dnl +dnl setc R8(sx) +dnl ret +dnl EPILOGUE() +dnl ') TEXT @@ -142,7 +142,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_2) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -160,7 +160,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_3) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -184,7 +184,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_4) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -214,7 +214,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_5) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -250,7 +250,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_6) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -292,7 +292,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_7) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -340,7 +340,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_8) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -394,7 +394,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_9) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -454,7 +454,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_10) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -520,7 +520,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_11) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -592,7 +592,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_12) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -670,7 +670,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_13) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -754,7 +754,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_14) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -844,7 +844,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_15) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -940,7 +940,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_addrsh_16) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1054,7 +1054,7 @@ EPILOGUE() dnl Modified to avoid pushing and popping s3 ALIGN(16) PROLOGUE(flint_mpn_subrsh_2) - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1075,7 +1075,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_3) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1104,7 +1104,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_4) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1140,7 +1140,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_5) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1183,7 +1183,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_6) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1233,7 +1233,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_7) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1290,7 +1290,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_8) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1354,7 +1354,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_9) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1425,7 +1425,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_10) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1503,7 +1503,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_11) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1588,7 +1588,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_12) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1680,7 +1680,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_13) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1779,7 +1779,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_14) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1885,7 +1885,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_15) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0 @@ -1998,7 +1998,7 @@ EPILOGUE() ALIGN(16) PROLOGUE(flint_mpn_subrsh_16) push s3 - xor R32(tnc), R32(tnc) + xor tnc, tnc sub cnt, tnc xor R32(sx), R32(sx) shrx cnt, 0*8(bp), s0