diff --git a/backends/avx/ceed-avx-blocked.c b/backends/avx/ceed-avx-blocked.c index bcb7e4ba98..4134c11a54 100644 --- a/backends/avx/ceed-avx-blocked.c +++ b/backends/avx/ceed-avx-blocked.c @@ -16,17 +16,17 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Avx(const char *resource, Ceed ceed) { + Ceed ceed_ref; + CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/avx") || !strcmp(resource, "/cpu/self/avx/blocked"), ceed, CEED_ERROR_BACKEND, "AVX backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/avx/ceed-avx-serial.c b/backends/avx/ceed-avx-serial.c index 91225920df..0b1ada3c2b 100644 --- a/backends/avx/ceed-avx-serial.c +++ b/backends/avx/ceed-avx-serial.c @@ -16,17 +16,17 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Avx(const char *resource, Ceed ceed) { + Ceed ceed_ref; + CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/avx/serial"), ceed, CEED_ERROR_BACKEND, "AVX backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Avx)); - return CEED_ERROR_SUCCESS; } diff --git 
a/backends/avx/ceed-avx-tensor.c b/backends/avx/ceed-avx-tensor.c index dfae004ca1..3655a5541f 100644 --- a/backends/avx/ceed-avx-tensor.c +++ b/backends/avx/ceed-avx-tensor.c @@ -43,6 +43,7 @@ static inline int CeedTensorContract_Avx_Blocked(CeedTensorContract contract, Ce const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v, const CeedInt JJ, const CeedInt CC) { CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == CEED_TRANSPOSE) { t_stride_0 = 1; t_stride_1 = J; @@ -56,7 +57,6 @@ static inline int CeedTensorContract_Avx_Blocked(CeedTensorContract contract, Ce for (CeedInt jj = 0; jj < JJ; jj++) { for (CeedInt cc = 0; cc < CC / 4; cc++) vv[jj][cc] = loadu(&v[(a * J + j + jj) * C + c + cc * 4]); } - for (CeedInt b = 0; b < B; b++) { for (CeedInt jj = 0; jj < JJ; jj++) { // unroll rtype tqv = set1(t[(j + jj) * t_stride_0 + b * t_stride_1]); @@ -71,17 +71,19 @@ static inline int CeedTensorContract_Avx_Blocked(CeedTensorContract contract, Ce } } // Remainder of rows - CeedInt j = (J / JJ) * JJ; + const CeedInt j = (J / JJ) * JJ; + if (j < J) { for (CeedInt c = 0; c < (C / CC) * CC; c += CC) { rtype vv[JJ][CC / 4]; // Output tile to be held in registers + for (CeedInt jj = 0; jj < J - j; jj++) { for (CeedInt cc = 0; cc < CC / 4; cc++) vv[jj][cc] = loadu(&v[(a * J + j + jj) * C + c + cc * 4]); } - for (CeedInt b = 0; b < B; b++) { for (CeedInt jj = 0; jj < J - j; jj++) { // doesn't unroll rtype tqv = set1(t[(j + jj) * t_stride_0 + b * t_stride_1]); + for (CeedInt cc = 0; cc < CC / 4; cc++) { // unroll fmadd(vv[jj][cc], tqv, loadu(&u[(a * B + b) * C + c + cc * 4])); } @@ -103,22 +105,25 @@ static inline int CeedTensorContract_Avx_Remainder(CeedTensorContract contract, const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v, const CeedInt JJ, const CeedInt CC) { CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == 
CEED_TRANSPOSE) { t_stride_0 = 1; t_stride_1 = J; } - CeedInt J_break = J % JJ ? (J / JJ) * JJ : (J / JJ - 1) * JJ; + const CeedInt J_break = J % JJ ? (J / JJ) * JJ : (J / JJ - 1) * JJ; + for (CeedInt a = 0; a < A; a++) { // Blocks of 4 columns for (CeedInt c = (C / CC) * CC; c < C; c += 4) { // Blocks of 4 rows for (CeedInt j = 0; j < J_break; j += JJ) { rtype vv[JJ]; // Output tile to be held in registers - for (CeedInt jj = 0; jj < JJ; jj++) vv[jj] = loadu(&v[(a * J + j + jj) * C + c]); + for (CeedInt jj = 0; jj < JJ; jj++) vv[jj] = loadu(&v[(a * J + j + jj) * C + c]); for (CeedInt b = 0; b < B; b++) { rtype tqu; + if (C - c == 1) tqu = set(0.0, 0.0, 0.0, u[(a * B + b) * C + c + 0]); else if (C - c == 2) tqu = set(0.0, 0.0, u[(a * B + b) * C + c + 1], u[(a * B + b) * C + c + 0]); else if (C - c == 3) tqu = set(0.0, u[(a * B + b) * C + c + 2], u[(a * B + b) * C + c + 1], u[(a * B + b) * C + c + 0]); @@ -133,7 +138,8 @@ static inline int CeedTensorContract_Avx_Remainder(CeedTensorContract contract, // Remainder of rows, all columns for (CeedInt j = J_break; j < J; j++) { for (CeedInt b = 0; b < B; b++) { - CeedScalar tq = t[j * t_stride_0 + b * t_stride_1]; + const CeedScalar tq = t[j * t_stride_0 + b * t_stride_1]; + for (CeedInt c = (C / CC) * CC; c < C; c++) v[(a * J + j) * C + c] += tq * u[(a * B + b) * C + c]; } } @@ -148,6 +154,7 @@ static inline int CeedTensorContract_Avx_Single(CeedTensorContract contract, Cee CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v, const CeedInt AA, const CeedInt JJ) { CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == CEED_TRANSPOSE) { t_stride_0 = 1; t_stride_1 = J; @@ -157,14 +164,15 @@ static inline int CeedTensorContract_Avx_Single(CeedTensorContract contract, Cee for (CeedInt a = 0; a < (A / AA) * AA; a += AA) { for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { rtype vv[AA][JJ / 4]; // Output tile to be held in registers + for (CeedInt aa = 0; aa < AA; aa++) { for 
(CeedInt jj = 0; jj < JJ / 4; jj++) vv[aa][jj] = loadu(&v[(a + aa) * J + j + jj * 4]); } - for (CeedInt b = 0; b < B; b++) { for (CeedInt jj = 0; jj < JJ / 4; jj++) { // unroll rtype tqv = set(t[(j + jj * 4 + 3) * t_stride_0 + b * t_stride_1], t[(j + jj * 4 + 2) * t_stride_0 + b * t_stride_1], t[(j + jj * 4 + 1) * t_stride_0 + b * t_stride_1], t[(j + jj * 4 + 0) * t_stride_0 + b * t_stride_1]); + for (CeedInt aa = 0; aa < AA; aa++) { // unroll fmadd(vv[aa][jj], tqv, set1(u[(a + aa) * B + b])); } @@ -176,17 +184,19 @@ static inline int CeedTensorContract_Avx_Single(CeedTensorContract contract, Cee } } // Remainder of rows - CeedInt a = (A / AA) * AA; + const CeedInt a = (A / AA) * AA; + for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { rtype vv[AA][JJ / 4]; // Output tile to be held in registers + for (CeedInt aa = 0; aa < A - a; aa++) { for (CeedInt jj = 0; jj < JJ / 4; jj++) vv[aa][jj] = loadu(&v[(a + aa) * J + j + jj * 4]); } - for (CeedInt b = 0; b < B; b++) { for (CeedInt jj = 0; jj < JJ / 4; jj++) { // unroll rtype tqv = set(t[(j + jj * 4 + 3) * t_stride_0 + b * t_stride_1], t[(j + jj * 4 + 2) * t_stride_0 + b * t_stride_1], t[(j + jj * 4 + 1) * t_stride_0 + b * t_stride_1], t[(j + jj * 4 + 0) * t_stride_0 + b * t_stride_1]); + for (CeedInt aa = 0; aa < A - a; aa++) { // unroll fmadd(vv[aa][jj], tqv, set1(u[(a + aa) * B + b])); } @@ -197,16 +207,18 @@ static inline int CeedTensorContract_Avx_Single(CeedTensorContract contract, Cee } } // Column remainder - CeedInt A_break = A % AA ? (A / AA) * AA : (A / AA - 1) * AA; + const CeedInt A_break = A % AA ? 
(A / AA) * AA : (A / AA - 1) * AA; + // Blocks of 4 columns for (CeedInt j = (J / JJ) * JJ; j < J; j += 4) { // Blocks of 4 rows for (CeedInt a = 0; a < A_break; a += AA) { rtype vv[AA]; // Output tile to be held in registers - for (CeedInt aa = 0; aa < AA; aa++) vv[aa] = loadu(&v[(a + aa) * J + j]); + for (CeedInt aa = 0; aa < AA; aa++) vv[aa] = loadu(&v[(a + aa) * J + j]); for (CeedInt b = 0; b < B; b++) { rtype tqv; + if (J - j == 1) { tqv = set(0.0, 0.0, 0.0, t[(j + 0) * t_stride_0 + b * t_stride_1]); } else if (J - j == 2) { @@ -228,7 +240,8 @@ static inline int CeedTensorContract_Avx_Single(CeedTensorContract contract, Cee // Remainder of rows, all columns for (CeedInt b = 0; b < B; b++) { for (CeedInt j = (J / JJ) * JJ; j < J; j++) { - CeedScalar tq = t[j * t_stride_0 + b * t_stride_1]; + const CeedScalar tq = t[j * t_stride_0 + b * t_stride_1]; + for (CeedInt a = A_break; a < A; a++) v[a * J + j] += tq * u[a * B + b]; } } @@ -271,7 +284,6 @@ static int CeedTensorContractApply_Avx(CeedTensorContract contract, CeedInt A, C // Remainder of columns if (C % blk_size) CeedTensorContract_Avx_Remainder_8_8(contract, A, B, C, J, t, t_mode, true, u, v); } - return CEED_ERROR_SUCCESS; } @@ -280,10 +292,9 @@ static int CeedTensorContractApply_Avx(CeedTensorContract contract, CeedInt A, C //------------------------------------------------------------------------------ int CeedTensorContractCreate_Avx(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Avx)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/blocked/ceed-blocked-operator.c b/backends/blocked/ceed-blocked-operator.c index 6fa2060aa7..b33c564b34 100644 --- a/backends/blocked/ceed-blocked-operator.c +++ b/backends/blocked/ceed-blocked-operator.c @@ -16,16 +16,17 @@ 
//------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, CeedElemRestriction *blk_restr, CeedVector *e_vecs_full, +static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bool is_input, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { - CeedInt num_comp, size, P; - CeedSize e_size, q_size; - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedBasis basis; - CeedElemRestriction r; - CeedOperatorField *op_fields; + Ceed ceed; + CeedSize e_size, q_size; + CeedInt num_comp, size, P; + const CeedInt block_size = 8; CeedQFunctionField *qf_fields; + CeedOperatorField *op_fields; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + if (is_input) { CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); @@ -33,70 +34,77 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); } - const CeedInt blk_size = 8; // Loop over fields for (CeedInt i = 0; i < num_fields; i++) { CeedEvalMode eval_mode; + CeedBasis basis; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { - Ceed ceed_rstr; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &r)); - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed_rstr)); - CeedSize l_size; - CeedInt num_elem, elem_size, comp_stride; - CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); - 
CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); - CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); - + Ceed ceed_rstr; + CeedSize l_size; + CeedInt num_elem, elem_size, comp_stride; CeedRestrictionType rstr_type; - CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); + CeedElemRestriction rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); switch (rstr_type) { case CEED_RESTRICTION_STANDARD: { const CeedInt *offsets = NULL; - CeedCallBackend(CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets)); - CeedCallBackend(CeedElemRestrictionCreateBlocked(ceed_rstr, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, CEED_MEM_HOST, - CEED_COPY_VALUES, offsets, &blk_restr[i + start_e])); - CeedCallBackend(CeedElemRestrictionRestoreOffsets(r, &offsets)); + + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); + CeedCallBackend(CeedElemRestrictionCreateBlocked(ceed_rstr, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, CEED_MEM_HOST, + CEED_COPY_VALUES, offsets, &block_rstr[i + start_e])); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); } break; case CEED_RESTRICTION_ORIENTED: { - const CeedInt *offsets = NULL; const bool *orients = NULL; - CeedCallBackend(CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, 
&offsets)); - CeedCallBackend(CeedElemRestrictionGetOrientations(r, CEED_MEM_HOST, &orients)); - CeedCallBackend(CeedElemRestrictionCreateBlockedOriented(ceed_rstr, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, - CEED_MEM_HOST, CEED_COPY_VALUES, offsets, orients, &blk_restr[i + start_e])); - CeedCallBackend(CeedElemRestrictionRestoreOffsets(r, &offsets)); - CeedCallBackend(CeedElemRestrictionRestoreOrientations(r, &orients)); + const CeedInt *offsets = NULL; + + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); + CeedCallBackend(CeedElemRestrictionGetOrientations(rstr, CEED_MEM_HOST, &orients)); + CeedCallBackend(CeedElemRestrictionCreateBlockedOriented(ceed_rstr, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, + CEED_MEM_HOST, CEED_COPY_VALUES, offsets, orients, &block_rstr[i + start_e])); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); + CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr, &orients)); } break; case CEED_RESTRICTION_CURL_ORIENTED: { - const CeedInt *offsets = NULL; const CeedInt8 *curl_orients = NULL; - CeedCallBackend(CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets)); - CeedCallBackend(CeedElemRestrictionGetCurlOrientations(r, CEED_MEM_HOST, &curl_orients)); - CeedCallBackend(CeedElemRestrictionCreateBlockedCurlOriented(ceed_rstr, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, + const CeedInt *offsets = NULL; + + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); + CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr, CEED_MEM_HOST, &curl_orients)); + CeedCallBackend(CeedElemRestrictionCreateBlockedCurlOriented(ceed_rstr, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, CEED_MEM_HOST, CEED_COPY_VALUES, offsets, curl_orients, - &blk_restr[i + start_e])); - CeedCallBackend(CeedElemRestrictionRestoreOffsets(r, &offsets)); - CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(r, 
&curl_orients)); + &block_rstr[i + start_e])); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); + CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr, &curl_orients)); } break; case CEED_RESTRICTION_STRIDED: { CeedInt strides[3]; - CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); - CeedCallBackend( - CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, blk_size, num_comp, l_size, strides, &blk_restr[i + start_e])); + + CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides, + &block_rstr[i + start_e])); } break; } - CeedCallBackend(CeedElemRestrictionCreateVector(blk_restr[i + start_e], NULL, &e_vecs_full[i + start_e])); + CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e])); } switch (eval_mode) { case CEED_EVAL_NONE: CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - q_size = (CeedSize)Q * size * blk_size; + q_size = (CeedSize)Q * size * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_INTERP: @@ -107,16 +115,16 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - e_size = (CeedSize)P * num_comp * blk_size; + e_size = (CeedSize)P * num_comp * block_size; CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i])); - q_size = (CeedSize)Q * size * blk_size; + q_size = (CeedSize)Q * size * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_WEIGHT: // Only on input fields CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); - q_size = (CeedSize)Q * blk_size; + q_size = (CeedSize)Q * block_size; 
CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); break; } } @@ -127,25 +135,27 @@ static int CeedOperatorSetupFields_Blocked(CeedQFunction qf, CeedOperator op, bo // Setup Operator //------------------------------------------------------------------------------ static int CeedOperatorSetup_Blocked(CeedOperator op) { - bool is_setup_done; + bool is_setup_done; + Ceed ceed; + CeedInt Q, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Blocked *impl; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Blocked *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, num_input_fields, num_output_fields; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedQFunctionIsIdentity(qf, &impl->is_identity_qf)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr)); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr)); CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, 
&impl->input_states)); @@ -160,28 +170,28 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) { // Set up infield and outfield pointer arrays // Infields CeedCallBackend( - CeedOperatorSetupFields_Blocked(qf, op, true, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); + CeedOperatorSetupFields_Blocked(qf, op, true, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); // Outfields - CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, + CeedCallBackend(CeedOperatorSetupFields_Blocked(qf, op, false, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { CeedEvalMode in_mode, out_mode; CeedQFunctionField *in_fields, *out_fields; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode)); if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) { - impl->is_identity_restr_op = true; + impl->is_identity_rstr_op = true; } else { CeedCallBackend(CeedVectorReferenceCopy(impl->q_vecs_in[0], &impl->q_vecs_out[0])); } } CeedCallBackend(CeedOperatorSetSetupDone(op)); - return CEED_ERROR_SUCCESS; } @@ -191,11 +201,11 @@ static int CeedOperatorSetup_Blocked(CeedOperator op) { static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, CeedVector in_vec, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl, CeedRequest *request) { - CeedEvalMode eval_mode; - CeedVector vec; - uint64_t state; - for (CeedInt i = 0; i < num_input_fields; i++) { + uint64_t state; + CeedEvalMode eval_mode; + CeedVector vec; 
+ // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { @@ -209,7 +219,7 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed // Restrict CeedCallBackend(CeedVectorGetState(vec, &state)); if (state != impl->input_states[i] || vec == in_vec) { - CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); + CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); impl->input_states[i] = state; } // Get evec @@ -223,24 +233,25 @@ static inline int CeedOperatorSetupInputs_Blocked(CeedInt num_input_fields, Ceed // Input Basis Action //------------------------------------------------------------------------------ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedInt num_input_fields, CeedInt blk_size, bool skip_active, + CeedInt num_input_fields, CeedInt block_size, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { - CeedInt elem_size, size, num_comp; - CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt elem_size, size, num_comp; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + // Skip active input if (skip_active) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_restr, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, 
&elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action @@ -255,7 +266,7 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i][e * elem_size * num_comp])); - CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i])); break; case CEED_EVAL_WEIGHT: break; // No action @@ -268,17 +279,17 @@ static inline int CeedOperatorInputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunc // Output Basis Action //------------------------------------------------------------------------------ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, + CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { - CeedInt elem_size, num_comp; - CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - for (CeedInt i = 0; i < num_output_fields; i++) { + CeedInt elem_size, num_comp; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_restr, &elem_size)); + 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { @@ -292,7 +303,7 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend( CeedVectorSetArray(impl->e_vecs_out[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data_full[i + num_input_fields][e * elem_size * num_comp])); - CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); + CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { @@ -311,12 +322,13 @@ static inline int CeedOperatorOutputBasis_Blocked(CeedInt e, CeedInt Q, CeedQFun //------------------------------------------------------------------------------ static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Blocked *impl) { - CeedEvalMode eval_mode; - for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + // Skip active inputs if (skip_active) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } @@ -333,30 +345,30 @@ static inline int CeedOperatorRestoreInputs_Blocked(CeedInt num_input_fields, Ce // Operator Apply //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedInt Q, num_input_fields, num_output_fields, num_elem, size; + const 
CeedInt block_size = 8; + CeedEvalMode eval_mode; + CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Blocked *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); - const CeedInt blk_size = 8; - CeedInt Q, num_input_fields, num_output_fields, num_elem, size; CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedVector vec; - CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; + const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); // Setup CeedCallBackend(CeedOperatorSetup_Blocked(op)); // Restriction only operator - if (impl->is_identity_restr_op) { - CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[0], CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); - CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[1], CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); + if (impl->is_identity_rstr_op) { + CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[0], CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); + CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[1], CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); return CEED_ERROR_SUCCESS; } @@ -369,7 +381,7 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed } // Loop through elements - for 
(CeedInt e = 0; e < num_blks * blk_size; e += blk_size) { + for (CeedInt e = 0; e < num_blocks * block_size; e += block_size) { // Output pointers for (CeedInt i = 0; i < num_output_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); @@ -380,20 +392,22 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed } // Input basis apply - CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, blk_size, false, e_data_full, impl)); + CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, false, e_data_full, impl)); // Q function if (!impl->is_identity_qf) { - CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); + CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out)); } // Output basis apply - CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, blk_size, num_input_fields, num_output_fields, op, + CeedCallBackend(CeedOperatorOutputBasis_Blocked(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op, e_data_full, impl)); } // Output restriction for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + // Restore evec CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields])); // Get output vector @@ -402,12 +416,11 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; // Restrict CeedCallBackend( - CeedElemRestrictionApply(impl->blk_restr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request)); + CeedElemRestrictionApply(impl->block_rstr[i + impl->num_inputs], CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request)); } // Restore input arrays 
CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl)); - return CEED_ERROR_SUCCESS; } @@ -416,27 +429,30 @@ static int CeedOperatorApplyAdd_Blocked(CeedOperator op, CeedVector in_vec, Ceed //------------------------------------------------------------------------------ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { + Ceed ceed; + CeedSize q_size; + CeedInt Q, num_input_fields, num_output_fields, num_elem, size; + const CeedInt block_size = 8; + CeedScalar *l_vec_array; + CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Blocked *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); - const CeedInt blk_size = 8; - CeedInt Q, num_input_fields, num_output_fields, num_elem, size; + CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; + CeedVector *active_in = impl->qf_active_in; + CeedVector l_vec = impl->qf_l_vec; + CeedElemRestriction block_rstr = impl->qf_block_rstr; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedVector vec, l_vec = impl->qf_l_vec; - CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; - CeedSize 
q_size; - CeedVector *active_in = impl->qf_active_in; - CeedScalar *a, *tmp; - Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {0}; + const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); // Setup CeedCallBackend(CeedOperatorSetup_Blocked(op)); @@ -450,21 +466,25 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o // Count number of active input fields if (!num_active_in) { for (CeedInt i = 0; i < num_input_fields; i++) { + CeedScalar *q_vec_array; + CeedVector vec; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array)); CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q * blk_size; + q_size = (CeedSize)Q * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); - CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &tmp[field * Q * blk_size])); + CeedCallBackend( + CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size])); } num_active_in += size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp)); + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } } impl->num_active_in = num_active_in; @@ -474,6 +494,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o // Count number of active output fields if (!num_active_out) { for (CeedInt i = 0; i < 
num_output_fields; i++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output @@ -490,27 +512,38 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o // Setup Lvec if (!l_vec) { - CeedSize l_size = (CeedSize)num_blks * blk_size * Q * num_active_in * num_active_out; + const CeedSize l_size = (CeedSize)num_blocks * block_size * Q * num_active_in * num_active_out; + CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec)); impl->qf_l_vec = l_vec; } - CeedCallBackend(CeedVectorGetArrayWrite(l_vec, CEED_MEM_HOST, &a)); + CeedCallBackend(CeedVectorGetArrayWrite(l_vec, CEED_MEM_HOST, &l_vec_array)); + + // Setup block restriction + if (!block_rstr) { + const CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, &block_rstr)); + impl->qf_block_rstr = block_rstr; + } // Build objects if needed - CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; if (build_objects) { + const CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + const CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + // Create output restriction CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q, strides, rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); } // Loop through elements - for (CeedInt e = 0; e < num_blks * blk_size; e += blk_size) { + for (CeedInt e = 0; e < num_blocks * block_size; e += block_size) { // Input basis apply - CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, 
blk_size, true, e_data_full, impl)); + CeedCallBackend(CeedOperatorInputBasis_Blocked(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, true, e_data_full, impl)); // Assemble QFunction for (CeedInt in = 0; in < num_active_in; in++) { @@ -521,22 +554,26 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o } // Set Outputs for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, a)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); - a += size * Q * blk_size; // Advance the pointer by the size of the output + l_vec_array += size * Q * block_size; // Advance the pointer by the size of the output } } // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); + CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out)); } } // Un-set output Qvecs to prevent accidental overwrite of Assembled for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output @@ -549,16 +586,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Blocked(CeedOperator o CeedCallBackend(CeedOperatorRestoreInputs_Blocked(num_input_fields, qf_input_fields, op_input_fields, true, e_data_full, impl)); // Output blocked restriction - CeedCallBackend(CeedVectorRestoreArray(l_vec, &a)); + CeedCallBackend(CeedVectorRestoreArray(l_vec, &l_vec_array)); CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); - CeedElemRestriction blk_rstr = 
impl->qf_blk_rstr; - if (!impl->qf_blk_rstr) { - CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, &blk_rstr)); - impl->qf_blk_rstr = blk_rstr; - } - CeedCallBackend(CeedElemRestrictionApply(blk_rstr, CEED_TRANSPOSE, l_vec, *assembled, request)); - + CeedCallBackend(CeedElemRestrictionApply(block_rstr, CEED_TRANSPOSE, l_vec, *assembled, request)); return CEED_ERROR_SUCCESS; } @@ -581,13 +611,14 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Blocked(CeedOperator op, Ce //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Blocked(CeedOperator op) { CeedOperator_Blocked *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { - CeedCallBackend(CeedElemRestrictionDestroy(&impl->blk_restr[i])); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->block_rstr[i])); CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); } - CeedCallBackend(CeedFree(&impl->blk_restr)); + CeedCallBackend(CeedFree(&impl->block_rstr)); CeedCallBackend(CeedFree(&impl->e_vecs_full)); CeedCallBackend(CeedFree(&impl->input_states)); @@ -611,7 +642,7 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) { } CeedCallBackend(CeedFree(&impl->qf_active_in)); CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_blk_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -621,13 +652,12 @@ static int CeedOperatorDestroy_Blocked(CeedOperator op) { // Operator Create //------------------------------------------------------------------------------ int CeedOperatorCreate_Blocked(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed ceed; CeedOperator_Blocked *impl; + 
CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Blocked)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Blocked)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Blocked)); diff --git a/backends/blocked/ceed-blocked.c b/backends/blocked/ceed-blocked.c index c1718a1040..94cc94e9c6 100644 --- a/backends/blocked/ceed-blocked.c +++ b/backends/blocked/ceed-blocked.c @@ -16,21 +16,21 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Blocked(const char *resource, Ceed ceed) { + Ceed ceed_ref; + const char fallback_resource[] = "/cpu/self/ref/serial"; + CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/ref/blocked"), ceed, CEED_ERROR_BACKEND, "Blocked backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); // Set fallback Ceed resource for advanced operator functionality - const char fallbackresource[] = "/cpu/self/ref/serial"; - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Blocked)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/blocked/ceed-blocked.h b/backends/blocked/ceed-blocked.h index 72a484cf37..d6c2ba213b 100644 --- a/backends/blocked/ceed-blocked.h +++ 
b/backends/blocked/ceed-blocked.h @@ -18,8 +18,8 @@ typedef struct { } CeedBasis_Blocked; typedef struct { - bool is_identity_qf, is_identity_restr_op; - CeedElemRestriction *blk_restr; /* Blocked versions of restrictions */ + bool is_identity_qf, is_identity_rstr_op; + CeedElemRestriction *block_rstr; /* Blocked versions of restrictions */ CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ uint64_t *input_states; /* State counter of inputs */ CeedVector *e_vecs_in; /* Element block input E-vectors */ @@ -30,7 +30,7 @@ typedef struct { CeedInt num_active_in, num_active_out; CeedVector *qf_active_in; CeedVector qf_l_vec; - CeedElemRestriction qf_blk_rstr; + CeedElemRestriction qf_block_rstr; } CeedOperator_Blocked; CEED_INTERN int CeedOperatorCreate_Blocked(CeedOperator op); diff --git a/backends/ceed-backend-weak.c b/backends/ceed-backend-weak.c index 84a12ce33c..37b92e8b73 100644 --- a/backends/ceed-backend-weak.c +++ b/backends/ceed-backend-weak.c @@ -19,10 +19,12 @@ static int CeedInit_Weak(const char *resource, Ceed ceed) { // This function provides a debug target for weak symbols static int CeedRegister_Weak(const char *name, int num_prefixes, ...) 
{ va_list prefixes; + int ierr; + va_start(prefixes, num_prefixes); - int ierr; for (int i = 0; i < num_prefixes; i++) { const char *prefix = va_arg(prefixes, const char *); + CeedDebugEnv("** Weak Register: %s", prefix); ierr = CeedRegisterImpl(prefix, CeedInit_Weak, CEED_MAX_BACKEND_PRIORITY); if (ierr) va_end(prefixes); // Prevent leak on error diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 175dbcf396..ef9b20365b 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -28,37 +28,42 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { using std::ostringstream; using std::string; - bool is_setup_done; + + bool is_setup_done, is_identity_qf; + struct cudaDeviceProp prop; + Ceed ceed; + Ceed_Cuda *ceed_data; + CeedSize l_size; + CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedElemRestriction_Cuda *rstr_data; + CeedBasis basis; + CeedBasis_Cuda_shared *basis_data; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Cuda_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda_gen *data; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Cuda_gen *data; CeedCallBackend(CeedOperatorGetData(op, &data)); - CeedQFunction qf; - CeedQFunction_Cuda_gen *qf_data; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedSize lsize; - CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); Q_1d = Q; - CeedOperatorField *op_input_fields, 
*op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedBasis basis; - CeedBasis_Cuda_shared *basis_data; - CeedElemRestriction Erestrict; - CeedElemRestriction_Cuda *restr_data; // TODO: put in a function? // Check for restriction only identity operator - bool is_identity_qf; CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); if (is_identity_qf) { CeedEvalMode eval_mode_in, eval_mode_out; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, @@ -69,12 +74,11 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { // TODO: put in a function? // Add atomicAdd function for old NVidia architectures - struct cudaDeviceProp prop; - Ceed_Cuda *ceed_data; CeedCallBackend(CeedGetData(ceed, &ceed_data)); CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { char *atomic_add_path, *atomic_add_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source)); @@ -87,6 +91,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { // TODO: generalize to accept different device functions? 
{ char *tensor_basis_kernel_path, *tensor_basis_kernel_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source)); @@ -96,6 +101,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { } { char *cuda_gen_template_path, *cuda_gen_template_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source)); @@ -115,33 +121,34 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { + bool is_tensor; + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); // Collect dim, P_1d, and Q_1d CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - bool isTensor; - CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); - CeedCheck(isTensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); - if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; + data->max_P_1d = CeedIntMax(data->max_P_1d, P_1d); } } // Check output bases for Q_1d, dim as well // The only input basis might 
be CEED_BASIS_NONE for (CeedInt i = 0; i < num_output_fields; i++) { CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { + bool is_tensor; + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Collect Q_1d CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - bool isTensor; - CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); - CeedCheck(isTensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); } } @@ -151,8 +158,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { // Only use 3D collocated gradient parallelization strategy when gradient is computed // TODO: put in a function? 
bool use_collograd_parallelization = false; + if (dim == 3) { bool was_grad_found = false; + for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { @@ -216,10 +225,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { code << " // ---- Input field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Set field constants if (eval_mode != CEED_EVAL_WEIGHT) { @@ -274,10 +283,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " // ---- Output field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Set field constants CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], 
&basis)); @@ -320,6 +329,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); break; // Should not occur @@ -340,10 +350,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { code << " // ---- Input field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // TODO: put in a function? 
// Restriction @@ -351,25 +361,29 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; bool is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); - code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_data)); - data->indices.inputs[i] = restr_data->d_ind; - code << " readDofsOffset" << dim << "d(data, lsize_in_" << i + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.inputs[i] = rstr_data->d_ind; + code << " readDofsOffset" << dim << "d(data, l_size_in_" << i << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; } else { - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // 
Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " readDofsStrided" << dim << "dindices.inputs[i] = restr_data->d_ind; + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.inputs[i] = rstr_data->d_ind; code << " readSliceQuadsOffset" - << "3d(data, lsize_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" + << "3d(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" << i << ", r_q_" << i << ");\n"; } else { - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " readSliceQuadsStrided" @@ -603,10 +622,10 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " // ---- Output field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // TODO put in a function // Basis action code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; @@ -648,25 +667,28 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { // TODO put in a function // Restriction bool is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); - code << " const CeedInt lsize_out_" << i << " = " << lsize << ";\n"; CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_data)); - data->indices.outputs[i] = restr_data->d_ind; - code << " writeDofsOffset" << dim << "d(data, lsize_out_" << i + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.outputs[i] = rstr_data->d_ind; + code << " writeDofsOffset" << dim << "d(data, l_size_out_" << i << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; } else { - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + 
CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " writeDofsStrided" << dim << "dfields.inputs[i] = NULL; @@ -126,6 +132,8 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Output vectors for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.outputs[i] = NULL; @@ -136,6 +144,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, output_vecs[i] = vec; // Check for multiple output modes CeedInt index = -1; + for (CeedInt j = 0; j < i; j++) { if (vec == output_vecs[j]) { index = j; @@ -160,6 +169,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, const CeedInt P_1d = data->max_P_1d; const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); int max_threads_per_block, min_grid_size; + CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000)); int block[3] = { @@ -168,13 +178,17 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, -1, }, grid; + CeedChkBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block, cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid)); CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs)); // Restore input arrays 
for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { @@ -186,6 +200,8 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Restore output arrays for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { @@ -207,7 +223,6 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); - return CEED_ERROR_SUCCESS; } @@ -215,13 +230,12 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Cuda_gen(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed ceed; CeedOperator_Cuda_gen *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Cuda_gen)); return CEED_ERROR_SUCCESS; diff --git a/backends/cuda-gen/ceed-cuda-gen-qfunction.c b/backends/cuda-gen/ceed-cuda-gen-qfunction.c index 30887e4660..e88db072d3 100644 --- a/backends/cuda-gen/ceed-cuda-gen-qfunction.c +++ b/backends/cuda-gen/ceed-cuda-gen-qfunction.c @@ -17,6 +17,7 @@ //------------------------------------------------------------------------------ static int CeedQFunctionApply_Cuda_gen(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); 
return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement QFunctionApply"); } @@ -25,9 +26,10 @@ static int CeedQFunctionApply_Cuda_gen(CeedQFunction qf, CeedInt Q, CeedVector * // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) { + Ceed ceed; CeedQFunction_Cuda_gen *data; + CeedCallBackend(CeedQFunctionGetData(qf, &data)); - Ceed ceed; CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallCuda(ceed, cudaFree(data->d_c)); CeedCallBackend(CeedFree(&data->q_function_source)); @@ -39,9 +41,10 @@ static int CeedQFunctionDestroy_Cuda_gen(CeedQFunction qf) { // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Cuda_gen(CeedQFunction qf) { - Ceed ceed; - CeedQFunctionGetCeed(qf, &ceed); + Ceed ceed; CeedQFunction_Cuda_gen *data; + + CeedQFunctionGetCeed(qf, &ceed); CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedQFunctionSetData(qf, data)); diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index 6e99b46e06..ae91e5305f 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -17,28 +17,28 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { - char *resource_root; + char *resource_root; + const char fallback_resource[] = "/gpu/cuda/ref"; + Ceed ceed_shared; + Ceed_Cuda *data; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/cuda") || !strcmp(resource_root, "/gpu/cuda/gen"), ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource); CeedCallBackend(CeedFree(&resource_root)); - Ceed_Cuda *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, 
resource)); - Ceed ceed_shared; CeedCall(CeedInit("/gpu/cuda/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); - const char fallbackresource[] = "/gpu/cuda/ref"; - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c index c8c34a407d..3ec0d47bf5 100644 --- a/backends/cuda-ref/ceed-cuda-ref-basis.c +++ b/backends/cuda-ref/ceed-cuda-ref-basis.c @@ -19,18 +19,20 @@ // Basis apply - tensor //------------------------------------------------------------------------------ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; + Ceed ceed; + Ceed_Cuda *ceed_Cuda; + CeedInt Q_1d, dim; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 32; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasis_Cuda *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Cuda *ceed_Cuda; CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); - CeedBasis_Cuda *data; CeedCallBackend(CeedBasisGetData(basis, &data)); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 32; // Read vectors - const CeedScalar *d_u; - CeedScalar *d_v; if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 
CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); @@ -38,10 +40,10 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo // Clear v for transpose operation if (t_mode == CEED_TRANSPOSE) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(v, &length)); CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); } - CeedInt Q_1d, dim; CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); @@ -92,22 +94,23 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo //------------------------------------------------------------------------------ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; + Ceed ceed; + Ceed_Cuda *ceed_Cuda; + CeedInt num_nodes, num_qpts; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + int elems_per_block = 1; + int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasisNonTensor_Cuda *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Cuda *ceed_Cuda; CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); - CeedBasisNonTensor_Cuda *data; CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedInt num_nodes, num_qpts; CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - int elems_per_block = 1; - int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); // Read vectors - const CeedScalar *d_u; - CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } @@ -116,6 +119,7 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr // Clear v for transpose operation if (t_mode == CEED_TRANSPOSE) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(v, &length)); CeedCallCuda(ceed, cudaMemset(d_v, 0, length * sizeof(CeedScalar))); } @@ -164,19 +168,16 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr // Destroy tensor basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Cuda(CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - + Ceed ceed; CeedBasis_Cuda *data; - CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallCuda(ceed, cuModuleUnload(data->module)); - CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -184,19 +185,16 @@ static int CeedBasisDestroy_Cuda(CeedBasis basis) { // Destroy non-tensor basis //------------------------------------------------------------------------------ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - + Ceed ceed; CeedBasisNonTensor_Cuda *data; - CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallCuda(ceed, cuModuleUnload(data->module)); - CeedCallCuda(ceed, cudaFree(data->d_q_weight)); CeedCallCuda(ceed, cudaFree(data->d_interp)); CeedCallCuda(ceed, cudaFree(data->d_grad)); CeedCallBackend(CeedFree(&data)); - return 
CEED_ERROR_SUCCESS; } @@ -205,27 +203,26 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { //------------------------------------------------------------------------------ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; + CeedInt num_comp; + const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); + const CeedInt interp_bytes = q_bytes * P_1d; CeedBasis_Cuda *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &data)); // Copy data to GPU - const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); - - const CeedInt interp_bytes = q_bytes * P_1d; CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); - CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad_1d, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); // Compile basis kernels - CeedInt num_comp; CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - char *basis_kernel_path, *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); @@ -251,28 +248,27 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const 
//------------------------------------------------------------------------------ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; + CeedInt num_comp; + const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + const CeedInt interp_bytes = q_bytes * num_nodes; + const CeedInt grad_bytes = q_bytes * num_nodes * dim; CeedBasisNonTensor_Cuda *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU - const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); - - const CeedInt interp_bytes = q_bytes * num_nodes; CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); - - const CeedInt grad_bytes = q_bytes * num_nodes * dim; CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); // Compile basis kernels - CeedInt num_comp; CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - char *basis_kernel_path, *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 8c6d7de6c9..7d51d5fb7f 
100644 --- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -23,52 +23,55 @@ //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Cuda(CeedOperator op) { CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data - for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->evecs[i])); + for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); } - CeedCallBackend(CeedFree(&impl->evecs)); + CeedCallBackend(CeedFree(&impl->e_vecs)); - for (CeedInt i = 0; i < impl->numein; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i])); + for (CeedInt i = 0; i < impl->num_inputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } - CeedCallBackend(CeedFree(&impl->qvecsin)); + CeedCallBackend(CeedFree(&impl->q_vecs_in)); - for (CeedInt i = 0; i < impl->numeout; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i])); + for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } - CeedCallBackend(CeedFree(&impl->qvecsout)); + CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly data - for (CeedInt i = 0; i < impl->qfnumactivein; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i])); + for (CeedInt i = 0; i < impl->num_active_in; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); } - CeedCallBackend(CeedFree(&impl->qfactivein)); + CeedCallBackend(CeedFree(&impl->qf_active_in)); // Diag data if (impl->diag) { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); - CeedCallBackend(CeedFree(&impl->diag->h_emodein)); - CeedCallBackend(CeedFree(&impl->diag->h_emodeout)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_emodein)); - CeedCallCuda(ceed, 
cudaFree(impl->diag->d_emodeout)); + CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); + CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_in)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_out)); CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_interpin)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_interpout)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_gradin)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_gradout)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_in)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_out)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_in)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_out)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_rstr)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); } CeedCallBackend(CeedFree(&impl->diag)); if (impl->asmb) { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallCuda(ceed, cuModuleUnload(impl->asmb->module)); CeedCallCuda(ceed, cudaFree(impl->asmb->d_B_in)); @@ -83,88 +86,91 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { //------------------------------------------------------------------------------ // Setup infields or outfields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte, - CeedInt numfields, CeedInt Q, CeedInt numelements) { - CeedInt dim, size; - CeedSize q_size; - Ceed ceed; +static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, 
bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt e_start, + CeedInt num_fields, CeedInt Q, CeedInt num_elem) { + Ceed ceed; + bool is_strided, skip_restriction; + CeedSize q_size; + CeedInt dim, size; + CeedQFunctionField *qf_fields; + CeedOperatorField *op_fields; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedBasis basis; - CeedElemRestriction Erestrict; - CeedOperatorField *opfields; - CeedQFunctionField *qffields; - CeedVector fieldvec; - bool strided; - bool skiprestrict; - - if (isinput) { - CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); + + if (is_input) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); } else { - CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); } // Loop over fields - for (CeedInt i = 0; i < numfields; i++) { - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); + for (CeedInt i = 0; i < num_fields; i++) { + CeedEvalMode e_mode; + CeedBasis basis; - strided = false; - skiprestrict = false; - if (emode != CEED_EVAL_WEIGHT) { - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + + is_strided = false; + skip_restriction = false; + if (e_mode != CEED_EVAL_WEIGHT) { + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); // Check whether this field can skip the element restriction: - // must be passive input, with emode NONE, and have a strided restriction with 
CEED_STRIDES_BACKEND. + // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. // First, check whether the field is input or output: - if (isinput) { + if (is_input) { + CeedVector vec; + // Check for passive input: - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec)); - if (fieldvec != CEED_VECTOR_ACTIVE) { - // Check emode - if (emode == CEED_EVAL_NONE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec != CEED_VECTOR_ACTIVE) { + // Check e_mode + if (e_mode == CEED_EVAL_NONE) { // Check for strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided)); - if (strided) { + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) { // Check if vector is already in preferred backend ordering - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); } } } } - if (skiprestrict) { + if (skip_restriction) { // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. 
- evecs[i + starte] = NULL; + e_vecs[i + e_start] = NULL; } else { - CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte])); + CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + e_start])); } } - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_WEIGHT: // Only on input fields - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); - q_size = (CeedSize)numelements * Q; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, qvecs[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 
+ q_size = (CeedSize)num_elem * Q; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); break; case CEED_EVAL_DIV: break; // TODO: Not implemented @@ -179,38 +185,39 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ static int CeedOperatorSetup_Cuda(CeedOperator op) { - bool setupdone; - CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone)); - if (setupdone) return CEED_ERROR_SUCCESS; - Ceed ceed; + Ceed ceed; + bool is_setup_done; + CeedInt Q, num_elem, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Cuda *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, numelements, numinputfields, numoutputfields; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedOperatorField *opinputfields, *opoutputfields; - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, 
&num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs)); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); - impl->numein = numinputfields; - impl->numeout = numoutputfields; + impl->num_inputs = num_input_fields; + impl->num_outputs = num_output_fields; - // Set up infield and outfield evecs and qvecs + // Set up infield and outfield e_vecs and q_vecs // Infields - CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements)); - + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); // Outfields - CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements)); + CeedCallBackend(CeedOperatorSetupFields_Cuda(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; @@ -219,37 +226,35 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { //------------------------------------------------------------------------------ // Setup Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], +static inline int 
CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedVector in_vec, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl, CeedRequest *request) { - CeedEvalMode emode; - CeedVector vec; - CeedElemRestriction Erestrict; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; - for (CeedInt i = 0; i < numinputfields; i++) { // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - if (skipactive) continue; - else vec = invec; + if (skip_active_in) continue; + else vec = in_vec; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_WEIGHT) { // Skip } else { - // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); // Get input element restriction - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); - if (vec == CEED_VECTOR_ACTIVE) vec = invec; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; // Restrict, if necessary - if (!impl->evecs[i]) { + if (!impl->e_vecs[i]) { // No restriction for this field; read data directly from vec. 
- CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } else { - CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); // Get evec - CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } } } @@ -259,38 +264,39 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt numinputfields, CeedQFunc //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], +static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedInt num_input_fields, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { - CeedInt elemsize, size; - CeedElemRestriction Erestrict; - CeedEvalMode emode; - CeedBasis basis; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt elem_size, size; + CeedEvalMode e_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; - for (CeedInt i = 0; i < numinputfields; i++) { // Skip active input - if (skipactive) { + if (skip_active_in) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } - // 
Get elemsize, emode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); + // Get elem_size, e_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); break; case CEED_EVAL_WEIGHT: break; // No action @@ -306,25 +312,25 @@ static inline int CeedOperatorInputBasis_Cuda(CeedInt numelements, CeedQFunction 
//------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { - CeedEvalMode emode; - CeedVector vec; +static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; - for (CeedInt i = 0; i < numinputfields; i++) { // Skip active input - if (skipactive) { - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + if (skip_active_in) { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_WEIGHT) { // Skip } else { - if (!impl->evecs[i]) { // This was a skiprestrict case - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); - CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); + if (!impl->e_vecs[i]) { // This was a skip_restriction case + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); } else { - CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); } } } @@ -334,64 
+340,65 @@ static inline int CeedOperatorRestoreInputs_Cuda(CeedInt numinputfields, CeedQFu //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { - CeedOperator_Cuda *impl; +static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedOperator_Cuda *impl; + CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; + CeedEvalMode e_mode; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedOperatorField *opinputfields, *opoutputfields; - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); - CeedEvalMode emode; - CeedVector vec; - CeedBasis basis; - CeedElemRestriction Erestrict; - CeedScalar *edata[2 * CEED_FIELD_MAX] = {NULL}; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Setup CeedCallBackend(CeedOperatorSetup_Cuda(op)); - // Input Evecs and 
Restriction - CeedCallBackend(CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request)); + // Input e_vecs and Restriction + CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); // Input basis apply if needed - CeedCallBackend(CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl)); + CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); // Output pointers, as necessary - for (CeedInt i = 0; i < numoutputfields; i++) { - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - if (emode == CEED_EVAL_NONE) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_NONE) { // Set the output Q-Vector to use the E-Vector data directly. 
- CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields])); - CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields])); + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); } } // Q function - CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout)); + CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); // Output basis apply if needed - for (CeedInt i = 0; i < numoutputfields; i++) { - // Get elemsize, emode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedElemRestriction elem_rstr; + CeedBasis basis; + + // Get elem_size, e_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); // Basis action - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + 
CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { @@ -409,24 +416,27 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, CeedVect } // Output restriction - for (CeedInt i = 0; i < numoutputfields; i++) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + CeedElemRestriction elem_rstr; + // Restore evec - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - if (emode == CEED_EVAL_NONE) { - CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields])); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_NONE) { + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); } // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Restrict - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); // Active - if (vec == CEED_VECTOR_ACTIVE) vec = outvec; + if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; - CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request)); + 
CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, opinputfields, false, edata, impl)); + CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); return CEED_ERROR_SUCCESS; } @@ -435,132 +445,142 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector invec, CeedVect //------------------------------------------------------------------------------ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - Ceed ceed, ceedparent; - CeedOperator_Cuda *impl; - CeedQFunction qf; - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedOperatorField *opinputfields, *opoutputfields; - CeedVector vec, *activein; - CeedInt numactivein, numactiveout, Q, numelements, numinputfields, numoutputfields, size; + Ceed ceed, ceed_parent; + bool is_identity_qf; + CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; CeedSize q_size; - CeedScalar *a, *tmp, *edata[2 * CEED_FIELD_MAX] = {NULL}; + CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedVector *active_inputs; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceedparent)); + CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); CeedCallBackend(CeedOperatorGetData(op, &impl)); - activein = impl->qfactivein; - numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - 
CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + active_inputs = impl->qf_active_in; + num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; // Setup CeedCallBackend(CeedOperatorSetup_Cuda(op)); // Check for identity - bool identityqf; - CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf)); - CeedCheck(!identityqf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); + CeedCheck(!is_identity_qf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); - // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Cuda(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request)); + // Input e_vecs and Restriction + CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); // Count number of active input fields - if (!numactivein) { - for (CeedInt i = 0; i < numinputfields; i++) { + if (!num_active_in) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedScalar *q_vec_array; + CeedVector vec; + // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); 
- CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp)); - CeedCallBackend(CeedRealloc(numactivein + size, &activein)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); + CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs)); for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q * numelements; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field])); - CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements])); + q_size = (CeedSize)Q * num_elem; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); + CeedCallBackend( + CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); } - numactivein += size; - CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp)); + num_active_in += size; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } } - impl->qfnumactivein = numactivein; - impl->qfactivein = activein; + impl->num_active_in = num_active_in; + impl->qf_active_in = active_inputs; } // Count number of active output fields - if (!numactiveout) { - for (CeedInt i = 0; i < numoutputfields; i++) { + if (!num_active_out) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); - numactiveout += size; + 
CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); + num_active_out += size; } } - impl->qfnumactiveout = numactiveout; + impl->num_active_out = num_active_out; } // Check sizes - CeedCheck(numactivein > 0 && numactiveout > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); // Build objects if needed if (build_objects) { // Create output restriction - CeedInt strides[3] = {1, numelements * Q, Q}; /* *NOPAD* */ - CeedCallBackend(CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, numactivein * numactiveout, - numactivein * numactiveout * numelements * Q, strides, rstr)); + CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)numelements * Q * numactivein * numactiveout; - CeedCallBackend(CeedVectorCreate(ceedparent, l_size, assembled)); + CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); } CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); - CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a)); + CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); // Input basis apply - CeedCallBackend(CeedOperatorInputBasis_Cuda(numelements, qfinputfields, opinputfields, numinputfields, true, edata, impl)); + CeedCallBackend(CeedOperatorInputBasis_Cuda(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); // Assemble QFunction - for (CeedInt in = 0; in < numactivein; in++) { + for (CeedInt in = 0; in < num_active_in; in++) { // Set Inputs - 
CeedCallBackend(CeedVectorSetValue(activein[in], 1.0)); - if (numactivein > 1) { - CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0)); + CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0)); + if (num_active_in > 1) { + CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0)); } // Set Outputs - for (CeedInt out = 0; out < numoutputfields; out++) { + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size)); - a += size * Q * numelements; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); + assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output } } // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout)); + CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); } - // Un-set output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out = 0; out < numoutputfields; out++) { + // Un-set output q_vecs to prevent accidental overwrite of Assembled + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - 
CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL)); + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); } } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Cuda(numinputfields, qfinputfields, opinputfields, true, edata, impl)); + CeedCallBackend(CeedOperatorRestoreInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); // Restore output - CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); - + CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); return CEED_ERROR_SUCCESS; } @@ -581,82 +601,87 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedV //------------------------------------------------------------------------------ // Create point block restriction //------------------------------------------------------------------------------ -static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); +static int CreatePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *point_block_rstr) { + Ceed ceed; + CeedSize l_size; + CeedInt num_elem, num_comp, elem_size, comp_stride, *point_block_offsets; const CeedInt *offsets; + + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); // Expand offsets - CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; - CeedSize l_size; - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize)); - CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + 
CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); - CeedInt shift = ncomp; - if (compstride != 1) shift *= ncomp; - CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets)); - for (CeedInt i = 0; i < nelem * elemsize; i++) { - pbOffsets[i] = offsets[i] * shift; + CeedInt shift = num_comp; + + if (comp_stride != 1) shift *= num_comp; + CeedCallBackend(CeedCalloc(num_elem * elem_size, &point_block_offsets)); + for (CeedInt i = 0; i < num_elem * elem_size; i++) { + point_block_offsets[i] = offsets[i] * shift; } // Create new restriction - CeedCallBackend( - CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr)); + CeedCallBackend(CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp * num_comp, 1, l_size * num_comp, CEED_MEM_HOST, CEED_OWN_POINTER, + point_block_offsets, point_block_rstr)); // Cleanup CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); - return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble diagonal setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const bool pointBlock, CeedInt use_ceedsize_idx) { - Ceed ceed; +static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const bool is_point_block, CeedInt use_ceedsize_idx) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; + CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0, num_nodes, num_qpts; + CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField 
*qf_fields; + CeedQFunction qf; + CeedOperatorField *op_fields; + CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt numinputfields, numoutputfields; - CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields)); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Determine active input basis - CeedOperatorField *opfields; - CeedQFunctionField *qffields; - CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); - CeedInt numemodein = 0, ncomp = 0, dim = 1; - CeedEvalMode *emodein = NULL; - CeedBasis basisin = NULL; - CeedElemRestriction rstrin = NULL; - for (CeedInt i = 0; i < numinputfields; i++) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); + for (CeedInt i = 0; i < num_input_fields; i++) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode e_mode; CeedElemRestriction rstr; - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin)); - CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp)); - CeedCallBackend(CeedBasisGetDimension(basisin, &dim)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); - CeedCheck(!rstrin || rstrin == rstr, ceed, CEED_ERROR_BACKEND, + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); + CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND, "Backend does not 
implement multi-field non-composite operator diagonal assembly"); - rstrin = rstr; - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - switch (emode) { + rstr_in = rstr; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + switch (e_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(numemodein + 1, &emodein)); - emodein[numemodein] = emode; - numemodein += 1; + CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); + e_mode_in[num_e_mode_in] = e_mode; + num_e_mode_in += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(numemodein + dim, &emodein)); - for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode; - numemodein += dim; + CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); + for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; + num_e_mode_in += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -667,35 +692,33 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const } // Determine active output basis - CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); - CeedInt numemodeout = 0; - CeedEvalMode *emodeout = NULL; - CeedBasis basisout = NULL; - CeedElemRestriction rstrout = NULL; - for (CeedInt i = 0; i < numoutputfields; i++) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); + for (CeedInt i = 0; i < num_output_fields; i++) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode e_mode; CeedElemRestriction rstr; - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout)); - 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); - CeedCheck(!rstrout || rstrout == rstr, ceed, CEED_ERROR_BACKEND, + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); - rstrout = rstr; - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - switch (emode) { + rstr_out = rstr; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + switch (e_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout)); - emodeout[numemodeout] = emode; - numemodeout += 1; + CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); + e_mode_out[num_e_mode_out] = e_mode; + num_e_mode_out += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout)); - for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode; - numemodeout += dim; + CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); + for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; + num_e_mode_out += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -706,153 +729,154 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, const } // Operator data struct - CeedOperator_Cuda *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Cuda *diag = impl->diag; - diag->basisin = basisin; - diag->basisout = basisout; - diag->h_emodein = emodein; - diag->h_emodeout = emodeout; - diag->numemodein = numemodein; - diag->numemodeout = numemodeout; + + diag->basis_in = basis_in; + diag->basis_out = basis_out; + diag->h_e_mode_in = e_mode_in; + diag->h_e_mode_out = e_mode_out; + 
diag->num_e_mode_in = num_e_mode_in; + diag->num_e_mode_out = num_e_mode_out; // Assemble kernel - char *diagonal_kernel_path, *diagonal_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); - CeedInt nnodes, nqpts; - CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts)); - diag->nnodes = nnodes; - CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUMEMODEIN", numemodein, "NUMEMODEOUT", numemodeout, "NNODES", - nnodes, "NQPTS", nqpts, "NCOMP", ncomp, "CEEDSIZE", use_ceedsize_idx)); + CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + diag->num_nodes = num_nodes; + CeedCallCuda(ceed, + CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_E_MODE_IN", num_e_mode_in, "NUM_E_MODE_OUT", num_e_mode_out, + "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); CeedCallBackend(CeedFree(&diagonal_kernel_path)); CeedCallBackend(CeedFree(&diagonal_kernel_source)); // Basis matrices - const CeedInt qBytes = nqpts * sizeof(CeedScalar); - const CeedInt iBytes = qBytes * nnodes; - const CeedInt gBytes = qBytes * nnodes * dim; - const CeedInt eBytes = sizeof(CeedEvalMode); - const CeedScalar *interpin, *interpout, *gradin, 
*gradout; + const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + const CeedInt interp_bytes = q_bytes * num_nodes; + const CeedInt grad_bytes = q_bytes * num_nodes * dim; + const CeedInt e_mode_bytes = sizeof(CeedEvalMode); + const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; // CEED_EVAL_NONE - CeedScalar *identity = NULL; - bool evalNone = false; - for (CeedInt i = 0; i < numemodein; i++) evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); - for (CeedInt i = 0; i < numemodeout; i++) evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); - if (evalNone) { - CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); - for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, iBytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, iBytes, cudaMemcpyHostToDevice)); + CeedScalar *identity = NULL; + bool is_eval_none = false; + + for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); + for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); + if (is_eval_none) { + CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity)); + for (CeedInt i = 0; i < (num_nodes < num_qpts ? 
num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, interp_bytes, cudaMemcpyHostToDevice)); } // CEED_EVAL_INTERP - CeedCallBackend(CeedBasisGetInterp(basisin, &interpin)); - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interpin, iBytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_interpin, interpin, iBytes, cudaMemcpyHostToDevice)); - CeedCallBackend(CeedBasisGetInterp(basisout, &interpout)); - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interpout, iBytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_interpout, interpout, iBytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_in, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_in, interp_in, interp_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_out, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_out, interp_out, interp_bytes, cudaMemcpyHostToDevice)); // CEED_EVAL_GRAD - CeedCallBackend(CeedBasisGetGrad(basisin, &gradin)); - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_gradin, gBytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_gradin, gradin, gBytes, cudaMemcpyHostToDevice)); - CeedCallBackend(CeedBasisGetGrad(basisout, &gradout)); - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_gradout, gBytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_gradout, gradout, gBytes, cudaMemcpyHostToDevice)); - - // Arrays of emodes - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_emodein, numemodein * eBytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_emodein, emodein, numemodein * eBytes, cudaMemcpyHostToDevice)); - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_emodeout, numemodeout * eBytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_emodeout, 
emodeout, numemodeout * eBytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_in, grad_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_in, grad_in, grad_bytes, cudaMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_out, grad_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_out, grad_out, grad_bytes, cudaMemcpyHostToDevice)); + + // Arrays of e_modes + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, cudaMemcpyHostToDevice)); + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, cudaMemcpyHostToDevice)); // Restriction - diag->diagrstr = rstrout; - + diag->diag_rstr = rstr_out; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble diagonal common code //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool pointBlock) { - Ceed ceed; +static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { + Ceed ceed; + CeedSize assembled_length = 0, assembled_qf_length = 0; + CeedInt use_ceedsize_idx = 0, num_elem; + CeedScalar *elem_diag_array; + const CeedScalar *assembled_qf_array; + CeedVector assembled_qf = NULL; + CeedElemRestriction rstr = NULL; + CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Cuda *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); // Assemble QFunction - 
CeedVector assembledqf = NULL; - CeedElemRestriction rstr = NULL; - CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); - CeedSize assembled_length = 0, assembledqf_length = 0; CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); - CeedCallBackend(CeedVectorGetLength(assembledqf, &assembledqf_length)); - CeedInt use_ceedsize_idx = 0; - if ((assembled_length > INT_MAX) || (assembledqf_length > INT_MAX)) use_ceedsize_idx = 1; + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); + if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; // Setup - if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, pointBlock, use_ceedsize_idx)); + if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, is_point_block, use_ceedsize_idx)); CeedOperatorDiag_Cuda *diag = impl->diag; + assert(diag != NULL); // Restriction - if (pointBlock && !diag->pbdiagrstr) { - CeedElemRestriction pbdiagrstr; - CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr)); - diag->pbdiagrstr = pbdiagrstr; + if (is_point_block && !diag->point_block_rstr) { + CeedElemRestriction point_block_rstr; + + CeedCallBackend(CreatePointBlockRestriction(diag->diag_rstr, &point_block_rstr)); + diag->point_block_rstr = point_block_rstr; } - CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; + CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_rstr : diag->diag_rstr; // Create diagonal vector - CeedVector elemdiag = pointBlock ? 
diag->pbelemdiag : diag->elemdiag; - if (!elemdiag) { - CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag)); - if (pointBlock) diag->pbelemdiag = elemdiag; - else diag->elemdiag = elemdiag; + CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; + + if (!elem_diag) { + CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); + if (is_point_block) diag->point_block_elem_diag = elem_diag; + else diag->elem_diag = elem_diag; } - CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0)); + CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); // Assemble element operator diagonals - CeedScalar *elemdiagarray; - const CeedScalar *assembledqfarray; - CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray)); - CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray)); - CeedInt nelem; - CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem)); + CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); + CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); // Compute the diagonal of B^T D B - int elemsPerBlock = 1; - int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); - void *args[] = {(void *)&nelem, &diag->d_identity, &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, - &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, &assembledqfarray, &elemdiagarray}; - if (pointBlock) { - CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearPointBlock, grid, diag->nnodes, 1, elemsPerBlock, args)); + int elem_per_block = 1; + int grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 
1 : 0); + void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_interp_out, + &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array}; + if (is_point_block) { + CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearPointBlock, grid, diag->num_nodes, 1, elem_per_block, args)); } else { - CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearDiagonal, grid, diag->nnodes, 1, elemsPerBlock, args)); + CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearDiagonal, grid, diag->num_nodes, 1, elem_per_block, args)); } // Restore arrays - CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray)); - CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray)); + CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array)); + CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); // Assemble local operator diagonal - CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request)); + CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); // Cleanup - CeedCallBackend(CeedVectorDestroy(&assembledqf)); - + CeedCallBackend(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -876,52 +900,54 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, // Single operator assembly setup //------------------------------------------------------------------------------ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { - Ceed ceed; + Ceed ceed; + char *assembly_kernel_path, *assembly_kernel_source; + CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0, + num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp; + CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = 
NULL; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *input_fields, *output_fields; + CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Cuda *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); // Get intput and output fields - CeedInt num_input_fields, num_output_fields; - CeedOperatorField *input_fields; - CeedOperatorField *output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Determine active input basis eval mode - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedQFunctionField *qf_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); // Note that the kernel will treat each dimension of a gradient action separately; - // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment by dim. + // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment by dim. // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so // num_B_in_mats_to_load will be incremented by 1. 
- CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; - CeedEvalMode *eval_mode_in = NULL; // will be of size num_B_in_mats_load - CeedBasis basis_in = NULL; - CeedInt nqpts = 0, esize = 0; - CeedElemRestriction rstr_in = NULL; for (CeedInt i = 0; i < num_input_fields; i++) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &nqpts)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &esize)); - CeedEvalMode eval_mode; + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); eval_mode_in[num_B_in_mats_to_load] = eval_mode; num_B_in_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_emode_in += dim; - size_B_in += dim * esize * nqpts; + num_e_mode_in += dim; + size_B_in += dim * elem_size * num_qpts; } else { - num_emode_in += 1; - size_B_in += esize * nqpts; + num_e_mode_in += 1; + size_B_in += elem_size * num_qpts; } } } @@ -929,94 +955,95 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee // Determine active output basis; basis_out and rstr_out only used if same as input, TODO CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); - CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0; - CeedEvalMode *eval_mode_out = NULL; - CeedBasis basis_out = NULL; - CeedElemRestriction rstr_out = NULL; for (CeedInt i = 0; i < 
num_output_fields; i++) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); - CeedEvalMode eval_mode; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); eval_mode_out[num_B_out_mats_to_load] = eval_mode; num_B_out_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_emode_out += dim; - size_B_out += dim * esize * nqpts; + num_e_mode_out += dim; + size_B_out += dim * elem_size * num_qpts; } else { - num_emode_out += 1; - size_B_out += esize * nqpts; + num_e_mode_out += 1; + size_B_out += elem_size * num_qpts; } } } } - CeedCheck(num_emode_in > 0 && num_emode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); + CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); - CeedInt nelem, ncomp; - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &nelem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &ncomp)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); CeedCallBackend(CeedCalloc(1, &impl->asmb)); CeedOperatorAssemble_Cuda *asmb = impl->asmb; - asmb->nelem = nelem; + asmb->num_elem = num_elem; // Compile kernels - int elemsPerBlock = 1; - asmb->elemsPerBlock = elemsPerBlock; - CeedInt block_size = esize * esize * elemsPerBlock; + int elem_per_block = 1; + asmb->elem_per_block = elem_per_block; + 
CeedInt block_size = elem_size * elem_size * elem_per_block; Ceed_Cuda *cuda_data; + CeedCallBackend(CeedGetData(ceed, &cuda_data)); - char *assembly_kernel_path, *assembly_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock; + if (fallback) { // Use fallback kernel with 1D threadblock - block_size = esize * elemsPerBlock; - asmb->block_size_x = esize; + block_size = elem_size * elem_per_block; + asmb->block_size_x = elem_size; asmb->block_size_y = 1; } else { // Use kernel with 2D threadblock - asmb->block_size_x = esize; - asmb->block_size_y = esize; + asmb->block_size_x = elem_size; + asmb->block_size_y = elem_size; } - CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NELEM", nelem, "NUMEMODEIN", num_emode_in, "NUMEMODEOUT", - num_emode_out, "NQPTS", nqpts, "NNODES", esize, "BLOCK_SIZE", block_size, "NCOMP", ncomp, "CEEDSIZE", - use_ceedsize_idx)); + CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NUM_ELEM", num_elem, "NUM_E_MODE_IN", num_e_mode_in, + "NUM_E_MODE_OUT", num_e_mode_out, "NUM_QPTS", num_qpts, "NUM_NODES", elem_size, "BLOCK_SIZE", block_size, + "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, fallback ? 
"linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); CeedCallBackend(CeedFree(&assembly_kernel_path)); CeedCallBackend(CeedFree(&assembly_kernel_source)); // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) const CeedScalar *interp_in, *grad_in; + CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); // Load into B_in, in order that they will be used in eval_mode const CeedInt inBytes = size_B_in * sizeof(CeedScalar); CeedInt mat_start = 0; + CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, inBytes)); for (int i = 0; i < num_B_in_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_in[i]; + if (eval_mode == CEED_EVAL_INTERP) { - CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); - mat_start += esize * nqpts; + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); + mat_start += elem_size * num_qpts; } else if (eval_mode == CEED_EVAL_GRAD) { - CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); - mat_start += dim * esize * nqpts; + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); + mat_start += dim * elem_size * num_qpts; } } const CeedScalar *interp_out, *grad_out; - // Note that this function currently assumes 1 basis, so this should always be true - // for now + + // Note that this function currently assumes 1 basis, so this should always be true for now if (basis_out == basis_in) { interp_out = interp_in; grad_out = grad_in; @@ -1028,15 +1055,17 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee // Load into B_out, in order that they will be used in eval_mode const CeedInt outBytes = size_B_out * 
sizeof(CeedScalar); mat_start = 0; + CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, outBytes)); for (int i = 0; i < num_B_out_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_out[i]; + if (eval_mode == CEED_EVAL_INTERP) { - CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); - mat_start += esize * nqpts; + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); + mat_start += elem_size * num_qpts; } else if (eval_mode == CEED_EVAL_GRAD) { - CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * esize * nqpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); - mat_start += dim * esize * nqpts; + CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); + mat_start += dim * elem_size * num_qpts; } } return CEED_ERROR_SUCCESS; @@ -1051,26 +1080,27 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee // TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { - Ceed ceed; + Ceed ceed; + CeedSize values_length = 0, assembled_qf_length = 0; + CeedInt use_ceedsize_idx = 0; + CeedScalar *values_array; + const CeedScalar *qf_array; + CeedVector assembled_qf = NULL; + CeedElemRestriction rstr_q = NULL; + CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Cuda *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); // Assemble QFunction - CeedVector assembled_qf = NULL; - CeedElemRestriction rstr_q = NULL; CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); 
CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); - CeedScalar *values_array; CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); values_array += offset; - const CeedScalar *qf_array; CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); - CeedSize values_length = 0, assembled_qf_length = 0; CeedCallBackend(CeedVectorGetLength(values, &values_length)); CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); - CeedInt use_ceedsize_idx = 0; if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; // Setup if (!impl->asmb) { @@ -1079,12 +1109,13 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed } // Compute B^T D B - const CeedInt nelem = impl->asmb->nelem; - const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock; - const CeedInt grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); - void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; + const CeedInt num_elem = impl->asmb->num_elem; + const CeedInt elem_per_block = impl->asmb->elem_per_block; + const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 
1 : 0); + void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; + CeedCallBackend( - CeedRunKernelDim_Cuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elemsPerBlock, args)); + CeedRunKernelDim_Cuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args)); // Restore arrays CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); @@ -1092,7 +1123,6 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed // Cleanup CeedCallBackend(CeedVectorDestroy(&assembled_qf)); - return CEED_ERROR_SUCCESS; } @@ -1100,10 +1130,10 @@ static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, Ceed // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Cuda(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed ceed; CeedOperator_Cuda *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp index 81de176ecd..82ef2a90d7 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction-load.cpp @@ -22,9 +22,14 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) { using std::ostringstream; using std::string; - Ceed ceed; - CeedQFunctionGetCeed(qf, &ceed); + + Ceed ceed; + char *read_write_kernel_path, *read_write_kernel_source; + CeedInt num_input_fields, num_output_fields, size; + CeedQFunctionField *input_fields, *output_fields; CeedQFunction_Cuda *data; + + CeedQFunctionGetCeed(qf, &ceed); CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); // QFunction is built @@ -33,12 +38,9 @@ extern "C" int 
CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) { CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or CUfunction provided."); // QFunction kernel generation - CeedInt num_input_fields, num_output_fields, size; - CeedQFunctionField *input_fields, *output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Build strings for final kernel - char *read_write_kernel_path, *read_write_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-qfunction.h", &read_write_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source)); @@ -116,7 +118,6 @@ extern "C" int CeedQFunctionBuildKernel_Cuda_ref(CeedQFunction qf) { CeedCallBackend(CeedFree(&data->qfunction_source)); CeedCallBackend(CeedFree(&read_write_kernel_path)); CeedCallBackend(CeedFree(&read_write_kernel_source)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunction.c b/backends/cuda-ref/ceed-cuda-ref-qfunction.c index d3b038be31..a2058e027c 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunction.c +++ b/backends/cuda-ref/ceed-cuda-ref-qfunction.c @@ -19,17 +19,18 @@ // Apply QFunction //------------------------------------------------------------------------------ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { - Ceed ceed; + Ceed ceed; + Ceed_Cuda *ceed_Cuda; + CeedInt num_input_fields, num_output_fields; + CeedQFunction_Cuda *data; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); // Build and compile kernel, if not done CeedCallBackend(CeedQFunctionBuildKernel_Cuda_ref(qf)); - CeedQFunction_Cuda *data; CeedCallBackend(CeedQFunctionGetData(qf, &data)); - Ceed_Cuda *ceed_Cuda; CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); - CeedInt 
num_input_fields, num_output_fields; CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Read vectors @@ -57,7 +58,6 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, CeedVector *U, C // Restore context CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c)); - return CEED_ERROR_SUCCESS; } @@ -65,13 +65,13 @@ static int CeedQFunctionApply_Cuda(CeedQFunction qf, CeedInt Q, CeedVector *U, C // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Cuda(CeedQFunction qf) { + Ceed ceed; CeedQFunction_Cuda *data; + CeedCallBackend(CeedQFunctionGetData(qf, &data)); - Ceed ceed; CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); if (data->module) CeedCallCuda(ceed, cuModuleUnload(data->module)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -80,6 +80,7 @@ static int CeedQFunctionDestroy_Cuda(CeedQFunction qf) { //------------------------------------------------------------------------------ static int CeedQFunctionSetCUDAUserFunction_Cuda(CeedQFunction qf, CUfunction f) { CeedQFunction_Cuda *data; + CeedCallBackend(CeedQFunctionGetData(qf, &data)); data->QFunction = f; return CEED_ERROR_SUCCESS; @@ -89,9 +90,10 @@ static int CeedQFunctionSetCUDAUserFunction_Cuda(CeedQFunction qf, CUfunction f) // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Cuda(CeedQFunction qf) { - Ceed ceed; - CeedQFunctionGetCeed(qf, &ceed); + Ceed ceed; CeedQFunction_Cuda *data; + + CeedQFunctionGetCeed(qf, &ceed); CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedQFunctionSetData(qf, data)); diff --git a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c index 106165316f..7b6974dc5d 100644 --- a/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c +++ b/backends/cuda-ref/ceed-cuda-ref-qfunctioncontext.c 
@@ -18,27 +18,25 @@ // Sync host to device //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSyncH2D_Cuda(const CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Cuda *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCheck(impl->h_data, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); if (impl->d_data_borrowed) { impl->d_data = impl->d_data_borrowed; } else if (impl->d_data_owned) { impl->d_data = impl->d_data_owned; } else { - CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_data_owned, ctxsize)); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_data_owned, ctx_size)); impl->d_data = impl->d_data_owned; } - - CeedCallCuda(ceed, cudaMemcpy(impl->d_data, impl->h_data, ctxsize, cudaMemcpyHostToDevice)); - + CeedCallCuda(ceed, cudaMemcpy(impl->d_data, impl->h_data, ctx_size, cudaMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } @@ -46,27 +44,26 @@ static inline int CeedQFunctionContextSyncH2D_Cuda(const CeedQFunctionContext ct // Sync device to host //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSyncD2H_Cuda(const CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Cuda *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCheck(impl->d_data, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, 
&ctxsize)); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); if (impl->h_data_borrowed) { impl->h_data = impl->h_data_borrowed; } else if (impl->h_data_owned) { impl->h_data = impl->h_data_owned; } else { - CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->h_data_owned)); impl->h_data = impl->h_data_owned; } - - CeedCallCuda(ceed, cudaMemcpy(impl->h_data, impl->d_data, ctxsize, cudaMemcpyDeviceToHost)); - + CeedCallCuda(ceed, cudaMemcpy(impl->h_data, impl->d_data, ctx_size, cudaMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -88,11 +85,10 @@ static inline int CeedQFunctionContextSync_Cuda(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSetAllInvalid_Cuda(const CeedQFunctionContext ctx) { CeedQFunctionContext_Cuda *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); impl->h_data = NULL; impl->d_data = NULL; - return CEED_ERROR_SUCCESS; } @@ -101,10 +97,9 @@ static inline int CeedQFunctionContextSetAllInvalid_Cuda(const CeedQFunctionCont //------------------------------------------------------------------------------ static inline int CeedQFunctionContextHasValidData_Cuda(const CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Cuda *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); *has_valid_data = impl && (impl->h_data || impl->d_data); - return CEED_ERROR_SUCCESS; } @@ -114,8 +109,8 @@ static inline int CeedQFunctionContextHasValidData_Cuda(const CeedQFunctionConte static inline int CeedQFunctionContextHasBorrowedDataOfType_Cuda(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { CeedQFunctionContext_Cuda *impl; - 
CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_data_of_type = impl->h_data_borrowed; @@ -124,7 +119,6 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Cuda(const CeedQFunc *has_borrowed_data_of_type = impl->d_data_borrowed; break; } - return CEED_ERROR_SUCCESS; } @@ -132,10 +126,10 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Cuda(const CeedQFunc // Check if data of given type needs sync //------------------------------------------------------------------------------ static inline int CeedQFunctionContextNeedSync_Cuda(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *need_sync) { + bool has_valid_data = true; CeedQFunctionContext_Cuda *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - bool has_valid_data = true; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); switch (mem_type) { case CEED_MEM_HOST: @@ -145,7 +139,6 @@ static inline int CeedQFunctionContextNeedSync_Cuda(const CeedQFunctionContext c *need_sync = has_valid_data && !impl->d_data; break; } - return CEED_ERROR_SUCCESS; } @@ -154,17 +147,18 @@ static inline int CeedQFunctionContextNeedSync_Cuda(const CeedQFunctionContext c //------------------------------------------------------------------------------ static int CeedQFunctionContextSetDataHost_Cuda(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { CeedQFunctionContext_Cuda *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedFree(&impl->h_data_owned)); switch (copy_mode) { case CEED_COPY_VALUES: { - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); + size_t ctx_size; + 
CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->h_data_owned)); impl->h_data_borrowed = NULL; impl->h_data = impl->h_data_owned; - memcpy(impl->h_data, data, ctxsize); + memcpy(impl->h_data, data, ctx_size); } break; case CEED_OWN_POINTER: impl->h_data_owned = data; @@ -176,7 +170,6 @@ static int CeedQFunctionContextSetDataHost_Cuda(const CeedQFunctionContext ctx, impl->h_data = data; break; } - return CEED_ERROR_SUCCESS; } @@ -184,21 +177,22 @@ static int CeedQFunctionContextSetDataHost_Cuda(const CeedQFunctionContext ctx, // Set data from device //------------------------------------------------------------------------------ static int CeedQFunctionContextSetDataDevice_Cuda(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; CeedQFunctionContext_Cuda *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallCuda(ceed, cudaFree(impl->d_data_owned)); impl->d_data_owned = NULL; switch (copy_mode) { case CEED_COPY_VALUES: { - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_data_owned, ctxsize)); + size_t ctx_size; + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_data_owned, ctx_size)); impl->d_data_borrowed = NULL; impl->d_data = impl->d_data_owned; - CeedCallCuda(ceed, cudaMemcpy(impl->d_data, data, ctxsize, cudaMemcpyDeviceToDevice)); + CeedCallCuda(ceed, cudaMemcpy(impl->d_data, data, ctx_size, cudaMemcpyDeviceToDevice)); } break; case CEED_OWN_POINTER: impl->d_data_owned = data; @@ -211,7 +205,6 @@ static int CeedQFunctionContextSetDataDevice_Cuda(const CeedQFunctionContext ctx impl->d_data = data; break; } - return CEED_ERROR_SUCCESS; } @@ 
-221,8 +214,8 @@ static int CeedQFunctionContextSetDataDevice_Cuda(const CeedQFunctionContext ctx //------------------------------------------------------------------------------ static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextSetAllInvalid_Cuda(ctx)); switch (mem_type) { case CEED_MEM_HOST: @@ -230,7 +223,6 @@ static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, cons case CEED_MEM_DEVICE: return CeedQFunctionContextSetDataDevice_Cuda(ctx, copy_mode, data); } - return CEED_ERROR_UNSUPPORTED; } @@ -238,9 +230,10 @@ static int CeedQFunctionContextSetData_Cuda(const CeedQFunctionContext ctx, cons // Take data //------------------------------------------------------------------------------ static int CeedQFunctionContextTakeData_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; CeedQFunctionContext_Cuda *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type @@ -261,7 +254,6 @@ static int CeedQFunctionContextTakeData_Cuda(const CeedQFunctionContext ctx, con impl->d_data = NULL; break; } - return CEED_ERROR_SUCCESS; } @@ -270,13 +262,14 @@ static int CeedQFunctionContextTakeData_Cuda(const CeedQFunctionContext ctx, con // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ static int CeedQFunctionContextGetDataCore_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, 
&ceed)); + Ceed ceed; + bool need_sync = false; CeedQFunctionContext_Cuda *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type - bool need_sync = false; CeedCallBackend(CeedQFunctionContextNeedSync_Cuda(ctx, mem_type, &need_sync)); if (need_sync) CeedCallBackend(CeedQFunctionContextSync_Cuda(ctx, mem_type)); @@ -289,7 +282,6 @@ static int CeedQFunctionContextGetDataCore_Cuda(const CeedQFunctionContext ctx, *(void **)data = impl->d_data; break; } - return CEED_ERROR_SUCCESS; } @@ -305,8 +297,8 @@ static int CeedQFunctionContextGetDataRead_Cuda(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ static int CeedQFunctionContextGetData_Cuda(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { CeedQFunctionContext_Cuda *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedQFunctionContextGetDataCore_Cuda(ctx, mem_type, data)); // Mark only pointer for requested memory as valid @@ -319,7 +311,6 @@ static int CeedQFunctionContextGetData_Cuda(const CeedQFunctionContext ctx, cons impl->d_data = *(void **)data; break; } - return CEED_ERROR_SUCCESS; } @@ -327,15 +318,14 @@ static int CeedQFunctionContextGetData_Cuda(const CeedQFunctionContext ctx, cons // Destroy the user context //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Cuda(const CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; CeedQFunctionContext_Cuda *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallCuda(ceed, 
cudaFree(impl->d_data_owned)); CeedCallBackend(CeedFree(&impl->h_data_owned)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -345,8 +335,8 @@ static int CeedQFunctionContextDestroy_Cuda(const CeedQFunctionContext ctx) { int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx) { CeedQFunctionContext_Cuda *impl; Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Cuda)); @@ -354,10 +344,8 @@ int CeedQFunctionContextCreate_Cuda(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Cuda)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c index abdedcb457..71ed282166 100644 --- a/backends/cuda-ref/ceed-cuda-ref-restriction.c +++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c @@ -22,21 +22,22 @@ // Apply restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + Ceed ceed; + Ceed_Cuda *data; + CUfunction 
kernel; + CeedInt num_elem, elem_size; + const CeedScalar *d_u; + CeedScalar *d_v; CeedElemRestriction_Cuda *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - Ceed ceed; CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); - Ceed_Cuda *data; CeedCallBackend(CeedGetData(ceed, &data)); - const CeedInt num_nodes = impl->num_nodes; - CeedInt num_elem, elem_size; CeedElemRestrictionGetNumElements(r, &num_elem); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - CUfunction kernel; + const CeedInt num_nodes = impl->num_nodes; // Get vectors - const CeedScalar *d_u; - CeedScalar *d_v; CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); if (t_mode == CEED_TRANSPOSE) { // Sum into for transpose mode, e-vec to l-vec @@ -69,6 +70,7 @@ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMod if (impl->d_ind) { // -- Offsets provided CeedInt block_size = 32; + if (impl->OffsetTranspose) { kernel = impl->OffsetTranspose; void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; @@ -103,8 +105,8 @@ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMod //------------------------------------------------------------------------------ static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { CeedElemRestriction_Cuda *impl; - CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); switch (mem_type) { case CEED_MEM_HOST: *offsets = impl->h_ind; @@ -120,10 +122,10 @@ static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemT // Destroy restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { + Ceed ceed; CeedElemRestriction_Cuda *impl; - CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - Ceed ceed; + 
CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallCuda(ceed, cuModuleUnload(impl->module)); CeedCallBackend(CeedFree(&impl->h_ind_allocated)); @@ -132,7 +134,6 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { CeedCallCuda(ceed, cudaFree(impl->d_t_indices)); CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -140,32 +141,32 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { // Create transpose offsets and indices //------------------------------------------------------------------------------ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed; + bool *is_node; + CeedSize l_size; + CeedInt num_elem, elem_size, num_comp, num_nodes = 0; + CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; CeedElemRestriction_Cuda *impl; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - CeedSize l_size; - CeedInt num_elem, elem_size, num_comp; CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); + const CeedInt size_indices = num_elem * elem_size; // Count num_nodes - bool *is_node; CeedCallBackend(CeedCalloc(l_size, &is_node)); - const CeedInt size_indices = num_elem * elem_size; + for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; - CeedInt num_nodes = 0; for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; impl->num_nodes = num_nodes; // L-vector offsets array - CeedInt *ind_to_offset, *l_vec_indices; CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 
CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); - CeedInt j = 0; - for (CeedInt i = 0; i < l_size; i++) { + for (CeedInt i = 0, j = 0; i < l_size; i++) { if (is_node[i]) { l_vec_indices[j] = i; ind_to_offset[i] = j++; @@ -175,9 +176,8 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee // Compute transpose offsets and indices const CeedInt size_offsets = num_nodes + 1; - CeedInt *t_offsets; + CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); - CeedInt *t_indices; CeedCallBackend(CeedMalloc(size_indices, &t_indices)); // Count node multiplicity for (CeedInt e = 0; e < num_elem; ++e) { @@ -188,8 +188,9 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee // List all E-vec indices associated with L-vec node for (CeedInt e = 0; e < num_elem; ++e) { for (CeedInt i = 0; i < elem_size; ++i) { - const CeedInt lid = elem_size * e + i; - const CeedInt gid = indices[lid]; + const CeedInt lid = elem_size * e + i; + const CeedInt gid = indices[lid]; + t_indices[t_offsets[ind_to_offset[gid]]++] = lid; } } @@ -213,7 +214,6 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee CeedCallBackend(CeedFree(&l_vec_indices)); CeedCallBackend(CeedFree(&t_offsets)); CeedCallBackend(CeedFree(&t_indices)); - return CEED_ERROR_SUCCESS; } @@ -222,32 +222,32 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee //------------------------------------------------------------------------------ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, const CeedInt8 *curl_orients, CeedElemRestriction r) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed, ceed_parent; + bool is_deterministic, is_strided; + CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedRestrictionType rstr_type; CeedElemRestriction_Cuda *impl; + + 
CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); - Ceed parent; - CeedCallBackend(CeedGetParent(ceed, &parent)); - bool is_deterministic; - CeedCallBackend(CeedIsDeterministic(parent, &is_deterministic)); - CeedInt num_elem, num_comp, elem_size; + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); + CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt comp_stride = 1; + const CeedInt size = num_elem * elem_size; + CeedInt strides[3] = {1, size, elem_size}; + CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - CeedRestrictionType rstr_type; CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); // Stride data - bool is_strided; CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); if (is_strided) { bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (!has_backend_strides) { CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); @@ -264,7 +264,6 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, impl->d_t_offsets = NULL; impl->num_nodes = size; CeedCallBackend(CeedElemRestrictionSetData(r, impl)); - CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); // Set up device indices/offset arrays @@ -327,15 +326,18 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, // Compile CUDA 
kernels (add atomicAdd function for old NVidia architectures) CeedInt num_nodes = impl->num_nodes; char *restriction_kernel_path, *restriction_kernel_source = NULL; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); if (!is_deterministic) { struct cudaDeviceProp prop; Ceed_Cuda *ceed_data; + CeedCallBackend(CeedGetData(ceed, &ceed_data)); CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { char *atomic_add_path; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path)); CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &restriction_kernel_source)); CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); @@ -346,9 +348,9 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); } CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem, - "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES", - strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2])); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", + strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); diff --git a/backends/cuda-ref/ceed-cuda-ref-vector.c b/backends/cuda-ref/ceed-cuda-ref-vector.c index e05bee8238..284bf476ac 100644 --- a/backends/cuda-ref/ceed-cuda-ref-vector.c +++ b/backends/cuda-ref/ceed-cuda-ref-vector.c @@ -19,10 +19,10 @@ // Check if host/device sync is needed //------------------------------------------------------------------------------ static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_type, bool *need_sync) { + bool has_valid_array = false; CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); - bool has_valid_array = false; + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array)); switch (mem_type) { case CEED_MEM_HOST: @@ -32,7 +32,6 @@ static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_ *need_sync = has_valid_array && !impl->d_array; break; } - return CEED_ERROR_SUCCESS; } @@ -40,14 +39,15 @@ static inline int 
CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_ // Sync host to device //------------------------------------------------------------------------------ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCheck(impl->h_array, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); @@ -59,9 +59,7 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes)); impl->d_array = impl->d_array_owned; } - CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice)); - return CEED_ERROR_SUCCESS; } @@ -69,9 +67,11 @@ static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) { // Sync device to host //------------------------------------------------------------------------------ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); @@ -82,16 +82,16 @@ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { impl->h_array = impl->h_array_owned; } else { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); CeedCallBackend(CeedCalloc(length, &impl->h_array_owned)); impl->h_array = impl->h_array_owned; } - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); - CeedCallCuda(ceed, cudaMemcpy(impl->h_array, 
impl->d_array, bytes, cudaMemcpyDeviceToHost)); + CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -99,8 +99,9 @@ static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) { // Sync arrays //------------------------------------------------------------------------------ static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) { - // Check whether device/host sync is needed bool need_sync = false; + + // Check whether device/host sync is needed CeedCallBackend(CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync)); if (!need_sync) return CEED_ERROR_SUCCESS; @@ -118,11 +119,10 @@ static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) //------------------------------------------------------------------------------ static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) { CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); impl->h_array = NULL; impl->d_array = NULL; - return CEED_ERROR_SUCCESS; } @@ -131,10 +131,9 @@ static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) { //------------------------------------------------------------------------------ static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_valid_array) { CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); *has_valid_array = impl->h_array || impl->d_array; - return CEED_ERROR_SUCCESS; } @@ -143,8 +142,8 @@ static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_v //------------------------------------------------------------------------------ static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) { CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + 
CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_array_of_type = impl->h_array_borrowed || impl->h_array_owned; @@ -153,7 +152,6 @@ static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemTyp *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned; break; } - return CEED_ERROR_SUCCESS; } @@ -162,8 +160,8 @@ static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemTyp //------------------------------------------------------------------------------ static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_array_of_type = impl->h_array_borrowed; @@ -172,7 +170,6 @@ static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, Ce *has_borrowed_array_of_type = impl->d_array_borrowed; break; } - return CEED_ERROR_SUCCESS; } @@ -181,12 +178,13 @@ static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, Ce //------------------------------------------------------------------------------ static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { case CEED_COPY_VALUES: { - CeedSize length; if (!impl->h_array_owned) { + CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); CeedCallBackend(CeedMalloc(length, &impl->h_array_owned)); } @@ -194,6 +192,7 @@ static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode impl->h_array = impl->h_array_owned; if (array) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * 
sizeof(CeedScalar); memcpy(impl->h_array, array, bytes); @@ -211,7 +210,6 @@ static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode impl->h_array = array; break; } - return CEED_ERROR_SUCCESS; } @@ -219,16 +217,18 @@ static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode // Set array from device //------------------------------------------------------------------------------ static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { case CEED_COPY_VALUES: { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); + if (!impl->d_array_owned) { CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes)); } @@ -249,7 +249,6 @@ static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMod impl->d_array = array; break; } - return CEED_ERROR_SUCCESS; } @@ -258,11 +257,11 @@ static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMod // freeing any previously allocated array if applicable //------------------------------------------------------------------------------ static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec)); switch (mem_type) { case CEED_MEM_HOST: @@ -270,7 +269,6 @@ static int CeedVectorSetArray_Cuda(const 
CeedVector vec, const CeedMemType mem_t case CEED_MEM_DEVICE: return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array); } - return CEED_ERROR_UNSUPPORTED; } @@ -291,13 +289,13 @@ int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val // Set a vector to a value //------------------------------------------------------------------------------ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - // Set value for synced device/host array if (!impl->d_array && !impl->h_array) { if (impl->d_array_borrowed) { @@ -320,7 +318,6 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val)); impl->d_array = NULL; } - return CEED_ERROR_SUCCESS; } @@ -328,14 +325,13 @@ static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) { // Vector Take Array //------------------------------------------------------------------------------ static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); - // Update pointer switch (mem_type) { case CEED_MEM_HOST: @@ -349,7 +345,6 @@ static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedSc impl->d_array = NULL; break; } - return CEED_ERROR_SUCCESS; } @@ -358,14 +353,13 @@ static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType 
mem_type, CeedSc // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type CeedCallBackend(CeedVectorSyncArray(vec, mem_type)); - // Update pointer switch (mem_type) { case CEED_MEM_HOST: @@ -375,7 +369,6 @@ static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType m *array = impl->d_array; break; } - return CEED_ERROR_SUCCESS; } @@ -391,10 +384,9 @@ static int CeedVectorGetArrayRead_Cuda(const CeedVector vec, const CeedMemType m //------------------------------------------------------------------------------ static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { CeedVector_Cuda *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetArrayCore_Cuda(vec, mem_type, array)); - CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec)); switch (mem_type) { case CEED_MEM_HOST: @@ -404,7 +396,6 @@ static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_t impl->d_array = *array; break; } - return CEED_ERROR_SUCCESS; } @@ -412,10 +403,10 @@ static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_t // Get write access to a vector via the specified mem_type //------------------------------------------------------------------------------ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { + bool has_array_of_type = true; CeedVector_Cuda *impl; - 
CeedCallBackend(CeedVectorGetData(vec, &impl)); - bool has_array_of_type = true; + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type)); if (!has_array_of_type) { // Allocate if array is not yet allocated @@ -432,7 +423,6 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType else impl->d_array = impl->d_array_owned; } } - return CeedVectorGetArray_Cuda(vec, mem_type, array); } @@ -440,13 +430,15 @@ static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType // Get the norm of a CeedVector //------------------------------------------------------------------------------ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) { - Ceed ceed; + Ceed ceed; + cublasHandle_t handle; + CeedSize length; + const CeedScalar *d_array; + CeedVector_Cuda *impl; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - CeedVector_Cuda *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - cublasHandle_t handle; CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle)); #if CUDA_VERSION < 12000 @@ -454,11 +446,11 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no // we need to check if the vector is too long to handle with int32, // and if so, divide it into subsections for repeated cuBLAS calls. 
CeedSize num_calls = length / INT_MAX; + if (length % INT_MAX > 0) num_calls += 1; #endif // Compute norm - const CeedScalar *d_array; CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); switch (type) { case CEED_NORM_1: { @@ -469,10 +461,12 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no #else float sub_norm = 0.0; float *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); *norm += sub_norm; } @@ -483,10 +477,12 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no #else double sub_norm = 0.0; double *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); *norm += sub_norm; } @@ -501,10 +497,12 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no #else float sub_norm = 0.0, norm_sum = 0.0; float *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); norm_sum += sub_norm * sub_norm; } @@ -516,10 +514,12 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no #else double sub_norm = 0.0, norm_sum = 0.0; double *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); norm_sum += sub_norm * sub_norm; } @@ -531,42 +531,48 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no case CEED_NORM_MAX: { if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { #if CUDA_VERSION >= 12000 - int64_t indx; - CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &indx)); - CeedScalar normNoAbs; - CeedCallCuda(ceed, cudaMemcpy(&normNoAbs, impl->d_array + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); - *norm = fabs(normNoAbs); + int64_t index; + CeedScalar norm_no_abs; + + CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + *norm = fabs(norm_no_abs); #else - CeedInt indx; + CeedInt index; float sub_max = 0.0, current_max = 0.0; float *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &indx)); - CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + + CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } *norm = current_max; #endif } else { #if CUDA_VERSION >= 12000 - int64_t indx; - CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &indx)); - CeedScalar normNoAbs; - CeedCallCuda(ceed, cudaMemcpy(&normNoAbs, impl->d_array + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); - *norm = fabs(normNoAbs); + int64_t index; + CeedScalar norm_no_abs; + + CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + *norm = fabs(norm_no_abs); #else - CeedInt indx; + CeedInt index; double sub_max = 0.0, current_max = 0.0; double *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &indx)); - CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + indx - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); + + CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); + CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost)); if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } *norm = current_max; @@ -576,7 +582,6 @@ static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *no } } CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); - return CEED_ERROR_SUCCESS; } @@ -599,17 +604,16 @@ int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length); // Take reciprocal of a vector //------------------------------------------------------------------------------ static int CeedVectorReciprocal_Cuda(CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - // Set value for synced device/host array if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Cuda(impl->d_array, length)); if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Cuda(impl->h_array, length)); - return CEED_ERROR_SUCCESS; } @@ -630,17 +634,16 @@ int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) // Compute x = alpha x //------------------------------------------------------------------------------ static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(x, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *x_impl; + + CeedCallBackend(CeedVectorGetCeed(x, &ceed)); 
CeedCallBackend(CeedVectorGetData(x, &x_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(x, &length)); - // Set value for synced device/host array if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length)); if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length)); - return CEED_ERROR_SUCCESS; } @@ -661,14 +664,14 @@ int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_arr // Compute y = alpha x + y //------------------------------------------------------------------------------ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(y, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *y_impl, *x_impl; + + CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(y, &length)); - // Set value for synced device/host array if (y_impl->d_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); @@ -678,7 +681,6 @@ static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -699,14 +701,14 @@ int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, // Compute y = alpha x + beta y //------------------------------------------------------------------------------ static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(y, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *y_impl, *x_impl; + + CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); - 
CeedSize length; CeedCallBackend(CeedVectorGetLength(y, &length)); - // Set value for synced device/host array if (y_impl->d_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); @@ -716,7 +718,6 @@ static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); CeedCallBackend(CeedHostAXPBY_Cuda(y_impl->h_array, alpha, beta, x_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -737,15 +738,15 @@ int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedS // Compute the pointwise multiplication w = x .* y //------------------------------------------------------------------------------ static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(w, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Cuda *w_impl, *x_impl, *y_impl; + + CeedCallBackend(CeedVectorGetCeed(w, &ceed)); CeedCallBackend(CeedVectorGetData(w, &w_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(w, &length)); - // Set value for synced device/host array if (!w_impl->d_array && !w_impl->h_array) { CeedCallBackend(CeedVectorSetValue(w, 0.0)); @@ -760,7 +761,6 @@ static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST)); CeedCallBackend(CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, y_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -768,15 +768,14 @@ static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y // Destroy the vector //------------------------------------------------------------------------------ static int CeedVectorDestroy_Cuda(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Cuda *impl; - 
CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallCuda(ceed, cudaFree(impl->d_array_owned)); CeedCallBackend(CeedFree(&impl->h_array_owned)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -786,8 +785,8 @@ static int CeedVectorDestroy_Cuda(const CeedVector vec) { int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) { CeedVector_Cuda *impl; Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda)); @@ -804,10 +803,8 @@ int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedVectorSetData(vec, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c index a9fffe557d..a008cbc4fc 100644 --- a/backends/cuda-ref/ceed-cuda-ref.c +++ b/backends/cuda-ref/ceed-cuda-ref.c @@ -27,8 +27,8 @@ static int CeedGetPreferredMemType_Cuda(CeedMemType *mem_type) { //------------------------------------------------------------------------------ int CeedGetCublasHandle_Cuda(Ceed ceed, cublasHandle_t *handle) { Ceed_Cuda *data; - CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedGetData(ceed, &data)); if (!data->cublas_handle) 
CeedCallCublas(ceed, cublasCreate(&data->cublas_handle)); *handle = data->cublas_handle; return CEED_ERROR_SUCCESS; @@ -38,13 +38,14 @@ int CeedGetCublasHandle_Cuda(Ceed ceed, cublasHandle_t *handle) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Cuda_ref(const char *resource, Ceed ceed) { - char *resource_root; + Ceed_Cuda *data; + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/cuda/ref"), ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); - Ceed_Cuda *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, resource)); diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 6e9bd1b8b5..26ab4ba5c7 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -82,30 +82,29 @@ typedef struct { CUmodule module; CUfunction linearDiagonal; CUfunction linearPointBlock; - CeedBasis basisin, basisout; - CeedElemRestriction diagrstr, pbdiagrstr; - CeedVector elemdiag, pbelemdiag; - CeedInt numemodein, numemodeout, nnodes; - CeedEvalMode *h_emodein, *h_emodeout; - CeedEvalMode *d_emodein, *d_emodeout; - CeedScalar *d_identity, *d_interpin, *d_interpout, *d_gradin, *d_gradout; + CeedBasis basis_in, basis_out; + CeedElemRestriction diag_rstr, point_block_rstr; + CeedVector elem_diag, point_block_elem_diag; + CeedInt num_e_mode_in, num_e_mode_out, num_nodes; + CeedEvalMode *h_e_mode_in, *h_e_mode_out; + CeedEvalMode *d_e_mode_in, *d_e_mode_out; + CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; } CeedOperatorDiag_Cuda; typedef struct { CUmodule module; CUfunction linearAssemble; - CeedInt nelem, block_size_x, block_size_y, elemsPerBlock; + CeedInt 
num_elem, block_size_x, block_size_y, elem_per_block; CeedScalar *d_B_in, *d_B_out; } CeedOperatorAssemble_Cuda; typedef struct { - CeedVector *evecs; // E-vectors, inputs followed by outputs - CeedVector *qvecsin; // Input Q-vectors needed to apply operator - CeedVector *qvecsout; // Output Q-vectors needed to apply operator - CeedInt numein; - CeedInt numeout; - CeedInt qfnumactivein, qfnumactiveout; - CeedVector *qfactivein; + CeedVector *e_vecs; // E-vectors, inputs followed by outputs + CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator + CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator + CeedInt num_inputs, num_outputs; + CeedInt num_active_in, num_active_out; + CeedVector *qf_active_in; CeedOperatorDiag_Cuda *diag; CeedOperatorAssemble_Cuda *asmb; } CeedOperator_Cuda; diff --git a/backends/cuda-ref/kernels/cuda-ref-vector.cu b/backends/cuda-ref/kernels/cuda-ref-vector.cu index 133c626d37..696e42390f 100644 --- a/backends/cuda-ref/kernels/cuda-ref-vector.cu +++ b/backends/cuda-ref/kernels/cuda-ref-vector.cu @@ -13,10 +13,10 @@ //------------------------------------------------------------------------------ __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size, CeedScalar val) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + if (index >= size) return; - vec[idx] = val; + vec[index] = val; } //------------------------------------------------------------------------------ @@ -24,13 +24,13 @@ __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size, //------------------------------------------------------------------------------ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length, CeedScalar val) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = 
length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) - gridsize += 1; - setValueK<<>>(d_array, length, val); + if (block_size * grid_size < vec_size) + grid_size += 1; + setValueK<<>>(d_array, length, val); return 0; } @@ -38,24 +38,24 @@ extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length, // Kernel for taking reciprocal //------------------------------------------------------------------------------ __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + if (index >= size) return; - if (fabs(vec[idx]) > 1E-16) - vec[idx] = 1./vec[idx]; + if (fabs(vec[index]) > 1E-16) + vec[index] = 1./vec[index]; } //------------------------------------------------------------------------------ // Take vector reciprocal in device memory //------------------------------------------------------------------------------ extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) - gridsize += 1; - rcpValueK<<>>(d_array, length); + if (block_size * grid_size < vec_size) + grid_size += 1; + rcpValueK<<>>(d_array, length); return 0; } @@ -64,10 +64,10 @@ extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) { //------------------------------------------------------------------------------ __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + if (index >= size) return; - x[idx] *= 
alpha; + x[index] *= alpha; } //------------------------------------------------------------------------------ @@ -75,13 +75,13 @@ __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha //------------------------------------------------------------------------------ extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) - gridsize += 1; - scaleValueK<<>>(x_array, alpha, length); + if (block_size * grid_size < vec_size) + grid_size += 1; + scaleValueK<<>>(x_array, alpha, length); return 0; } @@ -90,10 +90,10 @@ extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, //------------------------------------------------------------------------------ __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar * __restrict__ x, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + if (index >= size) return; - y[idx] += alpha * x[idx]; + y[index] += alpha * x[index]; } //------------------------------------------------------------------------------ @@ -101,13 +101,13 @@ __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, //------------------------------------------------------------------------------ extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) - gridsize += 1; - axpyValueK<<>>(y_array, 
alpha, x_array, length); + if (block_size * grid_size < vec_size) + grid_size += 1; + axpyValueK<<>>(y_array, alpha, x_array, length); return 0; } @@ -116,11 +116,11 @@ extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, //------------------------------------------------------------------------------ __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar * __restrict__ x, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + if (index >= size) return; - y[idx] = beta * y[idx]; - y[idx] += alpha * x[idx]; + y[index] = beta * y[index]; + y[index] += alpha * x[index]; } //------------------------------------------------------------------------------ @@ -128,13 +128,13 @@ __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha //------------------------------------------------------------------------------ extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) - gridsize += 1; - axpbyValueK<<>>(y_array, alpha, beta, x_array, length); + if (block_size * grid_size < vec_size) + grid_size += 1; + axpbyValueK<<>>(y_array, alpha, beta, x_array, length); return 0; } @@ -143,10 +143,10 @@ extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedS //------------------------------------------------------------------------------ __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx 
>= size) + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + if (index >= size) return; - w[idx] = x[idx] * y[idx]; + w[index] = x[index] * y[index]; } //------------------------------------------------------------------------------ @@ -154,13 +154,13 @@ __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, //------------------------------------------------------------------------------ extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) - gridsize += 1; - pointwiseMultValueK<<>>(w_array, x_array, y_array, length); + if (block_size * grid_size < vec_size) + grid_size += 1; + pointwiseMultValueK<<>>(w_array, x_array, y_array, length); return 0; } diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 0710a8f596..305ea669ea 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -29,19 +29,20 @@ int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, //------------------------------------------------------------------------------ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; + Ceed ceed; + Ceed_Cuda *ceed_Cuda; + CeedInt dim, num_comp; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasis_Cuda_shared *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Cuda *ceed_Cuda; CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); - CeedBasis_Cuda_shared *data; CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedInt dim, num_comp; 
CeedCallBackend(CeedBasisGetDimension(basis, &dim)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); // Read vectors - const CeedScalar *d_u; - CeedScalar *d_v; if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); @@ -50,11 +51,14 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce switch (eval_mode) { case CEED_EVAL_INTERP: { CeedInt P_1d, Q_1d; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P_1d, Q_1d, &data->c_B)); void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v}; + if (dim == 1) { CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); // avoid >512 total threads @@ -94,9 +98,11 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce } break; case CEED_EVAL_GRAD: { CeedInt P_1d, Q_1d; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + if (data->d_collo_grad_1d) { CeedCallBackend(CeedInit_CudaCollocatedGrad(data->d_interp_1d, data->d_collo_grad_1d, P_1d, Q_1d, &data->c_B, &data->c_G)); } else { @@ -140,6 +146,7 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, Ce } break; case CEED_EVAL_WEIGHT: { CeedInt Q_1d; + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; if (dim == 1) { @@ -186,21 +193,17 @@ int CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt num_elem, 
Ce // Destroy basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - + Ceed ceed; CeedBasis_Cuda_shared *data; - CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallCuda(ceed, cuModuleUnload(data->module)); - CeedCallCuda(ceed, cudaFree(data->d_q_weight_1d)); CeedCallCuda(ceed, cudaFree(data->d_interp_1d)); CeedCallCuda(ceed, cudaFree(data->d_grad_1d)); CeedCallCuda(ceed, cudaFree(data->d_collo_grad_1d)); - CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -209,28 +212,31 @@ static int CeedBasisDestroy_Cuda_shared(CeedBasis basis) { //------------------------------------------------------------------------------ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; + CeedInt num_comp; + const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); + const CeedInt interp_bytes = q_bytes * P_1d; CeedBasis_Cuda_shared *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU - const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight_1d, q_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, cudaMemcpyHostToDevice)); - - const CeedInt interp_bytes = q_bytes * P_1d; CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp_1d, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_interp_1d, interp_1d, interp_bytes, cudaMemcpyHostToDevice)); - CeedCallCuda(ceed, cudaMalloc((void 
**)&data->d_grad_1d, interp_bytes)); CeedCallCuda(ceed, cudaMemcpy(data->d_grad_1d, grad_1d, interp_bytes, cudaMemcpyHostToDevice)); // Compute collocated gradient and copy to GPU data->d_collo_grad_1d = NULL; bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; + if (has_collocated_grad) { CeedScalar *collo_grad_1d; + CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); CeedCallCuda(ceed, cudaMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); @@ -239,9 +245,7 @@ int CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, } // Compile basis kernels - CeedInt num_comp; CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - char *basis_kernel_path, *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); diff --git a/backends/cuda-shared/ceed-cuda-shared.c b/backends/cuda-shared/ceed-cuda-shared.c index 73add452da..2ff41d34b3 100644 --- a/backends/cuda-shared/ceed-cuda-shared.c +++ b/backends/cuda-shared/ceed-cuda-shared.c @@ -18,23 +18,23 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Cuda_shared(const char *resource, Ceed ceed) { - char *resource_root; + Ceed ceed_ref; + Ceed_Cuda *data; + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/cuda/shared"), ceed, CEED_ERROR_BACKEND, "Cuda backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); - Ceed_Cuda *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Cuda(ceed, resource)); - Ceed ceed_ref; 
CeedCallBackend(CeedInit("/gpu/cuda/ref", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis.cu index 118055f1f7..4d61f9af0e 100644 --- a/backends/cuda-shared/kernels/cuda-shared-basis.cu +++ b/backends/cuda-shared/kernels/cuda-shared-basis.cu @@ -18,9 +18,9 @@ __constant__ CeedScalar c_G[sizeMax*sizeMax]; extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr) { const int bytes = P_1d*Q_1d*sizeof(CeedScalar); + cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice); cudaGetSymbolAddress((void **)c_B_ptr, c_B); - return CEED_ERROR_SUCCESS; } @@ -30,11 +30,11 @@ extern "C" int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) { const int bytes = P_1d*Q_1d*sizeof(CeedScalar); + cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice); cudaGetSymbolAddress((void **)c_B_ptr, c_B); cudaMemcpyToSymbol(c_G, d_G, bytes, 0, cudaMemcpyDeviceToDevice); cudaGetSymbolAddress((void **)c_G_ptr, c_G); - return CEED_ERROR_SUCCESS; } @@ -44,12 +44,12 @@ extern "C" int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, extern "C" int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr) { const int bytes_interp = P_1d*Q_1d*sizeof(CeedScalar); + const int bytes_grad = Q_1d*Q_1d*sizeof(CeedScalar); + cudaMemcpyToSymbol(c_B, d_B, bytes_interp, 0, cudaMemcpyDeviceToDevice); cudaGetSymbolAddress((void **)c_B_ptr, c_B); - const int 
bytes_grad = Q_1d*Q_1d*sizeof(CeedScalar); cudaMemcpyToSymbol(c_G, d_G, bytes_grad, 0, cudaMemcpyDeviceToDevice); cudaGetSymbolAddress((void **)c_G_ptr, c_G); - return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda/ceed-cuda-common.c b/backends/cuda/ceed-cuda-common.c index 077f1b4f0d..ea9c63b213 100644 --- a/backends/cuda/ceed-cuda-common.c +++ b/backends/cuda/ceed-cuda-common.c @@ -17,16 +17,17 @@ // Device information backend init //------------------------------------------------------------------------------ int CeedInit_Cuda(Ceed ceed, const char *resource) { + Ceed_Cuda *data; const char *device_spec = strstr(resource, ":device_id="); const int device_id = (device_spec) ? atoi(device_spec + 11) : -1; + int current_device_id; - int current_device_id; CeedCallCuda(ceed, cudaGetDevice(¤t_device_id)); if (device_id >= 0 && current_device_id != device_id) { CeedCallCuda(ceed, cudaSetDevice(device_id)); current_device_id = device_id; } - Ceed_Cuda *data; + CeedCallBackend(CeedGetData(ceed, &data)); data->device_id = current_device_id; CeedCallCuda(ceed, cudaGetDeviceProperties(&data->device_prop, current_device_id)); @@ -38,6 +39,7 @@ int CeedInit_Cuda(Ceed ceed, const char *resource) { //------------------------------------------------------------------------------ int CeedDestroy_Cuda(Ceed ceed) { Ceed_Cuda *data; + CeedCallBackend(CeedGetData(ceed, &data)); if (data->cublas_handle) CeedCallCublas(ceed, cublasDestroy(data->cublas_handle)); CeedCallBackend(CeedFree(&data)); diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index 88e46f4dad..1e5aa844ca 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -35,8 +35,15 @@ // Compile CUDA kernel //------------------------------------------------------------------------------ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) 
{ + size_t ptx_size; + char *jit_defs_path, *jit_defs_source, *ptx; + const int num_opts = 3; + const char *opts[num_opts]; + nvrtcProgram prog; + struct cudaDeviceProp prop; + Ceed_Cuda *ceed_data; + cudaFree(0); // Make sure a Context exists for nvrtc - nvrtcProgram prog; std::ostringstream code; @@ -46,6 +53,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed va_start(args, num_defines); char *name; int val; + for (int i = 0; i < num_defines; i++) { name = va_arg(args, char *); val = va_arg(args, int); @@ -55,7 +63,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed } // Standard libCEED definitions for CUDA backends - char *jit_defs_path, *jit_defs_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-jit.h", &jit_defs_path)); CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source)); code << jit_defs_source; @@ -64,11 +71,7 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed CeedCallBackend(CeedFree(&jit_defs_source)); // Non-macro options - const int num_opts = 3; - const char *opts[num_opts]; opts[0] = "-default-device"; - struct cudaDeviceProp prop; - Ceed_Cuda *ceed_data; CeedCallBackend(CeedGetData(ceed, &ceed_data)); CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id)); std::string arch_arg = "-arch=compute_" + std::to_string(prop.major) + std::to_string(prop.minor); @@ -83,18 +86,18 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed // Compile kernel nvrtcResult result = nvrtcCompileProgram(prog, num_opts, opts); + if (result != NVRTC_SUCCESS) { + char *log; size_t log_size; + CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); - char *log; CeedCallBackend(CeedMalloc(log_size, &log)); CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); } - size_t ptx_size; 
CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size)); - char *ptx; CeedCallBackend(CeedMalloc(ptx_size, &ptx)); CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx)); CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog)); @@ -119,9 +122,10 @@ int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction //------------------------------------------------------------------------------ int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void **args) { int min_grid_size, max_block_size; + CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, kernel, NULL, 0, 0x10000)); CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args)); - return 0; + return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ @@ -150,8 +154,10 @@ int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_siz cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size); #endif CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); + if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) { int max_threads_per_block, shared_size_bytes, num_regs; + cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 022e83bc67..4a7477c324 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -28,20 +28,23 @@ extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_e const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); if (dim == 1) { CeedInt 
elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; - elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - block_sizes[0] = thread1d; - block_sizes[1] = 1; - block_sizes[2] = elems_per_block; + + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + block_sizes[0] = thread1d; + block_sizes[1] = 1; + block_sizes[2] = elems_per_block; } else if (dim == 2) { const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; - block_sizes[2] = elems_per_block; + + block_sizes[0] = thread1d; + block_sizes[1] = thread1d; + block_sizes[2] = elems_per_block; } else if (dim == 3) { const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; - block_sizes[2] = elems_per_block; + + block_sizes[0] = thread1d; + block_sizes[1] = thread1d; + block_sizes[2] = elems_per_block; } return CEED_ERROR_SUCCESS; } @@ -52,37 +55,40 @@ extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_e extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { using std::ostringstream; using std::string; - bool is_setup_done; + + Ceed ceed; + bool is_setup_done, is_identity_qf; + CeedSize l_size; + CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedElemRestriction_Hip *rstr_data; + CeedBasis basis; + CeedBasis_Hip_shared *basis_data; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip_gen *data; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Hip_gen *data; CeedCallBackend(CeedOperatorGetData(op, &data)); - CeedQFunction qf; - 
CeedQFunction_Hip_gen *qf_data; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedSize lsize; - CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); Q_1d = Q; - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedBasis basis; - CeedBasis_Hip_shared *basis_data; - CeedElemRestriction Erestrict; - CeedElemRestriction_Hip *restr_data; // TODO: put in a function? // Check for restriction only identity operator - bool is_identity_qf; CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); if (is_identity_qf) { CeedEvalMode eval_mode_in, eval_mode_out; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, @@ -95,6 +101,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { // TODO: generalize to accept different device functions? 
{ char *tensor_basis_kernel_path, *tensor_basis_kernel_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source)); @@ -104,6 +111,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { } { char *hip_gen_template_path, *hip_gen_template_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-gen-templates.h", &hip_gen_template_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Hip-Gen Template Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source)); @@ -123,14 +131,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { + bool is_tensor; + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); // Collect dim, P_1d, and Q_1d CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - bool isTensor; - CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); - CeedCheck(isTensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; @@ -142,14 +151,15 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { 
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { + bool is_tensor; + CeedCallBackend(CeedBasisGetData(basis, &basis_data)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Collect Q_1d CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - bool isTensor; - CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); - CeedCheck(isTensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + CeedCheck(is_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); } } @@ -159,8 +169,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { // Only use 3D collocated gradient parallelization strategy when gradient is computed // TODO: put in a function? bool use_collograd_parallelization = false; + if (dim == 3) { bool was_grad_found = false; + for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { @@ -225,10 +237,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { code << " // ---- Input field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, 
&num_comp)); // Set field constants if (eval_mode != CEED_EVAL_WEIGHT) { @@ -283,10 +295,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " // ---- Output field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Set field constants CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); @@ -349,35 +361,38 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { code << " // ---- Input field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Restriction if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { + bool is_strided; + code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; - bool 
is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); - code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_data)); - data->indices.inputs[i] = restr_data->d_ind; - code << " readDofsOffset" << dim << "d(data, lsize_in_" << i + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.inputs[i] = rstr_data->d_ind; + code << " readDofsOffset" << dim << "d(data, l_size_in_" << i << ", elem, indices.inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; } else { - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " readDofsStrided" << dim << "dindices.inputs[i] = restr_data->d_ind; + 
CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.inputs[i] = rstr_data->d_ind; code << " readSliceQuadsOffset" - << "3d(data, lsize_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" + << "3d(data, l_size_in_" << i << ", elem, q, indices.inputs[" << i << "], d_u_" << i << ", r_q_" << i << ");\n"; } else { - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " readSliceQuadsStrided" @@ -611,10 +630,10 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " // ---- Output field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + 
CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // TODO put in a function // Basis action code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; @@ -643,6 +662,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); break; // Should not occur @@ -656,25 +676,29 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { // TODO put in a function // Restriction bool is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); - code << " const CeedInt lsize_out_" << i << " = " << lsize << ";\n"; CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_data)); - data->indices.outputs[i] = restr_data->d_ind; - code << " writeDofsOffset" << dim << "d(data, lsize_out_" << i + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); + data->indices.outputs[i] = rstr_data->d_ind; + code << " writeDofsOffset" << dim << "d(data, l_size_out_" << i << ", elem, indices.outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; } else { - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - 
CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " writeDofsStrided" << dim << "dmax_P_1d, Q_1d, block_sizes)); CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c index 686c73c131..bd35fe3aa7 100644 --- a/backends/hip-gen/ceed-hip-gen-operator.c +++ b/backends/hip-gen/ceed-hip-gen-operator.c @@ -20,6 +20,7 @@ //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) { CeedOperator_Hip_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -29,22 +30,23 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) { // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - Ceed ceed; + Ceed ceed; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedVector output_vecs[CEED_FIELD_MAX] = {NULL}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Hip_gen *qf_data; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip_gen *data; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 
- CeedOperator_Hip_gen *data; CeedCallBackend(CeedOperatorGetData(op, &data)); - CeedQFunction qf; - CeedQFunction_Hip_gen *qf_data; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedInt num_elem, num_input_fields, num_output_fields; CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedVector vec, output_vecs[CEED_FIELD_MAX] = {NULL}; // Creation of the operator CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op)); @@ -55,6 +57,8 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.inputs[i] = NULL; } else { + CeedVector vec; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; @@ -68,6 +72,8 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C if (eval_mode == CEED_EVAL_WEIGHT) { // Skip data->fields.outputs[i] = NULL; } else { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; @@ -98,18 +104,22 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C const CeedInt P_1d = data->max_P_1d; const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); CeedInt block_sizes[3]; + CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, P_1d, Q_1d, block_sizes)); if (dim == 1) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); } else if (dim == 2) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); } else if (dim == 3) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); } @@ -118,6 +128,8 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->fields.inputs[i])); @@ -129,6 +141,8 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; // Check for multiple output modes @@ -147,7 +161,6 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); - return 
CEED_ERROR_SUCCESS; } @@ -155,13 +168,12 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Hip_gen(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed ceed; CeedOperator_Hip_gen *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Hip_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "Destroy", CeedOperatorDestroy_Hip_gen)); return CEED_ERROR_SUCCESS; diff --git a/backends/hip-gen/ceed-hip-gen-qfunction.c b/backends/hip-gen/ceed-hip-gen-qfunction.c index 3ee17b5211..22e254129b 100644 --- a/backends/hip-gen/ceed-hip-gen-qfunction.c +++ b/backends/hip-gen/ceed-hip-gen-qfunction.c @@ -17,6 +17,7 @@ //------------------------------------------------------------------------------ static int CeedQFunctionApply_Hip_gen(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement QFunctionApply"); } @@ -25,9 +26,10 @@ static int CeedQFunctionApply_Hip_gen(CeedQFunction qf, CeedInt Q, CeedVector *U // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Hip_gen(CeedQFunction qf) { + Ceed ceed; CeedQFunction_Hip_gen *data; + CeedCallBackend(CeedQFunctionGetData(qf, &data)); - Ceed ceed; CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallHip(ceed, hipFree(data->d_c)); CeedCallBackend(CeedFree(&data->q_function_source)); @@ -39,9 +41,10 @@ static int CeedQFunctionDestroy_Hip_gen(CeedQFunction qf) { // Create QFunction 
//------------------------------------------------------------------------------ int CeedQFunctionCreate_Hip_gen(CeedQFunction qf) { - Ceed ceed; - CeedQFunctionGetCeed(qf, &ceed); + Ceed ceed; CeedQFunction_Hip_gen *data; + + CeedQFunctionGetCeed(qf, &ceed); CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedQFunctionSetData(qf, data)); diff --git a/backends/hip-gen/ceed-hip-gen.c b/backends/hip-gen/ceed-hip-gen.c index ee46fe0498..62d97bbc65 100644 --- a/backends/hip-gen/ceed-hip-gen.c +++ b/backends/hip-gen/ceed-hip-gen.c @@ -17,28 +17,28 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Hip_gen(const char *resource, Ceed ceed) { - char *resource_root; + char *resource_root; + const char fallback_resource[] = "/gpu/hip/ref"; + Ceed ceed_shared; + Ceed_Hip *data; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/hip") || !strcmp(resource_root, "/gpu/hip/gen"), ceed, CEED_ERROR_BACKEND, "Hip backend cannot use resource: %s", resource); CeedCallBackend(CeedFree(&resource_root)); - Ceed_Hip *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Hip(ceed, resource)); - Ceed ceed_shared; CeedCallBackend(CeedInit("/gpu/hip/shared", &ceed_shared)); CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); - const char fallbackresource[] = "/gpu/hip/ref"; - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Hip_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip)); - return CEED_ERROR_SUCCESS; } diff --git 
a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c index 7043321b1e..4927f1e472 100644 --- a/backends/hip-ref/ceed-hip-ref-basis.c +++ b/backends/hip-ref/ceed-hip-ref-basis.c @@ -18,18 +18,20 @@ // Basis apply - tensor //------------------------------------------------------------------------------ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; + Ceed ceed; + Ceed_Hip *ceed_Hip; + CeedInt Q_1d, dim; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 64; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasis_Hip *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Hip *ceed_Hip; CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedBasis_Hip *data; CeedCallBackend(CeedBasisGetData(basis, &data)); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 64; // Read vectors - const CeedScalar *d_u; - CeedScalar *d_v; if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); @@ -37,10 +39,10 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod // Clear v for transpose operation if (t_mode == CEED_TRANSPOSE) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(v, &length)); CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); } - CeedInt Q_1d, dim; CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); @@ -91,22 +93,23 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod //------------------------------------------------------------------------------ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, 
const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; + Ceed ceed; + Ceed_Hip *ceed_Hip; + CeedInt num_nodes, num_qpts; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + int elems_per_block = 1; + int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasisNonTensor_Hip *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Hip *ceed_Hip; CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedBasisNonTensor_Hip *data; CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedInt num_nodes, num_qpts; CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - int elems_per_block = 1; - int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); // Read vectors - const CeedScalar *d_u; - CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } @@ -115,6 +118,7 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra // Clear v for transpose operation if (t_mode == CEED_TRANSPOSE) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(v, &length)); CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar))); } @@ -163,19 +167,16 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra // Destroy tensor basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Hip(CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - + Ceed ceed; CeedBasis_Hip *data; - CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedBasisGetData(basis, &data)); 
CeedCallHip(ceed, hipModuleUnload(data->module)); - CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); CeedCallHip(ceed, hipFree(data->d_interp_1d)); CeedCallHip(ceed, hipFree(data->d_grad_1d)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -183,19 +184,16 @@ static int CeedBasisDestroy_Hip(CeedBasis basis) { // Destroy non-tensor basis //------------------------------------------------------------------------------ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - + Ceed ceed; CeedBasisNonTensor_Hip *data; - CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallHip(ceed, hipModuleUnload(data->module)); - CeedCallHip(ceed, hipFree(data->d_q_weight)); CeedCallHip(ceed, hipFree(data->d_interp)); CeedCallHip(ceed, hipFree(data->d_grad)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -204,34 +202,33 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { //------------------------------------------------------------------------------ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; + CeedInt num_comp; + const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); + const CeedInt interp_bytes = q_bytes * P_1d; CeedBasis_Hip *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &data)); // Copy data to GPU - const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); - - const CeedInt 
interp_bytes = q_bytes * P_1d; CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); - CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); // Compile basis kernels - CeedInt ncomp; - CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); - char *basis_kernel_path, *basis_kernel_source; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", - ncomp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", ncomp, "BASIS_NUM_NODES", - CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); + num_comp * CeedIntPow(Q_1d > P_1d ? 
Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, + "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim))); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); @@ -250,34 +247,33 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C //------------------------------------------------------------------------------ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; + CeedInt num_comp; + const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + const CeedInt interp_bytes = q_bytes * num_nodes; + const CeedInt grad_bytes = q_bytes * num_nodes * dim; CeedBasisNonTensor_Hip *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU - const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); - - const CeedInt interp_bytes = q_bytes * num_nodes; CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); - - const CeedInt grad_bytes = q_bytes * num_nodes * dim; CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); // Compile basis kernels - CeedInt ncomp; - CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); - 
char *basis_kernel_path, *basis_kernel_source; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, - "BASIS_NUM_COMP", ncomp)); + "BASIS_NUM_COMP", num_comp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index 6417cc91a3..90dfd6919d 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -22,52 +22,55 @@ //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Hip(CeedOperator op) { CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); // Apply data - for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->evecs[i])); + for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); } - CeedCallBackend(CeedFree(&impl->evecs)); + CeedCallBackend(CeedFree(&impl->e_vecs)); - for (CeedInt i = 0; i < impl->numein; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i])); + for (CeedInt i = 0; i < impl->num_inputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } - 
CeedCallBackend(CeedFree(&impl->qvecsin)); + CeedCallBackend(CeedFree(&impl->q_vecs_in)); - for (CeedInt i = 0; i < impl->numeout; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i])); + for (CeedInt i = 0; i < impl->num_outputs; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } - CeedCallBackend(CeedFree(&impl->qvecsout)); + CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly data - for (CeedInt i = 0; i < impl->qfnumactivein; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i])); + for (CeedInt i = 0; i < impl->num_active_in; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); } - CeedCallBackend(CeedFree(&impl->qfactivein)); + CeedCallBackend(CeedFree(&impl->qf_active_in)); // Diag data if (impl->diag) { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); - CeedCallBackend(CeedFree(&impl->diag->h_emodein)); - CeedCallBackend(CeedFree(&impl->diag->h_emodeout)); - CeedCallHip(ceed, hipFree(impl->diag->d_emodein)); - CeedCallHip(ceed, hipFree(impl->diag->d_emodeout)); + CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); + CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); + CeedCallHip(ceed, hipFree(impl->diag->d_e_mode_in)); + CeedCallHip(ceed, hipFree(impl->diag->d_e_mode_out)); CeedCallHip(ceed, hipFree(impl->diag->d_identity)); - CeedCallHip(ceed, hipFree(impl->diag->d_interpin)); - CeedCallHip(ceed, hipFree(impl->diag->d_interpout)); - CeedCallHip(ceed, hipFree(impl->diag->d_gradin)); - CeedCallHip(ceed, hipFree(impl->diag->d_gradout)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag)); + CeedCallHip(ceed, hipFree(impl->diag->d_interp_in)); + CeedCallHip(ceed, hipFree(impl->diag->d_interp_out)); + CeedCallHip(ceed, hipFree(impl->diag->d_grad_in)); + CeedCallHip(ceed, 
hipFree(impl->diag->d_grad_out)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); } CeedCallBackend(CeedFree(&impl->diag)); if (impl->asmb) { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallHip(ceed, hipModuleUnload(impl->asmb->module)); CeedCallHip(ceed, hipFree(impl->asmb->d_B_in)); @@ -82,88 +85,87 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { //------------------------------------------------------------------------------ // Setup infields or outfields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte, - CeedInt numfields, CeedInt Q, CeedInt numelements) { - CeedInt dim, size; - CeedSize q_size; - Ceed ceed; +static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, + CeedInt num_fields, CeedInt Q, CeedInt num_elem) { + Ceed ceed; + CeedQFunctionField *qf_fields; + CeedOperatorField *op_fields; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedBasis basis; - CeedElemRestriction Erestrict; - CeedOperatorField *opfields; - CeedQFunctionField *qffields; - CeedVector fieldvec; - bool strided; - bool skiprestrict; - - if (isinput) { - CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); + if (is_input) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); } else { - CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, 
&qffields)); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); } // Loop over fields - for (CeedInt i = 0; i < numfields; i++) { - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - - strided = false; - skiprestrict = false; - if (emode != CEED_EVAL_WEIGHT) { - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict)); + for (CeedInt i = 0; i < num_fields; i++) { + bool is_strided, skip_restriction; + CeedSize q_size; + CeedInt dim, size; + CeedEvalMode e_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + CeedBasis basis; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + is_strided = false; + skip_restriction = false; + if (e_mode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); // Check whether this field can skip the element restriction: - // must be passive input, with emode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. + // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 
// First, check whether the field is input or output: - if (isinput) { + if (is_input) { // Check for passive input: - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec)); - if (fieldvec != CEED_VECTOR_ACTIVE) { - // Check emode - if (emode == CEED_EVAL_NONE) { + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec != CEED_VECTOR_ACTIVE) { + // Check e_mode + if (e_mode == CEED_EVAL_NONE) { // Check for strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided)); - if (strided) { + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) { // Check if vector is already in preferred backend ordering - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &skip_restriction)); } } } } - if (skiprestrict) { + if (skip_restriction) { // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. 
- evecs[i + starte] = NULL; + e_vecs[i + start_e] = NULL; } else { - CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte])); + CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); } } - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_WEIGHT: // Only on input fields - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); - q_size = (CeedSize)numelements * Q; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, qvecs[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); 
+ q_size = (CeedSize)num_elem * Q; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); break; case CEED_EVAL_DIV: break; // TODO: Not implemented @@ -178,38 +180,40 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i // CeedOperator needs to connect all the named fields (be they active or passive) to the named inputs and outputs of its CeedQFunction. //------------------------------------------------------------------------------ static int CeedOperatorSetup_Hip(CeedOperator op) { - bool setupdone; - CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone)); - if (setupdone) return CEED_ERROR_SUCCESS; - Ceed ceed; + Ceed ceed; + bool is_setup_done; + CeedInt Q, num_elem, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) return CEED_ERROR_SUCCESS; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Hip *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, numelements, numinputfields, numoutputfields; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedOperatorField *opinputfields, *opoutputfields; - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, 
&num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs)); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); - impl->numein = numinputfields; - impl->numeout = numoutputfields; + impl->num_inputs = num_input_fields; + impl->num_outputs = num_output_fields; - // Set up infield and outfield evecs and qvecs + // Set up infield and outfield e_vecs and q_vecs // Infields - CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements)); + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); // Outfields - CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements)); + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; @@ -218,37 +222,37 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { //------------------------------------------------------------------------------ // Setup Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Hip(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl, - CeedRequest *request) { - 
CeedEvalMode emode; - CeedVector vec; - CeedElemRestriction Erestrict; - - for (CeedInt i = 0; i < numinputfields; i++) { +static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], + CeedOperator_Hip *impl, CeedRequest *request) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - if (skipactive) continue; - else vec = invec; + if (skip_active) continue; + else vec = in_vec; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_WEIGHT) { // Skip } else { // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Get input element restriction - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); - if (vec == CEED_VECTOR_ACTIVE) vec = invec; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; // Restrict, if necessary - if (!impl->evecs[i]) { + if (!impl->e_vecs[i]) { // No restriction for this field; read data directly from vec. 
- CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } else { - CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); // Get evec - CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } } } @@ -258,38 +262,39 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt numinputfields, CeedQFunct //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Hip(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], +static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) { - CeedInt elemsize, size; - CeedElemRestriction Erestrict; - CeedEvalMode emode; - CeedBasis basis; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt elem_size, size; + CeedEvalMode e_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; - for (CeedInt i = 0; i < numinputfields; i++) { // Skip active input - if (skipactive) { + if (skip_active) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } - // Get 
elemsize, emode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); + // Get elem_size, e_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); break; case CEED_EVAL_WEIGHT: break; // No action @@ -305,25 +310,24 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt numelements, CeedQFunctionF 
//------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Hip(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) { - CeedEvalMode emode; - CeedVector vec; - - for (CeedInt i = 0; i < numinputfields; i++) { +static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; // Skip active input - if (skipactive) { - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + if (skip_active) { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_WEIGHT) { // Skip } else { - if (!impl->evecs[i]) { // This was a skiprestrict case - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); - CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); + if (!impl->e_vecs[i]) { // This was a skip_restriction case + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); } else { - CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); } } } @@ -333,68 +337,72 
@@ static inline int CeedOperatorRestoreInputs_Hip(CeedInt numinputfields, CeedQFun //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { - CeedOperator_Hip *impl; +static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedOperatorField *opinputfields, *opoutputfields; - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); - CeedEvalMode emode; - CeedVector vec; - CeedBasis basis; - CeedElemRestriction Erestrict; - CeedScalar *edata[2 * CEED_FIELD_MAX] = {NULL}; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Setup CeedCallBackend(CeedOperatorSetup_Hip(op)); // Input Evecs and Restriction - 
CeedCallBackend(CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request)); + CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); // Input basis apply if needed - CeedCallBackend(CeedOperatorInputBasis_Hip(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl)); + CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); // Output pointers, as necessary - for (CeedInt i = 0; i < numoutputfields; i++) { - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - if (emode == CEED_EVAL_NONE) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode e_mode; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_NONE) { // Set the output Q-Vector to use the E-Vector data directly. - CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields])); - CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields])); + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); } } // Q function - CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout)); + CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); // Output basis apply if needed - for (CeedInt i = 0; i < numoutputfields; i++) { - // Get elemsize, emode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); - 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode e_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + + // Get elem_size, e_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); // Basis action - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); break; // Should not occur @@ -408,24 +416,28 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector invec, CeedVecto } // Output restriction - for 
(CeedInt i = 0; i < numoutputfields; i++) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + // Restore evec - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - if (emode == CEED_EVAL_NONE) { - CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields])); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_NONE) { + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); } // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Restrict - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); // Active - if (vec == CEED_VECTOR_ACTIVE) vec = outvec; + if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; - CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_inputs], vec, request)); } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields, opinputfields, false, edata, impl)); + CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); return CEED_ERROR_SUCCESS; } @@ -434,132 +446,144 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector invec, CeedVecto //------------------------------------------------------------------------------ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - Ceed ceed, 
ceedparent; - CeedOperator_Hip *impl; - CeedQFunction qf; - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedOperatorField *opinputfields, *opoutputfields; - CeedVector vec, *activein; - CeedInt numactivein, numactiveout, Q, numelements, numinputfields, numoutputfields, size; + Ceed ceed, ceed_parent; + bool is_identity_qf; CeedSize q_size; - CeedScalar *a, *tmp, *edata[2 * CEED_FIELD_MAX] = {NULL}; + CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; + CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedVector *active_in; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceedparent)); + CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); CeedCallBackend(CeedOperatorGetData(op, &impl)); - activein = impl->qfactivein; - numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + active_in = impl->qf_active_in; + num_active_in = impl->num_active_in; + num_active_out = impl->num_active_out; // Setup CeedCallBackend(CeedOperatorSetup_Hip(op)); // Check for identity - bool identityqf; - 
CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf)); - CeedCheck(!identityqf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); + CeedCheck(!is_identity_qf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Hip(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request)); + CeedCallBackend(CeedOperatorSetupInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); // Count number of active input fields - if (!numactivein) { - for (CeedInt i = 0; i < numinputfields; i++) { + if (!num_active_in) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedScalar *q_vec_array; + CeedVector vec; + // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); - CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp)); - CeedCallBackend(CeedRealloc(numactivein + size, &activein)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); + CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q * numelements; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field])); - CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements])); + q_size = (CeedSize)Q * num_elem; + 
CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); + CeedCallBackend( + CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); } - numactivein += size; - CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp)); + num_active_in += size; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } } - impl->qfnumactivein = numactivein; - impl->qfactivein = activein; + impl->num_active_in = num_active_in; + impl->qf_active_in = active_in; } // Count number of active output fields - if (!numactiveout) { - for (CeedInt i = 0; i < numoutputfields; i++) { + if (!num_active_out) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); - numactiveout += size; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); + num_active_out += size; } } - impl->qfnumactiveout = numactiveout; + impl->num_active_out = num_active_out; } // Check sizes - CeedCheck(numactivein > 0 && numactiveout > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); + CeedCheck(num_active_in > 0 && num_active_out > 0, ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); // Build objects if needed if (build_objects) { // Create output restriction - CeedInt strides[3] = {1, numelements * Q, Q}; /* *NOPAD* */ - CeedCallBackend(CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, numactivein * numactiveout, - numactivein * numactiveout * numelements * Q, strides, rstr)); + CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedInt strides[3] = 
{1, num_elem * Q, Q}; /* *NOPAD* */ + + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)numelements * Q * numactivein * numactiveout; - CeedCallBackend(CeedVectorCreate(ceedparent, l_size, assembled)); + CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); } CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); - CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a)); + CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); // Input basis apply - CeedCallBackend(CeedOperatorInputBasis_Hip(numelements, qfinputfields, opinputfields, numinputfields, true, edata, impl)); + CeedCallBackend(CeedOperatorInputBasis_Hip(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); // Assemble QFunction - for (CeedInt in = 0; in < numactivein; in++) { + for (CeedInt in = 0; in < num_active_in; in++) { // Set Inputs - CeedCallBackend(CeedVectorSetValue(activein[in], 1.0)); - if (numactivein > 1) { - CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0)); + CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); + if (num_active_in > 1) { + CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); } // Set Outputs - for (CeedInt out = 0; out < numoutputfields; out++) { + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size)); - a += 
size * Q * numelements; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); + assembled_array += size * Q * num_elem; // Advance the pointer by the size of the output } } // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout)); + CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); } // Un-set output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out = 0; out < numoutputfields; out++) { + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL)); + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); } } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Hip(numinputfields, qfinputfields, opinputfields, true, edata, impl)); + CeedCallBackend(CeedOperatorRestoreInputs_Hip(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); // Restore output - CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); - + CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); return CEED_ERROR_SUCCESS; } @@ -580,82 +604,85 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVe //------------------------------------------------------------------------------ // Create point block restriction //------------------------------------------------------------------------------ -static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) { - 
Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); +static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *point_block_rstr) { + Ceed ceed; + CeedSize l_size; + CeedInt num_elem, num_comp, elem_size, comp_stride, *point_block_offsets; const CeedInt *offsets; + + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); // Expand offsets - CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; - CeedSize l_size; - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize)); - CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); - CeedInt shift = ncomp; - if (compstride != 1) shift *= ncomp; - CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets)); - for (CeedInt i = 0; i < nelem * elemsize; i++) { - pbOffsets[i] = offsets[i] * shift; - } + CeedInt shift = num_comp; + + if (comp_stride != 1) shift *= num_comp; + CeedCallBackend(CeedCalloc(num_elem * elem_size, &point_block_offsets)); + for (CeedInt i = 0; i < num_elem * elem_size; i++) point_block_offsets[i] = offsets[i] * shift; // Create new restriction - CeedCallBackend( - CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr)); + CeedCallBackend(CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp * num_comp, 1, l_size * num_comp, CEED_MEM_HOST, CEED_OWN_POINTER, + point_block_offsets, 
point_block_rstr)); // Cleanup CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); - return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble diagonal setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, const bool pointBlock, CeedInt use_ceedsize_idx) { - Ceed ceed; +static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, const bool is_point_block, CeedInt use_ceedsize_idx) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; + CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0; + CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *op_fields; + CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt numinputfields, numoutputfields; - CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields)); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Determine active input basis - CeedOperatorField *opfields; - CeedQFunctionField *qffields; - CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); - CeedInt numemodein = 0, ncomp = 0, dim = 1; - CeedEvalMode *emodein = NULL; - CeedBasis basisin = NULL; - CeedElemRestriction rstrin = NULL; - for (CeedInt i = 0; i < numinputfields; i++) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); + for (CeedInt i = 0; i < 
num_input_fields; i++) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode e_mode; CeedElemRestriction rstr; - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin)); - CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp)); - CeedCallBackend(CeedBasisGetDimension(basisin, &dim)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); - CeedCheck(!rstrin || rstrin == rstr, ceed, CEED_ERROR_BACKEND, + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); + CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); - rstrin = rstr; - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - switch (emode) { + rstr_in = rstr; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + switch (e_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(numemodein + 1, &emodein)); - emodein[numemodein] = emode; - numemodein += 1; + CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); + e_mode_in[num_e_mode_in] = e_mode; + num_e_mode_in += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(numemodein + dim, &emodein)); - for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode; - numemodein += dim; + CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); + for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; + num_e_mode_in += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -666,35 +693,33 @@ static inline int 
CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, const b } // Determine active output basis - CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); - CeedInt numemodeout = 0; - CeedEvalMode *emodeout = NULL; - CeedBasis basisout = NULL; - CeedElemRestriction rstrout = NULL; - for (CeedInt i = 0; i < numoutputfields; i++) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); + for (CeedInt i = 0; i < num_output_fields; i++) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode e_mode; CeedElemRestriction rstr; - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); - CeedCheck(!rstrout || rstrout == rstr, ceed, CEED_ERROR_BACKEND, + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); - rstrout = rstr; - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - switch (emode) { + rstr_out = rstr; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + switch (e_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout)); - emodeout[numemodeout] = emode; - numemodeout += 1; + CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); + e_mode_out[num_e_mode_out] = e_mode; + num_e_mode_out += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout)); - 
for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode; - numemodeout += dim; + CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); + for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; + num_e_mode_out += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -705,155 +730,155 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, const b } // Operator data struct - CeedOperator_Hip *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Hip *diag = impl->diag; - diag->basisin = basisin; - diag->basisout = basisout; - diag->h_emodein = emodein; - diag->h_emodeout = emodeout; - diag->numemodein = numemodein; - diag->numemodeout = numemodeout; - // Assemble kernel + diag->basis_in = basis_in; + diag->basis_out = basis_out; + diag->h_e_mode_in = e_mode_in; + diag->h_e_mode_out = e_mode_out; + diag->num_e_mode_in = num_e_mode_in; + diag->num_e_mode_out = num_e_mode_out; - char *diagonal_kernel_path, *diagonal_kernel_source; + // Assemble kernel CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! 
-----\n"); - CeedInt nnodes, nqpts; - CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts)); - diag->nnodes = nnodes; - CeedCallBackend(CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUMEMODEIN", numemodein, "NUMEMODEOUT", numemodeout, "NNODES", - nnodes, "NQPTS", nqpts, "NCOMP", ncomp, "CEEDSIZE", use_ceedsize_idx)); + CeedInt num_modes, num_qpts; + CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_modes)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + diag->num_modes = num_modes; + CeedCallBackend(CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUMEMODEIN", num_e_mode_in, "NUMEMODEOUT", num_e_mode_out, + "NNODES", num_modes, "NQPTS", num_qpts, "NCOMP", num_comp, "CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Hip(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); CeedCallBackend(CeedGetKernel_Hip(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); CeedCallBackend(CeedFree(&diagonal_kernel_path)); CeedCallBackend(CeedFree(&diagonal_kernel_source)); // Basis matrices - const CeedInt qBytes = nqpts * sizeof(CeedScalar); - const CeedInt iBytes = qBytes * nnodes; - const CeedInt gBytes = qBytes * nnodes * dim; - const CeedInt eBytes = sizeof(CeedEvalMode); - const CeedScalar *interpin, *interpout, *gradin, *gradout; + const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + const CeedInt interp_bytes = q_bytes * num_modes; + const CeedInt grad_bytes = q_bytes * num_modes * dim; + const CeedInt e_mode_bytes = sizeof(CeedEvalMode); + const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; // CEED_EVAL_NONE - CeedScalar *identity = NULL; - bool evalNone = false; - for (CeedInt i = 0; i < numemodein; i++) evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); - for (CeedInt i = 0; i < numemodeout; i++) evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); - if (evalNone) { - 
CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); - for (CeedInt i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; - CeedCallHip(ceed, hipMalloc((void **)&diag->d_identity, iBytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_identity, identity, iBytes, hipMemcpyHostToDevice)); + CeedScalar *identity = NULL; + bool is_eval_none = false; + + for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); + for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); + if (is_eval_none) { + CeedCallBackend(CeedCalloc(num_qpts * num_modes, &identity)); + for (CeedInt i = 0; i < (num_modes < num_qpts ? num_modes : num_qpts); i++) identity[i * num_modes + i] = 1.0; + CeedCallHip(ceed, hipMalloc((void **)&diag->d_identity, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_identity, identity, interp_bytes, hipMemcpyHostToDevice)); } // CEED_EVAL_INTERP - CeedCallBackend(CeedBasisGetInterp(basisin, &interpin)); - CeedCallHip(ceed, hipMalloc((void **)&diag->d_interpin, iBytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_interpin, interpin, iBytes, hipMemcpyHostToDevice)); - CeedCallBackend(CeedBasisGetInterp(basisout, &interpout)); - CeedCallHip(ceed, hipMalloc((void **)&diag->d_interpout, iBytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_interpout, interpout, iBytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_interp_in, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_interp_in, interp_in, interp_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_interp_out, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_interp_out, interp_out, interp_bytes, hipMemcpyHostToDevice)); // CEED_EVAL_GRAD - CeedCallBackend(CeedBasisGetGrad(basisin, &gradin)); - CeedCallHip(ceed, 
hipMalloc((void **)&diag->d_gradin, gBytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_gradin, gradin, gBytes, hipMemcpyHostToDevice)); - CeedCallBackend(CeedBasisGetGrad(basisout, &gradout)); - CeedCallHip(ceed, hipMalloc((void **)&diag->d_gradout, gBytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_gradout, gradout, gBytes, hipMemcpyHostToDevice)); - - // Arrays of emodes - CeedCallHip(ceed, hipMalloc((void **)&diag->d_emodein, numemodein * eBytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_emodein, emodein, numemodein * eBytes, hipMemcpyHostToDevice)); - CeedCallHip(ceed, hipMalloc((void **)&diag->d_emodeout, numemodeout * eBytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_emodeout, emodeout, numemodeout * eBytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_grad_in, grad_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_grad_in, grad_in, grad_bytes, hipMemcpyHostToDevice)); + CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_grad_out, grad_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_grad_out, grad_out, grad_bytes, hipMemcpyHostToDevice)); + + // Arrays of e_modes + CeedCallHip(ceed, hipMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, hipMemcpyHostToDevice)); + CeedCallHip(ceed, hipMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, hipMemcpyHostToDevice)); // Restriction - diag->diagrstr = rstrout; - + diag->diag_rstr = rstr_out; return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble diagonal common code //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, 
CeedVector assembled, CeedRequest *request, const bool pointBlock) { - Ceed ceed; +static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { + Ceed ceed; + CeedSize assembled_length = 0, assembled_qf_length = 0; + CeedInt use_ceedsize_idx = 0, num_elem; + CeedScalar *elem_diag_array; + const CeedScalar *assembled_qf_array; + CeedVector assembled_qf = NULL; + CeedElemRestriction rstr = NULL; + CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Hip *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); // Assemble QFunction - CeedVector assembledqf = NULL; - CeedElemRestriction rstr = NULL; - CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); - CeedSize assembled_length = 0, assembledqf_length = 0; CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); - CeedCallBackend(CeedVectorGetLength(assembledqf, &assembledqf_length)); - CeedInt use_ceedsize_idx = 0; - if ((assembled_length > INT_MAX) || (assembledqf_length > INT_MAX)) use_ceedsize_idx = 1; + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); + if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; // Setup - if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op, pointBlock, use_ceedsize_idx)); + if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op, is_point_block, use_ceedsize_idx)); CeedOperatorDiag_Hip *diag = impl->diag; assert(diag != NULL); // Restriction - if (pointBlock && !diag->pbdiagrstr) { - CeedElemRestriction pbdiagrstr; - CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr)); - diag->pbdiagrstr = pbdiagrstr; + if (is_point_block && 
!diag->point_block_diag_rstr) { + CeedElemRestriction point_block_diag_rstr; + + CeedCallBackend(CreatePBRestriction(diag->diag_rstr, &point_block_diag_rstr)); + diag->point_block_diag_rstr = point_block_diag_rstr; } - CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; + CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; // Create diagonal vector - CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; - if (!elemdiag) { + CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; + + if (!elem_diag) { // Element diagonal vector - CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag)); - if (pointBlock) diag->pbelemdiag = elemdiag; - else diag->elemdiag = elemdiag; + CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); + if (is_point_block) diag->point_block_elem_diag = elem_diag; + else diag->elem_diag = elem_diag; } - CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0)); + CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); // Assemble element operator diagonals - CeedScalar *elemdiagarray; - const CeedScalar *assembledqfarray; - CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray)); - CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray)); - CeedInt nelem; - CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem)); + CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); + CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); // Compute the diagonal of B^T D B - int elemsPerBlock = 1; - int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 
1 : 0); - void *args[] = {(void *)&nelem, &diag->d_identity, &diag->d_interpin, &diag->d_gradin, &diag->d_interpout, - &diag->d_gradout, &diag->d_emodein, &diag->d_emodeout, &assembledqfarray, &elemdiagarray}; - if (pointBlock) { - CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearPointBlock, grid, diag->nnodes, 1, elemsPerBlock, args)); + int elem_per_block = 1; + int grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); + void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_interp_out, + &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array}; + + if (is_point_block) { + CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearPointBlock, grid, diag->num_modes, 1, elem_per_block, args)); } else { - CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearDiagonal, grid, diag->nnodes, 1, elemsPerBlock, args)); + CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearDiagonal, grid, diag->num_modes, 1, elem_per_block, args)); } // Restore arrays - CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray)); - CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray)); + CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array)); + CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); // Assemble local operator diagonal - CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request)); + CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); // Cleanup - CeedCallBackend(CeedVectorDestroy(&assembledqf)); - + CeedCallBackend(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -877,52 +902,53 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, // Single operator assembly setup 
//------------------------------------------------------------------------------ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { - Ceed ceed; + Ceed ceed; + CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0, + num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp; + CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *input_fields, *output_fields; + CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Hip *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); // Get intput and output fields - CeedInt num_input_fields, num_output_fields; - CeedOperatorField *input_fields; - CeedOperatorField *output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Determine active input basis eval mode - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedQFunctionField *qf_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); // Note that the kernel will treat each dimension of a gradient action separately; - // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment by dim. + // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment by dim. // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so // num_B_in_mats_to_load will be incremented by 1. 
- CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; - CeedEvalMode *eval_mode_in = NULL; // will be of size num_B_in_mats_load - CeedBasis basis_in = NULL; - CeedInt nqpts = 0, esize = 0; - CeedElemRestriction rstr_in = NULL; for (CeedInt i = 0; i < num_input_fields; i++) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &nqpts)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &esize)); - CeedEvalMode eval_mode; + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); eval_mode_in[num_B_in_mats_to_load] = eval_mode; num_B_in_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_emode_in += dim; - size_B_in += dim * esize * nqpts; + num_e_mode_in += dim; + size_B_in += dim * elem_size * num_qpts; } else { - num_emode_in += 1; - size_B_in += esize * nqpts; + num_e_mode_in += 1; + size_B_in += elem_size * num_qpts; } } } @@ -930,48 +956,45 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed // Determine active output basis; basis_out and rstr_out only used if same as input, TODO CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); - CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0; - CeedEvalMode *eval_mode_out = NULL; - CeedBasis basis_out = NULL; - CeedElemRestriction rstr_out = NULL; for (CeedInt i = 0; i < 
num_output_fields; i++) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); - CeedEvalMode eval_mode; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); eval_mode_out[num_B_out_mats_to_load] = eval_mode; num_B_out_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_emode_out += dim; - size_B_out += dim * esize * nqpts; + num_e_mode_out += dim; + size_B_out += dim * elem_size * num_qpts; } else { - num_emode_out += 1; - size_B_out += esize * nqpts; + num_e_mode_out += 1; + size_B_out += elem_size * num_qpts; } } } } - CeedCheck(num_emode_in > 0 && num_emode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); + CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); - CeedInt nelem, ncomp; - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &nelem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &ncomp)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); CeedCallBackend(CeedCalloc(1, &impl->asmb)); CeedOperatorAssemble_Hip *asmb = impl->asmb; - asmb->nelem = nelem; + asmb->num_elem = num_elem; // Compile kernels - int elemsPerBlock = 1; - asmb->elemsPerBlock = elemsPerBlock; - CeedInt block_size = esize * esize * elemsPerBlock; + int elem_per_block = 1; + asmb->elem_per_block = elem_per_block; + 
CeedInt block_size = elem_size * elem_size * elem_per_block; char *assembly_kernel_path, *assembly_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); @@ -979,15 +1002,15 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); bool fallback = block_size > 1024; if (fallback) { // Use fallback kernel with 1D threadblock - block_size = esize * elemsPerBlock; - asmb->block_size_x = esize; + block_size = elem_size * elem_per_block; + asmb->block_size_x = elem_size; asmb->block_size_y = 1; } else { // Use kernel with 2D threadblock - asmb->block_size_x = esize; - asmb->block_size_y = esize; + asmb->block_size_x = elem_size; + asmb->block_size_y = elem_size; } - CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 8, "NELEM", nelem, "NUMEMODEIN", num_emode_in, "NUMEMODEOUT", - num_emode_out, "NQPTS", nqpts, "NNODES", esize, "BLOCK_SIZE", block_size, "NCOMP", ncomp, "CEEDSIZE", + CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 8, "NELEM", num_elem, "NUMEMODEIN", num_e_mode_in, "NUMEMODEOUT", + num_e_mode_out, "NQPTS", num_qpts, "NNODES", elem_size, "BLOCK_SIZE", block_size, "NCOMP", num_comp, "CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, fallback ? 
"linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); CeedCallBackend(CeedFree(&assembly_kernel_path)); @@ -999,23 +1022,24 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); // Load into B_in, in order that they will be used in eval_mode - const CeedInt inBytes = size_B_in * sizeof(CeedScalar); + const CeedInt in_bytes = size_B_in * sizeof(CeedScalar); CeedInt mat_start = 0; - CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, inBytes)); + + CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, in_bytes)); for (int i = 0; i < num_B_in_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_in[i]; if (eval_mode == CEED_EVAL_INTERP) { - CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], interp_in, esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); - mat_start += esize * nqpts; + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); + mat_start += elem_size * num_qpts; } else if (eval_mode == CEED_EVAL_GRAD) { - CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); - mat_start += dim * esize * nqpts; + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); + mat_start += dim * elem_size * num_qpts; } } const CeedScalar *interp_out, *grad_out; - // Note that this function currently assumes 1 basis, so this should always be true - // for now + + // Note that this function currently assumes 1 basis, so this should always be true for now if (basis_out == basis_in) { interp_out = interp_in; grad_out = grad_in; @@ -1025,17 +1049,18 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed } // Load into B_out, in order that they will be used in eval_mode - const CeedInt outBytes = size_B_out * 
sizeof(CeedScalar); - mat_start = 0; - CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, outBytes)); + const CeedInt out_bytes = size_B_out * sizeof(CeedScalar); + + mat_start = 0; + CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, out_bytes)); for (int i = 0; i < num_B_out_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_out[i]; if (eval_mode == CEED_EVAL_INTERP) { - CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], interp_out, esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); - mat_start += esize * nqpts; + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); + mat_start += elem_size * num_qpts; } else if (eval_mode == CEED_EVAL_GRAD) { - CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * esize * nqpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); - mat_start += dim * esize * nqpts; + CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); + mat_start += dim * elem_size * num_qpts; } } return CEED_ERROR_SUCCESS; @@ -1050,26 +1075,27 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed // TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) { - Ceed ceed; + Ceed ceed; + CeedSize values_length = 0, assembled_qf_length = 0; + CeedInt use_ceedsize_idx = 0; + CeedScalar *values_array; + const CeedScalar *qf_array; + CeedVector assembled_qf = NULL; + CeedElemRestriction rstr_q = NULL; + CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Hip *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); // Assemble QFunction - CeedVector assembled_qf = NULL; - CeedElemRestriction rstr_q = NULL; 
CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); - CeedScalar *values_array; CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); values_array += offset; - const CeedScalar *qf_array; CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); - CeedSize values_length = 0, assembled_qf_length = 0; CeedCallBackend(CeedVectorGetLength(values, &values_length)); CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); - CeedInt use_ceedsize_idx = 0; if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; // Setup if (!impl->asmb) { @@ -1078,12 +1104,13 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV } // Compute B^T D B - const CeedInt nelem = impl->asmb->nelem; - const CeedInt elemsPerBlock = impl->asmb->elemsPerBlock; - const CeedInt grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); - void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; + const CeedInt num_elem = impl->asmb->num_elem; + const CeedInt elem_per_block = impl->asmb->elem_per_block; + const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 
1 : 0); + void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; + CeedCallBackend( - CeedRunKernelDim_Hip(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elemsPerBlock, args)); + CeedRunKernelDim_Hip(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args)); // Restore arrays CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); @@ -1091,7 +1118,6 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV // Cleanup CeedCallBackend(CeedVectorDestroy(&assembled_qf)); - return CEED_ERROR_SUCCESS; } @@ -1099,13 +1125,12 @@ static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedV // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Hip(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed ceed; CeedOperator_Hip *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleAddDiagonal", CeedOperatorLinearAssembleAddDiagonal_Hip)); diff --git a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp index a946187637..9fad0b3f69 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp +++ b/backends/hip-ref/ceed-hip-ref-qfunction-load.cpp @@ -23,11 +23,16 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) { using std::ostringstream; using std::string; - Ceed ceed; + + Ceed ceed; + char 
*read_write_kernel_path, *read_write_kernel_source; + Ceed_Hip *ceed_Hip; + CeedInt num_input_fields, num_output_fields, size; + CeedQFunctionField *input_fields, *output_fields; + CeedQFunction_Hip *data; + CeedQFunctionGetCeed(qf, &ceed); - Ceed_Hip *ceed_Hip; CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedQFunction_Hip *data; CeedCallBackend(CeedQFunctionGetData(qf, (void **)&data)); // QFunction is built @@ -36,12 +41,9 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) { CeedCheck(data->qfunction_source, ceed, CEED_ERROR_BACKEND, "No QFunction source or hipFunction_t provided."); // QFunction kernel generation - CeedInt num_input_fields, num_output_fields, size; - CeedQFunctionField *input_fields, *output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Build strings for final kernel - char *read_write_kernel_path, *read_write_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-qfunction.h", &read_write_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source)); @@ -120,7 +122,6 @@ extern "C" int CeedQFunctionBuildKernel_Hip_ref(CeedQFunction qf) { CeedCallBackend(CeedFree(&data->qfunction_source)); CeedCallBackend(CeedFree(&read_write_kernel_path)); CeedCallBackend(CeedFree(&read_write_kernel_source)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-qfunction.c b/backends/hip-ref/ceed-hip-ref-qfunction.c index 42c15dfaa5..491c047689 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunction.c +++ b/backends/hip-ref/ceed-hip-ref-qfunction.c @@ -19,17 +19,18 @@ // Apply QFunction //------------------------------------------------------------------------------ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) 
{ - Ceed ceed; + Ceed ceed; + Ceed_Hip *ceed_Hip; + CeedInt num_input_fields, num_output_fields; + CeedQFunction_Hip *data; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); // Build and compile kernel, if not done CeedCallBackend(CeedQFunctionBuildKernel_Hip_ref(qf)); - CeedQFunction_Hip *data; CeedCallBackend(CeedQFunctionGetData(qf, &data)); - Ceed_Hip *ceed_Hip; CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedInt num_input_fields, num_output_fields; CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); const int block_size = ceed_Hip->opt_block_size; @@ -46,6 +47,7 @@ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce // Run kernel void *args[] = {&data->d_c, (void *)&Q, &data->fields}; + CeedCallBackend(CeedRunKernel_Hip(ceed, data->QFunction, CeedDivUpInt(Q, block_size), block_size, args)); // Restore vectors @@ -58,7 +60,6 @@ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce // Restore context CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &data->d_c)); - return CEED_ERROR_SUCCESS; } @@ -66,13 +67,13 @@ static int CeedQFunctionApply_Hip(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Hip(CeedQFunction qf) { + Ceed ceed; CeedQFunction_Hip *data; + CeedCallBackend(CeedQFunctionGetData(qf, &data)); - Ceed ceed; CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); if (data->module) CeedCallHip(ceed, hipModuleUnload(data->module)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -80,12 +81,13 @@ static int CeedQFunctionDestroy_Hip(CeedQFunction qf) { // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Hip(CeedQFunction qf) { - Ceed ceed; - CeedQFunctionGetCeed(qf, &ceed); + Ceed ceed; + CeedInt num_input_fields, num_output_fields; 
CeedQFunction_Hip *data; + + CeedQFunctionGetCeed(qf, &ceed); CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedQFunctionSetData(qf, data)); - CeedInt num_input_fields, num_output_fields; CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Read QFunction source diff --git a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c index fd79035c70..c1e830754d 100644 --- a/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c +++ b/backends/hip-ref/ceed-hip-ref-qfunctioncontext.c @@ -18,27 +18,25 @@ // Sync host to device //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSyncH2D_Hip(const CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Hip *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCheck(impl->h_data, ceed, CEED_ERROR_BACKEND, "No valid host data to sync to device"); - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); if (impl->d_data_borrowed) { impl->d_data = impl->d_data_borrowed; } else if (impl->d_data_owned) { impl->d_data = impl->d_data_owned; } else { - CeedCallHip(ceed, hipMalloc((void **)&impl->d_data_owned, ctxsize)); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_data_owned, ctx_size)); impl->d_data = impl->d_data_owned; } - - CeedCallHip(ceed, hipMemcpy(impl->d_data, impl->h_data, ctxsize, hipMemcpyHostToDevice)); - + CeedCallHip(ceed, hipMemcpy(impl->d_data, impl->h_data, ctx_size, hipMemcpyHostToDevice)); return CEED_ERROR_SUCCESS; } @@ -46,27 +44,25 @@ static inline int CeedQFunctionContextSyncH2D_Hip(const CeedQFunctionContext ctx // Sync device to host 
//------------------------------------------------------------------------------ static inline int CeedQFunctionContextSyncD2H_Hip(const CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Hip *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCheck(impl->d_data, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); if (impl->h_data_borrowed) { impl->h_data = impl->h_data_borrowed; } else if (impl->h_data_owned) { impl->h_data = impl->h_data_owned; } else { - CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->h_data_owned)); impl->h_data = impl->h_data_owned; } - - CeedCallHip(ceed, hipMemcpy(impl->h_data, impl->d_data, ctxsize, hipMemcpyDeviceToHost)); - + CeedCallHip(ceed, hipMemcpy(impl->h_data, impl->d_data, ctx_size, hipMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -88,11 +84,10 @@ static inline int CeedQFunctionContextSync_Hip(const CeedQFunctionContext ctx, C //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSetAllInvalid_Hip(const CeedQFunctionContext ctx) { CeedQFunctionContext_Hip *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); impl->h_data = NULL; impl->d_data = NULL; - return CEED_ERROR_SUCCESS; } @@ -101,10 +96,9 @@ static inline int CeedQFunctionContextSetAllInvalid_Hip(const CeedQFunctionConte //------------------------------------------------------------------------------ static inline int CeedQFunctionContextHasValidData_Hip(const CeedQFunctionContext ctx, 
bool *has_valid_data) { CeedQFunctionContext_Hip *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); *has_valid_data = impl && (impl->h_data || impl->d_data); - return CEED_ERROR_SUCCESS; } @@ -114,8 +108,8 @@ static inline int CeedQFunctionContextHasValidData_Hip(const CeedQFunctionContex static inline int CeedQFunctionContextHasBorrowedDataOfType_Hip(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { CeedQFunctionContext_Hip *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_data_of_type = impl->h_data_borrowed; @@ -124,7 +118,6 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Hip(const CeedQFunct *has_borrowed_data_of_type = impl->d_data_borrowed; break; } - return CEED_ERROR_SUCCESS; } @@ -132,10 +125,10 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Hip(const CeedQFunct // Check if data of given type needs sync //------------------------------------------------------------------------------ static inline int CeedQFunctionContextNeedSync_Hip(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *need_sync) { + bool has_valid_data = true; CeedQFunctionContext_Hip *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - bool has_valid_data = true; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedQFunctionContextHasValidData_Hip(ctx, &has_valid_data)); switch (mem_type) { case CEED_MEM_HOST: @@ -145,7 +138,6 @@ static inline int CeedQFunctionContextNeedSync_Hip(const CeedQFunctionContext ct *need_sync = has_valid_data && !impl->d_data; break; } - return CEED_ERROR_SUCCESS; } @@ -154,17 +146,18 @@ static inline int CeedQFunctionContextNeedSync_Hip(const CeedQFunctionContext ct 
//------------------------------------------------------------------------------ static int CeedQFunctionContextSetDataHost_Hip(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { CeedQFunctionContext_Hip *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedFree(&impl->h_data_owned)); switch (copy_mode) { case CEED_COPY_VALUES: { - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); + size_t ctx_size; + + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->h_data_owned)); impl->h_data_borrowed = NULL; impl->h_data = impl->h_data_owned; - memcpy(impl->h_data, data, ctxsize); + memcpy(impl->h_data, data, ctx_size); } break; case CEED_OWN_POINTER: impl->h_data_owned = data; @@ -176,7 +169,6 @@ static int CeedQFunctionContextSetDataHost_Hip(const CeedQFunctionContext ctx, c impl->h_data = data; break; } - return CEED_ERROR_SUCCESS; } @@ -184,21 +176,22 @@ static int CeedQFunctionContextSetDataHost_Hip(const CeedQFunctionContext ctx, c // Set data from device //------------------------------------------------------------------------------ static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; CeedQFunctionContext_Hip *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallHip(ceed, hipFree(impl->d_data_owned)); impl->d_data_owned = NULL; switch (copy_mode) { case CEED_COPY_VALUES: { - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - CeedCallHip(ceed, hipMalloc((void **)&impl->d_data_owned, 
ctxsize)); + size_t ctx_size; + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + CeedCallHip(ceed, hipMalloc((void **)&impl->d_data_owned, ctx_size)); impl->d_data_borrowed = NULL; impl->d_data = impl->d_data_owned; - CeedCallHip(ceed, hipMemcpy(impl->d_data, data, ctxsize, hipMemcpyDeviceToDevice)); + CeedCallHip(ceed, hipMemcpy(impl->d_data, data, ctx_size, hipMemcpyDeviceToDevice)); } break; case CEED_OWN_POINTER: impl->d_data_owned = data; @@ -211,7 +204,6 @@ static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx, impl->d_data = data; break; } - return CEED_ERROR_SUCCESS; } @@ -221,8 +213,8 @@ static int CeedQFunctionContextSetDataDevice_Hip(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextSetAllInvalid_Hip(ctx)); switch (mem_type) { case CEED_MEM_HOST: @@ -230,7 +222,6 @@ static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, const case CEED_MEM_DEVICE: return CeedQFunctionContextSetDataDevice_Hip(ctx, copy_mode, data); } - return CEED_ERROR_UNSUPPORTED; } @@ -238,13 +229,14 @@ static int CeedQFunctionContextSetData_Hip(const CeedQFunctionContext ctx, const // Take data //------------------------------------------------------------------------------ static int CeedQFunctionContextTakeData_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + bool need_sync = false; + Ceed ceed; CeedQFunctionContext_Hip *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, 
&impl)); // Sync data to requested mem_type - bool need_sync = false; CeedCallBackend(CeedQFunctionContextNeedSync_Hip(ctx, mem_type, &need_sync)); if (need_sync) CeedCallBackend(CeedQFunctionContextSync_Hip(ctx, mem_type)); @@ -261,7 +253,6 @@ static int CeedQFunctionContextTakeData_Hip(const CeedQFunctionContext ctx, cons impl->d_data = NULL; break; } - return CEED_ERROR_SUCCESS; } @@ -270,13 +261,14 @@ static int CeedQFunctionContextTakeData_Hip(const CeedQFunctionContext ctx, cons // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ static int CeedQFunctionContextGetDataCore_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + bool need_sync = false; + Ceed ceed; CeedQFunctionContext_Hip *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type - bool need_sync = false; CeedCallBackend(CeedQFunctionContextNeedSync_Hip(ctx, mem_type, &need_sync)); if (need_sync) CeedCallBackend(CeedQFunctionContextSync_Hip(ctx, mem_type)); @@ -289,7 +281,6 @@ static int CeedQFunctionContextGetDataCore_Hip(const CeedQFunctionContext ctx, c *(void **)data = impl->d_data; break; } - return CEED_ERROR_SUCCESS; } @@ -305,8 +296,8 @@ static int CeedQFunctionContextGetDataRead_Hip(const CeedQFunctionContext ctx, c //------------------------------------------------------------------------------ static int CeedQFunctionContextGetData_Hip(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { CeedQFunctionContext_Hip *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedQFunctionContextGetDataCore_Hip(ctx, mem_type, data)); // Mark only pointer for 
requested memory as valid @@ -319,7 +310,6 @@ static int CeedQFunctionContextGetData_Hip(const CeedQFunctionContext ctx, const impl->d_data = *(void **)data; break; } - return CEED_ERROR_SUCCESS; } @@ -327,15 +317,14 @@ static int CeedQFunctionContextGetData_Hip(const CeedQFunctionContext ctx, const // Destroy the user context //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Hip(const CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; CeedQFunctionContext_Hip *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallHip(ceed, hipFree(impl->d_data_owned)); CeedCallBackend(CeedFree(&impl->h_data_owned)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -345,8 +334,8 @@ static int CeedQFunctionContextDestroy_Hip(const CeedQFunctionContext ctx) { int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx) { CeedQFunctionContext_Hip *impl; Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Hip)); @@ -354,10 +343,8 @@ int CeedQFunctionContextCreate_Hip(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Hip)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Hip)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c index a88be52aab..0dd11b169a 100644 --- a/backends/hip-ref/ceed-hip-ref-restriction.c +++ b/backends/hip-ref/ceed-hip-ref-restriction.c @@ -21,21 +21,22 @@ // Apply restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + Ceed ceed; + Ceed_Hip *data; + CeedInt num_elem, elem_size; + const CeedScalar *d_u; + CeedScalar *d_v; CeedElemRestriction_Hip *impl; + hipFunction_t kernel; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - Ceed ceed; CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); - Ceed_Hip *data; CeedCallBackend(CeedGetData(ceed, &data)); - const CeedInt num_nodes = impl->num_nodes; - CeedInt num_elem, elem_size; CeedElemRestrictionGetNumElements(r, &num_elem); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - hipFunction_t kernel; + const CeedInt num_nodes = impl->num_nodes; // Get vectors - const CeedScalar *d_u; - CeedScalar *d_v; CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); if (t_mode == CEED_TRANSPOSE) { // Sum into for transpose mode, e-vec to l-vec @@ -68,6 +69,7 @@ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode if (impl->d_ind) { // -- Offsets provided CeedInt block_size = 64; + if (impl->OffsetTranspose) { kernel = impl->OffsetTranspose; void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; @@ -102,8 +104,8 @@ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode 
//------------------------------------------------------------------------------ static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { CeedElemRestriction_Hip *impl; - CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); switch (mem_type) { case CEED_MEM_HOST: *offsets = impl->h_ind; @@ -119,10 +121,10 @@ static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemTy // Destroy restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { + Ceed ceed; CeedElemRestriction_Hip *impl; - CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - Ceed ceed; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallHip(ceed, hipModuleUnload(impl->module)); CeedCallBackend(CeedFree(&impl->h_ind_allocated)); @@ -131,7 +133,6 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { CeedCallHip(ceed, hipFree(impl->d_t_indices)); CeedCallHip(ceed, hipFree(impl->d_l_vec_indices)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -139,32 +140,30 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { // Create transpose offsets and indices //------------------------------------------------------------------------------ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const CeedInt *indices) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed; + bool *is_node; + CeedSize l_size; + CeedInt num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; CeedElemRestriction_Hip *impl; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - CeedSize l_size; - CeedInt num_elem, elem_size, 
num_comp; CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); + const CeedInt size_indices = num_elem * elem_size; // Count num_nodes - bool *is_node; CeedCallBackend(CeedCalloc(l_size, &is_node)); - const CeedInt size_indices = num_elem * elem_size; for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; - CeedInt num_nodes = 0; for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; impl->num_nodes = num_nodes; // L-vector offsets array - CeedInt *ind_to_offset, *l_vec_indices; CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); - CeedInt j = 0; - for (CeedInt i = 0; i < l_size; i++) { + for (CeedInt i = 0, j = 0; i < l_size; i++) { if (is_node[i]) { l_vec_indices[j] = i; ind_to_offset[i] = j++; @@ -174,9 +173,8 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const Ceed // Compute transpose offsets and indices const CeedInt size_offsets = num_nodes + 1; - CeedInt *t_offsets; + CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); - CeedInt *t_indices; CeedCallBackend(CeedMalloc(size_indices, &t_indices)); // Count node multiplicity for (CeedInt e = 0; e < num_elem; ++e) { @@ -187,8 +185,9 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const Ceed // List all E-vec indices associated with L-vec node for (CeedInt e = 0; e < num_elem; ++e) { for (CeedInt i = 0; i < elem_size; ++i) { - const CeedInt lid = elem_size * e + i; - const CeedInt gid = indices[lid]; + const CeedInt lid = elem_size * e + i; + const CeedInt gid = indices[lid]; + t_indices[t_offsets[ind_to_offset[gid]]++] = lid; } } @@ -212,7 +211,6 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const Ceed CeedCallBackend(CeedFree(&l_vec_indices)); 
CeedCallBackend(CeedFree(&t_offsets)); CeedCallBackend(CeedFree(&t_indices)); - return CEED_ERROR_SUCCESS; } @@ -221,32 +219,33 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const Ceed //------------------------------------------------------------------------------ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, const CeedInt8 *curl_orients, CeedElemRestriction r) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed, ceed_parent; + bool is_deterministic, is_strided; + char *restriction_kernel_path, *restriction_kernel_source; + CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedRestrictionType rstr_type; CeedElemRestriction_Hip *impl; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); - Ceed parent; - CeedCallBackend(CeedGetParent(ceed, &parent)); - bool is_deterministic; - CeedCallBackend(CeedIsDeterministic(parent, &is_deterministic)); - CeedInt num_elem, num_comp, elem_size; + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); + CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt comp_stride = 1; + CeedInt size = num_elem * elem_size; + CeedInt strides[3] = {1, size, elem_size}; + CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - CeedRestrictionType rstr_type; CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); // Stride 
data - bool is_strided; CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); if (is_strided) { bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (!has_backend_strides) { CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); @@ -263,7 +262,6 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, impl->d_t_offsets = NULL; impl->num_nodes = size; CeedCallBackend(CeedElemRestrictionSetData(r, impl)); - CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); // Set up device indices/offset arrays @@ -325,7 +323,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, // Compile HIP kernels CeedInt num_nodes = impl->num_nodes; - char *restriction_kernel_path, *restriction_kernel_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); diff --git a/backends/hip-ref/ceed-hip-ref-vector.c b/backends/hip-ref/ceed-hip-ref-vector.c index d83985d5c3..32bf1a3065 100644 --- a/backends/hip-ref/ceed-hip-ref-vector.c +++ b/backends/hip-ref/ceed-hip-ref-vector.c @@ -20,9 +20,9 @@ //------------------------------------------------------------------------------ static inline int CeedVectorNeedSync_Hip(const CeedVector vec, CeedMemType mem_type, bool *need_sync) { CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + bool has_valid_array = false; - bool has_valid_array = false; + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array)); switch (mem_type) { case CEED_MEM_HOST: @@ -32,7 +32,6 @@ static inline int CeedVectorNeedSync_Hip(const CeedVector vec, 
CeedMemType mem_t *need_sync = has_valid_array && !impl->d_array; break; } - return CEED_ERROR_SUCCESS; } @@ -40,12 +39,13 @@ static inline int CeedVectorNeedSync_Hip(const CeedVector vec, CeedMemType mem_t // Sync host to device //------------------------------------------------------------------------------ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); @@ -59,9 +59,7 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, bytes)); impl->d_array = impl->d_array_owned; } - CeedCallHip(ceed, hipMemcpy(impl->d_array, impl->h_array, bytes, hipMemcpyHostToDevice)); - return CEED_ERROR_SUCCESS; } @@ -69,9 +67,11 @@ static inline int CeedVectorSyncH2D_Hip(const CeedVector vec) { // Sync device to host //------------------------------------------------------------------------------ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); @@ -87,11 +87,10 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { impl->h_array = impl->h_array_owned; } - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); - CeedCallHip(ceed, hipMemcpy(impl->h_array, impl->d_array, bytes, hipMemcpyDeviceToHost)); + CeedCallHip(ceed, hipMemcpy(impl->h_array, impl->d_array, bytes, 
hipMemcpyDeviceToHost)); return CEED_ERROR_SUCCESS; } @@ -99,8 +98,9 @@ static inline int CeedVectorSyncD2H_Hip(const CeedVector vec) { // Sync arrays //------------------------------------------------------------------------------ static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) { - // Check whether device/host sync is needed bool need_sync = false; + + // Check whether device/host sync is needed CeedCallBackend(CeedVectorNeedSync_Hip(vec, mem_type, &need_sync)); if (!need_sync) return CEED_ERROR_SUCCESS; @@ -118,11 +118,10 @@ static int CeedVectorSyncArray_Hip(const CeedVector vec, CeedMemType mem_type) { //------------------------------------------------------------------------------ static inline int CeedVectorSetAllInvalid_Hip(const CeedVector vec) { CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); impl->h_array = NULL; impl->d_array = NULL; - return CEED_ERROR_SUCCESS; } @@ -131,10 +130,9 @@ static inline int CeedVectorSetAllInvalid_Hip(const CeedVector vec) { //------------------------------------------------------------------------------ static inline int CeedVectorHasValidArray_Hip(const CeedVector vec, bool *has_valid_array) { CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); *has_valid_array = impl->h_array || impl->d_array; - return CEED_ERROR_SUCCESS; } @@ -143,8 +141,8 @@ static inline int CeedVectorHasValidArray_Hip(const CeedVector vec, bool *has_va //------------------------------------------------------------------------------ static inline int CeedVectorHasArrayOfType_Hip(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) { CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_array_of_type = impl->h_array_borrowed || impl->h_array_owned; @@ 
-153,7 +151,6 @@ static inline int CeedVectorHasArrayOfType_Hip(const CeedVector vec, CeedMemType *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned; break; } - return CEED_ERROR_SUCCESS; } @@ -162,8 +159,8 @@ static inline int CeedVectorHasArrayOfType_Hip(const CeedVector vec, CeedMemType //------------------------------------------------------------------------------ static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_array_of_type = impl->h_array_borrowed; @@ -172,7 +169,6 @@ static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, Cee *has_borrowed_array_of_type = impl->d_array_borrowed; break; } - return CEED_ERROR_SUCCESS; } @@ -181,11 +177,12 @@ static inline int CeedVectorHasBorrowedArrayOfType_Hip(const CeedVector vec, Cee //------------------------------------------------------------------------------ static int CeedVectorSetArrayHost_Hip(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { case CEED_COPY_VALUES: { CeedSize length; + if (!impl->h_array_owned) { CeedCallBackend(CeedVectorGetLength(vec, &length)); CeedCallBackend(CeedMalloc(length, &impl->h_array_owned)); @@ -194,6 +191,7 @@ static int CeedVectorSetArrayHost_Hip(const CeedVector vec, const CeedCopyMode c impl->h_array = impl->h_array_owned; if (array) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); memcpy(impl->h_array, array, bytes); @@ -211,7 +209,6 @@ static int CeedVectorSetArrayHost_Hip(const CeedVector vec, const CeedCopyMode c impl->h_array = array; break; } - return 
CEED_ERROR_SUCCESS; } @@ -219,16 +216,18 @@ static int CeedVectorSetArrayHost_Hip(const CeedVector vec, const CeedCopyMode c // Set array from device //------------------------------------------------------------------------------ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { case CEED_COPY_VALUES: { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); + if (!impl->d_array_owned) { CeedCallHip(ceed, hipMalloc((void **)&impl->d_array_owned, bytes)); } @@ -249,7 +248,6 @@ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode impl->d_array = array; break; } - return CEED_ERROR_SUCCESS; } @@ -258,11 +256,11 @@ static int CeedVectorSetArrayDevice_Hip(const CeedVector vec, const CeedCopyMode // freeing any previously allocated array if applicable //------------------------------------------------------------------------------ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec)); switch (mem_type) { case CEED_MEM_HOST: @@ -270,7 +268,6 @@ static int CeedVectorSetArray_Hip(const CeedVector vec, const CeedMemType mem_ty case CEED_MEM_DEVICE: return CeedVectorSetArrayDevice_Hip(vec, copy_mode, array); } - return CEED_ERROR_UNSUPPORTED; } @@ -291,13 +288,13 @@ int 
CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) // Set a vector to a value //------------------------------------------------------------------------------ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - // Set value for synced device/host array if (!impl->d_array && !impl->h_array) { if (impl->d_array_borrowed) { @@ -320,7 +317,6 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { CeedCallBackend(CeedHostSetValue_Hip(impl->h_array, length, val)); impl->d_array = NULL; } - return CEED_ERROR_SUCCESS; } @@ -328,9 +324,10 @@ static int CeedVectorSetValue_Hip(CeedVector vec, CeedScalar val) { // Vector Take Array //------------------------------------------------------------------------------ static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Hip *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type @@ -349,7 +346,6 @@ static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, CeedSca impl->d_array = NULL; break; } - return CEED_ERROR_SUCCESS; } @@ -358,9 +354,10 @@ static int CeedVectorTakeArray_Hip(CeedVector vec, CeedMemType mem_type, CeedSca // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ static int CeedVectorGetArrayCore_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; 
CeedVector_Hip *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type @@ -375,7 +372,6 @@ static int CeedVectorGetArrayCore_Hip(const CeedVector vec, const CeedMemType me *array = impl->d_array; break; } - return CEED_ERROR_SUCCESS; } @@ -391,10 +387,9 @@ static int CeedVectorGetArrayRead_Hip(const CeedVector vec, const CeedMemType me //------------------------------------------------------------------------------ static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetArrayCore_Hip(vec, mem_type, array)); - CeedCallBackend(CeedVectorSetAllInvalid_Hip(vec)); switch (mem_type) { case CEED_MEM_HOST: @@ -404,7 +399,6 @@ static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_ty impl->d_array = *array; break; } - return CEED_ERROR_SUCCESS; } @@ -412,10 +406,10 @@ static int CeedVectorGetArray_Hip(const CeedVector vec, const CeedMemType mem_ty // Get write access to a vector via the specified mem_type //------------------------------------------------------------------------------ static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { + bool has_array_of_type = true; CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); - bool has_array_of_type = true; + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorHasArrayOfType_Hip(vec, mem_type, &has_array_of_type)); if (!has_array_of_type) { // Allocate if array is not yet allocated @@ -432,7 +426,6 @@ static int CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType m else impl->d_array = impl->d_array_owned; } } - return CeedVectorGetArray_Hip(vec, mem_type, array); } @@ -440,22 +433,24 @@ static int 
CeedVectorGetArrayWrite_Hip(const CeedVector vec, const CeedMemType m // Get the norm of a CeedVector //------------------------------------------------------------------------------ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *norm) { - Ceed ceed; + Ceed ceed; + CeedSize length; + const CeedScalar *d_array; + CeedVector_Hip *impl; + hipblasHandle_t handle; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - CeedVector_Hip *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - hipblasHandle_t handle; CeedCallBackend(CeedGetHipblasHandle_Hip(ceed, &handle)); // Is the vector too long to handle with int32? If so, we will divide // it up into "int32-sized" subsections and make repeated BLAS calls. CeedSize num_calls = length / INT_MAX; + if (length % INT_MAX > 0) num_calls += 1; // Compute norm - const CeedScalar *d_array; CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); switch (type) { case CEED_NORM_1: { @@ -463,20 +458,24 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { float sub_norm = 0.0; float *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + CeedCallHipblas(ceed, hipblasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); *norm += sub_norm; } } else { double sub_norm = 0.0; double *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; + CeedCallHipblas(ceed, hipblasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); *norm += sub_norm; } @@ -487,10 +486,12 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { float sub_norm = 0.0, norm_sum = 0.0; float *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + CeedCallHipblas(ceed, hipblasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm)); norm_sum += sub_norm * sub_norm; } @@ -498,10 +499,12 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor } else { double sub_norm = 0.0, norm_sum = 0.0; double *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; + CeedCallHipblas(ceed, hipblasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm)); norm_sum += sub_norm * sub_norm; } @@ -510,7 +513,8 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor break; } case CEED_NORM_MAX: { - CeedInt indx; + CeedInt index; + if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { float sub_max = 0.0, current_max = 0.0; float *d_array_start; @@ -518,20 +522,23 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor d_array_start = (float *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? 
(CeedInt)(remaining_length) : INT_MAX; - CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &indx)); - CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + indx - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); + + CeedCallHipblas(ceed, hipblasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index)); + CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } *norm = current_max; } else { double sub_max = 0.0, current_max = 0.0; double *d_array_start; + for (CeedInt i = 0; i < num_calls; i++) { d_array_start = (double *)d_array + (CeedSize)(i)*INT_MAX; CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX; CeedInt sub_length = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX; - CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &indx)); - CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + indx - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); + + CeedCallHipblas(ceed, hipblasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index)); + CeedCallHip(ceed, hipMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), hipMemcpyDeviceToHost)); if (fabs(sub_max) > current_max) current_max = fabs(sub_max); } *norm = current_max; @@ -540,7 +547,6 @@ static int CeedVectorNorm_Hip(CeedVector vec, CeedNormType type, CeedScalar *nor } } CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); - return CEED_ERROR_SUCCESS; } @@ -563,17 +569,16 @@ int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length); // Take reciprocal of a vector //------------------------------------------------------------------------------ static int CeedVectorReciprocal_Hip(CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *impl; + + 
CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - // Set value for synced device/host array if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Hip(impl->d_array, length)); if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Hip(impl->h_array, length)); - return CEED_ERROR_SUCCESS; } @@ -594,17 +599,16 @@ int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length); // Compute x = alpha x //------------------------------------------------------------------------------ static int CeedVectorScale_Hip(CeedVector x, CeedScalar alpha) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(x, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *x_impl; + + CeedCallBackend(CeedVectorGetCeed(x, &ceed)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(x, &length)); - // Set value for synced device/host array if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Hip(x_impl->d_array, alpha, length)); if (x_impl->h_array) CeedCallBackend(CeedHostScale_Hip(x_impl->h_array, alpha, length)); - return CEED_ERROR_SUCCESS; } @@ -625,14 +629,14 @@ int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_arra // Compute y = alpha x + y //------------------------------------------------------------------------------ static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(y, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *y_impl, *x_impl; + + CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(y, &length)); - // Set value for synced device/host array if (y_impl->d_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); @@ -642,7 +646,6 @@ 
static int CeedVectorAXPY_Hip(CeedVector y, CeedScalar alpha, CeedVector x) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); CeedCallBackend(CeedHostAXPY_Hip(y_impl->h_array, alpha, x_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -663,14 +666,14 @@ int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, // Compute y = alpha x + beta y //------------------------------------------------------------------------------ static int CeedVectorAXPBY_Hip(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(y, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *y_impl, *x_impl; + + CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(y, &length)); - // Set value for synced device/host array if (y_impl->d_array) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE)); @@ -680,7 +683,6 @@ static int CeedVectorAXPBY_Hip(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); CeedCallBackend(CeedHostAXPBY_Hip(y_impl->h_array, alpha, beta, x_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -701,13 +703,14 @@ int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedSc // Compute the pointwise multiplication w = x .* y //------------------------------------------------------------------------------ static int CeedVectorPointwiseMult_Hip(CeedVector w, CeedVector x, CeedVector y) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(w, &ceed)); + Ceed ceed; + CeedSize length; CeedVector_Hip *w_impl, *x_impl, *y_impl; + + CeedCallBackend(CeedVectorGetCeed(w, &ceed)); CeedCallBackend(CeedVectorGetData(w, &w_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); - CeedSize length; 
CeedCallBackend(CeedVectorGetLength(w, &length)); // Set value for synced device/host array @@ -724,7 +727,6 @@ static int CeedVectorPointwiseMult_Hip(CeedVector w, CeedVector x, CeedVector y) CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST)); CeedCallBackend(CeedHostPointwiseMult_Hip(w_impl->h_array, x_impl->h_array, y_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -732,15 +734,14 @@ static int CeedVectorPointwiseMult_Hip(CeedVector w, CeedVector x, CeedVector y) // Destroy the vector //------------------------------------------------------------------------------ static int CeedVectorDestroy_Hip(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Hip *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallHip(ceed, hipFree(impl->d_array_owned)); CeedCallBackend(CeedFree(&impl->h_array_owned)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -750,8 +751,8 @@ static int CeedVectorDestroy_Hip(const CeedVector vec) { int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) { CeedVector_Hip *impl; Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Hip)); @@ -768,10 +769,8 @@ int CeedVectorCreate_Hip(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", 
vec, "Destroy", CeedVectorDestroy_Hip)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedVectorSetData(vec, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c index 3bfb24ba86..754c0b52af 100644 --- a/backends/hip-ref/ceed-hip-ref.c +++ b/backends/hip-ref/ceed-hip-ref.c @@ -27,8 +27,8 @@ static int CeedGetPreferredMemType_Hip(CeedMemType *type) { //------------------------------------------------------------------------------ int CeedGetHipblasHandle_Hip(Ceed ceed, hipblasHandle_t *handle) { Ceed_Hip *data; - CeedCallBackend(CeedGetData(ceed, &data)); + CeedCallBackend(CeedGetData(ceed, &data)); if (!data->hipblas_handle) CeedCallHipblas(ceed, hipblasCreate(&data->hipblas_handle)); *handle = data->hipblas_handle; return CEED_ERROR_SUCCESS; @@ -38,13 +38,14 @@ int CeedGetHipblasHandle_Hip(Ceed ceed, hipblasHandle_t *handle) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Hip_ref(const char *resource, Ceed ceed) { - char *resource_root; + Ceed_Hip *data; + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/hip/ref"), ceed, CEED_ERROR_BACKEND, "Hip backend cannot use resource: %s", resource); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); - Ceed_Hip *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Hip(ceed, resource)); diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index 824abc96c5..634bb68d08 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -86,30 +86,29 @@ typedef struct { hipModule_t module; hipFunction_t linearDiagonal; hipFunction_t linearPointBlock; - CeedBasis basisin, basisout; - CeedElemRestriction diagrstr, pbdiagrstr; - CeedVector elemdiag, 
pbelemdiag; - CeedInt numemodein, numemodeout, nnodes; - CeedEvalMode *h_emodein, *h_emodeout; - CeedEvalMode *d_emodein, *d_emodeout; - CeedScalar *d_identity, *d_interpin, *d_interpout, *d_gradin, *d_gradout; + CeedBasis basis_in, basis_out; + CeedElemRestriction diag_rstr, point_block_diag_rstr; + CeedVector elem_diag, point_block_elem_diag; + CeedInt num_e_mode_in, num_e_mode_out, num_modes; + CeedEvalMode *h_e_mode_in, *h_e_mode_out; + CeedEvalMode *d_e_mode_in, *d_e_mode_out; + CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; } CeedOperatorDiag_Hip; typedef struct { hipModule_t module; hipFunction_t linearAssemble; - CeedInt nelem, block_size_x, block_size_y, elemsPerBlock; + CeedInt num_elem, block_size_x, block_size_y, elem_per_block; CeedScalar *d_B_in, *d_B_out; } CeedOperatorAssemble_Hip; typedef struct { - CeedVector *evecs; // E-vectors, inputs followed by outputs - CeedVector *qvecsin; // Input Q-vectors needed to apply operator - CeedVector *qvecsout; // Output Q-vectors needed to apply operator - CeedInt numein; - CeedInt numeout; - CeedInt qfnumactivein, qfnumactiveout; - CeedVector *qfactivein; + CeedVector *e_vecs; // E-vectors, inputs followed by outputs + CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator + CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator + CeedInt num_inputs, num_outputs; + CeedInt num_active_in, num_active_out; + CeedVector *qf_active_in; CeedOperatorDiag_Hip *diag; CeedOperatorAssemble_Hip *asmb; } CeedOperator_Hip; diff --git a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp index 771c0b802e..13cbcf62d5 100644 --- a/backends/hip-ref/kernels/hip-ref-vector.hip.cpp +++ b/backends/hip-ref/kernels/hip-ref-vector.hip.cpp @@ -12,21 +12,22 @@ // Kernel for set value on device //------------------------------------------------------------------------------ __global__ static void setValueK(CeedScalar *__restrict__ vec, 
CeedSize size, CeedScalar val) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) return; - vec[idx] = val; + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index >= size) return; + vec[index] = val; } //------------------------------------------------------------------------------ // Set value on device memory //------------------------------------------------------------------------------ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - hipLaunchKernelGGL(setValueK, dim3(gridsize), dim3(bsize), 0, 0, d_array, length, val); + if (block_size * grid_size < vec_size) grid_size += 1; + hipLaunchKernelGGL(setValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length, val); return 0; } @@ -34,21 +35,22 @@ extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, Ceed // Kernel for taking reciprocal //------------------------------------------------------------------------------ __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) return; - if (fabs(vec[idx]) > 1E-16) vec[idx] = 1. / vec[idx]; + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index >= size) return; + if (fabs(vec[index]) > 1E-16) vec[index] = 1. 
/ vec[index]; } //------------------------------------------------------------------------------ // Take vector reciprocal in device memory //------------------------------------------------------------------------------ extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - hipLaunchKernelGGL(rcpValueK, dim3(gridsize), dim3(bsize), 0, 0, d_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + hipLaunchKernelGGL(rcpValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length); return 0; } @@ -56,21 +58,22 @@ extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) { // Kernel for scale //------------------------------------------------------------------------------ __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) return; - x[idx] *= alpha; + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index >= size) return; + x[index] *= alpha; } //------------------------------------------------------------------------------ // Compute x = alpha x on device //------------------------------------------------------------------------------ extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - hipLaunchKernelGGL(scaleValueK, dim3(gridsize), dim3(bsize), 0, 0, x_array, alpha, length); + if (block_size * grid_size < 
vec_size) grid_size += 1; + hipLaunchKernelGGL(scaleValueK, dim3(grid_size), dim3(block_size), 0, 0, x_array, alpha, length); return 0; } @@ -78,21 +81,22 @@ extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSi // Kernel for axpy //------------------------------------------------------------------------------ __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) return; - y[idx] += alpha * x[idx]; + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index >= size) return; + y[index] += alpha * x[index]; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device //------------------------------------------------------------------------------ extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - hipLaunchKernelGGL(axpyValueK, dim3(gridsize), dim3(bsize), 0, 0, y_array, alpha, x_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + hipLaunchKernelGGL(axpyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, x_array, length); return 0; } @@ -100,22 +104,23 @@ extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSca // Kernel for axpby //------------------------------------------------------------------------------ __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) return; 
- y[idx] = beta * y[idx]; - y[idx] += alpha * x[idx]; + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index >= size) return; + y[index] = beta * y[index]; + y[index] += alpha * x[index]; } //------------------------------------------------------------------------------ // Compute y = alpha x + beta y on device //------------------------------------------------------------------------------ extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - hipLaunchKernelGGL(axpbyValueK, dim3(gridsize), dim3(bsize), 0, 0, y_array, alpha, beta, x_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + hipLaunchKernelGGL(axpbyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, beta, x_array, length); return 0; } @@ -123,21 +128,22 @@ extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedSc // Kernel for pointwise mult //------------------------------------------------------------------------------ __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) { - CeedSize idx = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; - if (idx >= size) return; - w[idx] = x[idx] * y[idx]; + CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; + + if (index >= size) return; + w[index] = x[index] * y[index]; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device //------------------------------------------------------------------------------ extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar 
*x_array, CeedScalar *y_array, CeedSize length) { - const int bsize = 512; - const CeedSize vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const CeedSize vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - hipLaunchKernelGGL(pointwiseMultValueK, dim3(gridsize), dim3(bsize), 0, 0, w_array, x_array, y_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + hipLaunchKernelGGL(pointwiseMultValueK, dim3(grid_size), dim3(block_size), 0, 0, w_array, x_array, y_array, length); return 0; } diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index 03d5ac8dbe..5e6218f7a5 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -41,6 +41,7 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c // call any kernels except the ones for the dimension for which we have computed the // block sizes. 
const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); + switch (dim) { case 1: { // Interp kernels: @@ -55,7 +56,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c case 2: { // Interp kernels: CeedInt required = thread_1d * thread_1d; - block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); + + block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); // Grad kernels: currently use same required minimum threads block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); @@ -68,7 +70,8 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c case 3: { // Interp kernels: CeedInt required = thread_1d * thread_1d; - block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); + + block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); // Grad kernels: currently use same required minimum threads block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); @@ -87,19 +90,20 @@ static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, c //------------------------------------------------------------------------------ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; + Ceed ceed; + Ceed_Hip *ceed_Hip; + CeedInt dim, num_comp; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasis_Hip_shared *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Hip *ceed_Hip; CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedBasis_Hip_shared *data; CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedInt dim, num_comp; CeedCallBackend(CeedBasisGetDimension(basis, &dim)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); // Read vectors - const CeedScalar *d_u; - CeedScalar *d_v; if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, 
CEED_MEM_DEVICE, &d_u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); @@ -109,10 +113,12 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee case CEED_EVAL_INTERP: { CeedInt P_1d, Q_1d; CeedInt block_size = data->block_sizes[0]; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; + if (dim == 1) { CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; @@ -152,10 +158,12 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee case CEED_EVAL_GRAD: { CeedInt P_1d, Q_1d; CeedInt block_size = data->block_sizes[1]; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); CeedScalar *d_grad_1d = data->d_grad_1d; + if (data->d_collo_grad_1d) { d_grad_1d = data->d_collo_grad_1d; } @@ -197,8 +205,10 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee case CEED_EVAL_WEIGHT: { CeedInt Q_1d; CeedInt block_size = data->block_sizes[2]; + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; + if (dim == 1) { const CeedInt opt_elems = block_size / Q_1d; const CeedInt elems_per_block = opt_elems > 0 ? 
opt_elems : 1; @@ -244,20 +254,17 @@ int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, Cee // Destroy basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - + Ceed ceed; CeedBasis_Hip_shared *data; - CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallHip(ceed, hipModuleUnload(data->module)); - CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); CeedCallHip(ceed, hipFree(data->d_interp_1d)); CeedCallHip(ceed, hipFree(data->d_grad_1d)); CeedCallHip(ceed, hipFree(data->d_collo_grad_1d)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -266,42 +273,43 @@ static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { //------------------------------------------------------------------------------ int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; + CeedInt num_comp; + const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); + const CeedInt i_bytes = q_bytes * P_1d; CeedBasis_Hip_shared *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &data)); // Copy basis data to GPU - const CeedInt qBytes = Q_1d * sizeof(CeedScalar); - CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, qBytes)); - CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, hipMemcpyHostToDevice)); - - const CeedInt iBytes = qBytes * P_1d; - CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, iBytes)); - CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, 
iBytes, hipMemcpyHostToDevice)); - - CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, iBytes)); - CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, iBytes, hipMemcpyHostToDevice)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, i_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, i_bytes, hipMemcpyHostToDevice)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, i_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, i_bytes, hipMemcpyHostToDevice)); // Compute collocated gradient and copy to GPU data->d_collo_grad_1d = NULL; bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; + if (has_collocated_grad) { CeedScalar *collo_grad_1d; + CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); - CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d)); - CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, hipMemcpyHostToDevice)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); + CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, hipMemcpyHostToDevice)); CeedCallBackend(CeedFree(&collo_grad_1d)); } // Set number of threads per block for basis kernels - CeedInt num_comp; CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes)); // Compile basis kernels - char *basis_kernel_path, *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, 
&basis_kernel_source)); diff --git a/backends/hip-shared/ceed-hip-shared.c b/backends/hip-shared/ceed-hip-shared.c index 4eec1631b8..1f17baa723 100644 --- a/backends/hip-shared/ceed-hip-shared.c +++ b/backends/hip-shared/ceed-hip-shared.c @@ -18,24 +18,24 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Hip_shared(const char *resource, Ceed ceed) { - char *resource_root; + Ceed ceed_ref; + Ceed_Hip *data; + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!strcmp(resource_root, "/gpu/hip/shared"), ceed, CEED_ERROR_BACKEND, "Hip backend cannot use resource: %s", resource); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); - Ceed_Hip *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Hip(ceed, resource)); - Ceed ceed_ref; CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip_shared)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Hip)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/hip/ceed-hip-common.c b/backends/hip/ceed-hip-common.c index f2d61c81ef..ed32aa6dc8 100644 --- a/backends/hip/ceed-hip-common.c +++ b/backends/hip/ceed-hip-common.c @@ -16,16 +16,17 @@ // Device information backend init //------------------------------------------------------------------------------ int CeedInit_Hip(Ceed ceed, const char *resource) { + Ceed_Hip *data; const char *device_spec = strstr(resource, ":device_id="); const int device_id = (device_spec) ? 
atoi(device_spec + 11) : -1; + int current_device_id; - int current_device_id; CeedCallHip(ceed, hipGetDevice(&current_device_id)); if (device_id >= 0 && current_device_id != device_id) { CeedCallHip(ceed, hipSetDevice(device_id)); current_device_id = device_id; } - Ceed_Hip *data; + CeedCallBackend(CeedGetData(ceed, &data)); data->device_id = current_device_id; CeedCallHip(ceed, hipGetDeviceProperties(&data->device_prop, current_device_id)); @@ -38,6 +39,7 @@ int CeedInit_Hip(Ceed ceed, const char *resource) { //------------------------------------------------------------------------------ int CeedDestroy_Hip(Ceed ceed) { Ceed_Hip *data; + CeedCallBackend(CeedGetData(ceed, &data)); if (data->hipblas_handle) CeedCallHipblas(ceed, hipblasDestroy(data->hipblas_handle)); CeedCallBackend(CeedFree(&data)); diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index e6d4aeb745..59b27ad960 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -34,13 +34,20 @@ // Compile HIP kernel //------------------------------------------------------------------------------ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) 
{ + size_t ptx_size; + char *jit_defs_path, *jit_defs_source, *ptx; + const int num_opts = 3; + const char *opts[num_opts]; + int runtime_version; + hiprtcProgram prog; + struct hipDeviceProp_t prop; + Ceed_Hip *ceed_data; + hipFree(0); // Make sure a Context exists for hiprtc - hiprtcProgram prog; std::ostringstream code; // Add hip runtime include statement for generation if runtime < 40400000 (implies ROCm < 4.5) - int runtime_version; CeedCallHip(ceed, hipRuntimeGetVersion(&runtime_version)); if (runtime_version < 40400000) { code << "\n#include <hip/hip_runtime.h>\n"; @@ -58,6 +65,7 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce va_start(args, num_defines); char *name; int val; + for (int i = 0; i < num_defines; i++) { name = va_arg(args, char *); val = va_arg(args, int); @@ -67,7 +75,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce } // Standard libCEED definitions for HIP backends - char *jit_defs_path, *jit_defs_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-jit.h", &jit_defs_path)); CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source)); code << jit_defs_source; @@ -76,11 +83,7 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce CeedCallBackend(CeedFree(&jit_defs_source)); // Non-macro options - const int num_opts = 3; - const char *opts[num_opts]; opts[0] = "-default-device"; - struct hipDeviceProp_t prop; - Ceed_Hip *ceed_data; CeedCallBackend(CeedGetData(ceed, (void **)&ceed_data)); CeedCallHip(ceed, hipGetDeviceProperties(&prop, ceed_data->device_id)); std::string arch_arg = "--gpu-architecture=" + std::string(prop.gcnArchName); @@ -95,25 +98,24 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce // Compile kernel hiprtcResult result = hiprtcCompileProgram(prog, num_opts, opts); + if (result != HIPRTC_SUCCESS) { size_t log_size; + char *log; + CeedChk_hiprtc(ceed, 
hiprtcGetProgramLogSize(prog, &log_size)); - char *log; CeedCallBackend(CeedMalloc(log_size, &log)); CeedCallHiprtc(ceed, hiprtcGetProgramLog(prog, log)); return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log); } - size_t ptx_size; CeedCallHiprtc(ceed, hiprtcGetCodeSize(prog, &ptx_size)); - char *ptx; CeedCallBackend(CeedMalloc(ptx_size, &ptx)); CeedCallHiprtc(ceed, hiprtcGetCode(prog, ptx)); CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog)); CeedCallHip(ceed, hipModuleLoadData(module, ptx)); CeedCallBackend(CeedFree(&ptx)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index 834b749120..3cb651f274 100644 --- a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -24,34 +24,35 @@ CEED_INTERN "C" #endif int - CeedBasisApply_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { - Ceed ceed; + CeedBasisApply_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) { + Ceed ceed; + Ceed_Magma *data; + CeedInt dim, num_comp, num_dof, P_1d, Q_1d; + const CeedScalar *du; + CeedScalar *dv; + CeedBasis_Magma *impl; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - CeedInt dim, ncomp, ndof; CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); - CeedCallBackend(CeedBasisGetNumNodes(basis, &ndof)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof)); - Ceed_Magma *data; CeedCallBackend(CeedGetData(ceed, &data)); - const CeedScalar *du; - CeedScalar *dv; if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); - else CeedCheck(emode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + else CeedCheck(e_mode == 
CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); - CeedBasis_Magma *impl; CeedCallBackend(CeedBasisGetData(basis, &impl)); - CeedInt P1d, Q1d; - CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P1d)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q1d)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); - CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * CeedIntPow(P1d, dim), ncomp); + CeedDebug256(ceed, 4, "[CeedBasisApply_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * CeedIntPow(P_1d, dim), num_comp); - if (tmode == CEED_TRANSPOSE) { + if (t_mode == CEED_TRANSPOSE) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(V, &length)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); @@ -61,17 +62,18 @@ CEED_INTERN "C" ceed_magma_queue_sync(data->queue); } - switch (emode) { + switch (e_mode) { case CEED_EVAL_INTERP: { - CeedInt P = P1d, Q = Q1d; - if (tmode == CEED_TRANSPOSE) { - P = Q1d; - Q = P1d; + CeedInt P = P_1d, Q = Q_1d; + + if (t_mode == CEED_TRANSPOSE) { + P = Q_1d; + Q = P_1d; } // Define element sizes for dofs/quad - CeedInt elquadsize = CeedIntPow(Q1d, dim); - CeedInt eldofssize = CeedIntPow(P1d, dim); + CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); + CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); // E-vector ordering -------------- Q-vector ordering // component component @@ -82,72 +84,72 @@ CEED_INTERN "C" // Input (du) is E-vector, output (dv) is Q-vector // Element strides - CeedInt u_elstride = eldofssize; - CeedInt v_elstride = elquadsize; + CeedInt u_elem_stride = elem_dofs_size; + CeedInt v_elem_stride = elem_qpts_size; // Component strides - CeedInt u_compstride = nelem * eldofssize; - CeedInt 
v_compstride = nelem * elquadsize; + CeedInt u_comp_stride = num_elem * elem_dofs_size; + CeedInt v_comp_stride = num_elem * elem_qpts_size; // --- Swap strides for TRANSPOSE mode: --- - if (tmode == CEED_TRANSPOSE) { + if (t_mode == CEED_TRANSPOSE) { // Input (du) is Q-vector, output (dv) is E-vector // Element strides - v_elstride = eldofssize; - u_elstride = elquadsize; + v_elem_stride = elem_dofs_size; + u_elem_stride = elem_qpts_size; // Component strides - v_compstride = nelem * eldofssize; - u_compstride = nelem * elquadsize; + v_comp_stride = num_elem * elem_dofs_size; + u_comp_stride = num_elem * elem_qpts_size; } - - CeedInt nthreads = 1; - CeedInt ntcol = 1; - CeedInt shmem = 0; - CeedInt maxPQ = CeedIntMax(P, Q); + CeedInt num_threads = 1; + CeedInt num_t_col = 1; + CeedInt shared_mem = 0; + CeedInt max_P_Q = CeedIntMax(P, Q); switch (dim) { case 1: - nthreads = maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); - shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); - shmem += sizeof(CeedScalar) * (P * Q); + num_threads = max_P_Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); + shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); + shared_mem += sizeof(CeedScalar) * (P * Q); break; case 2: - nthreads = maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); - shmem += P * Q * sizeof(CeedScalar); // for sT - shmem += ntcol * (P * maxPQ * sizeof(CeedScalar)); // for reforming rU we need PxP, and for the intermediate output we need PxQ + num_threads = max_P_Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); + shared_mem += P * Q * sizeof(CeedScalar); // for sT + shared_mem += num_t_col * (P * max_P_Q * sizeof(CeedScalar)); // for reforming rU we need PxP, and for the intermediate output we need PxQ break; case 3: - nthreads = maxPQ * maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); - shmem += sizeof(CeedScalar) * (P * Q); // for sT - 
shmem += sizeof(CeedScalar) * ntcol * - (CeedIntMax(P * P * maxPQ, - P * Q * Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) + num_threads = max_P_Q * max_P_Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); + shared_mem += sizeof(CeedScalar) * (P * Q); // for sT + shared_mem += sizeof(CeedScalar) * num_t_col * + (CeedIntMax(P * P * max_P_Q, + P * Q * Q)); // rU needs P^2xP, the intermediate output needs max(P^2xQ,PQ^2) } - CeedInt grid = (nelem + ntcol - 1) / ntcol; - void *args[] = {&impl->dinterp1d, &du, &u_elstride, &u_compstride, &dv, &v_elstride, &v_compstride, &nelem}; + CeedInt grid = (num_elem + num_t_col - 1) / num_t_col; + void *args[] = {&impl->d_interp_1d, &du, &u_elem_stride, &u_comp_stride, &dv, &v_elem_stride, &v_comp_stride, &num_elem}; - if (tmode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, nthreads, ntcol, 1, shmem, args)); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp_tr, grid, num_threads, num_t_col, 1, shared_mem, args)); } else { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, nthreads, ntcol, 1, shmem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_interp, grid, num_threads, num_t_col, 1, shared_mem, args)); } } break; case CEED_EVAL_GRAD: { - CeedInt P = P1d, Q = Q1d; + CeedInt P = P_1d, Q = Q_1d; + // In CEED_NOTRANSPOSE mode: - // du is (P^dim x nc), column-major layout (nc = ncomp) - // dv is (Q^dim x nc x dim), column-major layout (nc = ncomp) + // du is (P^dim x nc), column-major layout (nc = num_comp) + // dv is (Q^dim x nc x dim), column-major layout (nc = num_comp) // In CEED_TRANSPOSE mode, the sizes of du and dv are switched. 
- if (tmode == CEED_TRANSPOSE) { - P = Q1d; - Q = P1d; + if (t_mode == CEED_TRANSPOSE) { + P = Q_1d; + Q = P_1d; } // Define element sizes for dofs/quad - CeedInt elquadsize = CeedIntPow(Q1d, dim); - CeedInt eldofssize = CeedIntPow(P1d, dim); + CeedInt elem_qpts_size = CeedIntPow(Q_1d, dim); + CeedInt elem_dofs_size = CeedIntPow(P_1d, dim); // E-vector ordering -------------- Q-vector ordering // dim @@ -159,94 +161,93 @@ CEED_INTERN "C" // Input (du) is E-vector, output (dv) is Q-vector // Element strides - CeedInt u_elstride = eldofssize; - CeedInt v_elstride = elquadsize; + CeedInt u_elem_stride = elem_dofs_size; + CeedInt v_elem_stride = elem_qpts_size; // Component strides - CeedInt u_compstride = nelem * eldofssize; - CeedInt v_compstride = nelem * elquadsize; + CeedInt u_comp_stride = num_elem * elem_dofs_size; + CeedInt v_comp_stride = num_elem * elem_qpts_size; // Dimension strides - CeedInt u_dimstride = 0; - CeedInt v_dimstride = nelem * elquadsize * ncomp; + CeedInt u_dim_stride = 0; + CeedInt v_dim_stride = num_elem * elem_qpts_size * num_comp; // --- Swap strides for TRANSPOSE mode: --- - if (tmode == CEED_TRANSPOSE) { + if (t_mode == CEED_TRANSPOSE) { // Input (du) is Q-vector, output (dv) is E-vector // Element strides - v_elstride = eldofssize; - u_elstride = elquadsize; + v_elem_stride = elem_dofs_size; + u_elem_stride = elem_qpts_size; // Component strides - v_compstride = nelem * eldofssize; - u_compstride = nelem * elquadsize; + v_comp_stride = num_elem * elem_dofs_size; + u_comp_stride = num_elem * elem_qpts_size; // Dimension strides - v_dimstride = 0; - u_dimstride = nelem * elquadsize * ncomp; + v_dim_stride = 0; + u_dim_stride = num_elem * elem_qpts_size * num_comp; } - - CeedInt nthreads = 1; - CeedInt ntcol = 1; - CeedInt shmem = 0; - CeedInt maxPQ = CeedIntMax(P, Q); + CeedInt num_threads = 1; + CeedInt num_t_col = 1; + CeedInt shared_mem = 0; + CeedInt max_P_Q = CeedIntMax(P, Q); switch (dim) { case 1: - nthreads = maxPQ; - ntcol = 
MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); - shmem += sizeof(CeedScalar) * ntcol * (ncomp * (1 * P + 1 * Q)); - shmem += sizeof(CeedScalar) * (P * Q); + num_threads = max_P_Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); + shared_mem += sizeof(CeedScalar) * num_t_col * (num_comp * (1 * P + 1 * Q)); + shared_mem += sizeof(CeedScalar) * (P * Q); break; case 2: - nthreads = maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); - shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad - shmem += sizeof(CeedScalar) * ntcol * (P * maxPQ); // for reforming rU we need PxP, and for the intermediate output we need PxQ + num_threads = max_P_Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); + shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad + shared_mem += sizeof(CeedScalar) * num_t_col * (P * max_P_Q); // for reforming rU we need PxP, and for the intermediate output we need PxQ break; case 3: - nthreads = maxPQ * maxPQ; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_3D); - shmem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad - shmem += sizeof(CeedScalar) * ntcol * - CeedIntMax(P * P * P, - (P * P * Q) + (P * Q * Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) + num_threads = max_P_Q * max_P_Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); + shared_mem += sizeof(CeedScalar) * 2 * P * Q; // for sTinterp and sTgrad + shared_mem += sizeof(CeedScalar) * num_t_col * + CeedIntMax(P * P * P, + (P * P * Q) + (P * Q * Q)); // rU needs P^2xP, the intermediate outputs need (P^2.Q + P.Q^2) } - CeedInt grid = (nelem + ntcol - 1) / ntcol; - void *args[] = {&impl->dinterp1d, &impl->dgrad1d, &du, &u_elstride, &u_compstride, &u_dimstride, &dv, - &v_elstride, &v_compstride, &v_dimstride, &nelem}; + CeedInt grid = (num_elem + num_t_col - 1) / num_t_col; + void *args[] = {&impl->d_interp_1d, &impl->d_grad_1d, &du, &u_elem_stride, 
&u_comp_stride, &u_dim_stride, &dv, + &v_elem_stride, &v_comp_stride, &v_dim_stride, &num_elem}; - if (tmode == CEED_TRANSPOSE) { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, nthreads, ntcol, 1, shmem, args)); + if (t_mode == CEED_TRANSPOSE) { + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad_tr, grid, num_threads, num_t_col, 1, shared_mem, args)); } else { - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, nthreads, ntcol, 1, shmem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_grad, grid, num_threads, num_t_col, 1, shared_mem, args)); } } break; case CEED_EVAL_WEIGHT: { - CeedCheck(tmode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); - CeedInt Q = Q1d; - CeedInt eldofssize = CeedIntPow(Q, dim); - CeedInt nthreads = 1; - CeedInt ntcol = 1; - CeedInt shmem = 0; + CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + CeedInt Q = Q_1d; + CeedInt elem_dofs_size = CeedIntPow(Q, dim); + CeedInt num_threads = 1; + CeedInt num_t_col = 1; + CeedInt shared_mem = 0; switch (dim) { case 1: - nthreads = Q; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_1D); - shmem += sizeof(CeedScalar) * Q; // for dqweight1d - shmem += sizeof(CeedScalar) * ntcol * Q; // for output + num_threads = Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_1D); + shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d + shared_mem += sizeof(CeedScalar) * num_t_col * Q; // for output break; case 2: - nthreads = Q; - ntcol = MAGMA_BASIS_NTCOL(nthreads, MAGMA_MAXTHREADS_2D); - shmem += sizeof(CeedScalar) * Q; // for dqweight1d + num_threads = Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_2D); + shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d break; case 3: - nthreads = Q * Q; - ntcol = MAGMA_BASIS_NTCOL(nthreads,
MAGMA_MAXTHREADS_3D); - shmem += sizeof(CeedScalar) * Q; // for dqweight1d + num_threads = Q * Q; + num_t_col = MAGMA_BASIS_NTCOL(num_threads, MAGMA_MAXTHREADS_3D); + shared_mem += sizeof(CeedScalar) * Q; // for d_q_weight_1d } - CeedInt grid = (nelem + ntcol - 1) / ntcol; - void *args[] = {&impl->dqweight1d, &dv, &eldofssize, &nelem}; + CeedInt grid = (num_elem + num_t_col - 1) / num_t_col; + void *args[] = {&impl->d_q_weight_1d, &dv, &elem_dofs_size, &num_elem}; - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, nthreads, ntcol, 1, shmem, args)); + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, impl->magma_weight, grid, num_threads, num_t_col, 1, shared_mem, args)); } break; // LCOV_EXCL_START case CEED_EVAL_DIV: @@ -261,7 +262,7 @@ CEED_INTERN "C" // must sync to ensure completeness ceed_magma_queue_sync(data->queue); - if (emode != CEED_EVAL_WEIGHT) { + if (e_mode != CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); } CeedCallBackend(CeedVectorRestoreArray(V, &dv)); @@ -272,33 +273,36 @@ CEED_INTERN "C" CEED_INTERN "C" #endif int - CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt nelem, CeedTransposeMode tmode, CeedEvalMode emode, CeedVector U, CeedVector V) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode e_mode, CeedVector U, CeedVector V) { + Ceed ceed; + Ceed_Magma *data; + CeedInt dim, num_comp, num_dof, num_qpts, NB = 1; + const CeedScalar *du; + CeedScalar *dv; + CeedBasisNonTensor_Magma *impl; + CeedMagmaFunction *interp, *grad; - Ceed_Magma *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedGetData(ceed, &data)); - magma_int_t arch = magma_getdevice_arch(); - CeedInt dim, ncomp, ndof, nqpt; CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); - CeedCallBackend(CeedBasisGetNumNodes(basis, 
&ndof)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &nqpt)); - const CeedScalar *du; - CeedScalar *dv; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &num_dof)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedInt P = num_dof, Q = num_qpts, N = num_elem * num_comp; + if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_DEVICE, &du)); - else CeedCheck(emode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); + else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_DEVICE, &dv)); - CeedBasisNonTensor_Magma *impl; CeedCallBackend(CeedBasisGetData(basis, &impl)); - CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, ncomp * ndof, ncomp); + CeedDebug256(ceed, 4, "[CeedBasisApplyNonTensor_Magma] vsize=%" CeedInt_FMT ", comp = %" CeedInt_FMT, num_comp * num_dof, num_comp); - if (tmode == CEED_TRANSPOSE) { + if (t_mode == CEED_TRANSPOSE) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(V, &length)); if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) { magmablas_slaset(MagmaFull, length, 1, 0., 0., (float *)dv, length, data->queue); @@ -308,90 +312,90 @@ CEED_INTERN "C" ceed_magma_queue_sync(data->queue); } - CeedInt P = ndof, Q = nqpt, N = nelem * ncomp; - CeedInt NB = 1; - CeedMagmaFunction *interp, *grad; + CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; + CeedInt iN = 0; + CeedInt diff = abs(n_array[iN] - N); - CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; - CeedInt iN = 0; - CeedInt diff = abs(Narray[iN] - N); for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { - CeedInt idiff = abs(Narray[in] - N); + CeedInt idiff = abs(n_array[in] - N); if 
(idiff < diff) { iN = in; diff = idiff; } } - NB = nontensor_rtc_get_nb(arch, 'd', emode, tmode, P, Narray[iN], Q); - interp = (tmode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN]; - grad = (tmode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN]; + NB = nontensor_rtc_get_nb(arch, 'd', e_mode, t_mode, P, n_array[iN], Q); + interp = (t_mode == CEED_TRANSPOSE) ? &impl->magma_interp_tr_nontensor[iN] : &impl->magma_interp_nontensor[iN]; + grad = (t_mode == CEED_TRANSPOSE) ? &impl->magma_grad_tr_nontensor[iN] : &impl->magma_grad_nontensor[iN]; - switch (emode) { + switch (e_mode) { case CEED_EVAL_INTERP: { - CeedInt P = ndof, Q = nqpt; + CeedInt P = num_dof, Q = num_qpts; if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { - CeedInt M = (tmode == CEED_TRANSPOSE) ? P : Q; - CeedInt K = (tmode == CEED_TRANSPOSE) ? Q : P; - CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M); - CeedInt shmem = 0, shmemA = 0, shmemB = 0; - shmemB += ntcol * K * NB * sizeof(CeedScalar); - shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); - shmem = (tmode == CEED_TRANSPOSE) ? (shmemA + shmemB) : CeedIntMax(shmemA, shmemB); - - CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol); - magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans; - magma_trans_t transB = MagmaNoTrans; + CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q; + CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P; + CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M); + CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0; + shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar); + shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); + shared_mem = (t_mode == CEED_TRANSPOSE) ? 
(shared_mem_A + shared_mem_B) : CeedIntMax(shared_mem_A, shared_mem_B); + + CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col); + magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans; + magma_trans_t trans_B = MagmaNoTrans; CeedScalar alpha = 1.0, beta = 0.0; - void *args[] = {&transA, &transB, &N, &alpha, &impl->dinterp, &P, &du, &K, &beta, &dv, &M}; - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, ntcol, 1, shmem, args)); + void *args[] = {&trans_A, &trans_B, &N, &alpha, &impl->d_interp, &P, &du, &K, &beta, &dv, &M}; + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *interp, grid, M, num_t_col, 1, shared_mem, args)); } else { - if (tmode == CEED_TRANSPOSE) - magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dinterp, P, du, Q, 0.0, dv, P, data->queue); - else magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dinterp, P, du, P, 0.0, dv, Q, data->queue); + if (t_mode == CEED_TRANSPOSE) { + magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_interp, P, du, Q, 0.0, dv, P, data->queue); + } else { + magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_interp, P, du, P, 0.0, dv, Q, data->queue); + } } } break; case CEED_EVAL_GRAD: { - CeedInt P = ndof, Q = nqpt; + CeedInt P = num_dof, Q = num_qpts; if (P < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q < MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { - CeedInt M = (tmode == CEED_TRANSPOSE) ? P : Q; - CeedInt K = (tmode == CEED_TRANSPOSE) ? Q : P; - CeedInt ntcol = MAGMA_NONTENSOR_BASIS_NTCOL(M); - CeedInt shmem = 0, shmemA = 0, shmemB = 0; - shmemB += ntcol * K * NB * sizeof(CeedScalar); - shmemA += (tmode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); - shmem = shmemA + shmemB; - - CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), ntcol); - magma_trans_t transA = (tmode == CEED_TRANSPOSE) ? 
MagmaNoTrans : MagmaTrans; - magma_trans_t transB = MagmaNoTrans; - - void *args[] = {&transA, &transB, &N, &impl->dgrad, &P, &du, &K, &dv, &M}; - CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, ntcol, 1, shmem, args)); + CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q; + CeedInt K = (t_mode == CEED_TRANSPOSE) ? Q : P; + CeedInt num_t_col = MAGMA_NONTENSOR_BASIS_NTCOL(M); + CeedInt shared_mem = 0, shared_mem_A = 0, shared_mem_B = 0; + shared_mem_B += num_t_col * K * NB * sizeof(CeedScalar); + shared_mem_A += (t_mode == CEED_TRANSPOSE) ? 0 : K * M * sizeof(CeedScalar); + shared_mem = shared_mem_A + shared_mem_B; + + CeedInt grid = MAGMA_CEILDIV(MAGMA_CEILDIV(N, NB), num_t_col); + magma_trans_t trans_A = (t_mode == CEED_TRANSPOSE) ? MagmaNoTrans : MagmaTrans; + magma_trans_t trans_B = MagmaNoTrans; + + void *args[] = {&trans_A, &trans_B, &N, &impl->d_grad, &P, &du, &K, &dv, &M}; + CeedCallBackend(CeedRunKernelDimSharedMagma(ceed, *grad, grid, M, num_t_col, 1, shared_mem, args)); } else { - if (tmode == CEED_TRANSPOSE) { + if (t_mode == CEED_TRANSPOSE) { CeedScalar beta = 0.0; for (int d = 0; d < dim; d++) { if (d > 0) beta = 1.0; - magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, nelem * ncomp, Q, 1.0, impl->dgrad + d * P * Q, P, du + d * nelem * ncomp * Q, Q, - beta, dv, P, data->queue); + magma_gemm_nontensor(MagmaNoTrans, MagmaNoTrans, P, num_elem * num_comp, Q, 1.0, impl->d_grad + d * P * Q, P, + du + d * num_elem * num_comp * Q, Q, beta, dv, P, data->queue); } } else { for (int d = 0; d < dim; d++) - magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, nelem * ncomp, P, 1.0, impl->dgrad + d * P * Q, P, du, P, 0.0, - dv + d * nelem * ncomp * Q, Q, data->queue); + magma_gemm_nontensor(MagmaTrans, MagmaNoTrans, Q, num_elem * num_comp, P, 1.0, impl->d_grad + d * P * Q, P, du, P, 0.0, + dv + d * num_elem * num_comp * Q, Q, data->queue); } } } break; case CEED_EVAL_WEIGHT: { - CeedCheck(tmode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, 
"CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + CeedCheck(t_mode != CEED_TRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); + + int elemsPerBlock = 1; // basis->Q_1d < 7 ? optElems[basis->Q_1d] : 1; + int grid = num_elem / elemsPerBlock + ((num_elem / elemsPerBlock * elemsPerBlock < num_elem) ? 1 : 0); - int elemsPerBlock = 1; // basis->Q1d < 7 ? optElems[basis->Q1d] : 1; - int grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); - magma_weight_nontensor(grid, nqpt, nelem, nqpt, impl->dqweight, dv, data->queue); + magma_weight_nontensor(grid, num_qpts, num_elem, num_qpts, impl->d_q_weight, dv, data->queue); } break; // LCOV_EXCL_START @@ -407,7 +411,7 @@ CEED_INTERN "C" // must sync to ensure completeness ceed_magma_queue_sync(data->queue); - if (emode != CEED_EVAL_WEIGHT) { + if (e_mode != CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorRestoreArrayRead(U, &du)); } CeedCallBackend(CeedVectorRestoreArray(V, &dv)); @@ -419,23 +423,21 @@ CEED_INTERN "C" #endif int CeedBasisDestroy_Magma(CeedBasis basis) { + Ceed ceed; CeedBasis_Magma *impl; - CeedCallBackend(CeedBasisGetData(basis, &impl)); - CeedCallBackend(magma_free(impl->dqref1d)); - CeedCallBackend(magma_free(impl->dinterp1d)); - CeedCallBackend(magma_free(impl->dgrad1d)); - CeedCallBackend(magma_free(impl->dqweight1d)); - Ceed ceed; + CeedCallBackend(CeedBasisGetData(basis, &impl)); + CeedCallBackend(magma_free(impl->d_q_ref_1d)); + CeedCallBackend(magma_free(impl->d_interp_1d)); + CeedCallBackend(magma_free(impl->d_grad_1d)); + CeedCallBackend(magma_free(impl->d_q_weight_1d)); CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); #ifdef CEED_MAGMA_USE_HIP CeedCallHip(ceed, hipModuleUnload(impl->module)); #else CeedCallCuda(ceed, cuModuleUnload(impl->module)); #endif - CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -444,14 +446,14 @@ CEED_INTERN "C" #endif int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) {
+ Ceed ceed; CeedBasisNonTensor_Magma *impl; - CeedCallBackend(CeedBasisGetData(basis, &impl)); - CeedCallBackend(magma_free(impl->dqref)); - CeedCallBackend(magma_free(impl->dinterp)); - CeedCallBackend(magma_free(impl->dgrad)); - CeedCallBackend(magma_free(impl->dqweight)); - Ceed ceed; + CeedCallBackend(CeedBasisGetData(basis, &impl)); + CeedCallBackend(magma_free(impl->d_q_ref)); + CeedCallBackend(magma_free(impl->d_interp)); + CeedCallBackend(magma_free(impl->d_grad)); + CeedCallBackend(magma_free(impl->d_q_weight)); CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); #ifdef CEED_MAGMA_USE_HIP for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { @@ -463,7 +465,6 @@ CEED_INTERN "C" } #endif CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -471,23 +472,22 @@ CEED_INTERN "C" CEED_INTERN "C" #endif int - CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, - const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis) { + CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { + Ceed ceed, ceed_delegate; + Ceed_Magma *data; + char *magma_common_path, *interp_path, *grad_path, *weight_path, *basis_kernel_source; + CeedInt num_comp = 0; CeedBasis_Magma *impl; + CeedCallBackend(CeedCalloc(1, &impl)); - Ceed ceed; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); // Check for supported parameters - CeedInt ncomp = 0; - CeedCallBackend(CeedBasisGetNumComponents(basis, &ncomp)); - Ceed_Magma *data; + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedGetData(ceed, &data)); // Compile kernels - char *magma_common_path; - char *interp_path, *grad_path, *weight_path; - char *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", 
&magma_common_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); @@ -496,28 +496,30 @@ CEED_INTERN "C" char *interp_name_base = "ceed/jit-source/magma/interp"; CeedInt interp_name_len = strlen(interp_name_base) + 6; char interp_name[interp_name_len]; + snprintf(interp_name, interp_name_len, "%s-%" CeedInt_FMT "d.h", interp_name_base, dim); CeedCallBackend(CeedGetJitAbsolutePath(ceed, interp_name, &interp_path)); CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, interp_path, &basis_kernel_source)); char *grad_name_base = "ceed/jit-source/magma/grad"; CeedInt grad_name_len = strlen(grad_name_base) + 6; char grad_name[grad_name_len]; + snprintf(grad_name, grad_name_len, "%s-%" CeedInt_FMT "d.h", grad_name_base, dim); CeedCallBackend(CeedGetJitAbsolutePath(ceed, grad_name, &grad_path)); CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, grad_path, &basis_kernel_source)); char *weight_name_base = "ceed/jit-source/magma/weight"; CeedInt weight_name_len = strlen(weight_name_base) + 6; char weight_name[weight_name_len]; + snprintf(weight_name, weight_name_len, "%s-%" CeedInt_FMT "d.h", weight_name_base, dim); CeedCallBackend(CeedGetJitAbsolutePath(ceed, weight_name, &weight_path)); CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_path, &basis_kernel_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip // data - Ceed delegate; - CeedCallBackend(CeedGetDelegate(ceed, &delegate)); - CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", ncomp, "P", P1d, "Q", Q1d, "MAXPQ", - CeedIntMax(P1d, Q1d))); + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module, 5, "DIM", dim, "NCOMP", num_comp, "P", P_1d, "Q", Q_1d, "MAXPQ", + CeedIntMax(P_1d, Q_1d))); // Kernel setup switch (dim) { @@ -546,21 +548,21 @@ CEED_INTERN "C" CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Magma)); - // Copy qref1d to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dqref1d, Q1d * sizeof(qref1d[0]))); - magma_setvector(Q1d, sizeof(qref1d[0]), qref1d, 1, impl->dqref1d, 1, data->queue); + // Copy q_ref_1d to the GPU + CeedCallBackend(magma_malloc((void **)&impl->d_q_ref_1d, Q_1d * sizeof(q_ref_1d[0]))); + magma_setvector(Q_1d, sizeof(q_ref_1d[0]), q_ref_1d, 1, impl->d_q_ref_1d, 1, data->queue); - // Copy interp1d to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dinterp1d, Q1d * P1d * sizeof(interp1d[0]))); - magma_setvector(Q1d * P1d, sizeof(interp1d[0]), interp1d, 1, impl->dinterp1d, 1, data->queue); + // Copy interp_1d to the GPU + CeedCallBackend(magma_malloc((void **)&impl->d_interp_1d, Q_1d * P_1d * sizeof(interp_1d[0]))); + magma_setvector(Q_1d * P_1d, sizeof(interp_1d[0]), interp_1d, 1, impl->d_interp_1d, 1, data->queue); - // Copy grad1d to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dgrad1d, Q1d * P1d * sizeof(grad1d[0]))); - magma_setvector(Q1d * P1d, sizeof(grad1d[0]), grad1d, 1, impl->dgrad1d, 1, data->queue); + // Copy grad_1d to the GPU + CeedCallBackend(magma_malloc((void **)&impl->d_grad_1d, Q_1d * 
P_1d * sizeof(grad_1d[0]))); + magma_setvector(Q_1d * P_1d, sizeof(grad_1d[0]), grad_1d, 1, impl->d_grad_1d, 1, data->queue); - // Copy qweight1d to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dqweight1d, Q1d * sizeof(qweight1d[0]))); - magma_setvector(Q1d, sizeof(qweight1d[0]), qweight1d, 1, impl->dqweight1d, 1, data->queue); + // Copy q_weight_1d to the GPU + CeedCallBackend(magma_malloc((void **)&impl->d_q_weight_1d, Q_1d * sizeof(q_weight_1d[0]))); + magma_setvector(Q_1d, sizeof(q_weight_1d[0]), q_weight_1d, 1, impl->d_q_weight_1d, 1, data->queue); CeedCallBackend(CeedBasisSetData(basis, impl)); CeedCallBackend(CeedFree(&magma_common_path)); @@ -568,7 +570,6 @@ CEED_INTERN "C" CeedCallBackend(CeedFree(&grad_path)); CeedCallBackend(CeedFree(&weight_path)); CeedCallBackend(CeedFree(&basis_kernel_source)); - return CEED_ERROR_SUCCESS; } @@ -576,20 +577,19 @@ CEED_INTERN "C" CEED_INTERN "C" #endif int - CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, const CeedScalar *grad, - const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis) { + CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { + Ceed ceed, ceed_delegate; + Ceed_Magma *data; + char *magma_common_path, *interp_path, *grad_path, *basis_kernel_source; CeedBasisNonTensor_Magma *impl; - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Magma *data; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedGetData(ceed, &data)); magma_int_t arch = magma_getdevice_arch(); + CeedCallBackend(CeedCalloc(1, &impl)); // Compile kernels - char *magma_common_path; - char *interp_path, *grad_path; - char *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma_common_defs.h", &magma_common_path)); 
CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, magma_common_path, &basis_kernel_source)); @@ -605,21 +605,21 @@ CEED_INTERN "C" CeedInt nb_interp_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedInt nb_grad_n[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedInt nb_grad_t[MAGMA_NONTENSOR_KERNEL_INSTANCES]; - CeedInt P = ndof, Q = nqpts; - CeedInt Narray[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; + CeedInt P = num_dof, Q = num_qpts; + CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_N_VALUES}; + for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { - nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, Narray[in], Q); - nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, Narray[in], Q); - nb_grad_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, Narray[in], Q); - nb_grad_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, Narray[in], Q); + nb_interp_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_NOTRANSPOSE, P, n_array[in], Q); + nb_interp_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_INTERP, CEED_TRANSPOSE, P, n_array[in], Q); + nb_grad_n[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_NOTRANSPOSE, P, n_array[in], Q); + nb_grad_t[in] = nontensor_rtc_get_nb(arch, 'd', CEED_EVAL_GRAD, CEED_TRANSPOSE, P, n_array[in], Q); } // compile - Ceed delegate; - CeedCallBackend(CeedGetDelegate(ceed, &delegate)); + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { - CeedCallBackend(CeedCompileMagma(delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N", nb_interp_n[in], - "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in])); + 
CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[in], 7, "DIM", dim, "P", P, "Q", Q, "NB_INTERP_N", + nb_interp_n[in], "NB_INTERP_T", nb_interp_t[in], "NB_GRAD_N", nb_grad_n[in], "NB_GRAD_T", nb_grad_t[in])); } // get kernels @@ -633,21 +633,21 @@ CEED_INTERN "C" CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Magma)); - // Copy qref to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dqref, nqpts * sizeof(qref[0]))); - magma_setvector(nqpts, sizeof(qref[0]), qref, 1, impl->dqref, 1, data->queue); + // Copy q_ref to the GPU + CeedCallBackend(magma_malloc((void **)&impl->d_q_ref, num_qpts * sizeof(q_ref[0]))); + magma_setvector(num_qpts, sizeof(q_ref[0]), q_ref, 1, impl->d_q_ref, 1, data->queue); // Copy interp to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dinterp, nqpts * ndof * sizeof(interp[0]))); - magma_setvector(nqpts * ndof, sizeof(interp[0]), interp, 1, impl->dinterp, 1, data->queue); + CeedCallBackend(magma_malloc((void **)&impl->d_interp, num_qpts * num_dof * sizeof(interp[0]))); + magma_setvector(num_qpts * num_dof, sizeof(interp[0]), interp, 1, impl->d_interp, 1, data->queue); // Copy grad to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dgrad, nqpts * ndof * dim * sizeof(grad[0]))); - magma_setvector(nqpts * ndof * dim, sizeof(grad[0]), grad, 1, impl->dgrad, 1, data->queue); + CeedCallBackend(magma_malloc((void **)&impl->d_grad, num_qpts * num_dof * dim * sizeof(grad[0]))); + magma_setvector(num_qpts * num_dof * dim, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue); - // Copy qweight to the GPU - CeedCallBackend(magma_malloc((void **)&impl->dqweight, nqpts * sizeof(qweight[0]))); - magma_setvector(nqpts, sizeof(qweight[0]), qweight, 1, impl->dqweight, 1, data->queue); + // Copy q_weight to the GPU + CeedCallBackend(magma_malloc((void 
**)&impl->d_q_weight, num_qpts * sizeof(q_weight[0]))); + magma_setvector(num_qpts, sizeof(q_weight[0]), q_weight, 1, impl->d_q_weight, 1, data->queue); CeedCallBackend(CeedBasisSetData(basis, impl)); CeedCallBackend(CeedFree(&magma_common_path)); diff --git a/backends/magma/ceed-magma-common.c b/backends/magma/ceed-magma-common.c index f04e0ca946..dcd7598913 100644 --- a/backends/magma/ceed-magma-common.c +++ b/backends/magma/ceed-magma-common.c @@ -18,16 +18,16 @@ int CeedInit_Magma_common(Ceed ceed, const char *resource) { const char *device_spec = strstr(resource, ":device_id="); const int device_id = (device_spec) ? atoi(device_spec + 11) : -1; + int current_device_id; + Ceed_Magma *data; CeedCallBackend(magma_init()); - int current_device_id; magma_getdevice(&current_device_id); if (device_id >= 0 && current_device_id != device_id) { magma_setdevice(device_id); current_device_id = device_id; } - Ceed_Magma *data; CeedCallBackend(CeedGetData(ceed, &data)); data->device_id = current_device_id; #ifdef CEED_MAGMA_USE_HIP @@ -43,6 +43,7 @@ int CeedInit_Magma_common(Ceed ceed, const char *resource) { //------------------------------------------------------------------------------ int CeedDestroy_Magma(Ceed ceed) { Ceed_Magma *data; + CeedCallBackend(CeedGetData(ceed, &data)); magma_queue_destroy(data->queue); CeedCallBackend(CeedFree(&data)); diff --git a/backends/magma/ceed-magma-det.c b/backends/magma/ceed-magma-det.c index 69e8b1f9de..5896568790 100644 --- a/backends/magma/ceed-magma-det.c +++ b/backends/magma/ceed-magma-det.c @@ -16,18 +16,19 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Magma_Det(const char *resource, Ceed ceed) { - const int nrc = 18; // number of characters in resource + Ceed ceed_ref; + Ceed_Magma *data; + const int nrc = 18; // number of characters in resource + CeedCheck(!strncmp(resource, "/gpu/cuda/magma/det", nrc) || !strncmp(resource, "/gpu/hip/magma/det", nrc), ceed, 
CEED_ERROR_BACKEND, "Magma backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); - Ceed_Magma *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Magma_common(ceed, resource)); // Create reference Ceed that implementation will be dispatched through - Ceed ceed_ref; #ifdef CEED_MAGMA_USE_HIP CeedCallBackend(CeedInit("/gpu/hip/magma", &ceed_ref)); #else @@ -36,7 +37,6 @@ static int CeedInit_Magma_Det(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/magma/ceed-magma.c b/backends/magma/ceed-magma.c index 544b6a92dc..92fc1b3d8d 100644 --- a/backends/magma/ceed-magma.c +++ b/backends/magma/ceed-magma.c @@ -18,17 +18,18 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Magma(const char *resource, Ceed ceed) { - const int nrc = 14; // number of characters in resource + Ceed ceed_ref; + Ceed_Magma *data; + const int nrc = 14; // number of characters in resource + CeedCheck(!strncmp(resource, "/gpu/cuda/magma", nrc) || !strncmp(resource, "/gpu/hip/magma", nrc), ceed, CEED_ERROR_BACKEND, "Magma backend cannot use resource: %s", resource); - Ceed_Magma *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Magma_common(ceed, resource)); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; #ifdef CEED_MAGMA_USE_HIP CeedCallBackend(CeedInit("/gpu/hip/ref", &ceed_ref)); #else @@ -39,7 +40,6 @@ static int CeedInit_Magma(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, 
"BasisCreateH1", CeedBasisCreateH1_Magma)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Magma)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h index e3f1c768b1..fcba887fe3 100644 --- a/backends/magma/ceed-magma.h +++ b/backends/magma/ceed-magma.h @@ -60,10 +60,10 @@ typedef struct { CeedMagmaFunction magma_grad; CeedMagmaFunction magma_grad_tr; CeedMagmaFunction magma_weight; - CeedScalar *dqref1d; - CeedScalar *dinterp1d; - CeedScalar *dgrad1d; - CeedScalar *dqweight1d; + CeedScalar *d_q_ref_1d; + CeedScalar *d_interp_1d; + CeedScalar *d_grad_1d; + CeedScalar *d_q_weight_1d; } CeedBasis_Magma; typedef struct { @@ -72,30 +72,30 @@ typedef struct { CeedMagmaFunction magma_interp_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction magma_grad_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction magma_grad_tr_nontensor[MAGMA_NONTENSOR_KERNEL_INSTANCES]; - CeedScalar *dqref; - CeedScalar *dinterp; - CeedScalar *dgrad; - CeedScalar *dqweight; + CeedScalar *d_q_ref; + CeedScalar *d_interp; + CeedScalar *d_grad; + CeedScalar *d_q_weight; } CeedBasisNonTensor_Magma; -CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t nelem, magma_int_t Q, CeedScalar *dqweight, CeedScalar *dv, - magma_queue_t queue); +CEED_INTERN void magma_weight_nontensor(magma_int_t grid, magma_int_t threads, magma_int_t num_elem, magma_int_t Q, CeedScalar *d_q_weight, + CeedScalar *d_v, magma_queue_t queue); -CEED_INTERN int magma_gemm_nontensor(magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha, - const CeedScalar *dA, magma_int_t ldda, const CeedScalar *dB, magma_int_t lddb, CeedScalar beta, CeedScalar *dC, - magma_int_t lddc, magma_queue_t queue); +CEED_INTERN int magma_gemm_nontensor(magma_trans_t trans_A, magma_trans_t trans_B, magma_int_t m, magma_int_t n, magma_int_t k, CeedScalar alpha, 
+ const CeedScalar *d_A, magma_int_t ldda, const CeedScalar *d_B, magma_int_t lddb, CeedScalar beta, + CeedScalar *d_C, magma_int_t lddc, magma_queue_t queue); -CEED_INTERN void gemm_selector(int gpu_arch, char precision, char transA, int m, int n, int k, int *nbatch, int *use_magma); +CEED_INTERN void gemm_selector(int gpu_arch, char precision, char trans_A, int m, int n, int k, int *n_batch, int *use_magma); -CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode emode, CeedTransposeMode tmode, int P_, int N, int Q_); +CEED_INTERN CeedInt nontensor_rtc_get_nb(int gpu_arch, char precision, CeedEvalMode e_mode, CeedTransposeMode t_mode, int P_, int N, int Q_); CEED_INTERN magma_int_t magma_isdevptr(const void *A); -CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P1d, CeedInt Q1d, const CeedScalar *interp1d, const CeedScalar *grad1d, - const CeedScalar *qref1d, const CeedScalar *qweight1d, CeedBasis basis); +CEED_INTERN int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); -CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); +CEED_INTERN int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); // Comment the line below to use the default magma_is_devptr function #define magma_is_devptr magma_isdevptr diff --git a/backends/memcheck/ceed-memcheck-blocked.c b/backends/memcheck/ceed-memcheck-blocked.c index 634f7e8b6e..866f7133ef 100644 --- a/backends/memcheck/ceed-memcheck-blocked.c +++ b/backends/memcheck/ceed-memcheck-blocked.c @@ 
-15,17 +15,17 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { + Ceed ceed_ref; + CeedCheck(!strcmp(resource, "/cpu/self/memcheck/blocked"), ceed, CEED_ERROR_BACKEND, "Valgrind Memcheck backend cannot use resource: %s", resource); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/ref/blocked", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Memcheck)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck-qfunction.c b/backends/memcheck/ceed-memcheck-qfunction.c index 00276eebe0..1d4822abe9 100644 --- a/backends/memcheck/ceed-memcheck-qfunction.c +++ b/backends/memcheck/ceed-memcheck-qfunction.c @@ -17,25 +17,23 @@ // QFunction Apply //------------------------------------------------------------------------------ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { - Ceed ceed; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - + Ceed ceed; + void *ctx_data = NULL; + CeedInt num_in, num_out; + CeedQFunctionUser f = NULL; CeedQFunction_Memcheck *impl; - CeedCallBackend(CeedQFunctionGetData(qf, &impl)); - void *ctx_data = NULL; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data)); - - CeedQFunctionUser f = NULL; CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f)); - - CeedInt num_in, num_out; 
CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_in, &num_out)); for (CeedInt i = 0; i < num_in; i++) { CeedCallBackend(CeedVectorGetArrayRead(U[i], CEED_MEM_HOST, &impl->inputs[i])); } int mem_block_ids[num_out]; + for (CeedInt i = 0; i < num_out; i++) { CeedSize len; char name[32] = ""; @@ -66,7 +64,6 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector * VALGRIND_DISCARD(mem_block_ids[i]); } CeedCallBackend(CeedQFunctionRestoreContextData(qf, &ctx_data)); - return CEED_ERROR_SUCCESS; } @@ -75,12 +72,11 @@ static int CeedQFunctionApply_Memcheck(CeedQFunction qf, CeedInt Q, CeedVector * //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Memcheck(CeedQFunction qf) { CeedQFunction_Memcheck *impl; - CeedCallBackend(CeedQFunctionGetData(qf, (void *)&impl)); + CeedCallBackend(CeedQFunctionGetData(qf, (void *)&impl)); CeedCallBackend(CeedFree(&impl->inputs)); CeedCallBackend(CeedFree(&impl->outputs)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -88,18 +84,16 @@ static int CeedQFunctionDestroy_Memcheck(CeedQFunction qf) { // QFunction Create //------------------------------------------------------------------------------ int CeedQFunctionCreate_Memcheck(CeedQFunction qf) { - Ceed ceed; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - + Ceed ceed; CeedQFunction_Memcheck *impl; + + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->inputs)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->outputs)); CeedCallBackend(CeedQFunctionSetData(qf, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Memcheck)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck-qfunctioncontext.c 
b/backends/memcheck/ceed-memcheck-qfunctioncontext.c index ae6750a17c..c3a4f75db6 100644 --- a/backends/memcheck/ceed-memcheck-qfunctioncontext.c +++ b/backends/memcheck/ceed-memcheck-qfunctioncontext.c @@ -18,10 +18,9 @@ //------------------------------------------------------------------------------ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Memcheck *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); *has_valid_data = impl->data; - return CEED_ERROR_SUCCESS; } @@ -29,9 +28,10 @@ static int CeedQFunctionContextHasValidData_Memcheck(CeedQFunctionContext ctx, b // QFunctionContext has borrowed data //------------------------------------------------------------------------------ static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { + Ceed ceed; CeedQFunctionContext_Memcheck *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); switch (mem_type) { @@ -44,7 +44,6 @@ static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck(CeedQFunctionConte // LCOV_EXCL_STOP break; } - return CEED_ERROR_SUCCESS; } @@ -52,11 +51,12 @@ static int CeedQFunctionContextHasBorrowedDataOfType_Memcheck(CeedQFunctionConte // QFunctionContext Set Data //------------------------------------------------------------------------------ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, void *data) { + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Memcheck *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - size_t ctx_size; CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); - Ceed ceed; 
CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); @@ -85,7 +85,6 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe impl->data = impl->data_allocated; VALGRIND_DISCARD(impl->mem_block_id); impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->data, ctx_size, "'QFunction backend context data copy'"); - return CEED_ERROR_SUCCESS; } @@ -93,9 +92,10 @@ static int CeedQFunctionContextSetData_Memcheck(CeedQFunctionContext ctx, CeedMe // QFunctionContext Take Data //------------------------------------------------------------------------------ static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + Ceed ceed; CeedQFunctionContext_Memcheck *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); @@ -105,7 +105,6 @@ static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedM impl->data = NULL; VALGRIND_DISCARD(impl->mem_block_id); CeedCallBackend(CeedFree(&impl->data_allocated)); - return CEED_ERROR_SUCCESS; } @@ -113,15 +112,15 @@ static int CeedQFunctionContextTakeData_Memcheck(CeedQFunctionContext ctx, CeedM // QFunctionContext Get Data //------------------------------------------------------------------------------ static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + Ceed ceed; CeedQFunctionContext_Memcheck *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); *(void **)data = 
impl->data; - return CEED_ERROR_SUCCESS; } @@ -129,19 +128,18 @@ static int CeedQFunctionContextGetData_Memcheck(CeedQFunctionContext ctx, CeedMe // QFunctionContext Get Data Read-Only //------------------------------------------------------------------------------ static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Memcheck *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - size_t ctx_size; CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - CeedCallBackend(CeedQFunctionContextGetData_Memcheck(ctx, mem_type, data)); // Make copy to verify no write occurred CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->data_read_only_copy)); memcpy(impl->data_read_only_copy, *(void **)data, ctx_size); - return CEED_ERROR_SUCCESS; } @@ -149,18 +147,14 @@ static int CeedQFunctionContextGetDataRead_Memcheck(CeedQFunctionContext ctx, Ce // QFunctionContext Restore Data //------------------------------------------------------------------------------ static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) { - size_t ctx_size; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + size_t ctx_size; CeedQFunctionContext_Memcheck *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - if (impl->data_borrowed) { - memcpy(impl->data_borrowed, impl->data, ctx_size); - } - if (impl->data_owned) { - memcpy(impl->data_owned, impl->data, ctx_size); - } + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); + if (impl->data_borrowed) memcpy(impl->data_borrowed, impl->data, ctx_size); + if (impl->data_owned) memcpy(impl->data_owned, impl->data, ctx_size); return CEED_ERROR_SUCCESS; } @@ -168,18 +162,18 @@ 
static int CeedQFunctionContextRestoreData_Memcheck(CeedQFunctionContext ctx) { // QFunctionContext Restore Data Read-Only //------------------------------------------------------------------------------ static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx) { - size_t ctx_size; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Memcheck *impl; + + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(!memcmp(impl->data, impl->data_read_only_copy, ctx_size), ceed, CEED_ERROR_BACKEND, "Context data changed while accessed in read-only mode"); CeedCallBackend(CeedFree(&impl->data_read_only_copy)); - return CEED_ERROR_SUCCESS; } @@ -187,12 +181,13 @@ static int CeedQFunctionContextRestoreDataRead_Memcheck(CeedQFunctionContext ctx // QFunctionContext destroy user data //------------------------------------------------------------------------------ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) { - CeedQFunctionContext_Memcheck *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - CeedQFunctionContextDataDestroyUser data_destroy_function; + Ceed ceed; CeedMemType data_destroy_mem_type; + CeedQFunctionContextDataDestroyUser data_destroy_function; + CeedQFunctionContext_Memcheck *impl; + + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedQFunctionContextGetDataDestroy(ctx, &data_destroy_mem_type, &data_destroy_function)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(data_destroy_mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only destroy HOST memory for this backend"); @@ -201,7 +196,6 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) { 
CeedCallBackend(data_destroy_function(impl->data_borrowed ? impl->data_borrowed : impl->data_owned)); } CeedCallBackend(CeedFree(&impl->data_allocated)); - return CEED_ERROR_SUCCESS; } @@ -210,8 +204,8 @@ static int CeedQFunctionContextDataDestroy_Memcheck(CeedQFunctionContext ctx) { //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Memcheck(CeedQFunctionContext ctx) { CeedQFunctionContext_Memcheck *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedFree(&impl->data_allocated)); CeedCallBackend(CeedFree(&impl->data_owned)); CeedCallBackend(CeedFree(&impl)); @@ -222,10 +216,10 @@ static int CeedQFunctionContextDestroy_Memcheck(CeedQFunctionContext ctx) { // QFunctionContext Create //------------------------------------------------------------------------------ int CeedQFunctionContextCreate_Memcheck(CeedQFunctionContext ctx) { - CeedQFunctionContext_Memcheck *impl; Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedQFunctionContext_Memcheck *impl; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Memcheck)); @@ -236,10 +230,8 @@ int CeedQFunctionContextCreate_Memcheck(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreDataRead_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "DataDestroy", CeedQFunctionContextDataDestroy_Memcheck)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Memcheck)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck-serial.c b/backends/memcheck/ceed-memcheck-serial.c index 43ced60b9c..94233fa9cf 100644 --- a/backends/memcheck/ceed-memcheck-serial.c +++ b/backends/memcheck/ceed-memcheck-serial.c @@ -15,18 +15,18 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Memcheck(const char *resource, Ceed ceed) { + Ceed ceed_ref; + CeedCheck(!strcmp(resource, "/cpu/self/memcheck") || !strcmp(resource, "/cpu/self/memcheck/serial"), ceed, CEED_ERROR_BACKEND, "Valgrind Memcheck backend cannot use resource: %s", resource); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Memcheck)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/memcheck/ceed-memcheck-vector.c b/backends/memcheck/ceed-memcheck-vector.c index e0cb3efc12..e5b7282537 100644 --- a/backends/memcheck/ceed-memcheck-vector.c +++ b/backends/memcheck/ceed-memcheck-vector.c @@ -19,10 +19,9 @@ //------------------------------------------------------------------------------ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_array) { CeedVector_Memcheck *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, 
&impl)); *has_valid_array = impl->array; - return CEED_ERROR_SUCCESS; } @@ -30,9 +29,10 @@ static int CeedVectorHasValidArray_Memcheck(CeedVector vec, bool *has_valid_arra // Check if has borrowed array of given type //------------------------------------------------------------------------------ static inline int CeedVectorHasBorrowedArrayOfType_Memcheck(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { + Ceed ceed; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); switch (mem_type) { @@ -45,7 +45,6 @@ static inline int CeedVectorHasBorrowedArrayOfType_Memcheck(const CeedVector vec // LCOV_EXCL_STOP break; } - return CEED_ERROR_SUCCESS; } @@ -53,11 +52,12 @@ static inline int CeedVectorHasBorrowedArrayOfType_Memcheck(const CeedVector vec // Vector Set Array //------------------------------------------------------------------------------ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array) { + Ceed ceed; + CeedSize length; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); @@ -90,7 +90,6 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee impl->array = impl->array_allocated; VALGRIND_DISCARD(impl->mem_block_id); impl->mem_block_id = VALGRIND_CREATE_BLOCK(impl->array, length * sizeof(array[0]), "'Vector backend array data copy'"); - return CEED_ERROR_SUCCESS; } @@ -98,9 +97,10 @@ static int CeedVectorSetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee // Vector Take Array //------------------------------------------------------------------------------ static int 
CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + Ceed ceed; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); @@ -110,7 +110,6 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce impl->array = NULL; VALGRIND_DISCARD(impl->mem_block_id); CeedCallBackend(CeedFree(&impl->array_allocated)); - return CEED_ERROR_SUCCESS; } @@ -118,15 +117,15 @@ static int CeedVectorTakeArray_Memcheck(CeedVector vec, CeedMemType mem_type, Ce // Vector Get Array //------------------------------------------------------------------------------ static int CeedVectorGetArray_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + Ceed ceed; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); *array = impl->array; - return CEED_ERROR_SUCCESS; } @@ -134,11 +133,12 @@ static int CeedVectorGetArray_Memcheck(CeedVector vec, CeedMemType mem_type, Cee // Vector Get Array Read //------------------------------------------------------------------------------ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) { + Ceed ceed; + CeedSize length; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, (CeedScalar **)array)); @@ -148,7 +148,6 @@ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type, CeedCallBackend(CeedCalloc(length, 
&impl->array_read_only_copy)); memcpy(impl->array_read_only_copy, *array, length * sizeof((*array)[0])); } - return CEED_ERROR_SUCCESS; } @@ -156,11 +155,12 @@ static int CeedVectorGetArrayRead_Memcheck(CeedVector vec, CeedMemType mem_type, // Vector Get Array Write //------------------------------------------------------------------------------ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + Ceed ceed; + CeedSize length; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); // Invalidate data to make sure no read occurs @@ -168,7 +168,6 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type CeedCallBackend(CeedVectorGetArray_Memcheck(vec, mem_type, array)); for (CeedSize i = 0; i < length; i++) (*array)[i] = NAN; impl->is_write_only_access = true; - return CEED_ERROR_SUCCESS; } @@ -176,11 +175,12 @@ static int CeedVectorGetArrayWrite_Memcheck(CeedVector vec, CeedMemType mem_type // Vector Restore Array //------------------------------------------------------------------------------ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) { + Ceed ceed; + CeedSize length; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); if (impl->is_write_only_access) { @@ -195,7 +195,6 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) { if (impl->array_owned) { memcpy(impl->array_owned, impl->array, length * sizeof(impl->array[0])); } - return CEED_ERROR_SUCCESS; } @@ -203,18 +202,18 @@ static int CeedVectorRestoreArray_Memcheck(CeedVector vec) { // Vector Restore Array Read-Only //------------------------------------------------------------------------------ static int 
CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) { + Ceed ceed; + CeedSize length; CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCheck(!memcmp(impl->array, impl->array_read_only_copy, length * sizeof(impl->array[0])), ceed, CEED_ERROR_BACKEND, "Array data changed while accessed in read-only mode"); CeedCallBackend(CeedFree(&impl->array_read_only_copy)); - return CEED_ERROR_SUCCESS; } @@ -223,8 +222,8 @@ static int CeedVectorRestoreArrayRead_Memcheck(CeedVector vec) { //------------------------------------------------------------------------------ static int CeedVectorDestroy_Memcheck(CeedVector vec) { CeedVector_Memcheck *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); VALGRIND_DISCARD(impl->mem_block_id); CeedCallBackend(CeedFree(&impl->array_allocated)); CeedCallBackend(CeedFree(&impl->array_owned)); @@ -236,10 +235,10 @@ static int CeedVectorDestroy_Memcheck(CeedVector vec) { // Vector Create //------------------------------------------------------------------------------ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) { - CeedVector_Memcheck *impl; Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedVector_Memcheck *impl; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Memcheck)); @@ -250,10 +249,8 @@ int CeedVectorCreate_Memcheck(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Memcheck)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Memcheck)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Memcheck)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedVectorSetData(vec, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/opt/ceed-opt-blocked.c b/backends/opt/ceed-opt-blocked.c index ddc824dec4..2d095327c1 100644 --- a/backends/opt/ceed-opt-blocked.c +++ b/backends/opt/ceed-opt-blocked.c @@ -17,9 +17,9 @@ //------------------------------------------------------------------------------ static int CeedDestroy_Opt(Ceed ceed) { Ceed_Opt *data; + CeedCallBackend(CeedGetData(ceed, &data)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -27,29 +27,28 @@ static int CeedDestroy_Opt(Ceed ceed) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Opt_Blocked(const char *resource, Ceed ceed) { + Ceed ceed_ref; + const char fallback_resource[] = "/cpu/self/ref/serial"; + Ceed_Opt *data; + CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/opt") || !strcmp(resource, "/cpu/self/opt/blocked"), ceed, CEED_ERROR_BACKEND, "Opt backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference Ceed that implementation will be dispatched through unless overridden - - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); // Set fallback Ceed resource for advanced operator functionality - const char fallbackresource[] = "/cpu/self/ref/serial"; - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, 
"Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Opt)); // Set block size - Ceed_Opt *data; CeedCallBackend(CeedCalloc(1, &data)); - data->blk_size = 8; + data->block_size = 8; CeedCallBackend(CeedSetData(ceed, data)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/opt/ceed-opt-operator.c b/backends/opt/ceed-opt-operator.c index 1c69000e9a..5b38087a84 100644 --- a/backends/opt/ceed-opt-operator.c +++ b/backends/opt/ceed-opt-operator.c @@ -16,17 +16,16 @@ //------------------------------------------------------------------------------ // Setup Input/Output Fields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt blk_size, CeedElemRestriction *blk_restr, +static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool is_input, const CeedInt block_size, CeedElemRestriction *block_rstr, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { - CeedInt num_comp, size, P; - CeedSize e_size, q_size; - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedBasis basis; - CeedElemRestriction r; - CeedOperatorField *op_fields; + Ceed ceed; + CeedSize e_size, q_size; + CeedInt num_comp, size, P; CeedQFunctionField *qf_fields; + CeedOperatorField *op_fields; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); if (is_input) { CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); @@ -38,67 +37,74 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i // Loop over fields for (CeedInt i = 0; i < num_fields; i++) { CeedEvalMode eval_mode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedBasis basis; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != 
CEED_EVAL_WEIGHT) { - Ceed ceed_rstr; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &r)); - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed_rstr)); - CeedSize l_size; - CeedInt num_elem, elem_size, comp_stride; - CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); - CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); - + Ceed ceed_rstr; + CeedSize l_size; + CeedInt num_elem, elem_size, comp_stride; CeedRestrictionType rstr_type; - CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); + CeedElemRestriction rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed_rstr)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); switch (rstr_type) { case CEED_RESTRICTION_STANDARD: { const CeedInt *offsets = NULL; - CeedCallBackend(CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets)); - CeedCallBackend(CeedElemRestrictionCreateBlocked(ceed_rstr, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, CEED_MEM_HOST, - CEED_COPY_VALUES, offsets, &blk_restr[i + start_e])); - CeedCallBackend(CeedElemRestrictionRestoreOffsets(r, &offsets)); + + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); + CeedCallBackend(CeedElemRestrictionCreateBlocked(ceed_rstr, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, 
CEED_MEM_HOST, + CEED_COPY_VALUES, offsets, &block_rstr[i + start_e])); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); } break; case CEED_RESTRICTION_ORIENTED: { - const CeedInt *offsets = NULL; const bool *orients = NULL; - CeedCallBackend(CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets)); - CeedCallBackend(CeedElemRestrictionGetOrientations(r, CEED_MEM_HOST, &orients)); - CeedCallBackend(CeedElemRestrictionCreateBlockedOriented(ceed_rstr, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, - CEED_MEM_HOST, CEED_COPY_VALUES, offsets, orients, &blk_restr[i + start_e])); - CeedCallBackend(CeedElemRestrictionRestoreOffsets(r, &offsets)); - CeedCallBackend(CeedElemRestrictionRestoreOrientations(r, &orients)); + const CeedInt *offsets = NULL; + + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); + CeedCallBackend(CeedElemRestrictionGetOrientations(rstr, CEED_MEM_HOST, &orients)); + CeedCallBackend(CeedElemRestrictionCreateBlockedOriented(ceed_rstr, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, + CEED_MEM_HOST, CEED_COPY_VALUES, offsets, orients, &block_rstr[i + start_e])); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); + CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr, &orients)); } break; case CEED_RESTRICTION_CURL_ORIENTED: { - const CeedInt *offsets = NULL; const CeedInt8 *curl_orients = NULL; - CeedCallBackend(CeedElemRestrictionGetOffsets(r, CEED_MEM_HOST, &offsets)); - CeedCallBackend(CeedElemRestrictionGetCurlOrientations(r, CEED_MEM_HOST, &curl_orients)); - CeedCallBackend(CeedElemRestrictionCreateBlockedCurlOriented(ceed_rstr, num_elem, elem_size, blk_size, num_comp, comp_stride, l_size, + const CeedInt *offsets = NULL; + + CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); + CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr, CEED_MEM_HOST, &curl_orients)); + 
CeedCallBackend(CeedElemRestrictionCreateBlockedCurlOriented(ceed_rstr, num_elem, elem_size, block_size, num_comp, comp_stride, l_size, CEED_MEM_HOST, CEED_COPY_VALUES, offsets, curl_orients, - &blk_restr[i + start_e])); - CeedCallBackend(CeedElemRestrictionRestoreOffsets(r, &offsets)); - CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(r, &curl_orients)); + &block_rstr[i + start_e])); + CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); + CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr, &curl_orients)); } break; case CEED_RESTRICTION_STRIDED: { CeedInt strides[3]; - CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); - CeedCallBackend( - CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, blk_size, num_comp, l_size, strides, &blk_restr[i + start_e])); + + CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed_rstr, num_elem, elem_size, block_size, num_comp, l_size, strides, + &block_rstr[i + start_e])); } break; } - CeedCallBackend(CeedElemRestrictionCreateVector(blk_restr[i + start_e], NULL, &e_vecs_full[i + start_e])); + CeedCallBackend(CeedElemRestrictionCreateVector(block_rstr[i + start_e], NULL, &e_vecs_full[i + start_e])); } switch (eval_mode) { case CEED_EVAL_NONE: CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - e_size = (CeedSize)Q * size * blk_size; + e_size = (CeedSize)Q * size * block_size; CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i])); - q_size = (CeedSize)Q * size * blk_size; + q_size = (CeedSize)Q * size * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_INTERP: @@ -109,16 +115,16 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); CeedCallBackend(CeedBasisGetNumComponents(basis, 
&num_comp)); - e_size = (CeedSize)P * num_comp * blk_size; + e_size = (CeedSize)P * num_comp * block_size; CeedCallBackend(CeedVectorCreate(ceed, e_size, &e_vecs[i])); - q_size = (CeedSize)Q * size * blk_size; + q_size = (CeedSize)Q * size * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_WEIGHT: // Only on input fields CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); - q_size = (CeedSize)Q * blk_size; + q_size = (CeedSize)Q * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); break; } if (is_input && e_vecs[i]) { @@ -132,28 +138,30 @@ static int CeedOperatorSetupFields_Opt(CeedQFunction qf, CeedOperator op, bool i // Setup Operator //------------------------------------------------------------------------------ static int CeedOperatorSetup_Opt(CeedOperator op) { - bool is_setup_done; + Ceed ceed; + bool is_setup_done; + Ceed_Opt *ceed_impl; + CeedInt Q, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Opt *impl; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Opt *ceed_impl; CeedCallBackend(CeedGetData(ceed, &ceed_impl)); - const CeedInt blk_size = ceed_impl->blk_size; - CeedOperator_Opt *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, num_input_fields, num_output_fields; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedQFunctionIsIdentity(qf, 
&impl->is_identity_qf)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + const CeedInt block_size = ceed_impl->block_size; // Allocate - CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->blk_restr)); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->block_rstr)); CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs_full)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->input_states)); @@ -167,29 +175,29 @@ static int CeedOperatorSetup_Opt(CeedOperator op) { // Set up infield and outfield pointer arrays // Infields - CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, blk_size, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, + CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, true, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_in, impl->q_vecs_in, 0, num_input_fields, Q)); // Outfields - CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, blk_size, impl->blk_restr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, + CeedCallBackend(CeedOperatorSetupFields_Opt(qf, op, false, block_size, impl->block_rstr, impl->e_vecs_full, impl->e_vecs_out, impl->q_vecs_out, num_input_fields, num_output_fields, Q)); // Identity QFunctions if (impl->is_identity_qf) { CeedEvalMode in_mode, out_mode; CeedQFunctionField *in_fields, *out_fields; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode)); if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) { - impl->is_identity_restr_op = true; + 
impl->is_identity_rstr_op = true; } else { CeedCallBackend(CeedVectorReferenceCopy(impl->q_vecs_in[0], &impl->q_vecs_out[0])); } } CeedCallBackend(CeedOperatorSetSetupDone(op)); - return CEED_ERROR_SUCCESS; } @@ -199,11 +207,11 @@ static int CeedOperatorSetup_Opt(CeedOperator op) { static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, CeedVector in_vec, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, CeedRequest *request) { - CeedEvalMode eval_mode; - CeedVector vec; - uint64_t state; - for (CeedInt i = 0; i < num_input_fields; i++) { + uint64_t state; + CeedEvalMode eval_mode; + CeedVector vec; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { @@ -213,7 +221,7 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun // Restrict CeedCallBackend(CeedVectorGetState(vec, &state)); if (state != impl->input_states[i]) { - CeedCallBackend(CeedElemRestrictionApply(impl->blk_restr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); + CeedCallBackend(CeedElemRestrictionApply(impl->block_rstr[i], CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); impl->input_states[i] = state; } // Get evec @@ -235,36 +243,34 @@ static inline int CeedOperatorSetupInputs_Opt(CeedInt num_input_fields, CeedQFun // Input Basis Action //------------------------------------------------------------------------------ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - CeedInt num_input_fields, CeedInt blk_size, CeedVector in_vec, bool skip_active, + CeedInt num_input_fields, CeedInt block_size, CeedVector in_vec, bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Opt *impl, CeedRequest *request) { - CeedInt elem_size, size, num_comp; - CeedElemRestriction 
elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - CeedVector vec; - for (CeedInt i = 0; i < num_input_fields; i++) { + bool is_active_input = false; + CeedInt elem_size, size, num_comp; + CeedEvalMode eval_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + CeedBasis basis; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Skip active input - if (skip_active) { - if (vec == CEED_VECTOR_ACTIVE) continue; - } + is_active_input = vec == CEED_VECTOR_ACTIVE; + if (skip_active && is_active_input) continue; - CeedInt active_in = 0; // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_restr, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Restrict block active input - if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedElemRestrictionApplyBlock(impl->blk_restr[i], e / blk_size, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request)); - active_in = 1; + if (is_active_input) { + CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[i], e / block_size, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[i], request)); } // Basis action switch (eval_mode) { case CEED_EVAL_NONE: - if (!active_in) { + if (!is_active_input) { CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][e * Q * size])); } break; @@ -273,11 +279,11 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (!active_in) { + if (!is_active_input) { 
CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedVectorSetArray(impl->e_vecs_in[i], CEED_MEM_HOST, CEED_USE_POINTER, &e_data[i][e * elem_size * num_comp])); } - CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i])); + CeedCallBackend(CeedBasisApply(basis, block_size, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs_in[i], impl->q_vecs_in[i])); break; case CEED_EVAL_WEIGHT: break; // No action @@ -290,16 +296,16 @@ static inline int CeedOperatorInputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunction // Output Basis Action //------------------------------------------------------------------------------ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, - CeedInt blk_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, + CeedInt block_size, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, CeedVector out_vec, CeedOperator_Opt *impl, CeedRequest *request) { - CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - CeedVector vec; - for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + CeedBasis basis; + // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { @@ -310,7 +316,7 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio case CEED_EVAL_DIV: case CEED_EVAL_CURL: CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, blk_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); 
+ CeedCallBackend(CeedBasisApply(basis, block_size, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs_out[i])); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { @@ -326,7 +332,7 @@ static inline int CeedOperatorOutputBasis_Opt(CeedInt e, CeedInt Q, CeedQFunctio if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; // Restrict CeedCallBackend( - CeedElemRestrictionApplyBlock(impl->blk_restr[i + impl->num_inputs], e / blk_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request)); + CeedElemRestrictionApplyBlock(impl->block_rstr[i + impl->num_inputs], e / block_size, CEED_TRANSPOSE, impl->e_vecs_out[i], vec, request)); } return CEED_ERROR_SUCCESS; } @@ -339,6 +345,7 @@ static inline int CeedOperatorRestoreInputs_Opt(CeedInt num_input_fields, CeedQF for (CeedInt i = 0; i < num_input_fields; i++) { CeedEvalMode eval_mode; CeedVector vec; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (eval_mode != CEED_EVAL_WEIGHT && vec != CEED_VECTOR_ACTIVE) { @@ -352,34 +359,35 @@ static inline int CeedOperatorRestoreInputs_Opt(CeedInt num_input_fields, CeedQF // Operator Apply //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { - Ceed ceed; + Ceed ceed; + Ceed_Opt *ceed_impl; + CeedInt Q, num_input_fields, num_output_fields, num_elem; + CeedEvalMode eval_mode; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Opt *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Opt *ceed_impl; CeedCallBackend(CeedGetData(ceed, &ceed_impl)); - CeedInt blk_size = ceed_impl->blk_size; - CeedOperator_Opt *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedInt Q, 
num_input_fields, num_output_fields, num_elem; CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; + const CeedInt block_size = ceed_impl->block_size; + const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); // Setup CeedCallBackend(CeedOperatorSetup_Opt(op)); // Restriction only operator - if (impl->is_identity_restr_op) { - for (CeedInt b = 0; b < num_blks; b++) { - CeedCallBackend(CeedElemRestrictionApplyBlock(impl->blk_restr[0], b, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[0], request)); - CeedCallBackend(CeedElemRestrictionApplyBlock(impl->blk_restr[1], b, CEED_TRANSPOSE, impl->e_vecs_in[0], out_vec, request)); + if (impl->is_identity_rstr_op) { + for (CeedInt b = 0; b < num_blocks; b++) { + CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[0], b, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_in[0], request)); + CeedCallBackend(CeedElemRestrictionApplyBlock(impl->block_rstr[1], b, CEED_TRANSPOSE, impl->e_vecs_in[0], out_vec, request)); } return CEED_ERROR_SUCCESS; } @@ -400,24 +408,23 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect } // Loop through elements - for (CeedInt e = 0; e < num_blks * blk_size; e += blk_size) { + for (CeedInt e = 0; e < num_blocks * block_size; e += block_size) { // Input basis apply CeedCallBackend( - CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, 
op_input_fields, num_input_fields, blk_size, in_vec, false, e_data, impl, request)); + CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, block_size, in_vec, false, e_data, impl, request)); // Q function if (!impl->is_identity_qf) { - CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); + CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out)); } - // Output basis apply and restrict - CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, blk_size, num_input_fields, num_output_fields, op, out_vec, - impl, request)); + // Output basis apply and restriction + CeedCallBackend(CeedOperatorOutputBasis_Opt(e, Q, qf_output_fields, op_output_fields, block_size, num_input_fields, num_output_fields, op, + out_vec, impl, request)); } // Restore input arrays CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl)); - return CEED_ERROR_SUCCESS; } @@ -426,35 +433,36 @@ static int CeedOperatorApplyAdd_Opt(CeedOperator op, CeedVector in_vec, CeedVect //------------------------------------------------------------------------------ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - Ceed ceed; + Ceed ceed; + Ceed_Opt *ceed_impl; + CeedSize q_size; + CeedInt Q, num_input_fields, num_output_fields, num_elem, size; + CeedScalar *l_vec_array, *e_data[2 * CEED_FIELD_MAX] = {0}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Opt *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Opt *ceed_impl; CeedCallBackend(CeedGetData(ceed, &ceed_impl)); - const CeedInt blk_size = ceed_impl->blk_size; - CeedSize q_size; - CeedOperator_Opt *impl; 
CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedInt Q, num_input_fields, num_output_fields, num_elem, size; CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedInt num_blks = (num_elem / blk_size) + !!(num_elem % blk_size); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedVector vec, l_vec = impl->qf_l_vec; - CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; - CeedVector *active_in = impl->qf_active_in; - CeedScalar *a, *tmp; - CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; + const CeedInt block_size = ceed_impl->block_size; + const CeedInt num_blocks = (num_elem / block_size) + !!(num_elem % block_size); + CeedInt num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; + CeedVector l_vec = impl->qf_l_vec; + CeedVector *active_in = impl->qf_active_in; + CeedElemRestriction block_rstr = impl->qf_block_rstr; // Setup CeedCallBackend(CeedOperatorSetup_Opt(op)); // Check for identity - CeedCheck(!impl->is_identity_qf, ceed, CEED_ERROR_BACKEND, "Assembling identity qfunctions not supported"); + CeedCheck(!impl->is_identity_qf, ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); // Input Evecs and Restriction CeedCallBackend(CeedOperatorSetupInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, NULL, e_data, impl, request)); @@ -462,21 +470,25 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b // Count number of active input fields if (!num_active_in) { for (CeedInt i = 0; i < num_input_fields; i++) { + CeedScalar 
*q_vec_array; + CeedVector vec; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array)); CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q * blk_size; + q_size = (CeedSize)Q * block_size; CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); - CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &tmp[field * Q * blk_size])); + CeedCallBackend( + CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q * block_size])); } num_active_in += size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp)); + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } } impl->num_active_in = num_active_in; @@ -486,6 +498,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b // Count number of active output fields if (!num_active_out) { for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output @@ -502,39 +516,43 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b // Setup l_vec if (!l_vec) { - CeedSize l_size = (CeedSize)blk_size * Q * num_active_in * num_active_out; + const CeedSize l_size = (CeedSize)block_size * Q * num_active_in * num_active_out; + CeedCallBackend(CeedVectorCreate(ceed, l_size, &l_vec)); CeedCallBackend(CeedVectorSetValue(l_vec, 
0.0)); impl->qf_l_vec = l_vec; } + // Output blocked restriction + if (!block_rstr) { + CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + + CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, block_size, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, &block_rstr)); + impl->qf_block_rstr = block_rstr; + } + // Build objects if needed - CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; if (build_objects) { + const CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; + // Create output restriction CeedCallBackend(CeedElemRestrictionCreateStrided(ceed, num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q, strides, rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; - CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); - } - // Output blocked restriction - CeedElemRestriction blk_rstr = impl->qf_blk_rstr; - if (!blk_rstr) { - CeedCallBackend(CeedElemRestrictionCreateBlockedStrided(ceed, num_elem, Q, blk_size, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, &blk_rstr)); - impl->qf_blk_rstr = blk_rstr; + CeedCallBackend(CeedVectorCreate(ceed, l_size, assembled)); } // Loop through elements CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); - for (CeedInt e = 0; e < num_blks * blk_size; e += blk_size) { - CeedCallBackend(CeedVectorGetArray(l_vec, CEED_MEM_HOST, &a)); + for (CeedInt e = 0; e < num_blocks * block_size; e += block_size) { + CeedCallBackend(CeedVectorGetArray(l_vec, CEED_MEM_HOST, &l_vec_array)); // Input basis apply CeedCallBackend( - CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, op_input_fields, num_input_fields, blk_size, NULL, true, e_data, impl, request)); + CeedOperatorInputBasis_Opt(e, Q, qf_input_fields, 
op_input_fields, num_input_fields, block_size, NULL, true, e_data, impl, request)); // Assemble QFunction for (CeedInt in = 0; in < num_active_in; in++) { @@ -545,26 +563,30 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b } // Set Outputs for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, a)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, l_vec_array)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); - a += size * Q * blk_size; // Advance the pointer by the size of the output + l_vec_array += size * Q * block_size; // Advance the pointer by the size of the output } } // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q * blk_size, impl->q_vecs_in, impl->q_vecs_out)); + CeedCallBackend(CeedQFunctionApply(qf, Q * block_size, impl->q_vecs_in, impl->q_vecs_out)); } // Assemble into assembled vector - CeedCallBackend(CeedVectorRestoreArray(l_vec, &a)); - CeedCallBackend(CeedElemRestrictionApplyBlock(blk_rstr, e / blk_size, CEED_TRANSPOSE, l_vec, *assembled, request)); + CeedCallBackend(CeedVectorRestoreArray(l_vec, &l_vec_array)); + CeedCallBackend(CeedElemRestrictionApplyBlock(block_rstr, e / block_size, CEED_TRANSPOSE, l_vec, *assembled, request)); } // Un-set output Qvecs to prevent accidental overwrite of Assembled for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output @@ -575,7 +597,6 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Opt(CeedOperator op, b // Restore input arrays 
CeedCallBackend(CeedOperatorRestoreInputs_Opt(num_input_fields, qf_input_fields, op_input_fields, e_data, impl)); - return CEED_ERROR_SUCCESS; } @@ -598,13 +619,13 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Opt(CeedOperator op, CeedVe //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Opt(CeedOperator op) { CeedOperator_Opt *impl; - CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { - CeedCallBackend(CeedElemRestrictionDestroy(&impl->blk_restr[i])); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->block_rstr[i])); CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); } - CeedCallBackend(CeedFree(&impl->blk_restr)); + CeedCallBackend(CeedFree(&impl->block_rstr)); CeedCallBackend(CeedFree(&impl->e_vecs_full)); CeedCallBackend(CeedFree(&impl->input_states)); @@ -628,7 +649,7 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) { } CeedCallBackend(CeedFree(&impl->qf_active_in)); CeedCallBackend(CeedVectorDestroy(&impl->qf_l_vec)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_blk_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->qf_block_rstr)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -638,17 +659,18 @@ static int CeedOperatorDestroy_Opt(CeedOperator op) { // Operator Create //------------------------------------------------------------------------------ int CeedOperatorCreate_Opt(CeedOperator op) { - Ceed ceed; + Ceed ceed; + Ceed_Opt *ceed_impl; + CeedOperator_Opt *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Opt *ceed_impl; CeedCallBackend(CeedGetData(ceed, &ceed_impl)); - CeedInt blk_size = ceed_impl->blk_size; - CeedOperator_Opt *impl; + const CeedInt block_size = ceed_impl->block_size; CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCheck(blk_size == 1 || 
blk_size == 8, ceed, CEED_ERROR_BACKEND, "Opt backend cannot use blocksize: %" CeedInt_FMT, blk_size); + CeedCheck(block_size == 1 || block_size == 8, ceed, CEED_ERROR_BACKEND, "Opt backend cannot use blocksize: %" CeedInt_FMT, block_size); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Opt)); diff --git a/backends/opt/ceed-opt-serial.c b/backends/opt/ceed-opt-serial.c index 3e982db306..b683773933 100644 --- a/backends/opt/ceed-opt-serial.c +++ b/backends/opt/ceed-opt-serial.c @@ -17,9 +17,9 @@ //------------------------------------------------------------------------------ static int CeedDestroy_Opt(Ceed ceed) { Ceed_Opt *data; + CeedCallBackend(CeedGetData(ceed, &data)); CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; } @@ -27,30 +27,29 @@ static int CeedDestroy_Opt(Ceed ceed) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Opt_Serial(const char *resource, Ceed ceed) { + Ceed ceed_ref; + const char fallback_resource[] = "/cpu/self/ref/serial"; + Ceed_Opt *data; + CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/opt/serial"), ceed, CEED_ERROR_BACKEND, "Opt backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference Ceed that implementation will be dispatched through unless overridden - - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/ref/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); // Set fallback Ceed resource for advanced operator functionality - const char fallbackresource[] = "/cpu/self/ref/serial"; - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Opt)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Opt)); // Set block size - Ceed_Opt *data; CeedCallBackend(CeedCalloc(1, &data)); - data->blk_size = 1; + data->block_size = 1; CeedCallBackend(CeedSetData(ceed, data)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/opt/ceed-opt-tensor.c b/backends/opt/ceed-opt-tensor.c index 17b9a5270f..8f5e5f93ae 100644 --- a/backends/opt/ceed-opt-tensor.c +++ b/backends/opt/ceed-opt-tensor.c @@ -17,6 +17,7 @@ static inline int CeedTensorContractApply_Core_Opt(CeedTensorContract contract, const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) { CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == CEED_TRANSPOSE) { t_stride_0 = 1; t_stride_1 = J; @@ -30,7 +31,6 @@ static inline int CeedTensorContractApply_Core_Opt(CeedTensorContract contract, } } } - return CEED_ERROR_SUCCESS; } @@ -45,7 +45,6 @@ static int CeedTensorContractApply_Opt(CeedTensorContract contract, CeedInt A, C if (C == 1) return CeedTensorContractApply_Core_Opt(contract, A, B, 1, J, t, t_mode, add, u, v); else return CeedTensorContractApply_Core_Opt(contract, A, B, C, J, t, t_mode, add, u, v); - return CEED_ERROR_SUCCESS; } @@ -54,10 +53,9 @@ static int CeedTensorContractApply_Opt(CeedTensorContract contract, CeedInt A, C //------------------------------------------------------------------------------ int CeedTensorContractCreate_Opt(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Opt)); - return 
CEED_ERROR_SUCCESS; } diff --git a/backends/opt/ceed-opt.h b/backends/opt/ceed-opt.h index 9f2b15e63b..49ffe4f876 100644 --- a/backends/opt/ceed-opt.h +++ b/backends/opt/ceed-opt.h @@ -14,7 +14,7 @@ #include typedef struct { - CeedInt blk_size; + CeedInt block_size; } Ceed_Opt; typedef struct { @@ -22,8 +22,8 @@ typedef struct { } CeedBasis_Opt; typedef struct { - bool is_identity_qf, is_identity_restr_op; - CeedElemRestriction *blk_restr; /* Blocked versions of restrictions */ + bool is_identity_qf, is_identity_rstr_op; + CeedElemRestriction *block_rstr; /* Blocked versions of restrictions */ CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ uint64_t *input_states; /* State counter of inputs */ CeedVector *e_vecs_in; /* Element block input E-vectors */ @@ -34,7 +34,7 @@ typedef struct { CeedInt num_active_in, num_active_out; CeedVector *qf_active_in; CeedVector qf_l_vec; - CeedElemRestriction qf_blk_rstr; + CeedElemRestriction qf_block_rstr; } CeedOperator_Opt; CEED_INTERN int CeedTensorContractCreate_Opt(CeedBasis basis, CeedTensorContract contract); diff --git a/backends/ref/ceed-ref-basis.c b/backends/ref/ceed-ref-basis.c index 9b2d31485b..b6e4062764 100644 --- a/backends/ref/ceed-ref-basis.c +++ b/backends/ref/ceed-ref-basis.c @@ -17,19 +17,23 @@ // Basis Apply //------------------------------------------------------------------------------ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector U, CeedVector V) { - Ceed ceed; + Ceed ceed; + bool is_tensor_basis; + CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; + const CeedInt add = (t_mode == CEED_TRANSPOSE); + const CeedScalar *u; + CeedScalar *v; + CeedTensorContract contract; + CeedBasis_Ref *impl; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - CeedInt dim, num_comp, q_comp, num_nodes, num_qpts; + CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 
CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); - CeedTensorContract contract; CeedCallBackend(CeedBasisGetTensorContract(basis, &contract)); - const CeedInt add = (t_mode == CEED_TRANSPOSE); - const CeedScalar *u; - CeedScalar *v; if (U != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(U, CEED_MEM_HOST, &u)); else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(V, CEED_MEM_HOST, &v)); @@ -37,24 +41,25 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo // Clear v if operating in transpose if (t_mode == CEED_TRANSPOSE) { const CeedInt v_size = num_elem * num_comp * num_nodes; + for (CeedInt i = 0; i < v_size; i++) v[i] = (CeedScalar)0.0; } - bool is_tensor_basis; + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor_basis)); if (is_tensor_basis) { // Tensor basis CeedInt P_1d, Q_1d; + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); switch (eval_mode) { // Interpolate to/from quadrature points case CEED_EVAL_INTERP: { - CeedBasis_Ref *impl; - CeedCallBackend(CeedBasisGetData(basis, &impl)); if (impl->has_collo_interp) { memcpy(v, u, num_elem * num_comp * num_nodes * sizeof(u[0])); } else { CeedInt P = P_1d, Q = Q_1d; + if (t_mode == CEED_TRANSPOSE) { P = Q_1d; Q = P_1d; @@ -62,6 +67,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo CeedInt pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; CeedScalar tmp[2][num_elem * num_comp * Q * CeedIntPow(P > Q ? 
P : Q, dim - 1)]; const CeedScalar *interp_1d; + CeedCallBackend(CeedBasisGetInterp1D(basis, &interp_1d)); for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, interp_1d, t_mode, add && (d == dim - 1), d == 0 ? u : tmp[d % 2], @@ -78,18 +84,19 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo // v has shape [dim, num_comp, Q^dim, num_elem], row-major layout // In CEED_TRANSPOSE mode, the sizes of u and v are switched. CeedInt P = P_1d, Q = Q_1d; + if (t_mode == CEED_TRANSPOSE) { P = Q_1d; Q = Q_1d; } - CeedBasis_Ref *impl; - CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedInt pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; const CeedScalar *interp_1d; + CeedCallBackend(CeedBasisGetInterp1D(basis, &interp_1d)); if (impl->collo_grad_1d) { CeedScalar tmp[2][num_elem * num_comp * Q * CeedIntPow(P > Q ? P : Q, dim - 1)]; CeedScalar interp[num_elem * num_comp * Q * CeedIntPow(P > Q ? P : Q, dim - 1)]; + // Interpolate to quadrature points (NoTranspose) // or Grad to quadrature points (Transpose) for (CeedInt d = 0; d < dim; d++) { @@ -118,10 +125,12 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo } } else if (impl->has_collo_interp) { // Qpts collocated with nodes const CeedScalar *grad_1d; + CeedCallBackend(CeedBasisGetGrad1D(basis, &grad_1d)); // Dim contractions, identity in other directions CeedInt pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; + for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply(contract, pre, P, post, Q, grad_1d, t_mode, add && (d > 0), t_mode == CEED_NOTRANSPOSE ? 
u : u + d * num_comp * num_qpts * num_elem, @@ -131,6 +140,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo } } else { // Underintegration, P > Q const CeedScalar *grad_1d; + CeedCallBackend(CeedBasisGetGrad1D(basis, &grad_1d)); if (t_mode == CEED_TRANSPOSE) { @@ -142,6 +152,7 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo // Dim**2 contractions, apply grad when pass == dim for (CeedInt p = 0; p < dim; p++) { CeedInt pre = num_comp * CeedIntPow(P, dim - 1), post = num_elem; + for (CeedInt d = 0; d < dim; d++) { CeedCallBackend(CeedTensorContractApply( contract, pre, P, post, Q, (p == d) ? grad_1d : interp_1d, t_mode, add && (d == dim - 1), @@ -155,16 +166,19 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo } break; // Retrieve interpolation weights case CEED_EVAL_WEIGHT: { - CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); CeedInt Q = Q_1d; const CeedScalar *q_weight_1d; + + CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight_1d)); for (CeedInt d = 0; d < dim; d++) { CeedInt pre = CeedIntPow(Q, dim - d - 1), post = CeedIntPow(Q, d); + for (CeedInt i = 0; i < pre; i++) { for (CeedInt j = 0; j < Q; j++) { for (CeedInt k = 0; k < post; k++) { - CeedScalar w = q_weight_1d[j] * (d == 0 ? 1 : v[((i * Q + j) * post + k) * num_elem]); + const CeedScalar w = q_weight_1d[j] * (d == 0 ? 
1 : v[((i * Q + j) * post + k) * num_elem]); + for (CeedInt e = 0; e < num_elem; e++) v[((i * Q + j) * post + k) * num_elem + e] = w; } } @@ -186,35 +200,41 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo } else { // Non-tensor basis CeedInt P = num_nodes, Q = num_qpts; + switch (eval_mode) { // Interpolate to/from quadrature points case CEED_EVAL_INTERP: { const CeedScalar *interp; + CeedCallBackend(CeedBasisGetInterp(basis, &interp)); CeedCallBackend(CeedTensorContractStridedApply(contract, num_comp, P, num_elem, q_comp, Q, interp, t_mode, add, u, v)); } break; // Evaluate the gradient to/from quadrature points case CEED_EVAL_GRAD: { const CeedScalar *grad; + CeedCallBackend(CeedBasisGetGrad(basis, &grad)); CeedCallBackend(CeedTensorContractStridedApply(contract, num_comp, P, num_elem, q_comp, Q, grad, t_mode, add, u, v)); } break; // Evaluate the divergence to/from the quadrature points case CEED_EVAL_DIV: { const CeedScalar *div; + CeedCallBackend(CeedBasisGetDiv(basis, &div)); CeedCallBackend(CeedTensorContractStridedApply(contract, num_comp, P, num_elem, q_comp, Q, div, t_mode, add, u, v)); } break; // Evaluate the curl to/from the quadrature points case CEED_EVAL_CURL: { const CeedScalar *curl; + CeedCallBackend(CeedBasisGetCurl(basis, &curl)); CeedCallBackend(CeedTensorContractStridedApply(contract, num_comp, P, num_elem, q_comp, Q, curl, t_mode, add, u, v)); } break; // Retrieve interpolation weights case CEED_EVAL_WEIGHT: { - CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); const CeedScalar *q_weight; + + CeedCheck(t_mode == CEED_NOTRANSPOSE, ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE"); CeedCallBackend(CeedBasisGetQWeights(basis, &q_weight)); for (CeedInt i = 0; i < num_qpts; i++) { for (CeedInt e = 0; e < num_elem; e++) v[i * num_elem + e] = q_weight[i]; @@ -231,7 +251,6 @@ static int CeedBasisApply_Ref(CeedBasis 
basis, CeedInt num_elem, CeedTransposeMo CeedCallBackend(CeedVectorRestoreArrayRead(U, &u)); } CeedCallBackend(CeedVectorRestoreArray(V, &v)); - return CEED_ERROR_SUCCESS; } @@ -240,10 +259,10 @@ static int CeedBasisApply_Ref(CeedBasis basis, CeedInt num_elem, CeedTransposeMo //------------------------------------------------------------------------------ static int CeedBasisDestroyTensor_Ref(CeedBasis basis) { CeedBasis_Ref *impl; + CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedCallBackend(CeedFree(&impl->collo_grad_1d)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -252,20 +271,25 @@ static int CeedBasisDestroyTensor_Ref(CeedBasis basis) { //------------------------------------------------------------------------------ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { - Ceed ceed; + Ceed ceed, ceed_parent; + CeedBasis_Ref *impl; + CeedTensorContract contract; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - CeedBasis_Ref *impl; + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); + CeedCallBackend(CeedCalloc(1, &impl)); // Check for collocated interp if (Q_1d == P_1d) { - bool collocated = 1; + bool has_collocated = true; + for (CeedInt i = 0; i < P_1d; i++) { - collocated = collocated && (fabs(interp_1d[i + P_1d * i] - 1.0) < 1e-14); + has_collocated = has_collocated && (fabs(interp_1d[i + P_1d * i] - 1.0) < 1e-14); for (CeedInt j = 0; j < P_1d; j++) { - if (j != i) collocated = collocated && (fabs(interp_1d[j + P_1d * i]) < 1e-14); + if (j != i) has_collocated = has_collocated && (fabs(interp_1d[j + P_1d * i]) < 1e-14); } } - impl->has_collo_interp = collocated; + impl->has_collo_interp = has_collocated; } // Calculate collocated grad if (Q_1d >= P_1d && !impl->has_collo_interp) { @@ -274,15 +298,11 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt 
Q_1d, const C } CeedCallBackend(CeedBasisSetData(basis, impl)); - Ceed parent; - CeedCallBackend(CeedGetParent(ceed, &parent)); - CeedTensorContract contract; - CeedCallBackend(CeedTensorContractCreate(parent, basis, &contract)); + CeedCallBackend(CeedTensorContractCreate(ceed_parent, basis, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyTensor_Ref)); - return CEED_ERROR_SUCCESS; } @@ -291,17 +311,16 @@ int CeedBasisCreateTensorH1_Ref(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C //------------------------------------------------------------------------------ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed; + Ceed ceed, ceed_parent; + CeedTensorContract contract; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - Ceed parent; - CeedCallBackend(CeedGetParent(ceed, &parent)); - CeedTensorContract contract; - CeedCallBackend(CeedTensorContractCreate(parent, basis, &contract)); + CeedCallBackend(CeedTensorContractCreate(ceed_parent, basis, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); - return CEED_ERROR_SUCCESS; } @@ -310,17 +329,16 @@ int CeedBasisCreateH1_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, //------------------------------------------------------------------------------ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis 
basis) { - Ceed ceed; + Ceed ceed, ceed_parent; + CeedTensorContract contract; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - Ceed parent; - CeedCallBackend(CeedGetParent(ceed, &parent)); - CeedTensorContract contract; - CeedCallBackend(CeedTensorContractCreate(parent, basis, &contract)); + CeedCallBackend(CeedTensorContractCreate(ceed_parent, basis, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); - return CEED_ERROR_SUCCESS; } @@ -329,17 +347,16 @@ int CeedBasisCreateHdiv_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_node //------------------------------------------------------------------------------ int CeedBasisCreateHcurl_Ref(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed; + Ceed ceed, ceed_parent; + CeedTensorContract contract; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - Ceed parent; - CeedCallBackend(CeedGetParent(ceed, &parent)); - CeedTensorContract contract; - CeedCallBackend(CeedTensorContractCreate(parent, basis, &contract)); + CeedCallBackend(CeedTensorContractCreate(ceed_parent, basis, &contract)); CeedCallBackend(CeedBasisSetTensorContract(basis, contract)); CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Ref)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-operator.c b/backends/ref/ceed-ref-operator.c index e95b0b9c53..240c6de744 100644 --- a/backends/ref/ceed-ref-operator.c +++ b/backends/ref/ceed-ref-operator.c @@ -18,14 +18,13 @@ //------------------------------------------------------------------------------ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool 
is_input, CeedVector *e_vecs_full, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, CeedInt num_fields, CeedInt Q) { - CeedInt num_comp, size, P; - CeedSize e_size, q_size; - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedBasis basis; - CeedElemRestriction elem_restr; - CeedOperatorField *op_fields; + Ceed ceed; + CeedSize e_size, q_size; + CeedInt num_comp, size, P; CeedQFunctionField *qf_fields; + CeedOperatorField *op_fields; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); if (is_input) { CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); @@ -36,12 +35,14 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i // Loop over fields for (CeedInt i = 0; i < num_fields; i++) { - CeedEvalMode eval_mode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_WEIGHT) { - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_restr)); - CeedCallBackend(CeedElemRestrictionCreateVector(elem_restr, NULL, &e_vecs_full[i + start_e])); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs_full[i + start_e])); } switch (eval_mode) { @@ -78,21 +79,23 @@ static int CeedOperatorSetupFields_Ref(CeedQFunction qf, CeedOperator op, bool i // Setup Operator //------------------------------------------------------------------------------/* static int CeedOperatorSetup_Ref(CeedOperator op) { - bool is_setup_done; + bool is_setup_done; + Ceed ceed; + CeedInt Q, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, 
*op_output_fields; + CeedOperator_Ref *impl; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Ref *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, num_input_fields, num_output_fields; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedQFunctionIsIdentity(qf, &impl->is_identity_qf)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate @@ -118,19 +121,19 @@ static int CeedOperatorSetup_Ref(CeedOperator op) { if (impl->is_identity_qf) { CeedEvalMode in_mode, out_mode; CeedQFunctionField *in_fields, *out_fields; + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &in_fields, NULL, &out_fields)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(in_fields[0], &in_mode)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(out_fields[0], &out_mode)); if (in_mode == CEED_EVAL_NONE && out_mode == CEED_EVAL_NONE) { - impl->is_identity_restr_op = true; + impl->is_identity_rstr_op = true; } else { CeedCallBackend(CeedVectorReferenceCopy(impl->q_vecs_in[0], &impl->q_vecs_out[0])); } } CeedCallBackend(CeedOperatorSetSetupDone(op)); - return CEED_ERROR_SUCCESS; } @@ -140,12 +143,12 @@ static int CeedOperatorSetup_Ref(CeedOperator op) { static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, CeedVector in_vec, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl, CeedRequest *request) { - CeedEvalMode eval_mode; - CeedVector 
vec; - CeedElemRestriction elem_restr; - uint64_t state; - for (CeedInt i = 0; i < num_input_fields; i++) { + uint64_t state; + CeedEvalMode eval_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { @@ -161,8 +164,8 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun CeedCallBackend(CeedVectorGetState(vec, &state)); // Skip restriction if input is unchanged if (state != impl->input_states[i] || vec == in_vec) { - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr)); - CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs_full[i], request)); impl->input_states[i] = state; } // Get evec @@ -178,21 +181,22 @@ static inline int CeedOperatorSetupInputs_Ref(CeedInt num_input_fields, CeedQFun static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) { - CeedInt elem_size, size, num_comp; - CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt elem_size, size, num_comp; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + // Skip active input if (skip_active) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_restr)); - 
CeedCallBackend(CeedElemRestrictionGetElementSize(elem_restr, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action @@ -222,15 +226,15 @@ static inline int CeedOperatorInputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunction static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctionField *qf_output_fields, CeedOperatorField *op_output_fields, CeedInt num_input_fields, CeedInt num_output_fields, CeedOperator op, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) { - CeedInt elem_size, num_comp; - CeedElemRestriction elem_restr; - CeedEvalMode eval_mode; - CeedBasis basis; - for (CeedInt i = 0; i < num_output_fields; i++) { + CeedInt elem_size, num_comp; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + // Get elem_size, eval_mode - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_restr, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Basis action switch (eval_mode) { @@ -249,6 +253,7 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio // LCOV_EXCL_START case CEED_EVAL_WEIGHT: { Ceed ceed; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); // LCOV_EXCL_STOP @@ -263,12 +268,13 @@ static inline int CeedOperatorOutputBasis_Ref(CeedInt e, CeedInt Q, CeedQFunctio 
//------------------------------------------------------------------------------ static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, const bool skip_active, CeedScalar *e_data_full[2 * CEED_FIELD_MAX], CeedOperator_Ref *impl) { - CeedEvalMode eval_mode; - for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode; + // Skip active inputs if (skip_active) { CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } @@ -286,31 +292,32 @@ static inline int CeedOperatorRestoreInputs_Ref(CeedInt num_input_fields, CeedQF // Operator Apply //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { - CeedOperator_Ref *impl; + CeedInt Q, num_elem, num_input_fields, num_output_fields, size; + CeedEvalMode eval_mode; + CeedScalar *e_data_full[2 * CEED_FIELD_MAX] = {NULL}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Ref *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, num_elem, num_input_fields, num_output_fields, size; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_restr; - CeedScalar *e_data_full[2 
* CEED_FIELD_MAX] = {NULL}; // Setup CeedCallBackend(CeedOperatorSetup_Ref(op)); // Restriction only operator - if (impl->is_identity_restr_op) { - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[0], &elem_restr)); - CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[0], &elem_restr)); - CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); + if (impl->is_identity_rstr_op) { + CeedElemRestriction elem_rstr; + + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[0], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_NOTRANSPOSE, in_vec, impl->e_vecs_full[0], request)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[0], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, impl->e_vecs_full[0], out_vec, request)); return CEED_ERROR_SUCCESS; } @@ -348,6 +355,9 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect // Output restriction for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + CeedElemRestriction elem_rstr; + // Restore Evec CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs_full[i + impl->num_inputs], &e_data_full[i + num_input_fields])); // Get output vector @@ -355,13 +365,12 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect // Active if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; // Restrict - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_restr)); - CeedCallBackend(CeedElemRestrictionApply(elem_restr, CEED_TRANSPOSE, impl->e_vecs_full[i + impl->num_inputs], vec, request)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionApply(elem_rstr, CEED_TRANSPOSE, 
impl->e_vecs_full[i + impl->num_inputs], vec, request)); } // Restore input arrays CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, qf_input_fields, op_input_fields, false, e_data_full, impl)); - return CEED_ERROR_SUCCESS; } @@ -370,17 +379,18 @@ static int CeedOperatorApplyAdd_Ref(CeedOperator op, CeedVector in_vec, CeedVect //------------------------------------------------------------------------------ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - Ceed ceed, ceed_parent; - CeedOperator_Ref *impl; - CeedQFunction qf; + Ceed ceed, ceed_ceed_parent; + CeedSize q_size; + CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; + CeedScalar *assembled_array, *e_data_full[2 * CEED_FIELD_MAX] = {NULL}; + CeedVector *active_in; CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; CeedOperatorField *op_input_fields, *op_output_fields; - CeedVector vec, *active_in; - CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; - CeedSize q_size; - CeedScalar *a, *tmp, *e_data_full[2 * CEED_FIELD_MAX] = {NULL}; + CeedOperator_Ref *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); + CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_ceed_parent)); CeedCallBackend(CeedOperatorGetData(op, &impl)); active_in = impl->qf_active_in; num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; @@ -402,21 +412,24 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b // Count number of active input fields if (!num_active_in) { for (CeedInt i = 0; i < num_input_fields; i++) { + CeedScalar *q_vec_array; + CeedVector vec; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // 
Check if active input if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &tmp)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_HOST, &q_vec_array)); CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); for (CeedInt field = 0; field < size; field++) { q_size = (CeedSize)Q; CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); - CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &tmp[field * Q])); + CeedCallBackend(CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_HOST, CEED_USE_POINTER, &q_vec_array[field * Q])); } num_active_in += size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &tmp)); + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } } impl->num_active_in = num_active_in; @@ -426,6 +439,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b // Count number of active output fields if (!num_active_out) { for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output @@ -442,17 +457,18 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b // Build objects if needed if (build_objects) { + const CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; /* *NOPAD* */ + // Create output restriction - CeedInt strides[3] = {1, Q, num_active_in * num_active_out * Q}; /* *NOPAD* */ - CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_ceed_parent, 
num_elem, Q, num_active_in * num_active_out, num_active_in * num_active_out * num_elem * Q, strides, rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; - CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); + CeedCallBackend(CeedVectorCreate(ceed_ceed_parent, l_size, assembled)); } // Clear output vector CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); - CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_HOST, &a)); + CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_HOST, &assembled_array)); // Loop through elements for (CeedInt e = 0; e < num_elem; e++) { @@ -468,13 +484,15 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b } // Set Outputs for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, a)); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_HOST, CEED_USE_POINTER, assembled_array)); CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); - a += size * Q; // Advance the pointer by the size of the output + assembled_array += size * Q; // Advance the pointer by the size of the output } } // Apply QFunction @@ -484,6 +502,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b // Un-set output Qvecs to prevent accidental overwrite of Assembled for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output @@ -496,8 +516,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Ref(CeedOperator op, b CeedCallBackend(CeedOperatorRestoreInputs_Ref(num_input_fields, 
qf_input_fields, op_input_fields, true, e_data_full, impl)); // Restore output - CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); - + CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); return CEED_ERROR_SUCCESS; } @@ -520,8 +539,8 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Ref(CeedOperator op, CeedVe //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Ref(CeedOperator op) { CeedOperator_Ref *impl; - CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); for (CeedInt i = 0; i < impl->num_inputs + impl->num_outputs; i++) { CeedCallBackend(CeedVectorDestroy(&impl->e_vecs_full[i])); } @@ -556,13 +575,12 @@ static int CeedOperatorDestroy_Ref(CeedOperator op) { // Operator Create //------------------------------------------------------------------------------ int CeedOperatorCreate_Ref(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed ceed; CeedOperator_Ref *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunction", CeedOperatorLinearAssembleQFunction_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "LinearAssembleQFunctionUpdate", CeedOperatorLinearAssembleQFunctionUpdate_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Operator", op, "ApplyAdd", CeedOperatorApplyAdd_Ref)); diff --git a/backends/ref/ceed-ref-qfunction.c b/backends/ref/ceed-ref-qfunction.c index f976181079..6b408f8483 100644 --- a/backends/ref/ceed-ref-qfunction.c +++ b/backends/ref/ceed-ref-qfunction.c @@ -15,16 +15,14 @@ // QFunction Apply //------------------------------------------------------------------------------ static int CeedQFunctionApply_Ref(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { + void 
*ctx_data = NULL; + CeedInt num_in, num_out; + CeedQFunctionUser f = NULL; CeedQFunction_Ref *impl; - CeedCallBackend(CeedQFunctionGetData(qf, &impl)); - void *ctx_data = NULL; + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); CeedCallBackend(CeedQFunctionGetContextData(qf, CEED_MEM_HOST, &ctx_data)); - - CeedQFunctionUser f = NULL; CeedCallBackend(CeedQFunctionGetUserFunction(qf, &f)); - - CeedInt num_in, num_out; CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_in, &num_out)); for (CeedInt i = 0; i < num_in; i++) { @@ -43,7 +41,6 @@ static int CeedQFunctionApply_Ref(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce CeedCallBackend(CeedVectorRestoreArray(V[i], &impl->outputs[i])); } CeedCallBackend(CeedQFunctionRestoreContextData(qf, &ctx_data)); - return CEED_ERROR_SUCCESS; } @@ -52,12 +49,11 @@ static int CeedQFunctionApply_Ref(CeedQFunction qf, CeedInt Q, CeedVector *U, Ce //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Ref(CeedQFunction qf) { CeedQFunction_Ref *impl; - CeedCallBackend(CeedQFunctionGetData(qf, &impl)); + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); CeedCallBackend(CeedFree(&impl->inputs)); CeedCallBackend(CeedFree(&impl->outputs)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -65,18 +61,16 @@ static int CeedQFunctionDestroy_Ref(CeedQFunction qf) { // QFunction Create //------------------------------------------------------------------------------ int CeedQFunctionCreate_Ref(CeedQFunction qf) { - Ceed ceed; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - + Ceed ceed; CeedQFunction_Ref *impl; + + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->inputs)); CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->outputs)); CeedCallBackend(CeedQFunctionSetData(qf, impl)); - CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Ref)); 
CeedCallBackend(CeedSetBackendFunction(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Ref)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-qfunctioncontext.c b/backends/ref/ceed-ref-qfunctioncontext.c index 83048981bf..d378c2ce7f 100644 --- a/backends/ref/ceed-ref-qfunctioncontext.c +++ b/backends/ref/ceed-ref-qfunctioncontext.c @@ -17,10 +17,9 @@ //------------------------------------------------------------------------------ static int CeedQFunctionContextHasValidData_Ref(CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Ref *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); *has_valid_data = impl->data; - return CEED_ERROR_SUCCESS; } @@ -28,11 +27,11 @@ static int CeedQFunctionContextHasValidData_Ref(CeedQFunctionContext ctx, bool * // QFunctionContext has borrowed data //------------------------------------------------------------------------------ static int CeedQFunctionContextHasBorrowedDataOfType_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { + Ceed ceed; CeedQFunctionContext_Ref *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_data_of_type = impl->data_borrowed; @@ -43,7 +42,6 @@ static int CeedQFunctionContextHasBorrowedDataOfType_Ref(CeedQFunctionContext ct // LCOV_EXCL_STOP break; } - return CEED_ERROR_SUCCESS; } @@ -51,11 +49,12 @@ static int CeedQFunctionContextHasBorrowedDataOfType_Ref(CeedQFunctionContext ct // QFunctionContext Set Data //------------------------------------------------------------------------------ static int CeedQFunctionContextSetData_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, CeedCopyMode copy_mode, void *data) { + Ceed ceed; + size_t ctx_size; CeedQFunctionContext_Ref 
*impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - size_t ctx_size; CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only set HOST memory for this backend"); @@ -84,9 +83,10 @@ static int CeedQFunctionContextSetData_Ref(CeedQFunctionContext ctx, CeedMemType // QFunctionContext Take Data //------------------------------------------------------------------------------ static int CeedQFunctionContextTakeData_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + Ceed ceed; CeedQFunctionContext_Ref *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); @@ -94,7 +94,6 @@ static int CeedQFunctionContextTakeData_Ref(CeedQFunctionContext ctx, CeedMemTyp *(void **)data = impl->data; impl->data_borrowed = NULL; impl->data = NULL; - return CEED_ERROR_SUCCESS; } @@ -102,15 +101,15 @@ static int CeedQFunctionContextTakeData_Ref(CeedQFunctionContext ctx, CeedMemTyp // QFunctionContext Get Data //------------------------------------------------------------------------------ static int CeedQFunctionContextGetData_Ref(CeedQFunctionContext ctx, CeedMemType mem_type, void *data) { + Ceed ceed; CeedQFunctionContext_Ref *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, (void *)&impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); *(void **)data = impl->data; - return CEED_ERROR_SUCCESS; } @@ -124,8 +123,8 @@ static int CeedQFunctionContextRestoreData_Ref(CeedQFunctionContext ctx) { retur 
//------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Ref(CeedQFunctionContext ctx) { CeedQFunctionContext_Ref *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedFree(&impl->data_owned)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -135,10 +134,10 @@ static int CeedQFunctionContextDestroy_Ref(CeedQFunctionContext ctx) { // QFunctionContext Create //------------------------------------------------------------------------------ int CeedQFunctionContextCreate_Ref(CeedQFunctionContext ctx) { - CeedQFunctionContext_Ref *impl; Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedQFunctionContext_Ref *impl; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Ref)); @@ -148,10 +147,8 @@ int CeedQFunctionContextCreate_Ref(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreData", CeedQFunctionContextRestoreData_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "RestoreDataRead", CeedQFunctionContextRestoreData_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Ref)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c index f69b903cde..63082bc9a8 100644 --- 
a/backends/ref/ceed-ref-restriction.c +++ b/backends/ref/ceed-ref-restriction.c @@ -16,20 +16,21 @@ //------------------------------------------------------------------------------ // Core ElemRestriction Apply Code //------------------------------------------------------------------------------ -static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // No offsets provided, identity restriction bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (has_backend_strides) { // CPU backend strides are {1, elem_size, elem_size*num_comp} // This if branch is left separate to allow better inlining - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { - vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * num_comp]; } } @@ -38,12 +39,13 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe } else { // User provided strides CeedInt strides[3]; + CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; 
e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { - vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * strides[2]]; } } @@ -53,32 +55,34 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Default restriction with offsets CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * blk_size; i++) { - vv[elem_size * (k * blk_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride]; + CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * block_size; i++) { + vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride]; } } } return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int 
CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Restriction with orientations CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { - CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * blk_size; i++) { - vv[elem_size * (k * blk_size + e * num_comp) + i - v_offset] = + CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * block_size; i++) { + vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0); } } @@ -86,40 +90,41 @@ static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemR return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Restriction with tridiagonal transformation CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { CeedInt n = 0; - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { - vv[e * 
elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = - uu[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] * - impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size] + - uu[impl->offsets[j + (n + 1) * blk_size + e * elem_size] + k * comp_stride] * - impl->curl_orients[j + (3 * n + 2) * blk_size + e * 3 * elem_size]; + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = + uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + + uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * + impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; } for (n = 1; n < elem_size - 1; n++) { - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { - vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = - uu[impl->offsets[j + (n - 1) * blk_size + e * elem_size] + k * comp_stride] * - impl->curl_orients[j + (3 * n + 0) * blk_size + e * 3 * elem_size] + - uu[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] * - impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size] + - uu[impl->offsets[j + (n + 1) * blk_size + e * elem_size] + k * comp_stride] * - impl->curl_orients[j + (3 * n + 2) * blk_size + e * 3 * elem_size]; + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = + uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * + impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] + + uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + + uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * + impl->curl_orients[j + (3 * n 
+ 2) * block_size + e * 3 * elem_size]; } } - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { - vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = - uu[impl->offsets[j + (n - 1) * blk_size + e * elem_size] + k * comp_stride] * - impl->curl_orients[j + (3 * n + 0) * blk_size + e * 3 * elem_size] + - uu[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] * - impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]; + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = + uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * + impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] + + uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; } } } @@ -127,59 +132,62 @@ static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedE } static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, - const CeedInt blk_size, const CeedInt comp_stride, CeedInt start, + const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Restriction with (unsigned) tridiagonal transformation CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { CeedInt n = 0; - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { - vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = - uu[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] * - 
abs(impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]) + - uu[impl->offsets[j + (n + 1) * blk_size + e * elem_size] + k * comp_stride] * - abs(impl->curl_orients[j + (3 * n + 2) * blk_size + e * 3 * elem_size]); + + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = + uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + + uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * + abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); } for (n = 1; n < elem_size - 1; n++) { - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; j++) { - vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = - uu[impl->offsets[j + (n - 1) * blk_size + e * elem_size] + k * comp_stride] * - abs(impl->curl_orients[j + (3 * n + 0) * blk_size + e * 3 * elem_size]) + - uu[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] * - abs(impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]) + - uu[impl->offsets[j + (n + 1) * blk_size + e * elem_size] + k * comp_stride] * - abs(impl->curl_orients[j + (3 * n + 2) * blk_size + e * 3 * elem_size]); + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = + uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * + abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) + + uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + + uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * + abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); } } - CeedPragmaSIMD for (CeedInt j = 0; j < blk_size; 
j++) { - vv[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] = - uu[impl->offsets[j + (n - 1) * blk_size + e * elem_size] + k * comp_stride] * - abs(impl->curl_orients[j + (3 * n + 0) * blk_size + e * 3 * elem_size]) + - uu[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] * - abs(impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]); + CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) { + vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = + uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * + abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) + + uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); } } } return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // No offsets provided, identity restriction bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (has_backend_strides) { // CPU backend strides are {1, elem_size, elem_size*num_comp} // This if brach is left separate to allow better inlining - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { - CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(blk_size, num_elem - e); j++) { - vv[n + k * elem_size + (e + j) * elem_size * 
num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset]; + CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { + vv[n + k * elem_size + (e + j) * elem_size * num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; } } } @@ -187,13 +195,14 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest } else { // User provided strides CeedInt strides[3]; + CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) { CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) { - CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(blk_size, num_elem - e); j++) { + CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] += - uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset]; + uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; } } } @@ -202,18 +211,19 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyStandardTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int CeedElemRestrictionApplyStandardTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Default restriction with offsets CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = 
start * block_size; e < stop * block_size; e += block_size) { for (CeedInt k = 0; k < num_comp; k++) { - for (CeedInt i = 0; i < elem_size * blk_size; i += blk_size) { + for (CeedInt i = 0; i < elem_size * block_size; i += block_size) { // Iteration bound set to discard padding elements - for (CeedInt j = i; j < i + CeedIntMin(blk_size, num_elem - e); j++) { - vv[impl->offsets[j + e * elem_size] + k * comp_stride] += uu[elem_size * (k * blk_size + e * num_comp) + j - v_offset]; + for (CeedInt j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) { + vv[impl->offsets[j + e * elem_size] + k * comp_stride] += uu[elem_size * (k * block_size + e * num_comp) + j - v_offset]; } } } @@ -221,19 +231,20 @@ static inline int CeedElemRestrictionApplyStandardTranspose_Ref_Core(CeedElemRes return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Restriction with orientations CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { for (CeedInt k = 0; k < num_comp; k++) { - for (CeedInt i = 0; i < elem_size * blk_size; i += blk_size) { + for (CeedInt i = 0; i < elem_size * block_size; i += block_size) { // Iteration bound set to discard padding elements - for (CeedInt j = i; j < i + CeedIntMin(blk_size, num_elem - e); j++) { + for (CeedInt j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) { vv[impl->offsets[j + e * elem_size] + k * comp_stride] += - uu[elem_size * (k * blk_size + e 
* num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0); + uu[elem_size * (k * block_size + e * num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0); } } } @@ -241,40 +252,41 @@ static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRes return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, +static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Restriction with tridiagonal transformation CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { for (CeedInt k = 0; k < num_comp; k++) { // Iteration bound set to discard padding elements - CeedInt blk_end = CeedIntMin(blk_size, num_elem - e), n = 0; - for (CeedInt j = 0; j < blk_end; j++) { - vv[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] += - uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] * - impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size] + - uu[e * elem_size * num_comp + (k * elem_size + n + 1) * blk_size + j - v_offset] * - impl->curl_orients[j + (3 * n + 3) * blk_size + e * 3 * elem_size]; + CeedInt block_end = CeedIntMin(block_size, num_elem - e), n = 0; + for (CeedInt j = 0; j < block_end; j++) { + vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += + uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + + uu[e * 
elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * + impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; } for (n = 1; n < elem_size - 1; n++) { - for (CeedInt j = 0; j < blk_end; j++) { - vv[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] += - uu[e * elem_size * num_comp + (k * elem_size + n - 1) * blk_size + j - v_offset] * - impl->curl_orients[j + (3 * n - 1) * blk_size + e * 3 * elem_size] + - uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] * - impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size] + - uu[e * elem_size * num_comp + (k * elem_size + n + 1) * blk_size + j - v_offset] * - impl->curl_orients[j + (3 * n + 3) * blk_size + e * 3 * elem_size]; + for (CeedInt j = 0; j < block_end; j++) { + vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += + uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] * + impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] + + uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + + uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * + impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; } } - for (CeedInt j = 0; j < blk_end; j++) { - vv[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] += - uu[e * elem_size * num_comp + (k * elem_size + n - 1) * blk_size + j - v_offset] * - impl->curl_orients[j + (3 * n - 1) * blk_size + e * 3 * elem_size] + - uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] * - impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]; + for (CeedInt j = 0; j < block_end; j++) { + vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += + uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] 
* + impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] + + uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * + impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; } } } @@ -282,59 +294,63 @@ static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedEle } static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, - const CeedInt blk_size, const CeedInt comp_stride, CeedInt start, + const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { // Restriction with (unsigned) tridiagonal transformation CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - for (CeedInt e = start * blk_size; e < stop * blk_size; e += blk_size) { + for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { for (CeedInt k = 0; k < num_comp; k++) { // Iteration bound set to discard padding elements - CeedInt blk_end = CeedIntMin(blk_size, num_elem - e), n = 0; - for (CeedInt j = 0; j < blk_end; j++) { - vv[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] += - uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] * - abs(impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]) + - uu[e * elem_size * num_comp + (k * elem_size + n + 1) * blk_size + j - v_offset] * - abs(impl->curl_orients[j + (3 * n + 3) * blk_size + e * 3 * elem_size]); + CeedInt n = 0; + const CeedInt block_end = CeedIntMin(block_size, num_elem - e); + + for (CeedInt j = 0; j < block_end; j++) { + vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += + uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + + uu[e * elem_size * 
num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * + abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); } for (n = 1; n < elem_size - 1; n++) { - for (CeedInt j = 0; j < blk_end; j++) { - vv[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] += - uu[e * elem_size * num_comp + (k * elem_size + n - 1) * blk_size + j - v_offset] * - abs(impl->curl_orients[j + (3 * n - 1) * blk_size + e * 3 * elem_size]) + - uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] * - abs(impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]) + - uu[e * elem_size * num_comp + (k * elem_size + n + 1) * blk_size + j - v_offset] * - abs(impl->curl_orients[j + (3 * n + 3) * blk_size + e * 3 * elem_size]); + for (CeedInt j = 0; j < block_end; j++) { + vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += + uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] * + abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) + + uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + + uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * + abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); } } - for (CeedInt j = 0; j < blk_end; j++) { - vv[impl->offsets[j + n * blk_size + e * elem_size] + k * comp_stride] += - uu[e * elem_size * num_comp + (k * elem_size + n - 1) * blk_size + j - v_offset] * - abs(impl->curl_orients[j + (3 * n - 1) * blk_size + e * 3 * elem_size]) + - uu[e * elem_size * num_comp + (k * elem_size + n) * blk_size + j - v_offset] * - abs(impl->curl_orients[j + (3 * n + 1) * blk_size + e * 3 * elem_size]); + for (CeedInt j = 0; j < block_end; j++) { + vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += + uu[e * elem_size * num_comp + (k * elem_size + n 
- 1) * block_size + j - v_offset] * + abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) + + uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * + abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); } } } return CEED_ERROR_SUCCESS; } -static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, - CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, - CeedVector u, CeedVector v, CeedRequest *request) { - const CeedScalar *uu; - CeedScalar *vv; - CeedInt num_elem, elem_size, v_offset; +static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, + const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, + bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { + CeedInt num_elem, elem_size, v_offset; + CeedRestrictionType rstr_type; + const CeedScalar *uu; + CeedScalar *vv; + CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - v_offset = start * blk_size * elem_size * num_comp; - CeedRestrictionType rstr_type; + v_offset = start * block_size * elem_size * num_comp; CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); - CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu)); + if (t_mode == CEED_TRANSPOSE) { // Sum into for transpose mode, E-vector to L-vector CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_HOST, &vv)); @@ -350,27 +366,30 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, const // Sum into for transpose mode switch (rstr_type) { case CEED_RESTRICTION_STRIDED: - CeedElemRestrictionApplyStridedTranspose_Ref_Core(r, num_comp, blk_size, start, stop, num_elem, elem_size, v_offset, uu, vv); + 
CeedElemRestrictionApplyStridedTranspose_Ref_Core(r, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv); break; case CEED_RESTRICTION_STANDARD: - CeedElemRestrictionApplyStandardTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); + CeedElemRestrictionApplyStandardTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); break; case CEED_RESTRICTION_ORIENTED: if (use_signs) { - CeedElemRestrictionApplyOrientedTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); + CeedElemRestrictionApplyOrientedTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + vv); } else { - CeedElemRestrictionApplyStandardTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); + CeedElemRestrictionApplyStandardTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + vv); } break; case CEED_RESTRICTION_CURL_ORIENTED: if (use_signs && use_orients) { - CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); } else if (use_orients) { - CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, + CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); } else { - CeedElemRestrictionApplyStandardTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); + CeedElemRestrictionApplyStandardTranspose_Ref_Core(r, num_comp, 
block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + vv); } break; } @@ -382,29 +401,30 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, const // Overwrite for notranspose mode switch (rstr_type) { case CEED_RESTRICTION_STRIDED: - CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(r, num_comp, blk_size, start, stop, num_elem, elem_size, v_offset, uu, vv); + CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(r, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv); break; case CEED_RESTRICTION_STANDARD: - CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); + CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + vv); break; case CEED_RESTRICTION_ORIENTED: if (use_signs) { - CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); } else { - CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); } break; case CEED_RESTRICTION_CURL_ORIENTED: if (use_signs && use_orients) { - CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, - vv); + CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, + uu, vv); } else if (use_orients) { - CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(r, num_comp, 
blk_size, comp_stride, start, stop, num_elem, elem_size, + CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); } else { - CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(r, num_comp, blk_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, + CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(r, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, v_offset, uu, vv); } break; @@ -419,77 +439,77 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction r, const //------------------------------------------------------------------------------ // ElemRestriction Apply - Common Sizes //------------------------------------------------------------------------------ -static int CeedElemRestrictionApply_Ref_110(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_110(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 1, 1, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); } -static int CeedElemRestrictionApply_Ref_111(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_111(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 1, 1, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); } -static int CeedElemRestrictionApply_Ref_180(CeedElemRestriction r, const 
CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_180(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 1, 8, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); } -static int CeedElemRestrictionApply_Ref_181(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_181(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 1, 8, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); } -static int CeedElemRestrictionApply_Ref_310(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_310(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 3, 1, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); } -static int CeedElemRestrictionApply_Ref_311(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_311(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest 
*request) { return CeedElemRestrictionApply_Ref_Core(r, 3, 1, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); } -static int CeedElemRestrictionApply_Ref_380(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_380(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 3, 8, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); } -static int CeedElemRestrictionApply_Ref_381(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_381(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 3, 8, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); } // LCOV_EXCL_START -static int CeedElemRestrictionApply_Ref_510(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_510(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 5, 1, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); } // LCOV_EXCL_STOP -static int CeedElemRestrictionApply_Ref_511(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int 
CeedElemRestrictionApply_Ref_511(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 5, 1, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); } // LCOV_EXCL_START -static int CeedElemRestrictionApply_Ref_580(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_580(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 5, 8, comp_stride, start, stop, t_mode, use_signs, use_orients, u, v, request); } // LCOV_EXCL_STOP -static int CeedElemRestrictionApply_Ref_581(CeedElemRestriction r, const CeedInt num_comp, const CeedInt blk_size, const CeedInt comp_stride, +static int CeedElemRestrictionApply_Ref_581(CeedElemRestriction r, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { return CeedElemRestrictionApply_Ref_Core(r, 5, 8, 1, start, stop, t_mode, use_signs, use_orients, u, v, request); @@ -499,45 +519,45 @@ static int CeedElemRestrictionApply_Ref_581(CeedElemRestriction r, const CeedInt // ElemRestriction Apply //------------------------------------------------------------------------------ static int CeedElemRestrictionApply_Ref(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { - CeedInt num_blk, blk_size, num_comp, comp_stride; - CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_blk)); - 
CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedInt num_block, block_size, num_comp, comp_stride; + CeedElemRestriction_Ref *impl; + + CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_block)); + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &block_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); - CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - - return impl->Apply(r, num_comp, blk_size, comp_stride, 0, num_blk, t_mode, true, true, u, v, request); + return impl->Apply(r, num_comp, block_size, comp_stride, 0, num_block, t_mode, true, true, u, v, request); } //------------------------------------------------------------------------------ // ElemRestriction Apply Unsigned //------------------------------------------------------------------------------ static int CeedElemRestrictionApplyUnsigned_Ref(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { - CeedInt num_blk, blk_size, num_comp, comp_stride; - CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_blk)); - CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedInt num_block, block_size, num_comp, comp_stride; + CeedElemRestriction_Ref *impl; + + CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_block)); + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &block_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); - CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - - return impl->Apply(r, num_comp, blk_size, comp_stride, 0, num_blk, t_mode, false, true, u, v, request); + return impl->Apply(r, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, true, u, v, request); } 
//------------------------------------------------------------------------------ // ElemRestriction Apply Unoriented //------------------------------------------------------------------------------ static int CeedElemRestrictionApplyUnoriented_Ref(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { - CeedInt num_blk, blk_size, num_comp, comp_stride; - CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_blk)); - CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedInt num_block, block_size, num_comp, comp_stride; + CeedElemRestriction_Ref *impl; + + CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_block)); + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &block_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); - CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - - return impl->Apply(r, num_comp, blk_size, comp_stride, 0, num_blk, t_mode, false, false, u, v, request); + return impl->Apply(r, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, false, u, v, request); } //------------------------------------------------------------------------------ @@ -545,23 +565,24 @@ static int CeedElemRestrictionApplyUnoriented_Ref(CeedElemRestriction r, CeedTra //------------------------------------------------------------------------------ static int CeedElemRestrictionApplyBlock_Ref(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { - CeedInt blk_size, num_comp, comp_stride; - CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedInt block_size, num_comp, comp_stride; + CeedElemRestriction_Ref *impl; + + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &block_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 
CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); - CeedElemRestriction_Ref *impl; CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - - return impl->Apply(r, num_comp, blk_size, comp_stride, block, block + 1, t_mode, true, true, u, v, request); + return impl->Apply(r, num_comp, block_size, comp_stride, block, block + 1, t_mode, true, true, u, v, request); } //------------------------------------------------------------------------------ // ElemRestriction Get Offsets //------------------------------------------------------------------------------ static int CeedElemRestrictionGetOffsets_Ref(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { + Ceed ceed; CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); - Ceed ceed; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide to HOST memory"); @@ -574,9 +595,10 @@ static int CeedElemRestrictionGetOffsets_Ref(CeedElemRestriction rstr, CeedMemTy // ElemRestriction Get Orientations //------------------------------------------------------------------------------ static int CeedElemRestrictionGetOrientations_Ref(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { + Ceed ceed; CeedElemRestriction_Ref *impl; + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); - Ceed ceed; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide to HOST memory"); @@ -589,9 +611,10 @@ static int CeedElemRestrictionGetOrientations_Ref(CeedElemRestriction rstr, Ceed // ElemRestriction Get Curl-Conforming Orientations //------------------------------------------------------------------------------ static int CeedElemRestrictionGetCurlOrientations_Ref(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { + Ceed ceed; CeedElemRestriction_Ref *impl; 
+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); - Ceed ceed; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide to HOST memory"); @@ -605,8 +628,8 @@ static int CeedElemRestrictionGetCurlOrientations_Ref(CeedElemRestriction rstr, //------------------------------------------------------------------------------ static int CeedElemRestrictionDestroy_Ref(CeedElemRestriction r) { CeedElemRestriction_Ref *impl; - CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); + CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); CeedCallBackend(CeedFree(&impl->offsets_allocated)); CeedCallBackend(CeedFree(&impl->orients_allocated)); CeedCallBackend(CeedFree(&impl->curl_orients_allocated)); @@ -619,37 +642,35 @@ static int CeedElemRestrictionDestroy_Ref(CeedElemRestriction r) { //------------------------------------------------------------------------------ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients, const CeedInt8 *curl_orients, CeedElemRestriction r) { + Ceed ceed; + CeedInt num_elem, elem_size, num_block, block_size, num_comp, comp_stride; + CeedRestrictionType rstr_type; CeedElemRestriction_Ref *impl; - CeedInt num_elem, elem_size, num_blk, blk_size, num_comp, comp_stride; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_blk)); - CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &blk_size)); + CeedCallBackend(CeedElemRestrictionGetNumBlocks(r, &num_block)); + CeedCallBackend(CeedElemRestrictionGetBlockSize(r, &block_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); - Ceed ceed; - 
CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + CeedInt layout[3] = {1, elem_size, elem_size * num_comp}; CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported"); CeedCallBackend(CeedCalloc(1, &impl)); // Offsets data - CeedRestrictionType rstr_type; CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); if (rstr_type != CEED_RESTRICTION_STRIDED) { - // Check indices for ref or memcheck backends - Ceed parent_ceed = ceed, curr_ceed = NULL; - while (parent_ceed != curr_ceed) { - curr_ceed = parent_ceed; - CeedCallBackend(CeedGetParent(curr_ceed, &parent_ceed)); - } const char *resource; - CeedCallBackend(CeedGetResource(parent_ceed, &resource)); + + // Check indices for ref or memcheck backends + CeedCallBackend(CeedGetResource(ceed, &resource)); if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked") || !strcmp(resource, "/cpu/self/memcheck/serial") || !strcmp(resource, "/cpu/self/memcheck/blocked")) { CeedSize l_size; - CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); + CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); for (CeedInt i = 0; i < num_elem * elem_size; i++) { CeedCheck(offsets[i] >= 0 && offsets[i] + (num_comp - 1) * comp_stride < l_size, ceed, CEED_ERROR_BACKEND, "Restriction offset %" CeedInt_FMT " (%" CeedInt_FMT ") out of range [0, %" CeedInt_FMT "]", i, offsets[i], l_size); @@ -706,7 +727,6 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, } CeedCallBackend(CeedElemRestrictionSetData(r, impl)); - CeedInt layout[3] = {1, elem_size, elem_size * num_comp}; CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Ref)); @@ -717,10 +737,11 @@ int 
CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Ref)); - // Set apply function based upon num_comp, blk_size, and comp_stride - CeedInt idx = -1; - if (blk_size < 10) idx = 100 * num_comp + 10 * blk_size + (comp_stride == 1); - switch (idx) { + // Set apply function based upon num_comp, block_size, and comp_stride + CeedInt index = -1; + + if (block_size < 10) index = 100 * num_comp + 10 * block_size + (comp_stride == 1); + switch (index) { case 110: impl->Apply = CeedElemRestrictionApply_Ref_110; break; @@ -765,7 +786,6 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, impl->Apply = CeedElemRestrictionApply_Ref_Core; break; } - return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-tensor.c b/backends/ref/ceed-ref-tensor.c index 9ed8687a30..cf7b0d59d5 100644 --- a/backends/ref/ceed-ref-tensor.c +++ b/backends/ref/ceed-ref-tensor.c @@ -16,6 +16,7 @@ static int CeedTensorContractApply_Ref(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) { CeedInt t_stride_0 = B, t_stride_1 = 1; + if (t_mode == CEED_TRANSPOSE) { t_stride_0 = 1; t_stride_1 = J; @@ -46,11 +47,10 @@ static int CeedTensorContractDestroy_Ref(CeedTensorContract contract) { return C //------------------------------------------------------------------------------ int CeedTensorContractCreate_Ref(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, 
"Apply", CeedTensorContractApply_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Destroy", CeedTensorContractDestroy_Ref)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref-vector.c b/backends/ref/ceed-ref-vector.c index 0762ca71bc..5c40789682 100644 --- a/backends/ref/ceed-ref-vector.c +++ b/backends/ref/ceed-ref-vector.c @@ -17,10 +17,10 @@ //------------------------------------------------------------------------------ static int CeedVectorHasValidArray_Ref(CeedVector vec, bool *has_valid_array) { CeedVector_Ref *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); *has_valid_array = impl->array; - return CEED_ERROR_SUCCESS; } @@ -28,9 +28,10 @@ static int CeedVectorHasValidArray_Ref(CeedVector vec, bool *has_valid_array) { // Check if has borrowed array of given type //------------------------------------------------------------------------------ static inline int CeedVectorHasBorrowedArrayOfType_Ref(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { + Ceed ceed; CeedVector_Ref *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); switch (mem_type) { @@ -43,7 +44,6 @@ static inline int CeedVectorHasBorrowedArrayOfType_Ref(const CeedVector vec, Cee // LCOV_EXCL_STOP break; } - return CEED_ERROR_SUCCESS; } @@ -51,11 +51,12 @@ static inline int CeedVectorHasBorrowedArrayOfType_Ref(const CeedVector vec, Cee // Vector Set Array //------------------------------------------------------------------------------ static int CeedVectorSetArray_Ref(CeedVector vec, CeedMemType mem_type, CeedCopyMode copy_mode, CeedScalar *array) { + Ceed ceed; + CeedSize length; CeedVector_Ref *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can 
only set HOST memory for this backend"); @@ -87,9 +88,10 @@ static int CeedVectorSetArray_Ref(CeedVector vec, CeedMemType mem_type, CeedCopy // Vector Take Array //------------------------------------------------------------------------------ static int CeedVectorTakeArray_Ref(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + Ceed ceed; CeedVector_Ref *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); @@ -97,7 +99,6 @@ static int CeedVectorTakeArray_Ref(CeedVector vec, CeedMemType mem_type, CeedSca (*array) = impl->array_borrowed; impl->array_borrowed = NULL; impl->array = NULL; - return CEED_ERROR_SUCCESS; } @@ -105,15 +106,15 @@ static int CeedVectorTakeArray_Ref(CeedVector vec, CeedMemType mem_type, CeedSca // Vector Get Array //------------------------------------------------------------------------------ static int CeedVectorGetArrayCore_Ref(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { + Ceed ceed; CeedVector_Ref *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed ceed; CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Can only provide HOST memory for this backend"); *array = impl->array; - return CEED_ERROR_SUCCESS; } @@ -136,6 +137,7 @@ static int CeedVectorGetArray_Ref(CeedVector vec, CeedMemType mem_type, CeedScal //------------------------------------------------------------------------------ static int CeedVectorGetArrayWrite_Ref(CeedVector vec, CeedMemType mem_type, const CeedScalar **array) { CeedVector_Ref *impl; + CeedCallBackend(CeedVectorGetData(vec, &impl)); if (!impl->array) { @@ -148,7 +150,6 @@ static int CeedVectorGetArrayWrite_Ref(CeedVector vec, CeedMemType mem_type, con else impl->array = impl->array_owned; } } - return CeedVectorGetArrayCore_Ref(vec, mem_type, 
(CeedScalar **)array); } @@ -167,8 +168,8 @@ static int CeedVectorRestoreArrayRead_Ref(CeedVector vec) { return CEED_ERROR_SU //------------------------------------------------------------------------------ static int CeedVectorDestroy_Ref(CeedVector vec) { CeedVector_Ref *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedFree(&impl->array_owned)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -178,10 +179,10 @@ static int CeedVectorDestroy_Ref(CeedVector vec) { // Vector Create //------------------------------------------------------------------------------ int CeedVectorCreate_Ref(CeedSize n, CeedVector vec) { - CeedVector_Ref *impl; Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedVector_Ref *impl; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Ref)); @@ -192,10 +193,8 @@ int CeedVectorCreate_Ref(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArray", CeedVectorRestoreArray_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "RestoreArrayRead", CeedVectorRestoreArrayRead_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Ref)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedVectorSetData(vec, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref.c b/backends/ref/ceed-ref.c index 3850fb79f6..e75a69133c 100644 --- a/backends/ref/ceed-ref.c +++ b/backends/ref/ceed-ref.c @@ -30,7 +30,6 @@ static int CeedInit_Ref(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetBackendFunction(ceed, 
"Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Ref)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/ref/ceed-ref.h b/backends/ref/ceed-ref.h index c88c2c0685..28e7bc0263 100644 --- a/backends/ref/ceed-ref.h +++ b/backends/ref/ceed-ref.h @@ -47,7 +47,7 @@ typedef struct { } CeedQFunctionContext_Ref; typedef struct { - bool is_identity_qf, is_identity_restr_op; + bool is_identity_qf, is_identity_rstr_op; CeedVector *e_vecs_full; /* Full E-vectors, inputs followed by outputs */ uint64_t *input_states; /* State counter of inputs */ CeedVector *e_vecs_in; /* Single element input E-vectors */ diff --git a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp index 6b3fb389ed..a3f9c86d1c 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp @@ -29,22 +29,26 @@ //------------------------------------------------------------------------------ extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); + if (dim == 1) { CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; - elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - block_sizes[0] = thread1d; - block_sizes[1] = 1; - block_sizes[2] = elems_per_block; + + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + block_sizes[0] = thread1d; + block_sizes[1] = 1; + block_sizes[2] = elems_per_block; } else if (dim == 2) { const CeedInt elems_per_block = thread1d < 4 ? 
16 : 2; - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; - block_sizes[2] = elems_per_block; + + block_sizes[0] = thread1d; + block_sizes[1] = thread1d; + block_sizes[2] = elems_per_block; } else if (dim == 3) { const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); - block_sizes[0] = thread1d; - block_sizes[1] = thread1d; - block_sizes[2] = elems_per_block; + + block_sizes[0] = thread1d; + block_sizes[1] = thread1d; + block_sizes[2] = elems_per_block; } return CEED_ERROR_SUCCESS; } @@ -55,44 +59,44 @@ extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d // - [ ] Do kernel jitting! //------------------------------------------------------------------------------ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { - bool is_setup_done; + Ceed ceed; + Ceed_Sycl *sycl_data; + bool is_setup_done, is_identity_qf; + CeedSize l_size; + CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; + Fields_Sycl h_B, h_G; + FieldsInt_Sycl h_indices; + CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedElemRestriction_Sycl *rstr_impl; + CeedBasis basis; + CeedBasis_Sycl_shared *basis_impl; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Sycl_gen *qf_impl; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); if (is_setup_done) return CEED_ERROR_SUCCESS; - Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); - CeedOperator_Sycl_gen *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - Fields_Sycl h_B, h_G; - FieldsInt_Sycl h_indices; - CeedQFunction qf; - CeedQFunction_Sycl_gen *qf_impl; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); - CeedSize lsize; - CeedInt Q, P_1d = 0, Q_1d 
= 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); Q_1d = Q; - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedBasis basis; - CeedBasis_Sycl_shared *basis_impl; - CeedElemRestriction Erestrict; - CeedElemRestriction_Sycl *restr_impl; - // Check for restriction only identity operator - bool is_identity_qf; CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); if (is_identity_qf) { CeedEvalMode eval_mode_in, eval_mode_out; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) { @@ -106,6 +110,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // TODO: generalize to accept different device functions? 
{ char *tensor_basis_kernel_path, *tensor_basis_code; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code)); @@ -115,6 +120,7 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { } { char *sycl_gen_template_path, *sycl_gen_template_source; + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path)); CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source)); @@ -132,14 +138,15 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { + bool is_tensor; + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); // Collect dim, P_1d, and Q_1d CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - bool isTensor; - CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); - if (isTensor) { + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + if (is_tensor) { CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d; @@ -156,14 +163,15 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); if (basis != CEED_BASIS_NONE) { + bool is_tensor; + CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); // Collect Q_1d 
CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - bool isTensor; - CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); - if (isTensor) { + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + if (is_tensor) { CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); } else { // LCOV_EXCL_START @@ -178,8 +186,10 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { // Only use 3D collocated gradient parallelization strategy when gradient is computed // TODO: put in a function? bool use_collograd_parallelization = false; + if (dim == 3) { bool was_grad_found = false; + for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_GRAD) { @@ -260,10 +270,10 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { code << " // ---- Input field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Set field constants if (eval_mode != CEED_EVAL_WEIGHT) { @@ -318,10 +328,10 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " // ---- Output field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); - 
CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Set field constants CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); @@ -384,35 +394,39 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { for (CeedInt i = 0; i < num_input_fields; i++) { code << " // ---- Input field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Restriction if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { + bool is_strided; + code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; - bool is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); - code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; CeedInt comp_stride; - 
CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); - h_indices.inputs[i] = restr_impl->d_ind; + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); + h_indices.inputs[i] = rstr_impl->d_ind; code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i << "], d_u_" << i << ", r_u_" << i << ");\n"; } else { - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] @@ -497,31 +511,35 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; switch (eval_mode) { case CEED_EVAL_NONE: + bool is_strided; + code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; - bool is_strided; - 
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); - code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); - h_indices.inputs[i] = restr_impl->d_ind; + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); + h_indices.inputs[i] = rstr_impl->d_ind; code << " readSliceQuadsOffset" - << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, lsize_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_" + << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, l_size_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_" << i << ", r_q_" << i << ");\n"; } else { - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt 
strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " readSliceQuadsStrided" @@ -647,10 +665,10 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { for (CeedInt i = 0; i < num_output_fields; i++) { code << " // ---- Output field " << i << " ----\n"; // Get elem_size, eval_mode, num_comp - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); // Basis action code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; switch (eval_mode) { @@ -691,25 +709,29 @@ extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { } // Restriction bool is_strided; - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); + + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); if (!is_strided) { - CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); - code << " const CeedInt lsize_out_" << i << " = " << lsize << ";\n"; CeedInt comp_stride; - CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, 
&comp_stride)); code << " // CompStride: " << comp_stride << "\n"; - CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); - h_indices.outputs[i] = restr_impl->d_ind; + CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); + h_indices.outputs[i] = rstr_impl->d_ind; code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i << "], r_v_" << i << ", d_v_" << i << ");\n"; } else { - bool has_backend_strides; - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); + bool has_backend_strides; CeedInt num_elem; - CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); + CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; + if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); + CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, &strides)); } code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] diff --git a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp index 80a63a4b0a..8364b9f6da 100644 --- a/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-operator.sycl.cpp @@ -18,6 +18,7 @@ //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Sycl_gen(CeedOperator op) { CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); CeedCallBackend(CeedFree(&impl)); return CEED_ERROR_SUCCESS; @@ -27,24 +28,25 @@ static int 
CeedOperatorDestroy_Sycl_gen(CeedOperator op) { // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - Ceed ceed; + Ceed ceed; + Ceed_Sycl *ceed_Sycl; + CeedInt num_elem, num_input_fields, num_output_fields; + CeedEvalMode eval_mode; + CeedVector output_vecs[CEED_FIELD_MAX] = {}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction_Sycl_gen *qf_impl; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Sycl *ceed_Sycl; CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); - CeedOperator_Sycl_gen *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; - CeedQFunction_Sycl_gen *qf_impl; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); - CeedInt num_elem, num_input_fields, num_output_fields; CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); - CeedOperatorField *op_input_fields, *op_output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); - CeedQFunctionField *qf_input_fields, *qf_output_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedEvalMode eval_mode; - CeedVector vec, output_vecs[CEED_FIELD_MAX] = {}; // Creation of the operator CeedCallBackend(CeedOperatorBuildKernel_Sycl_gen(op)); @@ -55,6 +57,8 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, if (eval_mode == CEED_EVAL_WEIGHT) { // Skip impl->fields->inputs[i] = NULL; } else { + CeedVector vec; + // Get input vector CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; @@ -68,6 +72,8 @@ 
static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, if (eval_mode == CEED_EVAL_WEIGHT) { // Skip impl->fields->outputs[i] = NULL; } else { + CeedVector vec; + // Get output vector CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; @@ -96,6 +102,7 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, const CeedInt Q_1d = impl->Q_1d; const CeedInt P_1d = impl->max_P_1d; CeedInt block_sizes[3], grid = 0; + CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); if (dim == 1) { grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); @@ -128,6 +135,8 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = input_vec; CeedCallBackend(CeedVectorRestoreArrayRead(vec, &impl->fields->inputs[i])); @@ -139,10 +148,13 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); if (eval_mode == CEED_EVAL_WEIGHT) { // Skip } else { + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) vec = output_vec; // Check for multiple output modes CeedInt index = -1; + for (CeedInt j = 0; j < i; j++) { if (vec == output_vecs[j]) { index = j; @@ -157,7 +169,6 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, CeedVector input_vec, // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_impl->d_c)); - return CEED_ERROR_SUCCESS; } @@ -165,12 +176,13 @@ static int CeedOperatorApplyAdd_Sycl_gen(CeedOperator op, 
CeedVector input_vec, // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Sycl_gen(CeedOperator op) { - Ceed ceed; + Ceed ceed; + Ceed_Sycl *sycl_data; + CeedOperator_Sycl_gen *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); - CeedOperator_Sycl_gen *impl; CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); diff --git a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp index 951474476a..a1ba9d8d7d 100644 --- a/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen-qfunction.sycl.cpp @@ -18,6 +18,7 @@ //------------------------------------------------------------------------------ static int CeedQFunctionApply_Sycl_gen(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { Ceed ceed; + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement QFunctionApply"); } @@ -26,11 +27,12 @@ static int CeedQFunctionApply_Sycl_gen(CeedQFunction qf, CeedInt Q, CeedVector * // Destroy QFunction //------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) { - Ceed ceed; - CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); + Ceed ceed; + Ceed_Sycl *data; CeedQFunction_Sycl_gen *impl; + + CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); CeedCallBackend(CeedQFunctionGetData(qf, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Wait for all work to finish before freeing memory @@ -46,9 +48,10 @@ static int CeedQFunctionDestroy_Sycl_gen(CeedQFunction qf) { // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Sycl_gen(CeedQFunction qf) { - Ceed ceed; - CeedQFunctionGetCeed(qf, 
&ceed); + Ceed ceed; CeedQFunction_Sycl_gen *impl; + + CeedQFunctionGetCeed(qf, &ceed); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionSetData(qf, impl)); diff --git a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp index a5084ff6b7..cca5aa01e8 100644 --- a/backends/sycl-gen/ceed-sycl-gen.sycl.cpp +++ b/backends/sycl-gen/ceed-sycl-gen.sycl.cpp @@ -18,7 +18,11 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { - char *resource_root; + Ceed ceed_shared; + Ceed_Sycl *data, *shared_data; + char *resource_root; + const char fallback_resource[] = "/gpu/sycl/ref"; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":device_id=", &resource_root)); if (strcmp(resource_root, "/gpu/sycl") && strcmp(resource_root, "/gpu/sycl/gen")) { // LCOV_EXCL_START @@ -27,23 +31,19 @@ static int CeedInit_Sycl_gen(const char *resource, Ceed ceed) { } CeedCallBackend(CeedFree(&resource_root)); - Ceed_Sycl *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Sycl(ceed, resource)); - Ceed ceed_shared; CeedCallBackend(CeedInit("/gpu/sycl/shared", &ceed_shared)); - Ceed_Sycl *shared_data; CeedCallBackend(CeedGetData(ceed_shared, &shared_data)); // Need to use the same queue everywhere for correct synchronization shared_data->sycl_queue = data->sycl_queue; CeedCallBackend(CeedSetDelegate(ceed, ceed_shared)); - const char fallbackresource[] = "/gpu/sycl/ref"; - CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallbackresource)); + CeedCallBackend(CeedSetOperatorFallbackResource(ceed, fallback_resource)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Sycl_gen)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Sycl_gen)); diff --git 
a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp index 601ebdda95..0308b3bd9a 100644 --- a/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-basis.sycl.cpp @@ -616,7 +616,7 @@ int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const // Create non-tensor //------------------------------------------------------------------------------ int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, - const CeedScalar *qref, const CeedScalar *q_weight, CeedBasis basis) { + const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { Ceed ceed; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedBasisNonTensor_Sycl *impl; diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp index a5e87136d6..1a8b9370d5 100644 --- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp @@ -23,17 +23,17 @@ class CeedOperatorSyclLinearAssembleFallback; //------------------------------------------------------------------------------ // Get Basis Emode Pointer //------------------------------------------------------------------------------ -void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, const CeedScalar *interp, +void CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, const CeedScalar *interp, const CeedScalar *grad) { - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - *basisptr = identity; + *basis_ptr = identity; break; case CEED_EVAL_INTERP: - *basisptr = interp; + *basis_ptr = interp; break; case CEED_EVAL_GRAD: - *basisptr = grad; + *basis_ptr = grad; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -46,52 +46,53 @@ void 
CeedOperatorGetBasisPointer_Sycl(const CeedScalar **basisptr, CeedEvalMode // Destroy operator //------------------------------------------------------------------------------ static int CeedOperatorDestroy_Sycl(CeedOperator op) { + Ceed ceed; + Ceed_Sycl *sycl_data; CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); - Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Apply data - for (CeedInt i = 0; i < impl->numein + impl->numeout; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->evecs[i])); + for (CeedInt i = 0; i < impl->num_e_in + impl->num_e_out; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->e_vecs[i])); } - CeedCallBackend(CeedFree(&impl->evecs)); + CeedCallBackend(CeedFree(&impl->e_vecs)); - for (CeedInt i = 0; i < impl->numein; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qvecsin[i])); + for (CeedInt i = 0; i < impl->num_e_in; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_in[i])); } - CeedCallBackend(CeedFree(&impl->qvecsin)); + CeedCallBackend(CeedFree(&impl->q_vecs_in)); - for (CeedInt i = 0; i < impl->numeout; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qvecsout[i])); + for (CeedInt i = 0; i < impl->num_e_out; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->q_vecs_out[i])); } - CeedCallBackend(CeedFree(&impl->qvecsout)); + CeedCallBackend(CeedFree(&impl->q_vecs_out)); // QFunction assembly data - for (CeedInt i = 0; i < impl->qfnumactivein; i++) { - CeedCallBackend(CeedVectorDestroy(&impl->qfactivein[i])); + for (CeedInt i = 0; i < impl->num_active_in; i++) { + CeedCallBackend(CeedVectorDestroy(&impl->qf_active_in[i])); } - CeedCallBackend(CeedFree(&impl->qfactivein)); + CeedCallBackend(CeedFree(&impl->qf_active_in)); // Diag data if (impl->diag) { - CeedCallBackend(CeedFree(&impl->diag->h_emodein)); - CeedCallBackend(CeedFree(&impl->diag->h_emodeout)); + CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); + 
CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); CeedCallSycl(ceed, sycl_data->sycl_queue.wait_and_throw()); - CeedCallSycl(ceed, sycl::free(impl->diag->d_emodein, sycl_data->sycl_context)); - CeedCallSycl(ceed, sycl::free(impl->diag->d_emodeout, sycl_data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_in, sycl_data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_e_mode_out, sycl_data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->diag->d_identity, sycl_data->sycl_context)); - CeedCallSycl(ceed, sycl::free(impl->diag->d_interpin, sycl_data->sycl_context)); - CeedCallSycl(ceed, sycl::free(impl->diag->d_interpout, sycl_data->sycl_context)); - CeedCallSycl(ceed, sycl::free(impl->diag->d_gradin, sycl_data->sycl_context)); - CeedCallSycl(ceed, sycl::free(impl->diag->d_gradout, sycl_data->sycl_context)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->pbdiagrstr)); - - CeedCallBackend(CeedVectorDestroy(&impl->diag->elemdiag)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->pbelemdiag)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_in, sycl_data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_interp_out, sycl_data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_in, sycl_data->sycl_context)); + CeedCallSycl(ceed, sycl::free(impl->diag->d_grad_out, sycl_data->sycl_context)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); + + CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); } CeedCallBackend(CeedFree(&impl->diag)); @@ -109,88 +110,88 @@ static int CeedOperatorDestroy_Sycl(CeedOperator op) { //------------------------------------------------------------------------------ // Setup infields or outfields //------------------------------------------------------------------------------ -static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, 
bool isinput, CeedVector *evecs, CeedVector *qvecs, CeedInt starte, - CeedInt numfields, CeedInt Q, CeedInt numelements) { - CeedInt dim, size; - CeedSize q_size; - Ceed ceed; +static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, + CeedInt num_fields, CeedInt Q, CeedInt num_elem) { + Ceed ceed; + CeedSize q_size; + bool is_strided, skip_restriction; + CeedInt dim, size; + CeedOperatorField *op_fields; + CeedQFunctionField *qf_fields; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedBasis basis; - CeedElemRestriction Erestrict; - CeedOperatorField *opfields; - CeedQFunctionField *qffields; - CeedVector fieldvec; - bool strided; - bool skiprestrict; - - if (isinput) { - CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); + if (is_input) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); } else { - CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); } // Loop over fields - for (CeedInt i = 0; i < numfields; i++) { - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); + for (CeedInt i = 0; i < num_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; + CeedElemRestriction rstr; + CeedBasis basis; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); - strided = false; - skiprestrict = false; - if (emode != CEED_EVAL_WEIGHT) { - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &Erestrict)); + is_strided = false; + skip_restriction = false; + if 
(e_mode != CEED_EVAL_WEIGHT) { + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); // Check whether this field can skip the element restriction: - // must be passive input, with emode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. + // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. // First, check whether the field is input or output: - if (isinput) { + if (is_input) { // Check for passive input: - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &fieldvec)); - if (fieldvec != CEED_VECTOR_ACTIVE) { - // Check emode - if (emode == CEED_EVAL_NONE) { - // Check for strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &strided)); - if (strided) { + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec != CEED_VECTOR_ACTIVE) { + // Check e_mode + if (e_mode == CEED_EVAL_NONE) { + // Check for is_strided restriction + CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided)); + if (is_strided) { // Check if vector is already in preferred backend ordering - CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &skiprestrict)); + CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &skip_restriction)); } } } } - if (skiprestrict) { + if (skip_restriction) { // We do not need an E-Vector, but will use the input field vector's data directly in the operator application - evecs[i + starte] = NULL; + e_vecs[i + start_e] = NULL; } else { - CeedCallBackend(CeedElemRestrictionCreateVector(Erestrict, NULL, &evecs[i + starte])); + CeedCallBackend(CeedElemRestrictionCreateVector(rstr, NULL, &e_vecs[i + start_e])); } } - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + 
q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); - CeedCallBackend(CeedQFunctionFieldGetSize(qffields[i], &size)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - q_size = (CeedSize)numelements * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); break; case CEED_EVAL_WEIGHT: // Only on input fields - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basis)); - q_size = (CeedSize)numelements * Q; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &qvecs[i])); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, NULL, qvecs[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); + q_size = (CeedSize)num_elem * Q; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, NULL, q_vecs[i])); break; case CEED_EVAL_DIV: break; // TODO: Not implemented @@ -206,39 +207,39 @@ static int CeedOperatorSetupFields_Sycl(CeedQFunction qf, CeedOperator op, bool // passive) to the named inputs and outputs of its CeedQFunction. 
//------------------------------------------------------------------------------ static int CeedOperatorSetup_Sycl(CeedOperator op) { - bool setupdone; - CeedCallBackend(CeedOperatorIsSetupDone(op, &setupdone)); - if (setupdone) return CEED_ERROR_SUCCESS; + Ceed ceed; + bool is_setup_done; + CeedInt Q, num_elem, num_input_fields, num_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Sycl *impl; + + CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); + if (is_setup_done) return CEED_ERROR_SUCCESS; - Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Sycl *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, numelements, numinputfields, numoutputfields; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedOperatorField *opinputfields, *opoutputfields; - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Allocate - CeedCallBackend(CeedCalloc(numinputfields + numoutputfields, &impl->evecs)); + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsin)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->qvecsout)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + 
CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); - impl->numein = numinputfields; - impl->numeout = numoutputfields; + impl->num_e_in = num_input_fields; + impl->num_e_out = num_output_fields; - // Set up infield and outfield evecs and qvecs + // Set up infield and outfield e_vecs and q_vecs // Infields - CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, true, impl->evecs, impl->qvecsin, 0, numinputfields, Q, numelements)); - + CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); // Outfields - CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->evecs, impl->qvecsout, numinputfields, numoutputfields, Q, numelements)); + CeedCallBackend(CeedOperatorSetupFields_Sycl(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); CeedCallBackend(CeedOperatorSetSetupDone(op)); return CEED_ERROR_SUCCESS; @@ -247,37 +248,37 @@ static int CeedOperatorSetup_Sycl(CeedOperator op) { //------------------------------------------------------------------------------ // Setup Operator Inputs //------------------------------------------------------------------------------ -static inline int CeedOperatorSetupInputs_Sycl(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedVector invec, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], +static inline int CeedOperatorSetupInputs_Sycl(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl, CeedRequest *request) { - CeedEvalMode emode; - CeedVector vec; - CeedElemRestriction Erestrict; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; + CeedElemRestriction rstr; - for (CeedInt i = 0; i < numinputfields; i++) { // Get input vector - 
CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { - if (skipactive) continue; - else vec = invec; + if (skip_active) continue; + else vec = in_vec; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_WEIGHT) { // Skip } else { // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Get input element restriction - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); - if (vec == CEED_VECTOR_ACTIVE) vec = invec; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr)); + if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; // Restrict, if necessary - if (!impl->evecs[i]) { + if (!impl->e_vecs[i]) { // No restriction for this field; read data directly from vec. 
- CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } else { - CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_NOTRANSPOSE, vec, impl->evecs[i], request)); + CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_NOTRANSPOSE, vec, impl->e_vecs[i], request)); // Get evec - CeedCallBackend(CeedVectorGetArrayRead(impl->evecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorGetArrayRead(impl->e_vecs[i], CEED_MEM_DEVICE, (const CeedScalar **)&e_data[i])); } } } @@ -287,38 +288,39 @@ static inline int CeedOperatorSetupInputs_Sycl(CeedInt numinputfields, CeedQFunc //------------------------------------------------------------------------------ // Input Basis Action //------------------------------------------------------------------------------ -static inline int CeedOperatorInputBasis_Sycl(CeedInt numelements, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - CeedInt numinputfields, const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], +static inline int CeedOperatorInputBasis_Sycl(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) { - CeedInt elemsize, size; - CeedElemRestriction Erestrict; - CeedEvalMode emode; - CeedBasis basis; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt elem_size, size; + CeedElemRestriction rstr; + CeedEvalMode e_mode; + CeedBasis basis; - for (CeedInt i = 0; i < numinputfields; i++) { // Skip active input - if (skipactive) { + if (skip_active) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } - // Get elemsize, 
emode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); + // Get elem_size, e_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); // Basis action - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - CeedCallBackend(CeedVectorSetArray(impl->qvecsin[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->evecs[i], impl->qvecsin[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opinputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->evecs[i], impl->qvecsin[i])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); break; case CEED_EVAL_WEIGHT: break; // No action @@ -334,25 +336,25 @@ static inline int CeedOperatorInputBasis_Sycl(CeedInt numelements, CeedQFunction 
//------------------------------------------------------------------------------ // Restore Input Vectors //------------------------------------------------------------------------------ -static inline int CeedOperatorRestoreInputs_Sycl(CeedInt numinputfields, CeedQFunctionField *qfinputfields, CeedOperatorField *opinputfields, - const bool skipactive, CeedScalar *edata[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) { - CeedEvalMode emode; - CeedVector vec; +static inline int CeedOperatorRestoreInputs_Sycl(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Sycl *impl) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode e_mode; + CeedVector vec; - for (CeedInt i = 0; i < numinputfields; i++) { // Skip active input - if (skipactive) { - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + if (skip_active) { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) continue; } - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode)); - if (emode == CEED_EVAL_WEIGHT) { // Skip + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_WEIGHT) { // Skip } else { - if (!impl->evecs[i]) { // This was a skiprestrict case - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); - CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&edata[i])); + if (!impl->e_vecs[i]) { // This was a skip_restriction case + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + CeedCallBackend(CeedVectorRestoreArrayRead(vec, (const CeedScalar **)&e_data[i])); } else { - CeedCallBackend(CeedVectorRestoreArrayRead(impl->evecs[i], (const CeedScalar **)&edata[i])); + CeedCallBackend(CeedVectorRestoreArrayRead(impl->e_vecs[i], (const CeedScalar **)&e_data[i])); } } } @@ -362,64 
+364,65 @@ static inline int CeedOperatorRestoreInputs_Sycl(CeedInt numinputfields, CeedQFu //------------------------------------------------------------------------------ // Apply and add to output //------------------------------------------------------------------------------ -static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector invec, CeedVector outvec, CeedRequest *request) { - CeedOperator_Sycl *impl; +static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { + CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; + CeedEvalMode e_mode; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {0}; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt Q, numelements, elemsize, numinputfields, numoutputfields, size; CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedOperatorField *opinputfields, *opoutputfields; - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); - CeedEvalMode emode; - CeedVector vec; - CeedBasis basis; - CeedElemRestriction Erestrict; - CeedScalar *edata[2 * CEED_FIELD_MAX] = {0}; + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Setup CeedCallBackend(CeedOperatorSetup_Sycl(op)); // Input Evecs and Restriction - 
CeedCallBackend(CeedOperatorSetupInputs_Sycl(numinputfields, qfinputfields, opinputfields, invec, false, edata, impl, request)); + CeedCallBackend(CeedOperatorSetupInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); // Input basis apply if needed - CeedCallBackend(CeedOperatorInputBasis_Sycl(numelements, qfinputfields, opinputfields, numinputfields, false, edata, impl)); + CeedCallBackend(CeedOperatorInputBasis_Sycl(num_elem, qf_input_fields, op_input_fields, num_input_fields, false, e_data, impl)); // Output pointers, as necessary - for (CeedInt i = 0; i < numoutputfields; i++) { - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - if (emode == CEED_EVAL_NONE) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_NONE) { // Set the output Q-Vector to use the E-Vector data directly - CeedCallBackend(CeedVectorGetArrayWrite(impl->evecs[i + impl->numein], CEED_MEM_DEVICE, &edata[i + numinputfields])); - CeedCallBackend(CeedVectorSetArray(impl->qvecsout[i], CEED_MEM_DEVICE, CEED_USE_POINTER, edata[i + numinputfields])); + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_e_in], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); } } // Q function - CeedCallBackend(CeedQFunctionApply(qf, numelements * Q, impl->qvecsin, impl->qvecsout)); + CeedCallBackend(CeedQFunctionApply(qf, num_elem * Q, impl->q_vecs_in, impl->q_vecs_out)); // Output basis apply if needed - for (CeedInt i = 0; i < numoutputfields; i++) { - // Get elemsize, emode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); - CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elemsize)); - 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedElemRestriction rstr; + CeedBasis basis; + + // Get elem_size, e_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); // Basis action - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: break; case CEED_EVAL_INTERP: - CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->qvecsout[i], impl->evecs[i + impl->numein])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in])); break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedOperatorFieldGetBasis(opoutputfields[i], &basis)); - CeedCallBackend(CeedBasisApply(basis, numelements, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->qvecsout[i], impl->evecs[i + impl->numein])); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_e_in])); break; // LCOV_EXCL_START case CEED_EVAL_WEIGHT: @@ -436,24 +439,27 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector invec, CeedVect } // Output restriction - for (CeedInt i = 0; i < numoutputfields; i++) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + CeedElemRestriction rstr; + // Restore evec - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode)); - 
if (emode == CEED_EVAL_NONE) { - CeedCallBackend(CeedVectorRestoreArray(impl->evecs[i + impl->numein], &edata[i + numinputfields])); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); + if (e_mode == CEED_EVAL_NONE) { + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_e_in], &e_data[i + num_input_fields])); } // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Restrict - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &rstr)); // Active - if (vec == CEED_VECTOR_ACTIVE) vec = outvec; + if (vec == CEED_VECTOR_ACTIVE) vec = out_vec; - CeedCallBackend(CeedElemRestrictionApply(Erestrict, CEED_TRANSPOSE, impl->evecs[i + impl->numein], vec, request)); + CeedCallBackend(CeedElemRestrictionApply(rstr, CEED_TRANSPOSE, impl->e_vecs[i + impl->num_e_in], vec, request)); } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Sycl(numinputfields, qfinputfields, opinputfields, false, edata, impl)); + CeedCallBackend(CeedOperatorRestoreInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, false, e_data, impl)); return CEED_ERROR_SUCCESS; } @@ -462,81 +468,88 @@ static int CeedOperatorApplyAdd_Sycl(CeedOperator op, CeedVector invec, CeedVect //------------------------------------------------------------------------------ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, CeedRequest *request) { - Ceed ceed, ceedparent; - CeedOperator_Sycl *impl; - CeedQFunction qf; - CeedQFunctionField *qfinputfields, *qfoutputfields; - CeedOperatorField *opinputfields, *opoutputfields; - CeedVector vec, *activein; - CeedInt numactivein, numactiveout, Q, numelements, numinputfields, numoutputfields, size; + 
Ceed ceed, ceed_parent; CeedSize q_size; - CeedScalar *a, *tmp, *edata[2 * CEED_FIELD_MAX] = {NULL}; + CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; + CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedVector *active_in; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; + CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceedparent)); + CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); CeedCallBackend(CeedOperatorGetData(op, &impl)); - activein = impl->qfactivein; - numactivein = impl->qfnumactivein, numactiveout = impl->qfnumactiveout; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &numelements)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qfinputfields, NULL, &qfoutputfields)); - CeedCallBackend(CeedOperatorGetFields(op, &numinputfields, &opinputfields, &numoutputfields, &opoutputfields)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + active_in = impl->qf_active_in; + num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; // Setup CeedCallBackend(CeedOperatorSetup_Sycl(op)); // Check for identity - bool identityqf; - CeedCallBackend(CeedQFunctionIsIdentity(qf, &identityqf)); - if (identityqf) { + bool is_identity_qf; + CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); + if (is_identity_qf) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Assembling identity QFunctions not supported"); // LCOV_EXCL_STOP } // 
Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Sycl(numinputfields, qfinputfields, opinputfields, NULL, true, edata, impl, request)); + CeedCallBackend(CeedOperatorSetupInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); // Count number of active input fields - if (!numactivein) { - for (CeedInt i = 0; i < numinputfields; i++) { + if (!num_active_in) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedScalar *q_vec_array; + CeedVector vec; + // Get input vector - CeedCallBackend(CeedOperatorFieldGetVector(opinputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); // Check if active input if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qfinputfields[i], &size)); - CeedCallBackend(CeedVectorSetValue(impl->qvecsin[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->qvecsin[i], CEED_MEM_DEVICE, &tmp)); - CeedCallBackend(CeedRealloc(numactivein + size, &activein)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); + CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); for (CeedInt field = 0; field < size; field++) { - q_size = (CeedSize)Q * numelements; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &activein[numactivein + field])); - CeedCallBackend(CeedVectorSetArray(activein[numactivein + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &tmp[field * Q * numelements])); + q_size = (CeedSize)Q * num_elem; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); + CeedCallBackend( + CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); } - numactivein += size; - CeedCallBackend(CeedVectorRestoreArray(impl->qvecsin[i], &tmp)); + num_active_in 
+= size; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); } } - impl->qfnumactivein = numactivein; - impl->qfactivein = activein; + impl->num_active_in = num_active_in; + impl->qf_active_in = active_in; } // Count number of active output fields - if (!numactiveout) { - for (CeedInt i = 0; i < numoutputfields; i++) { + if (!num_active_out) { + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[i], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[i], &size)); - numactiveout += size; + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); + num_active_out += size; } } - impl->qfnumactiveout = numactiveout; + impl->num_active_out = num_active_out; } // Check sizes - if (!numactivein || !numactiveout) { + if (!num_active_in || !num_active_out) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Cannot assemble QFunction without active inputs and outputs"); // LCOV_EXCL_STOP @@ -544,58 +557,62 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Sycl(CeedOperator op, // Build objects if needed if (build_objects) { + CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ + // Create output restriction - CeedInt strides[3] = {1, numelements * Q, Q}; /* *NOPAD* */ - CeedCallBackend(CeedElemRestrictionCreateStrided(ceedparent, numelements, Q, numactivein * numactiveout, - numactivein * numactiveout * numelements * Q, strides, rstr)); + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, rstr)); // Create assembled vector - CeedSize l_size = (CeedSize)numelements * Q * 
numactivein * numactiveout; - CeedCallBackend(CeedVectorCreate(ceedparent, l_size, assembled)); + CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); } CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); - CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &a)); + CeedCallBackend(CeedVectorGetArray(*assembled, CEED_MEM_DEVICE, &assembled_array)); // Input basis apply - CeedCallBackend(CeedOperatorInputBasis_Sycl(numelements, qfinputfields, opinputfields, numinputfields, true, edata, impl)); + CeedCallBackend(CeedOperatorInputBasis_Sycl(num_elem, qf_input_fields, op_input_fields, num_input_fields, true, e_data, impl)); // Assemble QFunction - for (CeedInt in = 0; in < numactivein; in++) { + for (CeedInt in = 0; in < num_active_in; in++) { // Set Inputs - CeedCallBackend(CeedVectorSetValue(activein[in], 1.0)); - if (numactivein > 1) { - CeedCallBackend(CeedVectorSetValue(activein[(in + numactivein - 1) % numactivein], 0.0)); + CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); + if (num_active_in > 1) { + CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); } // Set Outputs - for (CeedInt out = 0; out < numoutputfields; out++) { + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorSetArray(impl->qvecsout[out], CEED_MEM_DEVICE, CEED_USE_POINTER, a)); - CeedCallBackend(CeedQFunctionFieldGetSize(qfoutputfields[out], &size)); - a += size * Q * numelements; // Advance the pointer by the size of the output + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, CEED_USE_POINTER, assembled_array)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[out], &size)); + assembled_array += size 
* Q * num_elem; // Advance the pointer by the size of the output } } // Apply QFunction - CeedCallBackend(CeedQFunctionApply(qf, Q * numelements, impl->qvecsin, impl->qvecsout)); + CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); } // Un-set output Qvecs to prevent accidental overwrite of Assembled - for (CeedInt out = 0; out < numoutputfields; out++) { + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + // Get output vector - CeedCallBackend(CeedOperatorFieldGetVector(opoutputfields[out], &vec)); + CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[out], &vec)); // Check if active output if (vec == CEED_VECTOR_ACTIVE) { - CeedCallBackend(CeedVectorTakeArray(impl->qvecsout[out], CEED_MEM_DEVICE, NULL)); + CeedCallBackend(CeedVectorTakeArray(impl->q_vecs_out[out], CEED_MEM_DEVICE, NULL)); } } // Restore input arrays - CeedCallBackend(CeedOperatorRestoreInputs_Sycl(numinputfields, qfinputfields, opinputfields, true, edata, impl)); + CeedCallBackend(CeedOperatorRestoreInputs_Sycl(num_input_fields, qf_input_fields, op_input_fields, true, e_data, impl)); // Restore output - CeedCallBackend(CeedVectorRestoreArray(*assembled, &a)); - + CeedCallBackend(CeedVectorRestoreArray(*assembled, &assembled_array)); return CEED_ERROR_SUCCESS; } @@ -616,85 +633,90 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Sycl(CeedOperator op, CeedV //------------------------------------------------------------------------------ // Create point block restriction //------------------------------------------------------------------------------ -static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *pbRstr) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); +static int CreatePBRestriction(CeedElemRestriction rstr, CeedElemRestriction *point_block_rstr) { + Ceed ceed; + CeedSize l_size; + CeedInt num_elem, num_comp, elem_size, comp_stride, *point_block_offsets; const 
CeedInt *offsets; + + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedElemRestrictionGetOffsets(rstr, CEED_MEM_HOST, &offsets)); // Expand offsets - CeedInt nelem, ncomp, elemsize, compstride, *pbOffsets; - CeedSize l_size; - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &nelem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &ncomp)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elemsize)); - CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &compstride)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); - CeedInt shift = ncomp; - if (compstride != 1) shift *= ncomp; - CeedCallBackend(CeedCalloc(nelem * elemsize, &pbOffsets)); - for (CeedInt i = 0; i < nelem * elemsize; i++) { - pbOffsets[i] = offsets[i] * shift; + CeedInt shift = num_comp; + + if (comp_stride != 1) shift *= num_comp; + CeedCallBackend(CeedCalloc(num_elem * elem_size, &point_block_offsets)); + for (CeedInt i = 0; i < num_elem * elem_size; i++) { + point_block_offsets[i] = offsets[i] * shift; } // Create new restriction - CeedCallBackend( - CeedElemRestrictionCreate(ceed, nelem, elemsize, ncomp * ncomp, 1, l_size * ncomp, CEED_MEM_HOST, CEED_OWN_POINTER, pbOffsets, pbRstr)); + CeedCallBackend(CeedElemRestrictionCreate(ceed, num_elem, elem_size, num_comp * num_comp, 1, l_size * num_comp, CEED_MEM_HOST, CEED_OWN_POINTER, + point_block_offsets, point_block_rstr)); // Cleanup CeedCallBackend(CeedElemRestrictionRestoreOffsets(rstr, &offsets)); - return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Assemble diagonal setup 
//------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const bool pointBlock) { - Ceed ceed; +static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const bool is_point_block) { + Ceed ceed; + Ceed_Sycl *sycl_data; + CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0; + CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *op_fields; + CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedInt numinputfields, numoutputfields; - CeedCallBackend(CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields)); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Determine active input basis - CeedOperatorField *opfields; - CeedQFunctionField *qffields; - CeedCallBackend(CeedOperatorGetFields(op, NULL, &opfields, NULL, NULL)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qffields, NULL, NULL)); - CeedInt numemodein = 0, ncomp = 0, dim = 1; - CeedEvalMode *emodein = NULL; - CeedBasis basisin = NULL; - CeedElemRestriction rstrin = NULL; - for (CeedInt i = 0; i < numinputfields; i++) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); + for (CeedInt i = 0; i < num_input_fields; i++) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode e_mode; CeedElemRestriction rstr; - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisin)); - 
CeedCallBackend(CeedBasisGetNumComponents(basisin, &ncomp)); - CeedCallBackend(CeedBasisGetDimension(basisin, &dim)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); - if (rstrin && rstrin != rstr) { + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); + CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + if (rstr_in && rstr_in != rstr) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); // LCOV_EXCL_STOP } - rstrin = rstr; - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - switch (emode) { + rstr_in = rstr; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + switch (e_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(numemodein + 1, &emodein)); - emodein[numemodein] = emode; - numemodein += 1; + CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); + e_mode_in[num_e_mode_in] = e_mode; + num_e_mode_in += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(numemodein + dim, &emodein)); - for (CeedInt d = 0; d < dim; d++) emodein[numemodein + d] = emode; - numemodein += dim; + CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); + for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; + num_e_mode_in += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -705,38 +727,36 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const } // Determine active output basis - CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &opfields)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qffields)); - CeedInt numemodeout = 0; - CeedEvalMode *emodeout = NULL; - CeedBasis basisout = NULL; - 
CeedElemRestriction rstrout = NULL; - for (CeedInt i = 0; i < numoutputfields; i++) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); + for (CeedInt i = 0; i < num_output_fields; i++) { CeedVector vec; - CeedCallBackend(CeedOperatorFieldGetVector(opfields[i], &vec)); + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode e_mode; CeedElemRestriction rstr; - CeedCallBackend(CeedOperatorFieldGetBasis(opfields[i], &basisout)); - CeedCallBackend(CeedOperatorFieldGetElemRestriction(opfields[i], &rstr)); - if (rstrout && rstrout != rstr) { + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); + if (rstr_out && rstr_out != rstr) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator diagonal assembly"); // LCOV_EXCL_STOP } - rstrout = rstr; - CeedEvalMode emode; - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qffields[i], &emode)); - switch (emode) { + rstr_out = rstr; + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); + switch (e_mode) { case CEED_EVAL_NONE: case CEED_EVAL_INTERP: - CeedCallBackend(CeedRealloc(numemodeout + 1, &emodeout)); - emodeout[numemodeout] = emode; - numemodeout += 1; + CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); + e_mode_out[num_e_mode_out] = e_mode; + num_e_mode_out += 1; break; case CEED_EVAL_GRAD: - CeedCallBackend(CeedRealloc(numemodeout + dim, &emodeout)); - for (CeedInt d = 0; d < dim; d++) emodeout[numemodeout + d] = emode; - numemodeout += dim; + CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); + for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; + num_e_mode_out += dim; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ 
-747,153 +767,157 @@ static inline int CeedOperatorAssembleDiagonalSetup_Sycl(CeedOperator op, const } // Operator data struct - CeedOperator_Sycl *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Sycl *diag = impl->diag; - diag->basisin = basisin; - diag->basisout = basisout; - diag->h_emodein = emodein; - diag->h_emodeout = emodeout; - diag->numemodein = numemodein; - diag->numemodeout = numemodeout; + + diag->basis_in = basis_in; + diag->basis_out = basis_out; + diag->h_e_mode_in = e_mode_in; + diag->h_e_mode_out = e_mode_out; + diag->num_e_mode_in = num_e_mode_in; + diag->num_e_mode_out = num_e_mode_out; // Kernel parameters - CeedInt nnodes, nqpts; - CeedCallBackend(CeedBasisGetNumNodes(basisin, &nnodes)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basisin, &nqpts)); - diag->nnodes = nnodes; - diag->nqpts = nqpts; - diag->ncomp = ncomp; + CeedInt num_nodes, num_qpts; + CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + diag->num_nodes = num_nodes; + diag->num_qpts = num_qpts; + diag->num_comp = num_comp; // Basis matrices - const CeedInt iLen = nqpts * nnodes; - const CeedInt gLen = nqpts * nnodes * dim; - const CeedScalar *interpin, *interpout, *gradin, *gradout; + const CeedInt i_len = num_qpts * num_nodes; + const CeedInt g_len = num_qpts * num_nodes * dim; + const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; // CEED_EVAL_NONE - CeedScalar *identity = NULL; - bool evalNone = false; - for (CeedInt i = 0; i < numemodein; i++) evalNone = evalNone || (emodein[i] == CEED_EVAL_NONE); - for (CeedInt i = 0; i < numemodeout; i++) evalNone = evalNone || (emodeout[i] == CEED_EVAL_NONE); + CeedScalar *identity = NULL; + bool has_eval_none = false; + for (CeedInt i = 0; i < num_e_mode_in; i++) has_eval_none = has_eval_none || 
(e_mode_in[i] == CEED_EVAL_NONE); + for (CeedInt i = 0; i < num_e_mode_out; i++) has_eval_none = has_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); // Order queue sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); std::vector copy_events; - if (evalNone) { - CeedCallBackend(CeedCalloc(nqpts * nnodes, &identity)); - for (CeedSize i = 0; i < (nnodes < nqpts ? nnodes : nqpts); i++) identity[i * nnodes + i] = 1.0; - CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event identity_copy = sycl_data->sycl_queue.copy(identity, diag->d_identity, iLen, {e}); + if (has_eval_none) { + CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity)); + for (CeedSize i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; + CeedCallSycl(ceed, diag->d_identity = sycl::malloc_device(i_len, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event identity_copy = sycl_data->sycl_queue.copy(identity, diag->d_identity, i_len, {e}); copy_events.push_back(identity_copy); } // CEED_EVAL_INTERP - CeedCallBackend(CeedBasisGetInterp(basisin, &interpin)); - CeedCallSycl(ceed, diag->d_interpin = sycl::malloc_device(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event interpin_copy = sycl_data->sycl_queue.copy(interpin, diag->d_interpin, iLen, {e}); - copy_events.push_back(interpin_copy); + CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); + CeedCallSycl(ceed, diag->d_interp_in = sycl::malloc_device(i_len, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event interp_in_copy = sycl_data->sycl_queue.copy(interp_in, diag->d_interp_in, i_len, {e}); + copy_events.push_back(interp_in_copy); - CeedCallBackend(CeedBasisGetInterp(basisout, &interpout)); - CeedCallSycl(ceed, diag->d_interpout = sycl::malloc_device(iLen, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event interpout_copy = 
sycl_data->sycl_queue.copy(interpout, diag->d_interpout, iLen, {e}); - copy_events.push_back(interpout_copy); + CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); + CeedCallSycl(ceed, diag->d_interp_out = sycl::malloc_device(i_len, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event interp_out_copy = sycl_data->sycl_queue.copy(interp_out, diag->d_interp_out, i_len, {e}); + copy_events.push_back(interp_out_copy); // CEED_EVAL_GRAD - CeedCallBackend(CeedBasisGetGrad(basisin, &gradin)); - CeedCallSycl(ceed, diag->d_gradin = sycl::malloc_device(gLen, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event gradin_copy = sycl_data->sycl_queue.copy(gradin, diag->d_gradin, gLen, {e}); - copy_events.push_back(gradin_copy); + CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); + CeedCallSycl(ceed, diag->d_grad_in = sycl::malloc_device(g_len, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event grad_in_copy = sycl_data->sycl_queue.copy(grad_in, diag->d_grad_in, g_len, {e}); + copy_events.push_back(grad_in_copy); - CeedCallBackend(CeedBasisGetGrad(basisout, &gradout)); - CeedCallSycl(ceed, diag->d_gradout = sycl::malloc_device(gLen, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event gradout_copy = sycl_data->sycl_queue.copy(gradout, diag->d_gradout, gLen, {e}); - copy_events.push_back(gradout_copy); + CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); + CeedCallSycl(ceed, diag->d_grad_out = sycl::malloc_device(g_len, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event grad_out_copy = sycl_data->sycl_queue.copy(grad_out, diag->d_grad_out, g_len, {e}); + copy_events.push_back(grad_out_copy); - // Arrays of emodes - CeedCallSycl(ceed, diag->d_emodein = sycl::malloc_device(numemodein, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event emodein_copy = sycl_data->sycl_queue.copy(emodein, diag->d_emodein, numemodein, {e}); - copy_events.push_back(emodein_copy); + // Arrays of e_modes + 
CeedCallSycl(ceed, diag->d_e_mode_in = sycl::malloc_device(num_e_mode_in, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event e_mode_in_copy = sycl_data->sycl_queue.copy(e_mode_in, diag->d_e_mode_in, num_e_mode_in, {e}); + copy_events.push_back(e_mode_in_copy); - CeedCallSycl(ceed, diag->d_emodeout = sycl::malloc_device(numemodeout, sycl_data->sycl_device, sycl_data->sycl_context)); - sycl::event emodeout_copy = sycl_data->sycl_queue.copy(emodeout, diag->d_emodeout, numemodeout, {e}); - copy_events.push_back(emodeout_copy); + CeedCallSycl(ceed, diag->d_e_mode_out = sycl::malloc_device(num_e_mode_out, sycl_data->sycl_device, sycl_data->sycl_context)); + sycl::event e_mode_out_copy = sycl_data->sycl_queue.copy(e_mode_out, diag->d_e_mode_out, num_e_mode_out, {e}); + copy_events.push_back(e_mode_out_copy); // Restriction - diag->diagrstr = rstrout; + diag->diag_rstr = rstr_out; // Wait for all copies to complete and handle exceptions CeedCallSycl(ceed, sycl::event::wait_and_throw(copy_events)); - return CEED_ERROR_SUCCESS; } //------------------------------------------------------------------------------ // Kernel for diagonal assembly //------------------------------------------------------------------------------ -static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool pointBlock, const CeedInt nelem, const CeedOperatorDiag_Sycl *diag, - const CeedScalar *assembledqfarray, CeedScalar *elemdiagarray) { - const CeedSize nnodes = diag->nnodes; - const CeedSize nqpts = diag->nqpts; - const CeedSize ncomp = diag->ncomp; - const CeedSize numemodein = diag->numemodein; - const CeedSize numemodeout = diag->numemodeout; - - const CeedScalar *identity = diag->d_identity; - const CeedScalar *interpin = diag->d_interpin; - const CeedScalar *gradin = diag->d_gradin; - const CeedScalar *interpout = diag->d_interpout; - const CeedScalar *gradout = diag->d_gradout; - const CeedEvalMode *emodein = diag->d_emodein; - const CeedEvalMode *emodeout = 
diag->d_emodeout; - - sycl::range<1> kernel_range(nelem * nnodes); +static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool is_point_block, const CeedInt num_elem, + const CeedOperatorDiag_Sycl *diag, const CeedScalar *assembled_qf_array, CeedScalar *elem_diag_array) { + const CeedSize num_nodes = diag->num_nodes; + const CeedSize num_qpts = diag->num_qpts; + const CeedSize num_comp = diag->num_comp; + const CeedSize num_e_mode_in = diag->num_e_mode_in; + const CeedSize num_e_mode_out = diag->num_e_mode_out; + const CeedScalar *identity = diag->d_identity; + const CeedScalar *interp_in = diag->d_interp_in; + const CeedScalar *grad_in = diag->d_grad_in; + const CeedScalar *interp_out = diag->d_interp_out; + const CeedScalar *grad_out = diag->d_grad_out; + const CeedEvalMode *e_mode_in = diag->d_e_mode_in; + const CeedEvalMode *e_mode_out = diag->d_e_mode_out; + + sycl::range<1> kernel_range(num_elem * num_nodes); // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); sycl_queue.parallel_for(kernel_range, {e}, [=](sycl::id<1> idx) { - const CeedInt tid = idx % nnodes; - const CeedInt e = idx / nnodes; + const CeedInt tid = idx % num_nodes; + const CeedInt e = idx / num_nodes; // Compute the diagonal of B^T D B // Each element - CeedInt dout = -1; + CeedInt d_out = -1; // Each basis eval mode pair - for (CeedSize eout = 0; eout < numemodeout; eout++) { + for (CeedSize e_out = 0; e_out < num_e_mode_out; e_out++) { const CeedScalar *bt = NULL; - if (emodeout[eout] == CEED_EVAL_GRAD) ++dout; - CeedOperatorGetBasisPointer_Sycl(&bt, emodeout[eout], identity, interpout, &gradout[dout * nqpts * nnodes]); - CeedInt din = -1; - for (CeedSize ein = 0; ein < numemodein; ein++) { + + if (e_mode_out[e_out] == CEED_EVAL_GRAD) ++d_out; + CeedOperatorGetBasisPointer_Sycl(&bt, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * num_qpts * num_nodes]); + CeedInt d_in = -1; + + for (CeedSize e_in = 0; e_in < num_e_mode_in; e_in++) { const 
CeedScalar *b = NULL; - if (emodein[ein] == CEED_EVAL_GRAD) ++din; - CeedOperatorGetBasisPointer_Sycl(&b, emodein[ein], identity, interpin, &gradin[din * nqpts * nnodes]); + + if (e_mode_in[e_in] == CEED_EVAL_GRAD) ++d_in; + CeedOperatorGetBasisPointer_Sycl(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * num_qpts * num_nodes]); // Each component - for (CeedSize compOut = 0; compOut < ncomp; compOut++) { + for (CeedSize comp_out = 0; comp_out < num_comp; comp_out++) { // Each qpoint/node pair - if (pointBlock) { + if (is_point_block) { // Point Block Diagonal - for (CeedInt compIn = 0; compIn < ncomp; compIn++) { - CeedScalar evalue = 0.0; - for (CeedSize q = 0; q < nqpts; q++) { - const CeedScalar qfvalue = - assembledqfarray[((((ein * ncomp + compIn) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; - evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; + for (CeedInt comp_in = 0; comp_in < num_comp; comp_in++) { + CeedScalar e_value = 0.0; + + for (CeedSize q = 0; q < num_qpts; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * num_comp + comp_in) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * num_qpts + + q]; + + e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid]; } - elemdiagarray[((compOut * ncomp + compIn) * nelem + e) * nnodes + tid] += evalue; + elem_diag_array[((comp_out * num_comp + comp_in) * num_elem + e) * num_nodes + tid] += e_value; } } else { // Diagonal Only - CeedScalar evalue = 0.0; - for (CeedSize q = 0; q < nqpts; q++) { - const CeedScalar qfvalue = - assembledqfarray[((((ein * ncomp + compOut) * numemodeout + eout) * ncomp + compOut) * nelem + e) * nqpts + q]; - evalue += bt[q * nnodes + tid] * qfvalue * b[q * nnodes + tid]; + CeedScalar e_value = 0.0; + + for (CeedSize q = 0; q < num_qpts; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * num_comp + comp_out) * num_e_mode_out + e_out) * num_comp + comp_out) * num_elem + e) * 
num_qpts + q]; + e_value += bt[q * num_nodes + tid] * qf_value * b[q * num_nodes + tid]; } - elemdiagarray[(compOut * nelem + e) * nnodes + tid] += evalue; + elem_diag_array[(comp_out * num_elem + e) * num_nodes + tid] += e_value; } } } @@ -905,73 +929,75 @@ static int CeedOperatorLinearDiagonal_Sycl(sycl::queue &sycl_queue, const bool p //------------------------------------------------------------------------------ // Assemble diagonal common code //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool pointBlock) { - Ceed ceed; +static inline int CeedOperatorAssembleDiagonalCore_Sycl(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { + Ceed ceed; + Ceed_Sycl *sycl_data; + CeedInt num_elem; + CeedScalar *elem_diag_array; + const CeedScalar *assembled_qf_array; + CeedVector assembled_qf = NULL; + CeedElemRestriction rstr = NULL; + CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Sycl *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Assemble QFunction - CeedVector assembledqf = NULL; - CeedElemRestriction rstr = NULL; - CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembledqf, &rstr, request)); + CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); // Setup if (!impl->diag) { - CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Sycl(op, pointBlock)); + CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Sycl(op, is_point_block)); } CeedOperatorDiag_Sycl *diag = impl->diag; + assert(diag != NULL); // Restriction - if (pointBlock && !diag->pbdiagrstr) { - CeedElemRestriction pbdiagrstr; - 
CeedCallBackend(CreatePBRestriction(diag->diagrstr, &pbdiagrstr)); - diag->pbdiagrstr = pbdiagrstr; + if (is_point_block && !diag->point_block_diag_rstr) { + CeedElemRestriction point_block_diag_rstr; + CeedCallBackend(CreatePBRestriction(diag->diag_rstr, &point_block_diag_rstr)); + diag->point_block_diag_rstr = point_block_diag_rstr; } - CeedElemRestriction diagrstr = pointBlock ? diag->pbdiagrstr : diag->diagrstr; + CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; // Create diagonal vector - CeedVector elemdiag = pointBlock ? diag->pbelemdiag : diag->elemdiag; - if (!elemdiag) { - CeedCallBackend(CeedElemRestrictionCreateVector(diagrstr, NULL, &elemdiag)); - if (pointBlock) diag->pbelemdiag = elemdiag; - else diag->elemdiag = elemdiag; + CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; + + if (!elem_diag) { + CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); + if (is_point_block) diag->point_block_elem_diag = elem_diag; + else diag->elem_diag = elem_diag; } - CeedCallBackend(CeedVectorSetValue(elemdiag, 0.0)); + CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); // Assemble element operator diagonals - CeedScalar *elemdiagarray; - const CeedScalar *assembledqfarray; - CeedCallBackend(CeedVectorGetArray(elemdiag, CEED_MEM_DEVICE, &elemdiagarray)); - CeedCallBackend(CeedVectorGetArrayRead(assembledqf, CEED_MEM_DEVICE, &assembledqfarray)); - CeedInt nelem; - CeedCallBackend(CeedElemRestrictionGetNumElements(diagrstr, &nelem)); + CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); + CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); // Compute the diagonal of B^T D B // Umesh: This needs to be reviewed later - // if (pointBlock) { - // 
CeedCallBackend(CeedOperatorLinearPointBlockDiagonal_Sycl(sycl_data->sycl_queue, nelem, diag, assembledqfarray, elemdiagarray)); + // if (is_point_block) { + // CeedCallBackend(CeedOperatorLinearPointBlockDiagonal_Sycl(sycl_data->sycl_queue, num_elem, diag, assembled_qf_array, elem_diag_array)); //} else { - CeedCallBackend(CeedOperatorLinearDiagonal_Sycl(sycl_data->sycl_queue, pointBlock, nelem, diag, assembledqfarray, elemdiagarray)); + CeedCallBackend(CeedOperatorLinearDiagonal_Sycl(sycl_data->sycl_queue, is_point_block, num_elem, diag, assembled_qf_array, elem_diag_array)); // } // Wait for queue to complete and handle exceptions sycl_data->sycl_queue.wait_and_throw(); // Restore arrays - CeedCallBackend(CeedVectorRestoreArray(elemdiag, &elemdiagarray)); - CeedCallBackend(CeedVectorRestoreArrayRead(assembledqf, &assembledqfarray)); + CeedCallBackend(CeedVectorRestoreArray(elem_diag, &elem_diag_array)); + CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); // Assemble local operator diagonal - CeedCallBackend(CeedElemRestrictionApply(diagrstr, CEED_TRANSPOSE, elemdiag, assembled, request)); + CeedCallBackend(CeedElemRestrictionApply(diag_rstr, CEED_TRANSPOSE, elem_diag, assembled, request)); // Cleanup - CeedCallBackend(CeedVectorDestroy(&assembledqf)); - + CeedCallBackend(CeedVectorDestroy(&assembled_qf)); return CEED_ERROR_SUCCESS; } @@ -995,52 +1021,54 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Sycl(CeedOperator op, // Single operator assembly setup //------------------------------------------------------------------------------ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { - Ceed ceed; + Ceed ceed; + CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_e_mode_out = 0, + num_B_out_mats_to_load = 0, size_B_out = 0, num_qpts = 0, elem_size = 0, num_elem, num_comp, + mat_start = 0; + CeedEvalMode *eval_mode_in = NULL, 
*eval_mode_out = NULL; + const CeedScalar *interp_in, *grad_in; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *input_fields, *output_fields; + CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Sycl *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); // Get input and output fields - CeedInt num_input_fields, num_output_fields; - CeedOperatorField *input_fields; - CeedOperatorField *output_fields; CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); // Determine active input basis eval mode - CeedQFunction qf; CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedQFunctionField *qf_fields; CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); // Note that the kernel will treat each dimension of a gradient action separately; - // i.e., when an active input has a CEED_EVAL_GRAD mode, num_emode_in will increment by dim. - // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so - // num_B_in_mats_to_load will be incremented by 1. - CeedInt num_emode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0; - CeedEvalMode *eval_mode_in = NULL; // will be of size num_B_in_mats_load - CeedBasis basis_in = NULL; - CeedInt nqpts = 0, esize = 0; - CeedElemRestriction rstr_in = NULL; + // i.e., when an active input has a CEED_EVAL_GRAD mode, num_ e_mode_in will increment by dim. + // However, for the purposes of load_ing the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, + // so num_B_in_mats_to_load will be incremented by 1. 
for (CeedInt i = 0; i < num_input_fields; i++) { - CeedVector vec; + CeedEvalMode eval_mode; + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &nqpts)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &esize)); - CeedEvalMode eval_mode; + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); eval_mode_in[num_B_in_mats_to_load] = eval_mode; num_B_in_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_emode_in += dim; - size_B_in += dim * esize * nqpts; + num_e_mode_in += dim; + size_B_in += dim * elem_size * num_qpts; } else { - num_emode_in += 1; - size_B_in += esize * nqpts; + num_e_mode_in += 1; + size_B_in += elem_size * num_qpts; } } } @@ -1048,12 +1076,10 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { // Determine active output basis; basis_out and rstr_out only used if same as input, TODO CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); - CeedInt num_emode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0; - CeedEvalMode *eval_mode_out = NULL; - CeedBasis basis_out = NULL; - CeedElemRestriction rstr_out = NULL; for (CeedInt i = 0; i < num_output_fields; i++) { - CeedVector vec; + CeedEvalMode eval_mode; + CeedVector vec; + CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); if (vec == CEED_VECTOR_ACTIVE) { CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], 
&basis_out)); @@ -1063,85 +1089,83 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); // LCOV_EXCL_STOP } - CeedEvalMode eval_mode; CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); if (eval_mode != CEED_EVAL_NONE) { CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); eval_mode_out[num_B_out_mats_to_load] = eval_mode; num_B_out_mats_to_load += 1; if (eval_mode == CEED_EVAL_GRAD) { - num_emode_out += dim; - size_B_out += dim * esize * nqpts; + num_e_mode_out += dim; + size_B_out += dim * elem_size * num_qpts; } else { - num_emode_out += 1; - size_B_out += esize * nqpts; + num_e_mode_out += 1; + size_B_out += elem_size * num_qpts; } } } } - if (num_emode_in == 0 || num_emode_out == 0) { + if (num_e_mode_in == 0 || num_e_mode_out == 0) { // LCOV_EXCL_START return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); // LCOV_EXCL_STOP } - CeedInt nelem, ncomp; - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &nelem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &ncomp)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); CeedCallBackend(CeedCalloc(1, &impl->asmb)); CeedOperatorAssemble_Sycl *asmb = impl->asmb; - asmb->nelem = nelem; + asmb->num_elem = num_elem; Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Kernel setup - int elemsPerBlock = 1; - asmb->elemsPerBlock = elemsPerBlock; - CeedInt block_size = esize * esize * elemsPerBlock; + int elem_per_block = 1; + asmb->elem_per_block = elem_per_block; + CeedInt block_size = elem_size * elem_size * elem_per_block; + /* CeedInt maxThreadsPerBlock = sycl_data->sycl_device.get_info(); bool fallback = block_size > maxThreadsPerBlock; asmb->fallback = 
fallback; if (fallback) { // Use fallback kernel with 1D threadblock - block_size = esize * elemsPerBlock; - asmb->block_size_x = esize; + block_size = elem_size * elem_per_block; + asmb->block_size_x = elem_size; asmb->block_size_y = 1; } else { // Use kernel with 2D threadblock - asmb->block_size_x = esize; - asmb->block_size_y = esize; + asmb->block_size_x = elem_size; + asmb->block_size_y = elem_size; }*/ - asmb->block_size_x = esize; - asmb->block_size_y = esize; - asmb->numemodein = num_emode_in; - asmb->numemodeout = num_emode_out; - asmb->nqpts = nqpts; - asmb->nnodes = esize; - asmb->block_size = block_size; - asmb->ncomp = ncomp; + asmb->block_size_x = elem_size; + asmb->block_size_y = elem_size; + asmb->num_e_mode_in = num_e_mode_in; + asmb->num_e_mode_out = num_e_mode_out; + asmb->num_qpts = num_qpts; + asmb->num_nodes = elem_size; + asmb->block_size = block_size; + asmb->num_comp = num_comp; // Build 'full' B matrices (not 1D arrays used for tensor-product matrices - const CeedScalar *interp_in, *grad_in; CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); // Load into B_in, in order that they will be used in eval_mode - CeedInt mat_start = 0; CeedCallSycl(ceed, asmb->d_B_in = sycl::malloc_device(size_B_in, sycl_data->sycl_device, sycl_data->sycl_context)); for (int i = 0; i < num_B_in_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_in[i]; + if (eval_mode == CEED_EVAL_INTERP) { // Order queue sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); - sycl_data->sycl_queue.copy(interp_in, &asmb->d_B_in[mat_start], esize * nqpts, {e}); - mat_start += esize * nqpts; + sycl_data->sycl_queue.copy(interp_in, &asmb->d_B_in[mat_start], elem_size * num_qpts, {e}); + mat_start += elem_size * num_qpts; } else if (eval_mode == CEED_EVAL_GRAD) { // Order queue sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); - sycl_data->sycl_queue.copy(grad_in, 
&asmb->d_B_in[mat_start], dim * esize * nqpts, {e}); - mat_start += dim * esize * nqpts; + sycl_data->sycl_queue.copy(grad_in, &asmb->d_B_in[mat_start], dim * elem_size * num_qpts, {e}); + mat_start += dim * elem_size * num_qpts; } } @@ -1161,16 +1185,17 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { CeedCallSycl(ceed, asmb->d_B_out = sycl::malloc_device(size_B_out, sycl_data->sycl_device, sycl_data->sycl_context)); for (int i = 0; i < num_B_out_mats_to_load; i++) { CeedEvalMode eval_mode = eval_mode_out[i]; + if (eval_mode == CEED_EVAL_INTERP) { // Order queue sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); - sycl_data->sycl_queue.copy(interp_out, &asmb->d_B_out[mat_start], esize * nqpts, {e}); - mat_start += esize * nqpts; + sycl_data->sycl_queue.copy(interp_out, &asmb->d_B_out[mat_start], elem_size * num_qpts, {e}); + mat_start += elem_size * num_qpts; } else if (eval_mode == CEED_EVAL_GRAD) { // Order queue sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); - sycl_data->sycl_queue.copy(grad_out, &asmb->d_B_out[mat_start], dim * esize * nqpts, {e}); - mat_start += dim * esize * nqpts; + sycl_data->sycl_queue.copy(grad_out, &asmb->d_B_out[mat_start], dim * elem_size * num_qpts, {e}); + mat_start += dim * elem_size * num_qpts; } } return CEED_ERROR_SUCCESS; @@ -1183,25 +1208,25 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp CeedScalar *values_array) { // This kernels assumes B_in and B_out have the same number of quadrature points and basis points. 
// TODO: expand to more general cases - CeedOperatorAssemble_Sycl *asmb = impl->asmb; - const CeedInt nelem = asmb->nelem; - const CeedSize nnodes = asmb->nnodes; - const CeedSize ncomp = asmb->ncomp; - const CeedSize nqpts = asmb->nqpts; - const CeedSize numemodein = asmb->numemodein; - const CeedSize numemodeout = asmb->numemodeout; + CeedOperatorAssemble_Sycl *asmb = impl->asmb; + const CeedInt num_elem = asmb->num_elem; + const CeedSize num_nodes = asmb->num_nodes; + const CeedSize num_comp = asmb->num_comp; + const CeedSize num_qpts = asmb->num_qpts; + const CeedSize num_e_mode_in = asmb->num_e_mode_in; + const CeedSize num_e_mode_out = asmb->num_e_mode_out; // Strides for final output ordering, determined by the reference (inference) implementation of the symbolic assembly, slowest --> fastest: element, // comp_in, comp_out, node_row, node_col - const CeedSize comp_out_stride = nnodes * nnodes; - const CeedSize comp_in_stride = comp_out_stride * ncomp; - const CeedSize e_stride = comp_in_stride * ncomp; - // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedSize qe_stride = nqpts; - const CeedSize qcomp_out_stride = nelem * qe_stride; - const CeedSize qemode_out_stride = qcomp_out_stride * ncomp; - const CeedSize qcomp_in_stride = qemode_out_stride * numemodeout; - const CeedSize qemode_in_stride = qcomp_in_stride * ncomp; + const CeedSize comp_out_stride = num_nodes * num_nodes; + const CeedSize comp_in_stride = comp_out_stride * num_comp; + const CeedSize e_stride = comp_in_stride * num_comp; + // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt + const CeedSize q_e_stride = num_qpts; + const CeedSize q_comp_out_stride = num_elem * q_e_stride; + const CeedSize q_e_mode_out_stride = q_comp_out_stride * num_comp; + const CeedSize q_comp_in_stride = q_e_mode_out_stride * num_e_mode_out; + const CeedSize q_e_mode_in_stride = q_comp_in_stride * num_comp; 
CeedScalar *B_in, *B_out; B_in = asmb->d_B_in; @@ -1209,7 +1234,7 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp const CeedInt block_size_x = asmb->block_size_x; const CeedInt block_size_y = asmb->block_size_y; - sycl::range<3> kernel_range(nelem, block_size_y, block_size_x); + sycl::range<3> kernel_range(num_elem, block_size_y, block_size_x); // Order queue sycl::event e = sycl_queue.ext_oneapi_submit_barrier(); @@ -1218,27 +1243,30 @@ static int CeedOperatorLinearAssemble_Sycl(sycl::queue &sycl_queue, const CeedOp const int l = idx.get(1); // The output column index of each B^TDB operation const int i = idx.get(2); // The output row index of each B^TDB operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il - for (CeedSize comp_in = 0; comp_in < ncomp; comp_in++) { - for (CeedSize comp_out = 0; comp_out < ncomp; comp_out++) { + for (CeedSize comp_in = 0; comp_in < num_comp; comp_in++) { + for (CeedSize comp_out = 0; comp_out < num_comp; comp_out++) { CeedScalar result = 0.0; - CeedSize qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; - for (CeedSize emode_in = 0; emode_in < numemodein; emode_in++) { - CeedSize b_in_index = emode_in * nqpts * nnodes; - for (CeedSize emode_out = 0; emode_out < numemodeout; emode_out++) { - CeedSize b_out_index = emode_out * nqpts * nnodes; - CeedSize qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + CeedSize qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; + + for (CeedSize e_mode_in = 0; e_mode_in < num_e_mode_in; e_mode_in++) { + CeedSize b_in_index = e_mode_in * num_qpts * num_nodes; + + for (CeedSize e_mode_out = 0; e_mode_out < num_e_mode_out; e_mode_out++) { + CeedSize b_out_index = e_mode_out * num_qpts * num_nodes; + CeedSize qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; + // Perform the B^T D B operation for this 
'chunk' of D (the qf_array) - for (CeedSize j = 0; j < nqpts; j++) { - result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l]; + for (CeedSize j = 0; j < num_qpts; j++) { + result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l]; } - } // end of emode_out - } // end of emode_in - CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; + } // end of e_mode_out + } // end of e_mode_in + CeedSize val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l; + values_array[val_index] = result; } // end of out component } // end of in component }); - return CEED_ERROR_SUCCESS; } @@ -1251,35 +1279,35 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. // TODO: expand to more general cases CeedOperatorAssemble_Sycl *asmb = impl->asmb; - const CeedInt nelem = asmb->nelem; - const CeedInt nnodes = asmb->nnodes; - const CeedInt ncomp = asmb->ncomp; - const CeedInt nqpts = asmb->nqpts; - const CeedInt numemodein = asmb->numemodein; - const CeedInt numemodeout = asmb->numemodeout; + const CeedInt num_elem = asmb->num_elem; + const CeedInt num_nodes = asmb->num_nodes; + const CeedInt num_comp = asmb->num_comp; + const CeedInt num_qpts = asmb->num_qpts; + const CeedInt num_e_mode_in = asmb->num_e_mode_in; + const CeedInt num_e_mode_out = asmb->num_e_mode_out; // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: elememt, // comp_in, comp_out, node_row, node_col - const CeedInt comp_out_stride = nnodes * nnodes; - const CeedInt comp_in_stride = comp_out_stride * ncomp; - const CeedInt e_stride = comp_in_stride * ncomp; - // Strides for QF array, slowest --> fastest: 
emode_in, comp_in, emode_out, comp_out, elem, qpt - const CeedInt qe_stride = nqpts; - const CeedInt qcomp_out_stride = nelem * qe_stride; - const CeedInt qemode_out_stride = qcomp_out_stride * ncomp; - const CeedInt qcomp_in_stride = qemode_out_stride * numemodeout; - const CeedInt qemode_in_stride = qcomp_in_stride * ncomp; + const CeedInt comp_out_stride = num_nodes * num_nodes; + const CeedInt comp_in_stride = comp_out_stride * num_comp; + const CeedInt e_stride = comp_in_stride * num_comp; + // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt + const CeedInt q_e_stride = num_qpts; + const CeedInt q_comp_out_stride = num_elem * q_e_stride; + const CeedInt q_e_mode_out_stride = q_comp_out_stride * num_comp; + const CeedInt q_comp_in_stride = q_e_mode_out_stride * num_e_mode_out; + const CeedInt q_e_mode_in_stride = q_comp_in_stride * num_comp; CeedScalar *B_in, *B_out; B_in = asmb->d_B_in; B_out = asmb->d_B_out; - const CeedInt elemsPerBlock = asmb->elemsPerBlock; + const CeedInt elem_per_block = asmb->elem_per_block; const CeedInt block_size_x = asmb->block_size_x; const CeedInt block_size_y = asmb->block_size_y; // This will be 1 for the fallback kernel - const CeedInt grid = nelem / elemsPerBlock + ((nelem / elemsPerBlock * elemsPerBlock < nelem) ? 1 : 0); - sycl::range<3> local_range(block_size_x, block_size_y, elemsPerBlock); - sycl::range<3> global_range(grid * block_size_x, block_size_y, elemsPerBlock); + const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 
1 : 0); + sycl::range<3> local_range(block_size_x, block_size_y, elem_per_block); + sycl::range<3> global_range(grid * block_size_x, block_size_y, elem_per_block); sycl::nd_range<3> kernel_range(global_range, local_range); sycl_queue.parallel_for(kernel_range, [=](sycl::nd_item<3> work_item) { @@ -1291,24 +1319,24 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons const int l = threadIdx; // The output column index of each B^TDB operation // such that we have (Bout^T)_ij D_jk Bin_kl = C_il - for (CeedInt e = blockIdx * blockDimz + threadIdz; e < nelem; e += gridDimx * blockDimz) { - for (CeedInt comp_in = 0; comp_in < ncomp; comp_in++) { - for (CeedInt comp_out = 0; comp_out < ncomp; comp_out++) { - for (CeedInt i = 0; i < nnodes; i++) { + for (CeedInt e = blockIdx * blockDimz + threadIdz; e < num_elem; e += gridDimx * blockDimz) { + for (CeedInt comp_in = 0; comp_in < num_comp; comp_in++) { + for (CeedInt comp_out = 0; comp_out < num_comp; comp_out++) { + for (CeedInt i = 0; i < num_nodes; i++) { CeedScalar result = 0.0; - CeedInt qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; - for (CeedInt emode_in = 0; emode_in < numemodein; emode_in++) { - CeedInt b_in_index = emode_in * nqpts * nnodes; - for (CeedInt emode_out = 0; emode_out < numemodeout; emode_out++) { - CeedInt b_out_index = emode_out * nqpts * nnodes; - CeedInt qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + CeedInt qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; + for (CeedInt e_mode_in = 0; e_mode_in < num_e_mode_in; e_mode_in++) { + CeedInt b_in_index = e_mode_in * num_qpts * num_nodes; + for (CeedInt e_mode_out = 0; e_mode_out < num_e_mode_out; e_mode_out++) { + CeedInt b_out_index = e_mode_out * num_qpts * num_nodes; + CeedInt qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; // Perform the B^T D B 
operation for this 'chunk' of D (the qf_array) - for (CeedInt j = 0; j < nqpts; j++) { - result += B_out[b_out_index + j * nnodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * nnodes + l]; + for (CeedInt j = 0; j < num_qpts; j++) { + result += B_out[b_out_index + j * num_nodes + i] * qf_array[qf_index + j] * B_in[b_in_index + j * num_nodes + l]; } - } // end of emode_out - } // end of emode_in - CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + nnodes * i + l; + } // end of e_mode_out + } // end of e_mode_in + CeedInt val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + num_nodes * i + l; values_array[val_index] = result; } // end of loop over element node index, i } // end of out component @@ -1327,11 +1355,16 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons // TODO: allow multiple active input restrictions/basis objects //------------------------------------------------------------------------------ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, CeedVector values) { - Ceed ceed; + Ceed ceed; + Ceed_Sycl *sycl_data; + CeedScalar *values_array; + const CeedScalar *qf_array; + CeedVector assembled_qf = NULL; + CeedElemRestriction rstr_q = NULL; + CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedOperator_Sycl *impl; CeedCallBackend(CeedOperatorGetData(op, &impl)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Setup @@ -1341,14 +1374,10 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed } // Assemble QFunction - CeedVector assembled_qf = NULL; - CeedElemRestriction rstr_q = NULL; CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); - CeedScalar *values_array; CeedCallBackend(CeedVectorGetArrayWrite(values, 
CEED_MEM_DEVICE, &values_array)); values_array += offset; - const CeedScalar *qf_array; CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); // Compute B^T D B @@ -1364,7 +1393,6 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed // Cleanup CeedCallBackend(CeedVectorDestroy(&assembled_qf)); - return CEED_ERROR_SUCCESS; } @@ -1372,10 +1400,11 @@ static int CeedSingleOperatorAssemble_Sycl(CeedOperator op, CeedInt offset, Ceed // Create operator //------------------------------------------------------------------------------ int CeedOperatorCreate_Sycl(CeedOperator op) { - Ceed ceed; - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + Ceed ceed; CeedOperator_Sycl *impl; + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedOperatorSetData(op, impl)); diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp index 35b3699518..bcdc1a6e5f 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction-load.sycl.cpp @@ -27,23 +27,26 @@ // TODO: Refactor //------------------------------------------------------------------------------ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { + Ceed ceed; + Ceed_Sycl* data; + char * qfunction_name, *qfunction_source, *read_write_kernel_path, *read_write_kernel_source; + CeedInt num_input_fields, num_output_fields; + CeedQFunctionField *input_fields, *output_fields; CeedQFunction_Sycl* impl; + CeedCallBackend(CeedQFunctionGetData(qf, (void**)&impl)); // QFunction is built if (impl->QFunction) return CEED_ERROR_SUCCESS; - Ceed ceed; CeedQFunctionGetCeed(qf, &ceed); - Ceed_Sycl* data; CeedCallBackend(CeedGetData(ceed, &data)); // QFunction kernel generation - CeedInt num_input_fields, num_output_fields; - CeedQFunctionField *input_fields, *output_fields; 
CeedCallBackend(CeedQFunctionGetFields(qf, &num_input_fields, &input_fields, &num_output_fields, &output_fields)); std::vector input_sizes(num_input_fields); CeedQFunctionField* input_i = input_fields; + for (auto& size_i : input_sizes) { CeedCallBackend(CeedQFunctionFieldGetSize(*input_i, &size_i)); ++input_i; @@ -51,23 +54,20 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { std::vector output_sizes(num_output_fields); CeedQFunctionField* output_i = output_fields; + for (auto& size_i : output_sizes) { CeedCallBackend(CeedQFunctionFieldGetSize(*output_i, &size_i)); ++output_i; } - char* qfunction_name; CeedCallBackend(CeedQFunctionGetKernelName(qf, &qfunction_name)); - char* qfunction_source; CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source -----\n"); CeedCallBackend(CeedQFunctionLoadSourceToBuffer(qf, &qfunction_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction User Source Complete! -----\n"); - char* read_write_kernel_path; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-ref-qfunction.h", &read_write_kernel_path)); - char* read_write_kernel_source; CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, read_write_kernel_path, &read_write_kernel_source)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading QFunction Read/Write Kernel Source Complete! 
-----\n"); @@ -169,7 +169,6 @@ extern "C" int CeedQFunctionBuildKernel_Sycl(CeedQFunction qf) { CeedCallBackend(CeedFree(&qfunction_source)); CeedCallBackend(CeedFree(&read_write_kernel_path)); CeedCallBackend(CeedFree(&read_write_kernel_source)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp index 10e090b58c..3bf1328e30 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunction.sycl.cpp @@ -24,18 +24,20 @@ // Apply QFunction //------------------------------------------------------------------------------ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, CeedVector *V) { + Ceed ceed; + Ceed_Sycl *ceed_Sycl; + void *context_data; + CeedInt num_input_fields, num_output_fields; CeedQFunction_Sycl *impl; + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); // Build and compile kernel, if not done if (!impl->QFunction) CeedCallBackend(CeedQFunctionBuildKernel_Sycl(qf)); - Ceed ceed; CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - Ceed_Sycl *ceed_Sycl; CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); - CeedInt num_input_fields, num_output_fields; CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); // Read vectors @@ -54,7 +56,6 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C } // Get context data - void *context_data; CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &context_data)); // Order queue @@ -101,7 +102,6 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C // Restore context CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &context_data)); - return CEED_ERROR_SUCCESS; } @@ -109,17 +109,14 @@ static int CeedQFunctionApply_Sycl(CeedQFunction qf, CeedInt Q, CeedVector *U, C // Destroy QFunction 
//------------------------------------------------------------------------------ static int CeedQFunctionDestroy_Sycl(CeedQFunction qf) { + Ceed ceed; CeedQFunction_Sycl *impl; - CeedCallBackend(CeedQFunctionGetData(qf, &impl)); - Ceed ceed; + CeedCallBackend(CeedQFunctionGetData(qf, &impl)); CeedCallBackend(CeedQFunctionGetCeed(qf, &ceed)); - delete impl->QFunction; delete impl->sycl_module; - CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -127,13 +124,12 @@ static int CeedQFunctionDestroy_Sycl(CeedQFunction qf) { // Create QFunction //------------------------------------------------------------------------------ int CeedQFunctionCreate_Sycl(CeedQFunction qf) { - Ceed ceed; - CeedQFunctionGetCeed(qf, &ceed); + Ceed ceed; CeedQFunction_Sycl *impl; + CeedQFunctionGetCeed(qf, &ceed); CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionSetData(qf, impl)); - // Register backend functions CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Apply", CeedQFunctionApply_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunction", qf, "Destroy", CeedQFunctionDestroy_Sycl)); diff --git a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp index 2f5a4e634e..4c5257d3e4 100644 --- a/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref-qfunctioncontext.sycl.cpp @@ -17,11 +17,13 @@ // Sync host to device //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSyncH2D_Sycl(const CeedQFunctionContext ctx) { + Ceed ceed; + Ceed_Sycl *sycl_data; + size_t ctx_size; CeedQFunctionContext_Sycl *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); if (!impl->h_data) { @@ -30,22 +32,20 @@ static inline 
int CeedQFunctionContextSyncH2D_Sycl(const CeedQFunctionContext ct // LCOV_EXCL_STOP } - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); if (impl->d_data_borrowed) { impl->d_data = impl->d_data_borrowed; } else if (impl->d_data_owned) { impl->d_data = impl->d_data_owned; } else { - CeedCallSycl(ceed, impl->d_data_owned = sycl::malloc_device(ctxsize, sycl_data->sycl_device, sycl_data->sycl_context)); + CeedCallSycl(ceed, impl->d_data_owned = sycl::malloc_device(ctx_size, sycl_data->sycl_device, sycl_data->sycl_context)); impl->d_data = impl->d_data_owned; } // Order queue sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); - sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, impl->h_data, ctxsize, {e}); + sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, impl->h_data, ctx_size, {e}); CeedCallSycl(ceed, copy_event.wait_and_throw()); - return CEED_ERROR_SUCCESS; } @@ -53,11 +53,13 @@ static inline int CeedQFunctionContextSyncH2D_Sycl(const CeedQFunctionContext ct // Sync device to host //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSyncD2H_Sycl(const CeedQFunctionContext ctx) { + Ceed ceed; + Ceed_Sycl *sycl_data; + size_t ctx_size; CeedQFunctionContext_Sycl *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); if (!impl->d_data) { @@ -66,23 +68,21 @@ static inline int CeedQFunctionContextSyncD2H_Sycl(const CeedQFunctionContext ct // LCOV_EXCL_STOP } - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); if (impl->h_data_borrowed) { impl->h_data = impl->h_data_borrowed; } else if 
(impl->h_data_owned) { impl->h_data = impl->h_data_owned; } else { - CeedCallBackend(CeedMallocArray(1, ctxsize, &impl->h_data_owned)); + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->h_data_owned)); impl->h_data = impl->h_data_owned; } // Order queue sycl::event e = sycl_data->sycl_queue.ext_oneapi_submit_barrier(); - sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->h_data, impl->d_data, ctxsize, {e}); + sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->h_data, impl->d_data, ctx_size, {e}); CeedCallSycl(ceed, copy_event.wait_and_throw()); - return CEED_ERROR_SUCCESS; } @@ -104,11 +104,10 @@ static inline int CeedQFunctionContextSync_Sycl(const CeedQFunctionContext ctx, //------------------------------------------------------------------------------ static inline int CeedQFunctionContextSetAllInvalid_Sycl(const CeedQFunctionContext ctx) { CeedQFunctionContext_Sycl *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); impl->h_data = NULL; impl->d_data = NULL; - return CEED_ERROR_SUCCESS; } @@ -117,10 +116,9 @@ static inline int CeedQFunctionContextSetAllInvalid_Sycl(const CeedQFunctionCont //------------------------------------------------------------------------------ static inline int CeedQFunctionContextHasValidData_Sycl(const CeedQFunctionContext ctx, bool *has_valid_data) { CeedQFunctionContext_Sycl *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); *has_valid_data = impl && (impl->h_data || impl->d_data); - return CEED_ERROR_SUCCESS; } @@ -130,8 +128,8 @@ static inline int CeedQFunctionContextHasValidData_Sycl(const CeedQFunctionConte static inline int CeedQFunctionContextHasBorrowedDataOfType_Sycl(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *has_borrowed_data_of_type) { CeedQFunctionContext_Sycl *impl; - 
CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_data_of_type = impl->h_data_borrowed; @@ -140,7 +138,6 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Sycl(const CeedQFunc *has_borrowed_data_of_type = impl->d_data_borrowed; break; } - return CEED_ERROR_SUCCESS; } @@ -148,10 +145,10 @@ static inline int CeedQFunctionContextHasBorrowedDataOfType_Sycl(const CeedQFunc // Check if data of given type needs sync //------------------------------------------------------------------------------ static inline int CeedQFunctionContextNeedSync_Sycl(const CeedQFunctionContext ctx, CeedMemType mem_type, bool *need_sync) { + bool has_valid_data = true; CeedQFunctionContext_Sycl *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - bool has_valid_data = true; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedQFunctionContextHasValidData(ctx, &has_valid_data)); switch (mem_type) { case CEED_MEM_HOST: @@ -161,7 +158,6 @@ static inline int CeedQFunctionContextNeedSync_Sycl(const CeedQFunctionContext c *need_sync = has_valid_data && !impl->d_data; break; } - return CEED_ERROR_SUCCESS; } @@ -170,17 +166,18 @@ static inline int CeedQFunctionContextNeedSync_Sycl(const CeedQFunctionContext c //------------------------------------------------------------------------------ static int CeedQFunctionContextSetDataHost_Sycl(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { CeedQFunctionContext_Sycl *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedFree(&impl->h_data_owned)); switch (copy_mode) { case CEED_COPY_VALUES: - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - CeedCallBackend(CeedMallocArray(1, ctxsize, 
&impl->h_data_owned)); + size_t ctx_size; + + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + CeedCallBackend(CeedMallocArray(1, ctx_size, &impl->h_data_owned)); impl->h_data_borrowed = NULL; impl->h_data = impl->h_data_owned; - memcpy(impl->h_data, data, ctxsize); + memcpy(impl->h_data, data, ctx_size); break; case CEED_OWN_POINTER: impl->h_data_owned = data; @@ -192,7 +189,6 @@ static int CeedQFunctionContextSetDataHost_Sycl(const CeedQFunctionContext ctx, impl->h_data = data; break; } - return CEED_ERROR_SUCCESS; } @@ -200,11 +196,12 @@ static int CeedQFunctionContextSetDataHost_Sycl(const CeedQFunctionContext ctx, // Set data from device //------------------------------------------------------------------------------ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx, const CeedCopyMode copy_mode, void *data) { + Ceed ceed; + Ceed_Sycl *sycl_data; CeedQFunctionContext_Sycl *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Order queue @@ -219,12 +216,13 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx switch (copy_mode) { case CEED_COPY_VALUES: { - size_t ctxsize; - CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctxsize)); - CeedCallSycl(ceed, impl->d_data_owned = sycl::malloc_device(ctxsize, sycl_data->sycl_device, sycl_data->sycl_context)); + size_t ctx_size; + + CeedCallBackend(CeedQFunctionContextGetContextSize(ctx, &ctx_size)); + CeedCallSycl(ceed, impl->d_data_owned = sycl::malloc_device(ctx_size, sycl_data->sycl_device, sycl_data->sycl_context)); impl->d_data_borrowed = NULL; impl->d_data = impl->d_data_owned; - sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, data, ctxsize, {e}); + sycl::event copy_event = sycl_data->sycl_queue.memcpy(impl->d_data, data, ctx_size, {e}); 
CeedCallSycl(ceed, copy_event.wait_and_throw()); } break; case CEED_OWN_POINTER: { @@ -238,7 +236,6 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx impl->d_data = data; } break; } - return CEED_ERROR_SUCCESS; } @@ -248,8 +245,8 @@ static int CeedQFunctionContextSetDataDevice_Sycl(const CeedQFunctionContext ctx //------------------------------------------------------------------------------ static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, const CeedCopyMode copy_mode, void *data) { Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextSetAllInvalid_Sycl(ctx)); switch (mem_type) { case CEED_MEM_HOST: @@ -257,7 +254,6 @@ static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, cons case CEED_MEM_DEVICE: return CeedQFunctionContextSetDataDevice_Sycl(ctx, copy_mode, data); } - return CEED_ERROR_UNSUPPORTED; } @@ -265,19 +261,19 @@ static int CeedQFunctionContextSetData_Sycl(const CeedQFunctionContext ctx, cons // Take data //------------------------------------------------------------------------------ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; + Ceed_Sycl *ceedSycl; + bool need_sync = false; CeedQFunctionContext_Sycl *impl; - CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - Ceed_Sycl *ceedSycl; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); CeedCallBackend(CeedGetData(ceed, &ceedSycl)); // Order queue ceedSycl->sycl_queue.ext_oneapi_submit_barrier(); // Sync data to requested mem_type - bool need_sync = false; CeedCallBackend(CeedQFunctionContextNeedSync_Sycl(ctx, mem_type, &need_sync)); if 
(need_sync) CeedCallBackend(CeedQFunctionContextSync_Sycl(ctx, mem_type)); @@ -294,7 +290,6 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con impl->d_data = NULL; break; } - return CEED_ERROR_SUCCESS; } @@ -303,13 +298,14 @@ static int CeedQFunctionContextTakeData_Sycl(const CeedQFunctionContext ctx, con // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ static int CeedQFunctionContextGetDataCore_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; + bool need_sync = false; CeedQFunctionContext_Sycl *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); // Sync data to requested mem_type - bool need_sync = false; CeedCallBackend(CeedQFunctionContextNeedSync_Sycl(ctx, mem_type, &need_sync)); if (need_sync) CeedCallBackend(CeedQFunctionContextSync_Sycl(ctx, mem_type)); @@ -322,7 +318,6 @@ static int CeedQFunctionContextGetDataCore_Sycl(const CeedQFunctionContext ctx, *(void **)data = impl->d_data; break; } - return CEED_ERROR_SUCCESS; } @@ -337,11 +332,11 @@ static int CeedQFunctionContextGetDataRead_Sycl(const CeedQFunctionContext ctx, // Get read/write access to the data //------------------------------------------------------------------------------ static int CeedQFunctionContextGetData_Sycl(const CeedQFunctionContext ctx, const CeedMemType mem_type, void *data) { + Ceed ceed; CeedQFunctionContext_Sycl *impl; + CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - Ceed ceed; CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); - CeedCallBackend(CeedQFunctionContextGetDataCore_Sycl(ctx, mem_type, data)); // Mark only pointer for requested memory as valid @@ -354,7 +349,6 @@ static int CeedQFunctionContextGetData_Sycl(const 
CeedQFunctionContext ctx, cons impl->d_data = *(void **)data; break; } - return CEED_ERROR_SUCCESS; } @@ -362,11 +356,12 @@ static int CeedQFunctionContextGetData_Sycl(const CeedQFunctionContext ctx, cons // Destroy the user context //------------------------------------------------------------------------------ static int CeedQFunctionContextDestroy_Sycl(const CeedQFunctionContext ctx) { - Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + Ceed ceed; + Ceed_Sycl *sycl_data; CeedQFunctionContext_Sycl *impl; + + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedQFunctionContextGetBackendData(ctx, &impl)); - Ceed_Sycl *sycl_data; CeedCallBackend(CeedGetData(ceed, &sycl_data)); // Wait for all work to finish before freeing memory @@ -374,7 +369,6 @@ static int CeedQFunctionContextDestroy_Sycl(const CeedQFunctionContext ctx) { CeedCallSycl(ceed, sycl::free(impl->d_data_owned, sycl_data->sycl_context)); CeedCallBackend(CeedFree(&impl->h_data_owned)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -382,10 +376,10 @@ static int CeedQFunctionContextDestroy_Sycl(const CeedQFunctionContext ctx) { // QFunctionContext Create //------------------------------------------------------------------------------ int CeedQFunctionContextCreate_Sycl(CeedQFunctionContext ctx) { - CeedQFunctionContext_Sycl *impl; Ceed ceed; - CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); + CeedQFunctionContext_Sycl *impl; + CeedCallBackend(CeedQFunctionContextGetCeed(ctx, &ceed)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "HasValidData", CeedQFunctionContextHasValidData_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "HasBorrowedDataOfType", CeedQFunctionContextHasBorrowedDataOfType_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "SetData", CeedQFunctionContextSetData_Sycl)); @@ -393,10 +387,8 @@ int 
CeedQFunctionContextCreate_Sycl(CeedQFunctionContext ctx) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetData", CeedQFunctionContextGetData_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "GetDataRead", CeedQFunctionContextGetDataRead_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "QFunctionContext", ctx, "Destroy", CeedQFunctionContextDestroy_Sycl)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallBackend(CeedQFunctionContextSetBackendData(ctx, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp index e4e6fa1deb..56544c38ca 100644 --- a/backends/sycl-ref/ceed-sycl-ref.hpp +++ b/backends/sycl-ref/ceed-sycl-ref.hpp @@ -83,31 +83,32 @@ typedef struct { } CeedQFunctionContext_Sycl; typedef struct { - CeedBasis basisin, basisout; - CeedElemRestriction diagrstr, pbdiagrstr; - CeedVector elemdiag, pbelemdiag; - CeedInt numemodein, numemodeout, nnodes; - CeedInt nqpts, ncomp; // Kernel parameters - CeedEvalMode *h_emodein, *h_emodeout; - CeedEvalMode *d_emodein, *d_emodeout; - CeedScalar *d_identity, *d_interpin, *d_interpout, *d_gradin, *d_gradout; + CeedBasis basis_in, basis_out; + CeedElemRestriction diag_rstr, point_block_diag_rstr; + CeedVector elem_diag, point_block_elem_diag; + CeedInt num_e_mode_in, num_e_mode_out, num_nodes; + CeedInt num_qpts, num_comp; // Kernel parameters + CeedEvalMode *h_e_mode_in, *h_e_mode_out; + CeedEvalMode *d_e_mode_in, *d_e_mode_out; + CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; } CeedOperatorDiag_Sycl; typedef struct { - CeedInt nelem, block_size_x, block_size_y, elemsPerBlock; - CeedInt numemodein, numemodeout, nqpts, nnodes, block_size, ncomp; // Kernel parameters + CeedInt num_elem, block_size_x, block_size_y, elem_per_block; + CeedInt num_e_mode_in, num_e_mode_out, num_qpts, num_nodes, block_size, num_comp; // Kernel parameters bool fallback; 
CeedScalar *d_B_in, *d_B_out; } CeedOperatorAssemble_Sycl; typedef struct { - CeedVector *evecs; // E-vectors, inputs followed by outputs - CeedVector *qvecsin; // Input Q-vectors needed to apply operator - CeedVector *qvecsout; // Output Q-vectors needed to apply operator - CeedInt numein; - CeedInt numeout; - CeedInt qfnumactivein, qfnumactiveout; - CeedVector *qfactivein; + CeedVector *e_vecs; // E-vectors, inputs followed by outputs + CeedVector *q_vecs_in; // Input Q-vectors needed to apply operator + CeedVector *q_vecs_out; // Output Q-vectors needed to apply operator + CeedInt num_e_in; + CeedInt num_e_out; + CeedInt num_inputs, num_outputs; + CeedInt num_active_in, num_active_out; + CeedVector *qf_active_in; CeedOperatorDiag_Sycl *diag; CeedOperatorAssemble_Sycl *asmb; } CeedOperator_Sycl; @@ -115,10 +116,10 @@ typedef struct { CEED_INTERN int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec); CEED_INTERN int CeedBasisCreateTensorH1_Sycl(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, - const CeedScalar *qref_1d, const CeedScalar *qweight_1d, CeedBasis basis); + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); -CEED_INTERN int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt ndof, CeedInt nqpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *qref, const CeedScalar *qweight, CeedBasis basis); +CEED_INTERN int CeedBasisCreateH1_Sycl(CeedElemTopology topo, CeedInt dim, CeedInt num_dof, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); CEED_INTERN int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, const CeedInt8 *curl_orients, CeedElemRestriction r); diff --git a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp index 65d4fead36..2115460b54 100644 
--- a/backends/sycl-ref/ceed-sycl-ref.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-ref.sycl.cpp @@ -26,14 +26,15 @@ static int CeedGetPreferredMemType_Sycl(CeedMemType *mem_type) { // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Sycl_ref(const char *resource, Ceed ceed) { - char *resource_root; + Ceed_Sycl *data; + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!std::strcmp(resource_root, "/gpu/sycl/ref") || !std::strcmp(resource_root, "/cpu/sycl/ref"), ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); - Ceed_Sycl *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Sycl(ceed, resource)); @@ -48,7 +49,6 @@ static int CeedInit_Sycl_ref(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "QFunctionContextCreate", &CeedQFunctionContextCreate_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "OperatorCreate", &CeedOperatorCreate_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Ceed", ceed, "Destroy", &CeedDestroy_Sycl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp index 328986aea9..4ace997696 100644 --- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp @@ -52,12 +52,11 @@ static int CeedElemRestrictionStridedNoTranspose_Sycl(sycl::queue &sycl_queue, c //------------------------------------------------------------------------------ static int CeedElemRestrictionOffsetNoTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u, CeedScalar *v) { - const CeedInt elem_size = impl->elem_size; - const CeedInt 
num_elem = impl->num_elem; - const CeedInt num_comp = impl->num_comp; - const CeedInt comp_stride = impl->comp_stride; - - const CeedInt *indices = impl->d_ind; + const CeedInt elem_size = impl->elem_size; + const CeedInt num_elem = impl->num_elem; + const CeedInt num_comp = impl->num_comp; + const CeedInt comp_stride = impl->comp_stride; + const CeedInt *indices = impl->d_ind; sycl::range<1> kernel_range(num_elem * elem_size); @@ -107,12 +106,11 @@ static int CeedElemRestrictionStridedTranspose_Sycl(sycl::queue &sycl_queue, con //------------------------------------------------------------------------------ static int CeedElemRestrictionOffsetTranspose_Sycl(sycl::queue &sycl_queue, const CeedElemRestriction_Sycl *impl, const CeedScalar *u, CeedScalar *v) { - const CeedInt num_nodes = impl->num_nodes; - const CeedInt elem_size = impl->elem_size; - const CeedInt num_elem = impl->num_elem; - const CeedInt num_comp = impl->num_comp; - const CeedInt comp_stride = impl->comp_stride; - + const CeedInt num_nodes = impl->num_nodes; + const CeedInt elem_size = impl->elem_size; + const CeedInt num_elem = impl->num_elem; + const CeedInt num_comp = impl->num_comp; + const CeedInt comp_stride = impl->comp_stride; const CeedInt *l_vec_indices = impl->d_l_vec_indices; const CeedInt *t_offsets = impl->d_t_offsets; const CeedInt *t_indices = impl->d_t_indices; @@ -127,8 +125,7 @@ static int CeedElemRestrictionOffsetTranspose_Sycl(sycl::queue &sycl_queue, cons const CeedInt ind = l_vec_indices[node]; const CeedInt range_1 = t_offsets[node]; const CeedInt range_N = t_offsets[node + 1]; - - CeedScalar value = 0.0; + CeedScalar value = 0.0; for (CeedInt j = range_1; j < range_N; j++) { const CeedInt t_ind = t_indices[j]; @@ -146,16 +143,17 @@ static int CeedElemRestrictionOffsetTranspose_Sycl(sycl::queue &sycl_queue, cons // Apply restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, 
CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + const CeedScalar *d_u; + CeedScalar *d_v; CeedElemRestriction_Sycl *impl; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Get vectors - const CeedScalar *d_u; - CeedScalar *d_v; CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); if (t_mode == CEED_TRANSPOSE) { // Sum into for transpose mode, e-vec to l-vec @@ -200,9 +198,10 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMod // Get offsets //------------------------------------------------------------------------------ static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction r, CeedMemType m_type, const CeedInt **offsets) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed; CeedElemRestriction_Sycl *impl; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); switch (m_type) { @@ -220,11 +219,12 @@ static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction r, CeedMemType // Destroy restriction //------------------------------------------------------------------------------ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed; + Ceed_Sycl *data; CeedElemRestriction_Sycl *impl; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Wait for all work to finish before freeing memory @@ -236,7 +236,6 @@ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { CeedCallSycl(ceed, sycl::free(impl->d_t_indices, data->sycl_context)); 
CeedCallSycl(ceed, sycl::free(impl->d_l_vec_indices, data->sycl_context)); CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -244,32 +243,32 @@ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { // Create transpose offsets and indices //------------------------------------------------------------------------------ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const CeedInt *indices) { - Ceed ceed; - CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + bool *is_node; + CeedSize l_size; + CeedInt num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; CeedElemRestriction_Sycl *impl; + + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); - CeedSize l_size; - CeedInt num_elem, elem_size, num_comp; CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); // Count num_nodes - bool *is_node; CeedCallBackend(CeedCalloc(l_size, &is_node)); const CeedInt size_indices = num_elem * elem_size; + for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; - CeedInt num_nodes = 0; for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; impl->num_nodes = num_nodes; // L-vector offsets array - CeedInt *ind_to_offset, *l_vec_indices; CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); - CeedInt j = 0; - for (CeedInt i = 0; i < l_size; i++) { + for (CeedInt i = 0, j = 0; i < l_size; i++) { if (is_node[i]) { l_vec_indices[j] = i; ind_to_offset[i] = j++; @@ -279,9 +278,8 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee // Compute transpose offsets and indices const CeedInt 
size_offsets = num_nodes + 1; - CeedInt *t_offsets; + CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); - CeedInt *t_indices; CeedCallBackend(CeedMalloc(size_indices, &t_indices)); // Count node multiplicity for (CeedInt e = 0; e < num_elem; ++e) { @@ -302,7 +300,6 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee t_offsets[0] = 0; // Copy data to device - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Order queue @@ -326,7 +323,6 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee CeedCallBackend(CeedFree(&l_vec_indices)); CeedCallBackend(CeedFree(&t_offsets)); CeedCallBackend(CeedFree(&t_indices)); - return CEED_ERROR_SUCCESS; } @@ -335,30 +331,31 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee //------------------------------------------------------------------------------ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, const CeedInt8 *curl_orients, CeedElemRestriction r) { - Ceed ceed; + Ceed ceed; + Ceed_Sycl *data; + bool is_strided; + CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedRestrictionType rstr_type; + CeedElemRestriction_Sycl *impl; + CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); - CeedElemRestriction_Sycl *impl; CeedCallBackend(CeedCalloc(1, &impl)); - CeedInt num_elem, num_comp, elem_size; CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); - CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt comp_stride = 1; + CeedInt size = num_elem * elem_size; + CeedInt strides[3] = {1, size, elem_size}; - CeedRestrictionType rstr_type; CeedCallBackend(CeedElemRestrictionGetType(r, 
&rstr_type)); CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); // Stride data - bool is_strided; CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); if (is_strided) { bool has_backend_strides; + CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); if (!has_backend_strides) { CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); @@ -386,7 +383,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); // Set up device indices/offset arrays - if (m_type == CEED_MEM_HOST) { + if (mem_type == CEED_MEM_HOST) { switch (copy_mode) { case CEED_OWN_POINTER: impl->h_ind_allocated = (CeedInt *)indices; @@ -414,7 +411,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallSycl(ceed, copy_event.wait_and_throw()); CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); } - } else if (m_type == CEED_MEM_DEVICE) { + } else if (mem_type == CEED_MEM_DEVICE) { switch (copy_mode) { case CEED_COPY_VALUES: if (indices != NULL) { diff --git a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp index e498a31b2b..92cb98305c 100644 --- a/backends/sycl-ref/ceed-sycl-vector.sycl.cpp +++ b/backends/sycl-ref/ceed-sycl-vector.sycl.cpp @@ -18,10 +18,10 @@ // Check if host/device sync is needed //------------------------------------------------------------------------------ static inline int CeedVectorNeedSync_Sycl(const CeedVector vec, CeedMemType mem_type, bool *need_sync) { + bool has_valid_array = false; CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); - bool has_valid_array = false; + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorHasValidArray(vec, 
&has_valid_array)); switch (mem_type) { case CEED_MEM_HOST: @@ -31,7 +31,6 @@ static inline int CeedVectorNeedSync_Sycl(const CeedVector vec, CeedMemType mem_ *need_sync = has_valid_array && !impl->d_array; break; } - return CEED_ERROR_SUCCESS; } @@ -39,11 +38,13 @@ static inline int CeedVectorNeedSync_Sycl(const CeedVector vec, CeedMemType mem_ // Sync host to device //------------------------------------------------------------------------------ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); if (!impl->h_array) { @@ -52,9 +53,7 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) { // LCOV_EXCL_STOP } - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - if (impl->d_array_borrowed) { impl->d_array = impl->d_array_borrowed; } else if (impl->d_array_owned) { @@ -69,7 +68,6 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) { sycl::event copy_event = data->sycl_queue.copy(impl->h_array, impl->d_array, length, {e}); // Wait for copy to finish and handle exceptions. 
CeedCallSycl(ceed, copy_event.wait_and_throw()); - return CEED_ERROR_SUCCESS; } @@ -77,18 +75,18 @@ static inline int CeedVectorSyncH2D_Sycl(const CeedVector vec) { // Sync device to host //------------------------------------------------------------------------------ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host"); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - if (impl->h_array_borrowed) { impl->h_array = impl->h_array_borrowed; } else if (impl->h_array_owned) { @@ -102,10 +100,8 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) { sycl::event e = data->sycl_queue.ext_oneapi_submit_barrier(); // Copy from device to host sycl::event copy_event = data->sycl_queue.copy(impl->d_array, impl->h_array, length, {e}); - // Wait for copy to finish and handle exceptions. 
CeedCallSycl(ceed, copy_event.wait_and_throw()); - return CEED_ERROR_SUCCESS; } @@ -113,8 +109,9 @@ static inline int CeedVectorSyncD2H_Sycl(const CeedVector vec) { // Sync arrays //------------------------------------------------------------------------------ static int CeedVectorSyncArray_Sycl(const CeedVector vec, CeedMemType mem_type) { - // Check whether device/host sync is needed bool need_sync = false; + + // Check whether device/host sync is needed CeedCallBackend(CeedVectorNeedSync_Sycl(vec, mem_type, &need_sync)); if (!need_sync) return CEED_ERROR_SUCCESS; @@ -132,11 +129,10 @@ static int CeedVectorSyncArray_Sycl(const CeedVector vec, CeedMemType mem_type) //------------------------------------------------------------------------------ static inline int CeedVectorSetAllInvalid_Sycl(const CeedVector vec) { CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); impl->h_array = NULL; impl->d_array = NULL; - return CEED_ERROR_SUCCESS; } @@ -145,10 +141,9 @@ static inline int CeedVectorSetAllInvalid_Sycl(const CeedVector vec) { //------------------------------------------------------------------------------ static inline int CeedVectorHasValidArray_Sycl(const CeedVector vec, bool *has_valid_array) { CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); *has_valid_array = impl->h_array || impl->d_array; - return CEED_ERROR_SUCCESS; } @@ -157,8 +152,8 @@ static inline int CeedVectorHasValidArray_Sycl(const CeedVector vec, bool *has_v //------------------------------------------------------------------------------ static inline int CeedVectorHasArrayOfType_Sycl(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) { CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_array_of_type = 
impl->h_array_borrowed || impl->h_array_owned; @@ -167,7 +162,6 @@ static inline int CeedVectorHasArrayOfType_Sycl(const CeedVector vec, CeedMemTyp *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned; break; } - return CEED_ERROR_SUCCESS; } @@ -176,8 +170,8 @@ static inline int CeedVectorHasArrayOfType_Sycl(const CeedVector vec, CeedMemTyp //------------------------------------------------------------------------------ static inline int CeedVectorHasBorrowedArrayOfType_Sycl(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) { CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (mem_type) { case CEED_MEM_HOST: *has_borrowed_array_of_type = impl->h_array_borrowed; @@ -186,7 +180,6 @@ static inline int CeedVectorHasBorrowedArrayOfType_Sycl(const CeedVector vec, Ce *has_borrowed_array_of_type = impl->d_array_borrowed; break; } - return CEED_ERROR_SUCCESS; } @@ -195,12 +188,13 @@ static inline int CeedVectorHasBorrowedArrayOfType_Sycl(const CeedVector vec, Ce //------------------------------------------------------------------------------ static int CeedVectorSetArrayHost_Sycl(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); switch (copy_mode) { case CEED_COPY_VALUES: { if (!impl->h_array_owned) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); CeedCallBackend(CeedMalloc(length, &impl->h_array_owned)); } @@ -208,8 +202,10 @@ static int CeedVectorSetArrayHost_Sycl(const CeedVector vec, const CeedCopyMode impl->h_array = impl->h_array_owned; if (array) { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); size_t bytes = length * sizeof(CeedScalar); + memcpy(impl->h_array, array, bytes); } } break; @@ -233,11 +229,12 @@ static int CeedVectorSetArrayHost_Sycl(const 
CeedVector vec, const CeedCopyMode // Set array from device //------------------------------------------------------------------------------ static int CeedVectorSetArrayDevice_Sycl(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + Ceed_Sycl *data; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Order queue @@ -246,6 +243,7 @@ static int CeedVectorSetArrayDevice_Sycl(const CeedVector vec, const CeedCopyMod switch (copy_mode) { case CEED_COPY_VALUES: { CeedSize length; + CeedCallBackend(CeedVectorGetLength(vec, &length)); if (!impl->d_array_owned) { CeedCallSycl(ceed, impl->d_array_owned = sycl::malloc_device(length, data->sycl_device, data->sycl_context)); @@ -278,7 +276,6 @@ static int CeedVectorSetArrayDevice_Sycl(const CeedVector vec, const CeedCopyMod impl->d_array = array; break; } - return CEED_ERROR_SUCCESS; } @@ -287,9 +284,10 @@ static int CeedVectorSetArrayDevice_Sycl(const CeedVector vec, const CeedCopyMod // freeing any previously allocated array if applicable //------------------------------------------------------------------------------ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorSetAllInvalid_Sycl(vec)); @@ -299,7 +297,6 @@ static int CeedVectorSetArray_Sycl(const CeedVector vec, const CeedMemType mem_t case CEED_MEM_DEVICE: return CeedVectorSetArrayDevice_Sycl(vec, copy_mode, array); } - return CEED_ERROR_UNSUPPORTED; } @@ -325,13 +322,14 @@ static int CeedDeviceSetValue_Sycl(sycl::queue &sycl_queue, 
CeedScalar *d_array, // Set a vector to a value, //------------------------------------------------------------------------------ static int CeedVectorSetValue_Sycl(CeedVector vec, CeedScalar val) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array @@ -356,7 +354,6 @@ static int CeedVectorSetValue_Sycl(CeedVector vec, CeedScalar val) { CeedCallBackend(CeedHostSetValue_Sycl(impl->h_array, length, val)); impl->d_array = NULL; } - return CEED_ERROR_SUCCESS; } @@ -364,12 +361,12 @@ static int CeedVectorSetValue_Sycl(CeedVector vec, CeedScalar val) { // Vector Take Array //------------------------------------------------------------------------------ static int CeedVectorTakeArray_Sycl(CeedVector vec, CeedMemType mem_type, CeedScalar **array) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + Ceed_Sycl *data; CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed_Sycl *data; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedGetData(ceed, &data)); // Order queue @@ -391,7 +388,6 @@ static int CeedVectorTakeArray_Sycl(CeedVector vec, CeedMemType mem_type, CeedSc impl->d_array = NULL; break; } - return CEED_ERROR_SUCCESS; } @@ -400,9 +396,10 @@ static int CeedVectorTakeArray_Sycl(CeedVector vec, CeedMemType mem_type, CeedSc // If a different memory type is most up to date, this will perform a copy //------------------------------------------------------------------------------ static int CeedVectorGetArrayCore_Sycl(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { - Ceed 
ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); // Sync array to requested mem_type @@ -417,7 +414,6 @@ static int CeedVectorGetArrayCore_Sycl(const CeedVector vec, const CeedMemType m *array = impl->d_array; break; } - return CEED_ERROR_SUCCESS; } @@ -433,10 +429,9 @@ static int CeedVectorGetArrayRead_Sycl(const CeedVector vec, const CeedMemType m //------------------------------------------------------------------------------ static int CeedVectorGetArray_Sycl(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorGetArrayCore_Sycl(vec, mem_type, array)); - CeedCallBackend(CeedVectorSetAllInvalid_Sycl(vec)); switch (mem_type) { case CEED_MEM_HOST: @@ -446,7 +441,6 @@ static int CeedVectorGetArray_Sycl(const CeedVector vec, const CeedMemType mem_t impl->d_array = *array; break; } - return CEED_ERROR_SUCCESS; } @@ -454,10 +448,10 @@ static int CeedVectorGetArray_Sycl(const CeedVector vec, const CeedMemType mem_t // Get write access to a vector via the specified mem_type //------------------------------------------------------------------------------ static int CeedVectorGetArrayWrite_Sycl(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) { + bool has_array_of_type = true; CeedVector_Sycl *impl; - CeedCallBackend(CeedVectorGetData(vec, &impl)); - bool has_array_of_type = true; + CeedCallBackend(CeedVectorGetData(vec, &impl)); CeedCallBackend(CeedVectorHasArrayOfType_Sycl(vec, mem_type, &has_array_of_type)); if (!has_array_of_type) { // Allocate if array is not yet allocated @@ -474,7 +468,6 @@ static int CeedVectorGetArrayWrite_Sycl(const CeedVector vec, const CeedMemType else impl->d_array = impl->d_array_owned; } } - return 
CeedVectorGetArray_Sycl(vec, mem_type, array); } @@ -482,19 +475,19 @@ static int CeedVectorGetArrayWrite_Sycl(const CeedVector vec, const CeedMemType // Get the norm of a CeedVector //------------------------------------------------------------------------------ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *norm) { - Ceed ceed; + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; + const CeedScalar *d_array; + CeedVector_Sycl *impl; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - CeedVector_Sycl *impl; CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Compute norm - const CeedScalar *d_array; CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array)); - switch (type) { case CEED_NORM_1: { // Order queue @@ -518,9 +511,7 @@ static int CeedVectorNorm_Sycl(CeedVector vec, CeedNormType type, CeedScalar *no // L2 norm - square root over reduced value if (type == CEED_NORM_2) *norm = sqrt(*impl->reduction_norm); else *norm = *impl->reduction_norm; - CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array)); - return CEED_ERROR_SUCCESS; } @@ -550,19 +541,19 @@ static int CeedDeviceReciprocal_Sycl(sycl::queue &sycl_queue, CeedScalar *d_arra // Take reciprocal of a vector //------------------------------------------------------------------------------ static int CeedVectorReciprocal_Sycl(CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(vec, &length)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Sycl(data->sycl_queue, 
impl->d_array, length)); if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Sycl(impl->h_array, length)); - return CEED_ERROR_SUCCESS; } @@ -588,19 +579,19 @@ static int CeedDeviceScale_Sycl(sycl::queue &sycl_queue, CeedScalar *x_array, Ce // Compute x = alpha x //------------------------------------------------------------------------------ static int CeedVectorScale_Sycl(CeedVector x, CeedScalar alpha) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(x, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; CeedVector_Sycl *x_impl; + + CeedCallBackend(CeedVectorGetCeed(x, &ceed)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(x, &length)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Sycl(data->sycl_queue, x_impl->d_array, alpha, length)); if (x_impl->h_array) CeedCallBackend(CeedHostScale_Sycl(x_impl->h_array, alpha, length)); - return CEED_ERROR_SUCCESS; } @@ -626,14 +617,15 @@ static int CeedDeviceAXPY_Sycl(sycl::queue &sycl_queue, CeedScalar *y_array, Cee // Compute y = alpha x + y //------------------------------------------------------------------------------ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(y, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; CeedVector_Sycl *y_impl, *x_impl; + + CeedCallBackend(CeedVectorGetCeed(y, &ceed)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(y, &length)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array @@ -645,7 +637,6 @@ static int CeedVectorAXPY_Sycl(CeedVector y, CeedScalar alpha, CeedVector x) { CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST)); 
CeedCallBackend(CeedHostAXPY_Sycl(y_impl->h_array, alpha, x_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -671,15 +662,16 @@ static int CeedDevicePointwiseMult_Sycl(sycl::queue &sycl_queue, CeedScalar *w_a // Compute the pointwise multiplication w = x .* y //------------------------------------------------------------------------------ static int CeedVectorPointwiseMult_Sycl(CeedVector w, CeedVector x, CeedVector y) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(w, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + CeedSize length; CeedVector_Sycl *w_impl, *x_impl, *y_impl; + + CeedCallBackend(CeedVectorGetCeed(w, &ceed)); CeedCallBackend(CeedVectorGetData(w, &w_impl)); CeedCallBackend(CeedVectorGetData(x, &x_impl)); CeedCallBackend(CeedVectorGetData(y, &y_impl)); - CeedSize length; CeedCallBackend(CeedVectorGetLength(w, &length)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Set value for synced device/host array @@ -696,7 +688,6 @@ static int CeedVectorPointwiseMult_Sycl(CeedVector w, CeedVector x, CeedVector y CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST)); CeedCallBackend(CeedHostPointwiseMult_Sycl(w_impl->h_array, x_impl->h_array, y_impl->h_array, length)); } - return CEED_ERROR_SUCCESS; } @@ -704,11 +695,12 @@ static int CeedVectorPointwiseMult_Sycl(CeedVector w, CeedVector x, CeedVector y // Destroy the vector //------------------------------------------------------------------------------ static int CeedVectorDestroy_Sycl(const CeedVector vec) { - Ceed ceed; - CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); + Ceed ceed; + Ceed_Sycl *data; CeedVector_Sycl *impl; + + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); CeedCallBackend(CeedVectorGetData(vec, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); // Wait for all work to finish before freeing memory @@ -718,7 +710,6 @@ static int CeedVectorDestroy_Sycl(const CeedVector vec) { CeedCallBackend(CeedFree(&impl->h_array_owned)); 
CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -726,15 +717,14 @@ static int CeedVectorDestroy_Sycl(const CeedVector vec) { // Create a vector of the specified length (does not allocate memory) //------------------------------------------------------------------------------ int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec) { - CeedVector_Sycl *impl; Ceed ceed; + Ceed_Sycl *data; + CeedVector_Sycl *impl; + CeedCallBackend(CeedVectorGetCeed(vec, &ceed)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); - CeedCallBackend(CeedCalloc(1, &impl)); CeedCallSycl(ceed, impl->reduction_norm = sycl::malloc_host(1, data->sycl_context)); - CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Sycl)); @@ -750,8 +740,6 @@ int CeedVectorCreate_Sycl(CeedSize n, CeedVector vec) { CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Scale", CeedVectorScale_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Sycl)); CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Sycl)); - CeedCallBackend(CeedVectorSetData(vec, impl)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl-ref/kernels/sycl-ref-vector.cpp b/backends/sycl-ref/kernels/sycl-ref-vector.cpp index 308d74d209..68e2bfc9b9 100644 --- a/backends/sycl-ref/kernels/sycl-ref-vector.cpp +++ b/backends/sycl-ref/kernels/sycl-ref-vector.cpp @@ -13,21 +13,22 @@ // Kernel for set value on device //------------------------------------------------------------------------------ __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedInt size, CeedScalar val) { - int idx = threadIdx.x + blockDim.x * 
blockIdx.x; - if (idx >= size) return; - vec[idx] = val; + int index = threadIdx.x + blockDim.x * blockIdx.x; + + if (index >= size) return; + vec[index] = val; } //------------------------------------------------------------------------------ // Set value on device memory //------------------------------------------------------------------------------ extern "C" int CeedDeviceSetValue_Sycl(CeedScalar *d_array, CeedInt length, CeedScalar val) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const int vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - setValueK<<>>(d_array, length, val); + if (block_size * grid_size < vec_size) grid_size += 1; + setValueK<<>>(d_array, length, val); return 0; } @@ -35,21 +36,22 @@ extern "C" int CeedDeviceSetValue_Sycl(CeedScalar *d_array, CeedInt length, Ceed // Kernel for taking reciprocal //------------------------------------------------------------------------------ __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedInt size) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) return; - if (fabs(vec[idx]) > 1E-16) vec[idx] = 1. / vec[idx]; + int index = threadIdx.x + blockDim.x * blockIdx.x; + + if (index >= size) return; + if (fabs(vec[index]) > 1E-16) vec[index] = 1. 
/ vec[index]; } //------------------------------------------------------------------------------ // Take vector reciprocal in device memory //------------------------------------------------------------------------------ extern "C" int CeedDeviceReciprocal_Sycl(CeedScalar *d_array, CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const int vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - rcpValueK<<>>(d_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + rcpValueK<<>>(d_array, length); return 0; } @@ -57,21 +59,22 @@ extern "C" int CeedDeviceReciprocal_Sycl(CeedScalar *d_array, CeedInt length) { // Kernel for scale //------------------------------------------------------------------------------ __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedInt size) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) return; - x[idx] *= alpha; + int index = threadIdx.x + blockDim.x * blockIdx.x; + + if (index >= size) return; + x[index] *= alpha; } //------------------------------------------------------------------------------ // Compute x = alpha x on device //------------------------------------------------------------------------------ extern "C" int CeedDeviceScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const int vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - scaleValueK<<>>(x_array, alpha, length); + if (block_size * grid_size < vec_size) grid_size += 1; + scaleValueK<<>>(x_array, alpha, length); return 0; } @@ -79,21 +82,21 @@ extern "C" int CeedDeviceScale_Sycl(CeedScalar *x_array, CeedScalar alpha, CeedI // Kernel for axpy 
//------------------------------------------------------------------------------ __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedInt size) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) return; - y[idx] += alpha * x[idx]; + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index >= size) return; + y[index] += alpha * x[index]; } //------------------------------------------------------------------------------ // Compute y = alpha x + y on device //------------------------------------------------------------------------------ extern "C" int CeedDeviceAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const int vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - axpyValueK<<>>(y_array, alpha, x_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + axpyValueK<<>>(y_array, alpha, x_array, length); return 0; } @@ -101,21 +104,22 @@ extern "C" int CeedDeviceAXPY_Sycl(CeedScalar *y_array, CeedScalar alpha, CeedSc // Kernel for pointwise mult //------------------------------------------------------------------------------ __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedInt size) { - int idx = threadIdx.x + blockDim.x * blockIdx.x; - if (idx >= size) return; - w[idx] = x[idx] * y[idx]; + int index = threadIdx.x + blockDim.x * blockIdx.x; + + if (index >= size) return; + w[index] = x[index] * y[index]; } //------------------------------------------------------------------------------ // Compute the pointwise multiplication w = x .* y on device //------------------------------------------------------------------------------ extern "C" int CeedDevicePointwiseMult_Sycl(CeedScalar 
*w_array, CeedScalar *x_array, CeedScalar *y_array, CeedInt length) { - const int bsize = 512; - const int vecsize = length; - int gridsize = vecsize / bsize; + const int block_size = 512; + const int vec_size = length; + int grid_size = vec_size / block_size; - if (bsize * gridsize < vecsize) gridsize += 1; - pointwiseMultValueK<<>>(w_array, x_array, y_array, length); + if (block_size * grid_size < vec_size) grid_size += 1; + pointwiseMultValueK<<>>(w_array, x_array, y_array, length); return 0; } diff --git a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp index f002c0b2ea..41f936b120 100644 --- a/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp +++ b/backends/sycl-shared/ceed-sycl-shared-basis.sycl.cpp @@ -20,10 +20,10 @@ // Compute the local range of for basis kernels //------------------------------------------------------------------------------ static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedInt max_group_size = 256) { - local_range[0] = thread_1d; - local_range[1] = (dim > 1) ? thread_1d : 1; - + local_range[0] = thread_1d; + local_range[1] = (dim > 1) ? 
thread_1d : 1; const CeedInt min_group_size = local_range[0] * local_range[1]; + CeedCheck(min_group_size <= max_group_size, ceed, CEED_ERROR_BACKEND, "Requested group size is smaller than the required minimum."); local_range[2] = max_group_size / min_group_size; // elements per group @@ -35,16 +35,17 @@ static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt //------------------------------------------------------------------------------ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; + Ceed ceed; + Ceed_Sycl *ceed_Sycl; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasis_Sycl_shared *impl; + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - Ceed_Sycl *ceed_Sycl; CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); - CeedBasis_Sycl_shared *impl; CeedCallBackend(CeedBasisGetData(basis, &impl)); // Read vectors - const CeedScalar *d_u; - CeedScalar *d_v; if (eval_mode != CEED_EVAL_WEIGHT) { CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); } @@ -136,13 +137,13 @@ int CeedBasisApplyTensor_Sycl_shared(CeedBasis basis, const CeedInt num_elem, Ce // Destroy basis //------------------------------------------------------------------------------ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + Ceed_Sycl *data; CeedBasis_Sycl_shared *impl; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); - CeedCallSycl(ceed, data->sycl_queue.wait_and_throw()); CeedCallSycl(ceed, sycl::free(impl->d_q_weight_1d, data->sycl_context)); CeedCallSycl(ceed, sycl::free(impl->d_interp_1d, data->sycl_context)); @@ -157,7 +158,6 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) { delete impl->sycl_module; 
CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; } @@ -167,14 +167,15 @@ static int CeedBasisDestroy_Sycl_shared(CeedBasis basis) { //------------------------------------------------------------------------------ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { - Ceed ceed; - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + Ceed ceed; + Ceed_Sycl *data; + char *basis_kernel_path, *basis_kernel_source; + CeedInt num_comp; CeedBasis_Sycl_shared *impl; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &impl)); - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); - - CeedInt num_comp; CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); @@ -182,10 +183,12 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedInt num_qpts = CeedIntPow(Q_1d, dim); CeedInt *interp_lrange = impl->interp_local_range; + CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, interp_lrange)); const CeedInt interp_group_size = interp_lrange[0] * interp_lrange[1] * interp_lrange[2]; CeedInt *grad_lrange = impl->grad_local_range; + CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, grad_lrange)); const CeedInt grad_group_size = grad_lrange[0] * grad_lrange[1] * grad_lrange[2]; @@ -207,11 +210,13 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, // Compute collocated gradient and copy to GPU impl->d_collo_grad_1d = NULL; const bool has_collocated_grad = (dim == 3) && (Q_1d >= P_1d); + if (has_collocated_grad) { - CeedScalar *collo_grad_1d; + CeedScalar *collo_grad_1d; + const CeedInt cgrad_length = Q_1d * Q_1d; + CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); - 
const CeedInt cgrad_length = Q_1d * Q_1d; CeedCallSycl(ceed, impl->d_collo_grad_1d = sycl::malloc_device(cgrad_length, data->sycl_device, data->sycl_context)); CeedCallSycl(ceed, data->sycl_queue.copy(collo_grad_1d, impl->d_collo_grad_1d, cgrad_length).wait_and_throw()); CeedCallBackend(CeedFree(&collo_grad_1d)); @@ -232,7 +237,6 @@ int CeedBasisCreateTensorH1_Sycl_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, jit_constants["BASIS_GRAD_SCRATCH_SIZE"] = grad_group_size; // Load kernel source - char *basis_kernel_path, *basis_kernel_source; CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor.h", &basis_kernel_path)); CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); diff --git a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp index be6b5c8d42..1ff18422e5 100644 --- a/backends/sycl-shared/ceed-sycl-shared.sycl.cpp +++ b/backends/sycl-shared/ceed-sycl-shared.sycl.cpp @@ -18,7 +18,10 @@ // Backend init //------------------------------------------------------------------------------ static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) { - char *resource_root; + Ceed ceed_ref; + Ceed_Sycl *data, *ref_data; + char *resource_root; + CeedCallBackend(CeedGetResourceRoot(ceed, resource, ":", &resource_root)); CeedCheck(!std::strcmp(resource_root, "/gpu/sycl/shared") || !std::strcmp(resource_root, "/cpu/sycl/shared"), ceed, CEED_ERROR_BACKEND, "Sycl backend cannot use resource: %s", resource); @@ -33,15 +36,12 @@ static int CeedInit_Sycl_shared(const char *resource, Ceed ceed) { CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedSetDeterministic(ceed, true)); - Ceed_Sycl *data; CeedCallBackend(CeedCalloc(1, &data)); CeedCallBackend(CeedSetData(ceed, data)); CeedCallBackend(CeedInit_Sycl(ceed, resource)); - Ceed ceed_ref; 
CeedCallBackend(CeedInit(ref_resource.str().c_str(), &ceed_ref)); - Ceed_Sycl *ref_data; CeedCallBackend(CeedGetData(ceed_ref, &ref_data)); // Need to use the same queue everywhere for correct synchronization diff --git a/backends/sycl/ceed-sycl-common.sycl.cpp b/backends/sycl/ceed-sycl-common.sycl.cpp index d6e7067eee..a935619e55 100644 --- a/backends/sycl/ceed-sycl-common.sycl.cpp +++ b/backends/sycl/ceed-sycl-common.sycl.cpp @@ -15,6 +15,7 @@ // Device information backend init //------------------------------------------------------------------------------ int CeedInit_Sycl(Ceed ceed, const char *resource) { + Ceed_Sycl *data; const char *device_spec = std::strstr(resource, ":device_id="); const int device_id = (device_spec) ? atoi(device_spec + 11) : 0; @@ -65,13 +66,11 @@ int CeedInit_Sycl(Ceed ceed, const char *resource) { sycl::context sycl_context{sycl_device.get_platform().get_devices()}; sycl::queue sycl_queue{sycl_context, sycl_device, sycl_async_handler}; - Ceed_Sycl *data; CeedCallBackend(CeedGetData(ceed, &data)); data->sycl_device = sycl_device; data->sycl_context = sycl_context; data->sycl_queue = sycl_queue; - return CEED_ERROR_SUCCESS; } @@ -80,6 +79,7 @@ int CeedInit_Sycl(Ceed ceed, const char *resource) { //------------------------------------------------------------------------------ int CeedDestroy_Sycl(Ceed ceed) { Ceed_Sycl *data; + CeedCallBackend(CeedGetData(ceed, &data)); CeedCallBackend(CeedFree(&data)); return CEED_ERROR_SUCCESS; @@ -89,7 +89,9 @@ int CeedDestroy_Sycl(Ceed ceed) { // Use an external queue //------------------------------------------------------------------------------ int CeedSetStream_Sycl(Ceed ceed, void *handle) { + Ceed ceed_delegate = NULL, ceed_fallback = NULL; Ceed_Sycl *data; + CeedCallBackend(CeedGetData(ceed, &data)); CeedCheck(handle, ceed, CEED_ERROR_BACKEND, "Stream handle is null"); @@ -102,10 +104,10 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) { data->sycl_queue = *q; // Revisit this when we have a 
hierarchy of delegates - Ceed ceed_delegate = NULL; CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); if (ceed_delegate) { Ceed_Sycl *delegate_data; + CeedCallBackend(CeedGetData(ceed_delegate, &delegate_data)); delegate_data->sycl_device = q->get_device(); delegate_data->sycl_context = q->get_context(); @@ -113,16 +115,15 @@ int CeedSetStream_Sycl(Ceed ceed, void *handle) { } // Set queue and context for Ceed Fallback object - Ceed ceed_fallback = NULL; CeedGetOperatorFallbackCeed(ceed, &ceed_fallback); if (ceed_fallback) { Ceed_Sycl *fallback_data; + CeedCallBackend(CeedGetData(ceed_fallback, &fallback_data)); fallback_data->sycl_device = q->get_device(); fallback_data->sycl_context = q->get_context(); fallback_data->sycl_queue = *q; } - return CEED_ERROR_SUCCESS; } diff --git a/backends/sycl/ceed-sycl-compile.sycl.cpp b/backends/sycl/ceed-sycl-compile.sycl.cpp index 845c46b5c5..50a00edc55 100644 --- a/backends/sycl/ceed-sycl-compile.sycl.cpp +++ b/backends/sycl/ceed-sycl-compile.sycl.cpp @@ -28,14 +28,15 @@ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_sourc const std::map &constants = {}) { std::ostringstream oss; + char *jit_defs_path, *jit_defs_source; + const char *sycl_jith_path = "ceed/jit-source/sycl/sycl-jit.h"; + // Prepend defined constants for (const auto &[name, value] : constants) { oss << "#define " << name << " " << value << "\n"; } // libCeed definitions for Sycl Backends - char *jit_defs_path, *jit_defs_source; - const char *sycl_jith_path = "ceed/jit-source/sycl/sycl-jit.h"; CeedCallBackend(CeedGetJitAbsolutePath(ceed, sycl_jith_path, &jit_defs_path)); CeedCallBackend(CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source)); @@ -48,7 +49,6 @@ static int CeedJitAddDefinitions_Sycl(Ceed ceed, const std::string &kernel_sourc oss << "\n" << kernel_source; jit_source = oss.str(); - return CEED_ERROR_SUCCESS; } @@ -97,9 +97,10 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con 
if (ZE_RESULT_SUCCESS != lz_err) { size_t log_size = 0; + char *log_message; + zeModuleBuildLogGetString(lz_log, &log_size, nullptr); - char *log_message; CeedCall(CeedCalloc(log_size, &log_message)); zeModuleBuildLogGetString(lz_log, &log_size, log_message); @@ -109,7 +110,6 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con // sycl make_ only throws errors for backend mismatch--assume we have vetted this already *sycl_module = new SyclModule_t(sycl::make_kernel_bundle( {lz_module, sycl::ext::oneapi::level_zero::ownership::transfer}, sycl_context)); - return CEED_ERROR_SUCCESS; } @@ -117,20 +117,16 @@ static int CeedLoadModule_Sycl(Ceed ceed, const sycl::context &sycl_context, con // Compile kernel source to an executable `sycl::kernel_bundle` // ------------------------------------------------------------------------------ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule_t **sycl_module, const std::map &constants) { - Ceed_Sycl *data; - CeedCallBackend(CeedGetData(ceed, &data)); + Ceed_Sycl *data; + std::string jit_source; + std::vector flags; + ByteVector_t il_binary; - std::string jit_source; + CeedCallBackend(CeedGetData(ceed, &data)); CeedCallBackend(CeedJitAddDefinitions_Sycl(ceed, kernel_source, jit_source, constants)); - - std::vector flags; CeedCallBackend(CeedJitGetFlags_Sycl(flags)); - - ByteVector_t il_binary; CeedCallBackend(CeedJitCompileSource_Sycl(ceed, data->sycl_device, jit_source, il_binary, flags)); - CeedCallBackend(CeedLoadModule_Sycl(ceed, data->sycl_context, data->sycl_device, il_binary, sycl_module)); - return CEED_ERROR_SUCCESS; } @@ -141,6 +137,7 @@ int CeedBuildModule_Sycl(Ceed ceed, const std::string &kernel_source, SyclModule // ------------------------------------------------------------------------------ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::string &kernel_name, sycl::kernel **sycl_kernel) { Ceed_Sycl *data; + 
CeedCallBackend(CeedGetData(ceed, &data)); // sycl::get_native returns std::vector for lz backend @@ -157,7 +154,6 @@ int CeedGetKernel_Sycl(Ceed ceed, const SyclModule_t *sycl_module, const std::st *sycl_kernel = new sycl::kernel(sycl::make_kernel( {*sycl_module, lz_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, data->sycl_context)); - return CEED_ERROR_SUCCESS; } @@ -173,6 +169,7 @@ int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_s //----------- // Order queue Ceed_Sycl *ceed_Sycl; + CeedCallBackend(CeedGetData(ceed, &ceed_Sycl)); sycl::event e = ceed_Sycl->sycl_queue.ext_oneapi_submit_barrier(); @@ -181,6 +178,5 @@ int CeedRunKernelDimSharedSycl(Ceed ceed, sycl::kernel *kernel, const int grid_s cgh.set_args(*kernel_args); cgh.parallel_for(kernel_range, *kernel); }); - return CEED_ERROR_SUCCESS; } diff --git a/backends/xsmm/ceed-xsmm-blocked.c b/backends/xsmm/ceed-xsmm-blocked.c index cc9b9399f0..f77dac350b 100644 --- a/backends/xsmm/ceed-xsmm-blocked.c +++ b/backends/xsmm/ceed-xsmm-blocked.c @@ -16,17 +16,17 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Xsmm_Blocked(const char *resource, Ceed ceed) { + Ceed ceed_ref; + CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/xsmm") || !strcmp(resource, "/cpu/self/xsmm/blocked"), ceed, CEED_ERROR_BACKEND, "blocked libXSMM backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/xsmm/ceed-xsmm-serial.c b/backends/xsmm/ceed-xsmm-serial.c index 
51d4c1e461..75e725ea7c 100644 --- a/backends/xsmm/ceed-xsmm-serial.c +++ b/backends/xsmm/ceed-xsmm-serial.c @@ -16,17 +16,17 @@ // Backend Init //------------------------------------------------------------------------------ static int CeedInit_Xsmm_Serial(const char *resource, Ceed ceed) { + Ceed ceed_ref; + CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/xsmm/serial"), ceed, CEED_ERROR_BACKEND, "serial libXSMM backend cannot use resource: %s", resource); CeedCallBackend(CeedSetDeterministic(ceed, true)); // Create reference Ceed that implementation will be dispatched through unless overridden - Ceed ceed_ref; CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref)); CeedCallBackend(CeedSetDelegate(ceed, ceed_ref)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_Xsmm)); - return CEED_ERROR_SUCCESS; } diff --git a/backends/xsmm/ceed-xsmm-tensor.c b/backends/xsmm/ceed-xsmm-tensor.c index 888cd22970..63c374ba4b 100644 --- a/backends/xsmm/ceed-xsmm-tensor.c +++ b/backends/xsmm/ceed-xsmm-tensor.c @@ -17,6 +17,7 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v) { Ceed ceed; + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); if (C == 1) { @@ -30,10 +31,11 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, : libxsmm_create_gemm_shape(J, A, B, !t_mode ? 
B : J, B, J, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + libxsmm_gemm_param gemm_param; + CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); // Run kernel - libxsmm_gemm_param gemm_param; gemm_param.a.primary = (CeedScalar *)&t[0]; gemm_param.b.primary = (CeedScalar *)&u[0]; gemm_param.c.primary = (CeedScalar *)&v[0]; @@ -49,10 +51,11 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, : libxsmm_create_gemm_shape(C, J, B, C, !t_mode ? B : J, C, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32); const libxsmm_gemmfunction kernel = libxsmm_dispatch_gemm_v2(gemm_shape, (libxsmm_bitfield)(flags), (libxsmm_bitfield)LIBXSMM_GEMM_PREFETCH_NONE); + libxsmm_gemm_param gemm_param; + CeedCheck(kernel, ceed, CEED_ERROR_BACKEND, "LIBXSMM kernel failed to build."); // Run kernel - libxsmm_gemm_param gemm_param; gemm_param.b.primary = (CeedScalar *)&t[0]; for (CeedInt a = 0; a < A; a++) { gemm_param.a.primary = (CeedScalar *)&u[a * B * C]; @@ -60,7 +63,6 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, kernel(&gemm_param); } } - return CEED_ERROR_SUCCESS; } @@ -69,10 +71,9 @@ static int CeedTensorContractApply_Xsmm(CeedTensorContract contract, CeedInt A, //------------------------------------------------------------------------------ int CeedTensorContractCreate_Xsmm(CeedBasis basis, CeedTensorContract contract) { Ceed ceed; - CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); + CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed)); CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Xsmm)); - return CEED_ERROR_SUCCESS; } diff --git a/examples/fluids/qfunctions/setupgeo_helpers.h 
b/examples/fluids/qfunctions/setupgeo_helpers.h index 5c8de0a387..b6f2a07d28 100644 --- a/examples/fluids/qfunctions/setupgeo_helpers.h +++ b/examples/fluids/qfunctions/setupgeo_helpers.h @@ -100,7 +100,7 @@ CEED_QFUNCTION_HELPER void InvertMappingJacobian_3D(CeedInt Q, CeedInt i, const * @param[out] normal Inverse of mapping Jacobian at quadrature point i * @param[out] detJ_ptr Determinate of the Jacobian, may be NULL is not desired */ -CEED_QFUNCTION_HELPER void NormalVectorFromdxdX_3D(CeedInt Q, CeedInt i, const CeedScalar dxdX_q[][3][CEED_Q_VLA], CeedScalar normal[3], +CEED_QFUNCTION_HELPER void NormalVectorFromdxdX_3D(CeedInt Q, CeedInt i, const CeedScalar (*dxdX_q)[3][CEED_Q_VLA], CeedScalar normal[3], CeedScalar *detJ_ptr) { const CeedScalar dxdX[3][2] = { {dxdX_q[0][0][i], dxdX_q[1][0][i]}, diff --git a/examples/fluids/qfunctions/sgs_dd_utils.h b/examples/fluids/qfunctions/sgs_dd_utils.h index e79dc6fdcd..29cca1628e 100644 --- a/examples/fluids/qfunctions/sgs_dd_utils.h +++ b/examples/fluids/qfunctions/sgs_dd_utils.h @@ -51,7 +51,7 @@ CEED_QFUNCTION_HELPER void OrientBasisWithVector(CeedScalar basis[3][3], const C } // @brief Denormalize outputs using min-max (de-)normalization -CEED_QFUNCTION_HELPER void DenormalizeDDOutputs(CeedScalar output[6], const CeedScalar new_bounds[6][2], const CeedScalar old_bounds[6][2]) { +CEED_QFUNCTION_HELPER void DenormalizeDDOutputs(CeedScalar output[6], const CeedScalar (*new_bounds)[2], const CeedScalar old_bounds[6][2]) { CeedScalar bounds_ratio; for (int i = 0; i < 6; i++) { bounds_ratio = (new_bounds[i][1] - new_bounds[i][0]) / (old_bounds[i][1] - old_bounds[i][0]); @@ -118,7 +118,7 @@ CEED_QFUNCTION_HELPER void ComputeSGS_DDAnisotropicInputs(const CeedScalar grad_ * @param[out] kmsgs_stress Physical SGS stresses in Kelvin-Mandel notation */ CEED_QFUNCTION_HELPER void ComputeSGS_DDAnisotropicOutputs(CeedScalar outputs[6], const CeedScalar delta, const CeedScalar eigenvectors[3][3], - const CeedScalar new_bounds[6][2], 
const CeedScalar grad_velo_magnitude, + const CeedScalar (*new_bounds)[2], const CeedScalar grad_velo_magnitude, CeedScalar kmsgs_stress[6]) { CeedScalar old_bounds[6][2] = {{0}}; for (int j = 0; j < 6; j++) old_bounds[j][1] = 1; diff --git a/examples/rust/ex1-volume/src/main.rs b/examples/rust/ex1-volume/src/main.rs index ba5c763c0d..ca7554384a 100644 --- a/examples/rust/ex1-volume/src/main.rs +++ b/examples/rust/ex1-volume/src/main.rs @@ -97,12 +97,12 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { // Build ElemRestriction objects describing the mesh and solution discrete // representations - let (restr_mesh, _) = + let (rstr_mesh, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?; - let (restr_solution, restr_qdata) = + let (rstr_solution, rstr_qdata) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?; - let mesh_size = restr_mesh.lvector_size(); - let solution_size = restr_solution.lvector_size(); + let mesh_size = rstr_mesh.lvector_size(); + let solution_size = rstr_solution.lvector_size(); if !quiet { println!("Number of mesh nodes : {}", mesh_size / dim); println!("Number of solution nodes : {}", solution_size); @@ -178,14 +178,14 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { let op_build = ceed .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)? .name("build qdata")? - .field("dx", &restr_mesh, &basis_mesh, VectorOpt::Active)? + .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)? .field( "weights", ElemRestrictionOpt::None, &basis_mesh, VectorOpt::None, )? - .field("qdata", &restr_qdata, BasisOpt::None, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)? .check()?; // Compute the quadrature data for the mass operator @@ -222,9 +222,9 @@ fn example_1(options: opt::Opt) -> libceed::Result<()> { let op_mass = ceed .operator(qf_mass, QFunctionOpt::None, QFunctionOpt::None)? .name("mass")? 
- .field("u", &restr_solution, &basis_solution, VectorOpt::Active)? - .field("qdata", &restr_qdata, BasisOpt::None, &qdata)? - .field("v", &restr_solution, &basis_solution, VectorOpt::Active)? + .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)? + .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)? .check()?; // Solution vectors diff --git a/examples/rust/ex2-surface/src/main.rs b/examples/rust/ex2-surface/src/main.rs index e495b6a904..8b167911da 100644 --- a/examples/rust/ex2-surface/src/main.rs +++ b/examples/rust/ex2-surface/src/main.rs @@ -103,9 +103,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { // Build ElemRestriction objects describing the mesh and solution discrete // representations - let (restr_mesh, _) = + let (rstr_mesh, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?; - let (_, restr_qdata) = mesh::build_cartesian_restriction( + let (_, rstr_qdata) = mesh::build_cartesian_restriction( &ceed, dim, num_xyz, @@ -114,10 +114,10 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { num_qpts, )?; - let (restr_solution, _) = + let (rstr_solution, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?; - let mesh_size = restr_mesh.lvector_size(); - let solution_size = restr_solution.lvector_size(); + let mesh_size = rstr_mesh.lvector_size(); + let solution_size = rstr_solution.lvector_size(); if !quiet { println!("Number of mesh nodes : {}", mesh_size / dim); println!("Number of solution nodes : {}", solution_size); @@ -221,14 +221,14 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { let op_build = ceed .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)? .name("build qdata")? - .field("dx", &restr_mesh, &basis_mesh, VectorOpt::Active)? + .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)? 
.field( "weights", ElemRestrictionOpt::None, &basis_mesh, VectorOpt::None, )? - .field("qdata", &restr_qdata, BasisOpt::None, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)? .check()?; // Compute the quadrature data for the diff operator @@ -302,9 +302,9 @@ fn example_2(options: opt::Opt) -> libceed::Result<()> { let op_diff = ceed .operator(qf_diff, QFunctionOpt::None, QFunctionOpt::None)? .name("Poisson")? - .field("du", &restr_solution, &basis_solution, VectorOpt::Active)? - .field("qdata", &restr_qdata, BasisOpt::None, &qdata)? - .field("dv", &restr_solution, &basis_solution, VectorOpt::Active)? + .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)? + .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)? .check()?; // Solution vectors diff --git a/examples/rust/ex3-vector-volume/src/main.rs b/examples/rust/ex3-vector-volume/src/main.rs index 8982aa634a..9abfa4d558 100644 --- a/examples/rust/ex3-vector-volume/src/main.rs +++ b/examples/rust/ex3-vector-volume/src/main.rs @@ -105,14 +105,14 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { // Build ElemRestriction objects describing the mesh and solution discrete // representations - let (restr_mesh, _) = + let (rstr_mesh, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?; - let (_, restr_qdata) = + let (_, rstr_qdata) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, 1, num_qpts)?; - let (restr_solution, _) = + let (rstr_solution, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, ncomp_u, num_qpts)?; - let mesh_size = restr_mesh.lvector_size(); - let solution_size = restr_solution.lvector_size(); + let mesh_size = rstr_mesh.lvector_size(); + let solution_size = rstr_solution.lvector_size(); if !quiet { println!("Number of mesh nodes : {}", mesh_size / dim); println!("Number of solution nodes : 
{}", solution_size); @@ -188,14 +188,14 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { let op_build = ceed .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)? .name("build qdata")? - .field("dx", &restr_mesh, &basis_mesh, VectorOpt::Active)? + .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)? .field( "weights", ElemRestrictionOpt::None, &basis_mesh, VectorOpt::None, )? - .field("qdata", &restr_qdata, BasisOpt::None, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)? .check()?; // Compute the quadrature data for the mass operator @@ -236,9 +236,9 @@ fn example_3(options: opt::Opt) -> libceed::Result<()> { let op_mass = ceed .operator(qf_mass, QFunctionOpt::None, QFunctionOpt::None)? .name("mass")? - .field("u", &restr_solution, &basis_solution, VectorOpt::Active)? - .field("qdata", &restr_qdata, BasisOpt::None, &qdata)? - .field("v", &restr_solution, &basis_solution, VectorOpt::Active)? + .field("u", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)? + .field("v", &rstr_solution, &basis_solution, VectorOpt::Active)? 
.check()?; // Solution vectors diff --git a/examples/rust/ex4-vector-surface/src/main.rs b/examples/rust/ex4-vector-surface/src/main.rs index f5c86b593a..93408e02e1 100644 --- a/examples/rust/ex4-vector-surface/src/main.rs +++ b/examples/rust/ex4-vector-surface/src/main.rs @@ -110,9 +110,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { // Build ElemRestriction objects describing the mesh and solution discrete // representations - let (restr_mesh, _) = + let (rstr_mesh, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, mesh_degree, ncomp_x, num_qpts)?; - let (_, restr_qdata) = mesh::build_cartesian_restriction( + let (_, rstr_qdata) = mesh::build_cartesian_restriction( &ceed, dim, num_xyz, @@ -121,10 +121,10 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { num_qpts, )?; - let (restr_solution, _) = + let (rstr_solution, _) = mesh::build_cartesian_restriction(&ceed, dim, num_xyz, solution_degree, ncomp_u, num_qpts)?; - let mesh_size = restr_mesh.lvector_size(); - let solution_size = restr_solution.lvector_size(); + let mesh_size = rstr_mesh.lvector_size(); + let solution_size = rstr_solution.lvector_size(); if !quiet { println!("Number of mesh nodes : {}", mesh_size / dim); println!("Number of solution nodes : {}", solution_size); @@ -228,14 +228,14 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { let op_build = ceed .operator(qf_build, QFunctionOpt::None, QFunctionOpt::None)? .name("build qdata")? - .field("dx", &restr_mesh, &basis_mesh, VectorOpt::Active)? + .field("dx", &rstr_mesh, &basis_mesh, VectorOpt::Active)? .field( "weights", ElemRestrictionOpt::None, &basis_mesh, VectorOpt::None, )? - .field("qdata", &restr_qdata, BasisOpt::None, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, VectorOpt::Active)? 
.check()?; // Compute the quadrature data for the diff operator @@ -323,9 +323,9 @@ fn example_4(options: opt::Opt) -> libceed::Result<()> { let op_diff = ceed .operator(qf_diff, QFunctionOpt::None, QFunctionOpt::None)? .name("Poisson")? - .field("du", &restr_solution, &basis_solution, VectorOpt::Active)? - .field("qdata", &restr_qdata, BasisOpt::None, &qdata)? - .field("dv", &restr_solution, &basis_solution, VectorOpt::Active)? + .field("du", &rstr_solution, &basis_solution, VectorOpt::Active)? + .field("qdata", &rstr_qdata, BasisOpt::None, &qdata)? + .field("dv", &rstr_solution, &basis_solution, VectorOpt::Active)? .check()?; // Solution vectors diff --git a/examples/rust/mesh/src/lib.rs b/examples/rust/mesh/src/lib.rs index 04d6ebd232..0731c8eb8e 100644 --- a/examples/rust/mesh/src/lib.rs +++ b/examples/rust/mesh/src/lib.rs @@ -85,7 +85,7 @@ pub fn build_cartesian_restriction( } // Mesh/solution data restriction - let restr = ceed.elem_restriction( + let rstr = ceed.elem_restriction( num_elem, num_nodes, num_comp, @@ -96,14 +96,14 @@ pub fn build_cartesian_restriction( )?; // Quadratue data restriction - let restr_qdata = ceed.strided_elem_restriction( + let rstr_qdata = ceed.strided_elem_restriction( num_elem, elem_qpts, num_comp, num_comp * elem_qpts * num_elem, CEED_STRIDES_BACKEND, )?; - Ok((restr, restr_qdata)) + Ok((rstr, rstr_qdata)) } // ---------------------------------------------------------------------------- diff --git a/include/ceed/backend.h b/include/ceed/backend.h index 0d2f1f8c32..2fd483a27f 100644 --- a/include/ceed/backend.h +++ b/include/ceed/backend.h @@ -219,7 +219,7 @@ CEED_EXTERN int CeedGetDelegate(Ceed ceed, Ceed *delegate); CEED_EXTERN int CeedSetDelegate(Ceed ceed, Ceed delegate); CEED_EXTERN int CeedGetObjectDelegate(Ceed ceed, Ceed *delegate, const char *obj_name); CEED_EXTERN int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name); -CEED_EXTERN int CeedGetOperatorFallbackResource(Ceed ceed, const char 
**resource); +CEED_EXTERN int CeedGetOperatorfallback_resource(Ceed ceed, const char **resource); CEED_EXTERN int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed); CEED_EXTERN int CeedSetOperatorFallbackResource(Ceed ceed, const char *resource); CEED_EXTERN int CeedSetDeterministic(Ceed ceed, bool is_deterministic); diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h index b8db4f2a6a..d6634b4bd7 100644 --- a/include/ceed/jit-source/cuda/cuda-gen-templates.h +++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h @@ -27,50 +27,54 @@ inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__res //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, +template +inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if (data.t_id_x < P1d) { + if (data.t_id_x < P_1d) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + const CeedInt ind = indices[node + elem * P_1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } } //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template +template inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if 
(data.t_id_x < P1d) { + if (data.t_id_x < P_1d) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; } } //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, +template +inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { - if (data.t_id_x < P1d) { + if (data.t_id_x < P_1d) { const CeedInt node = data.t_id_x; - const CeedInt ind = indices[node + elem * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); + const CeedInt ind = indices[node + elem * P_1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); } } //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template +template inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { - if (data.t_id_x < P1d) { + if (data.t_id_x < P_1d) { const CeedInt node = data.t_id_x; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; } } @@ -81,50 +85,54 
@@ inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt e //------------------------------------------------------------------------------ // L-vector -> E-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, +template +inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d; - const CeedInt ind = indices[node + elem * P1d * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + if (data.t_id_x < P_1d && data.t_id_y < P_1d) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d; + const CeedInt ind = indices[node + elem * P_1d * P_1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } } //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template +template inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d; + if (data.t_id_x < P_1d && data.t_id_y < P_1d) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; } } 
//------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, +template +inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { - if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d; - const CeedInt ind = indices[node + elem * P1d * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]); + if (data.t_id_x < P_1d && data.t_id_y < P_1d) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d; + const CeedInt ind = indices[node + elem * P_1d * P_1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); } } //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template +template inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { - if (data.t_id_x < P1d && data.t_id_y < P1d) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d; + if (data.t_id_x < P_1d && data.t_id_y < P_1d) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; } } @@ -142,105 +150,110 @@ inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const 
CeedInt e // - readSliceQuadsStrided3d -> readSliceStrided3d ? // - writeDofsOffset3d -> writeOffset3d ? // - writeDofsStrided3d -> writeStrided3d ? -template -inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, +template +inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if (data.t_id_x < P1d && data.t_id_y < P1d) - for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; - const CeedInt ind = indices[node + elem * P1d * P1d * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + COMPSTRIDE * comp]; + if (data.t_id_x < P_1d && data.t_id_y < P_1d) + for (CeedInt z = 0; z < P_1d; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; + const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp]; } } //------------------------------------------------------------------------------ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ -template +template inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if (data.t_id_x < P1d && data.t_id_y < P1d) - for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; + if (data.t_id_x < P_1d && data.t_id_y < P_1d) + for (CeedInt z = 0; z < P_1d; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + comp * 
STRIDES_COMP]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP]; } } //------------------------------------------------------------------------------ // E-vector -> Q-vector, offests provided //------------------------------------------------------------------------------ -template +template inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if (data.t_id_x < Q1d && data.t_id_y < Q1d) { - const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d; - const CeedInt ind = indices[node + elem * Q1d * Q1d * Q1d]; - ; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp]; + if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { + const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d; + const CeedInt ind = indices[node + elem * Q_1d * Q_1d * Q_1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; } } //------------------------------------------------------------------------------ // E-vector -> Q-vector, strided //------------------------------------------------------------------------------ -template +template inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { - if (data.t_id_x < Q1d && data.t_id_y < Q1d) { - const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d; + if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { + const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * 
STRIDES_COMP]; } } //------------------------------------------------------------------------------ // E-vector -> L-vector, offsets provided //------------------------------------------------------------------------------ -template -inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices, +template +inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, const CeedScalar *r_v, CeedScalar *d_v) { - if (data.t_id_x < P1d && data.t_id_y < P1d) - for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; - const CeedInt ind = indices[node + elem * P1d * P1d * P1d]; - for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z + comp * P1d]); + if (data.t_id_x < P_1d && data.t_id_y < P_1d) + for (CeedInt z = 0; z < P_1d; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; + const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]); } } //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ -template +template inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) { - if (data.t_id_x < P1d && data.t_id_y < P1d) - for (CeedInt z = 0; z < P1d; ++z) { - const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d; + if (data.t_id_x < P_1d && data.t_id_y < P_1d) + for (CeedInt z = 0; z < P_1d; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; - for (CeedInt comp = 0; 
comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P1d]; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d]; } } //------------------------------------------------------------------------------ // 3D collocated derivatives computation //------------------------------------------------------------------------------ -template +template inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { - if (data.t_id_x < Q1d && data.t_id_y < Q1d) { - for (CeedInt comp = 0; comp < NCOMP; ++comp) { - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q1d]; + if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d]; __syncthreads(); // X derivative - r_V[comp + 0 * NCOMP] = 0.0; - for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp + 0 * NCOMP] += c_G[i + data.t_id_x * Q1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) + r_V[comp + 0 * NUM_COMP] = 0.0; + for (CeedInt i = 0; i < Q_1d; i++) + r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) // Y derivative - r_V[comp + 1 * NCOMP] = 0.0; - for (CeedInt i = 0; i < Q1d; ++i) - r_V[comp + 1 * NCOMP] += c_G[i + data.t_id_y * Q1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) + r_V[comp + 1 * NUM_COMP] = 0.0; + for (CeedInt i = 0; i < Q_1d; i++) + r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) // Z derivative - r_V[comp + 2 * NCOMP] = 0.0; - for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 2 * NCOMP] += c_G[i + q * Q1d] * r_U[i + comp * Q1d]; // Contract z direction (Z derivative) + r_V[comp + 2 * NUM_COMP] = 0.0; + 
for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d]; // Contract z direction (Z derivative) __syncthreads(); } } @@ -249,26 +262,26 @@ inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const //------------------------------------------------------------------------------ // 3D collocated derivatives transpose //------------------------------------------------------------------------------ -template +template inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) { - if (data.t_id_x < Q1d && data.t_id_y < Q1d) { - for (CeedInt comp = 0; comp < NCOMP; ++comp) { + if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { // X derivative - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NCOMP]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; __syncthreads(); - for (CeedInt i = 0; i < Q1d; ++i) - r_V[q + comp * Q1d] += c_G[data.t_id_x + i * Q1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) + for (CeedInt i = 0; i < Q_1d; i++) + r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) __syncthreads(); // Y derivative - data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NCOMP]; + data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; __syncthreads(); - for (CeedInt i = 0; i < Q1d; ++i) - r_V[q + comp * Q1d] += c_G[data.t_id_y + i * Q1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) + for (CeedInt i = 0; i < Q_1d; i++) + r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) __syncthreads(); // Z derivative - for (CeedInt i = 0; i < Q1d; ++i) - r_V[i + comp * Q1d] += 
c_G[i + q * Q1d] * r_U[comp + 2 * NCOMP]; // PARTIAL contract z direction (Z derivative) + for (CeedInt i = 0; i < Q_1d; i++) + r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP]; // PARTIAL contract z direction (Z derivative) } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h index 31f3c11e16..484d755f77 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h @@ -21,8 +21,7 @@ //------------------------------------------------------------------------------ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { - const CeedInt t_id = threadIdx.x; - + const CeedInt t_id = threadIdx.x; const CeedScalar *U; CeedScalar V; // TODO load B in shared memory if blockDim.z > 1? @@ -51,8 +50,7 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos //------------------------------------------------------------------------------ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { - const CeedInt t_id = threadIdx.x; - + const CeedInt t_id = threadIdx.x; const CeedScalar *U; // TODO load G in shared memory if blockDim.z > 1? @@ -88,6 +86,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, //------------------------------------------------------------------------------ extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) { const CeedInt t_id = threadIdx.x; + // TODO load q_weight in shared memory if blockDim.z > 1? 
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h index 6a1f880bf5..26053c7a48 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h +++ b/include/ceed/jit-source/cuda/cuda-ref-basis-tensor.h @@ -46,11 +46,13 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; + for (CeedInt k = i; k < u_size; k += blockDim.x) { s_buffer_1[k] = cur_u[k]; } CeedInt pre = u_size; CeedInt post = 1; + for (CeedInt d = 0; d < BASIS_DIM; d++) { __syncthreads(); // Update buffers used @@ -67,10 +69,8 @@ extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpos CeedScalar vk = 0; for (CeedInt b = 0; b < P; b++) vk += s_interp_1d[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; - out[k] = vk; } - post *= Q; } } @@ -114,6 +114,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, CeedInt post = 1; const CeedScalar *cur_u = u + elem * u_stride + dim_1 * u_dim_stride + comp * u_comp_stride; CeedScalar *cur_v = v + elem * v_stride + dim_1 * v_dim_stride + comp * v_comp_stride; + for (CeedInt dim_2 = 0; dim_2 < BASIS_DIM; dim_2++) { __syncthreads(); // Update buffers used @@ -129,12 +130,12 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedInt j = (k / post) % Q; const CeedInt a = k / (post * Q); CeedScalar v_k = 0; + for (CeedInt b = 0; b < P; b++) v_k += op[j * stride_0 + b * stride_1] * in[(a * P + b) * post + c]; if (transpose && dim_2 == BASIS_DIM - 1) out[k] += v_k; else out[k] = v_k; } - post *= Q; } } @@ -147,8 +148,10 @@ extern 
"C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, //------------------------------------------------------------------------------ __device__ void Weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { const CeedInt i = threadIdx.x; + if (i < BASIS_Q_1D) { const size_t elem = blockIdx.x; + if (elem < num_elem) w[elem * BASIS_Q_1D + i] = q_weight_1d[i]; } } @@ -159,11 +162,14 @@ __device__ void Weight1d(const CeedInt num_elem, const CeedScalar *q_weight_1d, __device__ void Weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { const CeedInt i = threadIdx.x; const CeedInt j = threadIdx.y; + if (i < BASIS_Q_1D && j < BASIS_Q_1D) { const size_t elem = blockIdx.x; + if (elem < num_elem) { const size_t ind = (elem * BASIS_Q_1D + j) * BASIS_Q_1D + i; - w[ind] = q_weight_1d[i] * q_weight_1d[j]; + + w[ind] = q_weight_1d[i] * q_weight_1d[j]; } } } @@ -174,12 +180,15 @@ __device__ void Weight2d(const CeedInt num_elem, const CeedScalar *q_weight_1d, __device__ void Weight3d(const CeedInt num_elem, const CeedScalar *q_weight_1d, CeedScalar *w) { const CeedInt i = threadIdx.x; const CeedInt j = threadIdx.y; + if (i < BASIS_Q_1D && j < BASIS_Q_1D) { const size_t elem = blockIdx.x; + if (elem < num_elem) { for (CeedInt k = 0; k < BASIS_Q_1D; k++) { const size_t ind = ((elem * BASIS_Q_1D + k) * BASIS_Q_1D + j) * BASIS_Q_1D + i; - w[ind] = q_weight_1d[i] * q_weight_1d[j] * q_weight_1d[k]; + + w[ind] = q_weight_1d[i] * q_weight_1d[j] * q_weight_1d[k]; } } } diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h index acde10fd3e..7c6f8789e8 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h @@ -12,7 +12,7 @@ #include -#if CEEDSIZE +#if USE_CEEDSIZE typedef CeedSize IndexType; #else typedef CeedInt IndexType; @@ -21,17 
+21,17 @@ typedef CeedInt IndexType; //------------------------------------------------------------------------------ // Get Basis Emode Pointer //------------------------------------------------------------------------------ -extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, +extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, const CeedScalar *interp, const CeedScalar *grad) { - switch (emode) { + switch (e_mode) { case CEED_EVAL_NONE: - *basisptr = identity; + *basis_ptr = identity; break; case CEED_EVAL_INTERP: - *basisptr = interp; + *basis_ptr = interp; break; case CEED_EVAL_GRAD: - *basisptr = grad; + *basis_ptr = grad; break; case CEED_EVAL_WEIGHT: case CEED_EVAL_DIV: @@ -43,49 +43,59 @@ extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **b //------------------------------------------------------------------------------ // Core code for diagonal assembly //------------------------------------------------------------------------------ -__device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const CeedScalar *identity, const CeedScalar *interpin, - const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, - const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { +__device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, const CeedScalar *interp_in, + const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, + const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, + CeedScalar *__restrict__ elem_diag_array) { const int tid = threadIdx.x; // running with P threads, tid is evec node - if (tid >= 
NNODES) return; + if (tid >= NUM_NODES) return; // Compute the diagonal of B^T D B // Each element - for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < nelem; e += gridDim.x * blockDim.z) { - IndexType dout = -1; + for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { + IndexType d_out = -1; + // Each basis eval mode pair - for (IndexType eout = 0; eout < NUMEMODEOUT; eout++) { - const CeedScalar *bt = NULL; - if (emodeout[eout] == CEED_EVAL_GRAD) dout += 1; - CeedOperatorGetBasisPointer_Cuda(&bt, emodeout[eout], identity, interpout, &gradout[dout * NQPTS * NNODES]); - IndexType din = -1; - for (IndexType ein = 0; ein < NUMEMODEIN; ein++) { + for (IndexType e_out = 0; e_out < NUM_E_MODE_OUT; e_out++) { + const CeedScalar *b_t = NULL; + + if (e_mode_out[e_out] == CEED_EVAL_GRAD) d_out += 1; + CeedOperatorGetBasisPointer_Cuda(&b_t, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * NUM_QPTS * NUM_NODES]); + IndexType d_in = -1; + + for (IndexType e_in = 0; e_in < NUM_E_MODE_IN; e_in++) { const CeedScalar *b = NULL; - if (emodein[ein] == CEED_EVAL_GRAD) din += 1; - CeedOperatorGetBasisPointer_Cuda(&b, emodein[ein], identity, interpin, &gradin[din * NQPTS * NNODES]); + + if (e_mode_in[e_in] == CEED_EVAL_GRAD) d_in += 1; + CeedOperatorGetBasisPointer_Cuda(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * NUM_QPTS * NUM_NODES]); // Each component - for (IndexType compOut = 0; compOut < NCOMP; compOut++) { + for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { // Each qpoint/node pair - if (pointBlock) { + if (is_point_block) { // Point Block Diagonal - for (IndexType compIn = 0; compIn < NCOMP; compIn++) { - CeedScalar evalue = 0.; - for (IndexType q = 0; q < NQPTS; q++) { - const CeedScalar qfvalue = - assembledqfarray[((((ein * NCOMP + compIn) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; - evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; + for 
(IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { + CeedScalar e_value = 0.; + + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } - elemdiagarray[((compOut * NCOMP + compIn) * nelem + e) * NNODES + tid] += evalue; + elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; } } else { // Diagonal Only - CeedScalar evalue = 0.; - for (IndexType q = 0; q < NQPTS; q++) { - const CeedScalar qfvalue = - assembledqfarray[((((ein * NCOMP + compOut) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; - evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; + CeedScalar e_value = 0.; + + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } - elemdiagarray[(compOut * nelem + e) * NNODES + tid] += evalue; + elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; } } } @@ -96,21 +106,21 @@ __device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const C //------------------------------------------------------------------------------ // Linear diagonal //------------------------------------------------------------------------------ -extern "C" __global__ void linearDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, const CeedScalar *gradin, - const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, - const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, - CeedScalar *__restrict__ elemdiagarray) { - diagonalCore(nelem, false, 
identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); +extern "C" __global__ void linearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, + const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, + const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, + CeedScalar *__restrict__ elem_diag_array) { + diagonalCore(num_elem, false, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); } //------------------------------------------------------------------------------ // Linear point block diagonal //------------------------------------------------------------------------------ -extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, - const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, - const CeedEvalMode *emodein, const CeedEvalMode *emodeout, - const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { - diagonalCore(nelem, true, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); +extern "C" __global__ void linearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, + const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, + const CeedEvalMode *e_mode_in, const CeedEvalMode *e_mode_out, + const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { + diagonalCore(num_elem, true, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); } //------------------------------------------------------------------------------ diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h 
b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h index 70f5267727..eeb256fed6 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h @@ -12,7 +12,7 @@ #include -#if CEEDSIZE +#if USE_CEEDSIZE typedef CeedSize IndexType; #else typedef CeedInt IndexType; @@ -32,34 +32,38 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, // comp_in, comp_out, node_row, node_col - const IndexType comp_out_stride = NNODES * NNODES; - const IndexType comp_in_stride = comp_out_stride * NCOMP; - const IndexType e_stride = comp_in_stride * NCOMP; - // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const IndexType qe_stride = NQPTS; - const IndexType qcomp_out_stride = NELEM * qe_stride; - const IndexType qemode_out_stride = qcomp_out_stride * NCOMP; - const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; - const IndexType qemode_in_stride = qcomp_in_stride * NCOMP; + const IndexType comp_out_stride = NUM_NODES * NUM_NODES; + const IndexType comp_in_stride = comp_out_stride * NUM_COMP; + const IndexType e_stride = comp_in_stride * NUM_COMP; + // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt + const IndexType q_e_stride = NUM_QPTS; + const IndexType q_comp_out_stride = NUM_ELEM * q_e_stride; + const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP; + const IndexType q_comp_in_stride = q_e_mode_out_stride * NUM_E_MODE_OUT; + const IndexType q_e_mode_in_stride = q_comp_in_stride * NUM_COMP; // Loop over each element (if necessary) - for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { - for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) { - for (IndexType comp_out = 0; comp_out < 
NCOMP; comp_out++) { + for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NUM_ELEM; e += gridDim.x * blockDim.z) { + for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { + for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { CeedScalar result = 0.0; - IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; - for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { - IndexType b_in_index = emode_in * NQPTS * NNODES; - for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { - IndexType b_out_index = emode_out * NQPTS * NNODES; - IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; + + for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) { + IndexType b_in_index = e_mode_in * NUM_QPTS * NUM_NODES; + + for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) { + IndexType b_out_index = e_mode_out * NUM_QPTS * NUM_NODES; + IndexType qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) - for (IndexType j = 0; j < NQPTS; j++) { - result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; + for (IndexType j = 0; j < NUM_QPTS; j++) { + result += B_out[b_out_index + j * NUM_NODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES + l]; } - } // end of emode_out - } // end of emode_in - IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; + } // end of e_mode_out + } // end of e_mode_in + IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NUM_NODES * i + l; + values_array[val_index] = result; } // end of out component } // end of in 
component @@ -79,35 +83,39 @@ extern "C" __launch_bounds__(BLOCK_SIZE) __global__ // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, // comp_in, comp_out, node_row, node_col - const IndexType comp_out_stride = NNODES * NNODES; - const IndexType comp_in_stride = comp_out_stride * NCOMP; - const IndexType e_stride = comp_in_stride * NCOMP; - // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt - const IndexType qe_stride = NQPTS; - const IndexType qcomp_out_stride = NELEM * qe_stride; - const IndexType qemode_out_stride = qcomp_out_stride * NCOMP; - const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; - const IndexType qemode_in_stride = qcomp_in_stride * NCOMP; + const IndexType comp_out_stride = NUM_NODES * NUM_NODES; + const IndexType comp_in_stride = comp_out_stride * NUM_COMP; + const IndexType e_stride = comp_in_stride * NUM_COMP; + // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt + const IndexType q_e_stride = NUM_QPTS; + const IndexType q_comp_out_stride = NUM_ELEM * q_e_stride; + const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP; + const IndexType q_comp_in_stride = q_e_mode_out_stride * NUM_E_MODE_OUT; + const IndexType q_e_mode_in_stride = q_comp_in_stride * NUM_COMP; // Loop over each element (if necessary) - for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { - for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) { - for (IndexType comp_out = 0; comp_out < NCOMP; comp_out++) { - for (IndexType i = 0; i < NNODES; i++) { + for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NUM_ELEM; e += gridDim.x * blockDim.z) { + for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { + for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { + for (IndexType i = 0; i < NUM_NODES; 
i++) { CeedScalar result = 0.0; - IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; - for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { - IndexType b_in_index = emode_in * NQPTS * NNODES; - for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { - IndexType b_out_index = emode_out * NQPTS * NNODES; - IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; + + for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) { + IndexType b_in_index = e_mode_in * NUM_QPTS * NUM_NODES; + + for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) { + IndexType b_out_index = e_mode_out * NUM_QPTS * NUM_NODES; + IndexType qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) - for (IndexType j = 0; j < NQPTS; j++) { - result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; + for (IndexType j = 0; j < NUM_QPTS; j++) { + result += B_out[b_out_index + j * NUM_NODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES + l]; } - } // end of emode_out - } // end of emode_in - IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; + } // end of e_mode_out + } // end of e_mode_in + IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NUM_NODES * i + l; + values_array[val_index] = result; } // end of loop over element node index, i } // end of out component diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction.h b/include/ceed/jit-source/cuda/cuda-ref-restriction.h index 96a19dc0eb..1df6f04901 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction.h +++ 
b/include/ceed/jit-source/cuda/cuda-ref-restriction.h @@ -16,13 +16,13 @@ // L-vector -> E-vector, strided //------------------------------------------------------------------------------ extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { - v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = - u[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM]; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = + u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; } } } @@ -31,13 +31,13 @@ extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const Ceed // E-vector -> L-vector, strided //------------------------------------------------------------------------------ extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const 
CeedInt elem = node / RSTR_ELEM_SIZE; - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { - v[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM] += - u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; } } } @@ -47,13 +47,13 @@ extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedSc //------------------------------------------------------------------------------ extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { - v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = u[ind + comp * RESTR_COMP_STRIDE]; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; } } } @@ -63,39 +63,39 @@ extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedI //------------------------------------------------------------------------------ extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar 
*__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RESTR_ELEM_SIZE; - const CeedInt elem = node / RESTR_ELEM_SIZE; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RESTR_COMP_STRIDE, u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]); + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); } } } extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RESTR_NUM_COMP]; + CeedScalar value[RSTR_NUM_COMP]; - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RESTR_NUM_NODES; i += blockDim.x * gridDim.x) { + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { const CeedInt ind = l_vec_indices[i]; const CeedInt range_1 = t_offsets[i]; const CeedInt range_N = t_offsets[i + 1]; - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) value[comp] = 0.0; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; for (CeedInt j = range_1; j < range_N; j++) { const CeedInt t_ind = t_indices[j]; - CeedInt loc_node = t_ind % RESTR_ELEM_SIZE; - CeedInt elem = t_ind / RESTR_ELEM_SIZE; + CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + CeedInt elem = t_ind / RSTR_ELEM_SIZE; - for (CeedInt comp = 0; comp < 
RESTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; } } - for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) v[ind + comp * RESTR_COMP_STRIDE] += value[comp]; + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; } } diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index b8a08d6bdb..2afeeb8e04 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -25,6 +25,7 @@ inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[comp] = d_u[ind + comp * strides_comp]; } @@ -40,6 +41,7 @@ inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedIn if (data.t_id_x < P_1D) { const CeedInt node = data.t_id_x; const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[comp]; } @@ -59,6 +61,7 @@ inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[comp] = d_u[ind + comp * strides_comp]; } @@ -74,6 +77,7 @@ inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedIn if (data.t_id_x < P_1D && data.t_id_y < P_1D) { const CeedInt 
node = data.t_id_x + data.t_id_y * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[comp]; } @@ -94,6 +98,7 @@ inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; } @@ -111,6 +116,7 @@ inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedIn for (CeedInt z = 0; z < P_1D; z++) { const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D; const CeedInt ind = node * strides_node + elem * strides_elem; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { d_v[ind + comp * strides_comp] = r_v[z + comp * P_1D]; } diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c index 305fdae3e7..1c85459aae 100644 --- a/interface/ceed-operator.c +++ b/interface/ceed-operator.c @@ -34,14 +34,14 @@ @ref Developer **/ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedElemRestriction r, CeedBasis b) { - CeedInt dim = 1, num_comp = 1, q_comp = 1, restr_num_comp = 1, size = qf_field->size; + CeedInt dim = 1, num_comp = 1, q_comp = 1, rstr_num_comp = 1, size = qf_field->size; CeedEvalMode eval_mode = qf_field->eval_mode; // Restriction CeedCheck((r == CEED_ELEMRESTRICTION_NONE) == (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_INCOMPATIBLE, "CEED_ELEMRESTRICTION_NONE and CEED_EVAL_WEIGHT must be used together."); if (r != CEED_ELEMRESTRICTION_NONE) { - CeedCall(CeedElemRestrictionGetNumComponents(r, &restr_num_comp)); + CeedCall(CeedElemRestrictionGetNumComponents(r, &rstr_num_comp)); } // Basis CeedCheck((b == CEED_BASIS_NONE) == (eval_mode == CEED_EVAL_NONE), ceed, CEED_ERROR_INCOMPATIBLE, @@ -50,17 
+50,17 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl CeedCall(CeedBasisGetDimension(b, &dim)); CeedCall(CeedBasisGetNumComponents(b, &num_comp)); CeedCall(CeedBasisGetNumQuadratureComponents(b, eval_mode, &q_comp)); - CeedCheck(r == CEED_ELEMRESTRICTION_NONE || restr_num_comp == num_comp, ceed, CEED_ERROR_DIMENSION, + CeedCheck(r == CEED_ELEMRESTRICTION_NONE || rstr_num_comp == num_comp, ceed, CEED_ERROR_DIMENSION, "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components, but Basis has %" CeedInt_FMT " components", - qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], restr_num_comp, num_comp); + qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], rstr_num_comp, num_comp); } // Field size switch (eval_mode) { case CEED_EVAL_NONE: - CeedCheck(size == restr_num_comp, ceed, CEED_ERROR_DIMENSION, + CeedCheck(size == rstr_num_comp, ceed, CEED_ERROR_DIMENSION, "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components", qf_field->field_name, - qf_field->size, CeedEvalModes[qf_field->eval_mode], restr_num_comp); + qf_field->size, CeedEvalModes[qf_field->eval_mode], rstr_num_comp); break; case CEED_EVAL_INTERP: case CEED_EVAL_GRAD: @@ -1192,10 +1192,10 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { // Input FLOPs for (CeedInt i = 0; i < num_input_fields; i++) { if (input_fields[i]->vec == CEED_VECTOR_ACTIVE) { - CeedSize restr_flops, basis_flops; + CeedSize rstr_flops, basis_flops; - CeedCall(CeedElemRestrictionGetFlopsEstimate(input_fields[i]->elem_rstr, CEED_NOTRANSPOSE, &restr_flops)); - *flops += restr_flops; + CeedCall(CeedElemRestrictionGetFlopsEstimate(input_fields[i]->elem_rstr, CEED_NOTRANSPOSE, &rstr_flops)); + *flops += rstr_flops; CeedCall(CeedBasisGetFlopsEstimate(input_fields[i]->basis, CEED_NOTRANSPOSE, op->qf->input_fields[i]->eval_mode, &basis_flops)); *flops += 
basis_flops * num_elem; } @@ -1213,10 +1213,10 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) { // Output FLOPs for (CeedInt i = 0; i < num_output_fields; i++) { if (output_fields[i]->vec == CEED_VECTOR_ACTIVE) { - CeedSize restr_flops, basis_flops; + CeedSize rstr_flops, basis_flops; - CeedCall(CeedElemRestrictionGetFlopsEstimate(output_fields[i]->elem_rstr, CEED_TRANSPOSE, &restr_flops)); - *flops += restr_flops; + CeedCall(CeedElemRestrictionGetFlopsEstimate(output_fields[i]->elem_rstr, CEED_TRANSPOSE, &rstr_flops)); + *flops += rstr_flops; CeedCall(CeedBasisGetFlopsEstimate(output_fields[i]->basis, CEED_TRANSPOSE, op->qf->output_fields[i]->eval_mode, &basis_flops)); *flops += basis_flops * num_elem; } diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c index 394d72a8dd..4d700ad16a 100644 --- a/interface/ceed-preconditioning.c +++ b/interface/ceed-preconditioning.c @@ -758,14 +758,14 @@ static int CeedSingleOperatorAssemblyCountEntries(CeedOperator op, CeedSize *num @param[in] basis_c_to_f Basis for coarse to fine interpolation, or NULL if not creating prolongation/restriction operators @param[out] op_coarse Coarse grid operator @param[out] op_prolong Coarse to fine operator, or NULL - @param[out] op_restrict Fine to coarse operator, or NULL + @param[out] op_rstrict Fine to coarse operator, or NULL @return An error code: 0 - success, otherwise - failure @ref Developer **/ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - CeedBasis basis_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) { + CeedBasis basis_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_rstrict) { bool is_composite; Ceed ceed; CeedInt num_comp; @@ -803,7 +803,7 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m 
CeedCall(CeedQFunctionAssemblyDataReferenceCopy(op_fine->qf_assembled, &(*op_coarse)->qf_assembled)); // Multiplicity vector - if (op_restrict || op_prolong) { + if (op_rstrict || op_prolong) { CeedVector mult_e_vec; CeedRestrictionType rstr_type; @@ -826,49 +826,49 @@ static int CeedSingleOperatorMultigridLevel(CeedOperator op_fine, CeedVector p_m size_t name_len = op_fine->name ? strlen(op_fine->name) : 0; CeedCall(CeedOperatorSetName(*op_coarse, op_fine->name)); - // Check that coarse to fine basis is provided if prolong/restrict operators are requested - CeedCheck(basis_c_to_f || (!op_restrict && !op_prolong), ceed, CEED_ERROR_INCOMPATIBLE, + // Check that coarse to fine basis is provided if prolong/rstrict operators are requested + CeedCheck(basis_c_to_f || (!op_rstrict && !op_prolong), ceed, CEED_ERROR_INCOMPATIBLE, "Prolongation or restriction operator creation requires coarse-to-fine basis"); // Restriction/Prolongation Operators CeedCall(CeedBasisGetNumComponents(basis_coarse, &num_comp)); // Restriction - if (op_restrict) { + if (op_rstrict) { CeedInt *num_comp_r_data; CeedQFunctionContext ctx_r; - CeedQFunction qf_restrict; + CeedQFunction qf_rstrict; - CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_restrict)); + CeedCall(CeedQFunctionCreateInteriorByName(ceed, "Scale", &qf_rstrict)); CeedCall(CeedCalloc(1, &num_comp_r_data)); num_comp_r_data[0] = num_comp; CeedCall(CeedQFunctionContextCreate(ceed, &ctx_r)); CeedCall(CeedQFunctionContextSetData(ctx_r, CEED_MEM_HOST, CEED_OWN_POINTER, sizeof(*num_comp_r_data), num_comp_r_data)); - CeedCall(CeedQFunctionSetContext(qf_restrict, ctx_r)); + CeedCall(CeedQFunctionSetContext(qf_rstrict, ctx_r)); CeedCall(CeedQFunctionContextDestroy(&ctx_r)); - CeedCall(CeedQFunctionAddInput(qf_restrict, "input", num_comp, CEED_EVAL_NONE)); - CeedCall(CeedQFunctionAddInput(qf_restrict, "scale", num_comp, CEED_EVAL_NONE)); - CeedCall(CeedQFunctionAddOutput(qf_restrict, "output", num_comp, CEED_EVAL_INTERP)); - 
CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_restrict, num_comp)); + CeedCall(CeedQFunctionAddInput(qf_rstrict, "input", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddInput(qf_rstrict, "scale", num_comp, CEED_EVAL_NONE)); + CeedCall(CeedQFunctionAddOutput(qf_rstrict, "output", num_comp, CEED_EVAL_INTERP)); + CeedCall(CeedQFunctionSetUserFlopsEstimate(qf_rstrict, num_comp)); - CeedCall(CeedOperatorCreate(ceed, qf_restrict, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, op_restrict)); - CeedCall(CeedOperatorSetField(*op_restrict, "input", rstr_fine, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - CeedCall(CeedOperatorSetField(*op_restrict, "scale", rstr_p_mult_fine, CEED_BASIS_NONE, mult_vec)); - CeedCall(CeedOperatorSetField(*op_restrict, "output", rstr_coarse, basis_c_to_f, CEED_VECTOR_ACTIVE)); + CeedCall(CeedOperatorCreate(ceed, qf_rstrict, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, op_rstrict)); + CeedCall(CeedOperatorSetField(*op_rstrict, "input", rstr_fine, CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + CeedCall(CeedOperatorSetField(*op_rstrict, "scale", rstr_p_mult_fine, CEED_BASIS_NONE, mult_vec)); + CeedCall(CeedOperatorSetField(*op_rstrict, "output", rstr_coarse, basis_c_to_f, CEED_VECTOR_ACTIVE)); // Set name char *restriction_name; CeedCall(CeedCalloc(17 + name_len, &restriction_name)); sprintf(restriction_name, "restriction%s%s", has_name ? " for " : "", has_name ? 
op_fine->name : ""); - CeedCall(CeedOperatorSetName(*op_restrict, restriction_name)); + CeedCall(CeedOperatorSetName(*op_rstrict, restriction_name)); CeedCall(CeedFree(&restriction_name)); // Check - CeedCall(CeedOperatorCheckReady(*op_restrict)); + CeedCall(CeedOperatorCheckReady(*op_rstrict)); // Cleanup - CeedCall(CeedQFunctionDestroy(&qf_restrict)); + CeedCall(CeedQFunctionDestroy(&qf_rstrict)); } // Prolongation @@ -2242,20 +2242,20 @@ grid interpolation @param[in] basis_coarse Coarse grid active vector basis @param[out] op_coarse Coarse grid operator @param[out] op_prolong Coarse to fine operator, or NULL - @param[out] op_restrict Fine to coarse operator, or NULL + @param[out] op_rstrict Fine to coarse operator, or NULL @return An error code: 0 - success, otherwise - failure @ref User **/ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_restrict) { + CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_rstrict) { CeedBasis basis_c_to_f = NULL; CeedCall(CeedOperatorCheckReady(op_fine)); // Build prolongation matrix, if required - if (op_prolong || op_restrict) { + if (op_prolong || op_rstrict) { CeedBasis basis_fine; CeedCall(CeedOperatorGetActiveBasis(op_fine, &basis_fine)); @@ -2263,7 +2263,7 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin } // Core code - CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); + CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_rstrict)); return CEED_ERROR_SUCCESS; } @@ -2279,7 +2279,7 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin @param[in] interp_c_to_f Matrix for coarse to fine interpolation, or NULL if not 
creating prolongation/restriction operators @param[out] op_coarse Coarse grid operator @param[out] op_prolong Coarse to fine operator, or NULL - @param[out] op_restrict Fine to coarse operator, or NULL + @param[out] op_rstrict Fine to coarse operator, or NULL @return An error code: 0 - success, otherwise - failure @@ -2287,7 +2287,7 @@ int CeedOperatorMultigridLevelCreate(CeedOperator op_fine, CeedVector p_mult_fin **/ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, - CeedOperator *op_restrict) { + CeedOperator *op_rstrict) { Ceed ceed; CeedInt Q_f, Q_c; CeedBasis basis_fine, basis_c_to_f = NULL; @@ -2302,7 +2302,7 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_ CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces"); // Create coarse to fine basis, if required - if (op_prolong || op_restrict) { + if (op_prolong || op_rstrict) { CeedInt dim, num_comp, num_nodes_c, P_1d_f, P_1d_c; CeedScalar *q_ref, *q_weight, *grad; @@ -2324,7 +2324,7 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_ } // Core code - CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); + CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_rstrict)); return CEED_ERROR_SUCCESS; } @@ -2340,15 +2340,14 @@ int CeedOperatorMultigridLevelCreateTensorH1(CeedOperator op_fine, CeedVector p_ @param[in] interp_c_to_f Matrix for coarse to fine interpolation, or NULL if not creating prolongation/restriction operators @param[out] op_coarse Coarse grid operator @param[out] op_prolong Coarse to fine operator, or NULL - @param[out] op_restrict Fine to coarse 
operator, or NULL + @param[out] op_rstrict Fine to coarse operator, or NULL @return An error code: 0 - success, otherwise - failure @ref User **/ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_fine, CeedElemRestriction rstr_coarse, CeedBasis basis_coarse, - const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, - CeedOperator *op_restrict) { + const CeedScalar *interp_c_to_f, CeedOperator *op_coarse, CeedOperator *op_prolong, CeedOperator *op_rstrict) { Ceed ceed; CeedInt Q_f, Q_c; CeedBasis basis_fine, basis_c_to_f = NULL; @@ -2363,7 +2362,7 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f CeedCheck(Q_f == Q_c, ceed, CEED_ERROR_DIMENSION, "Bases must have compatible quadrature spaces"); // Coarse to fine basis - if (op_prolong || op_restrict) { + if (op_prolong || op_rstrict) { CeedInt dim, num_comp, num_nodes_c, num_nodes_f; CeedScalar *q_ref, *q_weight, *grad; CeedElemTopology topo; @@ -2386,7 +2385,7 @@ int CeedOperatorMultigridLevelCreateH1(CeedOperator op_fine, CeedVector p_mult_f } // Core code - CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_restrict)); + CeedCall(CeedSingleOperatorMultigridLevel(op_fine, p_mult_fine, rstr_coarse, basis_coarse, basis_c_to_f, op_coarse, op_prolong, op_rstrict)); return CEED_ERROR_SUCCESS; } diff --git a/interface/ceed.c b/interface/ceed.c index 3abcd9f440..a40104cb06 100644 --- a/interface/ceed.c +++ b/interface/ceed.c @@ -463,7 +463,7 @@ int CeedSetObjectDelegate(Ceed ceed, Ceed delegate, const char *obj_name) { @ref Backend **/ -int CeedGetOperatorFallbackResource(Ceed ceed, const char **resource) { +int CeedGetOperatorfallback_resource(Ceed ceed, const char **resource) { *resource = (const char *)ceed->op_fallback_resource; return CEED_ERROR_SUCCESS; } @@ -491,7 +491,7 @@ int CeedGetOperatorFallbackCeed(Ceed ceed, Ceed *fallback_ceed) { Ceed 
fallback_ceed; const char *fallback_resource; - CeedCall(CeedGetOperatorFallbackResource(ceed, &fallback_resource)); + CeedCall(CeedGetOperatorfallback_resource(ceed, &fallback_resource)); CeedCall(CeedInit(fallback_resource, &fallback_ceed)); fallback_ceed->op_fallback_parent = ceed; fallback_ceed->Error = ceed->Error; @@ -866,8 +866,8 @@ int CeedInit(const char *resource, Ceed *ceed) { memcpy((*ceed)->f_offsets, f_offsets, sizeof(f_offsets)); // Set fallback for advanced CeedOperator functions - const char fallbackresource[] = ""; - CeedCall(CeedSetOperatorFallbackResource(*ceed, fallbackresource)); + const char fallback_resource[] = ""; + CeedCall(CeedSetOperatorFallbackResource(*ceed, fallback_resource)); // Record env variables CEED_DEBUG or DBG (*ceed)->is_debug = getenv("CEED_DEBUG") || getenv("DEBUG") || getenv("DBG"); diff --git a/julia/LibCEED.jl/examples/common.jl b/julia/LibCEED.jl/examples/common.jl index d4f2eb7deb..0e3636398e 100644 --- a/julia/LibCEED.jl/examples/common.jl +++ b/julia/LibCEED.jl/examples/common.jl @@ -21,8 +21,8 @@ function get_cartesian_mesh_size(dim, order, prob_size) end struct FormRestrictionMode{T} end -const RestrictionOnly = FormRestrictionMode{:restr}() -const StridedOnly = FormRestrictionMode{:restr_i}() +const RestrictionOnly = FormRestrictionMode{:rstr}() +const StridedOnly = FormRestrictionMode{:rstr_i}() const RestrictionAndStrided = FormRestrictionMode{:both}() function build_cartesian_restriction( @@ -44,14 +44,14 @@ function build_cartesian_restriction( scalar_size::CeedInt = prod(nd) size::CeedInt = scalar_size*ncomp - form_restr = (Mode() != StridedOnly) + form_rstr = (Mode() != StridedOnly) form_strided = (Mode() != RestrictionOnly) # elem: 0 1 n-1 # |---*-...-*---|---*-...-*---|- ... 
-|--...--| # nnodes: 0 1 p-1 p p+1 2*p n*p - if form_restr + if form_rstr el_nodes = zeros(CeedInt, num_elem*nnodes) exyz = zeros(CeedInt, dim) @inbounds @simd for e = 0:(num_elem-1) @@ -76,8 +76,8 @@ function build_cartesian_restriction( end end - restr = - form_restr ? + rstr = + form_rstr ? create_elem_restriction( c, num_elem, @@ -87,7 +87,7 @@ function build_cartesian_restriction( ncomp*scalar_size, el_nodes, ) : nothing - restr_i = + rstr_i = form_strided ? create_elem_restriction_strided( c, @@ -98,7 +98,7 @@ function build_cartesian_restriction( STRIDES_BACKEND, ) : nothing - return size, restr, restr_i + return size, rstr, rstr_i end function set_cartesian_mesh_coords!(dim, nxyz, mesh_order, mesh_coords) diff --git a/julia/LibCEED.jl/examples/ex1-volume-c.jl b/julia/LibCEED.jl/examples/ex1-volume-c.jl index aa26689097..e9130f5287 100644 --- a/julia/LibCEED.jl/examples/ex1-volume-c.jl +++ b/julia/LibCEED.jl/examples/ex1-volume-c.jl @@ -127,7 +127,7 @@ function build_cartesian_restriction_c( end end - restr = Ref{C.CeedElemRestriction}() + rstr = Ref{C.CeedElemRestriction}() C.CeedElemRestrictionCreate( ceed[], num_elem, @@ -138,10 +138,10 @@ function build_cartesian_restriction_c( C.CEED_MEM_HOST, C.CEED_COPY_VALUES, el_nodes, - restr, + rstr, ) if form_strided - restr_i = Ref{C.CeedElemRestriction}() + rstr_i = Ref{C.CeedElemRestriction}() err = C.CeedElemRestrictionCreateStrided( ceed[], num_elem, @@ -149,11 +149,11 @@ function build_cartesian_restriction_c( ncomp, ncomp*elem_qpts*num_elem, C.CEED_STRIDES_BACKEND[], - restr_i, + rstr_i, ) - return size, restr, restr_i + return size, rstr, rstr_i else - return size, restr + return size, rstr end end @@ -248,9 +248,9 @@ function run_ex1_c(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size) # Build CeedElemRestriction objects describing the mesh and solution discrete # representations. 
- mesh_size, mesh_restr = + mesh_size, mesh_rstr = build_cartesian_restriction_c(ceed, dim, nxyz, mesh_order, ncompx, num_qpts) - sol_size, sol_restr, sol_restr_i = build_cartesian_restriction_c( + sol_size, sol_rstr, sol_rstr_i = build_cartesian_restriction_c( ceed, dim, nxyz, @@ -314,7 +314,7 @@ function run_ex1_c(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size) C.CeedOperatorSetField( build_oper[], "dx", - mesh_restr[], + mesh_rstr[], mesh_basis[], C.CEED_VECTOR_ACTIVE[], ) @@ -328,7 +328,7 @@ function run_ex1_c(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size) C.CeedOperatorSetField( build_oper[], "qdata", - sol_restr_i[], + sol_rstr_i[], C.CEED_BASIS_NONE[], C.CEED_VECTOR_ACTIVE[], ) @@ -376,9 +376,9 @@ function run_ex1_c(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size) C.CEED_QFUNCTION_NONE[], oper, ) - C.CeedOperatorSetField(oper[], "u", sol_restr[], sol_basis[], C.CEED_VECTOR_ACTIVE[]) - C.CeedOperatorSetField(oper[], "qdata", sol_restr_i[], C.CEED_BASIS_NONE[], qdata[]) - C.CeedOperatorSetField(oper[], "v", sol_restr[], sol_basis[], C.CEED_VECTOR_ACTIVE[]) + C.CeedOperatorSetField(oper[], "u", sol_rstr[], sol_basis[], C.CEED_VECTOR_ACTIVE[]) + C.CeedOperatorSetField(oper[], "qdata", sol_rstr_i[], C.CEED_BASIS_NONE[], qdata[]) + C.CeedOperatorSetField(oper[], "v", sol_rstr[], sol_basis[], C.CEED_VECTOR_ACTIVE[]) # Compute the mesh volume using the mass operator: vol = 1^T \cdot M \cdot 1 print("Computing the mesh volume using the formula: vol = 1^T.M.1 ...") @@ -416,9 +416,9 @@ function run_ex1_c(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size) C.CeedQFunctionDestroy(apply_qfunc) C.CeedOperatorDestroy(build_oper) C.CeedQFunctionDestroy(build_qfunc) - C.CeedElemRestrictionDestroy(sol_restr) - C.CeedElemRestrictionDestroy(mesh_restr) - C.CeedElemRestrictionDestroy(sol_restr_i) + C.CeedElemRestrictionDestroy(sol_rstr) + C.CeedElemRestrictionDestroy(mesh_rstr) + C.CeedElemRestrictionDestroy(sol_rstr_i) 
C.CeedBasisDestroy(sol_basis) C.CeedBasisDestroy(mesh_basis) C.CeedDestroy(ceed) diff --git a/julia/LibCEED.jl/examples/ex1-volume.jl b/julia/LibCEED.jl/examples/ex1-volume.jl index aebde477f7..ae1cca1434 100644 --- a/julia/LibCEED.jl/examples/ex1-volume.jl +++ b/julia/LibCEED.jl/examples/ex1-volume.jl @@ -44,9 +44,9 @@ function run_ex1(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g # Build CeedElemRestriction objects describing the mesh and solution discrete # representations. - mesh_size, mesh_restr, _ = + mesh_size, mesh_rstr, _ = build_cartesian_restriction(ceed, dim, nxyz, mesh_order, ncompx, num_qpts) - sol_size, sol_restr, sol_restr_i = build_cartesian_restriction( + sol_size, sol_rstr, sol_rstr_i = build_cartesian_restriction( ceed, dim, nxyz, @@ -86,9 +86,9 @@ function run_ex1(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g ceed, qf=build_qfunc, fields=[ - (gallery ? :dx : :J, mesh_restr, mesh_basis, CeedVectorActive()), + (gallery ? :dx : :J, mesh_rstr, mesh_basis, CeedVectorActive()), (gallery ? 
:weights : :w, ElemRestrictionNone(), mesh_basis, CeedVectorNone()), - (:qdata, sol_restr_i, BasisNone(), CeedVectorActive()), + (:qdata, sol_rstr_i, BasisNone(), CeedVectorActive()), ], ) @@ -122,9 +122,9 @@ function run_ex1(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g ceed, qf=apply_qfunc, fields=[ - (:u, sol_restr, sol_basis, CeedVectorActive()), - (:qdata, sol_restr_i, BasisNone(), qdata), - (:v, sol_restr, sol_basis, CeedVectorActive()), + (:u, sol_rstr, sol_basis, CeedVectorActive()), + (:qdata, sol_rstr_i, BasisNone(), qdata), + (:v, sol_rstr, sol_basis, CeedVectorActive()), ], ) diff --git a/julia/LibCEED.jl/examples/ex2-surface.jl b/julia/LibCEED.jl/examples/ex2-surface.jl index efa5cbe81e..fac9d090a9 100644 --- a/julia/LibCEED.jl/examples/ex2-surface.jl +++ b/julia/LibCEED.jl/examples/ex2-surface.jl @@ -30,7 +30,7 @@ function run_ex2(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g # Build CeedElemRestriction objects describing the mesh and solution discrete # representations. - mesh_size, mesh_restr, _ = build_cartesian_restriction( + mesh_size, mesh_rstr, _ = build_cartesian_restriction( ceed, dim, nxyz, @@ -39,7 +39,7 @@ function run_ex2(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g num_qpts, mode=RestrictionOnly, ) - sol_size, _, qdata_restr_i = build_cartesian_restriction( + sol_size, _, qdata_rstr_i = build_cartesian_restriction( ceed, dim, nxyz, @@ -48,7 +48,7 @@ function run_ex2(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g num_qpts, mode=StridedOnly, ) - sol_size, sol_restr, sol_restr_i = build_cartesian_restriction( + sol_size, sol_rstr, sol_rstr_i = build_cartesian_restriction( ceed, dim, nxyz, @@ -91,9 +91,9 @@ function run_ex2(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g ceed, qf=build_qfunc, fields=[ - (gallery ? :dx : :J, mesh_restr, mesh_basis, CeedVectorActive()), + (gallery ? :dx : :J, mesh_rstr, mesh_basis, CeedVectorActive()), (gallery ? 
:weights : :w, ElemRestrictionNone(), mesh_basis, CeedVectorNone()), - (:qdata, qdata_restr_i, BasisNone(), CeedVectorActive()), + (:qdata, qdata_rstr_i, BasisNone(), CeedVectorActive()), ], ) @@ -128,9 +128,9 @@ function run_ex2(; ceed_spec, dim, mesh_order, sol_order, num_qpts, prob_size, g ceed, qf=apply_qfunc, fields=[ - (:du, sol_restr, sol_basis, CeedVectorActive()), - (:qdata, qdata_restr_i, BasisNone(), qdata), - (:dv, sol_restr, sol_basis, CeedVectorActive()), + (:du, sol_rstr, sol_basis, CeedVectorActive()), + (:qdata, qdata_rstr_i, BasisNone(), qdata), + (:dv, sol_rstr, sol_basis, CeedVectorActive()), ], ) diff --git a/julia/LibCEED.jl/src/Operator.jl b/julia/LibCEED.jl/src/Operator.jl index bce246a569..d1de710c54 100644 --- a/julia/LibCEED.jl/src/Operator.jl +++ b/julia/LibCEED.jl/src/Operator.jl @@ -35,9 +35,9 @@ build_oper = Operator( ceed, qf=build_qfunc, fields=[ - (:J, mesh_restr, mesh_basis, CeedVectorActive()), + (:J, mesh_rstr, mesh_basis, CeedVectorActive()), (:w, ElemRestrictionNone(), mesh_basis, CeedVectorNone()), - (:qdata, sol_restr_i, BasisNone(), CeedVectorActive()) + (:qdata, sol_rstr_i, BasisNone(), CeedVectorActive()) ] ) ``` diff --git a/julia/LibCEED.jl/src/generated/libceed_bindings.jl b/julia/LibCEED.jl/src/generated/libceed_bindings.jl index 3f0ee45991..f2afdc6d82 100644 --- a/julia/LibCEED.jl/src/generated/libceed_bindings.jl +++ b/julia/LibCEED.jl/src/generated/libceed_bindings.jl @@ -933,8 +933,8 @@ function CeedSetObjectDelegate(ceed, delegate, obj_name) ccall((:CeedSetObjectDelegate, libceed), Cint, (Ceed, Ceed, Ptr{Cchar}), ceed, delegate, obj_name) end -function CeedGetOperatorFallbackResource(ceed, resource) - ccall((:CeedGetOperatorFallbackResource, libceed), Cint, (Ceed, Ptr{Ptr{Cchar}}), ceed, resource) +function CeedGetOperatorFallbackResource(ceed, resource) + ccall((:CeedGetOperatorFallbackResource, libceed), Cint, (Ceed, Ptr{Ptr{Cchar}}), ceed, resource) end function CeedGetOperatorFallbackCeed(ceed, 
fallback_ceed) diff --git a/rust/libceed/src/elem_restriction.rs b/rust/libceed/src/elem_restriction.rs index 6c45abe154..430d9b63af 100644 --- a/rust/libceed/src/elem_restriction.rs +++ b/rust/libceed/src/elem_restriction.rs @@ -20,9 +20,9 @@ pub enum ElemRestrictionOpt<'a> { } /// Construct a ElemRestrictionOpt reference from a ElemRestriction reference impl<'a> From<&'a ElemRestriction<'_>> for ElemRestrictionOpt<'a> { - fn from(restr: &'a ElemRestriction) -> Self { - debug_assert!(restr.ptr != unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE }); - Self::Some(restr) + fn from(rstr: &'a ElemRestriction) -> Self { + debug_assert!(rstr.ptr != unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE }); + Self::Some(rstr) } } impl<'a> ElemRestrictionOpt<'a> { @@ -30,7 +30,7 @@ impl<'a> ElemRestrictionOpt<'a> { /// CeedElemRestriction pub(crate) fn to_raw(self) -> bind_ceed::CeedElemRestriction { match self { - Self::Some(restr) => restr.ptr, + Self::Some(rstr) => rstr.ptr, Self::None => unsafe { bind_ceed::CEED_ELEMRESTRICTION_NONE }, } }