From 7251047c7cf32ba2ad478e34e97b2a7bc384014b Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Fri, 19 Jan 2024 15:33:13 -0800 Subject: [PATCH 1/4] Remove need for separate CUmodule/hipModule_t for weights in non-tensor MAGMA backend --- backends/magma/ceed-magma-basis.c | 251 ++++++++++++++++-------------- backends/magma/ceed-magma.h | 2 +- 2 files changed, 138 insertions(+), 115 deletions(-) diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index 4442248bed..6db80b5dbf 100644 --- a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -255,8 +255,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed CeedVector v) { Ceed ceed; Ceed_Magma *data; - CeedInt num_comp, q_comp, num_nodes, num_qpts, P, Q, N; - const CeedScalar *d_u, *d_b = NULL; + CeedInt num_comp, num_nodes, num_qpts, P, Q, N; + const CeedScalar *d_u; CeedScalar *d_v; CeedBasisNonTensor_Magma *impl; @@ -264,7 +264,6 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed CeedCallBackend(CeedGetData(ceed, &data)); CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp)); CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); P = num_nodes; @@ -276,8 +275,81 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + // Compile kernels for N as needed + CeedInt iN = 0; + if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q && (e_mode != CEED_EVAL_WEIGHT || !impl->Weight)) { + CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES}; + CeedInt diff = abs(n_array[iN] - N), idiff; + + for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { + idiff = abs(n_array[in] - N); + if (idiff < diff) { + iN = in; + diff = idiff; + } + } + + if (!impl->NB_interp[iN]) { + CeedFESpace fe_space; + CeedInt q_comp_interp, q_comp_deriv; + Ceed ceed_delegate; + char *basis_kernel_path, *weight_kernel_path, *basis_kernel_source; + magma_int_t arch = magma_getdevice_arch(); + + // Tuning parameters for NB + CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); + switch (fe_space) { + case CEED_FE_SPACE_H1: + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv)); + break; + case CEED_FE_SPACE_HDIV: + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv)); + break; + case CEED_FE_SPACE_HCURL: + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv)); + break; + } + impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]); + impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]); + impl->NB_deriv[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]); + impl->NB_deriv_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]); + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + 
CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile kernels + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); + if (!impl->Weight) { + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + } + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp, + "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], + "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T", + impl->NB_deriv_t[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); + if (!impl->Weight) { + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + } + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } + } + // Apply basis operation if (e_mode != CEED_EVAL_WEIGHT) { + const CeedScalar *d_b = NULL; + CeedInt q_comp, NB, M, K; + CeedMagmaFunction Kernel; + switch (e_mode) { case CEED_EVAL_INTERP: d_b = impl->d_interp; @@ -298,69 +370,10 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); // LCOV_EXCL_STOP } + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp)); + M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P; - // Apply basis operation if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { - CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES}; - CeedInt iN = 0, diff = abs(n_array[iN] - N), idiff; - CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? 
Q : P; - CeedMagmaFunction Kernel; - CeedInt NB; - - for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { - idiff = abs(n_array[in] - N); - if (idiff < diff) { - iN = in; - diff = idiff; - } - } - - // Compile kernels for N as needed - if (!impl->NB_interp[iN]) { - CeedFESpace fe_space; - CeedInt q_comp_interp, q_comp_deriv; - Ceed ceed_delegate; - char *basis_kernel_path, *basis_kernel_source; - magma_int_t arch = magma_getdevice_arch(); - - // Tuning parameters for NB - CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space)); - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); - switch (fe_space) { - case CEED_FE_SPACE_H1: - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv)); - break; - case CEED_FE_SPACE_HDIV: - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv)); - break; - case CEED_FE_SPACE_HCURL: - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv)); - break; - } - impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]); - impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]); - impl->NB_deriv[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]); - impl->NB_deriv_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]); - - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp, - "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], - "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T", - impl->NB_deriv_t[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); - } if (e_mode == CEED_EVAL_INTERP) { if (t_mode == CEED_TRANSPOSE) { Kernel = impl->InterpTranspose[iN]; @@ -447,17 +460,12 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &impl)); -#ifdef CEED_MAGMA_USE_HIP - CeedCallHip(ceed, hipModuleUnload(impl->module_weight)); -#else - CeedCallCuda(ceed, cuModuleUnload(impl->module_weight)); -#endif for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { - if (impl->module_interp[in]) { + if (impl->module[in]) { #ifdef CEED_MAGMA_USE_HIP - CeedCallHip(ceed, hipModuleUnload(impl->module_interp[in])); + CeedCallHip(ceed, hipModuleUnload(impl->module[in])); #else - CeedCallCuda(ceed, cuModuleUnload(impl->module_interp[in])); + CeedCallCuda(ceed, cuModuleUnload(impl->module[in])); #endif } } @@ -569,9 +577,8 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const //------------------------------------------------------------------------------ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed, ceed_delegate; + Ceed ceed; Ceed_Magma *data; - char *weight_kernel_path, *basis_kernel_source; CeedBasisNonTensor_Magma *impl; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); @@ -596,18 +603,24 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue); } - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); - CeedCallBackend(CeedFree(&weight_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); + // Compile the weight kernel if it won't be compiled later on + if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { + Ceed ceed_delegate; + char *weight_kernel_path, *basis_kernel_source; + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } CeedCallBackend(CeedBasisSetData(basis, impl)); @@ -622,9 +635,8 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node //------------------------------------------------------------------------------ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed, ceed_delegate; + Ceed ceed; Ceed_Magma *data; - char *weight_kernel_path, *basis_kernel_source; CeedBasisNonTensor_Magma *impl; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); @@ -649,18 +661,24 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue); } - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); - CeedCallBackend(CeedFree(&weight_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); + // Compile the weight kernel if it won't be compiled later on + if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { + Ceed ceed_delegate; + char *weight_kernel_path, *basis_kernel_source; + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } CeedCallBackend(CeedBasisSetData(basis, impl)); @@ -675,9 +693,8 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no //------------------------------------------------------------------------------ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed, ceed_delegate; + Ceed ceed; Ceed_Magma *data; - char *weight_kernel_path, *basis_kernel_source; CeedBasisNonTensor_Magma *impl; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); @@ -702,18 +719,24 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n magma_setvector(num_qpts * num_nodes * q_comp_curl, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue); } - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); - CeedCallBackend(CeedFree(&weight_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); + // Compile the weight kernel if it won't be compiled later on + if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { + Ceed ceed_delegate; + char *weight_kernel_path, *basis_kernel_source; + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } CeedCallBackend(CeedBasisSetData(basis, impl)); diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h index 9350dc8452..1ec17d2de2 100644 --- a/backends/magma/ceed-magma.h +++ b/backends/magma/ceed-magma.h @@ -57,7 +57,7 @@ typedef struct { } CeedBasis_Magma; typedef struct { - CeedMagmaModule module_weight, module_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES]; From cbfe683adbe50f8abe894256ac583c510bfbed11 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Fri, 19 Jan 2024 16:39:13 -0800 Subject: [PATCH 2/4] Lazily compile GPU diagonal/point-block diagonal assembly kernels as needed Also fixes CEED_SIZE -> USE_CEEDSIZE definition. 
--- backends/cuda-ref/ceed-cuda-ref-operator.c | 132 ++++++++++++++---- backends/cuda-ref/ceed-cuda-ref.h | 2 +- backends/hip-ref/ceed-hip-ref-operator.c | 130 +++++++++++++---- backends/hip-ref/ceed-hip-ref.h | 2 +- .../cuda-ref-operator-assemble-diagonal.h | 75 ++++------ .../hip/hip-ref-operator-assemble-diagonal.h | 75 ++++------ 6 files changed, 259 insertions(+), 157 deletions(-) diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 61a2ec4c9a..3b288f91b6 100644 --- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -53,7 +53,12 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); + if (impl->diag->module) { + CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); + } + if (impl->diag->module_point_block) { + CeedCallCuda(ceed, cuModuleUnload(impl->diag->module_point_block)); + } CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_in)); CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_out)); CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); @@ -580,11 +585,10 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedV //------------------------------------------------------------------------------ // Assemble Diagonal Setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { +static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { Ceed ceed; - char *diagonal_kernel_path, *diagonal_kernel_source; CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; - CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedInt q_comp, num_nodes, num_qpts; CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; CeedBasis basis_in = NULL, basis_out = NULL; CeedQFunctionField *qf_fields; @@ -653,24 +657,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Cuda *diag = impl->diag; - // Assemble kernel + // Basis matrices CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); - CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! 
-----\n"); - CeedCallCuda( - ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", - num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); - CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); - CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); - - // Basis matrices const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); bool has_eval_none = false; @@ -774,13 +764,92 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Assemble Diagonal Setup (Compilation) +//------------------------------------------------------------------------------ +static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; + CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; + CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *op_fields; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); + + // Determine active input basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); + if (eval_mode != CEED_EVAL_WEIGHT) { + num_eval_modes_in += q_comp; + } + } + } + + // Determine active output basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); + if (eval_mode != CEED_EVAL_WEIGHT) { + num_eval_modes_out += q_comp; + } + } + } + + // Operator data struct + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedOperatorDiag_Cuda *diag = impl->diag; + + // Assemble kernel + CUmodule *module = is_point_block ? 
&diag->module_point_block : &diag->module; + CeedInt elems_per_block = 1; + CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); + CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); + if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; + else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); + CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", + num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE", + use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); + CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal)); + CeedCallBackend(CeedFree(&diagonal_kernel_path)); + CeedCallBackend(CeedFree(&diagonal_kernel_source)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Assemble Diagonal Core //------------------------------------------------------------------------------ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { Ceed ceed; - CeedSize assembled_length, assembled_qf_length; - CeedInt use_ceedsize_idx = 0, num_elem, num_nodes; + CeedInt num_elem, num_nodes; CeedScalar *elem_diag_array; const CeedScalar *assembled_qf_array; CeedVector assembled_qf = NULL, elem_diag; @@ -795,16 +864,23 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); - CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); - if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; - // Setup - if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, use_ceedsize_idx)); + if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op)); CeedOperatorDiag_Cuda *diag = impl->diag; assert(diag != NULL); + // Assemble kernel if needed + if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) { + CeedSize assembled_length, assembled_qf_length; + CeedInt use_ceedsize_idx = 0; + CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); + if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; + + CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Cuda(op, use_ceedsize_idx, is_point_block)); + } + // Restriction and diagonal vector CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, @@ -981,8 +1057,8 @@ static int 
CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", - asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", - use_ceedsize_idx)); + asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, + "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); CeedCallBackend(CeedFree(&assembly_kernel_path)); CeedCallBackend(CeedFree(&assembly_kernel_source)); diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 8c43f23ab8..1e1affbc4e 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -100,7 +100,7 @@ typedef struct { } CeedQFunctionContext_Cuda; typedef struct { - CUmodule module; + CUmodule module, module_point_block; CUfunction LinearDiagonal; CUfunction LinearPointBlock; CeedElemRestriction diag_rstr, point_block_diag_rstr; diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index b1c4e11c8e..6eda8c485a 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -52,7 +52,12 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); + if (impl->diag->module) { + CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); + } + if (impl->diag->module_point_block) { + CeedCallHip(ceed, hipModuleUnload(impl->diag->module_point_block)); + } CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_in)); CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_out)); CeedCallHip(ceed, hipFree(impl->diag->d_identity)); @@ -579,11 +584,10 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVe //------------------------------------------------------------------------------ // Assemble Diagonal Setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { +static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { Ceed ceed; - char *diagonal_kernel_path, *diagonal_kernel_source; CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; - CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedInt q_comp, num_nodes, num_qpts; CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; CeedBasis basis_in = NULL, basis_out = NULL; CeedQFunctionField *qf_fields; @@ -652,24 +656,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Hip *diag = impl->diag; - // Assemble kernel + // Basis matrices CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); - CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, 
"ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); - CeedCallHip(ceed, - CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", - num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); - CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); - CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); - - // Basis matrices const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); bool has_eval_none = false; @@ -773,13 +763,92 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Assemble Diagonal Setup (Compilation) +//------------------------------------------------------------------------------ +static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; + CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; + CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *op_fields; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); + + // Determine active input basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); + if (eval_mode != CEED_EVAL_WEIGHT) { + num_eval_modes_in += q_comp; + } + } + } + + // Determine active output basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); + if (eval_mode != 
CEED_EVAL_WEIGHT) {
+        num_eval_modes_out += q_comp;
+      }
+    }
+  }
+
+  // Operator data struct
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedOperatorDiag_Hip *diag = impl->diag;
+
+  // Assemble kernel
+  hipModule_t *module = is_point_block ? &diag->module_point_block : &diag->module;
+  CeedInt elems_per_block = 1;
+  CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
+  CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
+  if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
+  else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
+  CeedCallHip(ceed, CeedCompile_Hip(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
+                                    num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
+                                    use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
+  CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
+  CeedCallBackend(CeedFree(&diagonal_kernel_path));
+  CeedCallBackend(CeedFree(&diagonal_kernel_source));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Assemble Diagonal Core
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
   Ceed ceed;
-  CeedSize assembled_length, assembled_qf_length;
-  CeedInt use_ceedsize_idx = 0, num_elem, num_nodes;
+  CeedInt num_elem, num_nodes;
   CeedScalar *elem_diag_array;
   const CeedScalar *assembled_qf_array;
   CeedVector assembled_qf = NULL, elem_diag;
@@ -794,16 +863,23 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
 
-  CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
-  CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
-  if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
-
   // Setup
-  if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op, use_ceedsize_idx));
+  if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op));
 
   CeedOperatorDiag_Hip *diag = impl->diag;
 
   assert(diag != NULL);
 
+  // Assemble kernel if needed
+  if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) {
+    CeedSize assembled_length, assembled_qf_length;
+    CeedInt use_ceedsize_idx = 0;
+    CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
+    CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
+    if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
+
+    CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Hip(op, 
use_ceedsize_idx, is_point_block)); + } + // Restriction and diagonal vector CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, @@ -978,7 +1054,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", - asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", + asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); CeedCallBackend(CeedFree(&assembly_kernel_path)); diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index 14737a2a53..a641351f6d 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -104,7 +104,7 @@ typedef struct { } CeedQFunctionContext_Hip; typedef struct { - hipModule_t module; + hipModule_t module, module_point_block; hipFunction_t LinearDiagonal; hipFunction_t LinearPointBlock; CeedElemRestriction diag_rstr, point_block_diag_rstr; diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h index ab366e7966..656fbbac03 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h @@ -47,12 +47,11 @@ static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, //------------------------------------------------------------------------------ // Core code for diagonal assembly //------------------------------------------------------------------------------ -static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, - const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, - const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, - const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, - const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, - CeedScalar *__restrict__ elem_diag_array) { +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, + const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, + const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, + const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { const int tid = threadIdx.x; // Running with P threads if (tid >= NUM_NODES) return; @@ -84,65 +83,41 @@ static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const boo // Each component for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { - // Each qpoint/node pair - if (is_point_block) { - // Point block diagonal - for (IndexType comp_in = 0; comp_in < 
NUM_COMP; comp_in++) { - CeedScalar e_value = 0.; - - for (IndexType q = 0; q < NUM_QPTS; q++) { - const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * - NUM_QPTS + - q]; - - e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; - } - elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; - } - } else { - // Diagonal only +#if USE_POINT_BLOCK + // Point block diagonal + for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { CeedScalar e_value = 0.; + // Each qpoint/node pair for (IndexType q = 0; q < NUM_QPTS; q++) { const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } - elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; + elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; + } +#else + // Diagonal only + CeedScalar e_value = 0.; + + // Each qpoint/node pair + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } + elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; +#endif } } } } } -//------------------------------------------------------------------------------ -// Linear diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, - const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, - const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, - const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - -//------------------------------------------------------------------------------ -// Linear point block diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, - const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, - const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, - const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - //------------------------------------------------------------------------------ 
#endif // CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h index fcd8df2960..288e09fcad 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h @@ -47,12 +47,11 @@ static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, //------------------------------------------------------------------------------ // Core code for diagonal assembly //------------------------------------------------------------------------------ -static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, - const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, - const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, - const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, - const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, - CeedScalar *__restrict__ elem_diag_array) { +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, + const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, + const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, + const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { const int tid = threadIdx.x; // Running with P threads if (tid >= NUM_NODES) return; @@ -84,65 +83,41 @@ static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const boo // Each component for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { - // Each qpoint/node pair - if (is_point_block) { - // Point block diagonal - for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { - CeedScalar e_value = 0.; - - for (IndexType q = 0; q < NUM_QPTS; q++) { - const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * - NUM_QPTS + - q]; - - e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; - } - elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; - } - } else { - // Diagonal only +#if USE_POINT_BLOCK + // Point block diagonal + for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { CeedScalar e_value = 0.; + // Each qpoint/node pair for (IndexType q = 0; q < NUM_QPTS; q++) { const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } - elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; + elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; + } +#else + // Diagonal only + CeedScalar e_value = 0.; + + // Each qpoint/node pair + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * NUM_COMP + comp_out) 
* NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } + elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; +#endif } } } } } -//------------------------------------------------------------------------------ -// Linear diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, - const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, - const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, - const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - -//------------------------------------------------------------------------------ -// Linear point block diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, - const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, - const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, - const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - //------------------------------------------------------------------------------ #endif // CEED_HIP_REF_OPERATOR_ASSEMBLE_DIAGONAL_H From cf8cbdd67b26059bcd4f184d0db9bf05297bc21d Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Mon, 22 Jan 2024 06:51:22 -0800 Subject: [PATCH 3/4] Refactor GPU-backend element restriction source to only compile the required kernels, and do so lazily at first apply --- backends/cuda-ref/ceed-cuda-ref-restriction.c | 276 +++++++++------ backends/cuda-ref/ceed-cuda-ref.h | 17 +- backends/hip-ref/ceed-hip-ref-operator.c | 2 +- backends/hip-ref/ceed-hip-ref-restriction.c | 276 +++++++++------ backends/hip-ref/ceed-hip-ref.h | 17 +- .../cuda/cuda-ref-restriction-curl-oriented.h | 183 ++++++++++ .../cuda/cuda-ref-restriction-offset.h | 74 ++++ .../cuda/cuda-ref-restriction-oriented.h | 81 +++++ .../cuda/cuda-ref-restriction-strided.h | 47 +++ .../jit-source/cuda/cuda-ref-restriction.h | 332 ------------------ .../hip/hip-ref-restriction-curl-oriented.h | 183 ++++++++++ .../hip/hip-ref-restriction-offset.h | 74 ++++ .../hip/hip-ref-restriction-oriented.h | 81 +++++ .../hip/hip-ref-restriction-strided.h | 47 +++ .../ceed/jit-source/hip/hip-ref-restriction.h | 332 ------------------ 15 files changed, 1097 insertions(+), 925 deletions(-) create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h 
create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h delete mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-offset.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-oriented.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-strided.h delete mode 100644 include/ceed/jit-source/hip/hip-ref-restriction.h diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c index f2b190e5f1..d873386575 100644 --- a/backends/cuda-ref/ceed-cuda-ref-restriction.c +++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c @@ -18,25 +18,126 @@ #include "../cuda/ceed-cuda-compile.h" #include "ceed-cuda-ref.h" +//------------------------------------------------------------------------------ +// Compile restriction kernels +//------------------------------------------------------------------------------ +static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) { + Ceed ceed; + bool is_deterministic; + CeedInt num_elem, num_comp, elem_size, comp_stride; + CeedRestrictionType rstr_type; + char *restriction_kernel_path, *restriction_kernel_source; + CeedElemRestriction_Cuda *impl; + + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + is_deterministic = impl->d_l_vec_indices != NULL; + + // Compile CUDA kernels + switch (rstr_type) { + case CEED_RESTRICTION_STRIDED: { + CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; + bool has_backend_strides; + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + } + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-strided.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! 
-----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", + strides[2])); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_STANDARD: { + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_CURL_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnorientedNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_POINTS: { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); + // LCOV_EXCL_STOP + } break; + } + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Core apply restriction code //------------------------------------------------------------------------------ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { Ceed ceed; - CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; const CeedScalar *d_u; CeedScalar *d_v; CeedElemRestriction_Cuda *impl; - CUfunction kernel; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - const CeedInt num_nodes = 
impl->num_nodes; + + // Assemble kernel if needed + if (!impl->module) { + CeedCallBackend(CeedElemRestrictionSetupCompile_Cuda(rstr)); + } // Get vectors CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); @@ -51,51 +152,47 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C // Restrict if (t_mode == CEED_NOTRANSPOSE) { // L-vector -> E-vector + CeedInt elem_size; + + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; - const CeedInt grid = CeedDivUpInt(num_nodes, block_size); + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedNoTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - kernel = impl->OrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - kernel = impl->CurlOrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else if (use_orients) { - kernel = impl->CurlOrientedUnsignedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_POINTS: { @@ -106,92 +203,80 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C } } else { // E-vector -> L-vector - const CeedInt block_size = 32; - const CeedInt grid = CeedDivUpInt(num_nodes, 
block_size); + const bool is_deterministic = impl->d_l_vec_indices != NULL; + const CeedInt block_size = 32; + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - if (impl->OrientedTranspose) { - kernel = impl->OrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - if (impl->CurlOrientedTranspose) { - kernel = impl->CurlOrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else if 
(use_orients) { - if (impl->CurlOrientedUnsignedTranspose) { - kernel = impl->CurlOrientedUnsignedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedUnsignedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } } } break; @@ -297,7 +382,9 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallCuda(ceed, cuModuleUnload(impl->module)); + if (impl->module) { + CeedCallCuda(ceed, cuModuleUnload(impl->module)); + } CeedCallBackend(CeedFree(&impl->h_ind_allocated)); CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); @@ -398,33 +485,17 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt8 *curl_orients, CeedElemRestriction rstr) { Ceed ceed, ceed_parent; bool is_deterministic; - CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; - char *restriction_kernel_path, *restriction_kernel_source; CeedElemRestriction_Cuda *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); - const CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - - // Stride data - CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - if (rstr_type == CEED_RESTRICTION_STRIDED) { - bool has_backend_strides; - - CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); - } - } else { - CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - } + const CeedInt size = num_elem * elem_size; + CeedInt layout[3] = {1, size, elem_size}; CeedCallBackend(CeedCalloc(1, &impl)); impl->num_nodes 
= size; @@ -446,6 +517,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); // Set up device offset/orientation arrays + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); if (rstr_type != CEED_RESTRICTION_STRIDED) { switch (mem_type) { case CEED_MEM_HOST: { @@ -576,34 +648,6 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, } } - // Compile CUDA kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); - CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, - "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", - strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); - if (!is_deterministic) { - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); - } else { - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); - } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); - // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 1e1affbc4e..652920280d 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -25,20 +25,9 @@ typedef struct { typedef struct 
{ CUmodule module; - CUfunction StridedNoTranspose; - CUfunction StridedTranspose; - CUfunction OffsetNoTranspose; - CUfunction OffsetTranspose; - CUfunction OffsetTransposeDet; - CUfunction OrientedNoTranspose; - CUfunction OrientedTranspose; - CUfunction OrientedTransposeDet; - CUfunction CurlOrientedNoTranspose; - CUfunction CurlOrientedTranspose; - CUfunction CurlOrientedTransposeDet; - CUfunction CurlOrientedUnsignedNoTranspose; - CUfunction CurlOrientedUnsignedTranspose; - CUfunction CurlOrientedUnsignedTransposeDet; + CUfunction ApplyNoTranspose, ApplyTranspose; + CUfunction ApplyUnsignedNoTranspose, ApplyUnsignedTranspose; + CUfunction ApplyUnorientedNoTranspose, ApplyUnorientedTranspose; CeedInt num_nodes; CeedInt *h_ind; CeedInt *h_ind_allocated; diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index 6eda8c485a..cc2538579a 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -864,7 +864,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); // Setup - if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op, use_ceedsize_idx, is_point_block)); + if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op)); CeedOperatorDiag_Hip *diag = impl->diag; assert(diag != NULL); diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c index 5824fa4855..9364f425fb 100644 --- a/backends/hip-ref/ceed-hip-ref-restriction.c +++ b/backends/hip-ref/ceed-hip-ref-restriction.c @@ -17,25 +17,126 @@ #include "../hip/ceed-hip-compile.h" #include "ceed-hip-ref.h" +//------------------------------------------------------------------------------ +// Compile restriction kernels +//------------------------------------------------------------------------------ +static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) { + Ceed ceed; + bool is_deterministic; + CeedInt num_elem, num_comp, elem_size, comp_stride; + CeedRestrictionType rstr_type; + char *restriction_kernel_path, *restriction_kernel_source; + CeedElemRestriction_Hip *impl; + + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + is_deterministic = impl->d_l_vec_indices != NULL; + + // Compile HIP kernels + switch (rstr_type) { + case CEED_RESTRICTION_STRIDED: { + CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; + bool has_backend_strides; + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + } + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-strided.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading 
Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", + strides[2])); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_STANDARD: { + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_CURL_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnorientedNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_POINTS: { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); + // LCOV_EXCL_STOP + } break; + } + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Core apply restriction code //------------------------------------------------------------------------------ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { Ceed ceed; - CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; const CeedScalar *d_u; CeedScalar *d_v; CeedElemRestriction_Hip *impl; - hipFunction_t kernel; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - const CeedInt num_nodes = impl->num_nodes; 
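The HIP apply path mirrors the CUDA refactor above: the module is no longer built when the restriction is created, but on demand the first time the restriction is applied, and only the kernels for that restriction's type are compiled. A minimal sketch of this compile-on-first-apply pattern follows; the struct and function names are hypothetical stand-ins for illustration, not the real hipModule_t/hipFunction_t handles or CeedElemRestrictionSetupCompile_Hip.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the backend's module/kernel handles; the actual
 * code stores hipModule_t and hipFunction_t members on the impl struct. */
typedef struct {
  bool module_built;        /* stands in for impl->module != NULL        */
  int  apply_no_transpose;  /* stands in for impl->ApplyNoTranspose      */
  int  apply_transpose;     /* stands in for impl->ApplyTranspose        */
} RestrictionImpl;

/* Compile only the kernels this restriction actually needs (strided, offset,
 * oriented, or curl-oriented), as the real setup-compile routine does. */
static int SetupCompile(RestrictionImpl *impl) {
  impl->apply_no_transpose = 1;  /* pretend we JIT-compiled and looked up kernels */
  impl->apply_transpose    = 2;
  impl->module_built       = true;
  return 0;
}

/* Apply defers compilation to the first call instead of doing it at creation. */
static int Apply(RestrictionImpl *impl, bool transpose) {
  if (!impl->module_built) SetupCompile(impl);  /* lazy, one-time setup */
  printf("launching kernel %d\n", transpose ? impl->apply_transpose : impl->apply_no_transpose);
  return 0;
}

int main(void) {
  RestrictionImpl impl = {0};
  Apply(&impl, false);  /* first apply triggers compilation */
  Apply(&impl, true);   /* later applies reuse the cached module */
  return 0;
}

Deferring compilation in this way keeps restriction creation cheap and avoids JIT-compiling transpose or unsigned variants that are never launched.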
+ + // Assemble kernel if needed + if (!impl->module) { + CeedCallBackend(CeedElemRestrictionSetupCompile_Hip(rstr)); + } // Get vectors CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); @@ -50,51 +151,47 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce // Restrict if (t_mode == CEED_NOTRANSPOSE) { // L-vector -> E-vector + CeedInt elem_size; + + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); const CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; - const CeedInt grid = CeedDivUpInt(num_nodes, block_size); + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedNoTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - kernel = impl->OrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - kernel = impl->CurlOrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else if (use_orients) { - kernel = impl->CurlOrientedUnsignedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_POINTS: { @@ -105,92 +202,80 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce } } else { // E-vector -> L-vector - const CeedInt block_size = 64; - const CeedInt grid = CeedDivUpInt(num_nodes, block_size); + const bool is_deterministic 
= impl->d_l_vec_indices != NULL; + const CeedInt block_size = 64; + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - if (impl->OrientedTranspose) { - kernel = impl->OrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - if (impl->CurlOrientedTranspose) { - kernel = impl->CurlOrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else if (use_orients) { - if (impl->CurlOrientedUnsignedTranspose) { - 
kernel = impl->CurlOrientedUnsignedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedUnsignedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } } } break; @@ -296,7 +381,9 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) { CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallHip(ceed, hipModuleUnload(impl->module)); + if (impl->module) { + CeedCallHip(ceed, hipModuleUnload(impl->module)); + } CeedCallBackend(CeedFree(&impl->h_ind_allocated)); CeedCallHip(ceed, hipFree(impl->d_ind_allocated)); CeedCallHip(ceed, hipFree(impl->d_t_offsets)); @@ -397,33 +484,17 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt8 *curl_orients, CeedElemRestriction rstr) { Ceed ceed, ceed_parent; bool is_deterministic; - CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; - char *restriction_kernel_path, *restriction_kernel_source; CeedElemRestriction_Hip *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); - const CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - - // Stride data - CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - if (rstr_type == CEED_RESTRICTION_STRIDED) { - bool has_backend_strides; - - CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); - } - } else { - CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - } + const CeedInt size = num_elem * elem_size; + CeedInt layout[3] = {1, size, elem_size}; CeedCallBackend(CeedCalloc(1, &impl)); impl->num_nodes = size; @@ -445,6 +516,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType 
mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); // Set up device offset/orientation arrays + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); if (rstr_type != CEED_RESTRICTION_STRIDED) { switch (mem_type) { case CEED_MEM_HOST: { @@ -575,34 +647,6 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, } } - // Compile HIP kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); - CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, - "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", - strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); - if (!is_deterministic) { - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); - } else { - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); - } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); - // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Hip)); diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index a641351f6d..8784a6865a 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -29,20 +29,9 @@ typedef struct { typedef struct { hipModule_t module; - hipFunction_t StridedNoTranspose; - hipFunction_t StridedTranspose; - 
hipFunction_t OffsetNoTranspose; - hipFunction_t OffsetTranspose; - hipFunction_t OffsetTransposeDet; - hipFunction_t OrientedNoTranspose; - hipFunction_t OrientedTranspose; - hipFunction_t OrientedTransposeDet; - hipFunction_t CurlOrientedNoTranspose; - hipFunction_t CurlOrientedTranspose; - hipFunction_t CurlOrientedTransposeDet; - hipFunction_t CurlOrientedUnsignedNoTranspose; - hipFunction_t CurlOrientedUnsignedTranspose; - hipFunction_t CurlOrientedUnsignedTransposeDet; + hipFunction_t ApplyNoTranspose, ApplyTranspose; + hipFunction_t ApplyUnsignedNoTranspose, ApplyUnsignedTranspose; + hipFunction_t ApplyUnorientedNoTranspose, ApplyUnorientedTranspose; CeedInt num_nodes; CeedInt *h_ind; CeedInt *h_ind_allocated; diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h new file mode 100644 index 0000000000..8609acc9bb --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h @@ -0,0 +1,183 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA curl-oriented element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_CURL_ORIENTED_H +#define CEED_CUDA_REF_RESTRICTION_CURL_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind = indices[node]; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_CURL_ORIENTED_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h new file mode 100644 index 0000000000..f5e3f0cfb4 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h @@ -0,0 +1,74 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA offset element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_OFFSET_H +#define CEED_CUDA_REF_RESTRICTION_OFFSET_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, standard (with offsets) +//------------------------------------------------------------------------------ +extern "C" __global__ void OffsetNoTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, standard (with offsets) +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + } + } +} +#else +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_OFFSET_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h new file mode 100644 index 0000000000..85bb5a3e83 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h @@ -0,0 +1,81 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA oriented element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_ORIENTED_H +#define CEED_CUDA_REF_RESTRICTION_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void OrientedNoTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, const CeedScalar *__restrict__ u, + CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); + } + } +} +#else +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const bool orient = orients[t_ind]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0); + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_ORIENTED_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h new file mode 100644 index 0000000000..7fe2d75cf5 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h @@ -0,0 +1,47 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA strided element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_STRIDED_H +#define CEED_CUDA_REF_RESTRICTION_STRIDED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedNoTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = + u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } +} + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_STRIDED_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction.h b/include/ceed/jit-source/cuda/cuda-ref-restriction.h deleted file mode 100644 index 80011148e6..0000000000 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction.h +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Internal header for CUDA element restriction kernels -#ifndef CEED_CUDA_REF_RESTRICTION_H -#define CEED_CUDA_REF_RESTRICTION_H - -#include - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = - u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? 
indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; - const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; - const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, unsigned curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); - const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); - const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += - u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); - } - } -} - -extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for 
(CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, - u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); - } - } -} - -extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const bool orient = orients[t_ind]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0); - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; - const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value += - loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); - } - } -} - -extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; - const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value[comp] += - loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, unsigned curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind = indices[node]; - const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; - const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value += - loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); - } - } -} - -extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; - const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value[comp] += - loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ - -#endif // CEED_CUDA_REF_RESTRICTION_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h new file mode 100644 index 0000000000..c1131ae3d0 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h @@ -0,0 +1,183 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP curl-oriented element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_CURL_ORIENTED_H +#define CEED_HIP_REF_RESTRICTION_CURL_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? 
indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind = indices[node]; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_CURL_ORIENTED_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h new file mode 100644 index 0000000000..d31b8db893 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h @@ -0,0 +1,74 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP offset element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_OFFSET_H +#define CEED_HIP_REF_RESTRICTION_OFFSET_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, standard (with offsets) +//------------------------------------------------------------------------------ +extern "C" __global__ void OffsetNoTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, standard (with offsets) +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + } + } +} +#else +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_OFFSET_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h new file mode 100644 index 0000000000..668169783a --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h @@ -0,0 +1,81 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP oriented element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_ORIENTED_H +#define CEED_HIP_REF_RESTRICTION_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void OrientedNoTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, const CeedScalar *__restrict__ u, + CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); + } + } +} +#else +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const bool orient = orients[t_ind]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0); + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_ORIENTED_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h new file mode 100644 index 0000000000..8f8ac4ee19 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h @@ -0,0 +1,47 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP strided element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_STRIDED_H +#define CEED_HIP_REF_RESTRICTION_STRIDED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedNoTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = + u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } +} + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_STRIDED_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction.h b/include/ceed/jit-source/hip/hip-ref-restriction.h deleted file mode 100644 index e1de6d87e9..0000000000 --- a/include/ceed/jit-source/hip/hip-ref-restriction.h +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Internal header for HIP element restriction kernels -#ifndef CEED_HIP_REF_RESTRICTION_H -#define CEED_HIP_REF_RESTRICTION_H - -#include - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = - u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? 
indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; - const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; - const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, unsigned curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); - const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); - const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += - u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); - } - } -} - -extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for 
(CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE,
-                u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0));
-    }
-  }
-}
-
-extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
-                                                const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients,
-                                                const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
-  CeedScalar value[RSTR_NUM_COMP];
-
-  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
-    const CeedInt ind = l_vec_indices[i];
-    const CeedInt range_1 = t_offsets[i];
-    const CeedInt range_N = t_offsets[i + 1];
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
-
-    for (CeedInt j = range_1; j < range_N; j++) {
-      const CeedInt t_ind = t_indices[j];
-      const bool orient = orients[t_ind];
-      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
-      const CeedInt elem = t_ind / RSTR_ELEM_SIZE;
-
-      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0);
-      }
-    }
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
-  }
-}
-
-//------------------------------------------------------------------------------
-// E-vector -> L-vector, curl-oriented
-//------------------------------------------------------------------------------
-extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices,
-                                                 const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u,
-                                                 CeedScalar *__restrict__ v) {
-  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
-    const CeedInt ind = indices[node];
-    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
-    const CeedInt elem = node / RSTR_ELEM_SIZE;
-    const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0;
-    const CeedInt8 curl_orient_d = curl_orients[3 * node + 1];
-    const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0;
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      CeedScalar value = 0.0;
-      value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-      value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-      value +=
-          loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
-    }
-  }
-}
-
-extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
-                                                    const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients,
-                                                    const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
-  CeedScalar value[RSTR_NUM_COMP];
-
-  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
-    const CeedInt ind = l_vec_indices[i];
-    const CeedInt range_1 = t_offsets[i];
-    const CeedInt range_N = t_offsets[i + 1];
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
-
-    for (CeedInt j = range_1; j < range_N; j++) {
-      const CeedInt t_ind = t_indices[j];
-      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
-      const CeedInt elem = t_ind / RSTR_ELEM_SIZE;
-      const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0;
-      const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1];
-      const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0;
-
-      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-        value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-        value[comp] +=
-            loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      }
-    }
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
-  }
-}
-
-//------------------------------------------------------------------------------
-// E-vector -> L-vector, unsigned curl-oriented
-//------------------------------------------------------------------------------
-extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices,
-                                                         const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u,
-                                                         CeedScalar *__restrict__ v) {
-  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
-    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
-    const CeedInt elem = node / RSTR_ELEM_SIZE;
-    const CeedInt ind = indices[node];
-    const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0;
-    const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]);
-    const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0;
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      CeedScalar value = 0.0;
-      value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-      value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-      value +=
-          loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
-    }
-  }
-}
-
-extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
-                                                            const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients,
-                                                            const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
-  CeedScalar value[RSTR_NUM_COMP];
-
-  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
-    const CeedInt ind = l_vec_indices[i];
-    const CeedInt range_1 = t_offsets[i];
-    const CeedInt range_N = t_offsets[i + 1];
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
-
-    for (CeedInt j = range_1; j < range_N; j++) {
-      const CeedInt t_ind = t_indices[j];
-      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
-      const CeedInt elem = t_ind / RSTR_ELEM_SIZE;
-      const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0;
-      const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]);
-      const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0;
-
-      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-        value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-        value[comp] +=
-            loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      }
-    }
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
-  }
-}
-
-//------------------------------------------------------------------------------
-
-#endif  // CEED_HIP_REF_RESTRICTION_H

From 1b7492f867164853116dc978bf39f769f48c56d0 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 22 Jan 2024 09:02:30 -0800
Subject: [PATCH 4/4] Fix a few incorrect JIT header includes

---
 include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h | 2 +-
 include/ceed/jit-source/cuda/cuda-gen-templates.h       | 2 +-
 include/ceed/jit-source/hip/hip-gen-templates.h         | 2 +-
 include/ceed/jit-source/sycl/sycl-gen-templates.h       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
index b2ac2842ed..4530a633b0 100644
--- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
+++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
@@ -10,7 +10,7 @@
 #ifndef CEED_CUDA_ATOMIC_ADD_FALLBACK_H
 #define CEED_CUDA_ATOMIC_ADD_FALLBACK_H

-#include 
+#include 

 //------------------------------------------------------------------------------
 // Atomic add, for older CUDA
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index 265f7d2b4f..a3496e8411 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -10,7 +10,7 @@
 #ifndef CEED_CUDA_GEN_TEMPLATES_H
 #define CEED_CUDA_GEN_TEMPLATES_H

-#include 
+#include 

 //------------------------------------------------------------------------------
 // Load matrices for basis actions
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 884f346d76..27b894629e 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -10,7 +10,7 @@
 #ifndef CEED_HIP_GEN_TEMPLATES_H
 #define CEED_HIP_GEN_TEMPLATES_H

-#include 
+#include 

 //------------------------------------------------------------------------------
 // Load matrices for basis actions
diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h
index 88aa7b98cd..c13e12b690 100644
--- a/include/ceed/jit-source/sycl/sycl-gen-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h
@@ -10,7 +10,7 @@
 #ifndef CEED_SYCL_GEN_TEMPLATES_H
 #define CEED_SYCL_GEN_TEMPLATES_H

-#include 
+#include 

 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable