From 7251047c7cf32ba2ad478e34e97b2a7bc384014b Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Fri, 19 Jan 2024 15:33:13 -0800 Subject: [PATCH 1/4] Remove need for separate CUmodule/hipModule_t for weights in non-tensor MAGMA backend --- backends/magma/ceed-magma-basis.c | 251 ++++++++++++++++-------------- backends/magma/ceed-magma.h | 2 +- 2 files changed, 138 insertions(+), 115 deletions(-) diff --git a/backends/magma/ceed-magma-basis.c b/backends/magma/ceed-magma-basis.c index 4442248bed..6db80b5dbf 100644 --- a/backends/magma/ceed-magma-basis.c +++ b/backends/magma/ceed-magma-basis.c @@ -255,8 +255,8 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed CeedVector v) { Ceed ceed; Ceed_Magma *data; - CeedInt num_comp, q_comp, num_nodes, num_qpts, P, Q, N; - const CeedScalar *d_u, *d_b = NULL; + CeedInt num_comp, num_nodes, num_qpts, P, Q, N; + const CeedScalar *d_u; CeedScalar *d_v; CeedBasisNonTensor_Magma *impl; @@ -264,7 +264,6 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed CeedCallBackend(CeedGetData(ceed, &data)); CeedCallBackend(CeedBasisGetData(basis, &impl)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp)); CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); P = num_nodes; @@ -276,8 +275,81 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed else CeedCheck(e_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + // Compile kernels for N as needed + CeedInt iN = 0; + if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q && (e_mode != CEED_EVAL_WEIGHT || !impl->Weight)) { + CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES}; + CeedInt diff = abs(n_array[iN] - N), idiff; + + for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { + idiff = abs(n_array[in] - N); + if (idiff < diff) { + iN = in; + diff = idiff; + } + } + + if (!impl->NB_interp[iN]) { + CeedFESpace fe_space; + CeedInt q_comp_interp, q_comp_deriv; + Ceed ceed_delegate; + char *basis_kernel_path, *weight_kernel_path, *basis_kernel_source; + magma_int_t arch = magma_getdevice_arch(); + + // Tuning parameters for NB + CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); + switch (fe_space) { + case CEED_FE_SPACE_H1: + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv)); + break; + case CEED_FE_SPACE_HDIV: + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv)); + break; + case CEED_FE_SPACE_HCURL: + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv)); + break; + } + impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]); + impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]); + impl->NB_deriv[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]); + impl->NB_deriv_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]); + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + 
CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile kernels + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); + if (!impl->Weight) { + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + } + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp, + "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], + "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T", + impl->NB_deriv_t[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); + if (!impl->Weight) { + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[iN], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + } + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } + } + // Apply basis operation if (e_mode != CEED_EVAL_WEIGHT) { + const CeedScalar *d_b = NULL; + CeedInt q_comp, NB, M, K; + CeedMagmaFunction Kernel; + switch (e_mode) { case CEED_EVAL_INTERP: d_b = impl->d_interp; @@ -298,69 +370,10 @@ static int CeedBasisApplyNonTensor_Magma(CeedBasis basis, CeedInt num_elem, Ceed return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); // LCOV_EXCL_STOP } + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, e_mode, &q_comp)); + M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? Q : P; - // Apply basis operation if (P <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P && Q <= MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { - CeedInt n_array[MAGMA_NONTENSOR_KERNEL_INSTANCES] = {MAGMA_NONTENSOR_KERNEL_N_VALUES}; - CeedInt iN = 0, diff = abs(n_array[iN] - N), idiff; - CeedInt M = (t_mode == CEED_TRANSPOSE) ? P : Q, K = (t_mode == CEED_TRANSPOSE) ? 
Q : P; - CeedMagmaFunction Kernel; - CeedInt NB; - - for (CeedInt in = iN + 1; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { - idiff = abs(n_array[in] - N); - if (idiff < diff) { - iN = in; - diff = idiff; - } - } - - // Compile kernels for N as needed - if (!impl->NB_interp[iN]) { - CeedFESpace fe_space; - CeedInt q_comp_interp, q_comp_deriv; - Ceed ceed_delegate; - char *basis_kernel_path, *basis_kernel_source; - magma_int_t arch = magma_getdevice_arch(); - - // Tuning parameters for NB - CeedCallBackend(CeedBasisGetFESpace(basis, &fe_space)); - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); - switch (fe_space) { - case CEED_FE_SPACE_H1: - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_deriv)); - break; - case CEED_FE_SPACE_HDIV: - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_deriv)); - break; - case CEED_FE_SPACE_HCURL: - CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_deriv)); - break; - } - impl->NB_interp[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_interp, P, Q, n_array[iN]); - impl->NB_interp_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_interp, P, Q, n_array[iN]); - impl->NB_deriv[iN] = nontensor_rtc_get_nb(arch, 'n', q_comp_deriv, P, Q, n_array[iN]); - impl->NB_deriv_t[iN] = nontensor_rtc_get_nb(arch, 't', q_comp_deriv, P, Q, n_array[iN]); - - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h", &basis_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_interp[iN], 8, "BASIS_Q_COMP_INTERP", q_comp_interp, - "BASIS_Q_COMP_DERIV", q_comp_deriv, "BASIS_P", P, "BASIS_Q", Q, "BASIS_NB_INTERP_N", impl->NB_interp[iN], - "BASIS_NB_INTERP_T", impl->NB_interp_t[iN], "BASIS_NB_DERIV_N", impl->NB_deriv[iN], "BASIS_NB_DERIV_T", - impl->NB_deriv_t[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_n", &impl->Interp[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_interp_nontensor_t", &impl->InterpTranspose[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_n", &impl->Deriv[iN])); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_interp[iN], "magma_deriv_nontensor_t", &impl->DerivTranspose[iN])); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); - } if (e_mode == CEED_EVAL_INTERP) { if (t_mode == CEED_TRANSPOSE) { Kernel = impl->InterpTranspose[iN]; @@ -447,17 +460,12 @@ static int CeedBasisDestroyNonTensor_Magma(CeedBasis basis) { CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedBasisGetData(basis, &impl)); -#ifdef CEED_MAGMA_USE_HIP - CeedCallHip(ceed, hipModuleUnload(impl->module_weight)); -#else - CeedCallCuda(ceed, cuModuleUnload(impl->module_weight)); -#endif for (CeedInt in = 0; in < MAGMA_NONTENSOR_KERNEL_INSTANCES; in++) { - if (impl->module_interp[in]) { + if (impl->module[in]) { #ifdef CEED_MAGMA_USE_HIP - CeedCallHip(ceed, hipModuleUnload(impl->module_interp[in])); + CeedCallHip(ceed, hipModuleUnload(impl->module[in])); #else - CeedCallCuda(ceed, cuModuleUnload(impl->module_interp[in])); + CeedCallCuda(ceed, cuModuleUnload(impl->module[in])); #endif } } @@ -569,9 +577,8 @@ int CeedBasisCreateTensorH1_Magma(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const //------------------------------------------------------------------------------ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed, ceed_delegate; + Ceed ceed; Ceed_Magma *data; - char *weight_kernel_path, *basis_kernel_source; CeedBasisNonTensor_Magma *impl; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); @@ -596,18 +603,24 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node magma_setvector(num_qpts * num_nodes * q_comp_grad, sizeof(grad[0]), grad, 1, impl->d_grad, 1, data->queue); } - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); - CeedCallBackend(CeedFree(&weight_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); + // Compile the weight kernel if it won't be compiled later on + if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { + Ceed ceed_delegate; + char *weight_kernel_path, *basis_kernel_source; + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } CeedCallBackend(CeedBasisSetData(basis, impl)); @@ -622,9 +635,8 @@ int CeedBasisCreateH1_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_node //------------------------------------------------------------------------------ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed, ceed_delegate; + Ceed ceed; Ceed_Magma *data; - char *weight_kernel_path, *basis_kernel_source; CeedBasisNonTensor_Magma *impl; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); @@ -649,18 +661,24 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no magma_setvector(num_qpts * num_nodes * q_comp_div, sizeof(div[0]), div, 1, impl->d_div, 1, data->queue); } - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); - CeedCallBackend(CeedFree(&weight_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); + // Compile the weight kernel if it won't be compiled later on + if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { + Ceed ceed_delegate; + char *weight_kernel_path, *basis_kernel_source; + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } CeedCallBackend(CeedBasisSetData(basis, impl)); @@ -675,9 +693,8 @@ int CeedBasisCreateHdiv_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_no //------------------------------------------------------------------------------ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed, ceed_delegate; + Ceed ceed; Ceed_Magma *data; - char *weight_kernel_path, *basis_kernel_source; CeedBasisNonTensor_Magma *impl; CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); @@ -702,18 +719,24 @@ int CeedBasisCreateHcurl_Magma(CeedElemTopology topo, CeedInt dim, CeedInt num_n magma_setvector(num_qpts * num_nodes * q_comp_curl, sizeof(curl[0]), curl, 1, impl->d_curl, 1, data->queue); } - // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data - CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); - - // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); - CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module_weight, 1, "BASIS_Q", num_qpts)); - CeedCallBackend(CeedGetKernelMagma(ceed, impl->module_weight, "magma_weight_nontensor", &impl->Weight)); - CeedCallBackend(CeedFree(&weight_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); + // Compile the weight kernel if it won't be compiled later on + if (num_nodes > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_P || num_qpts > MAGMA_NONTENSOR_CUSTOM_KERNEL_MAX_Q) { + Ceed ceed_delegate; + char *weight_kernel_path, *basis_kernel_source; + + // The RTC compilation code expects a Ceed with the common Ceed_Cuda or Ceed_Hip data + CeedCallBackend(CeedGetDelegate(ceed, &ceed_delegate)); + + // Compile weight kernel (the remainder of kernel compilation happens at first call to CeedBasisApply) + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/magma/magma-basis-weight-nontensor.h", &weight_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, weight_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompileMagma(ceed_delegate, basis_kernel_source, &impl->module[0], 1, "BASIS_Q", num_qpts)); + CeedCallBackend(CeedGetKernelMagma(ceed, impl->module[0], "magma_weight_nontensor", &impl->Weight)); + CeedCallBackend(CeedFree(&weight_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + } CeedCallBackend(CeedBasisSetData(basis, impl)); diff --git a/backends/magma/ceed-magma.h b/backends/magma/ceed-magma.h index 9350dc8452..1ec17d2de2 100644 --- a/backends/magma/ceed-magma.h +++ b/backends/magma/ceed-magma.h @@ -57,7 +57,7 @@ typedef struct { } CeedBasis_Magma; typedef struct { - CeedMagmaModule module_weight, module_interp[MAGMA_NONTENSOR_KERNEL_INSTANCES]; + CeedMagmaModule module[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Interp[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction InterpTranspose[MAGMA_NONTENSOR_KERNEL_INSTANCES]; CeedMagmaFunction Deriv[MAGMA_NONTENSOR_KERNEL_INSTANCES]; From cbfe683adbe50f8abe894256ac583c510bfbed11 Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Fri, 19 Jan 2024 16:39:13 -0800 Subject: [PATCH 2/4] Lazily compile GPU diagonal/point-block diagonal assembly kernels as needed Also fixes CEED_SIZE -> USE_CEEDSIZE definition. 
--- backends/cuda-ref/ceed-cuda-ref-operator.c | 132 ++++++++++++++---- backends/cuda-ref/ceed-cuda-ref.h | 2 +- backends/hip-ref/ceed-hip-ref-operator.c | 130 +++++++++++++---- backends/hip-ref/ceed-hip-ref.h | 2 +- .../cuda-ref-operator-assemble-diagonal.h | 75 ++++------ .../hip/hip-ref-operator-assemble-diagonal.h | 75 ++++------ 6 files changed, 259 insertions(+), 157 deletions(-) diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c index 61a2ec4c9a..3b288f91b6 100644 --- a/backends/cuda-ref/ceed-cuda-ref-operator.c +++ b/backends/cuda-ref/ceed-cuda-ref-operator.c @@ -53,7 +53,12 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); + if (impl->diag->module) { + CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); + } + if (impl->diag->module_point_block) { + CeedCallCuda(ceed, cuModuleUnload(impl->diag->module_point_block)); + } CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_in)); CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_out)); CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); @@ -580,11 +585,10 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedV //------------------------------------------------------------------------------ // Assemble Diagonal Setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { +static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op) { Ceed ceed; - char *diagonal_kernel_path, *diagonal_kernel_source; CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; - CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedInt q_comp, num_nodes, num_qpts; CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; CeedBasis basis_in = NULL, basis_out = NULL; CeedQFunctionField *qf_fields; @@ -653,24 +657,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Cuda *diag = impl->diag; - // Assemble kernel + // Basis matrices CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); - CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! 
-----\n"); - CeedCallCuda( - ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", - num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); - CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); - CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); - - // Basis matrices const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); bool has_eval_none = false; @@ -774,13 +764,92 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Assemble Diagonal Setup (Compilation) +//------------------------------------------------------------------------------ +static inline int CeedOperatorAssembleDiagonalSetupCompile_Cuda(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; + CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; + CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *op_fields; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); + + // Determine active input basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); + if (eval_mode != CEED_EVAL_WEIGHT) { + num_eval_modes_in += q_comp; + } + } + } + + // Determine active output basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); + if (eval_mode != CEED_EVAL_WEIGHT) { + num_eval_modes_out += q_comp; + } + } + } + + // Operator data struct + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedOperatorDiag_Cuda *diag = impl->diag; + + // Assemble kernel + CUmodule *module = is_point_block ? 
&diag->module_point_block : &diag->module; + CeedInt elems_per_block = 1; + CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); + CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); + if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; + else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); + CeedCallCuda(ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", + num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE", + use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); + CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal)); + CeedCallBackend(CeedFree(&diagonal_kernel_path)); + CeedCallBackend(CeedFree(&diagonal_kernel_source)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Assemble Diagonal Core //------------------------------------------------------------------------------ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { Ceed ceed; - CeedSize assembled_length, assembled_qf_length; - CeedInt use_ceedsize_idx = 0, num_elem, num_nodes; + CeedInt num_elem, num_nodes; CeedScalar *elem_diag_array; const CeedScalar *assembled_qf_array; CeedVector assembled_qf = NULL, elem_diag; @@ -795,16 +864,23 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); - CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); - if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; - // Setup - if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op, use_ceedsize_idx)); + if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Cuda(op)); CeedOperatorDiag_Cuda *diag = impl->diag; assert(diag != NULL); + // Assemble kernel if needed + if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) { + CeedSize assembled_length, assembled_qf_length; + CeedInt use_ceedsize_idx = 0; + CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); + if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; + + CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Cuda(op, use_ceedsize_idx, is_point_block)); + } + // Restriction and diagonal vector CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, @@ -981,8 +1057,8 @@ static int 
CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", - asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", - use_ceedsize_idx)); + asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, + "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); CeedCallBackend(CeedFree(&assembly_kernel_path)); CeedCallBackend(CeedFree(&assembly_kernel_source)); diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 8c43f23ab8..1e1affbc4e 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -100,7 +100,7 @@ typedef struct { } CeedQFunctionContext_Cuda; typedef struct { - CUmodule module; + CUmodule module, module_point_block; CUfunction LinearDiagonal; CUfunction LinearPointBlock; CeedElemRestriction diag_rstr, point_block_diag_rstr; diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index b1c4e11c8e..6eda8c485a 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -52,7 +52,12 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { Ceed ceed; CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); + if (impl->diag->module) { + CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); + } + if (impl->diag->module_point_block) { + CeedCallHip(ceed, hipModuleUnload(impl->diag->module_point_block)); + } CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_in)); CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_out)); CeedCallHip(ceed, hipFree(impl->diag->d_identity)); @@ -579,11 +584,10 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVe //------------------------------------------------------------------------------ // Assemble Diagonal Setup //------------------------------------------------------------------------------ -static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { +static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op) { Ceed ceed; - char *diagonal_kernel_path, *diagonal_kernel_source; CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; - CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedInt q_comp, num_nodes, num_qpts; CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; CeedBasis basis_in = NULL, basis_out = NULL; CeedQFunctionField *qf_fields; @@ -652,24 +656,10 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt CeedCallBackend(CeedCalloc(1, &impl->diag)); CeedOperatorDiag_Hip *diag = impl->diag; - // Assemble kernel + // Basis matrices CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); - CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, 
"ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); - CeedCallHip(ceed, - CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", - num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); - CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); - CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); - - // Basis matrices const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); bool has_eval_none = false; @@ -773,13 +763,92 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt return CEED_ERROR_SUCCESS; } +//------------------------------------------------------------------------------ +// Assemble Diagonal Setup (Compilation) +//------------------------------------------------------------------------------ +static inline int CeedOperatorAssembleDiagonalSetupCompile_Hip(CeedOperator op, CeedInt use_ceedsize_idx, const bool is_point_block) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; + CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; + CeedInt num_comp, q_comp, num_nodes, num_qpts; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; + CeedOperatorField *op_fields; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetNumArgs(qf, &num_input_fields, &num_output_fields)); + + // Determine active input basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); + if (eval_mode != CEED_EVAL_WEIGHT) { + num_eval_modes_in += q_comp; + } + } + } + + // Determine active output basis + CeedCallBackend(CeedOperatorGetFields(op, NULL, NULL, NULL, &op_fields)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, NULL, NULL, &qf_fields)); + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { + CeedEvalMode eval_mode; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); + CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); + if (eval_mode != 
CEED_EVAL_WEIGHT) {
+        num_eval_modes_out += q_comp;
+      }
+    }
+  }
+
+  // Operator data struct
+  CeedCallBackend(CeedOperatorGetData(op, &impl));
+  CeedOperatorDiag_Hip *diag = impl->diag;
+
+  // Assemble kernel
+  hipModule_t *module = is_point_block ? &diag->module_point_block : &diag->module;
+  CeedInt elems_per_block = 1;
+  CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes));
+  CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp));
+  if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes;
+  else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts));
+  CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n");
+  CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source));
+  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n");
+  CeedCallHip(ceed, CeedCompile_Hip(ceed, diagonal_kernel_source, module, 8, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT",
+                                    num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "USE_CEEDSIZE",
+                                    use_ceedsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block));
+  CeedCallHip(ceed, CeedGetKernel_Hip(ceed, *module, "LinearDiagonal", is_point_block ? &diag->LinearPointBlock : &diag->LinearDiagonal));
+  CeedCallBackend(CeedFree(&diagonal_kernel_path));
+  CeedCallBackend(CeedFree(&diagonal_kernel_source));
+  return CEED_ERROR_SUCCESS;
+}
+
 //------------------------------------------------------------------------------
 // Assemble Diagonal Core
 //------------------------------------------------------------------------------
 static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) {
   Ceed ceed;
-  CeedSize assembled_length, assembled_qf_length;
-  CeedInt use_ceedsize_idx = 0, num_elem, num_nodes;
+  CeedInt num_elem, num_nodes;
   CeedScalar *elem_diag_array;
   const CeedScalar *assembled_qf_array;
   CeedVector assembled_qf = NULL, elem_diag;
@@ -794,16 +863,23 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect
   CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr));
   CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array));
 
-  CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
-  CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
-  if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
-
   // Setup
-  if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op, use_ceedsize_idx));
+  if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op));
 
   CeedOperatorDiag_Hip *diag = impl->diag;
 
   assert(diag != NULL);
 
+  // Assemble kernel if needed
+  if ((!is_point_block && !diag->LinearDiagonal) || (is_point_block && !diag->LinearPointBlock)) {
+    CeedSize assembled_length, assembled_qf_length;
+    CeedInt use_ceedsize_idx = 0;
+    CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length));
+    CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length));
+    if ((assembled_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1;
+
+    CeedCallBackend(CeedOperatorAssembleDiagonalSetupCompile_Hip(op, 
use_ceedsize_idx, is_point_block)); + } + // Restriction and diagonal vector CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, @@ -978,7 +1054,7 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", - asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", + asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "USE_CEEDSIZE", use_ceedsize_idx)); CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); CeedCallBackend(CeedFree(&assembly_kernel_path)); diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index 14737a2a53..a641351f6d 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -104,7 +104,7 @@ typedef struct { } CeedQFunctionContext_Hip; typedef struct { - hipModule_t module; + hipModule_t module, module_point_block; hipFunction_t LinearDiagonal; hipFunction_t LinearPointBlock; CeedElemRestriction diag_rstr, point_block_diag_rstr; diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h index ab366e7966..656fbbac03 100644 --- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h @@ -47,12 +47,11 @@ static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, //------------------------------------------------------------------------------ // Core code for diagonal assembly //------------------------------------------------------------------------------ -static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, - const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, - const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, - const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, - const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, - CeedScalar *__restrict__ elem_diag_array) { +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, + const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, + const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, + const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { const int tid = threadIdx.x; // Running with P threads if (tid >= NUM_NODES) return; @@ -84,65 +83,41 @@ static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const boo // Each component for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { - // Each qpoint/node pair - if (is_point_block) { - // Point block diagonal - for (IndexType comp_in = 0; comp_in < 
NUM_COMP; comp_in++) { - CeedScalar e_value = 0.; - - for (IndexType q = 0; q < NUM_QPTS; q++) { - const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * - NUM_QPTS + - q]; - - e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; - } - elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; - } - } else { - // Diagonal only +#if USE_POINT_BLOCK + // Point block diagonal + for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { CeedScalar e_value = 0.; + // Each qpoint/node pair for (IndexType q = 0; q < NUM_QPTS; q++) { const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } - elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; + elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; + } +#else + // Diagonal only + CeedScalar e_value = 0.; + + // Each qpoint/node pair + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } + elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; +#endif } } } } } -//------------------------------------------------------------------------------ -// Linear diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, - const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, - const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, - const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - -//------------------------------------------------------------------------------ -// Linear point block diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, - const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, - const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, - const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - //------------------------------------------------------------------------------ 
#endif // CEED_CUDA_REF_OPERATOR_ASSEMBLE_DIAGONAL_H diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h index fcd8df2960..288e09fcad 100644 --- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h +++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h @@ -47,12 +47,11 @@ static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, //------------------------------------------------------------------------------ // Core code for diagonal assembly //------------------------------------------------------------------------------ -static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, - const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, - const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, - const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, - const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, - CeedScalar *__restrict__ elem_diag_array) { +extern "C" __launch_bounds__(BLOCK_SIZE) __global__ + void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, + const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, + const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, + const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { const int tid = threadIdx.x; // Running with P threads if (tid >= NUM_NODES) return; @@ -84,65 +83,41 @@ static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const boo // Each component for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { - // Each qpoint/node pair - if (is_point_block) { - // Point block diagonal - for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { - CeedScalar e_value = 0.; - - for (IndexType q = 0; q < NUM_QPTS; q++) { - const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * - NUM_QPTS + - q]; - - e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; - } - elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; - } - } else { - // Diagonal only +#if USE_POINT_BLOCK + // Point block diagonal + for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { CeedScalar e_value = 0.; + // Each qpoint/node pair for (IndexType q = 0; q < NUM_QPTS; q++) { const CeedScalar qf_value = - assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } - elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; + elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; + } +#else + // Diagonal only + CeedScalar e_value = 0.; + + // Each qpoint/node pair + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = + assembled_qf_array[((((e_in * NUM_COMP + comp_out) 
* NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + + q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; } + elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; +#endif } } } } } -//------------------------------------------------------------------------------ -// Linear diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, - const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, - const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, - const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - -//------------------------------------------------------------------------------ -// Linear point block diagonal -//------------------------------------------------------------------------------ -extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, - const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, - const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, - const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { - DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, - assembled_qf_array, elem_diag_array); -} - //------------------------------------------------------------------------------ #endif // CEED_HIP_REF_OPERATOR_ASSEMBLE_DIAGONAL_H From cf8cbdd67b26059bcd4f184d0db9bf05297bc21d Mon Sep 17 00:00:00 2001 From: Sebastian Grimberg Date: Mon, 22 Jan 2024 06:51:22 -0800 Subject: [PATCH 3/4] Refactor GPU-backend element restriction source to only compile the required kernels, and do so lazily at first apply --- backends/cuda-ref/ceed-cuda-ref-restriction.c | 276 +++++++++------ backends/cuda-ref/ceed-cuda-ref.h | 17 +- backends/hip-ref/ceed-hip-ref-operator.c | 2 +- backends/hip-ref/ceed-hip-ref-restriction.c | 276 +++++++++------ backends/hip-ref/ceed-hip-ref.h | 17 +- .../cuda/cuda-ref-restriction-curl-oriented.h | 183 ++++++++++ .../cuda/cuda-ref-restriction-offset.h | 74 ++++ .../cuda/cuda-ref-restriction-oriented.h | 81 +++++ .../cuda/cuda-ref-restriction-strided.h | 47 +++ .../jit-source/cuda/cuda-ref-restriction.h | 332 ------------------ .../hip/hip-ref-restriction-curl-oriented.h | 183 ++++++++++ .../hip/hip-ref-restriction-offset.h | 74 ++++ .../hip/hip-ref-restriction-oriented.h | 81 +++++ .../hip/hip-ref-restriction-strided.h | 47 +++ .../ceed/jit-source/hip/hip-ref-restriction.h | 332 ------------------ 15 files changed, 1097 insertions(+), 925 deletions(-) create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h 
create mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h delete mode 100644 include/ceed/jit-source/cuda/cuda-ref-restriction.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-offset.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-oriented.h create mode 100644 include/ceed/jit-source/hip/hip-ref-restriction-strided.h delete mode 100644 include/ceed/jit-source/hip/hip-ref-restriction.h diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c index f2b190e5f1..d873386575 100644 --- a/backends/cuda-ref/ceed-cuda-ref-restriction.c +++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c @@ -18,25 +18,126 @@ #include "../cuda/ceed-cuda-compile.h" #include "ceed-cuda-ref.h" +//------------------------------------------------------------------------------ +// Compile restriction kernels +//------------------------------------------------------------------------------ +static inline int CeedElemRestrictionSetupCompile_Cuda(CeedElemRestriction rstr) { + Ceed ceed; + bool is_deterministic; + CeedInt num_elem, num_comp, elem_size, comp_stride; + CeedRestrictionType rstr_type; + char *restriction_kernel_path, *restriction_kernel_source; + CeedElemRestriction_Cuda *impl; + + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + is_deterministic = impl->d_l_vec_indices != NULL; + + // Compile CUDA kernels + switch (rstr_type) { + case CEED_RESTRICTION_STRIDED: { + CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; + bool has_backend_strides; + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + } + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-strided.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! 
-----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", + strides[2])); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_STANDARD: { + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_CURL_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnorientedNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_POINTS: { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); + // LCOV_EXCL_STOP + } break; + } + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Core apply restriction code //------------------------------------------------------------------------------ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { Ceed ceed; - CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; const CeedScalar *d_u; CeedScalar *d_v; CeedElemRestriction_Cuda *impl; - CUfunction kernel; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - const CeedInt num_nodes = 
impl->num_nodes; + + // Assemble kernel if needed + if (!impl->module) { + CeedCallBackend(CeedElemRestrictionSetupCompile_Cuda(rstr)); + } // Get vectors CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); @@ -51,51 +152,47 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C // Restrict if (t_mode == CEED_NOTRANSPOSE) { // L-vector -> E-vector + CeedInt elem_size; + + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; - const CeedInt grid = CeedDivUpInt(num_nodes, block_size); + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedNoTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - kernel = impl->OrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - kernel = impl->CurlOrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else if (use_orients) { - kernel = impl->CurlOrientedUnsignedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_POINTS: { @@ -106,92 +203,80 @@ static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, C } } else { // E-vector -> L-vector - const CeedInt block_size = 32; - const CeedInt grid = CeedDivUpInt(num_nodes, 
block_size); + const bool is_deterministic = impl->d_l_vec_indices != NULL; + const CeedInt block_size = 32; + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - if (impl->OrientedTranspose) { - kernel = impl->OrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - if (impl->CurlOrientedTranspose) { - kernel = impl->CurlOrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else if 
(use_orients) { - if (impl->CurlOrientedUnsignedTranspose) { - kernel = impl->CurlOrientedUnsignedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedUnsignedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Cuda(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } } } break; @@ -297,7 +382,9 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallCuda(ceed, cuModuleUnload(impl->module)); + if (impl->module) { + CeedCallCuda(ceed, cuModuleUnload(impl->module)); + } CeedCallBackend(CeedFree(&impl->h_ind_allocated)); CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); @@ -398,33 +485,17 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt8 *curl_orients, CeedElemRestriction rstr) { Ceed ceed, ceed_parent; bool is_deterministic; - CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; - char *restriction_kernel_path, *restriction_kernel_source; CeedElemRestriction_Cuda *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); - const CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - - // Stride data - CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - if (rstr_type == CEED_RESTRICTION_STRIDED) { - bool has_backend_strides; - - CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); - } - } else { - CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - } + const CeedInt size = num_elem * elem_size; + CeedInt layout[3] = {1, size, elem_size}; CeedCallBackend(CeedCalloc(1, &impl)); impl->num_nodes 
= size; @@ -446,6 +517,7 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); // Set up device offset/orientation arrays + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); if (rstr_type != CEED_RESTRICTION_STRIDED) { switch (mem_type) { case CEED_MEM_HOST: { @@ -576,34 +648,6 @@ int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, } } - // Compile CUDA kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); - CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, - "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", - strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); - if (!is_deterministic) { - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); - } else { - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); - } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); - // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h index 1e1affbc4e..652920280d 100644 --- a/backends/cuda-ref/ceed-cuda-ref.h +++ b/backends/cuda-ref/ceed-cuda-ref.h @@ -25,20 +25,9 @@ typedef struct { typedef struct 
{ CUmodule module; - CUfunction StridedNoTranspose; - CUfunction StridedTranspose; - CUfunction OffsetNoTranspose; - CUfunction OffsetTranspose; - CUfunction OffsetTransposeDet; - CUfunction OrientedNoTranspose; - CUfunction OrientedTranspose; - CUfunction OrientedTransposeDet; - CUfunction CurlOrientedNoTranspose; - CUfunction CurlOrientedTranspose; - CUfunction CurlOrientedTransposeDet; - CUfunction CurlOrientedUnsignedNoTranspose; - CUfunction CurlOrientedUnsignedTranspose; - CUfunction CurlOrientedUnsignedTransposeDet; + CUfunction ApplyNoTranspose, ApplyTranspose; + CUfunction ApplyUnsignedNoTranspose, ApplyUnsignedTranspose; + CUfunction ApplyUnorientedNoTranspose, ApplyUnorientedTranspose; CeedInt num_nodes; CeedInt *h_ind; CeedInt *h_ind_allocated; diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c index 6eda8c485a..cc2538579a 100644 --- a/backends/hip-ref/ceed-hip-ref-operator.c +++ b/backends/hip-ref/ceed-hip-ref-operator.c @@ -864,7 +864,7 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); // Setup - if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op, use_ceedsize_idx, is_point_block)); + if (!impl->diag) CeedCallBackend(CeedOperatorAssembleDiagonalSetup_Hip(op)); CeedOperatorDiag_Hip *diag = impl->diag; assert(diag != NULL); diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c index 5824fa4855..9364f425fb 100644 --- a/backends/hip-ref/ceed-hip-ref-restriction.c +++ b/backends/hip-ref/ceed-hip-ref-restriction.c @@ -17,25 +17,126 @@ #include "../hip/ceed-hip-compile.h" #include "ceed-hip-ref.h" +//------------------------------------------------------------------------------ +// Compile restriction kernels +//------------------------------------------------------------------------------ +static inline int CeedElemRestrictionSetupCompile_Hip(CeedElemRestriction rstr) { + Ceed ceed; + bool is_deterministic; + CeedInt num_elem, num_comp, elem_size, comp_stride; + CeedRestrictionType rstr_type; + char *restriction_kernel_path, *restriction_kernel_source; + CeedElemRestriction_Hip *impl; + + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); + CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + is_deterministic = impl->d_l_vec_indices != NULL; + + // Compile HIP kernels + switch (rstr_type) { + case CEED_RESTRICTION_STRIDED: { + CeedInt strides[3] = {1, num_elem * elem_size, elem_size}; + bool has_backend_strides; + + CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); + if (!has_backend_strides) { + CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + } + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-strided.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading 
Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_STRIDE_NODES", strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", + strides[2])); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_STANDARD: { + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyTranspose)); + } break; + case CEED_RESTRICTION_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 
1 : 0)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_CURL_ORIENTED: { + char *offset_kernel_path; + + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction-offset.h", &offset_kernel_path)); + CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, offset_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 6, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, + "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, + "USE_DETERMINISTIC", is_deterministic ? 1 : 0)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedNoTranspose", &impl->ApplyNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->ApplyUnsignedNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->ApplyUnorientedNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->ApplyTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->ApplyUnsignedTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->ApplyUnorientedTranspose)); + CeedCallBackend(CeedFree(&offset_kernel_path)); + } break; + case CEED_RESTRICTION_POINTS: { + // LCOV_EXCL_START + return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); + // LCOV_EXCL_STOP + } break; + } + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Core apply restriction code //------------------------------------------------------------------------------ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { Ceed ceed; - CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; const CeedScalar *d_u; CeedScalar *d_v; CeedElemRestriction_Hip *impl; - hipFunction_t kernel; CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - const CeedInt num_nodes = impl->num_nodes; 
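The HIP apply path mirrors the CUDA refactor above: the module is no longer built when the restriction is created, but on demand the first time the restriction is applied, and only the kernels for that restriction's type are compiled. A minimal sketch of this compile-on-first-apply pattern follows; the struct and function names are hypothetical stand-ins for illustration, not the real hipModule_t/hipFunction_t handles or CeedElemRestrictionSetupCompile_Hip.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the backend's module/kernel handles; the actual
 * code stores hipModule_t and hipFunction_t members on the impl struct. */
typedef struct {
  bool module_built;        /* stands in for impl->module != NULL        */
  int  apply_no_transpose;  /* stands in for impl->ApplyNoTranspose      */
  int  apply_transpose;     /* stands in for impl->ApplyTranspose        */
} RestrictionImpl;

/* Compile only the kernels this restriction actually needs (strided, offset,
 * oriented, or curl-oriented), as the real setup-compile routine does. */
static int SetupCompile(RestrictionImpl *impl) {
  impl->apply_no_transpose = 1;  /* pretend we JIT-compiled and looked up kernels */
  impl->apply_transpose    = 2;
  impl->module_built       = true;
  return 0;
}

/* Apply defers compilation to the first call instead of doing it at creation. */
static int Apply(RestrictionImpl *impl, bool transpose) {
  if (!impl->module_built) SetupCompile(impl);  /* lazy, one-time setup */
  printf("launching kernel %d\n", transpose ? impl->apply_transpose : impl->apply_no_transpose);
  return 0;
}

int main(void) {
  RestrictionImpl impl = {0};
  Apply(&impl, false);  /* first apply triggers compilation */
  Apply(&impl, true);   /* later applies reuse the cached module */
  return 0;
}

Deferring compilation in this way keeps restriction creation cheap and avoids JIT-compiling transpose or unsigned variants that are never launched.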
+ + // Assemble kernel if needed + if (!impl->module) { + CeedCallBackend(CeedElemRestrictionSetupCompile_Hip(rstr)); + } // Get vectors CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); @@ -50,51 +151,47 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce // Restrict if (t_mode == CEED_NOTRANSPOSE) { // L-vector -> E-vector + CeedInt elem_size; + + CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); const CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; - const CeedInt grid = CeedDivUpInt(num_nodes, block_size); + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedNoTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - kernel = impl->OrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - kernel = impl->CurlOrientedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyNoTranspose, grid, block_size, args)); } else if (use_orients) { - kernel = impl->CurlOrientedUnsignedNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedNoTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetNoTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedNoTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_POINTS: { @@ -105,92 +202,80 @@ static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, Ce } } else { // E-vector -> L-vector - const CeedInt block_size = 64; - const CeedInt grid = CeedDivUpInt(num_nodes, block_size); + const bool is_deterministic 
= impl->d_l_vec_indices != NULL; + const CeedInt block_size = 64; + const CeedInt grid = CeedDivUpInt(impl->num_nodes, block_size); switch (rstr_type) { case CEED_RESTRICTION_STRIDED: { - kernel = impl->StridedTranspose; - void *args[] = {&num_elem, &d_u, &d_v}; + void *args[] = {&d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } break; case CEED_RESTRICTION_STANDARD: { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } } break; case CEED_RESTRICTION_ORIENTED: { if (use_signs) { - if (impl->OrientedTranspose) { - kernel = impl->OrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->OrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } } break; case CEED_RESTRICTION_CURL_ORIENTED: { if (use_signs && use_orients) { - if (impl->CurlOrientedTranspose) { - kernel = impl->CurlOrientedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyTranspose, grid, block_size, args)); } } else if (use_orients) { - if (impl->CurlOrientedUnsignedTranspose) { - 
kernel = impl->CurlOrientedUnsignedTranspose; - void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } else { - kernel = impl->CurlOrientedUnsignedTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnsignedTranspose, grid, block_size, args)); } } else { - if (impl->OffsetTranspose) { - kernel = impl->OffsetTranspose; - void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + if (!is_deterministic) { + void *args[] = {&impl->d_ind, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } else { - kernel = impl->OffsetTransposeDet; void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); + CeedCallBackend(CeedRunKernel_Hip(ceed, impl->ApplyUnorientedTranspose, grid, block_size, args)); } } } break; @@ -296,7 +381,9 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) { CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallHip(ceed, hipModuleUnload(impl->module)); + if (impl->module) { + CeedCallHip(ceed, hipModuleUnload(impl->module)); + } CeedCallBackend(CeedFree(&impl->h_ind_allocated)); CeedCallHip(ceed, hipFree(impl->d_ind_allocated)); CeedCallHip(ceed, hipFree(impl->d_t_offsets)); @@ -397,33 +484,17 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt8 *curl_orients, CeedElemRestriction rstr) { Ceed ceed, ceed_parent; bool is_deterministic; - CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedInt num_elem, elem_size; CeedRestrictionType rstr_type; - char *restriction_kernel_path, *restriction_kernel_source; CeedElemRestriction_Hip *impl; CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); - CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); - const CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - - // Stride data - CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - if (rstr_type == CEED_RESTRICTION_STRIDED) { - bool has_backend_strides; - - CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); - if (!has_backend_strides) { - CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); - } - } else { - CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - } + const CeedInt size = num_elem * elem_size; + CeedInt layout[3] = {1, size, elem_size}; CeedCallBackend(CeedCalloc(1, &impl)); impl->num_nodes = size; @@ -445,6 +516,7 @@ int CeedElemRestrictionCreate_Hip(CeedMemType 
mem_type, CeedCopyMode copy_mode, CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); // Set up device offset/orientation arrays + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); if (rstr_type != CEED_RESTRICTION_STRIDED) { switch (mem_type) { case CEED_MEM_HOST: { @@ -575,34 +647,6 @@ int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, } } - // Compile HIP kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); - CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, - "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", - strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); - if (!is_deterministic) { - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); - } else { - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); - } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); - // Register backend functions CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Hip)); CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Hip)); diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h index a641351f6d..8784a6865a 100644 --- a/backends/hip-ref/ceed-hip-ref.h +++ b/backends/hip-ref/ceed-hip-ref.h @@ -29,20 +29,9 @@ typedef struct { typedef struct { hipModule_t module; - hipFunction_t StridedNoTranspose; - hipFunction_t StridedTranspose; - 
hipFunction_t OffsetNoTranspose; - hipFunction_t OffsetTranspose; - hipFunction_t OffsetTransposeDet; - hipFunction_t OrientedNoTranspose; - hipFunction_t OrientedTranspose; - hipFunction_t OrientedTransposeDet; - hipFunction_t CurlOrientedNoTranspose; - hipFunction_t CurlOrientedTranspose; - hipFunction_t CurlOrientedTransposeDet; - hipFunction_t CurlOrientedUnsignedNoTranspose; - hipFunction_t CurlOrientedUnsignedTranspose; - hipFunction_t CurlOrientedUnsignedTransposeDet; + hipFunction_t ApplyNoTranspose, ApplyTranspose; + hipFunction_t ApplyUnsignedNoTranspose, ApplyUnsignedTranspose; + hipFunction_t ApplyUnorientedNoTranspose, ApplyUnorientedTranspose; CeedInt num_nodes; CeedInt *h_ind; CeedInt *h_ind_allocated; diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h new file mode 100644 index 0000000000..8609acc9bb --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-curl-oriented.h @@ -0,0 +1,183 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA curl-oriented element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_CURL_ORIENTED_H +#define CEED_CUDA_REF_RESTRICTION_CURL_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind = indices[node]; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_CURL_ORIENTED_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h new file mode 100644 index 0000000000..f5e3f0cfb4 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-offset.h @@ -0,0 +1,74 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA offset element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_OFFSET_H +#define CEED_CUDA_REF_RESTRICTION_OFFSET_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, standard (with offsets) +//------------------------------------------------------------------------------ +extern "C" __global__ void OffsetNoTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, standard (with offsets) +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + } + } +} +#else +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_OFFSET_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h new file mode 100644 index 0000000000..85bb5a3e83 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-oriented.h @@ -0,0 +1,81 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA oriented element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_ORIENTED_H +#define CEED_CUDA_REF_RESTRICTION_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void OrientedNoTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, const CeedScalar *__restrict__ u, + CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); + } + } +} +#else +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const bool orient = orients[t_ind]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0); + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_ORIENTED_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h new file mode 100644 index 0000000000..7fe2d75cf5 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-ref-restriction-strided.h @@ -0,0 +1,47 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA strided element restriction kernels +#ifndef CEED_CUDA_REF_RESTRICTION_STRIDED_H +#define CEED_CUDA_REF_RESTRICTION_STRIDED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedNoTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = + u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } +} + +//------------------------------------------------------------------------------ + +#endif // CEED_CUDA_REF_RESTRICTION_STRIDED_H diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction.h b/include/ceed/jit-source/cuda/cuda-ref-restriction.h deleted file mode 100644 index 80011148e6..0000000000 --- a/include/ceed/jit-source/cuda/cuda-ref-restriction.h +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Internal header for CUDA element restriction kernels -#ifndef CEED_CUDA_REF_RESTRICTION_H -#define CEED_CUDA_REF_RESTRICTION_H - -#include - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = - u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? 
indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; - const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; - const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, unsigned curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); - const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); - const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += - u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); - } - } -} - -extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for 
(CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, - u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); - } - } -} - -extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const bool orient = orients[t_ind]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0); - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; - const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value += - loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); - } - } -} - -extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; - const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value[comp] += - loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, unsigned curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind = indices[node]; - const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; - const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value += - loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); - } - } -} - -extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; - const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); - const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; - value[comp] += - loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ - -#endif // CEED_CUDA_REF_RESTRICTION_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h new file mode 100644 index 0000000000..c1131ae3d0 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-curl-oriented.h @@ -0,0 +1,183 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP curl-oriented element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_CURL_ORIENTED_H +#define CEED_HIP_REF_RESTRICTION_CURL_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? 
indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; + const CeedInt ind_d = indices[node]; + const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; + const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; + value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; + value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; + const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, unsigned curl-oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ indices, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + const CeedInt ind = indices[node]; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + CeedScalar value = 0.0; + value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value += + loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); + } + } +} +#else +extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; + const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); + const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; + value[comp] += + loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_CURL_ORIENTED_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-offset.h b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h new file mode 100644 index 0000000000..d31b8db893 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-offset.h @@ -0,0 +1,74 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP offset element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_OFFSET_H +#define CEED_HIP_REF_RESTRICTION_OFFSET_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, standard (with offsets) +//------------------------------------------------------------------------------ +extern "C" __global__ void OffsetNoTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, standard (with offsets) +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + } + } +} +#else +extern "C" __global__ void OffsetTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_OFFSET_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h new file mode 100644 index 0000000000..668169783a --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-oriented.h @@ -0,0 +1,81 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. 
See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP oriented element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_ORIENTED_H +#define CEED_HIP_REF_RESTRICTION_ORIENTED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, oriented +//------------------------------------------------------------------------------ +extern "C" __global__ void OrientedNoTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, oriented +//------------------------------------------------------------------------------ +#if !USE_DETERMINISTIC +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ indices, const bool *__restrict__ orients, const CeedScalar *__restrict__ u, + CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; + const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); + } + } +} +#else +extern "C" __global__ void OrientedTranspose(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, + const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + CeedScalar value[RSTR_NUM_COMP]; + + for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; + const bool orient = orients[t_ind]; + const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; + const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0); + } + } + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } +} +#endif + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_ORIENTED_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction-strided.h b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h new file mode 100644 index 0000000000..8f8ac4ee19 --- /dev/null +++ b/include/ceed/jit-source/hip/hip-ref-restriction-strided.h @@ -0,0 +1,47 @@ +// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP strided element restriction kernels +#ifndef CEED_HIP_REF_RESTRICTION_STRIDED_H +#define CEED_HIP_REF_RESTRICTION_STRIDED_H + +#include + +//------------------------------------------------------------------------------ +// L-vector -> E-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedNoTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = + u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; + } + } +} + +//------------------------------------------------------------------------------ +// E-vector -> L-vector, strided +//------------------------------------------------------------------------------ +extern "C" __global__ void StridedTranspose(const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < RSTR_NUM_ELEM * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += + u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } +} + +//------------------------------------------------------------------------------ + +#endif // CEED_HIP_REF_RESTRICTION_STRIDED_H diff --git a/include/ceed/jit-source/hip/hip-ref-restriction.h b/include/ceed/jit-source/hip/hip-ref-restriction.h deleted file mode 100644 index e1de6d87e9..0000000000 --- a/include/ceed/jit-source/hip/hip-ref-restriction.h +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-// -// SPDX-License-Identifier: BSD-2-Clause -// -// This file is part of CEED: http://github.com/ceed - -/// @file -/// Internal header for HIP element restriction kernels -#ifndef CEED_HIP_REF_RESTRICTION_H -#define CEED_HIP_REF_RESTRICTION_H - -#include - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = - u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? 
indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; - const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; - const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// L-vector -> E-vector, unsigned curl-oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, - const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; - const CeedInt ind_d = indices[node]; - const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; - const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); - const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); - const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - CeedScalar value = 0.0; - value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; - value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; - value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; - v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, strided -//------------------------------------------------------------------------------ -extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += - u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, standard (with offsets) -//------------------------------------------------------------------------------ -extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); - } - } -} - -extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - CeedScalar value[RSTR_NUM_COMP]; - - for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; - const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; - const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } -} - -//------------------------------------------------------------------------------ -// E-vector -> L-vector, oriented -//------------------------------------------------------------------------------ -extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, - const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; - const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for 
(CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE,
-                u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0));
-    }
-  }
-}
-
-extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
-                                                const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients,
-                                                const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
-  CeedScalar value[RSTR_NUM_COMP];
-
-  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
-    const CeedInt ind = l_vec_indices[i];
-    const CeedInt range_1 = t_offsets[i];
-    const CeedInt range_N = t_offsets[i + 1];
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
-
-    for (CeedInt j = range_1; j < range_N; j++) {
-      const CeedInt t_ind = t_indices[j];
-      const bool orient = orients[t_ind];
-      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
-      const CeedInt elem = t_ind / RSTR_ELEM_SIZE;
-
-      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0);
-      }
-    }
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
-  }
-}
-
-//------------------------------------------------------------------------------
-// E-vector -> L-vector, curl-oriented
-//------------------------------------------------------------------------------
-extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices,
-                                                 const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u,
-                                                 CeedScalar *__restrict__ v) {
-  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
-    const CeedInt ind = indices[node];
-    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
-    const CeedInt elem = node / RSTR_ELEM_SIZE;
-    const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0;
-    const CeedInt8 curl_orient_d = curl_orients[3 * node + 1];
-    const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0;
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      CeedScalar value = 0.0;
-      value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-      value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-      value +=
-          loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
-    }
-  }
-}
-
-extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
-                                                    const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients,
-                                                    const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
-  CeedScalar value[RSTR_NUM_COMP];
-
-  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
-    const CeedInt ind = l_vec_indices[i];
-    const CeedInt range_1 = t_offsets[i];
-    const CeedInt range_N = t_offsets[i + 1];
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
-
-    for (CeedInt j = range_1; j < range_N; j++) {
-      const CeedInt t_ind = t_indices[j];
-      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
-      const CeedInt elem = t_ind / RSTR_ELEM_SIZE;
-      const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0;
-      const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1];
-      const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0;
-
-      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-        value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-        value[comp] +=
-            loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      }
-    }
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
-  }
-}
-
-//------------------------------------------------------------------------------
-// E-vector -> L-vector, unsigned curl-oriented
-//------------------------------------------------------------------------------
-extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices,
-                                                         const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u,
-                                                         CeedScalar *__restrict__ v) {
-  for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) {
-    const CeedInt loc_node = node % RSTR_ELEM_SIZE;
-    const CeedInt elem = node / RSTR_ELEM_SIZE;
-    const CeedInt ind = indices[node];
-    const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0;
-    const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]);
-    const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0;
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-      CeedScalar value = 0.0;
-      value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-      value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-      value +=
-          loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value);
-    }
-  }
-}
-
-extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices,
-                                                            const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients,
-                                                            const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) {
-  CeedScalar value[RSTR_NUM_COMP];
-
-  for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) {
-    const CeedInt ind = l_vec_indices[i];
-    const CeedInt range_1 = t_offsets[i];
-    const CeedInt range_N = t_offsets[i + 1];
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0;
-
-    for (CeedInt j = range_1; j < range_N; j++) {
-      const CeedInt t_ind = t_indices[j];
-      const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE;
-      const CeedInt elem = t_ind / RSTR_ELEM_SIZE;
-      const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0;
-      const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]);
-      const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0;
-
-      for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) {
-        value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0;
-        value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d;
-        value[comp] +=
-            loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0;
-      }
-    }
-
-    for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp];
-  }
-}
-
-//------------------------------------------------------------------------------
-
-#endif  // CEED_HIP_REF_RESTRICTION_H

From 1b7492f867164853116dc978bf39f769f48c56d0 Mon Sep 17 00:00:00 2001
From: Sebastian Grimberg
Date: Mon, 22 Jan 2024 09:02:30 -0800
Subject: [PATCH 4/4] Fix a few incorrect JIT header includes

---
 include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h | 2 +-
 include/ceed/jit-source/cuda/cuda-gen-templates.h       | 2 +-
 include/ceed/jit-source/hip/hip-gen-templates.h         | 2 +-
 include/ceed/jit-source/sycl/sycl-gen-templates.h       | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
index b2ac2842ed..4530a633b0 100644
--- a/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
+++ b/include/ceed/jit-source/cuda/cuda-atomic-add-fallback.h
@@ -10,7 +10,7 @@
 #ifndef CEED_CUDA_ATOMIC_ADD_FALLBACK_H
 #define CEED_CUDA_ATOMIC_ADD_FALLBACK_H

-#include 
+#include 

 //------------------------------------------------------------------------------
 // Atomic add, for older CUDA
diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h
index 265f7d2b4f..a3496e8411 100644
--- a/include/ceed/jit-source/cuda/cuda-gen-templates.h
+++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -10,7 +10,7 @@
 #ifndef CEED_CUDA_GEN_TEMPLATES_H
 #define CEED_CUDA_GEN_TEMPLATES_H

-#include 
+#include 

 //------------------------------------------------------------------------------
 // Load matrices for basis actions
diff --git a/include/ceed/jit-source/hip/hip-gen-templates.h b/include/ceed/jit-source/hip/hip-gen-templates.h
index 884f346d76..27b894629e 100644
--- a/include/ceed/jit-source/hip/hip-gen-templates.h
+++ b/include/ceed/jit-source/hip/hip-gen-templates.h
@@ -10,7 +10,7 @@
 #ifndef CEED_HIP_GEN_TEMPLATES_H
 #define CEED_HIP_GEN_TEMPLATES_H

-#include 
+#include 

 //------------------------------------------------------------------------------
 // Load matrices for basis actions
diff --git a/include/ceed/jit-source/sycl/sycl-gen-templates.h b/include/ceed/jit-source/sycl/sycl-gen-templates.h
index 88aa7b98cd..c13e12b690 100644
--- a/include/ceed/jit-source/sycl/sycl-gen-templates.h
+++ b/include/ceed/jit-source/sycl/sycl-gen-templates.h
@@ -10,7 +10,7 @@
 #ifndef CEED_SYCL_GEN_TEMPLATES_H
 #define CEED_SYCL_GEN_TEMPLATES_H

-#include 
+#include 

 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable