diff --git a/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu b/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu index e7c1701f..a2a87a4b 100644 --- a/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu +++ b/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu @@ -728,7 +728,7 @@ __global__ void Marlin( // latency hiding. At the same time, we want relatively few warps to have many registers per warp and small tiles. const int THREADS = 256; const int STAGES = 4; // 4 pipeline stages fit into shared memory -const int SHARED_MEM = 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0) +const int SHARED_MEM = 164 * 1024; // max shared memory on compute capability 8.0 // ADDED: add scaled zero pointer #define CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, GROUP_BLOCKS) \