diff --git a/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu b/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu
index e7c1701f..a2a87a4b 100644
--- a/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu
+++ b/optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu
@@ -728,7 +728,7 @@ __global__ void Marlin(
 // latency hiding. At the same time, we want relatively few warps to have many registers per warp and small tiles.
 const int THREADS = 256;
 const int STAGES = 4; // 4 pipeline stages fit into shared memory
-const int SHARED_MEM = 96 * 1024; // max shared memory on compute capability 8.6 (< 8.0)
+const int SHARED_MEM = 164 * 1024; // 164 KB shared memory budget, targeting compute capability 8.0. NOTE(review): the CUDA guide lists 164 KB per SM but 163 KB max per block on 8.0 (99 KB on 8.6) -- confirm how this is used at launch
 
 // ADDED: add scaled zero pointer
 #define CALL_IF(THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, GROUP_BLOCKS) \