Add NANOVDB_USE_SYNC_CUDA_MALLOC define to force sync CUDA malloc #1799

Open
wants to merge 2 commits into master
Changes from all commits
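The new flag is an ordinary preprocessor define, so a minimal usage sketch (my addition, not part of this diff; the include path and the default-stream argument below are assumptions) is to define it before including any NanoVDB CUDA header, or to pass it on the compiler command line as -DNANOVDB_USE_SYNC_CUDA_MALLOC:

// Hypothetical sketch: force the synchronous fallback at compile time.
#define NANOVDB_USE_SYNC_CUDA_MALLOC
#include <nanovdb/util/cuda/CudaUtils.h>

int main()
{
    float* d_buf = nullptr;
    // With the flag defined (or with CUDA older than 11.2), CUDA_MALLOC and
    // CUDA_FREE expand to plain cudaMalloc/cudaFree and the stream argument is ignored.
    cudaCheck(CUDA_MALLOC((void**)&d_buf, 1024 * sizeof(float), /*stream=*/0));
    cudaCheck(CUDA_FREE(d_buf, /*stream=*/0));
    return 0;
}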
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaAddBlindData.cuh
@@ -62,7 +62,7 @@ cudaAddBlindData(const NanoGrid<BuildT> *d_grid,
// extract byte sizes of the grid, blind meta data and blind data
enum {GRID=0, META=1, DATA=2, CHECKSUM=3};
uint64_t tmp[4], *d_tmp;
-cudaCheck(cudaMallocAsync((void**)&d_tmp, 4*sizeof(uint64_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_tmp, 4*sizeof(uint64_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
if (auto count = d_grid->blindDataCount()) {
d_tmp[GRID] = PtrDiff(&d_grid->blindMetaData(0), d_grid);
@@ -114,7 +114,7 @@ cudaAddBlindData(const NanoGrid<BuildT> *d_grid,
for (uint32_t i=0, n=grid.mBlindMetadataCount-1; i<n; ++i, ++meta) meta->mDataOffset += sizeof(GridBlindMetaData);
grid.mGridSize += sizeof(GridBlindMetaData) + meta->blindDataSize();// expansion with 32 byte alignment
}); cudaCheckError();
-cudaCheck(cudaFreeAsync(d_tmp, stream));
+cudaCheck(CUDA_FREE(d_tmp, stream));

GridChecksum cs(tmp[CHECKSUM]);
cudaGridChecksum(reinterpret_cast<GridData*>(d_data), cs.mode());
6 changes: 3 additions & 3 deletions nanovdb/nanovdb/util/cuda/CudaDeviceBuffer.h
@@ -153,7 +153,7 @@ inline void CudaDeviceBuffer::init(uint64_t size, bool host, void* stream)
cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned
checkPtr(mCpuData, "CudaDeviceBuffer::init: failed to allocate host buffer");
} else {
-cudaCheck(cudaMallocAsync((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
+cudaCheck(CUDA_MALLOC((void**)&mGpuData, size, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
checkPtr(mGpuData, "CudaDeviceBuffer::init: failed to allocate device buffer");
}
mSize = size;
@@ -163,7 +163,7 @@ inline void CudaDeviceBuffer::deviceUpload(void* stream, bool sync) const
{
checkPtr(mCpuData, "uninitialized cpu data");
if (mGpuData == nullptr) {
-cudaCheck(cudaMallocAsync((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
+cudaCheck(CUDA_MALLOC((void**)&mGpuData, mSize, reinterpret_cast<cudaStream_t>(stream))); // un-managed memory on the device, always 32B aligned!
}
checkPtr(mGpuData, "uninitialized gpu data");
cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream)));
@@ -183,7 +183,7 @@ inline void CudaDeviceBuffer::deviceDownload(void* stream, bool sync) const

inline void CudaDeviceBuffer::clear(void *stream)
{
-if (mGpuData) cudaCheck(cudaFreeAsync(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
+if (mGpuData) cudaCheck(CUDA_FREE(mGpuData, reinterpret_cast<cudaStream_t>(stream)));
if (mCpuData) cudaCheck(cudaFreeHost(mCpuData));
mCpuData = mGpuData = nullptr;
mSize = 0;
32 changes: 16 additions & 16 deletions nanovdb/nanovdb/util/cuda/CudaGridChecksum.cuh
@@ -67,7 +67,7 @@ __global__ void checksumKernel(const T *d_data, uint32_t* d_blockCRC, uint32_t b
inline uint32_t* cudaCreateLut(cudaStream_t stream = 0)
{
uint32_t *d_lut;
-cudaCheck(cudaMallocAsync((void**)&d_lut, 256*sizeof(uint32_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_lut, 256*sizeof(uint32_t), stream));
initLutKernel<<<1, 256, 0, stream>>>(d_lut);
cudaCheckError();
return d_lut;
@@ -93,10 +93,10 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
uint8_t *d_begin = reinterpret_cast<uint8_t*>(d_gridData);
uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32
uint64_t size[2], *d_size;// {total size of grid, partial size for first checksum}
-cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream));

// Compute CRC32 checksum of GridData, TreeData, RootData (+tiles), but exclude GridData::mMagic and GridData::mChecksum
-cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
+cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__ (size_t) {
d_size[0] = d_gridData->mGridSize;
uint8_t *d_mid = d_gridData->template nodePtr<2>();
if (d_mid == nullptr) {// no upper nodes
@@ -112,15 +112,15 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(size, d_size, 2*sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
-cudaCheck(cudaFreeAsync(d_size, stream));
+cudaCheck(CUDA_FREE(d_size, stream));

if (mode != ChecksumMode::Full || size[0] == size[1]) return ChecksumMode::Partial;

// Compute CRC32 checksum of 4K block of everything remaining in the buffer, i.e. nodes and blind data
const uint8_t *d_mid = d_begin + size[1], *d_end = d_begin + size[0];
uint32_t *d_checksums;// 4096 byte chunks
const uint64_t checksumCount = (d_end - d_mid) >> NANOVDB_CRC32_LOG2_BLOCK_SIZE;// 4 KB (4096 byte)
-cudaCheck(cudaMallocAsync((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_checksums, checksumCount*sizeof(uint32_t), stream));
cudaLambdaKernel<<<numBlocks(checksumCount), mNumThreads, 0, stream>>>(checksumCount, [=] __device__(size_t tid) {
uint32_t size = 1<<NANOVDB_CRC32_LOG2_BLOCK_SIZE;
if (tid+1 == checksumCount) size += d_end - d_mid - (checksumCount<<NANOVDB_CRC32_LOG2_BLOCK_SIZE);
@@ -132,8 +132,8 @@ inline ChecksumMode cudaGridChecksum(GridData *d_gridData, ChecksumMode mode = C
p[1] = crc32::checksum((const uint8_t*)d_checksums, checksumCount*sizeof(uint32_t), d_lut);
});
cudaCheckError();
-cudaCheck(cudaFreeAsync(d_checksums, stream));
-cudaCheck(cudaFreeAsync(d_lut, stream));
+cudaCheck(CUDA_FREE(d_checksums, stream));
+cudaCheck(CUDA_FREE(d_lut, stream));

return ChecksumMode::Full;
}// cudaGridChecksum
@@ -147,11 +147,11 @@ inline ChecksumMode cudaGridChecksum(NanoGrid<BuildT> *d_grid, ChecksumMode mode
inline GridChecksum cudaGetGridChecksum(GridData *d_gridData, cudaStream_t stream = 0)
{
uint64_t checksum, *d_checksum;
-cudaCheck(cudaMallocAsync((void**)&d_checksum, sizeof(uint64_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_checksum, sizeof(uint64_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {*d_checksum = d_gridData->mChecksum;});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&checksum, d_checksum, sizeof(uint64_t), cudaMemcpyDeviceToHost, stream));
-cudaCheck(cudaFreeAsync(d_checksum, stream));
+cudaCheck(CUDA_FREE(d_checksum, stream));
return GridChecksum(checksum);;
}

@@ -172,7 +172,7 @@ void cudaGridChecksum(NanoGrid<ValueT> *d_grid, ChecksumMode mode = ChecksumMode

uint32_t *d_lut = crc32::cudaCreateLut(stream);// allocate and generate device LUT for CRC32
uint64_t size[2], *d_size;
-cudaCheck(cudaMallocAsync((void**)&d_size, 2*sizeof(uint64_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_size, 2*sizeof(uint64_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
d_size[0] = d_grid->gridSize();
d_size[1] = d_grid->memUsage() + d_grid->tree().memUsage() + d_grid->tree().root().memUsage();
@@ -188,15 +188,15 @@ void cudaGridChecksum(NanoGrid<ValueT> *d_grid, ChecksumMode mode = ChecksumMode

// Get node counts
uint32_t nodeCount[3], *d_nodeCount, *d_checksums, *d_ptr;
-cudaCheck(cudaMallocAsync((void**)&d_nodeCount, 3*sizeof(uint32_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_nodeCount, 3*sizeof(uint32_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
auto &tree = d_grid->tree();
for (int i = 0; i < 3; ++i) d_nodeCount[i] = tree.nodeCount(i);
});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(nodeCount, d_nodeCount, 3*sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
-cudaCheck(cudaFreeAsync(d_nodeCount, stream));
-cudaCheck(cudaMallocAsync((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream));
+cudaCheck(CUDA_FREE(d_nodeCount, stream));
+cudaCheck(CUDA_MALLOC((void**)&d_checksums, (nodeCount[0]+nodeCount[1]+nodeCount[2])*sizeof(uint32_t), stream));

auto nodeMgrHandle = cudaCreateNodeManager<ValueT, CudaDeviceBuffer>(d_grid, CudaDeviceBuffer(), stream);
auto *d_nodeMgr = nodeMgrHandle.template deviceMgr<ValueT>();
@@ -232,9 +232,9 @@ void cudaGridChecksum(NanoGrid<ValueT> *d_grid, ChecksumMode mode = ChecksumMode
});
cudaCheckError();

-cudaCheck(cudaFreeAsync(d_size, stream));
-cudaCheck(cudaFreeAsync(d_checksums, stream));
-cudaCheck(cudaFreeAsync(d_lut, stream));
+cudaCheck(CUDA_FREE(d_size, stream));
+cudaCheck(CUDA_FREE(d_checksums, stream));
+cudaCheck(CUDA_FREE(d_lut, stream));
}// cudaGridChecksum

#endif
8 changes: 4 additions & 4 deletions nanovdb/nanovdb/util/cuda/CudaGridHandle.cuh
@@ -71,7 +71,7 @@ cudaSplitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = n
if (ptr == nullptr) return VectorT<GridHandle<BufferT>>();
VectorT<GridHandle<BufferT>> handles(handle.gridCount());
bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
-cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream));
for (uint32_t n=0; n<handle.gridCount(); ++n) {
auto buffer = BufferT::create(handle.gridSize(n), other, false, stream);
GridData *dst = reinterpret_cast<GridData*>(buffer.deviceData());
@@ -84,7 +84,7 @@ cudaSplitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = n
handles[n] = GridHandle<BufferT>(std::move(buffer));
ptr += handle.gridSize(n);
}
-cudaCheck(cudaFreeAsync(d_dirty, stream));
+cudaCheck(CUDA_FREE(d_dirty, stream));
return std::move(handles);
}// cudaSplitGridHandles

@@ -106,7 +106,7 @@ cudaMergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT*
auto buffer = BufferT::create(size, other, false, stream);
uint8_t *dst = buffer.deviceData();
bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
-cudaCheck(cudaMallocAsync((void**)&d_dirty, sizeof(bool), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_dirty, sizeof(bool), stream));
for (auto &h : handles) {
const uint8_t *src = h.deviceData();
for (uint32_t n=0; n<h.gridCount(); ++n) {
@@ -120,7 +120,7 @@ cudaMergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT*
src += h.gridSize(n);
}
}
-cudaCheck(cudaFreeAsync(d_dirty, stream));
+cudaCheck(CUDA_FREE(d_dirty, stream));
return GridHandle<BufferT>(std::move(buffer));
}// cudaMergeGridHandles

4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaGridStats.cuh
@@ -210,7 +210,7 @@ void CudaGridStats<BuildT, StatsT>::operator()(NanoGrid<BuildT> *d_grid, cudaStr

StatsT *d_stats = nullptr;

-if constexpr(StatsT::hasAverage()) cudaCheck(cudaMallocAsync((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream));
+if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_MALLOC((void**)&d_stats, nodeCount[0]*sizeof(StatsT), stream));

processLeaf<BuildT><<<blocksPerGrid(nodeCount[0]), threadsPerBlock, 0, stream>>>(d_nodeMgr, d_stats);

@@ -220,7 +220,7 @@ void CudaGridStats<BuildT, StatsT>::operator()(NanoGrid<BuildT> *d_grid, cudaStr

processRootAndGrid<BuildT><<<1, 1, 0, stream>>>(d_nodeMgr, d_stats);

-if constexpr(StatsT::hasAverage()) cudaCheck(cudaFreeAsync(d_stats, stream));
+if constexpr(StatsT::hasAverage()) cudaCheck(CUDA_FREE(d_stats, stream));

} // CudaGridStats::operator()( Grid )

8 changes: 4 additions & 4 deletions nanovdb/nanovdb/util/cuda/CudaIndexToGrid.cuh
@@ -64,7 +64,7 @@ public:
/// @param srcGrid Device pointer to IndexGrid used as the source
CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStream_t stream = 0);

-~CudaIndexToGrid() {cudaCheck(cudaFreeAsync(mDevNodeAcc, mStream));}
+~CudaIndexToGrid() {cudaCheck(CUDA_FREE(mDevNodeAcc, mStream));}

/// @brief Toggle on and off verbose mode
/// @param on if true verbose is turned on
@@ -295,7 +295,7 @@ CudaIndexToGrid<SrcBuildT>::CudaIndexToGrid(const SrcGridT *d_srcGrid, cudaStrea
: mStream(stream), mTimer(stream)
{
NANOVDB_ASSERT(d_srcGrid);
-cudaCheck(cudaMallocAsync((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream));
+cudaCheck(CUDA_MALLOC((void**)&mDevNodeAcc, sizeof(NodeAccessor), mStream));
cudaCpyNodeCount<SrcBuildT><<<1, 1, 0, mStream>>>(d_srcGrid, mDevNodeAcc);
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&mNodeAcc, mDevNodeAcc, sizeof(NodeAccessor), cudaMemcpyDeviceToHost, mStream));// mNodeAcc = *mDevNodeAcc
@@ -319,7 +319,7 @@ GridHandle<BufferT> CudaIndexToGrid<SrcBuildT>::getHandle(const typename BuildTo
cudaProcessRootTiles<SrcBuildT,DstBuildT><<<mNodeAcc.nodeCount[3], 1, 0, mStream>>>(mDevNodeAcc, srcValues);
cudaCheckError();

-cudaCheck(cudaFreeAsync(mNodeAcc.d_gridName, mStream));
+cudaCheck(CUDA_FREE(mNodeAcc.d_gridName, mStream));

if (mVerbose) mTimer.restart("Process upper internal nodes");
cudaProcessInternalNodes<SrcBuildT,DstBuildT,2><<<mNodeAcc.nodeCount[2], dim3(32,32), 0, mStream>>>(mDevNodeAcc, srcValues);
@@ -362,7 +362,7 @@ inline BufferT CudaIndexToGrid<SrcBuildT>::getBuffer(const BufferT &pool)
if (mNodeAcc.d_dstPtr == nullptr) throw std::runtime_error("Failed memory allocation on the device");

if (size_t size = mGridName.size()) {
-cudaCheck(cudaMallocAsync((void**)&mNodeAcc.d_gridName, size, mStream));
+cudaCheck(CUDA_MALLOC((void**)&mNodeAcc.d_gridName, size, mStream));
cudaCheck(cudaMemcpyAsync(mNodeAcc.d_gridName, mGridName.data(), size, cudaMemcpyHostToDevice, mStream));
} else {
mNodeAcc.d_gridName = nullptr;
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaNodeManager.cuh
@@ -38,7 +38,7 @@ cudaCreateNodeManager(const NanoGrid<BuildT> *d_grid,
auto buffer = BufferT::create(sizeof(NodeManagerData), &pool, false, stream);
auto *d_data = (NodeManagerData*)buffer.deviceData();
size_t size = 0u, *d_size;
-cudaCheck(cudaMallocAsync((void**)&d_size, sizeof(size_t), stream));
+cudaCheck(CUDA_MALLOC((void**)&d_size, sizeof(size_t), stream));
cudaLambdaKernel<<<1, 1, 0, stream>>>(1, [=] __device__(size_t) {
#ifdef NANOVDB_USE_NEW_MAGIC_NUMBERS
*d_data = NodeManagerData{NANOVDB_MAGIC_NODE, 0u, (void*)d_grid, {0u,0u,0u}};
@@ -58,7 +58,7 @@ cudaCreateNodeManager(const NanoGrid<BuildT> *d_grid,
});
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&size, d_size, sizeof(size_t), cudaMemcpyDeviceToHost, stream));
-cudaCheck(cudaFreeAsync(d_size, stream));
+cudaCheck(CUDA_FREE(d_size, stream));
if (size > sizeof(NodeManagerData)) {
auto tmp = BufferT::create(size, &pool, false, stream);// only allocate buffer on the device
cudaCheck(cudaMemcpyAsync(tmp.deviceData(), buffer.deviceData(), sizeof(NodeManagerData), cudaMemcpyDeviceToDevice, stream));
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaPointsToGrid.cuh
@@ -265,7 +265,7 @@ public:
{
mData.map = map;
mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst});
-cudaCheck(cudaMallocAsync((void**)&mDeviceData, sizeof(Data), mStream));
+cudaCheck(CUDA_MALLOC((void**)&mDeviceData, sizeof(Data), mStream));
}

/// @brief Default constructor
@@ -276,7 +276,7 @@ public:
: CudaPointsToGrid(Map(scale, trans), stream) {}

/// @brief Destructor
-~CudaPointsToGrid() {cudaCheck(cudaFreeAsync(mDeviceData, mStream));}
+~CudaPointsToGrid() {cudaCheck(CUDA_FREE(mDeviceData, mStream));}

/// @brief Toggle on and off verbose mode
/// @param level Verbose level: 0=quiet, 1=timing, 2=benchmarking
4 changes: 2 additions & 2 deletions nanovdb/nanovdb/util/cuda/CudaSignedFloodFill.cuh
@@ -153,11 +153,11 @@ void CudaSignedFloodFill<BuildT>::operator()(NanoGrid<BuildT> *d_grid)
static_assert(BuildTraits<BuildT>::is_float, "CudaSignedFloodFill only works on float grids");
NANOVDB_ASSERT(d_grid);
uint64_t count[4], *d_count = nullptr;
-cudaCheck(cudaMallocAsync((void**)&d_count, 4*sizeof(uint64_t), mStream));
+cudaCheck(CUDA_MALLOC((void**)&d_count, 4*sizeof(uint64_t), mStream));
cudaCpyNodeCount<BuildT><<<1, 1, 0, mStream>>>(d_grid, d_count);
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&count, d_count, 4*sizeof(uint64_t), cudaMemcpyDeviceToHost, mStream));
-cudaCheck(cudaFreeAsync(d_count, mStream));
+cudaCheck(CUDA_FREE(d_count, mStream));

static const int threadsPerBlock = 128;
auto blocksPerGrid = [&](size_t count)->uint32_t{return (count + (threadsPerBlock - 1)) / threadsPerBlock;};
45 changes: 30 additions & 15 deletions nanovdb/nanovdb/util/cuda/CudaUtils.h
@@ -53,20 +53,35 @@
cudaCheck(cudaGetLastError()); \
}

-#if CUDART_VERSION < 11020 // 11.2 introduced cudaMallocAsync and cudaFreeAsync
-
-/// @brief Dummy implementation of cudaMallocAsync that calls cudaMalloc
-/// @param d_ptr Device pointer to allocated device memory
-/// @param size Number of bytes to allocate
-/// @param dummy The stream establishing the stream ordering contract and the memory pool to allocate from (ignored)
-/// @return Cuda error code
-inline cudaError_t cudaMallocAsync(void** d_ptr, size_t size, cudaStream_t){return cudaMalloc(d_ptr, size);}
-
-/// @brief Dummy implementation of cudaFreeAsync that calls cudaFree
-/// @param d_ptr Device pointer that will be freed
-/// @param dummy The stream establishing the stream ordering promise (ignored)
-/// @return Cuda error code
-inline cudaError_t cudaFreeAsync(void* d_ptr, cudaStream_t){return cudaFree(d_ptr);}
+// cudaMallocAsync and cudaFreeAsync were introduced in CUDA 11.2; for older CUDA
+// versions we fall back to cudaMalloc and cudaFree. The fallback can also be forced
+// with the NANOVDB_USE_SYNC_CUDA_MALLOC flag. This is useful when deploying NanoVDB
+// code in virtualized environments that share the GPU between instances by slicing
+// it up into vGPUs. In such environments GPU unified memory is usually disabled
+// for security reasons, which means cudaMallocAsync cannot be used.
+#if (CUDART_VERSION < 11020) || defined(NANOVDB_USE_SYNC_CUDA_MALLOC)
+
+#define CUDA_MALLOC(d_ptr, size, stream) \
+{ \
+cudaMalloc((d_ptr), (size)); \
+}
+
+#define CUDA_FREE(d_ptr, stream) \
+{ \
+cudaFree((d_ptr)); \
+}
+
+#else
+
+#define CUDA_MALLOC(d_ptr, size, stream) \
+{ \
+cudaMallocAsync((d_ptr), (size), (stream)); \
+}
+
+#define CUDA_FREE(d_ptr, stream) \
+{ \
+cudaFreeAsync((d_ptr), (stream)); \
+}
+
#endif

@@ -133,4 +148,4 @@ __device__ inline bool cudaStrEq(const char *lhs, const char *rhs)

#endif// __CUDACC__

-#endif// NANOVDB_CUDA_UTILS_H_HAS_BEEN_INCLUDED
+#endif// NANOVDB_CUDA_UTILS_H_HAS_BEEN_INCLUDED
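For reference, a minimal sketch (my addition; it only assumes the cudaCheck macro and the CUDA_MALLOC/CUDA_FREE macros from the CudaUtils.h changes above, plus an assumed include path) of the scratch-allocation pattern that every file in this diff follows, and how the two expansions differ:

#include <cstdint>
#include <nanovdb/util/cuda/CudaUtils.h>

// Hypothetical caller mirroring the pattern used throughout this PR:
// allocate device scratch memory, use it on a stream, then free it.
void scratchExample(cudaStream_t stream)
{
    uint64_t* d_tmp = nullptr;
    cudaCheck(CUDA_MALLOC((void**)&d_tmp, 4 * sizeof(uint64_t), stream));
    // ... launch kernels on `stream` that read and write d_tmp ...
    cudaCheck(CUDA_FREE(d_tmp, stream));
    // On CUDA >= 11.2 without NANOVDB_USE_SYNC_CUDA_MALLOC these expand to
    // cudaMallocAsync/cudaFreeAsync, so both calls are ordered on `stream`.
    // With the flag (or on older CUDA) they expand to cudaMalloc/cudaFree,
    // which ignore the stream and are synchronous rather than stream-ordered.
}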