eigen_GPU_batch.hpp
#ifndef __HEADER_EIGEN_GPU_BATCH_HPP__
#define __HEADER_EIGEN_GPU_BATCH_HPP__
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <type_traits>
#if defined(__NVCC__)
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#if defined(__HIPACC__)
#include <hip/hip_runtime.h>
#endif
#include "gpu_arch.h"
#include "eigen_GPU_batch.h"
template <class T>
__host__ gpuError_t
eigen_GPU_batch_get_Launch_params(const int L, const int n, int &grpSZ, int &numTB, int &numGR, int &numTH, int &sizeSH, int &sizeL2)
{
  grpSZ = numTB = numGR = numTH = sizeSH = sizeL2 = 0;
  if (L < 1 || n < 1) return gpuErrorInvalidValue;

  gpuError_t err_code;
  int dev;
  err_code = gpuGetDevice(&dev);
  if (err_code != gpuSuccess) return err_code;
  gpuDeviceProp deviceProp;
  err_code = gpuGetDeviceProperties(&deviceProp, dev);
  if (err_code != gpuSuccess) return err_code;

  // groupSize = tile_size
  grpSZ = (n <= 4) ?
           4 : ((n <= 8) ?
           8 : ((n <= 16) ?
          16 : ((n <= 32) ?
          32 : WARP_GPU_SIZE)));
  // The number of ThreadBlocks to invoke (4 per SM)
  // numTB = deviceProp.multiProcessorCount*12;
  numTB = deviceProp.multiProcessorCount * 4;
  // The number of Threads per ThreadBlock
  // numTH = WARP_GPU_SIZE*8;
  numTH = WARP_GPU_SIZE * 4;
  // The number of groups per ThreadBlock
  numGR = numTH / grpSZ;
  // The required shared memory size per ThreadBlock
  sizeSH = sizeof(T) * numTH * 2;

  const int numWP = (L - 1) / (WARP_GPU_SIZE / grpSZ) + 1;
  const int minTB = (numWP - 1) / (numTH / WARP_GPU_SIZE) + 1;
  numTB = min(numTB, minTB);
  // These parameters control the assignment of threads to batches statically.
  // Therefore, if numGR*numTB >> L, some CUDA cores idle, and
  // dynamic scheduling would be more effective in such a case.
  sizeL2 = deviceProp.l2CacheSize;
  return gpuSuccess;
}
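// Worked example (illustrative only; the concrete numbers below assume
// WARP_GPU_SIZE == 32 and an 80-SM device, neither of which is fixed by this header):
//   L = 10000, n = 16  ->  grpSZ  = 16                 (smallest tile of {4,8,16,32,WARP} covering n)
//                          numTB  = 80*4 = 320         (thread blocks, before clamping)
//                          numTH  = 32*4 = 128         (threads per block)
//                          numGR  = 128/16 = 8         (matrices handled per block)
//                          sizeSH = sizeof(T)*128*2    (shared-memory bytes per block)
//   numWP = (10000-1)/(32/16)+1 = 5000, minTB = (5000-1)/(128/32)+1 = 1250,
//   so numTB stays at min(320, 1250) = 320.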
template <class T>
__host__ gpuError_t
eigen_GPU_batch_BufferSize(const int L, const int nm, const int n, const int m, T * a, T * w, size_t *lwork)
{
  *lwork = 0;
  if (L < 1 || nm < n || n < 1 || m < 1 || m > n) return gpuErrorInvalidValue;
  int grpSZ = 0, numTB = 0, numGR = 0, numTH = 0, sizeSH = 0, sizeL2 = 0;
  gpuError_t err_code = eigen_GPU_batch_get_Launch_params<T>(L, n, grpSZ, numTB, numGR, numTH, sizeSH, sizeL2);
  if (err_code != gpuSuccess) return err_code;
  size_t elem = sizeof(T) * (n + (n * nm));
  size_t len = elem * min(numTB * numGR, L);
  *lwork = len;
  return gpuSuccess;
}
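// Sketch of the resulting workspace size, continuing the hypothetical numbers above
// (L = 10000, nm = n = 16, T = double, numTB*numGR = 320*8 = 2560):
//   elem = sizeof(double)*(16 + 16*16) = 2176 bytes per in-flight problem
//   len  = 2176 * min(2560, 10000)     = 5,570,560 bytes (about 5.3 MiB)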
template <class T>
__host__ void
eigen_GPU_batch(const int L, const int nm, const int n, const int m, T * a, T * w, T * wk, const gpuStream_t stream=NULL)
{
  int current_device;
  gpuGetDevice(&current_device);
  {
    gpuPointerAttributes attr_a, attr_w, attr_wk;
    gpuPointerGetAttributes(&attr_a, a);
    gpuPointerGetAttributes(&attr_w, w);
    gpuPointerGetAttributes(&attr_wk, wk);
    // All three buffers must reside on the same device.
    if (attr_a.device != attr_w.device ||
        attr_w.device != attr_wk.device ||
        attr_wk.device != attr_a.device) {
      fprintf(stderr, "device number conflict\n"); fflush(stderr);
      exit(1);
    }
    gpuSetDevice(attr_a.device);
  }
  // if (std::is_same<T,half>::value) {
  //   ;
  // }
  if (std::is_same<T,float>::value) {
    eigen_GPU_batch_FP(L, nm, n, m, (float*)a, (float*)w, (float*)wk, stream);
  }
  if (std::is_same<T,double>::value) {
    eigen_GPU_batch_DP(L, nm, n, m, (double*)a, (double*)w, (double*)wk, stream);
  }
  gpuSetDevice(current_device);
}
#endif
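// Usage sketch (not part of this header; assumes a CUDA build and the packed
// L*nm*n matrix / L*n eigenvalue layouts implied by eigen_GPU_batch_BufferSize above;
// the sizes chosen here are hypothetical):
//
//   const int L = 10000, nm = 16, n = 16, m = 16;
//   double *a, *w, *wk;
//   cudaMalloc(&a, sizeof(double)*L*nm*n);            // L packed nm-by-n input matrices
//   cudaMalloc(&w, sizeof(double)*L*n);               // L sets of n eigenvalues
//   /* ... fill a with the input matrices ... */
//   size_t lwork = 0;
//   eigen_GPU_batch_BufferSize<double>(L, nm, n, m, a, w, &lwork);
//   cudaMalloc(&wk, lwork);                           // scratch workspace
//   eigen_GPU_batch<double>(L, nm, n, m, a, w, wk);   // default (NULL) stream
//   cudaDeviceSynchronize();
//   cudaFree(wk); cudaFree(w); cudaFree(a);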