Example: tiled nbody using bulk copies
Vectorization library from: https://github.com/aff3ct/MIPP
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"
// testing
#include <iostream>
#include "omp.h"
// a simd tool from github
#define MIPP_ALIGNED_LOADS
#include "mipp/mipp.h"
int main()
{
    const int simd = mipp::N<float>();
    std::cout<<"simd width: "<<simd<<std::endl;

    const int n = simd * 40000;
    std::cout<<"particles: "<< n <<std::endl;

    GraphicsCardSupplyDepot depot;

    // last parameter=true to enable cache hit ratio query
    VirtualMultiArray<float> xVir(n, depot.requestGpus(), 4000, 1,{5,15,10},VirtualMultiArray<float>::MemMult::UseDefault,true,true);
    VirtualMultiArray<float> yVir(n, depot.requestGpus(), 4000, 1,{5,15,10});
    VirtualMultiArray<float> zVir(n, depot.requestGpus(), 4000, 1,{5,15,10});
    VirtualMultiArray<float> vxVir(n, depot.requestGpus(), 4000, 1,{5,15,10});
    VirtualMultiArray<float> vyVir(n, depot.requestGpus(), 4000, 1,{5,15,10});
    VirtualMultiArray<float> vzVir(n, depot.requestGpus(), 4000, 1,{5,15,10});

    // if the data is not initialized, leftover NaN floating-point values slow the computation down considerably
    {
        CpuBenchmarker init(0,"init");

        #pragma omp parallel for
        for(int i=0;i<n;i+=1000)
        {
            xVir.mappedReadWriteAccess(i,1000,[i](float * ptr){ for(int j=i;j<i+1000;j++) ptr[j]=j; },false,false,true);
            yVir.mappedReadWriteAccess(i,1000,[i](float * ptr){ for(int j=i;j<i+1000;j++) ptr[j]=j; },false,false,true);
            zVir.mappedReadWriteAccess(i,1000,[i](float * ptr){ for(int j=i;j<i+1000;j++) ptr[j]=j; },false,false,true);
            vxVir.mappedReadWriteAccess(i,1000,[i](float * ptr){ for(int j=i;j<i+1000;j++) ptr[j]=j; },false,false,true);
            vyVir.mappedReadWriteAccess(i,1000,[i](float * ptr){ for(int j=i;j<i+1000;j++) ptr[j]=j; },false,false,true);
            vzVir.mappedReadWriteAccess(i,1000,[i](float * ptr){ for(int j=i;j<i+1000;j++) ptr[j]=j; },false,false,true);
        }
    }

    mipp::Reg<float> smoothing = 0.0001f;

    // mapped array access
    {
        CpuBenchmarker bench(((size_t)n) * n * sizeof(float) * 3, "mapped access for force-calc",((size_t)n)*n);

        const int tileSize = 4000;
        const int regTile = 500;

        #pragma omp parallel for num_threads(32) schedule(dynamic)
        for (int i = 0; i < n; i += regTile)
        {
            std::vector<float> x0 = xVir.readOnlyGetN(i,regTile);
            std::vector<float> y0 = yVir.readOnlyGetN(i,regTile);
            std::vector<float> z0 = zVir.readOnlyGetN(i,regTile);

            mipp::Reg<float> fma1[regTile];
            mipp::Reg<float> fma2[regTile];
            mipp::Reg<float> fma3[regTile];

            for(int j=0;j<regTile;j++)
            {
                fma1[j]=0.0f;
                fma2[j]=0.0f;
                fma3[j]=0.0f;
            }

            // stream over all particles in tiles of tileSize, mapping x,y,z once per tile
            for (int ii = 0; ii < n; ii += tileSize)
            {
                xVir.mappedReadWriteAccess(ii, tileSize, [&,ii](float* ptrX1) {
                    const float* __restrict__ const ptrX = ptrX1 + ii;
                    yVir.mappedReadWriteAccess(ii, tileSize, [&,ii](float* ptrY1) {
                        const float* __restrict__ const ptrY = ptrY1 + ii;
                        zVir.mappedReadWriteAccess(ii, tileSize, [&, ii](float* ptrZ1) {
                            const float* __restrict__ const ptrZ = ptrZ1 + ii;

                            for (int ld = 0; ld < tileSize; ld += simd)
                            {
                                mipp::Reg<float> x = mipp::load(ptrX + ld);
                                mipp::Reg<float> y = mipp::load(ptrY + ld);
                                mipp::Reg<float> z = mipp::load(ptrZ + ld);

                                for(int reg0 = 0; reg0 < regTile; reg0++)
                                {
                                    const int reg = reg0;

                                    const mipp::Reg<float> x0r = x0[reg];
                                    const mipp::Reg<float> y0r = y0[reg];
                                    const mipp::Reg<float> z0r = z0[reg];

                                    const mipp::Reg<float> dx = mipp::sub(x,x0r);
                                    const mipp::Reg<float> dy = mipp::sub(y,y0r);
                                    const mipp::Reg<float> dz = mipp::sub(z,z0r);

                                    const mipp::Reg<float> dx2 = mipp::mul(dx,dx);
                                    const mipp::Reg<float> dy2 = mipp::mul(dy,dy);
                                    const mipp::Reg<float> dz2 = mipp::mul(dz,dz);

                                    const mipp::Reg<float> dxy2 = mipp::add(dx2,dy2);
                                    const mipp::Reg<float> dz2s = mipp::add(dz2,smoothing);
                                    const mipp::Reg<float> smoothed = mipp::add(dxy2,dz2s);

                                    const mipp::Reg<float> r = mipp::rsqrt(smoothed);
                                    const mipp::Reg<float> r3 = mipp::mul(mipp::mul(r, r), r);

                                    fma1[reg] = mipp::fmadd(dx, r3, fma1[reg]);
                                    fma2[reg] = mipp::fmadd(dy, r3, fma2[reg]);
                                    fma3[reg] = mipp::fmadd(dz, r3, fma3[reg]);
                                }
                            }
                        }, false, true, false);
                    }, false, true, false);
                }, false, true, false);
            }

            // horizontal-add the SIMD accumulators and add the results to the velocity arrays
            for(int j=0;j<regTile;j++)
            {
                vxVir.set(i+j, vxVir.get(i+j) + mipp::hadd(fma1[j]));
                vyVir.set(i+j, vyVir.get(i+j) + mipp::hadd(fma2[j]));
                vzVir.set(i+j, vzVir.get(i+j) + mipp::hadd(fma3[j]));
            }
        }
    }

    std::cout<<"cache hit ratio = "<<xVir.getTotalCacheHitRatio()<<std::endl;
    return 0;
}
Output with Ubuntu 18.04 LTS, g++-10, FX-8150 at 3.6 GHz (turbo off):
simd width: 8
particles: 320000
init: 27506342 nanoseconds
mapped access for force-calc: 20963330247 nanoseconds
(bandwidth = 58616.64 MB/s) (throughput = 0.20 nanoseconds per iteration)
cache hit ratio = 0.48
The tiling sizes (a 500-element register tile plus a 4000-element virtual-array tile) are too large for the 16 vector registers of a Bulldozer core, so the working set spills into the L1 cache instead. 0.20 nanoseconds per force computation corresponds to roughly 90 GFLOPS, which is reasonable n-body compute performance for a non-turbo FX-8150 that cannot fit a whole tile into 16 registers. The reported bandwidth counts only the x, y, z inputs of the inner particle loop: 12 bytes (3 floats) are read per particle per interaction. The rest (such as the fused-multiply-add accumulators) is not counted; including it, the real bandwidth is about 3x the reported value, ~180 GB/s for this sample, which looks like a combination of L1 bandwidth, register bandwidth (some loads may have been kept in registers by compiler optimizations), and the PCIe latency implied by the 48% cache hit ratio.
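As a sanity check on those figures, here is a minimal standalone sketch (not part of the sample) that recomputes them from the measured time. The ~18 flops per pairwise interaction is an estimate for the inner loop above (3 subtractions, 3 multiplications, 3 additions, 1 rsqrt, 2 multiplications for r3, and 3 fmadds counted as 2 flops each), not a value reported by the library:

// Standalone sanity check of the reported benchmark numbers.
#include <cstdio>

int main()
{
    const double n = 320000.0;                  // particles (simd * 40000 with simd = 8)
    const double seconds = 20.963330247;        // measured "mapped access for force-calc" time
    const double interactions = n * n;
    const double bytes = interactions * 3 * 4;  // only the x,y,z floats of the inner loop are counted
    const double flopsPerInteraction = 18.0;    // estimated, see lead-in text

    std::printf("bandwidth : %.2f MB/s\n", bytes / seconds / 1e6);
    std::printf("throughput: %.2f ns per interaction\n", seconds * 1e9 / interactions);
    std::printf("compute   : %.0f GFLOPS\n", flopsPerInteraction * interactions / seconds / 1e9);
    return 0;
}

This reproduces the reported ~58.6 GB/s and ~0.20 ns per interaction, and puts the compute rate at just under 90 GFLOPS.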
For OpenMP parallelization, num_threads(32) was used instead of the default 8 threads, because each thread does I/O between RAM and VRAM, and waiting for that I/O to complete takes longer than a context switch when the tile size is 4000 elements or more.
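The effect can be illustrated with a minimal standalone sketch (unrelated to the library) in which a sleep stands in for the RAM-VRAM page transfer; on an 8-core machine the 32-thread run typically finishes well ahead of the 8-thread run, because other threads compute while one is blocked:

// Toy demonstration of thread oversubscription hiding blocking I/O (simulated with a sleep).
// Compile with -fopenmp.
#include <chrono>
#include <iostream>
#include <thread>

int main()
{
    const int tiles = 64;
    for (int threads : {8, 32})
    {
        const auto t0 = std::chrono::steady_clock::now();

        #pragma omp parallel for num_threads(threads) schedule(dynamic)
        for (int i = 0; i < tiles; i++)
        {
            // simulated page transfer: the thread blocks but the core stays free
            std::this_thread::sleep_for(std::chrono::milliseconds(10));

            // simulated short compute burst
            volatile double acc = 0.0;
            for (int j = 0; j < 1000000; j++) acc += j * 0.5;
        }

        const auto t1 = std::chrono::steady_clock::now();
        std::cout << threads << " threads: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count()
                  << " ms" << std::endl;
    }
    return 0;
}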
With 2 pages per LRU (depot.requestGpus(), 4000, 2 /* 2 pages per LRU */, {5 /* 5-way LRU for GPU 1 */, 15 /* 15-way LRU for GPU 2 */, 10}; the full constructor call is sketched after the output):
simd width: 8
particles: 320000
init: 12266012 nanoseconds
mapped access for force-calc: 20549907052 nanoseconds (bandwidth = 59795.89 MB/s) (throughput = 0.20 nanoseconds per iteration)
cache hit ratio = 0.98
(I/O latency is hidden behind computation, which is why even the lower cache hit ratio of the single-page run was tolerated with almost the same runtime.)
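For reference, a sketch of the allocation used for this second run, assuming the same constructor signature as in the sample above and keeping the cache-hit-ratio flags on xVir; only the pages-per-LRU argument changes from 1 to 2:

// Same allocation as in the sample, but with 2 pages per LRU slot (4th parameter).
VirtualMultiArray<float> xVir(n, depot.requestGpus(), 4000, 2, {5,15,10},
                              VirtualMultiArray<float>::MemMult::UseDefault, true, true);
VirtualMultiArray<float> yVir(n, depot.requestGpus(), 4000, 2, {5,15,10});
// zVir, vxVir, vyVir and vzVir follow the same pattern as yVir.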