Skip to content

Example: Gaussian Blur

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Mar 18, 2021 · 3 revisions

Mandelbrot set for easy verification: https://medium.com/farouk-ounanes-home-on-the-internet/mandelbrot-set-in-c-from-scratch-c7ad6a1bf2d9

Blur algorithm for benchmarking: https://www.geeksforgeeks.org/gaussian-filter-generation-c/

Scalar:

Recursive Z-order pattern (by scalar get/set methods of virtual array) to use LRU-cache of the virtual array: https://stackoverflow.com/questions/19214368/given-a-2d-array-convert-to-z-order

Resolution: 1024x1024

total virtual gpus:278
RAM cache / Virtual Array ratio:0.509033
Starting cpu-compute on gpu-array
mandelbrot compute finished. starting gaussian blur.
pixel stream for Gaussian Blur: 1057027897 nanoseconds     (bandwidth = 214.27 MB/s)      (throughput = 18.67 nanoseconds per iteration) 
blur complete. writing to file.
File saved
cache hit ratio=1.00

Tiled:

Non-z-order tiled image processing where each thread works on a tile of 128x128 pixels to minimize average page-lock latency by bulk-read operations and still benefit from least-recently-used caching.

Resolution: 2048x2048

total virtual gpus:59
RAM cache / Virtual Array ratio:0.20166
Starting cpu-compute on gpu-array
mandelbrot compute finished. starting gaussian blur.
pixel stream for Gaussian Blur: 794146138 nanoseconds     (bandwidth = 1140.81 MB/s)      (throughput = 3.51 nanoseconds per iteration) 
blur complete. writing to file.
File saved
cache hit ratio=1.00

Scalar source code:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "CpuBenchmarker.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include <fstream>
#include <complex>
#include <omp.h>
#include <algorithm>

#ifndef M_PI
#define M_PI (3.14159265359)
#endif

class Color
{
public:
	Color(){val=0; }
	Color(int v){val=v;}
	int val;
};

const int width  = 1024;
const int height = 1024;

int value ( int x, int y)  {std::complex<float> point((float)x/width-1.5, (float)y/height-0.5);std::complex<float> z(0, 0);
    unsigned int nb_iter = 0;
    while (abs (z) < 2 && nb_iter <= 34) {
           z = z * z + point;
           nb_iter++;
    }
    if (nb_iter < 34) return (255*nb_iter)/33;
    else return 0;
}

void filterCreation(double GKernel[][5])
{
    // intialising standard deviation to 1.0
    double sigma = 1.0;
    double r, s = 2.0 * sigma * sigma;

    // sum is for normalization
    double sum = 0.0;

    // generating 5x5 kernel
    for (int x = -2; x <= 2; x++) {
        for (int y = -2; y <= 2; y++) {
            r = sqrt(x * x + y * y);
            GKernel[x + 2][y + 2] = (exp(-(r * r) / s)) / (M_PI * s);
            sum += GKernel[x + 2][y + 2];
        }
    }

    // normalising the Kernel
    for (int i = 0; i < 5; ++i)
        for (int j = 0; j < 5; ++j)
            GKernel[i][j] /= sum;
}

void zorder(const int * source, int y0, int x0, int size,
            int * target, const int N, int * counter)
{

    if (size == 1) {

        target[(*counter)++]=source[y0*N+x0];
    } else {

        int h = size/2;
        zorder(source, y0,   x0,   h, target,N,counter);
        zorder(source, y0,   x0+h, h, target,N,counter);
        zorder(source, y0+h, x0,   h, target,N,counter);
        zorder(source, y0+h, x0+h, h, target,N,counter);
    }
}

int main(int argC, char ** argV)
{
	GraphicsCardSupplyDepot depot;

	// n needs to be integer multiple of pageSize !!!!
	const size_t n = width*height;
	const size_t pageSize=128;
	const int maxActivePagesPerGpu = 15;
	PcieBandwidthBenchmarker benchmarker;
	std::vector<int> bw = benchmarker.bestBandwidth(60);
	int tot = 0;
	for(auto & e: bw)
		tot+=e;
	std::cout<<"total virtual gpus:"<<tot<<std::endl;
	std::cout<<"RAM cache / Virtual Array ratio:"<<tot*pageSize*maxActivePagesPerGpu / ((double)n)<<std::endl;

	bool cacheHitRatioQueryEnable = true;
	VirtualMultiArray<Color> img(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,cacheHitRatioQueryEnable);
	VirtualMultiArray<Color> img2(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,cacheHitRatioQueryEnable);
	std::cout<<"Starting cpu-compute on gpu-array"<<std::endl;

    #pragma omp parallel for num_threads(8) shared(img)
	for (int i = 0; i < width; i++) {
	    for (int j = 0; j < height; j++)  {
		 img.set(j+i*width,Color(value(i, j)));
	    }
        }

	std::cout<<"mandelbrot compute finished. starting gaussian blur."<<std::endl;

    double gKernel[5][5];
    filterCreation(gKernel);
    int counter = 0;
	    std::vector<int> order,zOrder;
	    int ct=0;
	    for(int i=0;i<width;i++)
	    	for(int ii=0;ii<width;ii++)
	    	{
	    		order.push_back(ct);
	    		ct++;
	    		zOrder.push_back(0);
	    	}

	zorder(order.data(), 0, 0, width, zOrder.data(),width,&counter);


    // 2x blur
	{
		CpuBenchmarker bench(n*54*sizeof(Color),"pixel stream for Gaussian Blur",n*54);

		#pragma omp parallel for num_threads(64)
    	for(int i=0;i<n;i++)
		{
			auto center = img.get(zOrder[i]);
			float v = 0;
			for(int iy=-2;iy<=2;iy++)
				for(int ix=-2;ix<=2;ix++)
				{
					if(zOrder[i]+ix+iy*width>=0 && zOrder[i]+ix+iy*width<n)
					{
						auto neighbor = img.get(zOrder[i]+ix+iy*width);
						v+=neighbor.val * gKernel[ix+2][iy+2];
					}
				}
			center.val=v;
			img2.set(zOrder[i],center);
		}

		#pragma omp parallel for num_threads(64)
		for(int i=0;i<n;i++)
		{
			auto center = img2.get(zOrder[i]);
			float v = 0;
			for(int iy=-2;iy<=2;iy++)
				for(int ix=-2;ix<=2;ix++)
				{
					if(zOrder[i]+ix+iy*width>=0 && zOrder[i]+ix+iy*width<n)
					{
						auto neighbor = img2.get(zOrder[i]+ix+iy*width);
						v+=neighbor.val * gKernel[ix+2][iy+2];
					}
				}
			center.val=v;
			img.set(zOrder[i],center);
		}
	}
    std::cout<<"blur complete. writing to file."<<std::endl;

	std::ofstream my_Image ("mandelbrot.ppm");
	if (my_Image.is_open ()) {
	     my_Image << "P3\n" << width << " " << height << " 255\n";
	     for (int i = 0; i < width; i++) {
	          for (int j = 0; j < height; j++)  {
	               int val = img.get(j+i*width).val;
	               my_Image << val<< ' ' << 0 << ' ' << 0 << "\n";
	          }
	     }
	     my_Image.close();
	}
	else std::cout << "Could not open the file";
	std::cout<<"File saved"<<std::endl;
	std::cout<<"cache hit ratio="<<img.getTotalCacheHitRatio()<<std::endl;
	return 0;
}

Tiled version source code:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "CpuBenchmarker.h"
#include "PcieBandwidthBenchmarker.h"

// testing
#include <iostream>
#include <fstream>
#include <complex>
#include <omp.h>
#include <algorithm>

#ifndef M_PI
#define M_PI (3.14159265359)
#endif

class Color
{
public:
	Color(){val=0; }
	Color(int v){val=v;}
	int val;
};

const int width  = 1024*2;
const int height = 1024*2;

int value ( int x, int y)  {std::complex<float> point((float)x/width-1.5, (float)y/height-0.5);std::complex<float> z(0, 0);
    unsigned int nb_iter = 0;
    while (abs (z) < 2 && nb_iter <= 34) {
           z = z * z + point;
           nb_iter++;
    }
    if (nb_iter < 34) return (255*nb_iter)/33;
    else return 0;
}

void filterCreation(double GKernel[][5])
{
    // intialising standard deviation to 1.0
    double sigma = 1.0;
    double r, s = 2.0 * sigma * sigma;

    // sum is for normalization
    double sum = 0.0;

    // generating 5x5 kernel
    for (int x = -2; x <= 2; x++) {
        for (int y = -2; y <= 2; y++) {
            r = sqrt(x * x + y * y);
            GKernel[x + 2][y + 2] = (exp(-(r * r) / s)) / (M_PI * s);
            sum += GKernel[x + 2][y + 2];
        }
    }

    // normalising the Kernel
    for (int i = 0; i < 5; ++i)
        for (int j = 0; j < 5; ++j)
            GKernel[i][j] /= sum;
}

int main(int argC, char ** argV)
{
	GraphicsCardSupplyDepot depot;

	// n needs to be integer multiple of pageSize !!!!
	const size_t n = width*height;
	const size_t pageSize=width;
	const int maxActivePagesPerGpu = 7;

	PcieBandwidthBenchmarker benchPci;
	auto bw = benchPci.bestBandwidth(13);
	int tot = 0;
	for(auto & e: bw)
		tot+=e;
	std::cout<<"total virtual gpus:"<<tot<<std::endl;
	std::cout<<"RAM cache / Virtual Array ratio:"<<tot*pageSize*maxActivePagesPerGpu / ((double)n)<<std::endl;
	VirtualMultiArray<Color> img(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,true);
	VirtualMultiArray<Color> img2(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,true);
	std::cout<<"Starting cpu-compute on gpu-array"<<std::endl;

    #pragma omp parallel for num_threads(8) shared(img)
	for (int i = 0; i < width; i++) {
	    for (int j = 0; j < height; j++)  {
		 img.set(j+i*width,Color(value(i, j)));
	    }
        }

	std::cout<<"mandelbrot compute finished. starting gaussian blur."<<std::endl;

    double gKernel[5][5];
    filterCreation(gKernel);

    // 2x blur
    try
	{
		const int tileX = 128;
		const int tileY = 128;


		CpuBenchmarker bench(n*54*sizeof(Color),"pixel stream for Gaussian Blur",n*54);


    	for(int i0y=0;i0y<height/tileY;i0y++)
			#pragma omp parallel for num_threads(16)
    		for(int i0x=0;i0x<width/tileX;i0x++)
		{
    		const int i=i0y*tileY*width + i0x*tileX;
    		Color tile[(tileX+5)*(tileY+5)];
    		for(int scan=0;scan<tileY;scan++)
    		{
    			auto line=img.readOnlyGetN(i+scan*width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(scan+2)*(tileX+5) + (copy+2)] = line[copy];
    			}
    		}

    		if(i0y>=1)
    		{
    			auto line1=img.readOnlyGetN(i-width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(-1+2)*(tileX+5) + (copy+2)] = line1[copy];
    			}
    			auto line2=img.readOnlyGetN(i-2*width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(-2+2)*(tileX+5) + (copy+2)] = line2[copy];
    			}
    		}

    		if(i0x>=1)
    		{
    			for(int scan=0;scan<tileY;scan++)
    			{
					auto line1=img.readOnlyGetN(i-2+scan*width,2);
					tile[(scan+2)*(tileX+5) + (-2+2)] = line1[0];
					tile[(scan+2)*(tileX+5) + (-1+2)] = line1[1];
    			}
    		}



    		if(i0y<(height/tileY - 1))
    		{
    			auto lineLast1=img.readOnlyGetN(i+tileY*width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(tileY+2)*(tileX+5) + (copy+2)] = lineLast1[copy];
    			}
    			auto lineLast2=img.readOnlyGetN(i+(tileY+1)*width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(tileY+1+2)*(tileX+5) + (copy+2)] = lineLast2[copy];
    			}
    		}

    		if(i0x<(width/tileX-1))
    		{
    			for(int scan=0;scan<tileY;scan++)
    			{
					auto line1=img.readOnlyGetN(i+scan*width + tileX,2);
					tile[(scan+2)*(tileX+5) + (tileX+2)] = line1[0];
					tile[(scan+2)*(tileX+5) + (tileX+2+1)] = line1[1];
    			}
    		}

    		for(int ty=0;ty<tileY;ty++)
    		for(int tx=0;tx<tileX;tx++)
    		{
				auto center = tile[tx+2+(ty+2)*(tileX+5)];
				float v = 0;
				for(int iy=-2;iy<=2;iy++)
					for(int ix=-2;ix<=2;ix++)
					{
						auto neighbor = tile[tx+ix+2+(ty+iy+2)*(tileX+5)];
						v+=neighbor.val * gKernel[ix+2][iy+2];
					}
				center.val=v;
				img2.set(i+ty*width+tx,center);
    		}
		}



		for(int i0y=0;i0y<height/tileY;i0y++)
			#pragma omp parallel for num_threads(16)
			for(int i0x=0;i0x<width/tileX;i0x++)
		{
			const int i=i0y*tileY*width + i0x*tileX;
			Color tile[(tileX+5)*(tileY+5)];
			for(int scan=0;scan<tileY;scan++)
			{
				auto line=img2.readOnlyGetN(i+scan*width,tileX);
				for(int copy=0;copy<tileX;copy++)
				{
					tile[(scan+2)*(tileX+5) + (copy+2)] = line[copy];
				}
			}

    		if(i0y>=1)
    		{
    			auto line1=img2.readOnlyGetN(i-width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(-1+2)*(tileX+5) + (copy+2)] = line1[copy];
    			}
    			auto line2=img2.readOnlyGetN(i-2*width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(-2+2)*(tileX+5) + (copy+2)] = line2[copy];
    			}
    		}

    		if(i0x>=1)
    		{
    			for(int scan=0;scan<tileY;scan++)
    			{
					auto line1=img2.readOnlyGetN(i-2+scan*width,2);
					tile[(scan+2)*(tileX+5) + (-2+2)] = line1[0];
					tile[(scan+2)*(tileX+5) + (-1+2)] = line1[1];
    			}
    		}

    		if(i0y<height/tileY - 1)
    		{
    			auto lineLast1=img2.readOnlyGetN(i+tileY*width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(tileY+2)*(tileX+5) + (copy+2)] = lineLast1[copy];
    			}
    			auto lineLast2=img2.readOnlyGetN(i+(tileY+1)*width,tileX);
    			for(int copy=0;copy<tileX;copy++)
    			{
    				tile[(tileY+1+2)*(tileX+5) + (copy+2)] = lineLast2[copy];
    			}
    		}


    		if(i0x<(width/tileX-1))
    		{
    			for(int scan=0;scan<tileY;scan++)
    			{
					auto line1=img2.readOnlyGetN(i+scan*width + tileX,2);
					tile[(scan+2)*(tileX+5) + (tileX+2)] = line1[0];
					tile[(scan+2)*(tileX+5) + (tileX+2+1)] = line1[1];
    			}
    		}
			for(int ty=0;ty<tileY;ty++)
			for(int tx=0;tx<tileX;tx++)
			{
				auto center = tile[tx+2+(ty+2)*(tileX+5)];
				float v = 0;
				for(int iy=-2;iy<=2;iy++)
					for(int ix=-2;ix<=2;ix++)
					{
						auto neighbor = tile[tx+ix+2+(ty+iy+2)*(tileX+5)];
						v+=neighbor.val * gKernel[ix+2][iy+2];
					}
				center.val=v;
				img.set(i+ty*width+tx,center);
			}
		}

	}
    catch(std::exception & e)
    {
    	std::cout<<e.what()<<std::endl;
    }
    std::cout<<"blur complete. writing to file."<<std::endl;

	std::ofstream my_Image ("mandelbrot.ppm");
	if (my_Image.is_open ()) {
	     my_Image << "P3\n" << width << " " << height << " 255\n";
	     for (int i = 0; i < width; i++) {
	          for (int j = 0; j < height; j++)  {
	               int val = img.get(j+i*width).val;
	               my_Image << val<< ' ' << 0 << ' ' << 0 << "\n";
	          }
	     }
	     my_Image.close();
	}
	else std::cout << "Could not open the file";
	std::cout<<"File saved"<<std::endl;
	std::cout<<"cache hit ratio="<<img.getTotalCacheHitRatio()<<std::endl;
	return 0;
}