-
Notifications
You must be signed in to change notification settings - Fork 3
Example: Gaussian Blur
Hüseyin Tuğrul BÜYÜKIŞIK edited this page Mar 18, 2021
·
3 revisions
Mandelbrot set for easy verification: https://medium.com/farouk-ounanes-home-on-the-internet/mandelbrot-set-in-c-from-scratch-c7ad6a1bf2d9
Blur algorithm for benchmarking: https://www.geeksforgeeks.org/gaussian-filter-generation-c/
Recursive Z-order pattern (by scalar get/set methods of virtual array) to use LRU-cache of the virtual array: https://stackoverflow.com/questions/19214368/given-a-2d-array-convert-to-z-order
Resolution: 1024x1024
total virtual gpus:278
RAM cache / Virtual Array ratio:0.509033
Starting cpu-compute on gpu-array
mandelbrot compute finished. starting gaussian blur.
pixel stream for Gaussian Blur: 1057027897 nanoseconds (bandwidth = 214.27 MB/s) (throughput = 18.67 nanoseconds per iteration)
blur complete. writing to file.
File saved
cache hit ratio=1.00
Non-z-order tiled image processing where each thread works on a tile of 128x128 pixels to minimize average page-lock latency by bulk-read operations and still benefit from least-recently-used caching.
Resolution: 2048x2048
total virtual gpus:59
RAM cache / Virtual Array ratio:0.20166
Starting cpu-compute on gpu-array
mandelbrot compute finished. starting gaussian blur.
pixel stream for Gaussian Blur: 794146138 nanoseconds (bandwidth = 1140.81 MB/s) (throughput = 3.51 nanoseconds per iteration)
blur complete. writing to file.
File saved
cache hit ratio=1.00
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "CpuBenchmarker.h"
#include "PcieBandwidthBenchmarker.h"
// testing
#include <iostream>
#include <fstream>
#include <complex>
#include <omp.h>
#include <algorithm>
#ifndef M_PI
#define M_PI (3.14159265359)
#endif
class Color
{
public:
Color(){val=0; }
Color(int v){val=v;}
int val;
};
const int width = 1024;
const int height = 1024;
int value ( int x, int y) {std::complex<float> point((float)x/width-1.5, (float)y/height-0.5);std::complex<float> z(0, 0);
unsigned int nb_iter = 0;
while (abs (z) < 2 && nb_iter <= 34) {
z = z * z + point;
nb_iter++;
}
if (nb_iter < 34) return (255*nb_iter)/33;
else return 0;
}
void filterCreation(double GKernel[][5])
{
// intialising standard deviation to 1.0
double sigma = 1.0;
double r, s = 2.0 * sigma * sigma;
// sum is for normalization
double sum = 0.0;
// generating 5x5 kernel
for (int x = -2; x <= 2; x++) {
for (int y = -2; y <= 2; y++) {
r = sqrt(x * x + y * y);
GKernel[x + 2][y + 2] = (exp(-(r * r) / s)) / (M_PI * s);
sum += GKernel[x + 2][y + 2];
}
}
// normalising the Kernel
for (int i = 0; i < 5; ++i)
for (int j = 0; j < 5; ++j)
GKernel[i][j] /= sum;
}
void zorder(const int * source, int y0, int x0, int size,
int * target, const int N, int * counter)
{
if (size == 1) {
target[(*counter)++]=source[y0*N+x0];
} else {
int h = size/2;
zorder(source, y0, x0, h, target,N,counter);
zorder(source, y0, x0+h, h, target,N,counter);
zorder(source, y0+h, x0, h, target,N,counter);
zorder(source, y0+h, x0+h, h, target,N,counter);
}
}
int main(int argC, char ** argV)
{
GraphicsCardSupplyDepot depot;
// n needs to be integer multiple of pageSize !!!!
const size_t n = width*height;
const size_t pageSize=128;
const int maxActivePagesPerGpu = 15;
PcieBandwidthBenchmarker benchmarker;
std::vector<int> bw = benchmarker.bestBandwidth(60);
int tot = 0;
for(auto & e: bw)
tot+=e;
std::cout<<"total virtual gpus:"<<tot<<std::endl;
std::cout<<"RAM cache / Virtual Array ratio:"<<tot*pageSize*maxActivePagesPerGpu / ((double)n)<<std::endl;
bool cacheHitRatioQueryEnable = true;
VirtualMultiArray<Color> img(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,cacheHitRatioQueryEnable);
VirtualMultiArray<Color> img2(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,cacheHitRatioQueryEnable);
std::cout<<"Starting cpu-compute on gpu-array"<<std::endl;
#pragma omp parallel for num_threads(8) shared(img)
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
img.set(j+i*width,Color(value(i, j)));
}
}
std::cout<<"mandelbrot compute finished. starting gaussian blur."<<std::endl;
double gKernel[5][5];
filterCreation(gKernel);
int counter = 0;
std::vector<int> order,zOrder;
int ct=0;
for(int i=0;i<width;i++)
for(int ii=0;ii<width;ii++)
{
order.push_back(ct);
ct++;
zOrder.push_back(0);
}
zorder(order.data(), 0, 0, width, zOrder.data(),width,&counter);
// 2x blur
{
CpuBenchmarker bench(n*54*sizeof(Color),"pixel stream for Gaussian Blur",n*54);
#pragma omp parallel for num_threads(64)
for(int i=0;i<n;i++)
{
auto center = img.get(zOrder[i]);
float v = 0;
for(int iy=-2;iy<=2;iy++)
for(int ix=-2;ix<=2;ix++)
{
if(zOrder[i]+ix+iy*width>=0 && zOrder[i]+ix+iy*width<n)
{
auto neighbor = img.get(zOrder[i]+ix+iy*width);
v+=neighbor.val * gKernel[ix+2][iy+2];
}
}
center.val=v;
img2.set(zOrder[i],center);
}
#pragma omp parallel for num_threads(64)
for(int i=0;i<n;i++)
{
auto center = img2.get(zOrder[i]);
float v = 0;
for(int iy=-2;iy<=2;iy++)
for(int ix=-2;ix<=2;ix++)
{
if(zOrder[i]+ix+iy*width>=0 && zOrder[i]+ix+iy*width<n)
{
auto neighbor = img2.get(zOrder[i]+ix+iy*width);
v+=neighbor.val * gKernel[ix+2][iy+2];
}
}
center.val=v;
img.set(zOrder[i],center);
}
}
std::cout<<"blur complete. writing to file."<<std::endl;
std::ofstream my_Image ("mandelbrot.ppm");
if (my_Image.is_open ()) {
my_Image << "P3\n" << width << " " << height << " 255\n";
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
int val = img.get(j+i*width).val;
my_Image << val<< ' ' << 0 << ' ' << 0 << "\n";
}
}
my_Image.close();
}
else std::cout << "Could not open the file";
std::cout<<"File saved"<<std::endl;
std::cout<<"cache hit ratio="<<img.getTotalCacheHitRatio()<<std::endl;
return 0;
}
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "CpuBenchmarker.h"
#include "PcieBandwidthBenchmarker.h"
// testing
#include <iostream>
#include <fstream>
#include <complex>
#include <omp.h>
#include <algorithm>
#ifndef M_PI
#define M_PI (3.14159265359)
#endif
class Color
{
public:
Color(){val=0; }
Color(int v){val=v;}
int val;
};
const int width = 1024*2;
const int height = 1024*2;
int value ( int x, int y) {std::complex<float> point((float)x/width-1.5, (float)y/height-0.5);std::complex<float> z(0, 0);
unsigned int nb_iter = 0;
while (abs (z) < 2 && nb_iter <= 34) {
z = z * z + point;
nb_iter++;
}
if (nb_iter < 34) return (255*nb_iter)/33;
else return 0;
}
void filterCreation(double GKernel[][5])
{
// intialising standard deviation to 1.0
double sigma = 1.0;
double r, s = 2.0 * sigma * sigma;
// sum is for normalization
double sum = 0.0;
// generating 5x5 kernel
for (int x = -2; x <= 2; x++) {
for (int y = -2; y <= 2; y++) {
r = sqrt(x * x + y * y);
GKernel[x + 2][y + 2] = (exp(-(r * r) / s)) / (M_PI * s);
sum += GKernel[x + 2][y + 2];
}
}
// normalising the Kernel
for (int i = 0; i < 5; ++i)
for (int j = 0; j < 5; ++j)
GKernel[i][j] /= sum;
}
int main(int argC, char ** argV)
{
GraphicsCardSupplyDepot depot;
// n needs to be integer multiple of pageSize !!!!
const size_t n = width*height;
const size_t pageSize=width;
const int maxActivePagesPerGpu = 7;
PcieBandwidthBenchmarker benchPci;
auto bw = benchPci.bestBandwidth(13);
int tot = 0;
for(auto & e: bw)
tot+=e;
std::cout<<"total virtual gpus:"<<tot<<std::endl;
std::cout<<"RAM cache / Virtual Array ratio:"<<tot*pageSize*maxActivePagesPerGpu / ((double)n)<<std::endl;
VirtualMultiArray<Color> img(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,true);
VirtualMultiArray<Color> img2(n,depot.requestGpus(),pageSize,maxActivePagesPerGpu,bw,VirtualMultiArray<Color>::MemMult::UseDefault,true,true);
std::cout<<"Starting cpu-compute on gpu-array"<<std::endl;
#pragma omp parallel for num_threads(8) shared(img)
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
img.set(j+i*width,Color(value(i, j)));
}
}
std::cout<<"mandelbrot compute finished. starting gaussian blur."<<std::endl;
double gKernel[5][5];
filterCreation(gKernel);
// 2x blur
try
{
const int tileX = 128;
const int tileY = 128;
CpuBenchmarker bench(n*54*sizeof(Color),"pixel stream for Gaussian Blur",n*54);
for(int i0y=0;i0y<height/tileY;i0y++)
#pragma omp parallel for num_threads(16)
for(int i0x=0;i0x<width/tileX;i0x++)
{
const int i=i0y*tileY*width + i0x*tileX;
Color tile[(tileX+5)*(tileY+5)];
for(int scan=0;scan<tileY;scan++)
{
auto line=img.readOnlyGetN(i+scan*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(scan+2)*(tileX+5) + (copy+2)] = line[copy];
}
}
if(i0y>=1)
{
auto line1=img.readOnlyGetN(i-width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(-1+2)*(tileX+5) + (copy+2)] = line1[copy];
}
auto line2=img.readOnlyGetN(i-2*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(-2+2)*(tileX+5) + (copy+2)] = line2[copy];
}
}
if(i0x>=1)
{
for(int scan=0;scan<tileY;scan++)
{
auto line1=img.readOnlyGetN(i-2+scan*width,2);
tile[(scan+2)*(tileX+5) + (-2+2)] = line1[0];
tile[(scan+2)*(tileX+5) + (-1+2)] = line1[1];
}
}
if(i0y<(height/tileY - 1))
{
auto lineLast1=img.readOnlyGetN(i+tileY*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(tileY+2)*(tileX+5) + (copy+2)] = lineLast1[copy];
}
auto lineLast2=img.readOnlyGetN(i+(tileY+1)*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(tileY+1+2)*(tileX+5) + (copy+2)] = lineLast2[copy];
}
}
if(i0x<(width/tileX-1))
{
for(int scan=0;scan<tileY;scan++)
{
auto line1=img.readOnlyGetN(i+scan*width + tileX,2);
tile[(scan+2)*(tileX+5) + (tileX+2)] = line1[0];
tile[(scan+2)*(tileX+5) + (tileX+2+1)] = line1[1];
}
}
for(int ty=0;ty<tileY;ty++)
for(int tx=0;tx<tileX;tx++)
{
auto center = tile[tx+2+(ty+2)*(tileX+5)];
float v = 0;
for(int iy=-2;iy<=2;iy++)
for(int ix=-2;ix<=2;ix++)
{
auto neighbor = tile[tx+ix+2+(ty+iy+2)*(tileX+5)];
v+=neighbor.val * gKernel[ix+2][iy+2];
}
center.val=v;
img2.set(i+ty*width+tx,center);
}
}
for(int i0y=0;i0y<height/tileY;i0y++)
#pragma omp parallel for num_threads(16)
for(int i0x=0;i0x<width/tileX;i0x++)
{
const int i=i0y*tileY*width + i0x*tileX;
Color tile[(tileX+5)*(tileY+5)];
for(int scan=0;scan<tileY;scan++)
{
auto line=img2.readOnlyGetN(i+scan*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(scan+2)*(tileX+5) + (copy+2)] = line[copy];
}
}
if(i0y>=1)
{
auto line1=img2.readOnlyGetN(i-width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(-1+2)*(tileX+5) + (copy+2)] = line1[copy];
}
auto line2=img2.readOnlyGetN(i-2*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(-2+2)*(tileX+5) + (copy+2)] = line2[copy];
}
}
if(i0x>=1)
{
for(int scan=0;scan<tileY;scan++)
{
auto line1=img2.readOnlyGetN(i-2+scan*width,2);
tile[(scan+2)*(tileX+5) + (-2+2)] = line1[0];
tile[(scan+2)*(tileX+5) + (-1+2)] = line1[1];
}
}
if(i0y<height/tileY - 1)
{
auto lineLast1=img2.readOnlyGetN(i+tileY*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(tileY+2)*(tileX+5) + (copy+2)] = lineLast1[copy];
}
auto lineLast2=img2.readOnlyGetN(i+(tileY+1)*width,tileX);
for(int copy=0;copy<tileX;copy++)
{
tile[(tileY+1+2)*(tileX+5) + (copy+2)] = lineLast2[copy];
}
}
if(i0x<(width/tileX-1))
{
for(int scan=0;scan<tileY;scan++)
{
auto line1=img2.readOnlyGetN(i+scan*width + tileX,2);
tile[(scan+2)*(tileX+5) + (tileX+2)] = line1[0];
tile[(scan+2)*(tileX+5) + (tileX+2+1)] = line1[1];
}
}
for(int ty=0;ty<tileY;ty++)
for(int tx=0;tx<tileX;tx++)
{
auto center = tile[tx+2+(ty+2)*(tileX+5)];
float v = 0;
for(int iy=-2;iy<=2;iy++)
for(int ix=-2;ix<=2;ix++)
{
auto neighbor = tile[tx+ix+2+(ty+iy+2)*(tileX+5)];
v+=neighbor.val * gKernel[ix+2][iy+2];
}
center.val=v;
img.set(i+ty*width+tx,center);
}
}
}
catch(std::exception & e)
{
std::cout<<e.what()<<std::endl;
}
std::cout<<"blur complete. writing to file."<<std::endl;
std::ofstream my_Image ("mandelbrot.ppm");
if (my_Image.is_open ()) {
my_Image << "P3\n" << width << " " << height << " 255\n";
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
int val = img.get(j+i*width).val;
my_Image << val<< ' ' << 0 << ' ' << 0 << "\n";
}
}
my_Image.close();
}
else std::cout << "Could not open the file";
std::cout<<"File saved"<<std::endl;
std::cout<<"cache hit ratio="<<img.getTotalCacheHitRatio()<<std::endl;
return 0;
}