Commit 9313ebfa authored by Gaurav Kukreja

Working Convolution

Signed-off-by: Gaurav Kukreja <gaurav@gauravk.in>
parent 5cf429a7
...@@ -33,8 +33,27 @@ using namespace std; ...@@ -33,8 +33,27 @@ using namespace std;
#define USING_GPU #define USING_GPU
// Device-side minimum of two values of the same type.
// Yields a only when a compares strictly less than b; in every other
// case (including when the two compare equal) b is the result.
template<typename T>
__device__ T gpu_min(T a, T b)
{
    return (a < b) ? a : b;
}
// Device-side maximum of two values of the same type.
// Yields b only when a compares strictly less than b; in every other
// case (including when the two compare equal) a is the result.
template<typename T>
__device__ T gpu_max(T a, T b)
{
    return (a < b) ? b : a;
}
// Image Gradient // Image Gradient
__device__ void convolveImage(float* imgIn, float* kernel, float* imgOut, int rad, int w, size_t h, size_t nc) __device__ void convolveImage(float* imgIn, float* kernel, float* imgOut, int rad, int w, int h, int nc)
{ {
int ix = threadIdx.x + blockDim.x * blockIdx.x; int ix = threadIdx.x + blockDim.x * blockIdx.x;
int iy = threadIdx.y + blockDim.y * blockIdx.y; int iy = threadIdx.y + blockDim.y * blockIdx.y;
...@@ -42,24 +61,27 @@ __device__ void convolveImage(float* imgIn, float* kernel, float* imgOut, int ra ...@@ -42,24 +61,27 @@ __device__ void convolveImage(float* imgIn, float* kernel, float* imgOut, int ra
int idx = ix + (iy * w) + (iz * w * h); int idx = ix + (iy * w) + (iz * w * h);
if (idx < count) if (ix < w && iy < h && iz < nc)
{ {
imgOut[idx] = 0; imgOut[idx] = 0;
float value = 0;
for(int j = -rad; j < rad; j++) for(int j = -rad; j < rad; j++)
{ {
int iny = gpu_max(0, gpu_min(iy+j, h-1));
for(int i = -rad; i < rad; i++) for(int i = -rad; i < rad; i++)
{ {
size_t inIdx = idx + i + (w * j); int inx = gpu_max(0, gpu_min(ix+i, w-1));
if ((ix + i + (w * j)) > 0 && (ix + i + (w * j)) < n_pixels) int inIdx = inx + iny * w + iz * w * h;
imgOut[idx] += imgIn[inIdx] * kernel[i+rad + ((j+rad) * rad)]; value += imgIn[inIdx] * kernel[i+rad + ((j+rad) * rad)];
} }
} }
imgOut[idx] = value;
} }
} }
__global__ void callKernel(float* imgIn, float* kernel, float* imgOut, int rad, int w, size_t n_pixels, size_t count) __global__ void callKernel(float* imgIn, float* kernel, float* imgOut, int rad, int w, int h, int nc)
{ {
convolveImage(imgIn, kernel, imgOut, rad, w, n_pixels, count); convolveImage(imgIn, kernel, imgOut, rad, w, h, nc);
} }
int main(int argc, char **argv) int main(int argc, char **argv)
...@@ -250,8 +272,8 @@ int main(int argc, char **argv) ...@@ -250,8 +272,8 @@ int main(int argc, char **argv)
size_t count = w * h * nc; size_t count = w * h * nc;
// Thread Dimensions // Thread Dimensions
dim3 block = dim3(128, nc, 1); dim3 block = dim3(16, 4, nc);
dim3 grid = dim3((n_pixels + block.x - 1) / block.x, 1, 1); dim3 grid = dim3((w + block.x - 1) / block.x, (h + block.y - 1) / block.y, 1);
// Allocating memory on the device // Allocating memory on the device
float *d_imgIn = NULL; float *d_imgIn = NULL;
...@@ -266,7 +288,7 @@ int main(int argc, char **argv) ...@@ -266,7 +288,7 @@ int main(int argc, char **argv)
cudaMemcpy(d_kernel, kernel, 2 * rad * 2 * rad * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, kernel, 2 * rad * 2 * rad * sizeof(float), cudaMemcpyHostToDevice);
// Calling Kernel // Calling Kernel
callKernel <<< grid, block >>> (d_imgIn, d_kernel, d_imgOut, rad, w, n_pixels, count); callKernel <<< grid, block >>> (d_imgIn, d_kernel, d_imgOut, rad, w, h, nc);
// Copying result back // Copying result back
cudaMemcpy(imgOut, d_imgOut, count * sizeof(float), cudaMemcpyDeviceToHost); cudaMemcpy(imgOut, d_imgOut, count * sizeof(float), cudaMemcpyDeviceToHost);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment