Added CPU version for convolution

Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>

Added CPU version for convolution
Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>
bf02d566 · Gaurav Kukreja · 9313ebfa · bf02d566
Commit bf02d566 authored Mar 05, 2014 by Gaurav Kukreja
Hide whitespace changes
Inline Side-by-side

Showing with 53 additions and 47 deletions

main.cu gaurav/1_Assign/ex6/main.cu +53 -47

No files found.
--- a/gaurav/1_Assign/ex6/main.cu
+++ b/gaurav/1_Assign/ex6/main.cu
@@ -18,7 +18,7 @@
 // ###
 // ### TODO: For every student of your group, please provide here:
 // ###
-// ### name, email, login username (for example p123)
+// ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058 
 // ###
 // ###
@@ -31,7 +31,7 @@ using namespace std;
 // uncomment to use the camera
 //#define CAMERA
-#define USING_GPU
+//#define USING_GPU
 template<typename T>
 __device__ T gpu_min(T a, T b)
@@ -59,19 +59,21 @@ __device__ void convolveImage(float* imgIn, float* kernel, float* imgOut, int ra
    int iy = threadIdx.y + blockDim.y * blockIdx.y;
    int iz = threadIdx.z + blockDim.z * blockIdx.z;
+    // Index of the output image, this kernel works on
    int idx = ix + (iy * w) + (iz * w * h);  
-    if (ix < w && iy < h && iz < nc)
+    // check limits
+    if (idx < w * h * nc)
    { 
-        imgOut[idx] = 0;
+        imgOut[idx] = 0;						    // initialize
 	float value = 0;
-        for(int j = -rad; j < rad; j++)
+        for(int j = -rad; j < rad; j++)					    // for each row in kernel
-        {
+        {   
-	    int iny = gpu_max(0, gpu_min(iy+j, h-1));
+	    int iny = gpu_max(0, gpu_min(iy+j, h-1));			    
-	    for(int i = -rad; i < rad; i++)
+	    for(int i = -rad; i < rad; i++)				    // for each element in the kernel row
            {
 		int inx = gpu_max(0, gpu_min(ix+i, w-1));
-	        int inIdx = inx + iny * w + iz * w * h; 
+	        int inIdx = inx + (iny * w) + (iz * w * h);		    // Index of Input Image to be multiplied by corresponding element in kernel
 		value += imgIn[inIdx] * kernel[i+rad + ((j+rad) * rad)];
 	    }
 	}
@@ -86,11 +88,12 @@ __global__ void callKernel(float* imgIn, float* kernel, float* imgOut, int rad,
 int main(int argc, char **argv)
 {
+#ifdef USING_GPU
    // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
    // This happens on the very first call to a CUDA function, and takes some time (around half a second)
    // We will do it right here, so that the run time measurements are accurate
    cudaDeviceSynchronize();  CUDA_CHECK;
+#endif // USING_GPU
@@ -211,46 +214,48 @@ int main(int argc, char **argv)
    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
    convert_mat_to_layered (imgIn, mIn);
-    int rad = ceil(3 * sigma);
+    int rad = ceil(3 * sigma); // kernel radius
+    int kw = 2 * rad; // kernel width
-    float c = 1. / (2. * 3.142857 * sigma * sigma);
+    float c = 1. / (2. * 3.142857 * sigma * sigma); // constant
-    float *kernel =  new float[(size_t) (2 * rad * 2 * rad)];
+    float *kernel =  new float[(size_t) (kw * kw)]; // kernel
-    float *kernelOut = new float[(size_t) (2 * rad * 2 * rad)];
+    float *kernelOut = new float[(size_t) (kw * kw)]; // kernel to be displayed
+    // Computation of Kernel
    float a;
    float b;
-    for (int iy = 0; iy < 2 * rad; iy++)
+    for (int iy = 0; iy < kw; iy++)
    {
 	a = iy - rad;
-	for (int ix = 0; ix < 2 * rad; ix++)
+	for (int ix = 0; ix < kw; ix++)
 	{
 	    b = ix - rad;
-	    kernel[ix + (iy * 2 * rad)] = c * exp(-(a*a + b*b) / (2 * sigma*sigma));
+	    kernel[ix + (iy * kw)] = c * exp(-(a*a + b*b) / (2 * sigma*sigma));
 	}
    }	
+    // Normalization of Kernel
    float sum = 0.;
    float kmax = 0.;
+    for (int iy = 0; iy < kw; iy++)
-    for (int iy = 0; iy < 2 * rad; iy++)
    {
-        for (int ix = 0; ix < 2 * rad; ix++)
+        for (int ix = 0; ix < kw; ix++)
 	{
-	    kmax = max(kmax, kernel[ix + (iy * 2 * rad)]); 
+	    kmax = max(kmax, kernel[ix + (iy * kw)]); 
-	    sum += kernel[ix + (iy * 2 * rad)];	    
+	    sum += kernel[ix + (iy * kw)];	    
 	}
    }
-    for (int iy = 0; iy < 2 * rad; iy++)
+    for (int iy = 0; iy < kw; iy++)
    {   
-	for (int ix = 0; ix < 2 * rad; ix++)
+	for (int ix = 0; ix < kw; ix++)
 	{
-	    kernelOut[ix + (iy * 2 * rad)] = kernel[ix + (iy * 2 * rad)] / kmax;
+	    kernelOut[ix + (iy * kw)] = kernel[ix + (iy * kw)] / kmax;
-	    kernel[ix + (iy * 2 * rad)] = kernel[ix + (iy * 2 * rad)] / sum;
+	    kernel[ix + (iy * kw)] = kernel[ix + (iy * kw)] / sum;
 	}
    }
+    // Display Kernel
    cv::Mat cvKernelOut(2*rad, 2*rad, CV_32F);
    convert_layered_to_mat(cvKernelOut, kernelOut);
    showImage("Kernel", cvKernelOut, 100, 10);
@@ -268,7 +273,6 @@ int main(int argc, char **argv)
    // Repetitions Loop
    for(int rep = 0; rep < repeats; rep++)
    {
-        size_t n_pixels = w * h;
 	size_t count = w * h * nc;        
        // Thread Dimensions
@@ -281,11 +285,11 @@ int main(int argc, char **argv)
 	float *d_kernel = NULL;
        cudaMalloc(&d_imgIn, count * sizeof(float));
        cudaMalloc(&d_imgOut, count * sizeof(float));
-	cudaMalloc(&d_kernel, 2 * rad * 2 * rad * sizeof(float));
+	cudaMalloc(&d_kernel, kw * kw * sizeof(float));
        // Copying Input image to device, and initializing result to 0
        cudaMemcpy(d_imgIn, imgIn, count * sizeof(float), cudaMemcpyHostToDevice);
-        cudaMemcpy(d_kernel, kernel, 2 * rad * 2 * rad * sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_kernel, kernel, kw * kw * sizeof(float), cudaMemcpyHostToDevice);
        // Calling Kernel
        callKernel <<< grid, block >>> (d_imgIn, d_kernel, d_imgOut, rad, w, h, nc);        
@@ -312,27 +316,29 @@ int main(int argc, char **argv)
    // Repetitions Loop
    for(int rep = 0; rep < repeats; rep++)    
    {
-        size_t n_pixels = w * h;
+        for(int ix = 0; ix < w; ix++)
-	float *forwardDiffX = NULL;
+	{
-	float *forwardDiffY = NULL;
-	forwardDiffX = (float*) malloc(sizeof(float) * w * h * nc);
-	forwardDiffY = (float*) malloc(sizeof(float) * w * h * nc);
-        memset(imgOut, 0, n_pixels * sizeof(float));
-	for(int ix = 0; ix < w; ix ++)
 	    for(int iy = 0; iy < h; iy++)
 	    {
-		imgOut[ix + (iy * w)] = 0;
+		for(int iz = 0; iz < h; iz++)
-	        for(int i = 0; i < nc; i++)
+	    	{
-	        {
+		    int idx = ix + (iy * w) + (iz * w * h);
-	            forwardDiffX[ix + (iy * w) + (i * w * h)] = (ix < (w-2)) ? (imgIn[ix + (iy * w) + (i * w * h) + 1] - imgIn[ix + (iy * w) + (i * w * h)]) : 0;
+	            imgOut[idx] = 0;                                                    // initialize
-	            forwardDiffY[ix + (iy * w) + (i * w * h)] = (iy < (h-2)) ? (imgIn[ix + ((iy + 1) * w) + (i * w * h)] - imgIn[ix + (iy * w) + (i * w * h)]) : 0; 
+	            float value = 0;
-	            imgOut[ix + (iy * w)] += pow(forwardDiffX[ix + (iy * w) + (i * w * h)] , 2) +  pow(forwardDiffY[ix + (iy * w) + (i * w * h)] , 2);
+	            for(int j = -rad; j < rad; j++)                                     // for each row in kernel
+	            {
+	                int iny = max(0, min(iy+j, h-1));
+	                for(int i = -rad; i < rad; i++)                                 // for each element in the kernel row
+	                {
+	                    int inx = max(0, min(ix+i, w-1));
+	                    int inIdx = inx + (iny * w) + (iz * w * h);                 // Index of Input Image to be multiplied by corresponding element in kernel
+	                    value += imgIn[inIdx] * kernel[i+rad + ((j+rad) * rad)];
+	                }
+	            }
+	            imgOut[idx] = value;
 	        }
-	        imgOut[ix + (iy * w)] = sqrt(imgOut[ix + (iy * w)]);
 	    }
+	}
    }