Assignment2, ex4

Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>

Assignment2, ex4
Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>
57882761 · Gaurav Kukreja · 9eb6b7a5 · 57882761 · 57882761 · 57882761
Commit 57882761 authored Mar 05, 2014 by Gaurav Kukreja
Showing with 659 additions and 0 deletions

.gitignore .gitignore +3 -0

Makefile gaurav/2_Assign/ex4/Makefile +3 -0

aux.cu gaurav/2_Assign/ex4/aux.cu +146 -0

aux.h gaurav/2_Assign/ex4/aux.h +109 -0

main.cu gaurav/2_Assign/ex4/main.cu +398 -0

No files found.
--- a/.gitignore
+++ b/.gitignore
 main
+image_input.png
+image_result.png
+*.swp
--- a/gaurav/2_Assign/ex4/Makefile
+++ b/gaurav/2_Assign/ex4/Makefile
+main: main.cu aux.cu aux.h Makefile
+	nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
+
--- a/gaurav/2_Assign/ex4/aux.cu
+++ b/gaurav/2_Assign/ex4/aux.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+
+
+#include "aux.h"
+#include <cstdlib>
+#include <iostream>
+using std::stringstream;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::string;
+
+
+
+
+// parameter processing: template specialization for T=bool
+// Specialization of getParam for boolean flags.
+// argv is scanned from the last argument towards the first for "-<param>".
+// A flag that is the final argument, or that is followed by another "-..."
+// argument, sets var to true. Otherwise the following argument is extracted
+// into var through a stringstream and the return value reports whether that
+// extraction succeeded. Returns false when the flag does not occur at all.
+template<>
+bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
+{
+    const char *wanted = param.c_str();
+    for (int pos = argc - 1; pos >= 1; --pos)
+    {
+        const char *arg = argv[pos];
+        if (arg[0] != '-' || strcmp(arg + 1, wanted) != 0) continue;
+
+        // Bare flag: nothing follows it, or the next token is another option.
+        const bool hasValue = (pos + 1 < argc) && (argv[pos + 1][0] != '-');
+        if (!hasValue)
+        {
+            var = true;
+            return true;
+        }
+
+        std::stringstream parser;
+        parser << argv[pos + 1];
+        parser >> var;
+        return !parser.fail();
+    }
+    return false;
+}
+
+
+
+
+// opencv helpers
+// Converts a layered image (all pixels of channel 0, then all pixels of
+// channel 1, ...) into an interleaved one (all channels of pixel 0, then all
+// channels of pixel 1, ...). The channel order is reversed on the way:
+// layer c is written to interleaved slot nc-1-c.
+//   aOut : destination, w*h*nc floats, interleaved
+//   aIn  : source, w*h*nc floats, layered
+//   w, h : image width and height in pixels
+//   nc   : number of channels; for nc==1 both layouts coincide and the data
+//          is copied as is
+void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Pixel count in size_t, so that w*h cannot overflow int on large images
+    // (this also covers the byte count handed to memcpy).
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[(nc-1-c) + nc*pixel] = aIn[pixel + nOmega*c];
+            }
+        }
+    }
+}
+// Fills mOut from the layered array aIn; width, height and channel count
+// are taken from mOut, whose data buffer is accessed as float.
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
+{
+    float *interleaved = (float*)mOut.data;
+    const int width = mOut.cols;
+    const int height = mOut.rows;
+    const int channels = mOut.channels();
+    convert_layered_to_interleaved(interleaved, aIn, width, height, channels);
+}
+
+
+// Converts an interleaved image (all channels of pixel 0, then all channels
+// of pixel 1, ...) into a layered one (all pixels of channel 0, then all
+// pixels of channel 1, ...). The channel order is reversed on the way:
+// interleaved slot nc-1-c is written to layer c.
+//   aOut : destination, w*h*nc floats, layered
+//   aIn  : source, w*h*nc floats, interleaved
+//   w, h : image width and height in pixels
+//   nc   : number of channels; for nc==1 both layouts coincide and the data
+//          is copied as is
+void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Pixel count in size_t, so that w*h cannot overflow int on large images
+    // (this also covers the byte count handed to memcpy).
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[pixel + nOmega*c] = aIn[(nc-1-c) + nc*pixel];
+            }
+        }
+    }
+}
+// Writes the contents of mIn into the layered array aOut; width, height and
+// channel count are taken from mIn, whose data buffer is accessed as float.
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
+{
+    const float *interleaved = (float*)mIn.data;
+    const int width = mIn.cols;
+    const int height = mIn.rows;
+    const int channels = mIn.channels();
+    convert_interleaved_to_layered(aOut, interleaved, width, height, channels);
+}
+
+
+
+// Creates an auto-sized window named title, moves it to screen position
+// (x, y) and displays mat in it.
+void showImage(string title, const cv::Mat &mat, int x, int y)
+{
+    const char *windowName = title.c_str();
+
+    cv::namedWindow(windowName, CV_WINDOW_AUTOSIZE);
+    cvMoveWindow(windowName, x, y);
+    cv::imshow(windowName, mat);
+}
+
+
+
+
+// adding Gaussian noise
+// Returns one random sample scaled by sigma, built from two rand() draws
+// u1 and u2 in [0,1] as sigma * sqrt(-2 ln u1) * cos(2 pi u2). u1 is clamped
+// from below to 1e-6 before the logarithm is taken.
+float noise(float sigma)
+{
+    const float u1 = (float)rand()/RAND_MAX;
+    const float u2 = (float)rand()/RAND_MAX;
+    const float radius = sqrtf(-2*log(std::max(u1,0.000001f)));
+    const float direction = cosf(2*M_PI*u2);
+    return sigma * radius * direction;
+}
+// Adds an independent noise(sigma) sample to every element of m. The data
+// buffer of m is accessed as float and treated as one contiguous run of
+// cols * rows * channels values.
+void addNoise(cv::Mat &m, float sigma)
+{
+    const size_t total = (size_t)m.cols * m.rows * m.channels();
+    float *value = (float*)m.data;
+    for (float *const stop = value + total; value != stop; ++value)
+    {
+        *value += noise(sigma);
+    }
+}
+
+
+
+
+// cuda error checking
+// Location of the most recent cuda_check() call that found no error.
+string prev_file = "";
+int prev_line = 0;
+
+// Queries cudaGetLastError(). If no error is pending, the given location is
+// remembered and the function returns. Otherwise the location, the error
+// string and the error code are printed, followed by the previously
+// remembered location (if any), and the program exits with status 1.
+void cuda_check(string file, int line)
+{
+    const cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+    {
+        prev_file = file;
+        prev_line = line;
+        return;
+    }
+
+    cout << endl << file << ", line " << line << ": " << cudaGetErrorString(status) << " (" << status << ")" << endl;
+    if (prev_line>0)
+    {
+        cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
+    }
+    exit(1);
+}
--- a/gaurav/2_Assign/ex4/aux.h
+++ b/gaurav/2_Assign/ex4/aux.h
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+
+
+#ifndef AUX_H
+#define AUX_H
+
+#include <cuda_runtime.h>
+#include <ctime>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <sstream>
+
+
+
+
+// parameter processing
+// Looks for "-<param> <value>" on the command line and extracts <value> into
+// var through a stringstream. argv is scanned from the last argument towards
+// the first; an occurrence of "-<param>" without a following argument is
+// skipped. Returns whether the extraction succeeded, or false (leaving var
+// untouched) when no usable occurrence exists.
+template<typename T>
+bool getParam(std::string param, T &var, int argc, char **argv)
+{
+    const char *wanted = param.c_str();
+    for (int pos = argc - 1; pos >= 1; --pos)
+    {
+        const char *arg = argv[pos];
+        const bool isWanted = (arg[0] == '-') && (strcmp(arg + 1, wanted) == 0);
+        if (!isWanted || pos + 1 >= argc) continue;
+
+        std::stringstream parser;
+        parser << argv[pos + 1];
+        parser >> var;
+        return !parser.fail();
+    }
+    return false;
+}
+
+
+
+
+// opencv helpers
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
+void showImage(std::string title, const cv::Mat &mat, int x, int y);
+
+
+
+
+// adding Gaussian noise
+void addNoise(cv::Mat &m, float sigma);
+
+
+
+
+// measuring time
+// Stopwatch based on clock(). end() calls cudaDeviceSynchronize() before it
+// reads the clock, and get() returns the last measured duration in seconds.
+class Timer
+{
+    public:
+    Timer() : tStart(0), running(false), sec(0.f) {}
+
+    // Stores the current clock() value and marks the timer as running.
+    void start()
+    {
+        tStart = clock();
+        running = true;
+    }
+
+    // Stops a running timer and stores the elapsed time in seconds.
+    // Calling end() on a timer that is not running resets the result to 0.
+    void end()
+    {
+        if (running)
+        {
+            cudaDeviceSynchronize();
+            const clock_t tEnd = clock();
+            sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
+            running = false;
+        }
+        else
+        {
+            sec = 0;
+        }
+    }
+
+    // Returns the stored duration in seconds; a timer that is still running
+    // is stopped first.
+    float get()
+    {
+        if (running)
+        {
+            end();
+        }
+        return sec;
+    }
+
+    private:
+    clock_t tStart;  // clock() value taken by start()
+    bool running;    // true between start() and end()
+    float sec;       // last measured duration in seconds
+};
+
+
+
+
+// cuda error checking
+#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
+void cuda_check(std::string file, int line);
+
+
+
+#endif  // AUX_H
--- a/gaurav/2_Assign/ex4/main.cu
+++ b/gaurav/2_Assign/ex4/main.cu
+    // ###
+    // ###
+    // ### Practical Course: GPU Programming in Computer Vision
+    // ###
+    // ###
+    // ### Technical University Munich, Computer Vision Group
+    // ### Winter Semester 2013/2014, March 3 - April 4
+    // ###
+    // ###
+    // ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+    // ###
+    // ###
+    // ###
+
+
+
+    // ###
+    // ###
+    // ### TODO: For every student of your group, please provide here:
+    // ###
+    // ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058 
+    // ###
+    // ###
+
+
+    #include "aux.h"
+    #include <iostream>
+    #include <math.h>
+    using namespace std;
+
+    #define MAX_KERNEL_WIDTH 20
+
+    __constant__ float constKernel[MAX_KERNEL_WIDTH * MAX_KERNEL_WIDTH];
+
+    // uncomment to use the camera
+    //#define CAMERA
+
+    #define USING_GPU
+
+    // Device-side minimum: returns a if a < b, otherwise b.
+    template<typename T>
+    __device__ T gpu_min(T a, T b)
+    {
+        return (a < b) ? a : b;
+    }
+
+    // Device-side maximum: returns b if a < b, otherwise a.
+    template<typename T>
+    __device__ T gpu_max(T a, T b)
+    {
+        return (a < b) ? b : a;
+    }
+
+
+    // Convolution of one output pixel with the filter stored in constKernel.
+    // Each thread handles the pixel (ix, iy) of channel iz derived from its
+    // block and thread indices. Input coordinates that fall outside the image
+    // are clamped to the nearest border pixel.
+    //   imgIn  : layered input image, w*h*nc floats
+    //   imgOut : layered output image, w*h*nc floats
+    //   rad    : kernel radius; constKernel is read as a row-major
+    //            (2*rad) x (2*rad) array in which entry (i+rad, j+rad) weights
+    //            the pixel at offset (i, j), with i and j in [-rad, rad)
+    //   w, h   : image width and height in pixels
+    //   nc     : number of channels
+    __device__ void convolveImage(float* imgIn, float* imgOut, int rad, int w, int h, int nc)
+    {
+        const int ix = threadIdx.x + blockDim.x * blockIdx.x;
+        const int iy = threadIdx.y + blockDim.y * blockIdx.y;
+        const int iz = threadIdx.z + blockDim.z * blockIdx.z;
+
+        // Reject threads outside the image on every axis. Testing only the
+        // flattened index would let a thread with ix >= w overwrite a pixel of
+        // the following row.
+        if (ix >= w || iy >= h || iz >= nc) return;
+
+        const int kw = 2 * rad;                          // row stride of constKernel
+        const size_t layer = (size_t)iz * w * h;         // offset of channel iz
+
+        float value = 0;
+        for (int j = -rad; j < rad; j++)                 // for each row in kernel
+        {
+            const int iny = gpu_max(0, gpu_min(iy + j, h - 1));
+            const size_t rowIn = layer + (size_t)iny * w;
+            const int rowKernel = (j + rad) * kw;
+            for (int i = -rad; i < rad; i++)             // for each element in the kernel row
+            {
+                const int inx = gpu_max(0, gpu_min(ix + i, w - 1));
+                value += imgIn[rowIn + inx] * constKernel[rowKernel + (i + rad)];
+            }
+        }
+        imgOut[layer + (size_t)iy * w + ix] = value;
+    }
+
+    // Kernel entry point: forwards all arguments unchanged to convolveImage,
+    // which does the per-thread work.
+    __global__ void callKernel(float* imgIn, float* imgOut, int rad, int w, int h, int nc)
+    {
+        convolveImage(imgIn, imgOut, rad, w, h, nc);
+    }
+
+    // Entry point: blurs an image with a Gaussian kernel of standard deviation
+    // sigma, on the GPU when USING_GPU is defined and on the CPU otherwise.
+    // The kernel, the input and the result are displayed, and the input and
+    // the result are written to image_input.png and image_result.png.
+    //
+    // Command line (getParam looks for "-name value" pairs):
+    //   -i <image>      input image (required unless CAMERA is defined)
+    //   -repeats <n>    number of timed repetitions, default 1
+    //   -gray           load the image as grayscale
+    //   -sigma <s>      standard deviation of the Gaussian, default 2.0
+    //
+    // Returns 0 on success, 1 on invalid input and -1 when the kernel does not
+    // fit into the constant-memory buffer constKernel.
+    int main(int argc, char **argv)
+    {
+    #ifdef USING_GPU
+        // The CUDA context is created by the first CUDA call, which takes a
+        // noticeable amount of time. Trigger it here so that it does not end up
+        // in the run time measurement.
+        cudaDeviceSynchronize();  CUDA_CHECK;
+    #endif // USING_GPU
+
+        // Reading command line parameters:
+        // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
+        // return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
+
+    #ifdef CAMERA
+    #else
+        // input image
+        if (argc <= 1)
+        {
+            cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray] [-sigma <sigma>]" << endl << "\t Default Value of sigma = 2.0" << endl;
+            return 1;
+        }
+        string image = "";
+        if (!getParam("i", image, argc, argv))
+        {
+            cerr << "ERROR: no image specified" << endl;
+            return 1;
+        }
+    #endif
+
+        // number of computation repetitions to get a better run time measurement
+        int repeats = 1;
+        getParam("repeats", repeats, argc, argv);
+        if (repeats < 1) repeats = 1;  // without at least one pass imgOut would never be written
+        cout << "repeats: " << repeats << endl;
+
+        // load the input image as grayscale if "-gray" is specified
+        bool gray = false;
+        getParam("gray", gray, argc, argv);
+        cout << "gray: " << gray << endl;
+
+        float sigma = 2.0;
+        getParam("sigma", sigma, argc, argv);
+        cout << "sigma = " << sigma << endl;
+
+        // Kernel geometry. It is validated before anything is allocated, so the
+        // error paths below have nothing to clean up. The comparison is written
+        // so that a NaN sigma is rejected as well.
+        if (!(sigma > 0))
+        {
+            cerr << "ERROR: sigma must be positive" << endl;
+            return 1;
+        }
+        const float radF = ceil(3 * sigma);
+        if (2 * radF > MAX_KERNEL_WIDTH)
+        {
+            cout << "Kernel Size larger than allocated space in the GPU constant memory!!" << endl;
+            return -1;
+        }
+        const int rad = (int)radF;  // kernel radius
+        const int kw = 2 * rad;     // kernel width
+
+        // Init camera / Load input image
+    #ifdef CAMERA
+
+        // Init camera
+        cv::VideoCapture camera(0);
+        if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
+        int camW = 640;
+        int camH = 480;
+        camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
+        camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
+        // read in first frame to get the dimensions
+        cv::Mat mIn;
+        camera >> mIn;
+
+    #else
+
+        // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
+        cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+        // check
+        if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
+
+    #endif
+
+        // convert to float representation (opencv loads image values as single bytes by default)
+        mIn.convertTo(mIn,CV_32F);
+        // convert range of each channel to [0,1] (opencv default is [0,255])
+        mIn /= 255.f;
+        // get image dimensions
+        int w = mIn.cols;         // width
+        int h = mIn.rows;         // height
+        int nc = mIn.channels();  // number of channels
+        cout << "image: " << w << " x " << h << endl;
+
+        // Output image: same size and number of channels as the input image
+        cv::Mat mOut(h,w,mIn.type());
+
+        // Raw layered arrays for the computation; the result in imgOut is
+        // converted to mOut for displaying and saving.
+        float *imgIn  = new float[(size_t)w*h*nc];
+        float *imgOut = new float[(size_t)w*h*mOut.channels()];
+
+        float *kernel =  new float[(size_t)kw * kw];     // kernel used for the convolution
+        float *kernelOut = new float[(size_t)kw * kw];   // kernel to be displayed
+
+        // Computation of the kernel, row-major with row stride kw. Entry
+        // (ix, iy) holds the Gaussian at offset (ix - rad, iy - rad). The
+        // constant factor cancels in both normalizations below and is kept
+        // only to match the formula of the Gaussian.
+        const float c = 1.f / (2.f * 3.14159265f * sigma * sigma);
+        float sum = 0.;
+        float kmax = 0.;
+        for (int iy = 0; iy < kw; iy++)
+        {
+            const float a = iy - rad;
+            for (int ix = 0; ix < kw; ix++)
+            {
+                const float b = ix - rad;
+                const float value = c * exp(-(a*a + b*b) / (2 * sigma*sigma));
+                kernel[ix + (iy * kw)] = value;
+                kmax = max(kmax, value);
+                sum += value;
+            }
+        }
+
+        // Normalization: the displayed kernel gets maximum 1, the kernel used
+        // for the convolution gets sum 1.
+        for (int k = 0; k < kw * kw; k++)
+        {
+            kernelOut[k] = kernel[k] / kmax;
+            kernel[k] = kernel[k] / sum;
+        }
+
+        // Display Kernel
+        cv::Mat cvKernelOut(kw, kw, CV_32F);
+        convert_layered_to_mat(cvKernelOut, kernelOut);
+        showImage("Kernel", cvKernelOut, 100, 10);
+
+
+        // For camera mode: Make a loop to read in camera frames
+    #ifdef CAMERA
+        // Read a camera image frame every 30 milliseconds:
+        // cv::waitKey(30) waits 30 milliseconds for a keyboard input,
+        // returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
+        while (cv::waitKey(30) < 0)
+        {
+        // Get camera image
+        camera >> mIn;
+        // convert to float representation (opencv loads image values as single bytes by default)
+        mIn.convertTo(mIn,CV_32F);
+        // convert range of each channel to [0,1] (opencv default is [0,255])
+        mIn /= 255.f;
+    #endif
+
+        // Init raw input image array
+        // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+        // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+        // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+        convert_mat_to_layered (imgIn, mIn);
+
+        Timer timer;
+        float t;
+
+        #ifdef USING_GPU
+        timer.start();
+
+        // Repetitions Loop
+        for(int rep = 0; rep < repeats; rep++)
+        {
+            const size_t count = (size_t)w * h * nc;
+
+            // Thread Dimensions: one thread per pixel and channel
+            dim3 block = dim3(16, 8, nc);
+            dim3 grid = dim3((w + block.x - 1) / block.x, (h + block.y - 1) / block.y, 1);
+
+            // Allocating memory on the device
+            float *d_imgIn = NULL;
+            float *d_imgOut = NULL;
+            cudaMalloc(&d_imgIn, count * sizeof(float));  CUDA_CHECK;
+            cudaMalloc(&d_imgOut, count * sizeof(float));  CUDA_CHECK;
+
+            // Copying the input image to the device and the kernel to constant memory
+            cudaMemcpy(d_imgIn, imgIn, count * sizeof(float), cudaMemcpyHostToDevice);  CUDA_CHECK;
+            cudaMemcpyToSymbol(constKernel, kernel, (size_t)kw * kw * sizeof(float));  CUDA_CHECK;
+
+            // Calling Kernel
+            callKernel <<< grid, block >>> (d_imgIn, d_imgOut, rad, w, h, nc);  CUDA_CHECK;
+
+            // Copying result back
+            cudaMemcpy(imgOut, d_imgOut, count * sizeof(float), cudaMemcpyDeviceToHost);  CUDA_CHECK;
+
+            // Freeing Memory
+            cudaFree(d_imgIn);  CUDA_CHECK;
+            cudaFree(d_imgOut);  CUDA_CHECK;
+        }
+
+        timer.end();
+        t = timer.get();  // elapsed time in seconds
+
+        #else // USING_GPU
+        // CPU Implementation
+
+        timer.start();
+
+        // Repetitions Loop
+        for(int rep = 0; rep < repeats; rep++)
+        {
+            for(int iz = 0; iz < nc; iz++)
+            {
+                const size_t layer = (size_t)iz * w * h;             // offset of channel iz
+                for(int iy = 0; iy < h; iy++)
+                {
+                    for(int ix = 0; ix < w; ix++)
+                    {
+                        float value = 0;
+                        for(int j = -rad; j < rad; j++)              // for each row in kernel
+                        {
+                            const int iny = max(0, min(iy+j, h-1));  // clamp to the image border
+                            for(int i = -rad; i < rad; i++)          // for each element in the kernel row
+                            {
+                                const int inx = max(0, min(ix+i, w-1));
+                                // The kernel rows are kw entries apart.
+                                value += imgIn[layer + inx + (size_t)iny * w] * kernel[(i+rad) + ((j+rad) * kw)];
+                            }
+                        }
+                        imgOut[layer + ix + (size_t)iy * w] = value;
+                    }
+                }
+            }
+        }
+
+        timer.end();
+        t = timer.get();  // elapsed time in seconds
+
+        #endif
+
+        cout << "time: " << t*1000 << " ms" << endl;
+
+        // show input image
+        showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
+
+        // show output image: first convert to interleaved opencv format from the layered raw array
+        convert_layered_to_mat(mOut, imgOut);
+        showImage("Output", mOut, 100+w+40, 100);
+
+    #ifdef CAMERA
+        // end of camera loop
+        }
+    #else
+        // wait for key inputs
+        cv::waitKey(0);
+    #endif
+
+        // save input and result
+        cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
+        cv::imwrite("image_result.png",mOut*255.f);
+
+        // free allocated arrays
+        delete[] imgIn;
+        delete[] imgOut;
+        delete[] kernel;
+        delete[] kernelOut;
+
+        // close all opencv windows
+        cvDestroyAllWindows();
+        return 0;
+    }
+
+
+