adding submission for exercise 7

fb855284 · Ravikishore · f7960fc2 · fb855284 · fb855284 · fb855284
Commit fb855284 authored Mar 05, 2014 by Ravikishore
16 changed files
--- a/submission/ex7/shared_memory+constant_kernel/Makefile
+++ b/submission/ex7/shared_memory+constant_kernel/Makefile
+main: main.cu aux.cu aux.h Makefile
+	nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
--- a/submission/ex7/shared_memory+constant_kernel/aux.cu
+++ b/submission/ex7/shared_memory+constant_kernel/aux.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#include "aux.h"
+#include <cstdlib>
+#include <iostream>
+using std::stringstream;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::string;
+// parameter processing: template specialization for T=bool
+// Boolean options may be given as a bare flag ("-gray") or with an explicit
+// value ("-gray 1"). Returns true iff the option was found and, when a value
+// is present, the value parsed successfully.
+template<>
+bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
+{
+    // Scan from the last argument to the first, so the last occurrence is examined first
+    for (int idx = argc - 1; idx >= 1; --idx)
+    {
+        const char *arg = argv[idx];
+        if (arg[0] != '-' || strcmp(arg + 1, param.c_str()) != 0) continue;
+        // No following argument, or the following one is another option: bare flag means "true"
+        const bool hasValue = (idx + 1 < argc) && (argv[idx + 1][0] != '-');
+        if (!hasValue)
+        {
+            var = true;
+            return true;
+        }
+        std::stringstream parser;
+        parser << argv[idx + 1];
+        parser >> var;
+        return (bool)parser;
+    }
+    return false;
+}
+// opencv helpers
+// Converts a layered float image (all values of channel 0, then channel 1, ...)
+// into an interleaved one (all channels of pixel 0, then of pixel 1, ...).
+// The channel order is reversed on the way: layered channel c ends up in
+// interleaved slot nc-1-c.
+//   aOut: destination buffer, w*h*nc floats
+//   aIn:  source buffer, w*h*nc floats
+void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Pixel count is formed in size_t so that w*h cannot overflow int,
+    // including on the single-channel fast path below.
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[(nc-1-c) + nc*pixel] = aIn[pixel + nOmega*c];
+            }
+        }
+    }
+}
+// Writes the layered array aIn into the data buffer of mOut (accessed as float);
+// width, height and channel count are taken from mOut itself.
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
+{
+    float *dst = (float*)mOut.data;
+    const int width = mOut.cols;
+    const int height = mOut.rows;
+    const int channels = mOut.channels();
+    convert_layered_to_interleaved(dst, aIn, width, height, channels);
+}
+// Converts an interleaved float image (all channels of pixel 0, then of pixel 1, ...)
+// into a layered one (all values of channel 0, then channel 1, ...).
+// The channel order is reversed on the way: interleaved slot nc-1-c ends up in
+// layered channel c.
+//   aOut: destination buffer, w*h*nc floats
+//   aIn:  source buffer, w*h*nc floats
+void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Pixel count is formed in size_t so that w*h cannot overflow int,
+    // including on the single-channel fast path below.
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[pixel + nOmega*c] = aIn[(nc-1-c) + nc*pixel];
+            }
+        }
+    }
+}
+// Reads the data buffer of mIn (accessed as float) and stores it in layered
+// form in aOut; width, height and channel count are taken from mIn.
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
+{
+    const float *src = (float*)mIn.data;
+    const int width = mIn.cols;
+    const int height = mIn.rows;
+    const int channels = mIn.channels();
+    convert_interleaved_to_layered(aOut, src, width, height, channels);
+}
+// Creates a window named `title` with CV_WINDOW_AUTOSIZE, moves it to (x, y)
+// and shows `mat` in it.
+void showImage(string title, const cv::Mat &mat, int x, int y)
+{
+    const char *name = title.c_str();
+    cv::namedWindow(name, CV_WINDOW_AUTOSIZE);
+    cvMoveWindow(name, x, y);
+    cv::imshow(name, mat);
+}
+// adding Gaussian noise
+// Returns one Box-Muller sample built from two rand() draws, scaled by sigma.
+float noise(float sigma)
+{
+    // Two uniform draws in [0,1]
+    const float u1 = (float)rand()/RAND_MAX;
+    const float u2 = (float)rand()/RAND_MAX;
+    // The first draw is bounded below by 1e-6 so that log() stays finite
+    const float radius = sqrtf(-2*log(std::max(u1,0.000001f)));
+    const float phase = cosf(2*M_PI*u2);
+    return sigma * radius * phase;
+}
+// Adds an independent noise(sigma) sample to every value of m
+// (cols * rows * channels values, accessed as float).
+void addNoise(cv::Mat &m, float sigma)
+{
+    const size_t count = (size_t)m.cols * m.rows * m.channels();
+    float *p = (float*)m.data;
+    for (float *end = p + count; p != end; ++p)
+    {
+        *p += noise(sigma);
+    }
+}
+// cuda error checking
+// Location of the most recent cuda_check() call that found no error
+string prev_file = "";
+int prev_line = 0;
+// Queries cudaGetLastError(). On success the call site is remembered; on
+// failure the error, the failing call site and the previous successful one
+// are printed and the program exits with status 1.
+void cuda_check(string file, int line)
+{
+    const cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+    {
+        prev_file = file;
+        prev_line = line;
+        return;
+    }
+    cout << endl << file << ", line " << line << ": " << cudaGetErrorString(status) << " (" << status << ")" << endl;
+    if (prev_line>0)
+    {
+        cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
+    }
+    exit(1);
+}
--- a/submission/ex7/shared_memory+constant_kernel/aux.h
+++ b/submission/ex7/shared_memory+constant_kernel/aux.h
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#ifndef AUX_H
+#define AUX_H
+#include <cuda_runtime.h>
+#include <ctime>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <sstream>
+// parameter processing
+// Looks for "-<param> <value>" on the command line, scanning from the last
+// argument to the first, and parses <value> into var with operator>>.
+// Returns true iff a matching option was found and its value parsed.
+// A matching option that is the very last argument (no value follows) is skipped.
+template<typename T>
+bool getParam(std::string param, T &var, int argc, char **argv)
+{
+    for (int idx = argc - 1; idx >= 1; --idx)
+    {
+        const char *arg = argv[idx];
+        if (arg[0] != '-' || strcmp(arg + 1, param.c_str()) != 0) continue;
+        if (idx + 1 >= argc) continue;
+        std::stringstream parser;
+        parser << argv[idx + 1];
+        parser >> var;
+        return (bool)parser;
+    }
+    return false;
+}
+// opencv helpers
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
+void showImage(std::string title, const cv::Mat &mat, int x, int y);
+// adding Gaussian noise
+void addNoise(cv::Mat &m, float sigma);
+// measuring time
+// Stopwatch based on clock(). end() calls cudaDeviceSynchronize() before
+// reading the clock. Note that end() on a timer that is not running resets
+// the stored time to 0.
+class Timer
+{
+    public:
+	Timer() : tStart(0), running(false), sec(0.f) {}
+	// Records the current clock() value and marks the timer as running
+	void start()
+	{
+		tStart = clock();
+		running = true;
+	}
+	// Stops a running timer and stores the elapsed seconds
+	void end()
+	{
+		if (running)
+		{
+			cudaDeviceSynchronize();
+			const clock_t tEnd = clock();
+			sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
+			running = false;
+		}
+		else
+		{
+			sec = 0;
+		}
+	}
+	// Returns the stored seconds, stopping the timer first if it is still running
+	float get()
+	{
+		if (running)
+		{
+			end();
+		}
+		return sec;
+	}
+    private:
+	clock_t tStart;
+	bool running;
+	float sec;
+};
+// cuda error checking
+#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
+void cuda_check(std::string file, int line);
+#endif  // AUX_H
--- a/submission/ex7/shared_memory+constant_kernel/main.cu
+++ b/submission/ex7/shared_memory+constant_kernel/main.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ###
+// ###
+// ### TODO: For every student of your group, please provide here:
+// ###
+// ### name, email, login username (for example p123)
+// ###
+// ###
+#include "aux.h"
+#include <iostream>
+#include <opencv2/imgproc/imgproc.hpp>
+using namespace std;
+#define MAX_KERNEL_WIDTH 64
+__constant__ float constKernel[MAX_KERNEL_WIDTH * MAX_KERNEL_WIDTH];
+// uncomment to use the camera
+//#define CAMERA
+// Smaller of a and b (b when neither is smaller); callable from host and device.
+template<typename T>
+__device__ __host__ T min(T a, T b)
+{
+    if (a < b) return a;
+    return b;
+}
+// Larger of a and b (b when neither is larger); callable from host and device.
+template<typename T>
+__device__ __host__ T max(T a, T b)
+{
+    if (a > b) return a;
+    return b;
+}
+// Restricts x to the range [m, M]: first caps it at M, then raises it to at least m.
+template<typename T>
+__device__ __host__ T clamp(T m, T x, T M)
+{
+    T capped = min(x, M);
+    return max(m, capped);
+}
+// 2D convolution of one image channel with the filter stored in constKernel.
+// Each block first stages its input tile (block area plus a halo of r pixels on
+// every side, coordinates clamped to the image) in shared memory.
+//
+// Launch requirements:
+//   - 2D blocks; the grid must cover at least w x h threads
+//   - dynamic shared memory: (blockDim.x + 2*r) * (blockDim.y + 2*r) * sizeof(float)
+//   - constKernel must hold (2*r+1)*(2*r+1) coefficients, indexed kx + (2*r+1)*ky
+//
+//   in  : input channel, w*h floats (read only)
+//   out : output channel, w*h floats
+//   r   : kernel radius
+__global__ void convolution(const float *in, float *out, int w, int h, int r)
+{
+    const int ksize = 2*r + 1;
+    const int x = threadIdx.x + blockDim.x * blockIdx.x;
+    const int y = threadIdx.y + blockDim.y * blockIdx.y;
+    // Load 'in' to shared memory
+    extern __shared__ float s_in[];
+    const int nThreads = blockDim.x * blockDim.y;
+    const int threadId = threadIdx.x + blockDim.x * threadIdx.y;
+    const int V = blockDim.x + 2*r;   // tile width
+    const int G = blockDim.y + 2*r;   // tile height
+    const int smLength = V * G;
+    // Image coordinates of the tile's top-left corner. Converted to signed int
+    // before subtracting r: the built-in index vectors are unsigned and this
+    // value is negative for blocks at the left/top border.
+    const int x0 = (int)(blockDim.x * blockIdx.x) - r;
+    const int y0 = (int)(blockDim.y * blockIdx.y) - r;
+    // All threads of the block, including those outside the image, take part in the load
+    for (int i = threadId; i < smLength; i += nThreads) {
+        const int rx = i % V;
+        const int ry = i / V;
+        const int cx = clamp<int>(0, x0 + rx, w-1);
+        const int cy = clamp<int>(0, y0 + ry, h-1);
+        s_in[i] = in[cx + w*cy];
+    }
+    // The barrier is outside any branch so that every thread of the block reaches it
+    __syncthreads();
+    // Do the job!
+    if (x < w && y < h) {
+        float value = 0.f;
+        for (int ky = 0; ky < ksize; ky++) {
+            const int ry = threadIdx.y + ky;
+            for (int kx = 0; kx < ksize; kx++) {
+                const int rx = threadIdx.x + kx;
+                value += constKernel[kx + ksize*ky] * s_in[rx + V*ry];
+            }
+        }
+        out[x + w*y] = value;
+    }
+}
+// Integer division of n by b with the numerator first raised by b-1
+// (rounds up for non-negative n and positive b).
+inline int div_ceil(int n, int b)
+{
+    const int raised = n + b - 1;
+    return raised / b;
+}
+// Number of blocks of size `block` needed to cover `whole`, per dimension.
+inline dim3 make_grid(dim3 whole, dim3 block)
+{
+    dim3 grid;
+    grid.x = div_ceil(whole.x, block.x);
+    grid.y = div_ceil(whole.y, block.y);
+    grid.z = div_ceil(whole.z, block.z);
+    return grid;
+}
+// Loads an image (or camera frames when CAMERA is defined), convolves every
+// channel with a normalized Gaussian kernel of radius ceil(3*sigma) held in
+// constant memory, then shows and saves the input and the result.
+// Options read: -i <image>, -repeats <n>, -gray, -sigma <sigma>.
+// Returns 0 on success and 1 on invalid input; CUDA errors end the program via CUDA_CHECK.
+int main(int argc, char **argv)
+{
+    // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
+    // This happens on the very first call to a CUDA function, and takes some time (around half a second)
+    // We will do it right here, so that the run time measurements are accurate
+    cudaDeviceSynchronize();  CUDA_CHECK;
+    // Reading command line parameters:
+    // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
+    // If "-param" is not specified, the value of "var" remains unchanged
+    //
+    // return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
+#ifdef CAMERA
+#else
+    // input image
+    string image = "";
+    bool ret = getParam("i", image, argc, argv);
+    if (!ret) cerr << "ERROR: no image specified" << endl;
+    if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
+#endif
+    // number of computation repetitions to get a better run time measurement
+    int repeats = 1;
+    getParam("repeats", repeats, argc, argv);
+    // At least one pass is required: otherwise imgOut is never written and the
+    // average time below would divide by zero
+    if (repeats < 1) repeats = 1;
+    cout << "repeats: " << repeats << endl;
+    // load the input image as grayscale if "-gray" is specified
+    bool gray = false;
+    getParam("gray", gray, argc, argv);
+    cout << "gray: " << gray << endl;
+    // ### Define your own parameters here as needed
+    float sigma = 2.0f;
+    getParam("sigma", sigma, argc, argv);
+    cout << "sigma: " << sigma << endl;
+    // sigma <= 0 (or NaN) would yield a NaN kernel or a negative kernel size
+    if (!(sigma > 0.f)) { cerr << "ERROR: sigma must be positive" << endl; return 1; }
+    // Init camera / Load input image
+#ifdef CAMERA
+    // Init camera
+    cv::VideoCapture camera(0);
+    if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
+    int camW = 640;
+    int camH = 480;
+    camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
+    camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
+    // read in first frame to get the dimensions
+    cv::Mat mIn;
+    camera >> mIn;
+#else
+    // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
+    cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+    // check
+    if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
+#endif
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+    // get image dimensions
+    int w = mIn.cols;         // width
+    int h = mIn.rows;         // height
+    int nc = mIn.channels();  // number of channels
+    cout << "image: " << w << " x " << h << endl;
+    // Set the output image format
+    cv::Mat mOut(h,w,mIn.type());  // mOut will have the same number of channels as the input image, nc layers
+    // ### Define your own output images here as needed
+    // Size of the kernel
+    int r = ceil(3 * sigma);
+    int ksize = 2*r + 1;
+    // constKernel only has room for MAX_KERNEL_WIDTH x MAX_KERNEL_WIDTH coefficients
+    if (ksize > MAX_KERNEL_WIDTH) {
+        cerr << "ERROR: kernel width " << ksize << " exceeds the maximum of " << MAX_KERNEL_WIDTH
+             << ", use a smaller sigma" << endl;
+        return 1;
+    }
+    // Unnormalized Gaussian samples, kern[i*ksize + j] for offsets (i-r, j-r)
+    float *kern = new float[ksize * ksize];
+    for (int i = 0; i < ksize; i++) {
+        double a = i - r;
+        for (int j = 0; j < ksize; j++) {
+            double b = j - r;
+            kern[i*ksize + j] = exp(-(a*a + b*b) / (2 * sigma*sigma))
+                                / (2 * M_PI * sigma*sigma);
+        }
+    }
+    float kernMax = 0;
+    float kernSum = 0;
+    for (int i = 0; i < ksize; i++) {
+        for (int j = 0; j < ksize; j++) {
+            kernSum += kern[i*ksize + j];
+            kernMax = std::max(kernMax, kern[i*ksize + j]);
+        }
+    }
+    // kernOut: kernel scaled to a maximum of 1 for display; kern: scaled to sum 1 for filtering
+    float *kernOut = new float[ksize * ksize];
+    for (int i = 0; i < ksize; i++) {
+        for (int j = 0; j < ksize; j++) {
+            kernOut[i*ksize + j] = kern[i*ksize + j] / kernMax;
+            kern[i*ksize + j] /= kernSum;
+        }
+    }
+    cv::Mat mKernOut(ksize, ksize, CV_32F);
+    convert_layered_to_mat(mKernOut, kernOut);
+    // Allocate arrays
+    // input/output image width: w
+    // input/output image height: h
+    // input image number of channels: nc
+    // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
+    // allocate raw input image array
+    float *imgIn  = new float[(size_t)w*h*nc];
+    // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
+    float *imgOut = new float[(size_t)w*h*mOut.channels()];
+    dim3 block(16, 16, 1);
+    dim3 grid = make_grid(dim3(w, h, 1), block);
+    // one float per tile element: block area plus a halo of r on every side
+    size_t smBytes = (block.x + 2*r) * (block.y + 2*r) * sizeof(float);
+    cout << "shared memory: " << smBytes << " bytes" << endl;
+    // For camera mode: Make a loop to read in camera frames
+#ifdef CAMERA
+    // Read a camera image frame every 30 milliseconds:
+    // cv::waitKey(30) waits 30 milliseconds for a keyboard input,
+    // returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
+    while (cv::waitKey(30) < 0)
+    {
+    // Get camera image
+    camera >> mIn;
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+#endif
+    // Init raw input image array
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    convert_mat_to_layered (imgIn, mIn);
+    Timer timer; timer.start();
+    for (int measurement = 0; measurement < repeats; measurement++) {
+//#define CPU
+#ifdef CPU
+    for (int c = 0; c < nc; c++) {
+        for (int y = 0; y < h; y++) {
+            for (int x = 0; x < w; x++) {
+                float value = 0;
+                for (int ky = 0; ky < ksize; ky++) {
+                    int cy = clamp(0, y + ky - r, h-1);
+                    for (int kx = 0; kx < ksize; kx++) {
+                        int cx = clamp(0, x + kx - r, w-1);
+                        value += kern[kx + ksize*ky] * imgIn[cx + w*cy + w*h*c];
+                    }
+                }
+                imgOut[x + w*y + w*h*c] = value;
+            }
+        }
+    }
+#else
+    // NOTE(review): allocation and transfers are inside the timed loop, so the
+    // reported time includes them and not only the kernel
+    float *d_in, *d_out;
+    size_t pixels = (size_t)w*h;
+    size_t nbytes = (size_t)w*h*nc*sizeof(float);
+    cudaMalloc(&d_in, nbytes);  CUDA_CHECK;
+    cudaMalloc(&d_out, nbytes);  CUDA_CHECK;
+    cudaMemcpy(d_in, imgIn, nbytes, cudaMemcpyHostToDevice);  CUDA_CHECK;
+    cudaMemcpyToSymbol(constKernel, kern, (size_t)ksize*ksize*sizeof(float));  CUDA_CHECK;
+    // one launch per channel; a launch failure (e.g. too much shared memory) is reported here
+    for (int c = 0; c < nc; c++) {
+        convolution<<<grid, block, smBytes>>>(d_in + c*pixels, d_out + c*pixels, w, h, r);  CUDA_CHECK;
+    }
+    cudaMemcpy(imgOut, d_out, nbytes, cudaMemcpyDeviceToHost);  CUDA_CHECK;
+    cudaFree(d_in);  CUDA_CHECK;
+    cudaFree(d_out);  CUDA_CHECK;
+#endif
+    }
+    timer.end();  float t = timer.get();  // elapsed time in seconds
+    cout << "time: " << (t / repeats)*1000 << " ms" << endl;
+    // show input image
+    showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
+    // show output image: first convert to interleaved opencv format from the layered raw array
+    convert_layered_to_mat(mOut, imgOut);
+    showImage("Output", mOut, 100+w+40, 100);
+    // show kernel
+    showImage("Kernel", mKernOut, 100+2*(w+40), 100);
+    // ### Display your own output images here as needed
+#ifdef CAMERA
+    // end of camera loop
+    }
+#else
+    // wait for key inputs
+    cv::waitKey(0);
+#endif
+    // save input and result
+    cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
+    cv::imwrite("image_result.png",mOut*255.f);
+    // free allocated arrays
+    delete[] imgIn;
+    delete[] imgOut;
+    delete[] kern;
+    delete[] kernOut;
+    // close all opencv windows
+    cvDestroyAllWindows();
+    return 0;
+}
--- a/submission/ex7/shared_memory/Makefile
+++ b/submission/ex7/shared_memory/Makefile
+main: main.cu aux.cu aux.h Makefile
+	nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
--- a/submission/ex7/shared_memory/aux.cu
+++ b/submission/ex7/shared_memory/aux.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#include "aux.h"
+#include <cstdlib>
+#include <iostream>
+using std::stringstream;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::string;
+// parameter processing: template specialization for T=bool
+// Boolean options may be given as a bare flag ("-gray") or with an explicit
+// value ("-gray 1"). Returns true iff the option was found and, when a value
+// is present, the value parsed successfully.
+template<>
+bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
+{
+    // Scan from the last argument to the first, so the last occurrence is examined first
+    for (int idx = argc - 1; idx >= 1; --idx)
+    {
+        const char *arg = argv[idx];
+        if (arg[0] != '-' || strcmp(arg + 1, param.c_str()) != 0) continue;
+        // No following argument, or the following one is another option: bare flag means "true"
+        const bool hasValue = (idx + 1 < argc) && (argv[idx + 1][0] != '-');
+        if (!hasValue)
+        {
+            var = true;
+            return true;
+        }
+        std::stringstream parser;
+        parser << argv[idx + 1];
+        parser >> var;
+        return (bool)parser;
+    }
+    return false;
+}
+// opencv helpers
+// Converts a layered float image (all values of channel 0, then channel 1, ...)
+// into an interleaved one (all channels of pixel 0, then of pixel 1, ...).
+// The channel order is reversed on the way: layered channel c ends up in
+// interleaved slot nc-1-c.
+//   aOut: destination buffer, w*h*nc floats
+//   aIn:  source buffer, w*h*nc floats
+void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Pixel count is formed in size_t so that w*h cannot overflow int,
+    // including on the single-channel fast path below.
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[(nc-1-c) + nc*pixel] = aIn[pixel + nOmega*c];
+            }
+        }
+    }
+}
+// Writes the layered array aIn into the data buffer of mOut (accessed as float);
+// width, height and channel count are taken from mOut itself.
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
+{
+    float *dst = (float*)mOut.data;
+    const int width = mOut.cols;
+    const int height = mOut.rows;
+    const int channels = mOut.channels();
+    convert_layered_to_interleaved(dst, aIn, width, height, channels);
+}
+// Converts an interleaved float image (all channels of pixel 0, then of pixel 1, ...)
+// into a layered one (all values of channel 0, then channel 1, ...).
+// The channel order is reversed on the way: interleaved slot nc-1-c ends up in
+// layered channel c.
+//   aOut: destination buffer, w*h*nc floats
+//   aIn:  source buffer, w*h*nc floats
+void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Pixel count is formed in size_t so that w*h cannot overflow int,
+    // including on the single-channel fast path below.
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[pixel + nOmega*c] = aIn[(nc-1-c) + nc*pixel];
+            }
+        }
+    }
+}
+// Reads the data buffer of mIn (accessed as float) and stores it in layered
+// form in aOut; width, height and channel count are taken from mIn.
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
+{
+    const float *src = (float*)mIn.data;
+    const int width = mIn.cols;
+    const int height = mIn.rows;
+    const int channels = mIn.channels();
+    convert_interleaved_to_layered(aOut, src, width, height, channels);
+}
+// Creates a window named `title` with CV_WINDOW_AUTOSIZE, moves it to (x, y)
+// and shows `mat` in it.
+void showImage(string title, const cv::Mat &mat, int x, int y)
+{
+    const char *name = title.c_str();
+    cv::namedWindow(name, CV_WINDOW_AUTOSIZE);
+    cvMoveWindow(name, x, y);
+    cv::imshow(name, mat);
+}
+// adding Gaussian noise
+// Returns one Box-Muller sample built from two rand() draws, scaled by sigma.
+float noise(float sigma)
+{
+    // Two uniform draws in [0,1]
+    const float u1 = (float)rand()/RAND_MAX;
+    const float u2 = (float)rand()/RAND_MAX;
+    // The first draw is bounded below by 1e-6 so that log() stays finite
+    const float radius = sqrtf(-2*log(std::max(u1,0.000001f)));
+    const float phase = cosf(2*M_PI*u2);
+    return sigma * radius * phase;
+}
+// Adds an independent noise(sigma) sample to every value of m
+// (cols * rows * channels values, accessed as float).
+void addNoise(cv::Mat &m, float sigma)
+{
+    const size_t count = (size_t)m.cols * m.rows * m.channels();
+    float *p = (float*)m.data;
+    for (float *end = p + count; p != end; ++p)
+    {
+        *p += noise(sigma);
+    }
+}
+// cuda error checking
+// Location of the most recent cuda_check() call that found no error
+string prev_file = "";
+int prev_line = 0;
+// Queries cudaGetLastError(). On success the call site is remembered; on
+// failure the error, the failing call site and the previous successful one
+// are printed and the program exits with status 1.
+void cuda_check(string file, int line)
+{
+    const cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+    {
+        prev_file = file;
+        prev_line = line;
+        return;
+    }
+    cout << endl << file << ", line " << line << ": " << cudaGetErrorString(status) << " (" << status << ")" << endl;
+    if (prev_line>0)
+    {
+        cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
+    }
+    exit(1);
+}
--- a/submission/ex7/shared_memory/aux.h
+++ b/submission/ex7/shared_memory/aux.h
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#ifndef AUX_H
+#define AUX_H
+#include <cuda_runtime.h>
+#include <ctime>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <sstream>
+// parameter processing
+// Looks for "-<param> <value>" on the command line, scanning from the last
+// argument to the first, and parses <value> into var with operator>>.
+// Returns true iff a matching option was found and its value parsed.
+// A matching option that is the very last argument (no value follows) is skipped.
+template<typename T>
+bool getParam(std::string param, T &var, int argc, char **argv)
+{
+    for (int idx = argc - 1; idx >= 1; --idx)
+    {
+        const char *arg = argv[idx];
+        if (arg[0] != '-' || strcmp(arg + 1, param.c_str()) != 0) continue;
+        if (idx + 1 >= argc) continue;
+        std::stringstream parser;
+        parser << argv[idx + 1];
+        parser >> var;
+        return (bool)parser;
+    }
+    return false;
+}
+// opencv helpers
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
+void showImage(std::string title, const cv::Mat &mat, int x, int y);
+// adding Gaussian noise
+void addNoise(cv::Mat &m, float sigma);
+// measuring time
+// Stopwatch based on clock(). end() calls cudaDeviceSynchronize() before
+// reading the clock. Note that end() on a timer that is not running resets
+// the stored time to 0.
+class Timer
+{
+    public:
+	Timer() : tStart(0), running(false), sec(0.f) {}
+	// Records the current clock() value and marks the timer as running
+	void start()
+	{
+		tStart = clock();
+		running = true;
+	}
+	// Stops a running timer and stores the elapsed seconds
+	void end()
+	{
+		if (running)
+		{
+			cudaDeviceSynchronize();
+			const clock_t tEnd = clock();
+			sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
+			running = false;
+		}
+		else
+		{
+			sec = 0;
+		}
+	}
+	// Returns the stored seconds, stopping the timer first if it is still running
+	float get()
+	{
+		if (running)
+		{
+			end();
+		}
+		return sec;
+	}
+    private:
+	clock_t tStart;
+	bool running;
+	float sec;
+};
+// cuda error checking
+#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
+void cuda_check(std::string file, int line);
+#endif  // AUX_H
--- a/submission/ex7/shared_memory/main.cu
+++ b/submission/ex7/shared_memory/main.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ###
+// ###
+// ### TODO: For every student of your group, please provide here:
+// ###
+// ### name, email, login username (for example p123)
+// ###
+// ###
+#include "aux.h"
+#include <iostream>
+#include <opencv2/imgproc/imgproc.hpp>
+using namespace std;
+// uncomment to use the camera
+//#define CAMERA
+// Smaller of a and b (b when neither is smaller); callable from host and device.
+template<typename T>
+__device__ __host__ T min(T a, T b)
+{
+    if (a < b) return a;
+    return b;
+}
+// Larger of a and b (b when neither is larger); callable from host and device.
+template<typename T>
+__device__ __host__ T max(T a, T b)
+{
+    if (a > b) return a;
+    return b;
+}
+// Restricts x to the range [m, M]: first caps it at M, then raises it to at least m.
+template<typename T>
+__device__ __host__ T clamp(T m, T x, T M)
+{
+    T capped = min(x, M);
+    return max(m, capped);
+}
+// 2D convolution of one image channel with the filter `kern`.
+// Each block first stages its input tile (block area plus a halo of r pixels on
+// every side, coordinates clamped to the image) in shared memory.
+//
+// Launch requirements:
+//   - 2D blocks; the grid must cover at least w x h threads
+//   - dynamic shared memory: (blockDim.x + 2*r) * (blockDim.y + 2*r) * sizeof(float)
+//
+//   in   : input channel, w*h floats (read only)
+//   out  : output channel, w*h floats
+//   kern : (2*r+1)*(2*r+1) filter coefficients, indexed kx + (2*r+1)*ky (read only)
+//   r    : kernel radius
+__global__ void convolution(const float *in, float *out, const float *kern, int w, int h, int r)
+{
+    const int ksize = 2*r + 1;
+    const int x = threadIdx.x + blockDim.x * blockIdx.x;
+    const int y = threadIdx.y + blockDim.y * blockIdx.y;
+    // Load 'in' to shared memory
+    extern __shared__ float s_in[];
+    const int nThreads = blockDim.x * blockDim.y;
+    const int threadId = threadIdx.x + blockDim.x * threadIdx.y;
+    const int V = blockDim.x + 2*r;   // tile width
+    const int G = blockDim.y + 2*r;   // tile height
+    const int smLength = V * G;
+    // Image coordinates of the tile's top-left corner. Converted to signed int
+    // before subtracting r: the built-in index vectors are unsigned and this
+    // value is negative for blocks at the left/top border.
+    const int x0 = (int)(blockDim.x * blockIdx.x) - r;
+    const int y0 = (int)(blockDim.y * blockIdx.y) - r;
+    // All threads of the block, including those outside the image, take part in the load
+    for (int i = threadId; i < smLength; i += nThreads) {
+        const int rx = i % V;
+        const int ry = i / V;
+        const int cx = clamp<int>(0, x0 + rx, w-1);
+        const int cy = clamp<int>(0, y0 + ry, h-1);
+        s_in[i] = in[cx + w*cy];
+    }
+    // The barrier is outside any branch so that every thread of the block reaches it
+    __syncthreads();
+    // Do the job!
+    if (x < w && y < h) {
+        float value = 0.f;
+        for (int ky = 0; ky < ksize; ky++) {
+            const int ry = threadIdx.y + ky;
+            for (int kx = 0; kx < ksize; kx++) {
+                const int rx = threadIdx.x + kx;
+                value += kern[kx + ksize*ky] * s_in[rx + V*ry];
+            }
+        }
+        out[x + w*y] = value;
+    }
+}
+// Integer division of n by b with the numerator first raised by b-1
+// (rounds up for non-negative n and positive b).
+inline int div_ceil(int n, int b)
+{
+    const int raised = n + b - 1;
+    return raised / b;
+}
+// Number of blocks of size `block` needed to cover `whole`, per dimension.
+inline dim3 make_grid(dim3 whole, dim3 block)
+{
+    dim3 grid;
+    grid.x = div_ceil(whole.x, block.x);
+    grid.y = div_ceil(whole.y, block.y);
+    grid.z = div_ceil(whole.z, block.z);
+    return grid;
+}
+// Loads an image (or camera frames when CAMERA is defined), convolves every
+// channel with a normalized Gaussian kernel of radius ceil(3*sigma) using the
+// shared-memory kernel, then shows and saves the input and the result.
+// Options read: -i <image>, -repeats <n>, -gray, -sigma <sigma>.
+// Returns 0 on success and 1 on invalid input; CUDA errors end the program via CUDA_CHECK.
+int main(int argc, char **argv)
+{
+    // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
+    // This happens on the very first call to a CUDA function, and takes some time (around half a second)
+    // We will do it right here, so that the run time measurements are accurate
+    cudaDeviceSynchronize();  CUDA_CHECK;
+    // Reading command line parameters:
+    // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
+    // If "-param" is not specified, the value of "var" remains unchanged
+    //
+    // return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
+#ifdef CAMERA
+#else
+    // input image
+    string image = "";
+    bool ret = getParam("i", image, argc, argv);
+    if (!ret) cerr << "ERROR: no image specified" << endl;
+    if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
+#endif
+    // number of computation repetitions to get a better run time measurement
+    int repeats = 1;
+    getParam("repeats", repeats, argc, argv);
+    // At least one pass is required: otherwise imgOut is never written and the
+    // average time below would divide by zero
+    if (repeats < 1) repeats = 1;
+    cout << "repeats: " << repeats << endl;
+    // load the input image as grayscale if "-gray" is specified
+    bool gray = false;
+    getParam("gray", gray, argc, argv);
+    cout << "gray: " << gray << endl;
+    // ### Define your own parameters here as needed
+    float sigma = 2.0f;
+    getParam("sigma", sigma, argc, argv);
+    cout << "sigma: " << sigma << endl;
+    // sigma <= 0 (or NaN) would yield a NaN kernel or a negative kernel size
+    if (!(sigma > 0.f)) { cerr << "ERROR: sigma must be positive" << endl; return 1; }
+    // Init camera / Load input image
+#ifdef CAMERA
+    // Init camera
+    cv::VideoCapture camera(0);
+    if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
+    int camW = 640;
+    int camH = 480;
+    camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
+    camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
+    // read in first frame to get the dimensions
+    cv::Mat mIn;
+    camera >> mIn;
+#else
+    // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
+    cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+    // check
+    if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
+#endif
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+    // get image dimensions
+    int w = mIn.cols;         // width
+    int h = mIn.rows;         // height
+    int nc = mIn.channels();  // number of channels
+    cout << "image: " << w << " x " << h << endl;
+    // Set the output image format
+    cv::Mat mOut(h,w,mIn.type());  // mOut will have the same number of channels as the input image, nc layers
+    // ### Define your own output images here as needed
+    // Size of the kernel
+    int r = ceil(3 * sigma);
+    int ksize = 2*r + 1;
+    // Unnormalized Gaussian samples, kern[i*ksize + j] for offsets (i-r, j-r)
+    float *kern = new float[ksize * ksize];
+    for (int i = 0; i < ksize; i++) {
+        double a = i - r;
+        for (int j = 0; j < ksize; j++) {
+            double b = j - r;
+            kern[i*ksize + j] = exp(-(a*a + b*b) / (2 * sigma*sigma))
+                                / (2 * M_PI * sigma*sigma);
+        }
+    }
+    float kernMax = 0;
+    float kernSum = 0;
+    for (int i = 0; i < ksize; i++) {
+        for (int j = 0; j < ksize; j++) {
+            kernSum += kern[i*ksize + j];
+            kernMax = std::max(kernMax, kern[i*ksize + j]);
+        }
+    }
+    // kernOut: kernel scaled to a maximum of 1 for display; kern: scaled to sum 1 for filtering
+    float *kernOut = new float[ksize * ksize];
+    for (int i = 0; i < ksize; i++) {
+        for (int j = 0; j < ksize; j++) {
+            kernOut[i*ksize + j] = kern[i*ksize + j] / kernMax;
+            kern[i*ksize + j] /= kernSum;
+        }
+    }
+    cv::Mat mKernOut(ksize, ksize, CV_32F);
+    convert_layered_to_mat(mKernOut, kernOut);
+    // Allocate arrays
+    // input/output image width: w
+    // input/output image height: h
+    // input image number of channels: nc
+    // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
+    // allocate raw input image array
+    float *imgIn  = new float[(size_t)w*h*nc];
+    // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
+    float *imgOut = new float[(size_t)w*h*mOut.channels()];
+    // For camera mode: Make a loop to read in camera frames
+#ifdef CAMERA
+    // Read a camera image frame every 30 milliseconds:
+    // cv::waitKey(30) waits 30 milliseconds for a keyboard input,
+    // returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
+    while (cv::waitKey(30) < 0)
+    {
+    // Get camera image
+    camera >> mIn;
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+#endif
+    // Init raw input image array
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    convert_mat_to_layered (imgIn, mIn);
+    dim3 block(16, 16, 1);
+    dim3 grid = make_grid(dim3(w, h, 1), block);
+    // one float per tile element: block area plus a halo of r on every side
+    size_t smBytes = (block.x + 2*r) * (block.y + 2*r) * sizeof(float);
+    cout << "shared memory: " << smBytes << " bytes" << endl;
+    Timer timer; timer.start();
+    for (int measurement = 0; measurement < repeats; measurement++) {
+//#define CPU
+#ifdef CPU
+    for (int c = 0; c < nc; c++) {
+        for (int y = 0; y < h; y++) {
+            for (int x = 0; x < w; x++) {
+                float value = 0;
+                for (int ky = 0; ky < ksize; ky++) {
+                    int cy = clamp(0, y + ky - r, h-1);
+                    for (int kx = 0; kx < ksize; kx++) {
+                        int cx = clamp(0, x + kx - r, w-1);
+                        value += kern[kx + ksize*ky] * imgIn[cx + w*cy + w*h*c];
+                    }
+                }
+                imgOut[x + w*y + w*h*c] = value;
+            }
+        }
+    }
+#else
+    // NOTE(review): allocation and transfers are inside the timed loop, so the
+    // reported time includes them and not only the kernel
+    float *d_in, *d_out, *d_kern;
+    size_t pixels = (size_t)w*h;
+    size_t nbytes = (size_t)w*h*nc*sizeof(float);
+    size_t kernBytes = (size_t)ksize*ksize*sizeof(float);
+    cudaMalloc(&d_in, nbytes);  CUDA_CHECK;
+    cudaMalloc(&d_out, nbytes);  CUDA_CHECK;
+    cudaMalloc(&d_kern, kernBytes);  CUDA_CHECK;
+    cudaMemcpy(d_in, imgIn, nbytes, cudaMemcpyHostToDevice);  CUDA_CHECK;
+    cudaMemcpy(d_kern, kern, kernBytes, cudaMemcpyHostToDevice);  CUDA_CHECK;
+    // one launch per channel; a launch failure (e.g. too much shared memory) is reported here
+    for (int c = 0; c < nc; c++) {
+        convolution<<<grid, block, smBytes>>>(d_in + c*pixels, d_out + c*pixels,
+                                              d_kern, w, h, r);  CUDA_CHECK;
+    }
+    cudaMemcpy(imgOut, d_out, nbytes, cudaMemcpyDeviceToHost);  CUDA_CHECK;
+    cudaFree(d_in);  CUDA_CHECK;
+    cudaFree(d_out);  CUDA_CHECK;
+    cudaFree(d_kern);  CUDA_CHECK;
+#endif
+    }
+    timer.end();  float t = timer.get();  // elapsed time in seconds
+    cout << "time: " << (t / repeats)*1000 << " ms" << endl;
+    // show input image
+    showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
+    // show output image: first convert to interleaved opencv format from the layered raw array
+    convert_layered_to_mat(mOut, imgOut);
+    showImage("Output", mOut, 100+w+40, 100);
+    // show kernel
+    showImage("Kernel", mKernOut, 100+2*(w+40), 100);
+    // ### Display your own output images here as needed
+#ifdef CAMERA
+    // end of camera loop
+    }
+#else
+    // wait for key inputs
+    cv::waitKey(0);
+#endif
+    // save input and result
+    cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
+    cv::imwrite("image_result.png",mOut*255.f);
+    // free allocated arrays
+    delete[] imgIn;
+    delete[] imgOut;
+    delete[] kern;
+    delete[] kernOut;
+    // close all opencv windows
+    cvDestroyAllWindows();
+    return 0;
+}
--- a/submission/ex7/texture/.cproject
+++ b/submission/ex7/texture/.cproject
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+	<storageModule moduleId="org.eclipse.cdt.core.settings">
+		<cconfiguration id="com.nvidia.cuda.ide.toolchain.base.16721254">
+			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.toolchain.base.16721254" moduleId="org.eclipse.cdt.core.settings" name="Default">
+				<externalSettings/>
+				<extensions>
+					<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
+					<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
+					<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
+					<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+					<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
+				</extensions>
+			</storageModule>
+			<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+				<configuration buildProperties="" id="com.nvidia.cuda.ide.toolchain.base.16721254" name="Default" parent="org.eclipse.cdt.build.core.emptycfg">
+					<folderInfo id="com.nvidia.cuda.ide.toolchain.base.16721254.2042018922" name="/" resourcePath="">
+						<toolChain id="com.nvidia.cuda.ide.toolchain.base.109808492" name="com.nvidia.cuda.ide.toolchain.base" superClass="com.nvidia.cuda.ide.toolchain.base">
+							<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.1849903712" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
+							<builder id="com.nvidia.cuda.ide.builder.1340177224" managedBuildOn="false" name="CUDA Toolkit 5.5 Builder.Default" superClass="com.nvidia.cuda.ide.builder"/>
+							<tool id="nvcc.compiler.base.2059283331" name="NVCC Compiler" superClass="nvcc.compiler.base">
+								<option id="nvcc.compiler.pic.844715740" superClass="nvcc.compiler.pic"/>
+							</tool>
+							<tool id="nvcc.linker.base.1594175951" name="NVCC Linker" superClass="nvcc.linker.base"/>
+							<tool id="nvcc.archiver.base.460939642" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
+							<tool id="com.nvidia.host.assembler.1472631319" name="Host Assembler" superClass="com.nvidia.host.assembler"/>
+						</toolChain>
+					</folderInfo>
+				</configuration>
+			</storageModule>
+			<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration"/>
+			<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
+		</cconfiguration>
+	</storageModule>
+	<storageModule moduleId="cdtBuildSystem" version="4.0.0">
+		<project id="ex7_texture.null.1156624956" name="ex7_texture"/>
+	</storageModule>
+	<storageModule moduleId="scannerConfiguration">
+		<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
+</cproject>
--- a/submission/ex7/texture/.project
+++ b/submission/ex7/texture/.project
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ex7_texture</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+	</natures>
+</projectDescription>
--- a/submission/ex7/texture/Makefile
+++ b/submission/ex7/texture/Makefile
+all: main
+main: main.cu aux.cu aux.h Makefile
+	nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
--- a/submission/ex7/texture/aux.cu
+++ b/submission/ex7/texture/aux.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#include "aux.h"
+#include <cstdlib>
+#include <iostream>
+using std::stringstream;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::string;
+// parameter processing: template specialization for T=bool
+// Specialization of getParam for boolean flags.
+// Scans argv from the last argument backwards for "-<param>".
+//   "-<param>" as the final argument, or followed by another "-option": var = true, returns true
+//   "-<param> <value>": <value> is extracted into var through a stringstream,
+//                       and the stream state after extraction is returned
+// Returns false (var untouched) when "-<param>" does not occur at all.
+template<>
+bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
+{
+    const char *wanted = param.c_str();
+    int pos = argc;
+    while (--pos >= 1)
+    {
+        const char *arg = argv[pos];
+        const bool matches = (arg[0] == '-') && (strcmp(arg + 1, wanted) == 0);
+        if (!matches) continue;
+
+        // a value is present only if another argument follows and it is not itself an option
+        const bool has_value = (pos + 1 < argc) && (argv[pos + 1][0] != '-');
+        if (!has_value)
+        {
+            var = true;
+            return true;
+        }
+        std::stringstream parser;
+        parser << argv[pos + 1];
+        parser >> var;
+        return (bool)parser;
+    }
+    return false;
+}
+// opencv helpers
+// Converts a layered image (all pixels of channel 0, then all of channel 1, ...)
+// into an interleaved one (all channels of pixel 0, then of pixel 1, ...).
+// The channel order is reversed on the way: layer c lands in slot nc-1-c of each pixel.
+//   aOut : destination, w*h*nc floats, interleaved
+//   aIn  : source, w*h*nc floats, layered
+//   w, h : image width and height in pixels
+//   nc   : number of channels; for nc==1 both layouts coincide and the data is copied as is
+void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // pixel count is computed in size_t up front so that w*h cannot overflow int,
+    // neither in the single-channel copy nor in the index arithmetic below
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            for (int c=0; c<nc; c++)
+            {
+                aOut[(nc-1-c) + nc*(x + (size_t)w*y)] = aIn[x + (size_t)w*y + nOmega*c];
+            }
+        }
+    }
+}
+// Fills mOut from the layered float array aIn, taking width, height and channel
+// count from mOut itself.
+// NOTE(review): mOut.data is reinterpreted as float* without any check, so mOut is
+// presumably a CV_32F matrix with contiguous storage -- confirm at the call sites.
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
+{
+    float *interleaved = (float*)mOut.data;
+    const int width    = mOut.cols;
+    const int height   = mOut.rows;
+    const int channels = mOut.channels();
+    convert_layered_to_interleaved(interleaved, aIn, width, height, channels);
+}
+// Converts an interleaved image (all channels of pixel 0, then of pixel 1, ...)
+// into a layered one (all pixels of channel 0, then all of channel 1, ...).
+// The channel order is reversed on the way: slot nc-1-c of each pixel lands in layer c.
+//   aOut : destination, w*h*nc floats, layered
+//   aIn  : source, w*h*nc floats, interleaved
+//   w, h : image width and height in pixels
+//   nc   : number of channels; for nc==1 both layouts coincide and the data is copied as is
+void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // pixel count is computed in size_t up front so that w*h cannot overflow int,
+    // neither in the single-channel copy nor in the index arithmetic below
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            for (int c=0; c<nc; c++)
+            {
+                aOut[x + (size_t)w*y + nOmega*c] = aIn[(nc-1-c) + nc*(x + (size_t)w*y)];
+            }
+        }
+    }
+}
+// Fills the layered float array aOut from mIn, taking width, height and channel
+// count from mIn itself.
+// NOTE(review): mIn.data is reinterpreted as float* without any check, so mIn is
+// presumably a CV_32F matrix with contiguous storage -- confirm at the call sites.
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
+{
+    const float *interleaved = (float*)mIn.data;
+    const int width    = mIn.cols;
+    const int height   = mIn.rows;
+    const int channels = mIn.channels();
+    convert_interleaved_to_layered(aOut, interleaved, width, height, channels);
+}
+// Opens (or reuses) an auto-sized window named `title`, moves it to screen
+// position (x, y) and displays `mat` in it.
+void showImage(string title, const cv::Mat &mat, int x, int y)
+{
+    cv::namedWindow(title.c_str(), CV_WINDOW_AUTOSIZE);
+    cvMoveWindow(title.c_str(), x, y);
+    cv::imshow(title.c_str(), mat);
+}
+// adding Gaussian noise
+// Draws one sample of zero-mean Gaussian noise scaled by sigma, built from two
+// rand() samples with the Box-Muller formula sqrt(-2 ln u1) * cos(2 pi u2).
+float noise(float sigma)
+{
+    // two uniform samples in [0,1]; the first feeds the radius, the second the angle
+    const float uRadius = (float)rand()/RAND_MAX;
+    const float uAngle  = (float)rand()/RAND_MAX;
+    // the radius sample is kept away from 0 so that the logarithm stays finite
+    const float radius    = sqrtf(-2*log(std::max(uRadius,0.000001f)));
+    const float direction = cosf(2*M_PI*uAngle);
+    return sigma * radius * direction;
+}
+// Adds an independent noise(sigma) sample to every value of m, in place.
+// NOTE(review): m.data is reinterpreted as float* without any check, so m is
+// presumably a CV_32F matrix with contiguous storage -- confirm at the call sites.
+void addNoise(cv::Mat &m, float sigma)
+{
+    // total number of float values: width * height * channels
+    const size_t count = (size_t)m.cols * m.rows * m.channels();
+    float *cursor = (float*)m.data;
+    for (float *const stop = cursor + count; cursor != stop; ++cursor)
+    {
+        *cursor += noise(sigma);
+    }
+}
+// cuda error checking
+// Location (file, line) of the most recent cuda_check() call that found no error
+string prev_file = "";
+int prev_line = 0;
+// Queries cudaGetLastError(). On success the given location is remembered as the
+// last good check point. On failure the location and error text are printed,
+// followed by the last good check point if there is one, and the program exits
+// with status 1.
+void cuda_check(string file, int line)
+{
+    const cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+    {
+        prev_file = file;
+        prev_line = line;
+        return;
+    }
+    cout << endl << file << ", line " << line << ": " << cudaGetErrorString(status) << " (" << status << ")" << endl;
+    if (prev_line > 0)
+    {
+        cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
+    }
+    exit(1);
+}
--- a/submission/ex7/texture/aux.h
+++ b/submission/ex7/texture/aux.h
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#ifndef AUX_H
+#define AUX_H
+#include <cuda_runtime.h>
+#include <ctime>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <sstream>
+// parameter processing
+// Looks for "-<param> <value>" on the command line, scanning from the last
+// argument backwards, and extracts <value> into var through a stringstream.
+// Returns the stream state after extraction for the first match that has a
+// following argument, and false (var untouched) when there is no such match.
+template<typename T>
+bool getParam(std::string param, T &var, int argc, char **argv)
+{
+    const char *wanted = param.c_str();
+    int pos = argc;
+    while (--pos >= 1)
+    {
+        const char *arg = argv[pos];
+        if (arg[0] != '-' || strcmp(arg + 1, wanted) != 0) continue;
+        // the option is the very last argument, so it has no value: keep scanning earlier arguments
+        if (pos + 1 >= argc) continue;
+        std::stringstream parser;
+        parser << argv[pos + 1];
+        parser >> var;
+        return (bool)parser;
+    }
+    return false;
+}
+// opencv helpers
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
+void showImage(std::string title, const cv::Mat &mat, int x, int y);
+// adding Gaussian noise
+void addNoise(cv::Mat &m, float sigma);
+// measuring time
+// Stopwatch based on clock(): start() begins a measurement, end() stops it after
+// waiting for the GPU via cudaDeviceSynchronize(), get() returns the result in seconds.
+// NOTE(review): clock() is commonly processor time rather than wall-clock time --
+// confirm this is the intended measure.
+class Timer
+{
+    public:
+	Timer() : tStart(0), running(false), sec(0.f) {}
+
+	// Begins (or restarts) a measurement.
+	void start()
+	{
+		tStart = clock();
+		running = true;
+	}
+
+	// Stops the running measurement and stores its duration;
+	// without a running measurement the stored duration is reset to 0.
+	void end()
+	{
+		if (running)
+		{
+			// wait for outstanding GPU work so that it is included in the measurement
+			cudaDeviceSynchronize();
+			const clock_t tStop = clock();
+			sec = (float)(tStop - tStart) / CLOCKS_PER_SEC;
+			running = false;
+		}
+		else
+		{
+			sec = 0;
+		}
+	}
+
+	// Returns the stored duration in seconds, stopping the measurement first if it is still running.
+	float get()
+	{
+		if (running) end();
+		return sec;
+	}
+
+    private:
+	clock_t tStart;  // clock() value captured by start()
+	bool running;    // true between start() and end()
+	float sec;       // duration of the last finished measurement, in seconds
+};
+// cuda error checking
+#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
+void cuda_check(std::string file, int line);
+#endif  // AUX_H
--- a/submission/ex7/texture/main.cu
+++ b/submission/ex7/texture/main.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// Declare the texture at file scope
+texture<float,2,cudaReadModeElementType> texRef;
+// ###
+// ###
+// ### TODO: For every student of your group, please provide here:
+// ###
+// ### name, email, login username (for example p123)
+// ###
+// ###
+#include "aux.h"
+#include <iostream>
+#include <opencv2/imgproc/imgproc.hpp>
+using namespace std;
+// uncomment to use the camera
+// #define CAMERA
+// Device-side minimum: returns a when a < b, otherwise b.
+template<typename T>
+__device__ T gpu_min(T a, T b)
+{
+    return (a < b) ? a : b;
+}
+// Device-side maximum: returns b when a < b, otherwise a.
+template<typename T>
+__device__ T gpu_max(T a, T b)
+{
+    return (a < b) ? b : a;
+}
+// Convolves a layered image with a (2r+1) x (2r+1) kernel, reading the input
+// through the file-scope texture texRef.
+//
+// Expected setup (as done by main):
+//   - texRef is bound to the input as ONE 2D image of w columns and h*nc rows,
+//     i.e. the nc channel layers stacked on top of each other
+//   - 2D launch covering at least w threads in x and h*nc threads in y;
+//     one thread computes one output value
+//
+//   in   : unused, the input is fetched through texRef (kept for the call interface)
+//   out  : result, w*h*nc floats, layered, densely packed
+//   kern : (2r+1)*(2r+1) kernel weights, row-major
+//   w, h : image width and height in pixels
+//   nc   : number of channels
+//   r    : kernel radius
+__global__ void convolution(float *in, float *out, float *kern,
+                            int w, int h, int nc, int r)
+{
+    int ksize = 2*r + 1;
+    int x = threadIdx.x + blockDim.x * blockIdx.x;
+    int y = threadIdx.y + blockDim.y * blockIdx.y;
+
+    // Guard each axis on its own: a test on the flat index alone lets surplus
+    // threads with x >= w through, and they would overwrite pixels of the next row.
+    if (x >= w || y >= h*nc) return;
+
+    // Split the stacked row into channel and row within that channel, so that the
+    // filter window is clamped at the channel's own top and bottom border instead
+    // of reading rows that belong to the neighbouring channel.
+    int c = y / h;
+    int yc = y - c*h;
+    int rowBase = c*h;
+
+    float value = 0;
+    for (int ky = 0; ky < ksize; ky++) {
+        int cy = rowBase + gpu_max(0, gpu_min(yc + ky - r, h-1));
+        for (int kx = 0; kx < ksize; kx++) {
+            // +0.5f addresses the texel centre; x is not clamped here and is left
+            // to the address mode configured for texRef
+            value += kern[kx + ksize*ky] * tex2D(texRef, x+kx+0.5f-r, cy+0.5f);
+        }
+    }
+    out[x + (size_t)w*y] = value;
+}
+// Integer division n / b rounded up, computed as (n + b - 1) / b.
+inline int divc(int n, int b)
+{
+    const int padded = n + b - 1;
+    return padded / b;
+}
+// Returns the number of blocks needed per dimension so that blocks of size
+// `block` cover all of `whole` (each dimension rounded up with divc).
+inline dim3 make_grid(dim3 whole, dim3 block)
+{
+    dim3 grid;
+    grid.x = divc(whole.x, block.x);
+    grid.y = divc(whole.y, block.y);
+    grid.z = divc(whole.z, block.z);
+    return grid;
+}
+// Program entry point: loads an image (or grabs camera frames when CAMERA is defined),
+// builds a normalized Gaussian kernel of radius ceil(3*sigma), convolves every channel
+// with it (on the CPU when CPU is defined, otherwise with the texture-based CUDA kernel),
+// then displays and saves the input and the result.
+// Command line: -i <image> [-repeats <repeats>] [-gray] [-sigma <sigma>]
+// Returns 0 on success and 1 on a usage, parameter, camera or image-load error.
+int main(int argc, char **argv)
+{
+    // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
+    // This happens on the very first call to a CUDA function, and takes some time (around half a second)
+    // We will do it right here, so that the run time measurements are accurate
+    cudaDeviceSynchronize();  CUDA_CHECK;
+    // Reading command line parameters:
+    // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
+    // If "-param" is not specified, the value of "var" remains unchanged
+    //
+    // return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
+#ifdef CAMERA
+#else
+    // input image
+    string image = "";
+    bool ret = getParam("i", image, argc, argv);
+    if (!ret)
+    {
+        // without an image there is nothing to process: report, show the usage, and stop
+        cerr << "ERROR: no image specified" << endl;
+        cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray] [-sigma <sigma>]" << endl;
+        return 1;
+    }
+#endif
+    // number of computation repetitions to get a better run time measurement
+    int repeats = 1;
+    getParam("repeats", repeats, argc, argv);
+    if (repeats < 1) repeats = 1;  // at least one run; also keeps the averaged time well defined
+    cout << "repeats: " << repeats << endl;
+    // load the input image as grayscale if "-gray" is specified
+    bool gray = false;
+    getParam("gray", gray, argc, argv);
+    cout << "gray: " << gray << endl;
+    // ### Define your own parameters here as needed
+    // standard deviation of the Gaussian kernel, in pixels
+    float sigma = 2.0f;
+    getParam("sigma", sigma, argc, argv);
+    cout << "sigma: " << sigma << endl;
+    // a non-positive (or NaN) sigma would make every kernel weight NaN
+    if (!(sigma > 0)) { cerr << "ERROR: sigma must be positive" << endl; return 1; }
+    // Init camera / Load input image
+#ifdef CAMERA
+    // Init camera
+  	cv::VideoCapture camera(0);
+  	if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
+    int camW = 640;
+    int camH = 480;
+  	camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
+  	camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
+    // read in first frame to get the dimensions
+    cv::Mat mIn;
+    camera >> mIn;
+#else
+    // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
+    cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+    // check
+    if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
+#endif
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+    // get image dimensions
+    int w = mIn.cols;         // width
+    int h = mIn.rows;         // height
+    int nc = mIn.channels();  // number of channels
+    cout << "image: " << w << " x " << h << endl;
+    // Set the output image format
+    cv::Mat mOut(h,w,mIn.type());  // mOut will have the same number of channels as the input image, nc layers
+    // ### Define your own output images here as needed
+    // Size of the kernel
+    int r = ceil(3 * sigma);
+    int ksize = 2*r + 1;
+    // Gaussian weights, row-major, centred on (r, r)
+    float *kern = new float[ksize * ksize];
+    for (int i = 0; i < 2*r+1; i++) {
+        double a = i - r;
+        for (int j = 0; j < 2*r+1; j++) {
+            double b = j - r;
+            kern[i*ksize + j] = exp(-(a*a + b*b) / (2 * sigma*sigma))
+                                / (2 * M_PI * sigma*sigma);
+        }
+    }
+    // sum (for normalization) and maximum (for display scaling) of the weights
+    float kernMax = 0;
+    float kernSum = 0;
+    for (int i = 0; i < 2*r+1; i++) {
+        for (int j = 0; j < 2*r+1; j++) {
+            kernSum += kern[i*ksize + j];
+            kernMax = std::max(kernMax, kern[i*ksize + j]);
+        }
+    }
+    // kernOut: copy scaled so its maximum is 1, for display; kern: normalized so the weights sum to 1
+    float *kernOut = new float[(2*r + 1) * (2*r + 1)];
+    for (int i = 0; i < 2*r+1; i++) {
+        for (int j = 0; j < 2*r+1; j++) {
+            kernOut[i*ksize + j] = kern[i*ksize + j] / kernMax;
+            kern[i*ksize + j] /= kernSum;
+        }
+    }
+    cv::Mat mKernOut(ksize, ksize, CV_32F);
+    convert_layered_to_mat(mKernOut, kernOut);
+    // Allocate arrays
+    // input/output image width: w
+    // input/output image height: h
+    // input image number of channels: nc
+    // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
+    // allocate raw input image array
+    float *imgIn  = new float[(size_t)w*h*nc];
+    // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
+    float *imgOut = new float[(size_t)w*h*mOut.channels()];
+    // For camera mode: Make a loop to read in camera frames
+#ifdef CAMERA
+    // Read a camera image frame every 30 milliseconds:
+    // cv::waitKey(30) waits 30 milliseconds for a keyboard input,
+    // returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
+    while (cv::waitKey(30) < 0)
+    {
+    // Get camera image
+    camera >> mIn;
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+#endif
+    // Init raw input image array
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    convert_mat_to_layered (imgIn, mIn);
+    Timer timer; timer.start();
+    // run the whole computation "repeats" times; the average over all runs is reported below
+    for (int measurement = 0; measurement < repeats; measurement++)
+    {
+// #define CPU
+#ifdef CPU
+    for (int c = 0; c < nc; c++) {
+        for (int y = 0; y < h; y++) {
+            for (int x = 0; x < w; x++) {
+                float value = 0;
+                for (int ky = 0; ky < ksize; ky++) {
+                    int cy = std::max(0, std::min(y + ky - r, h-1));
+                    for (int kx = 0; kx < ksize; kx++) {
+                        int cx = std::max(0, std::min(x + kx - r, w-1));
+                        value += kern[kx + ksize*ky] * imgIn[cx + w*cy + w*h*c];
+                    }
+                }
+                imgOut[x + w*y + w*h*c] = value;
+            }
+        }
+    }
+#else
+    float *d_in, *d_out, *d_kern;
+    size_t rows = (size_t)h*nc;                  // the nc channel layers, stacked vertically
+    size_t rowBytes = (size_t)w*sizeof(float);   // bytes of one densely packed image row
+    size_t nbytes = rows*rowBytes;
+    size_t kbytes = (size_t)ksize*ksize*sizeof(float);
+    // The input is read through a 2D texture bound to linear memory, and the row pitch
+    // of such a texture has to meet the device's pitch alignment. A dense pitch of
+    // w*sizeof(float) only does so for some widths, so the rows are allocated padded.
+    size_t pitch = 0;
+    cudaMallocPitch(&d_in, &pitch, rowBytes, rows);  CUDA_CHECK;
+    cudaMalloc(&d_out, nbytes);  CUDA_CHECK;
+    cudaMalloc(&d_kern, kbytes);  CUDA_CHECK;
+    // dense host rows -> padded device rows
+    cudaMemcpy2D(d_in, pitch, imgIn, rowBytes, rowBytes, rows, cudaMemcpyHostToDevice);  CUDA_CHECK;
+    cudaMemcpy(d_kern, kern, kbytes, cudaMemcpyHostToDevice);  CUDA_CHECK;
+    dim3 block(16, 8, 1);
+    dim3 grid = make_grid(dim3(w, h*nc, 1), block);
+    // Define texture attributes
+    texRef.addressMode[0] = cudaAddressModeClamp;             // clamp x to border
+    texRef.addressMode[1] = cudaAddressModeClamp;            // clamp y to border
+    texRef.filterMode = cudaFilterModeLinear;            // linear interpolation
+    texRef.normalized = false;
+    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+    cudaBindTexture2D(NULL, &texRef, d_in, &desc, w, h*nc, pitch);  CUDA_CHECK;
+    convolution<<<grid, block>>>(d_in, d_out, d_kern, w, h, nc, r);  CUDA_CHECK;
+    cudaMemcpy(imgOut, d_out, nbytes, cudaMemcpyDeviceToHost);  CUDA_CHECK;
+    // unbind texture
+    cudaUnbindTexture(texRef);  CUDA_CHECK;
+    cudaFree(d_in);  CUDA_CHECK;
+    cudaFree(d_out);  CUDA_CHECK;
+    cudaFree(d_kern);  CUDA_CHECK;
+#endif
+    }
+    timer.end();  float t = timer.get();  // elapsed time in seconds
+    cout << "time: " << (t / repeats)*1000 << " ms" << endl;
+    // show input image
+    showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
+    // show output image: first convert to interleaved opencv format from the layered raw array
+    convert_layered_to_mat(mOut, imgOut);
+    showImage("Output", mOut, 100+w+40, 100);
+    // show kernel
+    showImage("Kernel", mKernOut, 100+2*(w+40), 100);
+    // ### Display your own output images here as needed
+#ifdef CAMERA
+    // end of camera loop
+    }
+#else
+    // wait for key inputs
+    cv::waitKey(0);
+#endif
+    // save input and result
+    cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
+    cv::imwrite("image_result.png",mOut*255.f);
+    // free allocated arrays
+    delete[] imgIn;
+    delete[] imgOut;
+    delete[] kern;
+    delete[] kernOut;
+    // close all opencv windows
+    cvDestroyAllWindows();
+    return 0;
+}
--- a/submission/ex7/todo
+++ b/submission/ex7/todo
+Gaurav - 
+global memory + constant kernel (with miklos's base code)
+texture + constant kernel (with existing texture code)
--- a/submission/ex7/todo~
+++ b/submission/ex7/todo~