Commit 5dd8c027 authored by Ravikishore

first assignment submission

parent 5aeb8c49
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
#include <cuda_runtime.h>
#include <iostream>
#include <string>
using namespace std;
// cuda error checking
#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
void cuda_check(string file, int line)
{
cudaError_t e = cudaGetLastError();
if (e != cudaSuccess)
{
cout << endl << file << ", line " << line << ": " << cudaGetErrorString(e) << " (" << e << ")" << endl;
exit(1);
}
}
__device__ float add(float a, float b)
{
return a + b;
}
__global__ void add_arrays(float *a, float *b, float *c, int n)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n)
c[idx] = add(a[idx], b[idx]);
}
int main(int argc, char **argv)
{
// alloc and init input arrays on host (CPU)
int n = 20;
float *a = new float[n];
float *b = new float[n];
float *c = new float[n];
for(int i=0; i<n; i++)
{
a[i] = i;
b[i] = (i%5)+1;
c[i] = 0;
}
// CPU computation
for(int i=0; i<n; i++) c[i] = a[i] + b[i];
// print result
cout << "CPU:"<<endl;
for(int i=0; i<n; i++) cout << i << ": " << a[i] << " + " << b[i] << " = " << c[i] << endl;
cout << endl;
// init c
for(int i=0; i<n; i++) c[i] = 0;
// GPU computation
float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, n*sizeof(float)); CUDA_CHECK;
cudaMalloc(&d_b, n*sizeof(float)); CUDA_CHECK;
cudaMalloc(&d_c, n*sizeof(float)); CUDA_CHECK;
cudaMemcpy(d_a, a, n*sizeof(float), cudaMemcpyHostToDevice); CUDA_CHECK;
cudaMemcpy(d_b, b, n*sizeof(float), cudaMemcpyHostToDevice); CUDA_CHECK;
dim3 block(32);
dim3 grid((n + block.x - 1) / block.x);
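// (n + block.x - 1) / block.x rounds up, so the grid covers all n elements;
// surplus threads are masked out by the "idx < n" check inside the kernel.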
add_arrays<<<grid, block>>>(d_a, d_b, d_c, n);
CUDA_CHECK; // catches kernel launch errors
cudaMemcpy(c, d_c, n*sizeof(float), cudaMemcpyDeviceToHost); CUDA_CHECK;
cudaFree(d_a); CUDA_CHECK;
cudaFree(d_b); CUDA_CHECK;
cudaFree(d_c); CUDA_CHECK;
// ### Note: Always use the macro CUDA_CHECK after each CUDA call, e.g. "cudaMalloc(...); CUDA_CHECK;"
// print result
cout << "GPU:"<<endl;
for(int i=0; i<n; i++) cout << i << ": " << a[i] << " + " << b[i] << " = " << c[i] << endl;
cout << endl;
// free CPU arrays
delete[] a;
delete[] b;
delete[] c;
}
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
#include <cuda_runtime.h>
#include <iostream>
#include <string>
using namespace std;
// cuda error checking
#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
void cuda_check(string file, int line)
{
cudaError_t e = cudaGetLastError();
if (e != cudaSuccess)
{
cout << endl << file << ", line " << line << ": " << cudaGetErrorString(e) << " (" << e << ")" << endl;
exit(1);
}
}
__device__ float square(float x)
{
return x * x;
}
__global__ void square_array(float *arr, int n)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < n)
arr[idx] = square(arr[idx]);
}
int main(int argc,char **argv)
{
// alloc and init input arrays on host (CPU)
int n = 10;
float *a = new float[n];
for(int i=0; i<n; i++) a[i] = i;
// CPU computation
for(int i=0; i<n; i++)
{
float val = a[i];
val = val*val;
a[i] = val;
}
// print result
cout << "CPU:"<<endl;
for(int i=0; i<n; i++) cout << i << ": " << a[i] << endl;
cout << endl;
// GPU computation
// reinit data
for(int i=0; i<n; i++) a[i] = i;
float *d_a;
cudaMalloc(&d_a, n*sizeof(float)); CUDA_CHECK;
cudaMemcpy(d_a, a, n*sizeof(float), cudaMemcpyHostToDevice); CUDA_CHECK;
dim3 block(32);
dim3 grid((n + block.x - 1) / block.x);
square_array<<<grid, block>>>(d_a, n);
CUDA_CHECK; // catches kernel launch errors
cudaMemcpy(a, d_a, n*sizeof(float), cudaMemcpyDeviceToHost); CUDA_CHECK;
cudaFree(d_a); CUDA_CHECK;
// print result
cout << "GPU:" << endl;
for(int i=0; i<n; i++) cout << i << ": " << a[i] << endl;
cout << endl;
// free CPU arrays
delete[] a;
}
main: main.cu aux.cu aux.h Makefile
nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#include "aux.h"
#include <cstdlib>
#include <iostream>
using std::stringstream;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
// parameter processing: template specialization for T=bool
template<>
bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc) || argv[i+1][0]=='-') { var = true; return true; }
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[(nc-1-c) + nc*(x + (size_t)w*y)] = aIn[x + (size_t)w*y + nOmega*c];
}
}
}
}
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
{
convert_layered_to_interleaved((float*)mOut.data, aIn, mOut.cols, mOut.rows, mOut.channels());
}
void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[x + (size_t)w*y + nOmega*c] = aIn[(nc-1-c) + nc*(x + (size_t)w*y)];
}
}
}
}
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
{
convert_interleaved_to_layered(aOut, (float*)mIn.data, mIn.cols, mIn.rows, mIn.channels());
}
void showImage(string title, const cv::Mat &mat, int x, int y)
{
const char *wTitle = title.c_str();
cv::namedWindow(wTitle, CV_WINDOW_AUTOSIZE);
cvMoveWindow(wTitle, x, y);
cv::imshow(wTitle, mat);
}
// adding Gaussian noise
float noise(float sigma)
{
float x1 = (float)rand()/RAND_MAX;
float x2 = (float)rand()/RAND_MAX;
return sigma * sqrtf(-2*log(std::max(x1,0.000001f)))*cosf(2*M_PI*x2);
}
void addNoise(cv::Mat &m, float sigma)
{
float *data = (float*)m.data;
int w = m.cols;
int h = m.rows;
int nc = m.channels();
size_t n = (size_t)w*h*nc;
for(size_t i=0; i<n; i++)
{
data[i] += noise(sigma);
}
}
// cuda error checking
string prev_file = "";
int prev_line = 0;
void cuda_check(string file, int line)
{
cudaError_t e = cudaGetLastError();
if (e != cudaSuccess)
{
cout << endl << file << ", line " << line << ": " << cudaGetErrorString(e) << " (" << e << ")" << endl;
if (prev_line>0) cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
exit(1);
}
prev_file = file;
prev_line = line;
}
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#ifndef AUX_H
#define AUX_H
#include <cuda_runtime.h>
#include <ctime>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <string>
#include <sstream>
// parameter processing
template<typename T>
bool getParam(std::string param, T &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc)) continue;
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
void showImage(std::string title, const cv::Mat &mat, int x, int y);
// adding Gaussian noise
void addNoise(cv::Mat &m, float sigma);
// measuring time
class Timer
{
public:
Timer() : tStart(0), running(false), sec(0.f)
{
}
void start()
{
tStart = clock();
running = true;
}
void end()
{
if (!running) { sec = 0; return; }
cudaDeviceSynchronize();
clock_t tEnd = clock();
sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
running = false;
}
float get()
{
if (running) end();
return sec;
}
private:
clock_t tStart;
bool running;
float sec;
};
// cuda error checking
#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
void cuda_check(std::string file, int line);
#endif // AUX_H
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ###
// ###
// ### TODO: For every student of your group, please provide here:
// ###
// ### name, email, login username (for example p123)
// ###
// ###
#include "aux.h"
#include <iostream>
using namespace std;
// uncomment to use the camera
#define CAMERA
#define USING_GPU
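// Kernel: invert every value of the layered image (values are assumed to lie in [0,1]); one thread per array element.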
__global__ void invertImage(float *imgIn, float *imgOut, unsigned long n_pixels)
{
int ix = threadIdx.x + blockDim.x * blockIdx.x;
if(ix < n_pixels)
{
imgOut[ix] = 1 - imgIn[ix];
}
}
int main(int argc, char **argv)
{
// Before the GPU can process your kernels, a so called "CUDA context" must be initialized
// This happens on the very first call to a CUDA function, and takes some time (around half a second)
// We will do it right here, so that the run time measurements are accurate
cudaDeviceSynchronize(); CUDA_CHECK;
// Reading command line parameters:
// getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
// If "-param" is not specified, the value of "var" remains unchanged
//
// return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
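// For illustration, a hypothetical invocation (the image name is only an example):
//   ./main -i flowers.png -repeats 10 -gray
// Note that with CAMERA defined (as above), the "-i" option is not used and frames come from the webcam instead.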
#ifdef CAMERA
#else
// input image
string image = "";
bool ret = getParam("i", image, argc, argv);
if (!ret) cerr << "ERROR: no image specified" << endl;
if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
#endif
// number of computation repetitions to get a better run time measurement
int repeats = 1;
getParam("repeats", repeats, argc, argv);
cout << "repeats: " << repeats << endl;
// load the input image as grayscale if "-gray" is specifed
bool gray = false;
getParam("gray", gray, argc, argv);
cout << "gray: " << gray << endl;
// ### Define your own parameters here as needed
// Init camera / Load input image
#ifdef CAMERA
// Init camera
cv::VideoCapture camera(0);
if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
int camW = 640;
int camH = 480;
camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
// read in first frame to get the dimensions
cv::Mat mIn;
camera >> mIn;
#else
// Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
// check
if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
#endif
// convert to float representation (opencv loads image values as single bytes by default)
mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
mIn /= 255.f;
// get image dimensions
int w = mIn.cols; // width
int h = mIn.rows; // height
int nc = mIn.channels(); // number of channels
cout << "image: " << w << " x " << h << endl;
// Set the output image format
// ###
// ###
// ### TODO: Change the output image format as needed
// ###
// ###
cv::Mat mOut(h,w,mIn.type()); // mOut will have the same number of channels as the input image, nc layers
//cv::Mat mOut(h,w,CV_32FC3); // mOut will be a color image, 3 layers
//cv::Mat mOut(h,w,CV_32FC1); // mOut will be a grayscale image, 1 layer
// ### Define your own output images here as needed
// Allocate arrays
// input/output image width: w
// input/output image height: h
// input image number of channels: nc
// output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
// allocate raw input image array
float *imgIn = new float[(size_t)w*h*nc];
// allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
float *imgOut = new float[(size_t)w*h*mOut.channels()];
// For camera mode: Make a loop to read in camera frames
#ifdef CAMERA
// Read a camera image frame every 30 milliseconds:
// cv::waitKey(30) waits 30 milliseconds for a keyboard input,
// returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
while (cv::waitKey(30) < 0)
{
// Get camera image
camera >> mIn;
// convert to float representation (opencv loads image values as single bytes by default)
mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
mIn /= 255.f;
#endif
// Init raw input image array
// opencv images are interleaved: rgb rgb rgb... (actually bgr bgr bgr...)
// But for CUDA it's better to work with layered images: rrr... ggg... bbb...
// So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
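// Indexing sketch (for illustration; see convert_mat_to_layered in aux.cu): for pixel (x,y) and channel c,
//   interleaved (cv::Mat):  value(x,y,c) = data[nc*(x + (size_t)w*y) + c]
//   layered (float*):       value(x,y,c) = data[x + (size_t)w*y + (size_t)w*h*c]
// The conversion helpers additionally reverse the channel order (OpenCV's BGR -> RGB).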
convert_mat_to_layered (imgIn, mIn);
Timer timer;
float t;
// ###
// ###
// ### TODO: Main computation
// ###
// ###
#ifdef USING_GPU
timer.start();
for(int rep = 0; rep < repeats; rep++)
{
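// Note: allocation and host<->device copies happen inside the repetition loop,
// so the measured time includes transfer overhead, not just the kernel itself.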
size_t n_pixels = w * h * nc;
dim3 block = dim3(64, 1, 1);
dim3 grid = dim3((n_pixels + block.x - 1) / block.x, 1, 1);
float *d_imgIn = NULL;
float *d_imgOut = NULL;
cudaMalloc(&d_imgIn, n_pixels * sizeof(float));
cudaMalloc(&d_imgOut, n_pixels * sizeof(float));
cudaMemcpy(d_imgIn, imgIn, n_pixels * sizeof(float), cudaMemcpyHostToDevice);
// timer.start();
invertImage <<< grid, block >>> (d_imgIn, d_imgOut, n_pixels);
cudaDeviceSynchronize();
// timer.end();
// t = timer.get(); // elapsed time in seconds
cudaMemcpy(imgOut, d_imgOut, n_pixels * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_imgIn);
cudaFree(d_imgOut);
}
timer.end();
t = timer.get();
#else //USING_GPU
timer.start();
for(int rep = 0; rep < repeats; rep++)
{
unsigned long n_pixels = w * h * nc;
for(unsigned long idx = 0; idx < n_pixels; idx++)
{
imgOut[idx] = 1 - imgIn[idx];
}
}
timer.end();
t = timer.get(); // elapsed time in seconds
#endif
cout << "time: " << t*1000 << " ms" << endl;
// show input image
showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100)
// show output image: first convert to interleaved opencv format from the layered raw array
convert_layered_to_mat(mOut, imgOut);
showImage("Output", mOut, 100+w+40, 100);
// ### Display your own output images here as needed
#ifdef CAMERA
// end of camera loop
}
#else
// wait for key inputs
cv::waitKey(0);
#endif
// save input and result
cv::imwrite("image_input.png",mIn*255.f); // "imwrite" assumes channel range [0,255]
cv::imwrite("image_result.png",mOut*255.f);
// free allocated arrays
delete[] imgIn;
delete[] imgOut;
// close all opencv windows
cvDestroyAllWindows();
return 0;
}
main: main.cu aux.cu aux.h Makefile
nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#include "aux.h"
#include <cstdlib>
#include <iostream>
using std::stringstream;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
// parameter processing: template specialization for T=bool
template<>
bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc) || argv[i+1][0]=='-') { var = true; return true; }
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[(nc-1-c) + nc*(x + (size_t)w*y)] = aIn[x + (size_t)w*y + nOmega*c];
}
}
}
}
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
{
convert_layered_to_interleaved((float*)mOut.data, aIn, mOut.cols, mOut.rows, mOut.channels());
}
void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[x + (size_t)w*y + nOmega*c] = aIn[(nc-1-c) + nc*(x + (size_t)w*y)];
}
}
}
}
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
{
convert_interleaved_to_layered(aOut, (float*)mIn.data, mIn.cols, mIn.rows, mIn.channels());
}
void showImage(string title, const cv::Mat &mat, int x, int y)
{
const char *wTitle = title.c_str();
cv::namedWindow(wTitle, CV_WINDOW_AUTOSIZE);
cvMoveWindow(wTitle, x, y);
cv::imshow(wTitle, mat);
}
// adding Gaussian noise
float noise(float sigma)
{
float x1 = (float)rand()/RAND_MAX;
float x2 = (float)rand()/RAND_MAX;
return sigma * sqrtf(-2*log(std::max(x1,0.000001f)))*cosf(2*M_PI*x2);
}
void addNoise(cv::Mat &m, float sigma)
{
float *data = (float*)m.data;
int w = m.cols;
int h = m.rows;
int nc = m.channels();
size_t n = (size_t)w*h*nc;
for(size_t i=0; i<n; i++)
{
data[i] += noise(sigma);
}
}
// cuda error checking
string prev_file = "";
int prev_line = 0;
void cuda_check(string file, int line)
{
cudaError_t e = cudaGetLastError();
if (e != cudaSuccess)
{
cout << endl << file << ", line " << line << ": " << cudaGetErrorString(e) << " (" << e << ")" << endl;
if (prev_line>0) cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
exit(1);
}
prev_file = file;
prev_line = line;
}
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#ifndef AUX_H
#define AUX_H
#include <cuda_runtime.h>
#include <ctime>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <string>
#include <sstream>
// parameter processing
template<typename T>
bool getParam(std::string param, T &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc)) continue;
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
void showImage(std::string title, const cv::Mat &mat, int x, int y);
// adding Gaussian noise
void addNoise(cv::Mat &m, float sigma);
// measuring time
class Timer
{
public:
Timer() : tStart(0), running(false), sec(0.f)
{
}
void start()
{
tStart = clock();
running = true;
}
void end()
{
if (!running) { sec = 0; return; }
cudaDeviceSynchronize();
clock_t tEnd = clock();
sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
running = false;
}
float get()
{
if (running) end();
return sec;
}
private:
clock_t tStart;
bool running;
float sec;
};
// cuda error checking
#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
void cuda_check(std::string file, int line);
#endif // AUX_H
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ###
// ###
// ### TODO: For every student of your group, please provide here:
// ###
// ### name, email, login username (for example p123)
// ###
// ###
#include "aux.h"
#include <iostream>
using namespace std;
// uncomment to use the camera
//#define CAMERA
#define USING_GPU
// Threshold: write 1 where in[ix] >= thresh, 0 otherwise
__device__ void threshImage(float *in, float *out, float thresh, size_t count)
{
int ix = threadIdx.x + blockDim.x * blockIdx.x;
if(ix < count)
{
out[ix] = ( in[ix] >= thresh );
}
}
// Intermediate result for the threshold: average the nc channels of the layered image into a single channel
__device__ void intResult(float *in, float *out, int nc, size_t count)
{
int ix = threadIdx.x + blockDim.x * blockIdx.x;
if(ix < count)
{
for(int i = 0; i < nc; i++)
out[ix] += in[ix + i * count];
out[ix] = out[ix] / nc;
}
}
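// One thread per pixel: average the channels into the output buffer, then threshold that average in place.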
__global__ void callKernel(float *in, float *out, float thresh, int nc, size_t count)
{
intResult(in, out, nc, count);
threshImage(out, out, thresh, count);
}
int main(int argc, char **argv)
{
// Before the GPU can process your kernels, a so called "CUDA context" must be initialized
// This happens on the very first call to a CUDA function, and takes some time (around half a second)
// We will do it right here, so that the run time measurements are accurate
cudaDeviceSynchronize(); CUDA_CHECK;
// Reading command line parameters:
// getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
// If "-param" is not specified, the value of "var" remains unchanged
//
// return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
#ifdef CAMERA
#else
// input image
string image = "";
bool ret = getParam("i", image, argc, argv);
if (!ret) cerr << "ERROR: no image specified" << endl;
if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
#endif
// number of computation repetitions to get a better run time measurement
int repeats = 1;
getParam("repeats", repeats, argc, argv);
cout << "repeats: " << repeats << endl;
// load the input image as grayscale if "-gray" is specifed
bool gray = false;
getParam("gray", gray, argc, argv);
cout << "gray: " << gray << endl;
// ### Define your own parameters here as needed
float thresh = 0.5;
// Init camera / Load input image
#ifdef CAMERA
// Init camera
cv::VideoCapture camera(0);
if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
int camW = 640;
int camH = 480;
camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
// read in first frame to get the dimensions
cv::Mat mIn;
camera >> mIn;
#else
// Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
// check
if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
#endif
// convert to float representation (opencv loads image values as single bytes by default)
mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
mIn /= 255.f;
// get image dimensions
int w = mIn.cols; // width
int h = mIn.rows; // height
int nc = mIn.channels(); // number of channels
cout << "image: " << w << " x " << h << endl;
// Set the output image format
// ###
// ###
// ### TODO: Change the output image format as needed
// ###
// ###
//cv::Mat mOut(h,w,mIn.type()); // mOut will have the same number of channels as the input image, nc layers
//cv::Mat mOut(h,w,CV_32FC3); // mOut will be a color image, 3 layers
cv::Mat mOut(h,w,CV_32FC1); // mOut will be a grayscale image, 1 layer
// ### Define your own output images here as needed
// Allocate arrays
// input/output image width: w
// input/output image height: h
// input image number of channels: nc
// output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
// allocate raw input image array
float *imgIn = new float[(size_t)w*h*nc];
// allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
float *imgOut = new float[(size_t)w*h*mOut.channels()];
// For camera mode: Make a loop to read in camera frames
#ifdef CAMERA
// Read a camera image frame every 30 milliseconds:
// cv::waitKey(30) waits 30 milliseconds for a keyboard input,
// returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
while (cv::waitKey(30) < 0)
{
// Get camera image
camera >> mIn;
// convert to float representation (opencv loads image values as single bytes by default)
mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
mIn /= 255.f;
#endif
// Init raw input image array
// opencv images are interleaved: rgb rgb rgb... (actually bgr bgr bgr...)
// But for CUDA it's better to work with layered images: rrr... ggg... bbb...
// So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
convert_mat_to_layered (imgIn, mIn);
Timer timer;
float t;
// ###
// ###
// ### TODO: Main computation
// ###
// ###
#ifdef USING_GPU
timer.start();
// Repetitions Loop
for(int rep = 0; rep < repeats; rep++)
{
size_t n_pixels = w * h;
// Thread Dimensions
dim3 block = dim3(256, 1, 1);
dim3 grid = dim3((n_pixels + block.x - 1) / block.x, 1, 1);
// Allocating memory on the device
float *d_imgIn = NULL;
float *d_imgOut = NULL;
cudaMalloc(&d_imgIn, n_pixels * nc * sizeof(float));
cudaMalloc(&d_imgOut, n_pixels * sizeof(float));
// Copying Input image to device, and initializing result to 0
cudaMemcpy(d_imgIn, imgIn, n_pixels * nc * sizeof(float), cudaMemcpyHostToDevice);
cudaMemset(d_imgOut, 0, n_pixels * sizeof(float));
// Calling Kernel
callKernel <<< grid, block >>> (d_imgIn, d_imgOut, thresh, nc, n_pixels);
// Copying result back
cudaMemcpy(imgOut, d_imgOut, n_pixels * sizeof(float), cudaMemcpyDeviceToHost);
// Freeing Memory
cudaFree(d_imgIn);
cudaFree(d_imgOut);
}
timer.end();
t = timer.get();
#else // USING_GPU
// CPU Implementation
timer.start();
// Repetitions Loop
for(int rep = 0; rep < repeats; rep++)
{
size_t n_pixels = w * h;
memset(imgOut, 0, n_pixels * sizeof(float));
for(size_t idx = 0; idx < n_pixels; idx++)
{
for(int i = 0; i < nc; i++)
imgOut[idx] += imgIn[idx + i * n_pixels];
imgOut[idx] = imgOut[idx] / nc;
imgOut[idx] = (imgOut[idx] >= thresh);
}
}
timer.end();
t = timer.get(); // elapsed time in seconds
#endif
cout << "time: " << t*1000 << " ms" << endl;
// show input image
showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100)
// show output image: first convert to interleaved opencv format from the layered raw array
convert_layered_to_mat(mOut, imgOut);
showImage("Output", mOut, 100+w+40, 100);
// ### Display your own output images here as needed
#ifdef CAMERA
// end of camera loop
}
#else
// wait for key inputs
cv::waitKey(0);
#endif
// save input and result
cv::imwrite("image_input.png",mIn*255.f); // "imwrite" assumes channel range [0,255]
cv::imwrite("image_result.png",mOut*255.f);
// free allocated arrays
delete[] imgIn;
delete[] imgOut;
// close all opencv windows
cvDestroyAllWindows();
return 0;
}
main: main.cu aux.cu aux.h Makefile
nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#include "aux.h"
#include <cstdlib>
#include <iostream>
using std::stringstream;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
// parameter processing: template specialization for T=bool
template<>
bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc) || argv[i+1][0]=='-') { var = true; return true; }
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[(nc-1-c) + nc*(x + (size_t)w*y)] = aIn[x + (size_t)w*y + nOmega*c];
}
}
}
}
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
{
convert_layered_to_interleaved((float*)mOut.data, aIn, mOut.cols, mOut.rows, mOut.channels());
}
void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[x + (size_t)w*y + nOmega*c] = aIn[(nc-1-c) + nc*(x + (size_t)w*y)];
}
}
}
}
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
{
convert_interleaved_to_layered(aOut, (float*)mIn.data, mIn.cols, mIn.rows, mIn.channels());
}
void showImage(string title, const cv::Mat &mat, int x, int y)
{
const char *wTitle = title.c_str();
cv::namedWindow(wTitle, CV_WINDOW_AUTOSIZE);
cvMoveWindow(wTitle, x, y);
cv::imshow(wTitle, mat);
}
// adding Gaussian noise
float noise(float sigma)
{
float x1 = (float)rand()/RAND_MAX;
float x2 = (float)rand()/RAND_MAX;
return sigma * sqrtf(-2*log(std::max(x1,0.000001f)))*cosf(2*M_PI*x2);
}
void addNoise(cv::Mat &m, float sigma)
{
float *data = (float*)m.data;
int w = m.cols;
int h = m.rows;
int nc = m.channels();
size_t n = (size_t)w*h*nc;
for(size_t i=0; i<n; i++)
{
data[i] += noise(sigma);
}
}
// cuda error checking
string prev_file = "";
int prev_line = 0;
void cuda_check(string file, int line)
{
cudaError_t e = cudaGetLastError();
if (e != cudaSuccess)
{
cout << endl << file << ", line " << line << ": " << cudaGetErrorString(e) << " (" << e << ")" << endl;
if (prev_line>0) cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
exit(1);
}
prev_file = file;
prev_line = line;
}
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#ifndef AUX_H
#define AUX_H
#include <cuda_runtime.h>
#include <ctime>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <string>
#include <sstream>
// parameter processing
template<typename T>
bool getParam(std::string param, T &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc)) continue;
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
void showImage(std::string title, const cv::Mat &mat, int x, int y);
// adding Gaussian noise
void addNoise(cv::Mat &m, float sigma);
// measuring time
class Timer
{
public:
Timer() : tStart(0), running(false), sec(0.f)
{
}
void start()
{
tStart = clock();
running = true;
}
void end()
{
if (!running) { sec = 0; return; }
cudaDeviceSynchronize();
clock_t tEnd = clock();
sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
running = false;
}
float get()
{
if (running) end();
return sec;
}
private:
clock_t tStart;
bool running;
float sec;
};
// cuda error checking
#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
void cuda_check(std::string file, int line);
#endif // AUX_H
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ###
// ###
// ### TODO: For every student of your group, please provide here:
// ###
// ### name, email, login username (for example p123)
// ###
// ###
#include "aux.h"
#include <iostream>
using namespace std;
// uncomment to use the camera
//#define CAMERA
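// Forward-difference gradient per channel, with the derivative set to 0 at the last column/row:
//   v1(x,y,c) = u(x+1,y,c) - u(x,y,c),   v2(x,y,c) = u(x,y+1,c) - u(x,y,c)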
__global__ void gradient_components(float *image, float *v1, float *v2,
int w, int h, int nc)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int c = threadIdx.z + blockDim.z * blockIdx.z;
int idx = x + w*y + w*h*c;
if (x < w && y < h && c < nc) { // also guard c: the launch can spawn more z-threads than channels (e.g. nc==1 with block.z==3)
if (x == w-1)
v1[idx] = 0;
else
v1[idx] = image[idx + 1] - image[idx];
if (y == h-1)
v2[idx] = 0;
else
v2[idx] = image[idx + w] - image[idx];
}
}
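// Per-pixel gradient magnitude accumulated over all channels:
//   |grad u|(x,y) = sqrt( sum_c ( v1(x,y,c)^2 + v2(x,y,c)^2 ) )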
__global__ void gradient_norm(float *v1, float *v2, float *grad,
int w, int h, int nc)
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x < w && y < h) {
float g = 0;
for (int c = 0; c < nc; c++) {
float u1 = v1[x + y*w + w*h*c];
float u2 = v2[x + y*w + w*h*c];
g += u1*u1 + u2*u2;
}
grad[x + w*y] = sqrtf(g); // square root of the summed squares gives the gradient norm
}
}
inline int divc(int n, int b) { return (n + b - 1) / b; }
int main(int argc, char **argv)
{
// Before the GPU can process your kernels, a so called "CUDA context" must be initialized
// This happens on the very first call to a CUDA function, and takes some time (around half a second)
// We will do it right here, so that the run time measurements are accurate
cudaDeviceSynchronize(); CUDA_CHECK;
// Reading command line parameters:
// getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
// If "-param" is not specified, the value of "var" remains unchanged
//
// return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
#ifdef CAMERA
#else
// input image
string image = "";
bool ret = getParam("i", image, argc, argv);
if (!ret) cerr << "ERROR: no image specified" << endl;
if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
#endif
// number of computation repetitions to get a better run time measurement
int repeats = 1;
getParam("repeats", repeats, argc, argv);
cout << "repeats: " << repeats << endl;
// load the input image as grayscale if "-gray" is specifed
bool gray = false;
getParam("gray", gray, argc, argv);
cout << "gray: " << gray << endl;
// ### Define your own parameters here as needed
// Init camera / Load input image
#ifdef CAMERA
// Init camera
cv::VideoCapture camera(0);
if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
int camW = 640;
int camH = 480;
camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
// read in first frame to get the dimensions
cv::Mat mIn;
camera >> mIn;
#else
// Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
// check
if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
#endif
// convert to float representation (opencv loads image values as single bytes by default)
mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
mIn /= 255.f;
// get image dimensions
int w = mIn.cols; // width
int h = mIn.rows; // height
int nc = mIn.channels(); // number of channels
cout << "image: " << w << " x " << h << endl;
// Set the output image format
cv::Mat mOut(h,w,CV_32FC1); // mOut will be a grayscale image, 1 layer
// ### Define your own output images here as needed
// Allocate arrays
// input/output image width: w
// input/output image height: h
// input image number of channels: nc
// output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
// allocate raw input image array
float *imgIn = new float[(size_t)w*h*nc];
// allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
float *imgOut = new float[(size_t)w*h*mOut.channels()];
// For camera mode: Make a loop to read in camera frames
#ifdef CAMERA
// Read a camera image frame every 30 milliseconds:
// cv::waitKey(30) waits 30 milliseconds for a keyboard input,
// returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
while (cv::waitKey(30) < 0)
{
// Get camera image
camera >> mIn;
// convert to float representation (opencv loads image values as single bytes by default)
mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
mIn /= 255.f;
#endif
// Init raw input image array
// opencv images are interleaved: rgb rgb rgb... (actually bgr bgr bgr...)
// But for CUDA it's better to work with layered images: rrr... ggg... bbb...
// So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
convert_mat_to_layered (imgIn, mIn);
float t = 0;
for (int measurement = 0; measurement < repeats; measurement++) {
Timer timer; timer.start();
float *d_image, *d_v1, *d_v2, *d_grad;
size_t nbytes_in = (size_t)w*h*nc*sizeof(float);
size_t nbytes_out = (size_t)w*h*sizeof(float);
cudaMalloc(&d_image, nbytes_in);
cudaMalloc(&d_v1, nbytes_in);
cudaMalloc(&d_v2, nbytes_in);
cudaMalloc(&d_grad, nbytes_out);
cudaMemcpy(d_image, imgIn, nbytes_in, cudaMemcpyHostToDevice);
CUDA_CHECK;
dim3 block_comp(16, 8, 3);
dim3 block_norm(32, 8, 1);
dim3 grid_comp(divc(w, block_comp.x), divc(h, block_comp.y), divc(nc, block_comp.z));
dim3 grid_norm(divc(w, block_norm.x), divc(h, block_norm.y));
gradient_components<<<grid_comp, block_comp>>>(d_image, d_v1, d_v2, w, h, nc);
gradient_norm<<<grid_norm, block_norm>>>(d_v1, d_v2, d_grad, w, h, nc);
cudaMemcpy(imgOut, d_grad, nbytes_out, cudaMemcpyDeviceToHost);
cudaFree(d_image);
cudaFree(d_v1);
cudaFree(d_v2);
cudaFree(d_grad);
timer.end(); t += timer.get(); // elapsed time in seconds
}
cout << "time: " << (t / repeats)*1000 << " ms" << endl;
// show input image
showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100)
// show output image: first convert to interleaved opencv format from the layered raw array
convert_layered_to_mat(mOut, imgOut);
showImage("Output", mOut, 100+w+40, 100);
// ### Display your own output images here as needed
#ifdef CAMERA
// end of camera loop
}
#else
// wait for key inputs
cv::waitKey(0);
#endif
// save input and result
cv::imwrite("image_input.png",mIn*255.f); // "imwrite" assumes channel range [0,255]
cv::imwrite("image_result.png",mOut*255.f);
// free allocated arrays
delete[] imgIn;
delete[] imgOut;
// close all opencv windows
cvDestroyAllWindows();
return 0;
}
main: main.cu aux.cu aux.h Makefile
nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#include "aux.h"
#include <cstdlib>
#include <iostream>
using std::stringstream;
using std::cerr;
using std::cout;
using std::endl;
using std::string;
// parameter processing: template specialization for T=bool
template<>
bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc) || argv[i+1][0]=='-') { var = true; return true; }
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[(nc-1-c) + nc*(x + (size_t)w*y)] = aIn[x + (size_t)w*y + nOmega*c];
}
}
}
}
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
{
convert_layered_to_interleaved((float*)mOut.data, aIn, mOut.cols, mOut.rows, mOut.channels());
}
void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
{
if (nc==1) { memcpy(aOut, aIn, w*h*sizeof(float)); return; }
size_t nOmega = (size_t)w*h;
for (int y=0; y<h; y++)
{
for (int x=0; x<w; x++)
{
for (int c=0; c<nc; c++)
{
aOut[x + (size_t)w*y + nOmega*c] = aIn[(nc-1-c) + nc*(x + (size_t)w*y)];
}
}
}
}
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
{
convert_interleaved_to_layered(aOut, (float*)mIn.data, mIn.cols, mIn.rows, mIn.channels());
}
void showImage(string title, const cv::Mat &mat, int x, int y)
{
const char *wTitle = title.c_str();
cv::namedWindow(wTitle, CV_WINDOW_AUTOSIZE);
cvMoveWindow(wTitle, x, y);
cv::imshow(wTitle, mat);
}
// adding Gaussian noise
float noise(float sigma)
{
float x1 = (float)rand()/RAND_MAX;
float x2 = (float)rand()/RAND_MAX;
return sigma * sqrtf(-2*log(std::max(x1,0.000001f)))*cosf(2*M_PI*x2);
}
void addNoise(cv::Mat &m, float sigma)
{
float *data = (float*)m.data;
int w = m.cols;
int h = m.rows;
int nc = m.channels();
size_t n = (size_t)w*h*nc;
for(size_t i=0; i<n; i++)
{
data[i] += noise(sigma);
}
}
// cuda error checking
string prev_file = "";
int prev_line = 0;
void cuda_check(string file, int line)
{
cudaError_t e = cudaGetLastError();
if (e != cudaSuccess)
{
cout << endl << file << ", line " << line << ": " << cudaGetErrorString(e) << " (" << e << ")" << endl;
if (prev_line>0) cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
exit(1);
}
prev_file = file;
prev_line = line;
}
// ###
// ###
// ### Practical Course: GPU Programming in Computer Vision
// ###
// ###
// ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4
// ###
// ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ###
// ###
// ###
// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
// ###
// ###
#ifndef AUX_H
#define AUX_H
#include <cuda_runtime.h>
#include <ctime>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <string>
#include <sstream>
// parameter processing
template<typename T>
bool getParam(std::string param, T &var, int argc, char **argv)
{
const char *c_param = param.c_str();
for(int i=argc-1; i>=1; i--)
{
if (argv[i][0]!='-') continue;
if (strcmp(argv[i]+1, c_param)==0)
{
if (!(i+1<argc)) continue;
std::stringstream ss;
ss << argv[i+1];
ss >> var;
return (bool)ss;
}
}
return false;
}
// opencv helpers
void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);
void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);
void showImage(std::string title, const cv::Mat &mat, int x, int y);
// adding Gaussian noise
void addNoise(cv::Mat &m, float sigma);
// measuring time
class Timer
{
public:
Timer() : tStart(0), running(false), sec(0.f)
{
}
void start()
{
tStart = clock();
running = true;
}
void end()
{
if (!running) { sec = 0; return; }
cudaDeviceSynchronize();
clock_t tEnd = clock();
sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
running = false;
}
float get()
{
if (running) end();
return sec;
}
private:
clock_t tStart;
bool running;
float sec;
};
// cuda error checking
#define CUDA_CHECK cuda_check(__FILE__,__LINE__)
void cuda_check(std::string file, int line);
#endif // AUX_H