Ugly GLUT Working Code

Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>

Ugly GLUT Working Code
Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>
55399ce1 · Gaurav Kukreja · 132598d0 · 55399ce1 · 55399ce1 · 55399ce1
Commit 55399ce1 authored Mar 25, 2014 by Gaurav Kukreja
Showing with 719 additions and 0 deletions

Makefile miklos/project_interop/Makefile +3 -0

aux.cu miklos/project_interop/aux.cu +146 -0

aux.h miklos/project_interop/aux.h +109 -0

main.cu miklos/project_interop/main.cu +461 -0

No files found.
--- a/miklos/project_interop/Makefile
+++ b/miklos/project_interop/Makefile
+main: main.cu aux.cu aux.h Makefile  # rebuilt when a source file or this Makefile changes
+	nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core -lopencv_imgproc -lGL -lglut
--- a/miklos/project_interop/aux.cu
+++ b/miklos/project_interop/aux.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#include "aux.h"
+#include <cstdlib>
+#include <iostream>
+using std::stringstream;
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::string;
+// parameter processing: template specialization for T=bool
+// Scans argv from the last argument down to argv[1] for "-<param>".
+// A switch that is the final argument, or that is followed by another "-..."
+// argument, sets var to true. Otherwise the following argument is extracted
+// into var with operator>>, and the return value tells whether that worked.
+// Returns false when the switch is not present at all.
+template<>
+bool getParam<bool>(std::string param, bool &var, int argc, char **argv)
+{
+    const char *wanted = param.c_str();
+    int idx = argc - 1;
+    while (idx >= 1)
+    {
+        const char *arg = argv[idx];
+        const bool isMatch = (arg[0] == '-') && (strcmp(arg + 1, wanted) == 0);
+        if (isMatch)
+        {
+            // a value is present only if something follows that is not itself a switch
+            const bool hasValue = (idx + 1 < argc) && (argv[idx + 1][0] != '-');
+            if (!hasValue)
+            {
+                var = true;
+                return true;
+            }
+            std::stringstream parser;
+            parser << argv[idx + 1];
+            parser >> var;
+            return (bool)parser;
+        }
+        idx--;
+    }
+    return false;
+}
+// opencv helpers
+// Converts a layered float image (one full plane per channel, planes stored
+// back to back) into an interleaved one. The channel order is reversed on the
+// way: plane c ends up in slot nc-1-c of every pixel.
+//   aOut  destination, w*h*nc floats, interleaved
+//   aIn   source, w*h*nc floats, layered
+//   w, h  image size in pixels
+//   nc    number of channels; nc==1 is a plain copy
+void convert_layered_to_interleaved(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Widen before multiplying: w*h evaluated in int can overflow for large images.
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[(nc-1-c) + nc*pixel] = aIn[pixel + nOmega*c];
+            }
+        }
+    }
+}
+// Fills mOut from the layered array aIn; width, height and channel count are
+// taken from mOut. NOTE(review): mOut.data is cast to float* without any check,
+// so this presumably expects a continuous CV_32F matrix - confirm at call sites.
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn)
+{
+    float *dst = (float*)mOut.data;
+    const int width = mOut.cols;
+    const int height = mOut.rows;
+    convert_layered_to_interleaved(dst, aIn, width, height, mOut.channels());
+}
+// Converts an interleaved float image into a layered one (one full plane per
+// channel, planes stored back to back). The channel order is reversed on the
+// way: slot nc-1-c of every pixel ends up in plane c.
+//   aOut  destination, w*h*nc floats, layered
+//   aIn   source, w*h*nc floats, interleaved
+//   w, h  image size in pixels
+//   nc    number of channels; nc==1 is a plain copy
+void convert_interleaved_to_layered(float *aOut, const float *aIn, int w, int h, int nc)
+{
+    // Widen before multiplying: w*h evaluated in int can overflow for large images.
+    const size_t nOmega = (size_t)w*h;
+    if (nc==1) { memcpy(aOut, aIn, nOmega*sizeof(float)); return; }
+    for (int y=0; y<h; y++)
+    {
+        for (int x=0; x<w; x++)
+        {
+            const size_t pixel = x + (size_t)w*y;
+            for (int c=0; c<nc; c++)
+            {
+                aOut[pixel + nOmega*c] = aIn[(nc-1-c) + nc*pixel];
+            }
+        }
+    }
+}
+// Writes the contents of mIn into the layered array aOut; width, height and
+// channel count are taken from mIn. NOTE(review): mIn.data is cast to float*
+// without any check, so this presumably expects a continuous CV_32F matrix.
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn)
+{
+    const float *src = (float*)mIn.data;
+    const int width = mIn.cols;
+    const int height = mIn.rows;
+    convert_interleaved_to_layered(aOut, src, width, height, mIn.channels());
+}
+// Shows mat in an auto-sized window called title and moves that window to
+// screen position (x, y).
+void showImage(string title, const cv::Mat &mat, int x, int y)
+{
+    const char *windowName = title.c_str();
+    // create first, then position, then draw
+    cv::namedWindow(windowName, CV_WINDOW_AUTOSIZE);
+    cvMoveWindow(windowName, x, y);
+    cv::imshow(windowName, mat);
+}
+// adding Gaussian noise
+// Returns one sample built from two rand() draws: sigma times
+// sqrt(-2 ln u1) * cos(2 pi u2). u1 is floored at 1e-6 so the logarithm stays finite.
+float noise(float sigma)
+{
+    const float u1 = (float)rand()/RAND_MAX;
+    const float u2 = (float)rand()/RAND_MAX;
+    const float radius = sqrtf(-2*log(std::max(u1,0.000001f)));
+    return sigma * radius*cosf(2*M_PI*u2);
+}
+// Adds an independent noise(sigma) sample to every float element of m
+// (cols * rows * channels elements, addressed through m.data).
+void addNoise(cv::Mat &m, float sigma)
+{
+    const size_t count = (size_t)m.cols*m.rows*m.channels();
+    float *p = (float*)m.data;
+    float *const stop = p + count;
+    while (p != stop)
+    {
+        *p += noise(sigma);
+        ++p;
+    }
+}
+// cuda error checking
+// Call site of the most recent check that passed; printed next to a later failure.
+string prev_file = "";
+int prev_line = 0;
+// Queries cudaGetLastError(). On success the call site (file, line) is remembered.
+// On failure the error string and code are printed together with the previous
+// successful call site, and the process exits with status 1.
+void cuda_check(string file, int line)
+{
+    const cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+    {
+        prev_file = file;
+        prev_line = line;
+        return;
+    }
+    cout << endl << file << ", line " << line << ": " << cudaGetErrorString(status) << " (" << status << ")" << endl;
+    if (prev_line>0) cout << "Previous CUDA call:" << endl << prev_file << ", line " << prev_line << endl;
+    exit(1);
+}
--- a/miklos/project_interop/aux.h
+++ b/miklos/project_interop/aux.h
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ### THIS FILE IS SUPPOSED TO REMAIN UNCHANGED
+// ###
+// ###
+#ifndef AUX_H
+#define AUX_H
+#include <cuda_runtime.h>
+#include <ctime>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <sstream>
+// parameter processing
+// Scans argv from the last argument down to argv[1] for "-<param>" and, when it
+// is followed by another argument, extracts that argument into var with
+// operator>>. Returns whether the extraction succeeded; returns false when the
+// switch is absent. A matching switch in last position has no value and is
+// skipped, so an earlier occurrence can still be used.
+template<typename T>
+bool getParam(std::string param, T &var, int argc, char **argv)
+{
+    const char *wanted = param.c_str();
+    int idx = argc - 1;
+    while (idx >= 1)
+    {
+        const char *arg = argv[idx];
+        const bool isMatch = (arg[0] == '-') && (strcmp(arg + 1, wanted) == 0);
+        const bool hasValue = (idx + 1 < argc);
+        if (isMatch && hasValue)
+        {
+            std::stringstream parser;
+            parser << argv[idx + 1];
+            parser >> var;
+            return (bool)parser;
+        }
+        idx--;
+    }
+    return false;
+}
+// opencv helpers (defined in aux.cu)
+void convert_mat_to_layered(float *aOut, const cv::Mat &mIn);  // interleaved cv::Mat -> layered float array
+void convert_layered_to_mat(cv::Mat &mOut, const float *aIn);  // layered float array -> interleaved cv::Mat
+void showImage(std::string title, const cv::Mat &mat, int x, int y);  // show mat in window "title", moved to (x, y)
+// adding Gaussian noise
+void addNoise(cv::Mat &m, float sigma);  // adds a noise(sigma) sample to every float element of m
+// measuring time
+// Stopwatch built on clock(). end() waits for the GPU with
+// cudaDeviceSynchronize() before it takes the stop reading.
+class Timer
+{
+public:
+    Timer() : tStart(0), running(false), sec(0.f) {}
+
+    // Takes the start reading and marks the timer as running.
+    void start()
+    {
+        tStart = clock();
+        running = true;
+    }
+
+    // Stops a running timer and stores the elapsed seconds.
+    // Calling end() on a timer that is not running resets the stored value to 0.
+    void end()
+    {
+        if (running)
+        {
+            cudaDeviceSynchronize();
+            const clock_t tEnd = clock();
+            sec = (float)(tEnd - tStart) / CLOCKS_PER_SEC;
+            running = false;
+        }
+        else
+        {
+            sec = 0;
+        }
+    }
+
+    // Returns the stored seconds, stopping the timer first if it is still running.
+    float get()
+    {
+        if (running) end();
+        return sec;
+    }
+
+private:
+    clock_t tStart;  // clock() reading taken by start()
+    bool running;    // true between start() and end()
+    float sec;       // elapsed seconds of the last completed measurement
+};
+// cuda error checking
+#define CUDA_CHECK cuda_check(__FILE__,__LINE__)  // used as a statement after a CUDA call: "cudaFoo(...);  CUDA_CHECK;"
+void cuda_check(std::string file, int line);  // defined in aux.cu: reports the last CUDA error with file/line and exits
+#endif  // AUX_H
--- a/miklos/project_interop/main.cu
+++ b/miklos/project_interop/main.cu
+// ###
+// ###
+// ### Practical Course: GPU Programming in Computer Vision
+// ###
+// ###
+// ### Technical University Munich, Computer Vision Group
+// ### Winter Semester 2013/2014, March 3 - April 4
+// ###
+// ###
+// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
+// ###
+// ###
+// ###
+// ###
+// ###
+// ### Miklos Homolya, miklos.homolya@tum.de, p056 
+// ### Ravikishore Kommajosyula, r.kommajosyula@tum.de, p057
+// ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058
+// ###
+// ###
+#define GL_GLEXT_PROTOTYPES
+#include <GL/glut.h>
+#include "cuda_gl_interop.h"
+#include "aux.h"
+#include <iostream>
+using namespace std;
+/************************************************************************
+ ***        GLOBAL VARIABLES                                        *****
+ ************************************************************************/
+int repeats;                   // "-repeats": number of computation repetitions (read in main)
+bool gray;                     // convert frames to grayscale; main() always sets this to true
+float lambda;                  // "-lambda": weight of S in the update step
+float tau;                     // "-tau": update coefficient (step size of the explicit Euler update)
+int N;                         // "-N": number of norm_grad/update iterations
+float c1;                      // "-c1": first constant in s(x) = (c1 - f(x))^2 - (c2 - f(x))^2
+float c2;                      // "-c2": second constant in s(x)
+cv::VideoCapture camera(0);    // capture device 0, constructed at static-initialisation time
+cv::Mat mIn;                   // current input frame / image
+int w;                         // image width in pixels (set in main from mIn)
+int h;                         // image height in pixels (set in main from mIn)
+int nc;                        // number of image channels (set in main from mIn)
+// CAMERA selects the live camera + OpenGL display path; comment the define out to select the still-image path
+#define CAMERA
+// Returns a when a < b, otherwise b. Usable from host and device code.
+template<typename T>
+__device__ __host__ T min(T a, T b)
+{
+    if (a < b)
+        return a;
+    return b;
+}
+// Returns a when a > b, otherwise b. Usable from host and device code.
+template<typename T>
+__device__ __host__ T max(T a, T b)
+{
+    if (a > b)
+        return a;
+    return b;
+}
+// Limits x to the interval [m; M]: the upper bound is applied first, then the
+// lower bound. Usable from host and device code.
+template<typename T>
+__device__ __host__ T clamp(T m, T x, T M)
+{
+    const T capped = min(x, M);
+    return max(m, capped);
+}
+/**
+ * Computes the normalized forward-difference gradient of U.
+ *
+ * One thread handles one pixel; the launch must be a 2D grid whose x/y extent
+ * covers w x h (threads outside the image return immediately). Differences
+ * that would reach past the right or bottom border are taken as 0, and
+ * FLT_EPSILON is added under the square root so the norm is never zero.
+ *
+ * @param U input image (single-channel)
+ * @param vx x-coordinate of result
+ * @param vy y-coordinate of result
+ * @param w width of image (pixels)
+ * @param h height of image (pixels)
+ */
+__global__ void norm_grad(float *U, float *vx, float *vy, int w, int h)
+{
+    const int x = threadIdx.x + blockDim.x * blockIdx.x;
+    const int y = threadIdx.y + blockDim.y * blockIdx.y;
+    if (x >= w || y >= h)
+        return;
+    const size_t idx = x + (size_t)w*y;
+    float ux = 0;
+    float uy = 0;
+    if (x+1 < w)
+        ux = U[idx + 1] - U[idx];
+    if (y+1 < h)
+        uy = U[idx + w] - U[idx];
+    const float gn = sqrtf(ux*ux + uy*uy + FLT_EPSILON);
+    vx[idx] = ux / gn;
+    vy[idx] = uy / gn;
+}
+/**
+ * nu (Greek letter) function penalizes being outside the interval [0; 1]:
+ * -2 below 0, +2 above 1, and 0 inside the interval.
+ */
+__device__ float nu(float u)
+{
+    float penalty = 0.f;
+    if (u < 0.f)
+        penalty = -2.f;
+    else if (u > 1.f)
+        penalty = +2.f;
+    return penalty;
+}
+/**
+ * Calculate s(x) = (c1 - f(x))^2 - (c2 - f(x))^2.
+ *
+ * One thread handles one pixel; the launch must be a 2D grid whose x/y extent
+ * covers w x h (threads outside the image do nothing).
+ *
+ * @param F original input image (single-channel)
+ * @param S result (single-channel)
+ * @param w width of image (pixels)
+ * @param h height of image (pixels)
+ * @param c1 first constant of s(x)
+ * @param c2 second constant of s(x)
+ */
+__global__ void calculate_S(float *F, float *S, int w, int h, float c1, float c2)
+{
+    const int x = threadIdx.x + blockDim.x * blockIdx.x;
+    const int y = threadIdx.y + blockDim.y * blockIdx.y;
+    if (x >= w || y >= h)
+        return;
+    const size_t idx = x + (size_t)w*y;
+    const float d1 = c1 - F[idx];
+    const float d2 = c2 - F[idx];
+    S[idx] = d1*d1 - d2*d2;
+}
+/**
+ * Update approximation.
+ *
+ * One thread handles one pixel; the launch must be a 2D grid whose x/y extent
+ * covers w x h (threads outside the image do nothing).
+ *
+ * @param output (CAMERA builds only) RGBA display buffer of at least w*h pixels;
+ *               receives U as gray values with the pixel order reversed
+ * @param U approximation of solution (single-channel), updated in place
+ * @param S update component from input image (single-channel)
+ * @param vx normalized gradient of U (x-coordinate)
+ * @param vy normalized gradient of U (y-coordinate)
+ * @param w width of image (pixels)
+ * @param h height of image (pixels)
+ * @param lambda weight of S
+ * @param alpha weight of nu
+ * @param tau update coefficient
+ */
+#ifdef CAMERA
+__global__ void update(uchar4* output, float *U, float *S, float *vx, float *vy,
+                       int w, int h, float lambda, float alpha, float tau)
+#else
+__global__ void update(float *U, float *S, float *vx, float *vy,
+                       int w, int h, float lambda, float alpha, float tau)
+#endif
+{
+    int x = threadIdx.x + blockDim.x * blockIdx.x;
+    int y = threadIdx.y + blockDim.y * blockIdx.y;
+    if (x < w && y < h) {
+        size_t i = x + (size_t)w*y;
+        // smoothness (functional derivative of energy): backward-difference
+        // divergence of (vx, vy), with v treated as 0 across the image border
+        float dx_vx = ((x+1 < w) ? vx[i] : 0) - ((x > 0) ? vx[i - 1] : 0);
+        float dy_vy = ((y+1 < h) ? vy[i] : 0) - ((y > 0) ? vy[i - w] : 0);
+        float div_v = dx_vx + dy_vy;
+        // explicit Euler update rule
+        float u = U[i] + tau * (div_v - lambda * S[i] - alpha * nu(U[i]));
+        U[i] = u;
+#ifdef CAMERA
+        // U can leave [0; 1] (nu only pushes it back on later iterations), and
+        // converting an out-of-range float to an 8-bit value is not well
+        // defined, so saturate before quantising.
+        const float shown = fminf(fmaxf(u, 0.f), 1.f);
+        const unsigned char level = (unsigned char)(shown * 255.f);
+        // reversed pixel order: the displayed image is rotated by 180 degrees;
+        // the pixel count is computed in size_t so w*h cannot overflow int
+        const size_t o = (size_t)w*h - i - 1;
+        output[o] = make_uchar4(level, level, level, 255);
+#endif
+    }
+}
+// Quotient of n by b rounded up, computed as (n + b - 1) / b.
+inline int div_ceil(int n, int b)
+{
+    const int padded = n + b - 1;
+    return padded / b;
+}
+// Number of blocks of size `block` needed to cover `whole` in each dimension
+// (rounded up per component).
+inline dim3 make_grid(dim3 whole, dim3 block)
+{
+    dim3 grid;
+    grid.x = div_ceil(whole.x, block.x);
+    grid.y = div_ceil(whole.y, block.y);
+    grid.z = div_ceil(whole.z, block.z);
+    return grid;
+}
+GLuint bufferObj;                  // OpenGL buffer object created in main() and registered with CUDA
+cudaGraphicsResource * resource;   // CUDA handle for bufferObj; mapped and unmapped in draw_func()
+#define HEIGHT 480                 // window and pixel buffer height in pixels
+#define WIDTH 640                  // window and pixel buffer width in pixels
+// GLUT keyboard callback: the Esc key (code 27) releases the CUDA/OpenGL
+// resources and exits the process; every other key is ignored.
+static void key_func( unsigned char key, int x, int y ) {
+    const unsigned char ESCAPE = 27;
+    if (key != ESCAPE)
+        return;
+    // clean up OpenGL and CUDA
+    cudaGraphicsUnregisterResource( resource );
+    glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
+    glDeleteBuffers( 1, &bufferObj );
+    exit(0);
+}
+#ifdef CAMERA
+/**
+ * GLUT display callback (CAMERA builds only, because it uses the update()
+ * overload that writes to the display buffer).
+ *
+ * Grabs one camera frame, converts it to float in [0; 1], runs N iterations of
+ * norm_grad/update on the GPU with the result written into the mapped OpenGL
+ * pixel buffer, draws that buffer and requests the next redraw.
+ *
+ * Uses the globals w, h, nc set in main(); exits the process if a frame cannot
+ * be read, if the frame format no longer matches them, or on a CUDA error.
+ */
+static void draw_func( void ) {
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+    // Get camera image
+    camera >> mIn;
+    if (mIn.empty()) {
+        // an empty frame would make the conversions below fail
+        cerr << "ERROR: Could not read frame from camera" << endl;
+        exit(1);
+    }
+    if(gray)
+        cvtColor(mIn, mIn, CV_BGR2GRAY);
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+    // All buffers below are sized from w, h, nc; a frame of another format would overrun them.
+    if (mIn.cols != w || mIn.rows != h || mIn.channels() != nc) {
+        cerr << "ERROR: camera frame format changed" << endl;
+        exit(1);
+    }
+    const size_t nPixels = (size_t)w*h;
+    const size_t imageBytes = nPixels*nc*sizeof(float);
+    const size_t planeBytes = nPixels*sizeof(float);
+    // map the OpenGL pixel buffer so that the kernel can write the result into it
+    uchar4* d_output;
+    size_t size;
+    cudaGraphicsMapResources (1, &resource, NULL);  CUDA_CHECK;
+    cudaGraphicsResourceGetMappedPointer( (void**) &d_output, &size, resource);  CUDA_CHECK;
+    if (size < nPixels*sizeof(uchar4)) {
+        // update() writes one uchar4 per pixel
+        cerr << "ERROR: display buffer is smaller than the camera frame" << endl;
+        exit(1);
+    }
+    // Init raw input image array
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    float *imgIn = new float[nPixels*nc];
+    convert_mat_to_layered (imgIn, mIn);
+    dim3 block(32, 16);
+    dim3 grid = make_grid(dim3(w, h, 1), block);
+    Timer timer; timer.start();
+    float *d_U, *d_S, *d_vx, *d_vy;
+    cudaMalloc(&d_U, imageBytes);  CUDA_CHECK;
+    cudaMalloc(&d_S, imageBytes);  CUDA_CHECK;
+    cudaMalloc(&d_vx, imageBytes);  CUDA_CHECK;
+    cudaMalloc(&d_vy, imageBytes);  CUDA_CHECK;
+    cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice);  CUDA_CHECK;
+    // the host copy is no longer needed; it used to be leaked on every frame
+    delete[] imgIn;
+    // d_S needs no upload: calculate_S writes every pixel of it
+    calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2);  CUDA_CHECK;
+    // S is single-channel: read back exactly one plane (the host buffer holds w*h floats)
+    float *S = new float[nPixels];
+    cudaMemcpy(S, d_S, planeBytes, cudaMemcpyDeviceToHost);  CUDA_CHECK;
+    float S_max = 0.0f;
+    for (size_t i = 0; i < nPixels; i++)
+        S_max = max(S_max, fabs(S[i]));  // TODO: CPU thing
+    delete[] S;
+    float alpha = 0.5f * lambda * S_max;
+    for (int n = 0; n < N; n++) {
+        norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
+        update<<< grid, block >>>(d_output, d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
+    }
+    CUDA_CHECK;  // reports a failed kernel launch from the loop above
+    cudaGraphicsUnmapResources(1, &resource, NULL);  CUDA_CHECK;
+    cudaFree(d_U);
+    cudaFree(d_S);
+    cudaFree(d_vx);
+    cudaFree(d_vy);
+    timer.end();  float t = timer.get();  // elapsed time in seconds
+    cout << "time: " << t*1000 << " ms" << endl;
+    // show input image
+    // showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
+    // the pixel buffer is still bound as GL_PIXEL_UNPACK_BUFFER (see main), so
+    // the last argument is an offset into it rather than a host pointer
+    glDrawPixels( WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
+    glutSwapBuffers();
+    glutPostRedisplay();
+}
+#endif  // CAMERA
+/**
+ * Program entry point.
+ *
+ * With CAMERA defined: sets up GLUT, the OpenGL pixel buffer and its CUDA
+ * registration, reads the parameters, opens the camera and hands control to
+ * glutMainLoop(); frames are then processed in draw_func().
+ * Without CAMERA: loads the image given by "-i", runs N iterations of
+ * norm_grad/update on the GPU, then displays and saves input and result.
+ *
+ * Optional parameters: -repeats, -lambda, -tau, -N, -c1, -c2.
+ *
+ * @return 0 on success, 1 on a usage, camera or image error
+ */
+int main(int argc, char **argv)
+{
+#ifdef CAMERA
+    cudaGLSetGLDevice(0);   CUDA_CHECK;
+    // these GLUT calls need to be made before the other GL calls
+    glutInit( &argc, argv );
+    glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
+    glutInitWindowSize( WIDTH, HEIGHT );
+    glutCreateWindow( "bitmap" );
+    // pixel buffer with one RGBA byte quadruple per window pixel, shared with CUDA
+    glGenBuffers(1, &bufferObj);
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
+    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, WIDTH * HEIGHT * 4, NULL, GL_DYNAMIC_DRAW_ARB);
+    cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone);  CUDA_CHECK;
+#endif
+    // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
+    // This happens on the very first call to a CUDA function, and takes some time (around half a second)
+    // We will do it right here, so that the run time measurements are accurate
+    cudaDeviceSynchronize();  CUDA_CHECK;
+    // Reading command line parameters:
+    // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
+    // If "-param" is not specified, the value of "var" remains unchanged
+    //
+    // return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
+#ifndef CAMERA
+    // input image
+    if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
+    string image = "";
+    bool ret = getParam("i", image, argc, argv);
+    // without an image there is nothing to process, so stop here
+    if (!ret) { cerr << "ERROR: no image specified" << endl; return 1; }
+#endif
+    // number of computation repetitions to get a better run time measurement
+    repeats = 1;
+    getParam("repeats", repeats, argc, argv);
+    cout << "repeats: " << repeats << endl;
+    // load the input image as grayscale if "-gray" is specified
+    gray = true;
+    // always true: getParam("gray", gray, argc, argv);
+    cout << "gray: " << gray << endl;
+    // ### Define your own parameters here as needed
+    lambda = 0.8;
+    getParam("lambda", lambda, argc, argv);
+    cout << "λ: " << lambda << endl;
+    tau = 0.01;
+    getParam("tau", tau, argc, argv);
+    cout << "τ: " << tau << endl;
+    N = 2000;
+    getParam("N", N, argc, argv);
+    cout << "N: " << N << endl;
+    c1 = 0.65;
+    getParam("c1", c1, argc, argv);
+    cout << "c1: " << c1 << endl;
+    c2 = 0.00;
+    getParam("c2", c2, argc, argv);
+    cout << "c2: " << c2 << endl;
+    // Init camera / Load input image
+#ifdef CAMERA
+    // Init camera
+    if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
+    // request frames of the same size as the window and the pixel buffer
+    camera.set(CV_CAP_PROP_FRAME_WIDTH,WIDTH);
+    camera.set(CV_CAP_PROP_FRAME_HEIGHT,HEIGHT);
+    // read in first frame to get the dimensions
+    camera >> mIn;
+    if (mIn.empty()) { cerr << "ERROR: Could not read frame from camera" << endl; return 1; }
+    if(gray)
+        cvtColor(mIn, mIn, CV_BGR2GRAY);
+#else
+    // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
+    mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+    // check
+    if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
+#endif
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+    // get image dimensions
+    w = mIn.cols;         // width
+    h = mIn.rows;         // height
+    nc = mIn.channels();  // number of channels
+    cout << "image: " << w << " x " << h << endl;
+#ifdef CAMERA
+    // The update kernel writes w*h pixels into a buffer of WIDTH*HEIGHT pixels,
+    // and the camera is free to ignore the requested frame size.
+    if (w != WIDTH || h != HEIGHT) {
+        cerr << "ERROR: camera delivers " << w << " x " << h << " frames, but the display buffer is "
+             << WIDTH << " x " << HEIGHT << endl;
+        return 1;
+    }
+#endif
+    // Set the output image format
+    cv::Mat mOut(h,w,mIn.type());  // mOut will have the same number of channels as the input image, nc layers
+    // ### Define your own output images here as needed
+    // For camera mode: Make a loop to read in camera frames
+#ifdef CAMERA
+    glutKeyboardFunc (key_func);
+    glutDisplayFunc (draw_func);
+    glutMainLoop();
+#else
+    // Allocate arrays
+    // input/output image width: w
+    // input/output image height: h
+    // input image number of channels: nc
+    // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
+    const size_t nPixels = (size_t)w*h;
+    // allocate raw input image array
+    float *imgIn  = new float[nPixels*nc];
+    size_t imageBytes = nPixels*nc*sizeof(float);
+    size_t planeBytes = nPixels*sizeof(float);
+    // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
+    float *imgOut = new float[nPixels*mOut.channels()];
+    // Init raw input image array
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    convert_mat_to_layered (imgIn, mIn);
+    dim3 block(32, 16);
+    dim3 grid = make_grid(dim3(w, h, 1), block);
+    Timer timer; timer.start();
+    float *d_U, *d_S, *d_vx, *d_vy;
+    cudaMalloc(&d_U, imageBytes);  CUDA_CHECK;
+    cudaMalloc(&d_S, imageBytes);  CUDA_CHECK;
+    cudaMalloc(&d_vx, imageBytes);  CUDA_CHECK;
+    cudaMalloc(&d_vy, imageBytes);  CUDA_CHECK;
+    cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice);  CUDA_CHECK;
+    // d_S needs no upload: calculate_S writes every pixel of it
+    calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2);  CUDA_CHECK;
+    // S is single-channel: read back exactly one plane (the host buffer holds w*h floats)
+    float *S = new float[nPixels];
+    cudaMemcpy(S, d_S, planeBytes, cudaMemcpyDeviceToHost);  CUDA_CHECK;
+    float S_max = 0.0f;
+    for (size_t i = 0; i < nPixels; i++)
+        S_max = max(S_max, fabs(S[i]));  // TODO: CPU thing
+    delete[] S;
+    float alpha = 0.5f * lambda * S_max;
+    for (int n = 0; n < N; n++) {
+        norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
+        update<<< grid, block >>>(d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
+    }
+    CUDA_CHECK;  // reports a failed kernel launch from the loop above
+    cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);  CUDA_CHECK;
+    cudaFree(d_U);
+    cudaFree(d_S);
+    cudaFree(d_vx);
+    cudaFree(d_vy);
+    timer.end();  float t = timer.get();  // elapsed time in seconds
+    cout << "time: " << t*1000 << " ms" << endl;
+    // show input image
+    showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
+    // show output image: first convert to interleaved opencv format from the layered raw array
+    convert_layered_to_mat(mOut, imgOut);
+    showImage("Output", mOut, 100+w+40, 100);
+    // ### Display your own output images here as needed
+    // wait for key inputs
+    cv::waitKey(0);
+    // save input and result
+    cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
+    cv::imwrite("image_result.png",mOut*255.f);
+    // free allocated arrays
+    delete[] imgIn;
+    delete[] imgOut;
+#endif
+    // close all opencv windows
+    cvDestroyAllWindows();
+    return 0;
+}