interoparability with primal dual - fast execution

6cf20bce · Ravi · 20663a73 · 6cf20bce
Commit 6cf20bce authored Mar 27, 2014 by Ravi
Hide whitespace changes
Inline Side-by-side

Showing with 430 additions and 461 deletions

main.cu miklos/project_interop/main.cu +430 -461

No files found.
--- a/miklos/project_interop/main.cu
+++ b/miklos/project_interop/main.cu
 // ###
 // ###
 // ### Practical Course: GPU Programming in Computer Vision
 // ###
 // ###
 // ### Technical University Munich, Computer Vision Group
 // ### Winter Semester 2013/2014, March 3 - April 4
 // ###
 // ###
 // ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
 // ###
 // ###
 // ###
 // ###
 // ###
-// ### Miklos Homolya, miklos.homolya@tum.de, p056 
+// ### Miklos Homolya, miklos.homolya@tum.de, p056
 // ### Ravikishore Kommajosyula, r.kommajosyula@tum.de, p057
 // ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058
 // ###
 // ###
 #define GL_GLEXT_PROTOTYPES
 #include <GL/glut.h>
 #include "cuda_gl_interop.h"
 #include "aux.h"
 #include <iostream>
 using namespace std;
 /************************************************************************
 ***        GLOBAL VARIABLES                                        *****
 ************************************************************************/
 int repeats;
 bool gray;
 float lambda;
 float tau;
 int N;
 float c1;
 float c2;
+float sigma;
-cv::VideoCapture camera(0);
-cv::Mat mIn;
+cv::VideoCapture camera(0);
+cv::Mat mIn;
-int w;
-int h;
+int w;
-int nc;
+int h;
+int nc;
-// uncomment to use the camera
-#define CAMERA
+// uncomment to use the camera
+#define CAMERA
-template<typename T>
-__device__ __host__ T min(T a, T b)
+template<typename T>
-{
+__device__ __host__ T min(T a, T b)
-    return (a < b) ? a : b;
+{
-}
+    return (a < b) ? a : b;
+}
-template<typename T>
-__device__ __host__ T max(T a, T b)
+template<typename T>
-{
+__device__ __host__ T max(T a, T b)
-    return (a > b) ? a : b;
+{
-}
+    return (a > b) ? a : b;
+}
-template<typename T>
-__device__ __host__ T clamp(T m, T x, T M)
+template<typename T>
-{
+__device__ __host__ T clamp(T m, T x, T M)
-    return max(m, min(x, M));
+{
-}
+    return max(m, min(x, M));
+}
-/**
- * Computes the normalized gradient.
+__global__ void calculate_F(float *U, float *F, int w, int h, float c1, float c2, float lambda)
- *
+{
- * @param U input image (single-channel)
+    int x = threadIdx.x + blockDim.x * blockIdx.x;
- * @param vx x-coordinate of result
+    int y = threadIdx.y + blockDim.y * blockIdx.y;
- * @param vy y-coordinate of result
+    if (x < w && y < h) {
- * @param w width of image (pixels)
+        size_t i = x + (size_t)w*y;
- * @param h height of image (pixels)
+        F[i] = lambda * ((c1 - U[i])*(c1 - U[i]) - (c2 - U[i])*(c2 - U[i]));
- */
+    }
-__global__ void norm_grad(float *U, float *vx, float *vy, int w, int h)
+}
-{
-    int x = threadIdx.x + blockDim.x * blockIdx.x;
+__device__ float diff_i(float *M, int w, int h, int x, int y)
-    int y = threadIdx.y + blockDim.y * blockIdx.y;
+{
-    if (x < w && y < h) {
+    size_t i = x + (size_t)w*y;
-        size_t i = x + (size_t)w*y;
+    return (x+1 < w) ? (M[i + 1] - M[i]) : 0.f;
-        float ux = ((x+1 < w) ? (U[i + 1] - U[i]) : 0);
+}
-        float uy = ((y+1 < h) ? (U[i + w] - U[i]) : 0);
-        float gn = sqrtf(ux*ux + uy*uy + FLT_EPSILON);
+__device__ float diff_j(float *M, int w, int h, int x, int y)
-        vx[i] = ux / gn;
+{
-        vy[i] = uy / gn;
+    size_t i = x + (size_t)w*y;
-    }
+    return (y+1 < h) ? (M[i + w] - M[i]) : 0.f;
 }
-/**
+__global__ void update_Xij(float *Xi, float *Xj, float *T, float *U, int w, int h, float sigma)
- * nu (Greek letter) function penalizes being outside the interval [0; 1].
+{
- */
+    int x = threadIdx.x + blockDim.x * blockIdx.x;
-__device__ float nu(float u)
+    int y = threadIdx.y + blockDim.y * blockIdx.y;
-{
+    if (x < w && y < h) {
-    if (u < 0.f)
+        size_t i = x + (size_t)w*y;
-        return -2.f;
+        float xi = Xi[i] - sigma * (2 * diff_i(U, w, h, x, y) - diff_i(T, w, h, x, y));
-    if (u > 1.f)
+        float xj = Xj[i] - sigma * (2 * diff_j(U, w, h, x, y) - diff_j(T, w, h, x, y));
-        return +2.f;
+        float dn = max(1.f, sqrtf(xi*xi + xj*xj));
-    return 0.f;
+        Xi[i] = xi / dn;
-}
+        Xj[i] = xj / dn;
+    }
-/**
+}
- * Calculate s(x) = (c1 - f(x))^2 - (c2 - f(x))^2.
- *
+__device__ float divergence(float *X, float *Y, int w, int h, int x, int y)
- * @param F original input image (single-channel)
+{
- * @param S result (single-channel)
+    size_t i = x + (size_t)w*y;
- * @param w width of image (pixels)
+    float dx_x = ((x+1 < w) ? X[i] : 0.f) - ((x > 0) ? X[i - 1] : 0.f);
- * @param h height of image (pixels)
+    float dy_y = ((y+1 < h) ? Y[i] : 0.f) - ((y > 0) ? Y[i - w] : 0.f);
- */
+    return dx_x + dy_y;
-__global__ void calculate_S(float *F, float *S, int w, int h, float c1, float c2)
+}
-{
-    int x = threadIdx.x + blockDim.x * blockIdx.x;
+__global__ void update_U(uchar4* output, float *T, float *Xi, float *Xj, float *F, float *U, int w, int h, float tau)
-    int y = threadIdx.y + blockDim.y * blockIdx.y;
+{
-    if (x < w && y < h) {
+    int x = threadIdx.x + blockDim.x * blockIdx.x;
-        size_t i = x + (size_t)w*y;
+    int y = threadIdx.y + blockDim.y * blockIdx.y;
-        S[i] = (c1 - F[i])*(c1 - F[i]) - (c2 - F[i])*(c2 - F[i]);
+    if (x < w && y < h) {
-    }
+        size_t i = x + (size_t)w*y;
-}
+        U[i] = clamp(0.f, T[i] - tau * (divergence(Xi, Xj, w, h, x, y) + F[i]), 1.f);
+        uchar temp_res = (uchar)(U[i] * 255.f);
-/**
+        output[w*h-i-1].x = temp_res;
- * Update approximation.
+        output[w*h-i-1].y = temp_res;
- *
+        output[w*h-i-1].z = temp_res;
- * @param U approximation of solution (single-channel)
+        output[w*h-i-1].w = 255;
- * @param S update component from input image (single-channel)
- * @param vx normalized gradient of U (x-coordinate)
+    }
- * @param vy normalized gradient of U (y-coordinate)
+}
- * @param w width of image (pixels)
- * @param h height of image (pixels)
+inline int div_ceil(int n, int b) { return (n + b - 1) / b; }
- * @param lambda weight of S
- * @param alpha weight of nu
+inline dim3 make_grid(dim3 whole, dim3 block)
- * @param tau update coefficient
+{
- */
+    return dim3(div_ceil(whole.x, block.x),
-#ifdef CAMERA
+                div_ceil(whole.y, block.y),
-__global__ void update(uchar4* output, float *U, float *S, float *vx, float *vy,
+                div_ceil(whole.z, block.z));
-                       int w, int h, float lambda, float alpha, float tau)
+}
-#else
+GLuint bufferObj;
-__global__ void update(float *U, float *S, float *vx, float *vy,
+cudaGraphicsResource * resource;
-                       int w, int h, float lambda, float alpha, float tau)
-#endif
+#define HEIGHT 480
-{
+#define WIDTH 640
-    int x = threadIdx.x + blockDim.x * blockIdx.x;
-    int y = threadIdx.y + blockDim.y * blockIdx.y;
+static void key_func( unsigned char key, int x, int y ) {
-    if (x < w && y < h) {
+    switch (key) {
-        size_t i = x + (size_t)w*y;
+        case 27:
+        // clean up OpenGL and CUDA
-        // smoothness (functional derivative of energy)
-        float dx_vx = ((x+1 < w) ? vx[i] : 0) - ((x > 0) ? vx[i - 1] : 0);
+        cudaGraphicsUnregisterResource( resource );
-        float dy_vy = ((y+1 < h) ? vy[i] : 0) - ((y > 0) ? vy[i - w] : 0);
+        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
-        float div_v = dx_vx + dy_vy;
+        glDeleteBuffers( 1, &bufferObj );
+        exit(0);
-        // explicit Euler update rule
+    }
-        U[i] += tau * (div_v - lambda * S[i] - alpha * nu(U[i]));
+}
-#ifdef CAMERA
-        output[w*h-i-1].x = (uchar)(U[i] * 255.f);
+static void draw_func( void ) {
-        output[w*h-i-1].y = output[w*h-i-1].x;
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
-        output[w*h-i-1].z = output[w*h-i-1].x;
-        output[w*h-i-1].w = 255;
+    // Get camera image
-#endif
+    camera >> mIn;
-    }
+    if(gray)
-}
+        cvtColor(mIn, mIn, CV_BGR2GRAY);
+    // convert to float representation (opencv loads image values as single bytes by default)
-inline int div_ceil(int n, int b) { return (n + b - 1) / b; }
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
-inline dim3 make_grid(dim3 whole, dim3 block)
+    mIn /= 255.f;
-{
-    return dim3(div_ceil(whole.x, block.x),
+    uchar4* d_output;
-                div_ceil(whole.y, block.y),
+    size_t size;
-                div_ceil(whole.z, block.z));
-}
+    // allocate raw input image array
+    float *imgIn  = new float[(size_t)w*h*nc];
-GLuint bufferObj;
+    size_t imageBytes = (size_t)w*h*nc*sizeof(float);
-cudaGraphicsResource * resource;
+    cudaGraphicsMapResources (1, &resource, NULL);
-#define HEIGHT 480
+    cudaGraphicsResourceGetMappedPointer( (void**) &d_output, &size, resource);
-#define WIDTH 640
+    // Init raw input image array
-static void key_func( unsigned char key, int x, int y ) {
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
-    switch (key) {
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
-        case 27:
+    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
-        // clean up OpenGL and CUDA
+    convert_mat_to_layered (imgIn, mIn);
-        cudaGraphicsUnregisterResource( resource );
+    dim3 block(32, 16);
-        glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
+    dim3 grid = make_grid(dim3(w, h, 1), block);
-        glDeleteBuffers( 1, &bufferObj );
-        exit(0);
+    Timer timer; timer.start();
-    } 
+    float *d_T, *d_U, *d_F, *d_Xi, *d_Xj;
-}
+    cudaMalloc(&d_T, imageBytes);
+    cudaMalloc(&d_U, imageBytes);
-static void draw_func( void ) {
+    cudaMalloc(&d_F, imageBytes);
-    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+    cudaMalloc(&d_Xi, imageBytes);
+    cudaMalloc(&d_Xj, imageBytes);
-    // Get camera image
+    cudaMemcpy(d_T, imgIn, imageBytes, cudaMemcpyHostToDevice);
-    camera >> mIn;
+    cudaMemcpy(d_U, d_T, imageBytes, cudaMemcpyDeviceToDevice);
-    if(gray)
+    cudaMemset(d_Xi, 0, imageBytes);
-        cvtColor(mIn, mIn, CV_BGR2GRAY);
+    cudaMemset(d_Xj, 0, imageBytes);
-    // convert to float representation (opencv loads image values as single bytes by default)
-    mIn.convertTo(mIn,CV_32F);
+    calculate_F<<< grid, block >>>(d_U, d_F, w, h, c1, c2, lambda);
-    // convert range of each channel to [0,1] (opencv default is [0,255])
-    mIn /= 255.f;
+    for (int n = 0; n < N; n++) {
+        update_Xij<<< grid, block >>>(d_Xi, d_Xj, d_T, d_U, w, h, sigma);
-    uchar4* d_output;
+        std::swap(d_U, d_T);
-    size_t size;
+        update_U<<< grid, block >>>(d_output, d_T, d_Xi, d_Xj, d_F, d_U, w, h, tau);
+    }
-    // allocate raw input image array   
-    float *imgIn  = new float[(size_t)w*h*nc];
+   // cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);
-    size_t imageBytes = (size_t)w*h*nc*sizeof(float);
+    cudaGraphicsUnmapResources(1, &resource, NULL);
+    cudaFree(d_T);
-    cudaGraphicsMapResources (1, &resource, NULL);
+    cudaFree(d_U);
-    cudaGraphicsResourceGetMappedPointer( (void**) &d_output, &size, resource);
+    cudaFree(d_F);
+    cudaFree(d_Xi);
-    // Init raw input image array
+    cudaFree(d_Xj);
-    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
-    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    timer.end();  float t = timer.get();  // elapsed time in seconds
-    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    cout << "time: " << t*1000 << " ms" << endl;
-    convert_mat_to_layered (imgIn, mIn);
+    // show input image
-    dim3 block(32, 16);
+    // showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
-    dim3 grid = make_grid(dim3(w, h, 1), block);
+    glDrawPixels( WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
-    Timer timer; timer.start();
+    glutSwapBuffers();
-    float *d_U, *d_S, *d_vx, *d_vy;
+    glutPostRedisplay();
-    cudaMalloc(&d_U, imageBytes);
+}
-    cudaMalloc(&d_S, imageBytes);
-    cudaMalloc(&d_vx, imageBytes);
+int main(int argc, char **argv)
-    cudaMalloc(&d_vy, imageBytes);
+{
-    cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice);
+#ifdef CAMERA
-    cudaMemcpy(d_S, imgIn, imageBytes, cudaMemcpyHostToDevice);
+    cudaGLSetGLDevice(0);   CUDA_CHECK;
-    calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2);
+    // these GLUT calls need to be made before the other GL calls
-    float *S = new float[(size_t)w*h];
+     glutInit( &argc, argv );
-    cudaMemcpy(S, d_S, imageBytes, cudaMemcpyDeviceToHost);
+     glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
-    float S_max = 0.0;
+     glutInitWindowSize( WIDTH, HEIGHT );
-    for (size_t i = 0; i < (size_t)w*h; i++)
+     glutCreateWindow( "bitmap" );
-        S_max = max(S_max, fabs(S[i]));  // TODO: CPU thing
-    delete[] S;
+     glGenBuffers(1, &bufferObj);
-    float alpha = 0.5 * lambda * S_max;
+     glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
+     glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, WIDTH * HEIGHT * 4, NULL, GL_DYNAMIC_DRAW_ARB);
-    for (int n = 0; n < N; n++) {
+     cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone);
-        norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
+#endif
-        update<<< grid, block >>>(d_output, d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
-    }
+    // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
+    // This happens on the very first call to a CUDA function, and takes some time (around half a second)
-    cudaGraphicsUnmapResources(1, &resource, NULL);
+    // We will do it right here, so that the run time measurements are accurate
-    cudaFree(d_U);
+    cudaDeviceSynchronize();  CUDA_CHECK;
-    cudaFree(d_S);
-    cudaFree(d_vx);
+    // Reading command line parameters:
-    cudaFree(d_vy);
+    // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
-    timer.end();  float t = timer.get();  // elapsed time in seconds
+    // If "-param" is not specified, the value of "var" remains unchanged
-    cout << "time: " << t*1000 << " ms" << endl;
+    //
+    // return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
-    // show input image
-    // showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
+#ifdef CAMERA
+#else
-    glDrawPixels( WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
+    // input image
-    glutSwapBuffers();
+    string image = "";
-    glutPostRedisplay();
+    bool ret = getParam("i", image, argc, argv);
-}
+    if (!ret) cerr << "ERROR: no image specified" << endl;
+    if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
-int main(int argc, char **argv)
+#endif
-{
-#ifdef CAMERA
+    // number of computation repetitions to get a better run time measurement
-    cudaGLSetGLDevice(0);   CUDA_CHECK;
+    repeats = 1;
+    getParam("repeats", repeats, argc, argv);
-    // these GLUT calls need to be made before the other GL calls
+    cout << "repeats: " << repeats << endl;
-     glutInit( &argc, argv );
-     glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
+    // load the input image as grayscale if "-gray" is specifed
-     glutInitWindowSize( WIDTH, HEIGHT );
+    gray = true;
-     glutCreateWindow( "bitmap" );
+    // always true: getParam("gray", gray, argc, argv);
+    cout << "gray: " << gray << endl;
-     glGenBuffers(1, &bufferObj);
-     glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
+    // ### Define your own parameters here as needed
-     glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, WIDTH * HEIGHT * 4, NULL, GL_DYNAMIC_DRAW_ARB);
+    lambda = 1.0;
-     cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone);
+    getParam("lambda", lambda, argc, argv);
-#endif    
+    cout << "λ: " << lambda << endl;
-    // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
+    sigma = 0.4;
-    // This happens on the very first call to a CUDA function, and takes some time (around half a second)
+    getParam("sigma", sigma, argc, argv);
-    // We will do it right here, so that the run time measurements are accurate
+    cout << "σ: " << sigma << endl;
-    cudaDeviceSynchronize();  CUDA_CHECK;
+    tau = 0.4;
-    // Reading command line parameters:
+    getParam("tau", tau, argc, argv);
-    // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
+    cout << "τ: " << tau << endl;
-    // If "-param" is not specified, the value of "var" remains unchanged
-    //
+    N = 160;
-    // return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
+    getParam("N", N, argc, argv);
+    cout << "N: " << N << endl;
-#ifdef CAMERA
-#else
+    c1 = 1.0;
-    // input image
+    getParam("c1", c1, argc, argv);
-    string image = "";
+    cout << "c1: " << c1 << endl;
-    bool ret = getParam("i", image, argc, argv);
-    if (!ret) cerr << "ERROR: no image specified" << endl;
+    c2 = 0.00;
-    if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
+    getParam("c2", c2, argc, argv);
-#endif
+    cout << "c2: " << c2 << endl;
-    // number of computation repetitions to get a better run time measurement
+    // Init camera / Load input image
-    repeats = 1;
+#ifdef CAMERA
-    getParam("repeats", repeats, argc, argv);
-    cout << "repeats: " << repeats << endl;
+    // Init camera
+  	if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
-    // load the input image as grayscale if "-gray" is specifed
+    int camW = 640;
-    gray = true;
+    int camH = 480;
-    // always true: getParam("gray", gray, argc, argv);
+  	camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
-    cout << "gray: " << gray << endl;
+  	camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
+    // read in first frame to get the dimensions
-    // ### Define your own parameters here as needed    
-    lambda = 0.8;
+    camera >> mIn;
-    getParam("lambda", lambda, argc, argv);
+    if(gray)
-    cout << "λ: " << lambda << endl;
+        cvtColor(mIn, mIn, CV_BGR2GRAY);
-    tau = 0.01;
+#else
-    getParam("tau", tau, argc, argv);
-    cout << "τ: " << tau << endl;
+    // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
+    mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
-    N = 2000;
+    // check
-    getParam("N", N, argc, argv);
+    if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
-    cout << "N: " << N << endl;
+#endif
-    c1 = 0.65;
-    getParam("c1", c1, argc, argv);
+    // convert to float representation (opencv loads image values as single bytes by default)
-    cout << "c1: " << c1 << endl;
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
-    c2 = 0.00;
+    mIn /= 255.f;
-    getParam("c2", c2, argc, argv);
+    // get image dimensions
-    cout << "c2: " << c2 << endl;
+    w = mIn.cols;         // width
+    h = mIn.rows;         // height
-    // Init camera / Load input image
+    nc = mIn.channels();  // number of channels
-#ifdef CAMERA
+    cout << "image: " << w << " x " << h << endl;
-    // Init camera
+    // Set the output image format
-  	if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
+    cv::Mat mOut(h,w,mIn.type());  // mOut will have the same number of channels as the input image, nc layers
-    int camW = 640;
+    // ### Define your own output images here as needed
-    int camH = 480;
-  	camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
-  	camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
-    // read in first frame to get the dimensions
+    // For camera mode: Make a loop to read in camera frames
+#ifdef CAMERA
-    camera >> mIn;
+    glutKeyboardFunc (key_func);
-    if(gray)
+    glutDisplayFunc (draw_func);
-        cvtColor(mIn, mIn, CV_BGR2GRAY);
+    glutMainLoop();
+#else
-#else
+    // Allocate arrays
-    // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
+    // input/output image width: w
-    mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+    // input/output image height: h
-    // check
+    // input image number of channels: nc
-    if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
+    // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
-#endif
+    // allocate raw input image array
+    float *imgIn  = new float[(size_t)w*h*nc];
-    // convert to float representation (opencv loads image values as single bytes by default)
+    size_t imageBytes = (size_t)w*h*nc*sizeof(float);
-    mIn.convertTo(mIn,CV_32F);
-    // convert range of each channel to [0,1] (opencv default is [0,255])
+    // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
-    mIn /= 255.f;
+    float *imgOut = new float[(size_t)w*h*mOut.channels()];
-    // get image dimensions
-    w = mIn.cols;         // width
+    // Init raw input image array
-    h = mIn.rows;         // height
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
-    nc = mIn.channels();  // number of channels
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
-    cout << "image: " << w << " x " << h << endl;
+    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    convert_mat_to_layered (imgIn, mIn);
-    // Set the output image format
-    cv::Mat mOut(h,w,mIn.type());  // mOut will have the same number of channels as the input image, nc layers
+    dim3 block(32, 16);
-    // ### Define your own output images here as needed
+    dim3 grid = make_grid(dim3(w, h, 1), block);
+    Timer timer; timer.start();
+    float *d_U, *d_S, *d_vx, *d_vy;
-    // For camera mode: Make a loop to read in camera frames
+    cudaMalloc(&d_U, imageBytes);
-#ifdef CAMERA
+    cudaMalloc(&d_S, imageBytes);
-    glutKeyboardFunc (key_func);
+    cudaMalloc(&d_vx, imageBytes);
-    glutDisplayFunc (draw_func);
+    cudaMalloc(&d_vy, imageBytes);
-    glutMainLoop();
+    cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice);
-#else
+    cudaMemcpy(d_S, imgIn, imageBytes, cudaMemcpyHostToDevice);
-    // Allocate arrays
+    calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2);
-    // input/output image width: w
+    float *S = new float[(size_t)w*h];
-    // input/output image height: h
+    cudaMemcpy(S, d_S, imageBytes, cudaMemcpyDeviceToHost);
-    // input image number of channels: nc
+    float S_max = 0.0;
-    // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
+    for (size_t i = 0; i < (size_t)w*h; i++)
+        S_max = max(S_max, fabs(S[i]));  // TODO: CPU thing
-    // allocate raw input image array
+    delete[] S;
-    float *imgIn  = new float[(size_t)w*h*nc];
+    float alpha = 0.5 * lambda * S_max;
-    size_t imageBytes = (size_t)w*h*nc*sizeof(float);
+    for (int n = 0; n < N; n++) {
-    // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
+        norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
-    float *imgOut = new float[(size_t)w*h*mOut.channels()];
+        update<<< grid, block >>>(d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
+    }
-    // Init raw input image array
-    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+    cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);
-    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    cudaFree(d_U);
-    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    cudaFree(d_S);
-    convert_mat_to_layered (imgIn, mIn);
+    cudaFree(d_vx);
+    cudaFree(d_vy);
-    dim3 block(32, 16);
+    timer.end();  float t = timer.get();  // elapsed time in seconds
-    dim3 grid = make_grid(dim3(w, h, 1), block);
+    cout << "time: " << t*1000 << " ms" << endl;
-    Timer timer; timer.start();
+    // show input image
-    float *d_U, *d_S, *d_vx, *d_vy;
+    showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
-    cudaMalloc(&d_U, imageBytes);
-    cudaMalloc(&d_S, imageBytes);
+    // show output image: first convert to interleaved opencv format from the layered raw array
-    cudaMalloc(&d_vx, imageBytes);
+    convert_layered_to_mat(mOut, imgOut);
-    cudaMalloc(&d_vy, imageBytes);
+    showImage("Output", mOut, 100+w+40, 100);
-    cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice);
+    // ### Display your own output images here as needed
-    cudaMemcpy(d_S, imgIn, imageBytes, cudaMemcpyHostToDevice);
+    // wait for key inputs
-    calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2);
+    cv::waitKey(0);
-    float *S = new float[(size_t)w*h];
-    cudaMemcpy(S, d_S, imageBytes, cudaMemcpyDeviceToHost);
+    // save input and result
-    float S_max = 0.0;
+    cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
-    for (size_t i = 0; i < (size_t)w*h; i++)
+    cv::imwrite("image_result.png",mOut*255.f);
-        S_max = max(S_max, fabs(S[i]));  // TODO: CPU thing
-    delete[] S;
+    // free allocated arrays
-    float alpha = 0.5 * lambda * S_max;
+    delete[] imgIn;
+    delete[] imgOut;
-    for (int n = 0; n < N; n++) {
-        norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
+#endif
-        update<<< grid, block >>>(d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
-    }
+    // close all opencv windows
+    cvDestroyAllWindows();
-    cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);
+    return 0;
-    cudaFree(d_U);
+}
-    cudaFree(d_S);
-    cudaFree(d_vx);
-    cudaFree(d_vy);
-    timer.end();  float t = timer.get();  // elapsed time in seconds
-    cout << "time: " << t*1000 << " ms" << endl;
-    // show input image
-    showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
-    // show output image: first convert to interleaved opencv format from the layered raw array
-    convert_layered_to_mat(mOut, imgOut);
-    showImage("Output", mOut, 100+w+40, 100);
-    // ### Display your own output images here as needed
-    // wait for key inputs
-    cv::waitKey(0);
-    // save input and result
-    cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
-    cv::imwrite("image_result.png",mOut*255.f);
-    // free allocated arrays
-    delete[] imgIn;
-    delete[] imgOut;
-#endif
-    // close all opencv windows
-    cvDestroyAllWindows();
-    return 0;
-}