first (dirty) version of opengl interoperability

Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>

first (dirty) version of opengl interoperability
Signed-off-by: Gaurav Kukreja <gmkukreja@gmail.com>
7e94e981 · Gaurav Kukreja · 1db2e79a · 7e94e981 · 7e94e981
Commit 7e94e981 authored Mar 20, 2014 by Gaurav Kukreja
Hide whitespace changes
Inline Side-by-side

Showing with 177 additions and 51 deletions

Makefile miklos/project/Makefile +1 -1

main.cu miklos/project/main.cu +176 -50

No files found.
--- a/miklos/project/Makefile
+++ b/miklos/project/Makefile
 main: main.cu aux.cu aux.h Makefile
-	nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core
+	nvcc -o main main.cu aux.cu --ptxas-options=-v --use_fast_math --compiler-options -Wall -lopencv_highgui -lopencv_core -lGL -lglut
--- a/miklos/project/main.cu
+++ b/miklos/project/main.cu
@@ -25,10 +25,46 @@
 #include "aux.h"
 #include <iostream>
+// #include <GLTools.h>            // OpenGL toolkit
+// #include <GLShaderManager.h>    // Shader Manager Class
+#define GL_GLEXT_PROTOTYPES
+#include <GL/gl.h>
+#include <GL/glut.h>            // Windows FreeGlut equivalent
+#include <GL/glext.h>
+#include "cuda_gl_interop.h"
 using namespace std;
 // uncomment to use the camera
-//#define CAMERA
+#define CAMERA
+/*********************************************************************
+ *** Global Variables                                           ******
+ *********************************************************************/
+// Pointers to Device Memory for Input, Output and Intermediate images
+float *d_F, *d_U, *d_vx, *d_vy;
+// Variables for OpenGL Interoperability
+GLuint outputVBO;
+struct cudaGraphicsResource* outputVBO_CUDA;
+int repeats;
+bool gray;
+float lambda;
+float tau;
+int N;
+cv::VideoCapture camera(0);
+cv::Mat mIn;
+float *imgIn;
+int w, h, nc;
 template<typename T>
 __device__ __host__ T min(T a, T b)
@@ -49,6 +85,8 @@ __device__ __host__ T clamp(T m, T x, T M)
 }
 /**
 * Computes the normalized gradient.
 *
@@ -81,6 +119,7 @@ __global__ void norm_grad(float *U, float *vx, float *vy, int w, int h)
 /**
 * Update approximation.
 *
+ * @param output
 * @param U approximation of solution (single-channel)
 * @param F original input image (single-channel)
 * @param vx normalized gradient of U (x-coordinate)
@@ -90,7 +129,7 @@ __global__ void norm_grad(float *U, float *vx, float *vy, int w, int h)
 * @param lambda weight of 'similarity' energy component
 * @param tau update coefficient
 */
-__global__ void update(float *U, float *F, float *vx, float *vy,
+__global__ void update(float* output, float *U, float *F, float *vx, float *vy,
                       int w, int h, float lambda, float tau)
 {
    int x = threadIdx.x + blockDim.x * blockIdx.x;
@@ -111,7 +150,7 @@ __global__ void update(float *U, float *F, float *vx, float *vy,
        float div_v = dx_vx + dy_vy;
        // explicit Euler update rule
-        U[i] += tau * (lambda * d + div_v);
+        output[i] = U[i] + tau * (lambda * d + div_v);
    }
 }
@@ -124,6 +163,11 @@ inline dim3 make_grid(dim3 whole, dim3 block)
                div_ceil(whole.z, block.z));
 }
+void display();
+void bye() {
+    cout << "bye!\n";
+}
 int main(int argc, char **argv)
 {
@@ -132,9 +176,6 @@ int main(int argc, char **argv)
    // We will do it right here, so that the run time measurements are accurate
    cudaDeviceSynchronize();  CUDA_CHECK;
    // Reading command line parameters:
    // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
    // If "-param" is not specified, the value of "var" remains unchanged
@@ -149,48 +190,47 @@ int main(int argc, char **argv)
    if (!ret) cerr << "ERROR: no image specified" << endl;
    if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
 #endif
    // number of computation repetitions to get a better run time measurement
-    int repeats = 1;
+    repeats = 1;
    getParam("repeats", repeats, argc, argv);
    cout << "repeats: " << repeats << endl;
    // load the input image as grayscale if "-gray" is specifed
-    bool gray = true;
+    gray = true;
    // always true: getParam("gray", gray, argc, argv);
    cout << "gray: " << gray << endl;
    // ### Define your own parameters here as needed    
-    float lambda = 0.8;
+    lambda = 0.8;
    getParam("lambda", lambda, argc, argv);
    cout << "λ: " << lambda << endl;
-    float tau = 0.01;
+    tau = 0.01;
    getParam("tau", tau, argc, argv);
    cout << "τ: " << tau << endl;
-    int N = 2000;
+    N = 2000;
    getParam("N", N, argc, argv);
    cout << "N: " << N << endl;
    // Init camera / Load input image
 #ifdef CAMERA
    // Init camera
-  	cv::VideoCapture camera(0);
+  	// cv::VideoCapture camera(0);
  	if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
    int camW = 640;
    int camH = 480;
  	camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
  	camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
    // read in first frame to get the dimensions
-    cv::Mat mIn;
+    // cv::Mat mIn;
    camera >> mIn;
 #else
    // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
-    cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+    // cv::Mat mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
+    mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
    // check
    if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
@@ -201,9 +241,9 @@ int main(int argc, char **argv)
    // convert range of each channel to [0,1] (opencv default is [0,255])
    mIn /= 255.f;
    // get image dimensions
-    int w = mIn.cols;         // width
+    w = mIn.cols;         // width
-    int h = mIn.rows;         // height
+    h = mIn.rows;         // height
-    int nc = mIn.channels();  // number of channels
+    nc = mIn.channels();  // number of channels
    cout << "image: " << w << " x " << h << endl;
@@ -223,29 +263,51 @@ int main(int argc, char **argv)
    // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
    // allocate raw input image array
-    float *imgIn  = new float[(size_t)w*h*nc];
+    imgIn  = new float[(size_t)w*h*nc];
-    size_t imageBytes = (size_t)w*h*nc*sizeof(float);
    // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
    float *imgOut = new float[(size_t)w*h*mOut.channels()];
+#ifdef CAMERA
+    // Initialize OpenGL and GLUT for device 0
+    // and make the OpenGL context current
+    glutInit(&argc, argv);
+    cout << __func__ << ": "<< __LINE__ << ": " << endl;
-    // For camera mode: Make a loop to read in camera frames
+    glutInitDisplayMode(GLUT_DEPTH | GLUT_DOUBLE | GLUT_RGBA);
-#ifdef CAMERA
+    glutInitWindowPosition(100,100);
-    // Read a camera image frame every 30 milliseconds:
+    glutInitWindowSize(640,480);
-    // cv::waitKey(30) waits 30 milliseconds for a keyboard input,
+    glutCreateWindow("Project");
-    // returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
+    glutDisplayFunc(&display);
-    while (cv::waitKey(30) < 0)
-    {
+    cout << __func__ << ": "<< __LINE__ << ": " << endl;
-    // Get camera image
-    camera >> mIn;
+    // Explicitly set device 0
-    // convert to float representation (opencv loads image values as single bytes by default)
+    cudaGLSetGLDevice(0);
-    mIn.convertTo(mIn,CV_32F);
-    // convert range of each channel to [0,1] (opencv default is [0,255])
+    cout << __func__ << ": "<< __LINE__ << ": " << endl;
-    mIn /= 255.f;
-#endif
+    // Create buffer object and register it with CUDA
+    glGenBuffers(1, &outputVBO);
+    glBindBuffer(GL_ARRAY_BUFFER, outputVBO);
+    unsigned int size = w * h * nc * sizeof(float);
+    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+    cudaGraphicsGLRegisterBuffer(&outputVBO_CUDA,
+                                 outputVBO,
+                                 cudaGraphicsMapFlagsWriteDiscard);
+    cout << __func__ << ": "<< __LINE__ << ": " << endl;
+    // Launch rendering loop
+    glutMainLoop();
+    cout << __func__ << ": "<< __LINE__ << ": " << endl;
+    atexit(bye);
+#else // CAMERA
+    size_t imageBytes = (size_t)w*h*nc*sizeof(float);
    // Init raw input image array
    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
@@ -257,7 +319,7 @@ int main(int argc, char **argv)
    dim3 grid = make_grid(dim3(w, h, 1), block);
    Timer timer; timer.start();
-    float *d_F, *d_U, *d_vx, *d_vy;
+    // float *d_F, *d_U, *d_vx, *d_vy;
    cudaMalloc(&d_F, imageBytes);
    cudaMalloc(&d_U, imageBytes);
    cudaMalloc(&d_vx, imageBytes);
@@ -267,7 +329,7 @@ int main(int argc, char **argv)
    for (int n = 0; n < N; n++) {
        norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
-        update<<< grid, block >>>(d_U, d_F, d_vx, d_vy, w, h, lambda, tau);
+        update<<< grid, block >>>(d_U, d_U, d_F, d_vx, d_vy, w, h, lambda, tau);
    }
    cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);
@@ -279,10 +341,6 @@ int main(int argc, char **argv)
    cout << "time: " << t*1000 << " ms" << endl;
    // show input image
    showImage("Input", mIn, 100, 100);  // show at position (x_from_left=100,y_from_above=100)
@@ -292,16 +350,8 @@ int main(int argc, char **argv)
    // ### Display your own output images here as needed
-#ifdef CAMERA
-    // end of camera loop
-    }
-#else
    // wait for key inputs
    cv::waitKey(0);
-#endif
    // save input and result
    cv::imwrite("image_input.png",mIn*255.f);  // "imwrite" assumes channel range [0,255]
@@ -313,8 +363,84 @@ int main(int argc, char **argv)
    // close all opencv windows
    cvDestroyAllWindows();
    return 0;
+#endif
 }
+void display()
+{
+    cout << __func__ << ": "<< __LINE__ << ": " << endl;
+    float *d_output;
+    cudaGraphicsMapResources(1, &outputVBO_CUDA, 0);
+    size_t num_bytes; 
+    cudaGraphicsResourceGetMappedPointer((void**)&d_output, &num_bytes, outputVBO_CUDA);
+    size_t imageBytes = (size_t)w*h*nc*sizeof(float);
+    cout << __func__ << ": "<< __LINE__ << ": " << endl;
+    // Read a camera image frame every 30 milliseconds:
+    // cv::waitKey(30) waits 30 milliseconds for a keyboard input,
+    // returns a value <0 if no key is pressed during this time, returns immediately with a value >=0 if a key is pressed
+    cv::waitKey(30);
+    // Get camera image
+    camera >> mIn;
+    // convert to float representation (opencv loads image values as single bytes by default)
+    mIn.convertTo(mIn,CV_32F);
+    // convert range of each channel to [0,1] (opencv default is [0,255])
+    mIn /= 255.f;
+    // Init raw input image array
+    // opencv images are interleaved: rgb rgb rgb...  (actually bgr bgr bgr...)
+    // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
+    // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
+    convert_mat_to_layered (imgIn, mIn);
+    dim3 block(32, 16);
+    dim3 grid = make_grid(dim3(w, h, 1), block);
+    Timer timer; timer.start();
+    // float *d_F, *d_U, *d_vx, *d_vy;
+    cudaMalloc(&d_F, imageBytes);
+    cudaMalloc(&d_U, imageBytes);
+    cudaMalloc(&d_vx, imageBytes);
+    cudaMalloc(&d_vy, imageBytes);
+    cudaMemcpy(d_F, imgIn, imageBytes, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice);
+    for (int n = 0; n < N; n++) {
+        norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
+        update<<< grid, block >>>(d_output, d_U, d_F, d_vx, d_vy, w, h, lambda, tau);
+    }
+    // cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);
+    cudaFree(d_F);
+    cudaFree(d_U);
+    cudaFree(d_vx);
+    cudaFree(d_vy);
+    timer.end();  float t = timer.get();  // elapsed time in seconds
+    cout << "time: " << t*1000 << " ms" << endl;
+    // Unmap buffer object
+    cudaGraphicsUnmapResources(1, &outputVBO_CUDA, 0);
+    // Render from buffer object
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+    glBindBuffer(GL_ARRAY_BUFFER, outputVBO);
+    glVertexPointer(4, GL_FLOAT, 0, 0);
+    glEnableClientState(GL_VERTEX_ARRAY);
+    glDrawArrays(GL_POINTS, 0, w * h);
+    glDisableClientState(GL_VERTEX_ARRAY);
+    // Swap buffers
+    glutSwapBuffers();
+    glutPostRedisplay();
+}