Commit 6cf20bce authored by Ravi's avatar Ravi

interoparability with primal dual - fast execution

parent 20663a73
// ### // ###
// ### // ###
// ### Practical Course: GPU Programming in Computer Vision // ### Practical Course: GPU Programming in Computer Vision
// ### // ###
// ### // ###
// ### Technical University Munich, Computer Vision Group // ### Technical University Munich, Computer Vision Group
// ### Winter Semester 2013/2014, March 3 - April 4 // ### Winter Semester 2013/2014, March 3 - April 4
// ### // ###
// ### // ###
// ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai // ### Evgeny Strekalovskiy, Maria Klodt, Jan Stuehmer, Mohamed Souiai
// ### // ###
// ### // ###
// ### // ###
// ### // ###
// ### // ###
// ### Miklos Homolya, miklos.homolya@tum.de, p056 // ### Miklos Homolya, miklos.homolya@tum.de, p056
// ### Ravikishore Kommajosyula, r.kommajosyula@tum.de, p057 // ### Ravikishore Kommajosyula, r.kommajosyula@tum.de, p057
// ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058 // ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058
// ### // ###
// ### // ###
#define GL_GLEXT_PROTOTYPES #define GL_GLEXT_PROTOTYPES
#include <GL/glut.h> #include <GL/glut.h>
#include "cuda_gl_interop.h" #include "cuda_gl_interop.h"
#include "aux.h" #include "aux.h"
#include <iostream> #include <iostream>
using namespace std; using namespace std;
/************************************************************************ /************************************************************************
*** GLOBAL VARIABLES ***** *** GLOBAL VARIABLES *****
************************************************************************/ ************************************************************************/
int repeats; int repeats;
bool gray; bool gray;
float lambda; float lambda;
float tau; float tau;
int N; int N;
float c1; float c1;
float c2; float c2;
float sigma;
cv::VideoCapture camera(0);
cv::Mat mIn; cv::VideoCapture camera(0);
cv::Mat mIn;
int w;
int h; int w;
int nc; int h;
int nc;
// uncomment to use the camera
#define CAMERA // uncomment to use the camera
#define CAMERA
template<typename T>
__device__ __host__ T min(T a, T b) template<typename T>
{ __device__ __host__ T min(T a, T b)
return (a < b) ? a : b; {
} return (a < b) ? a : b;
}
template<typename T>
__device__ __host__ T max(T a, T b) template<typename T>
{ __device__ __host__ T max(T a, T b)
return (a > b) ? a : b; {
} return (a > b) ? a : b;
}
template<typename T>
__device__ __host__ T clamp(T m, T x, T M) template<typename T>
{ __device__ __host__ T clamp(T m, T x, T M)
return max(m, min(x, M)); {
} return max(m, min(x, M));
}
/**
* Computes the normalized gradient. __global__ void calculate_F(float *U, float *F, int w, int h, float c1, float c2, float lambda)
* {
* @param U input image (single-channel) int x = threadIdx.x + blockDim.x * blockIdx.x;
* @param vx x-coordinate of result int y = threadIdx.y + blockDim.y * blockIdx.y;
* @param vy y-coordinate of result if (x < w && y < h) {
* @param w width of image (pixels) size_t i = x + (size_t)w*y;
* @param h height of image (pixels) F[i] = lambda * ((c1 - U[i])*(c1 - U[i]) - (c2 - U[i])*(c2 - U[i]));
*/ }
__global__ void norm_grad(float *U, float *vx, float *vy, int w, int h) }
{
int x = threadIdx.x + blockDim.x * blockIdx.x; __device__ float diff_i(float *M, int w, int h, int x, int y)
int y = threadIdx.y + blockDim.y * blockIdx.y; {
if (x < w && y < h) { size_t i = x + (size_t)w*y;
size_t i = x + (size_t)w*y; return (x+1 < w) ? (M[i + 1] - M[i]) : 0.f;
float ux = ((x+1 < w) ? (U[i + 1] - U[i]) : 0); }
float uy = ((y+1 < h) ? (U[i + w] - U[i]) : 0);
float gn = sqrtf(ux*ux + uy*uy + FLT_EPSILON); __device__ float diff_j(float *M, int w, int h, int x, int y)
vx[i] = ux / gn; {
vy[i] = uy / gn; size_t i = x + (size_t)w*y;
} return (y+1 < h) ? (M[i + w] - M[i]) : 0.f;
} }
/** __global__ void update_Xij(float *Xi, float *Xj, float *T, float *U, int w, int h, float sigma)
* nu (Greek letter) function penalizes being outside the interval [0; 1]. {
*/ int x = threadIdx.x + blockDim.x * blockIdx.x;
__device__ float nu(float u) int y = threadIdx.y + blockDim.y * blockIdx.y;
{ if (x < w && y < h) {
if (u < 0.f) size_t i = x + (size_t)w*y;
return -2.f; float xi = Xi[i] - sigma * (2 * diff_i(U, w, h, x, y) - diff_i(T, w, h, x, y));
if (u > 1.f) float xj = Xj[i] - sigma * (2 * diff_j(U, w, h, x, y) - diff_j(T, w, h, x, y));
return +2.f; float dn = max(1.f, sqrtf(xi*xi + xj*xj));
return 0.f; Xi[i] = xi / dn;
} Xj[i] = xj / dn;
}
/** }
* Calculate s(x) = (c1 - f(x))^2 - (c2 - f(x))^2.
* __device__ float divergence(float *X, float *Y, int w, int h, int x, int y)
* @param F original input image (single-channel) {
* @param S result (single-channel) size_t i = x + (size_t)w*y;
* @param w width of image (pixels) float dx_x = ((x+1 < w) ? X[i] : 0.f) - ((x > 0) ? X[i - 1] : 0.f);
* @param h height of image (pixels) float dy_y = ((y+1 < h) ? Y[i] : 0.f) - ((y > 0) ? Y[i - w] : 0.f);
*/ return dx_x + dy_y;
__global__ void calculate_S(float *F, float *S, int w, int h, float c1, float c2) }
{
int x = threadIdx.x + blockDim.x * blockIdx.x; __global__ void update_U(uchar4* output, float *T, float *Xi, float *Xj, float *F, float *U, int w, int h, float tau)
int y = threadIdx.y + blockDim.y * blockIdx.y; {
if (x < w && y < h) { int x = threadIdx.x + blockDim.x * blockIdx.x;
size_t i = x + (size_t)w*y; int y = threadIdx.y + blockDim.y * blockIdx.y;
S[i] = (c1 - F[i])*(c1 - F[i]) - (c2 - F[i])*(c2 - F[i]); if (x < w && y < h) {
} size_t i = x + (size_t)w*y;
} U[i] = clamp(0.f, T[i] - tau * (divergence(Xi, Xj, w, h, x, y) + F[i]), 1.f);
uchar temp_res = (uchar)(U[i] * 255.f);
/** output[w*h-i-1].x = temp_res;
* Update approximation. output[w*h-i-1].y = temp_res;
* output[w*h-i-1].z = temp_res;
* @param U approximation of solution (single-channel) output[w*h-i-1].w = 255;
* @param S update component from input image (single-channel)
* @param vx normalized gradient of U (x-coordinate) }
* @param vy normalized gradient of U (y-coordinate) }
* @param w width of image (pixels)
* @param h height of image (pixels) inline int div_ceil(int n, int b) { return (n + b - 1) / b; }
* @param lambda weight of S
* @param alpha weight of nu inline dim3 make_grid(dim3 whole, dim3 block)
* @param tau update coefficient {
*/ return dim3(div_ceil(whole.x, block.x),
#ifdef CAMERA div_ceil(whole.y, block.y),
__global__ void update(uchar4* output, float *U, float *S, float *vx, float *vy, div_ceil(whole.z, block.z));
int w, int h, float lambda, float alpha, float tau) }
#else GLuint bufferObj;
__global__ void update(float *U, float *S, float *vx, float *vy, cudaGraphicsResource * resource;
int w, int h, float lambda, float alpha, float tau)
#endif #define HEIGHT 480
{ #define WIDTH 640
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y; static void key_func( unsigned char key, int x, int y ) {
if (x < w && y < h) { switch (key) {
size_t i = x + (size_t)w*y; case 27:
// clean up OpenGL and CUDA
// smoothness (functional derivative of energy)
float dx_vx = ((x+1 < w) ? vx[i] : 0) - ((x > 0) ? vx[i - 1] : 0); cudaGraphicsUnregisterResource( resource );
float dy_vy = ((y+1 < h) ? vy[i] : 0) - ((y > 0) ? vy[i - w] : 0); glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
float div_v = dx_vx + dy_vy; glDeleteBuffers( 1, &bufferObj );
exit(0);
// explicit Euler update rule }
U[i] += tau * (div_v - lambda * S[i] - alpha * nu(U[i])); }
#ifdef CAMERA
output[w*h-i-1].x = (uchar)(U[i] * 255.f); static void draw_func( void ) {
output[w*h-i-1].y = output[w*h-i-1].x; glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
output[w*h-i-1].z = output[w*h-i-1].x;
output[w*h-i-1].w = 255; // Get camera image
#endif camera >> mIn;
} if(gray)
} cvtColor(mIn, mIn, CV_BGR2GRAY);
// convert to float representation (opencv loads image values as single bytes by default)
inline int div_ceil(int n, int b) { return (n + b - 1) / b; } mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
inline dim3 make_grid(dim3 whole, dim3 block) mIn /= 255.f;
{
return dim3(div_ceil(whole.x, block.x), uchar4* d_output;
div_ceil(whole.y, block.y), size_t size;
div_ceil(whole.z, block.z));
} // allocate raw input image array
float *imgIn = new float[(size_t)w*h*nc];
GLuint bufferObj; size_t imageBytes = (size_t)w*h*nc*sizeof(float);
cudaGraphicsResource * resource;
cudaGraphicsMapResources (1, &resource, NULL);
#define HEIGHT 480 cudaGraphicsResourceGetMappedPointer( (void**) &d_output, &size, resource);
#define WIDTH 640
// Init raw input image array
static void key_func( unsigned char key, int x, int y ) { // opencv images are interleaved: rgb rgb rgb... (actually bgr bgr bgr...)
switch (key) { // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
case 27: // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
// clean up OpenGL and CUDA convert_mat_to_layered (imgIn, mIn);
cudaGraphicsUnregisterResource( resource ); dim3 block(32, 16);
glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 ); dim3 grid = make_grid(dim3(w, h, 1), block);
glDeleteBuffers( 1, &bufferObj );
exit(0); Timer timer; timer.start();
} float *d_T, *d_U, *d_F, *d_Xi, *d_Xj;
} cudaMalloc(&d_T, imageBytes);
cudaMalloc(&d_U, imageBytes);
static void draw_func( void ) { cudaMalloc(&d_F, imageBytes);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); cudaMalloc(&d_Xi, imageBytes);
cudaMalloc(&d_Xj, imageBytes);
// Get camera image cudaMemcpy(d_T, imgIn, imageBytes, cudaMemcpyHostToDevice);
camera >> mIn; cudaMemcpy(d_U, d_T, imageBytes, cudaMemcpyDeviceToDevice);
if(gray) cudaMemset(d_Xi, 0, imageBytes);
cvtColor(mIn, mIn, CV_BGR2GRAY); cudaMemset(d_Xj, 0, imageBytes);
// convert to float representation (opencv loads image values as single bytes by default)
mIn.convertTo(mIn,CV_32F); calculate_F<<< grid, block >>>(d_U, d_F, w, h, c1, c2, lambda);
// convert range of each channel to [0,1] (opencv default is [0,255])
mIn /= 255.f; for (int n = 0; n < N; n++) {
update_Xij<<< grid, block >>>(d_Xi, d_Xj, d_T, d_U, w, h, sigma);
uchar4* d_output; std::swap(d_U, d_T);
size_t size; update_U<<< grid, block >>>(d_output, d_T, d_Xi, d_Xj, d_F, d_U, w, h, tau);
}
// allocate raw input image array
float *imgIn = new float[(size_t)w*h*nc]; // cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);
size_t imageBytes = (size_t)w*h*nc*sizeof(float); cudaGraphicsUnmapResources(1, &resource, NULL);
cudaFree(d_T);
cudaGraphicsMapResources (1, &resource, NULL); cudaFree(d_U);
cudaGraphicsResourceGetMappedPointer( (void**) &d_output, &size, resource); cudaFree(d_F);
cudaFree(d_Xi);
// Init raw input image array cudaFree(d_Xj);
// opencv images are interleaved: rgb rgb rgb... (actually bgr bgr bgr...)
// But for CUDA it's better to work with layered images: rrr... ggg... bbb... timer.end(); float t = timer.get(); // elapsed time in seconds
// So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations cout << "time: " << t*1000 << " ms" << endl;
convert_mat_to_layered (imgIn, mIn);
// show input image
dim3 block(32, 16); // showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100)
dim3 grid = make_grid(dim3(w, h, 1), block);
glDrawPixels( WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, 0 );
Timer timer; timer.start(); glutSwapBuffers();
float *d_U, *d_S, *d_vx, *d_vy; glutPostRedisplay();
cudaMalloc(&d_U, imageBytes); }
cudaMalloc(&d_S, imageBytes);
cudaMalloc(&d_vx, imageBytes); int main(int argc, char **argv)
cudaMalloc(&d_vy, imageBytes); {
cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice); #ifdef CAMERA
cudaMemcpy(d_S, imgIn, imageBytes, cudaMemcpyHostToDevice); cudaGLSetGLDevice(0); CUDA_CHECK;
calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2); // these GLUT calls need to be made before the other GL calls
float *S = new float[(size_t)w*h]; glutInit( &argc, argv );
cudaMemcpy(S, d_S, imageBytes, cudaMemcpyDeviceToHost); glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
float S_max = 0.0; glutInitWindowSize( WIDTH, HEIGHT );
for (size_t i = 0; i < (size_t)w*h; i++) glutCreateWindow( "bitmap" );
S_max = max(S_max, fabs(S[i])); // TODO: CPU thing
delete[] S; glGenBuffers(1, &bufferObj);
float alpha = 0.5 * lambda * S_max; glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, WIDTH * HEIGHT * 4, NULL, GL_DYNAMIC_DRAW_ARB);
for (int n = 0; n < N; n++) { cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone);
norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h); #endif
update<<< grid, block >>>(d_output, d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
} // Before the GPU can process your kernels, a so called "CUDA context" must be initialized
// This happens on the very first call to a CUDA function, and takes some time (around half a second)
cudaGraphicsUnmapResources(1, &resource, NULL); // We will do it right here, so that the run time measurements are accurate
cudaFree(d_U); cudaDeviceSynchronize(); CUDA_CHECK;
cudaFree(d_S);
cudaFree(d_vx); // Reading command line parameters:
cudaFree(d_vy); // getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var"
timer.end(); float t = timer.get(); // elapsed time in seconds // If "-param" is not specified, the value of "var" remains unchanged
cout << "time: " << t*1000 << " ms" << endl; //
// return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise
// show input image
// showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100) #ifdef CAMERA
#else
glDrawPixels( WIDTH, HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, 0 ); // input image
glutSwapBuffers(); string image = "";
glutPostRedisplay(); bool ret = getParam("i", image, argc, argv);
} if (!ret) cerr << "ERROR: no image specified" << endl;
if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; }
int main(int argc, char **argv) #endif
{
#ifdef CAMERA // number of computation repetitions to get a better run time measurement
cudaGLSetGLDevice(0); CUDA_CHECK; repeats = 1;
getParam("repeats", repeats, argc, argv);
// these GLUT calls need to be made before the other GL calls cout << "repeats: " << repeats << endl;
glutInit( &argc, argv );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA ); // load the input image as grayscale if "-gray" is specifed
glutInitWindowSize( WIDTH, HEIGHT ); gray = true;
glutCreateWindow( "bitmap" ); // always true: getParam("gray", gray, argc, argv);
cout << "gray: " << gray << endl;
glGenBuffers(1, &bufferObj);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj); // ### Define your own parameters here as needed
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, WIDTH * HEIGHT * 4, NULL, GL_DYNAMIC_DRAW_ARB); lambda = 1.0;
cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone); getParam("lambda", lambda, argc, argv);
#endif cout << "λ: " << lambda << endl;
// Before the GPU can process your kernels, a so called "CUDA context" must be initialized sigma = 0.4;
// This happens on the very first call to a CUDA function, and takes some time (around half a second) getParam("sigma", sigma, argc, argv);
// We will do it right here, so that the run time measurements are accurate cout << "σ: " << sigma << endl;
cudaDeviceSynchronize(); CUDA_CHECK;
tau = 0.4;
// Reading command line parameters: getParam("tau", tau, argc, argv);
// getParam("param", var, argc, argv) looks whether "-param xyz" is specified, and if so stores the value "xyz" in "var" cout << "τ: " << tau << endl;
// If "-param" is not specified, the value of "var" remains unchanged
// N = 160;
// return value: getParam("param", ...) returns true if "-param" is specified, and false otherwise getParam("N", N, argc, argv);
cout << "N: " << N << endl;
#ifdef CAMERA
#else c1 = 1.0;
// input image getParam("c1", c1, argc, argv);
string image = ""; cout << "c1: " << c1 << endl;
bool ret = getParam("i", image, argc, argv);
if (!ret) cerr << "ERROR: no image specified" << endl; c2 = 0.00;
if (argc <= 1) { cout << "Usage: " << argv[0] << " -i <image> [-repeats <repeats>] [-gray]" << endl; return 1; } getParam("c2", c2, argc, argv);
#endif cout << "c2: " << c2 << endl;
// number of computation repetitions to get a better run time measurement // Init camera / Load input image
repeats = 1; #ifdef CAMERA
getParam("repeats", repeats, argc, argv);
cout << "repeats: " << repeats << endl; // Init camera
if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; }
// load the input image as grayscale if "-gray" is specifed int camW = 640;
gray = true; int camH = 480;
// always true: getParam("gray", gray, argc, argv); camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
cout << "gray: " << gray << endl; camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
// read in first frame to get the dimensions
// ### Define your own parameters here as needed
lambda = 0.8; camera >> mIn;
getParam("lambda", lambda, argc, argv); if(gray)
cout << "λ: " << lambda << endl; cvtColor(mIn, mIn, CV_BGR2GRAY);
tau = 0.01; #else
getParam("tau", tau, argc, argv);
cout << "τ: " << tau << endl; // Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale))
mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1));
N = 2000; // check
getParam("N", N, argc, argv); if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; }
cout << "N: " << N << endl;
#endif
c1 = 0.65;
getParam("c1", c1, argc, argv); // convert to float representation (opencv loads image values as single bytes by default)
cout << "c1: " << c1 << endl; mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255])
c2 = 0.00; mIn /= 255.f;
getParam("c2", c2, argc, argv); // get image dimensions
cout << "c2: " << c2 << endl; w = mIn.cols; // width
h = mIn.rows; // height
// Init camera / Load input image nc = mIn.channels(); // number of channels
#ifdef CAMERA cout << "image: " << w << " x " << h << endl;
// Init camera // Set the output image format
if(!camera.isOpened()) { cerr << "ERROR: Could not open camera" << endl; return 1; } cv::Mat mOut(h,w,mIn.type()); // mOut will have the same number of channels as the input image, nc layers
int camW = 640; // ### Define your own output images here as needed
int camH = 480;
camera.set(CV_CAP_PROP_FRAME_WIDTH,camW);
camera.set(CV_CAP_PROP_FRAME_HEIGHT,camH);
// read in first frame to get the dimensions // For camera mode: Make a loop to read in camera frames
#ifdef CAMERA
camera >> mIn; glutKeyboardFunc (key_func);
if(gray) glutDisplayFunc (draw_func);
cvtColor(mIn, mIn, CV_BGR2GRAY); glutMainLoop();
#else
#else
// Allocate arrays
// Load the input image using opencv (load as grayscale if "gray==true", otherwise as is (may be color or grayscale)) // input/output image width: w
mIn = cv::imread(image.c_str(), (gray? CV_LOAD_IMAGE_GRAYSCALE : -1)); // input/output image height: h
// check // input image number of channels: nc
if (mIn.data == NULL) { cerr << "ERROR: Could not load image " << image << endl; return 1; } // output image number of channels: mOut.channels(), as defined above (nc, 3, or 1)
#endif // allocate raw input image array
float *imgIn = new float[(size_t)w*h*nc];
// convert to float representation (opencv loads image values as single bytes by default) size_t imageBytes = (size_t)w*h*nc*sizeof(float);
mIn.convertTo(mIn,CV_32F);
// convert range of each channel to [0,1] (opencv default is [0,255]) // allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying)
mIn /= 255.f; float *imgOut = new float[(size_t)w*h*mOut.channels()];
// get image dimensions
w = mIn.cols; // width // Init raw input image array
h = mIn.rows; // height // opencv images are interleaved: rgb rgb rgb... (actually bgr bgr bgr...)
nc = mIn.channels(); // number of channels // But for CUDA it's better to work with layered images: rrr... ggg... bbb...
cout << "image: " << w << " x " << h << endl; // So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations
convert_mat_to_layered (imgIn, mIn);
// Set the output image format
cv::Mat mOut(h,w,mIn.type()); // mOut will have the same number of channels as the input image, nc layers dim3 block(32, 16);
// ### Define your own output images here as needed dim3 grid = make_grid(dim3(w, h, 1), block);
Timer timer; timer.start();
float *d_U, *d_S, *d_vx, *d_vy;
// For camera mode: Make a loop to read in camera frames cudaMalloc(&d_U, imageBytes);
#ifdef CAMERA cudaMalloc(&d_S, imageBytes);
glutKeyboardFunc (key_func); cudaMalloc(&d_vx, imageBytes);
glutDisplayFunc (draw_func); cudaMalloc(&d_vy, imageBytes);
glutMainLoop(); cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice);
#else cudaMemcpy(d_S, imgIn, imageBytes, cudaMemcpyHostToDevice);
// Allocate arrays calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2);
// input/output image width: w float *S = new float[(size_t)w*h];
// input/output image height: h cudaMemcpy(S, d_S, imageBytes, cudaMemcpyDeviceToHost);
// input image number of channels: nc float S_max = 0.0;
// output image number of channels: mOut.channels(), as defined above (nc, 3, or 1) for (size_t i = 0; i < (size_t)w*h; i++)
S_max = max(S_max, fabs(S[i])); // TODO: CPU thing
// allocate raw input image array delete[] S;
float *imgIn = new float[(size_t)w*h*nc]; float alpha = 0.5 * lambda * S_max;
size_t imageBytes = (size_t)w*h*nc*sizeof(float);
for (int n = 0; n < N; n++) {
// allocate raw output array (the computation result will be stored in this array, then later converted to mOut for displaying) norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h);
float *imgOut = new float[(size_t)w*h*mOut.channels()]; update<<< grid, block >>>(d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
}
// Init raw input image array
// opencv images are interleaved: rgb rgb rgb... (actually bgr bgr bgr...) cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost);
// But for CUDA it's better to work with layered images: rrr... ggg... bbb... cudaFree(d_U);
// So we will convert as necessary, using interleaved "cv::Mat" for loading/saving/displaying, and layered "float*" for CUDA computations cudaFree(d_S);
convert_mat_to_layered (imgIn, mIn); cudaFree(d_vx);
cudaFree(d_vy);
dim3 block(32, 16); timer.end(); float t = timer.get(); // elapsed time in seconds
dim3 grid = make_grid(dim3(w, h, 1), block); cout << "time: " << t*1000 << " ms" << endl;
Timer timer; timer.start(); // show input image
float *d_U, *d_S, *d_vx, *d_vy; showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100)
cudaMalloc(&d_U, imageBytes);
cudaMalloc(&d_S, imageBytes); // show output image: first convert to interleaved opencv format from the layered raw array
cudaMalloc(&d_vx, imageBytes); convert_layered_to_mat(mOut, imgOut);
cudaMalloc(&d_vy, imageBytes); showImage("Output", mOut, 100+w+40, 100);
cudaMemcpy(d_U, imgIn, imageBytes, cudaMemcpyHostToDevice); // ### Display your own output images here as needed
cudaMemcpy(d_S, imgIn, imageBytes, cudaMemcpyHostToDevice);
// wait for key inputs
calculate_S<<< grid, block >>>(d_U, d_S, w, h, c1, c2); cv::waitKey(0);
float *S = new float[(size_t)w*h];
cudaMemcpy(S, d_S, imageBytes, cudaMemcpyDeviceToHost); // save input and result
float S_max = 0.0; cv::imwrite("image_input.png",mIn*255.f); // "imwrite" assumes channel range [0,255]
for (size_t i = 0; i < (size_t)w*h; i++) cv::imwrite("image_result.png",mOut*255.f);
S_max = max(S_max, fabs(S[i])); // TODO: CPU thing
delete[] S; // free allocated arrays
float alpha = 0.5 * lambda * S_max; delete[] imgIn;
delete[] imgOut;
for (int n = 0; n < N; n++) {
norm_grad<<< grid, block >>>(d_U, d_vx, d_vy, w, h); #endif
update<<< grid, block >>>(d_U, d_S, d_vx, d_vy, w, h, lambda, alpha, tau);
} // close all opencv windows
cvDestroyAllWindows();
cudaMemcpy(imgOut, d_U, imageBytes, cudaMemcpyDeviceToHost); return 0;
cudaFree(d_U); }
cudaFree(d_S);
cudaFree(d_vx);
cudaFree(d_vy);
timer.end(); float t = timer.get(); // elapsed time in seconds
cout << "time: " << t*1000 << " ms" << endl;
// show input image
showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100)
// show output image: first convert to interleaved opencv format from the layered raw array
convert_layered_to_mat(mOut, imgOut);
showImage("Output", mOut, 100+w+40, 100);
// ### Display your own output images here as needed
// wait for key inputs
cv::waitKey(0);
// save input and result
cv::imwrite("image_input.png",mIn*255.f); // "imwrite" assumes channel range [0,255]
cv::imwrite("image_result.png",mOut*255.f);
// free allocated arrays
delete[] imgIn;
delete[] imgOut;
#endif
// close all opencv windows
cvDestroyAllWindows();
return 0;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment