Commit 0d8fa5a9 authored by Gaurav Kukreja

Constant Kernel Memory on Ex6

Signed-off-by: Gaurav Kukreja <gaurav@gauravk.in>
parent b0bc636c
...@@ -65,6 +65,7 @@ __device__ void convolveImage(float* imgIn, float* imgOut, int rad, int w, int h ...@@ -65,6 +65,7 @@ __device__ void convolveImage(float* imgIn, float* imgOut, int rad, int w, int h
// Index of the output image, this kernel works on // Index of the output image, this kernel works on
int idx = ix + (iy * w) + (iz * w * h); int idx = ix + (iy * w) + (iz * w * h);
int kw = 2 * rad + 1;
// check limits // check limits
if (idx < w * h * nc) if (idx < w * h * nc)
...@@ -78,7 +79,7 @@ __device__ void convolveImage(float* imgIn, float* imgOut, int rad, int w, int h ...@@ -78,7 +79,7 @@ __device__ void convolveImage(float* imgIn, float* imgOut, int rad, int w, int h
{ {
int inx = gpu_max(0, gpu_min(ix+i, w-1)); int inx = gpu_max(0, gpu_min(ix+i, w-1));
int inIdx = inx + (iny * w) + (iz * w * h); // Index of Input Image to be multiplied by corresponding element in kernel int inIdx = inx + (iny * w) + (iz * w * h); // Index of Input Image to be multiplied by corresponding element in kernel
value += imgIn[inIdx] * constKernel[i+rad + ((j+rad) * (2 * rad + 1))]; value += imgIn[inIdx] * constKernel[i+rad + ((j+rad) * (kw))];
} }
} }
imgOut[idx] = value; imgOut[idx] = value;
......
...@@ -18,8 +18,9 @@ ...@@ -18,8 +18,9 @@
// ### // ###
// ### TODO: For every student of your group, please provide here: // ### TODO: For every student of your group, please provide here:
// ### // ###
// ### name, email, login username (for example p123) // ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058
// ### // ### Miklos Homolya, miklos.homolya@tum.de, p056
// ### Ravikishore Kommajosyula, r.kommajosyula, p057
// ### // ###
...@@ -29,6 +30,7 @@ ...@@ -29,6 +30,7 @@
using namespace std; using namespace std;
#define MAX_KERNEL_WIDTH 20 #define MAX_KERNEL_WIDTH 20
__constant__ float constKernel[MAX_KERNEL_WIDTH * MAX_KERNEL_WIDTH]; __constant__ float constKernel[MAX_KERNEL_WIDTH * MAX_KERNEL_WIDTH];
// uncomment to use the camera // uncomment to use the camera
...@@ -165,12 +167,6 @@ int main(int argc, char **argv) ...@@ -165,12 +167,6 @@ int main(int argc, char **argv)
int r = ceil(3 * sigma); int r = ceil(3 * sigma);
int ksize = 2*r + 1; int ksize = 2*r + 1;
if(ksize > MAX_KERNEL_WIDTH)
{
cout << "Kernel width more than Max Kernel width viz. 20" << endl;
return -1;
}
float *kern = new float[ksize * ksize]; float *kern = new float[ksize * ksize];
for (int i = 0; i < 2*r+1; i++) { for (int i = 0; i < 2*r+1; i++) {
double a = i - r; double a = i - r;
...@@ -243,7 +239,8 @@ int main(int argc, char **argv) ...@@ -243,7 +239,8 @@ int main(int argc, char **argv)
Timer timer; timer.start(); Timer timer; timer.start();
#define CPU for (int measurement = 0; measurement < repeats; measurement++) {
//#define CPU
#ifdef CPU #ifdef CPU
for (int c = 0; c < nc; c++) { for (int c = 0; c < nc; c++) {
for (int y = 0; y < h; y++) { for (int y = 0; y < h; y++) {
...@@ -261,13 +258,13 @@ int main(int argc, char **argv) ...@@ -261,13 +258,13 @@ int main(int argc, char **argv)
} }
} }
#else #else
float *d_in, *d_out, *d_kern; float *d_in, *d_out;
size_t nbytes = (size_t)w*h*nc*sizeof(float); size_t nbytes = (size_t)w*h*nc*sizeof(float);
cudaMalloc(&d_in, nbytes); cudaMalloc(&d_in, nbytes);
cudaMalloc(&d_out, nbytes); cudaMalloc(&d_out, nbytes);
//cudaMalloc(&d_kern, (size_t)ksize*ksize*sizeof(float)); // cudaMalloc(&d_kern, (size_t)ksize*ksize*sizeof(float));
cudaMemcpy(d_in, imgIn, nbytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in, imgIn, nbytes, cudaMemcpyHostToDevice);
// cudaMemcpy(d_kern, kern, (size_t)ksize*ksize*sizeof(float), cudaMemcpyHostToDevice); // cudaMemcpy(d_kern, kern, (size_t)ksize*ksize*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(constKernel, kern, (size_t)ksize*ksize*sizeof(float)); cudaMemcpyToSymbol(constKernel, kern, (size_t)ksize*ksize*sizeof(float));
dim3 block(16, 8, 3); dim3 block(16, 8, 3);
dim3 grid = make_grid(dim3(w, h, nc), block); dim3 grid = make_grid(dim3(w, h, nc), block);
...@@ -275,10 +272,16 @@ int main(int argc, char **argv) ...@@ -275,10 +272,16 @@ int main(int argc, char **argv)
cudaMemcpy(imgOut, d_out, nbytes, cudaMemcpyDeviceToHost); cudaMemcpy(imgOut, d_out, nbytes, cudaMemcpyDeviceToHost);
cudaFree(d_in); cudaFree(d_in);
cudaFree(d_out); cudaFree(d_out);
// cudaFree(d_kern); // cudaFree(d_kern);
#endif #endif
}
timer.end(); float t = timer.get(); // elapsed time in seconds timer.end(); float t = timer.get(); // elapsed time in seconds
cout << "time: " << t*1000 << " ms" << endl; cout << "time: " << (t / repeats)*1000 << " ms" << endl;
// show input image // show input image
showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100) showImage("Input", mIn, 100, 100); // show at position (x_from_left=100,y_from_above=100)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment