Commit b0bc636c authored by Gaurav Kukreja

Miklos's working code for convolution

Signed-off-by: Gaurav Kukreja <gaurav@gauravk.in>
parent 636c1600
...@@ -18,8 +18,9 @@ ...@@ -18,8 +18,9 @@
// ### // ###
// ### TODO: For every student of your group, please provide here: // ### TODO: For every student of your group, please provide here:
// ### // ###
// ### name, email, login username (for example p123) // ### Gaurav Kukreja, gaurav.kukreja@tum.de, p058
// ### // ### Miklos Homolya, miklos.homolya@tum.de, p056
// ### Ravikishore Kommajosyula, r.kommajosyula, p057
// ### // ###
...@@ -234,7 +235,8 @@ int main(int argc, char **argv) ...@@ -234,7 +235,8 @@ int main(int argc, char **argv)
Timer timer; timer.start(); Timer timer; timer.start();
#define CPU for (int measurement = 0; measurement < repeats; measurement++) {
//#define CPU
#ifdef CPU #ifdef CPU
for (int c = 0; c < nc; c++) { for (int c = 0; c < nc; c++) {
for (int y = 0; y < h; y++) { for (int y = 0; y < h; y++) {
...@@ -255,29 +257,21 @@ int main(int argc, char **argv) ...@@ -255,29 +257,21 @@ int main(int argc, char **argv)
float *d_in, *d_out, *d_kern; float *d_in, *d_out, *d_kern;
size_t nbytes = (size_t)w*h*nc*sizeof(float); size_t nbytes = (size_t)w*h*nc*sizeof(float);
cudaMalloc(&d_in, nbytes); cudaMalloc(&d_in, nbytes);
CUDA_CHECK;
cudaMalloc(&d_out, nbytes); cudaMalloc(&d_out, nbytes);
CUDA_CHECK;
cudaMalloc(&d_kern, (size_t)ksize*ksize*sizeof(float)); cudaMalloc(&d_kern, (size_t)ksize*ksize*sizeof(float));
CUDA_CHECK;
cudaMemcpy(d_in, imgIn, nbytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in, imgIn, nbytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_kern, kern, (size_t)ksize*ksize*sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_kern, kern, (size_t)ksize*ksize*sizeof(float), cudaMemcpyHostToDevice);
CUDA_CHECK;
dim3 block(16, 8, 3); dim3 block(16, 8, 3);
dim3 grid = make_grid(dim3(w, h, nc), block); dim3 grid = make_grid(dim3(w, h, nc), block);
convolution<<<grid, block>>>(d_in, d_out, d_kern, w, h, nc, r); convolution<<<grid, block>>>(d_in, d_out, d_kern, w, h, nc, r);
CUDA_CHECK;
cudaMemcpy(imgOut, d_out, nbytes, cudaMemcpyDeviceToHost); cudaMemcpy(imgOut, d_out, nbytes, cudaMemcpyDeviceToHost);
CUDA_CHECK;
cudaFree(d_in); cudaFree(d_in);
CUDA_CHECK;
cudaFree(d_out); cudaFree(d_out);
CUDA_CHECK;
cudaFree(d_kern); cudaFree(d_kern);
CUDA_CHECK;
#endif #endif
}
timer.end(); float t = timer.get(); // elapsed time in seconds timer.end(); float t = timer.get(); // elapsed time in seconds
cout << "time: " << t*1000 << " ms" << endl; cout << "time: " << (t / repeats)*1000 << " ms" << endl;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment