eliminate local memory in compute_P

ef6380e1 · Miklós Homolya · a3bd58c1 · ef6380e1
Commit ef6380e1 authored Mar 06, 2014 by Miklós Homolya
Hide whitespace changes
Inline Side-by-side

Showing with 24 additions and 17 deletions

main.cu miklos/ex11/main.cu +24 -17

No files found.
--- a/miklos/ex11/main.cu
+++ b/miklos/ex11/main.cu
@@ -54,38 +54,44 @@ __device__ __host__ float huber(float s, float epsilon)
    return 1.0F / max(epsilon, s);
 }
-__global__ void compute_P(float *image, float *vx, float *vy, int w, int h, int nc, float epsilon)
+__global__ void compute_P(float *image, float *Px, float *Py, int w, int h, int nc, float epsilon)
 {
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
+    extern __shared__ float sh_u[];
    if (x < w && y < h) {
-        float ux[3], uy[3];
+        int b = threadIdx.x + blockDim.x * threadIdx.y;
+        int B = blockDim.x * blockDim.y;
        float G2 = 0;
        for (int c = 0; c < nc; c++) {
            int i = x + w*y + w*h*c;
-            ux[c] = ((x < w-1) ? (image[i + 1] - image[i]) : 0);
+            float ux = ((x < w-1) ? (image[i + 1] - image[i]) : 0);
-            uy[c] = ((y < h-1) ? (image[i + w] - image[i]) : 0);
+            float uy = ((y < h-1) ? (image[i + w] - image[i]) : 0);
-            G2 += ux[c]*ux[c] + uy[c]*uy[c];
+            sh_u[b + B*c + B*nc*0] = ux;
+            sh_u[b + B*c + B*nc*1] = uy;
+            G2 += ux*ux + uy*uy;
        }
        float g = huber(sqrtf(G2), epsilon);
        for (int c = 0; c < nc; c++) {
            int i = x + w*y + w*h*c;
-            vx[i] = g * ux[c];
+            Px[i] = g * sh_u[b + B*c + B*nc*0];
-            vy[i] = g * uy[c];
+            Py[i] = g * sh_u[b + B*c + B*nc*1];
        }
    }
 }
-__global__ void divergence_and_update(float *image, float *u1, float *u2, int w, int h, int nc, float tau)
+__global__ void divergence_and_update(float *image, float *Px, float *Py, int w, int h, int nc, float tau)
 {
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
    if (x < w && y < h) {
        for (int c = 0; c < nc; c++) {
            int i = x + w*y + w*h*c;
-            float dx_u1 = u1[i] - ((x > 0) ? u1[i - 1] : 0);
+            float dx_u1 = Px[i] - ((x > 0) ? Px[i - 1] : 0);
-            float dy_u2 = u2[i] - ((y > 0) ? u2[i - w] : 0);
+            float dy_u2 = Py[i] - ((y > 0) ? Py[i - w] : 0);
            image[i] += tau * (dx_u1 + dy_u2);
        }
    }
@@ -231,23 +237,24 @@ int main(int argc, char **argv)
    dim3 block(32, 16);
    dim3 grid = make_grid(dim3(w, h, 1), block);
+    size_t smBytes = (size_t)block.x*block.y*nc*2*sizeof(float);
    Timer timer; timer.start();
-    float *d_image, *d_vx, *d_vy;
+    float *d_image, *d_Px, *d_Py;
    cudaMalloc(&d_image, imageBytes);
-    cudaMalloc(&d_vx, imageBytes);
+    cudaMalloc(&d_Px, imageBytes);
-    cudaMalloc(&d_vy, imageBytes);
+    cudaMalloc(&d_Py, imageBytes);
    cudaMemcpy(d_image, imgIn, imageBytes, cudaMemcpyHostToDevice);
    for (int n = 0; n < N; n++) {
-        compute_P<<< grid, block >>>(d_image, d_vx, d_vy, w, h, nc, epsilon);
+        compute_P<<< grid, block, smBytes >>>(d_image, d_Px, d_Py, w, h, nc, epsilon);
-        divergence_and_update<<< grid, block >>>(d_image, d_vx, d_vy, w, h, nc, tau);
+        divergence_and_update<<< grid, block >>>(d_image, d_Px, d_Py, w, h, nc, tau);
    }
    cudaMemcpy(imgOut, d_image, imageBytes, cudaMemcpyDeviceToHost);
    cudaFree(d_image);
-    cudaFree(d_vx);
+    cudaFree(d_Px);
-    cudaFree(d_vy);
+    cudaFree(d_Py);
    timer.end();  float t = timer.get();  // elapsed time in seconds
    cout << "time: " << t*1000 << " ms" << endl;