Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
C
cuda_lab
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Gaurav Kukreja
cuda_lab
Commits
ef6380e1
Commit
ef6380e1
authored
Mar 06, 2014
by
Miklós Homolya
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
eliminate local memory in compute_P
parent
a3bd58c1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
24 additions
and
17 deletions
+24
-17
main.cu
miklos/ex11/main.cu
+24
-17
No files found.
miklos/ex11/main.cu
View file @
ef6380e1
...
@@ -54,38 +54,44 @@ __device__ __host__ float huber(float s, float epsilon)
...
@@ -54,38 +54,44 @@ __device__ __host__ float huber(float s, float epsilon)
return 1.0F / max(epsilon, s);
return 1.0F / max(epsilon, s);
}
}
__global__ void compute_P(float *image, float *
vx, float *v
y, int w, int h, int nc, float epsilon)
__global__ void compute_P(float *image, float *
Px, float *P
y, int w, int h, int nc, float epsilon)
{
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int y = threadIdx.y + blockDim.y * blockIdx.y;
extern __shared__ float sh_u[];
if (x < w && y < h) {
if (x < w && y < h) {
float ux[3], uy[3];
int b = threadIdx.x + blockDim.x * threadIdx.y;
int B = blockDim.x * blockDim.y;
float G2 = 0;
float G2 = 0;
for (int c = 0; c < nc; c++) {
for (int c = 0; c < nc; c++) {
int i = x + w*y + w*h*c;
int i = x + w*y + w*h*c;
ux[c] = ((x < w-1) ? (image[i + 1] - image[i]) : 0);
float ux = ((x < w-1) ? (image[i + 1] - image[i]) : 0);
uy[c] = ((y < h-1) ? (image[i + w] - image[i]) : 0);
float uy = ((y < h-1) ? (image[i + w] - image[i]) : 0);
G2 += ux[c]*ux[c] + uy[c]*uy[c];
sh_u[b + B*c + B*nc*0] = ux;
sh_u[b + B*c + B*nc*1] = uy;
G2 += ux*ux + uy*uy;
}
}
float g = huber(sqrtf(G2), epsilon);
float g = huber(sqrtf(G2), epsilon);
for (int c = 0; c < nc; c++) {
for (int c = 0; c < nc; c++) {
int i = x + w*y + w*h*c;
int i = x + w*y + w*h*c;
vx[i] = g * ux[c
];
Px[i] = g * sh_u[b + B*c + B*nc*0
];
vy[i] = g * uy[c
];
Py[i] = g * sh_u[b + B*c + B*nc*1
];
}
}
}
}
}
}
__global__ void divergence_and_update(float *image, float *
u1, float *u2
, int w, int h, int nc, float tau)
__global__ void divergence_and_update(float *image, float *
Px, float *Py
, int w, int h, int nc, float tau)
{
{
int x = threadIdx.x + blockDim.x * blockIdx.x;
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x < w && y < h) {
if (x < w && y < h) {
for (int c = 0; c < nc; c++) {
for (int c = 0; c < nc; c++) {
int i = x + w*y + w*h*c;
int i = x + w*y + w*h*c;
float dx_u1 =
u1[i] - ((x > 0) ? u1
[i - 1] : 0);
float dx_u1 =
Px[i] - ((x > 0) ? Px
[i - 1] : 0);
float dy_u2 =
u2[i] - ((y > 0) ? u2
[i - w] : 0);
float dy_u2 =
Py[i] - ((y > 0) ? Py
[i - w] : 0);
image[i] += tau * (dx_u1 + dy_u2);
image[i] += tau * (dx_u1 + dy_u2);
}
}
}
}
...
@@ -231,23 +237,24 @@ int main(int argc, char **argv)
...
@@ -231,23 +237,24 @@ int main(int argc, char **argv)
dim3 block(32, 16);
dim3 block(32, 16);
dim3 grid = make_grid(dim3(w, h, 1), block);
dim3 grid = make_grid(dim3(w, h, 1), block);
size_t smBytes = (size_t)block.x*block.y*nc*2*sizeof(float);
Timer timer; timer.start();
Timer timer; timer.start();
float *d_image, *d_
vx, *d_v
y;
float *d_image, *d_
Px, *d_P
y;
cudaMalloc(&d_image, imageBytes);
cudaMalloc(&d_image, imageBytes);
cudaMalloc(&d_
v
x, imageBytes);
cudaMalloc(&d_
P
x, imageBytes);
cudaMalloc(&d_
v
y, imageBytes);
cudaMalloc(&d_
P
y, imageBytes);
cudaMemcpy(d_image, imgIn, imageBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_image, imgIn, imageBytes, cudaMemcpyHostToDevice);
for (int n = 0; n < N; n++) {
for (int n = 0; n < N; n++) {
compute_P<<< grid, block
>>>(d_image, d_vx, d_v
y, w, h, nc, epsilon);
compute_P<<< grid, block
, smBytes >>>(d_image, d_Px, d_P
y, w, h, nc, epsilon);
divergence_and_update<<< grid, block >>>(d_image, d_
vx, d_v
y, w, h, nc, tau);
divergence_and_update<<< grid, block >>>(d_image, d_
Px, d_P
y, w, h, nc, tau);
}
}
cudaMemcpy(imgOut, d_image, imageBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(imgOut, d_image, imageBytes, cudaMemcpyDeviceToHost);
cudaFree(d_image);
cudaFree(d_image);
cudaFree(d_
v
x);
cudaFree(d_
P
x);
cudaFree(d_
v
y);
cudaFree(d_
P
y);
timer.end(); float t = timer.get(); // elapsed time in seconds
timer.end(); float t = timer.get(); // elapsed time in seconds
cout << "time: " << t*1000 << " ms" << endl;
cout << "time: " << t*1000 << " ms" << endl;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment