Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
C
cuda_lab
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Gaurav Kukreja
cuda_lab
Commits
9e1f16ff
Commit
9e1f16ff
authored
Mar 27, 2014
by
Ravi
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
texture memory optimization - 6.98 to 7.78 FPS improvement
parent
563c4e97
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
40 additions
and
34 deletions
+40
-34
glwidget.cpp
miklos/project_integration/glwidget.cpp
+1
-1
glwidget.h
miklos/project_integration/glwidget.h
+1
-1
kernel.cu
miklos/project_integration/kernel.cu
+35
-29
kernel.h
miklos/project_integration/kernel.h
+2
-2
main.cpp
miklos/project_integration/main.cpp
+1
-1
No files found.
miklos/project_integration/glwidget.cpp
View file @
9e1f16ff
...
...
@@ -38,7 +38,7 @@ void GlWidget::initializeGL()
cudaGraphicsGLRegisterBuffer
(
&
pixelsVBO_CUDA
,
pixelsVBO
,
cudaGraphicsMapFlagsWriteDiscard
);
size_t
inBytes
=
camera
.
width
()
*
camera
.
height
()
*
sizeof
(
float
);
cudaMalloc
(
&
d_in
,
inBytes
);
cudaMalloc
(
(
void
**
)
&
d_in
,
inBytes
);
}
void
GlWidget
::
paintGL
()
...
...
miklos/project_integration/glwidget.h
View file @
9e1f16ff
...
...
@@ -12,6 +12,7 @@ public:
explicit
GlWidget
(
QWidget
*
parent
=
0
);
~
GlWidget
();
QSize
sizeHint
()
const
;
float
*
d_in
;
protected
:
void
initializeGL
();
...
...
@@ -19,7 +20,6 @@ protected:
private
:
QGLFunctions
gl
;
void
*
d_in
;
};
#endif // GLWIDGET_H
miklos/project_integration/kernel.cu
View file @
9e1f16ff
...
...
@@ -4,6 +4,9 @@
#include <algorithm>
#include <stdio.h>
texture<float,2,cudaReadModeElementType> texRef_Xi;
texture<float,2,cudaReadModeElementType> texRef_Xj;
template<typename T>
__device__ __host__ T min(T a, T b)
{
...
...
@@ -29,7 +32,8 @@ __global__ void calculate_F(float *U, float *F, int w, int h, float c1, float c2
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x < w && y < h) {
size_t i = x + (size_t)w*y;
F[i] = lambda * ((c1 - U[i])*(c1 - U[i]) - (c2 - U[i])*(c2 - U[i]));
float temp_ui = U[i];
F[i] = lambda * ((c1 - temp_ui)*(c1 - temp_ui) - (c2 - temp_ui)*(c2 - temp_ui));
}
}
...
...
@@ -50,10 +54,10 @@ __global__ void update_Xij(float *Xi, float *Xj, float *T, float *U, int w, int
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x < w && y < h) {
size_t i = x + (size_t)w*
y;
size_t i = x + (size_t) w *
y;
float xi = Xi[i] - sigma * (2 * diff_i(U, w, h, x, y) - diff_i(T, w, h, x, y));
float xj = Xj[i] - sigma * (2 * diff_j(U, w, h, x, y) - diff_j(T, w, h, x, y));
float dn = max(1.f, sqrtf(xi*xi + xj*
xj));
float dn = max(1.f, sqrtf(xi * xi + xj *
xj));
Xi[i] = xi / dn;
Xj[i] = xj / dn;
}
...
...
@@ -61,9 +65,9 @@ __global__ void update_Xij(float *Xi, float *Xj, float *T, float *U, int w, int
/**
 * Backward-difference divergence of the dual field at pixel (x, y).
 *
 * The field components are fetched through the file-scope texture references
 * texRef_Xi and texRef_Xj (bound to d_Xi / d_Xj in allocate_device_memory).
 * The raw pointers X and Y are kept in the signature so existing callers keep
 * compiling, but they are not dereferenced here.
 *
 * Boundary handling matches the pre-texture implementation:
 *   - the forward term is treated as 0 on the last column / last row,
 *   - the backward term is treated as 0 on the first column / first row.
 * The guards are required: with clamp addressing an out-of-range fetch
 * returns the edge texel, so without them the two terms would cancel on the
 * first column/row instead of leaving X(x, y) / Y(x, y).
 *
 * @param X  unused (x-component is read from texRef_Xi)
 * @param Y  unused (y-component is read from texRef_Xj)
 * @param w  image width in pixels
 * @param h  image height in pixels
 * @param x  pixel column, expected in [0, w)
 * @param y  pixel row, expected in [0, h)
 * @return   d/dx of the x-component plus d/dy of the y-component
 */
__device__ float divergence(float *X, float *Y, int w, int h, int x, int y)
{
    (void)X;
    (void)Y;

    // With unnormalized coordinates the centre of texel k lies at k + 0.5,
    // so these coordinates address texels exactly and linear filtering
    // contributes no blending.
    const float cx = x + 0.5f;
    const float cy = y + 0.5f;

    const float x_fwd = (x + 1 < w) ? tex2D(texRef_Xi, cx, cy) : 0.f;
    const float x_bwd = (x > 0) ? tex2D(texRef_Xi, x - 0.5f, cy) : 0.f;
    const float y_fwd = (y + 1 < h) ? tex2D(texRef_Xj, cx, cy) : 0.f;
    const float y_bwd = (y > 0) ? tex2D(texRef_Xj, cx, y - 0.5f) : 0.f;

    float dx_x = x_fwd - x_bwd;
    float dy_y = y_fwd - y_bwd;
    return dx_x + dy_y;
}
...
...
@@ -100,36 +104,38 @@ inline dim3 make_grid(dim3 whole, dim3 block)
div_ceil(whole.z, block.z));
}
/**
 * Converts a single-channel float image into an opaque grey RGBA pixel
 * buffer, flipping it vertically (source row y is written to destination
 * row h-1-y).
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid may
 * overshoot the image (e.g. when sized by rounding up); threads outside
 * [0, w) x [0, h) return without touching memory.
 *
 * @param in     w*h floats, row-major; values are clamped to [0, 1] before
 *               being scaled to 0..255
 * @param pixel  w*h uchar4 output pixels, row-major
 * @param w      image width in pixels
 * @param h      image height in pixels
 */
__global__ void createVertices(float *in, uchar4* pixel, int w, int h)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: without this, edge blocks read and write out of range.
    if (w <= 0 || h <= 0 || x >= (unsigned int)w || y >= (unsigned int)h)
        return;

    // Index in size_t so large images cannot wrap 32-bit arithmetic.
    const size_t src = (size_t)y * (size_t)w + x;
    const size_t dst = (size_t)((unsigned int)h - 1u - y) * (size_t)w + x;

    // Clamp first: converting an out-of-range float to unsigned char is undefined.
    const float level = fminf(fmaxf(in[src], 0.f), 1.f);
    const unsigned char intensity = (unsigned char)roundf(255.f * level);

    // Write positions
    pixel[dst] = make_uchar4(intensity, intensity, intensity, 255);
}
static float *d_T, *d_F, *d_Xi, *d_Xj;
void allocate_device_memory(size_t w, size_t h)
/* Prints a diagnostic for a failed CUDA runtime call; returns true on success. */
static bool allocate_device_memory_check(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "allocate_device_memory: %s failed: %s\n",
            what, cudaGetErrorString(status));
    return false;
}

/**
 * Allocates the file-scope work buffers d_T, d_F, d_Xi and d_Xj (w*h floats
 * each) and binds d_Xi / d_Xj to the 2D texture references texRef_Xi /
 * texRef_Xj.
 *
 * Every CUDA call is checked. On the first failure a message is written to
 * stderr and the function returns early, leaving the remaining buffers
 * unallocated and the textures unbound.
 *
 * @param d_in  input image buffer owned by the caller; not used here
 * @param w     image width in pixels
 * @param h     image height in pixels
 */
void allocate_device_memory(float *d_in, size_t w, size_t h)
{
    (void)d_in;

    const size_t imageBytes = w*h*sizeof(float);
    const size_t rowBytes = w*sizeof(float);

    if (!allocate_device_memory_check(cudaMalloc(&d_T, imageBytes), "cudaMalloc(d_T)")) return;
    if (!allocate_device_memory_check(cudaMalloc(&d_F, imageBytes), "cudaMalloc(d_F)")) return;
    if (!allocate_device_memory_check(cudaMalloc(&d_Xi, imageBytes), "cudaMalloc(d_Xi)")) return;
    if (!allocate_device_memory_check(cudaMalloc(&d_Xj, imageBytes), "cudaMalloc(d_Xj)")) return;

    // Texture attributes for Xi: unnormalized coordinates, out-of-range
    // coordinates clamped to the nearest edge texel, linear filtering.
    texRef_Xi.addressMode[0] = cudaAddressModeClamp;
    texRef_Xi.addressMode[1] = cudaAddressModeClamp;
    texRef_Xi.filterMode = cudaFilterModeLinear;
    texRef_Xi.normalized = false;
    cudaChannelFormatDesc desc_Xi = cudaCreateChannelDesc<float>();
    // NOTE(review): rows are tightly packed (pitch == w*sizeof(float)). The
    // bind can reject a pitch that does not meet the device's texture pitch
    // alignment, so the result must be checked; consider cudaMallocPitch.
    if (!allocate_device_memory_check(
            cudaBindTexture2D(NULL, &texRef_Xi, d_Xi, &desc_Xi, w, h, rowBytes),
            "cudaBindTexture2D(texRef_Xi)")) return;

    // Same attributes for Xj.
    texRef_Xj.addressMode[0] = cudaAddressModeClamp;
    texRef_Xj.addressMode[1] = cudaAddressModeClamp;
    texRef_Xj.filterMode = cudaFilterModeLinear;
    texRef_Xj.normalized = false;
    cudaChannelFormatDesc desc_Xj = cudaCreateChannelDesc<float>();
    if (!allocate_device_memory_check(
            cudaBindTexture2D(NULL, &texRef_Xj, d_Xj, &desc_Xj, w, h, rowBytes),
            "cudaBindTexture2D(texRef_Xj)")) return;
}
void executeKernel(
void *d_in
, void *d_out, size_t w, size_t h)
void executeKernel(
float *d_U
, void *d_out, size_t w, size_t h)
{
float *d_U = reinterpret_cast<float *>(d_in);
//
float *d_U = reinterpret_cast<float *>(d_in);
uchar4 *pixel = reinterpret_cast<uchar4 *>(d_out);
static Timer timer;
...
...
miklos/project_integration/kernel.h
View file @
9e1f16ff
#ifndef KERNEL_H
#define KERNEL_H
extern
"C"
void
allocate_device_memory
(
size_t
width
,
size_t
height
);
extern
"C"
void
executeKernel
(
void
*
d_in
,
void
*
d_out
,
size_t
width
,
size_t
height
);
extern
"C"
void
allocate_device_memory
(
float
*
d_in
,
size_t
width
,
size_t
height
);
extern
"C"
void
executeKernel
(
float
*
d_in
,
void
*
d_out
,
size_t
width
,
size_t
height
);
#endif // KERNEL_H
miklos/project_integration/main.cpp
View file @
9e1f16ff
...
...
@@ -17,7 +17,7 @@ int main(int argc, char *argv[])
}
QObject
::
connect
(
&
camera
,
SIGNAL
(
newFrame
()),
&
w
,
SLOT
(
updateGL
()));
allocate_device_memory
(
camera
.
width
(),
camera
.
height
());
allocate_device_memory
(
w
.
d_in
,
camera
.
width
(),
camera
.
height
());
camera
.
start
();
w
.
show
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment