HologramDepthmap Library
HologramKernel.cu
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <cuda.h>
#include <vector>
#include <device_launch_parameters.h>
#include <cufft.h>

static const int kBlockThreads = 512;

// FFT shift: moves the zero-frequency component between the corners and the center of an
// nx x ny field, optionally dividing by nx*ny to normalize an inverse FFT.
__global__ void fftShift(int N, int nx, int ny, cufftDoubleComplex* input, cufftDoubleComplex* output, bool bNormalized)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    double normalF = 1.0;
    if (bNormalized)
        normalF = nx * ny;

    while (tid < N)
    {
        int i = tid % nx;
        int j = tid / nx;

        int ti = i - nx / 2; if (ti < 0) ti += nx;
        int tj = j - ny / 2; if (tj < 0) tj += ny;

        int oindex = tj * nx + ti;

        output[tid].x = input[oindex].x / normalF;
        output[tid].y = input[oindex].y / normalF;

        tid += blockDim.x * gridDim.x;
    }
}

// Complex exponential in place: val <- exp(val) = exp(Re) * (cos(Im) + i*sin(Im)).
__device__ void exponent_complex(cuDoubleComplex* val)
{
    double exp_val = exp(val->x);
    double cos_v;
    double sin_v;
    sincos(val->y, &sin_v, &cos_v);

    val->x = exp_val * cos_v;
    val->y = exp_val * sin_v;
}


// Builds the complex field of one depth layer: pixels whose quantized depth index matches
// the current layer dtr keep their normalized image amplitude, all others are zero; the
// result is then multiplied by the random-phase and carrier-phase factors.
__global__ void depth_sources_kernel(cufftDoubleComplex* u_o_gpu, unsigned char* img_src_gpu, unsigned char* dimg_src_gpu, double* depth_index_gpu,
    int dtr, double rand_phase_val_a, double rand_phase_val_b, double carrier_phase_delay_a, double carrier_phase_delay_b, int pnx, int pny,
    int FLAG_CHANGE_DEPTH_QUANTIZATION, unsigned int DEFAULT_DEPTH_QUANTIZATION)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < pnx*pny) {

        double img = ((double)img_src_gpu[tid]) / 255.0;
        double depth_idx;
        if (FLAG_CHANGE_DEPTH_QUANTIZATION == 1)
            depth_idx = depth_index_gpu[tid];
        else
            depth_idx = (double)DEFAULT_DEPTH_QUANTIZATION - (double)dimg_src_gpu[tid];

        double alpha_map = ((double)img_src_gpu[tid] > 0.0 ? 1.0 : 0.0);

        u_o_gpu[tid].x = img * alpha_map * (depth_idx == (double)dtr ? 1.0 : 0.0);
        u_o_gpu[tid].y = 0.0;   // layer amplitude is purely real before the phase terms are applied

        cuDoubleComplex tmp1 = cuCmul(make_cuDoubleComplex(rand_phase_val_a, rand_phase_val_b), make_cuDoubleComplex(carrier_phase_delay_a, carrier_phase_delay_b));
        u_o_gpu[tid] = cuCmul(u_o_gpu[tid], tmp1);
    }
}

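/*
 * Note on the kernel below: for spatial frequencies (fxx, fyy) inside the mask it applies
 * the angular-spectrum transfer function
 *     H(fxx, fyy) = exp( i * k * z * sqrt(1 - (lambda*fxx)^2 - (lambda*fyy)^2) ),
 * with k = params_k and z = propagation_dist, and accumulates H * input_d into u_complex;
 * frequencies outside the mask contribute zero.
 */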
__global__ void propagation_angularsp_kernel(cufftDoubleComplex* input_d, cufftDoubleComplex* u_complex, int pnx, int pny,
    double ppx, double ppy, double ssx, double ssy, double lambda, double params_k, double propagation_dist)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < pnx*pny) {

        int x = tid % pnx;
        int y = tid / pnx;

        double fxx = (-1.0 / (2.0*ppx)) + (1.0 / ssx) * (double)x;
        double fyy = (1.0 / (2.0*ppy)) - (1.0 / ssy) - (1.0 / ssy) * (double)y;

        double sval = sqrt(1 - (lambda*fxx)*(lambda*fxx) - (lambda*fyy)*(lambda*fyy));
        sval *= params_k * propagation_dist;

        cuDoubleComplex kernel = make_cuDoubleComplex(0, sval);
        exponent_complex(&kernel);

        int prop_mask = ((fxx * fxx + fyy * fyy) < (params_k * params_k)) ? 1 : 0;

        cuDoubleComplex u_frequency = make_cuDoubleComplex(0, 0);
        if (prop_mask == 1)
            u_frequency = cuCmul(kernel, input_d[tid]);

        u_complex[tid] = cuCadd(u_complex[tid], u_frequency);
    }
}


// Copies the region [cropx1, cropx2] x [cropy1, cropy2] of the input field into the
// output field; pixels outside the crop window are left untouched.
__global__ void cropFringe(int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* out_field, int cropx1, int cropx2, int cropy1, int cropy2)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < nx*ny)
    {
        int x = tid % nx;
        int y = tid / nx;

        if (x >= cropx1 && x <= cropx2 && y >= cropy1 && y <= cropy2)
            out_field[tid] = in_field[tid];
    }
}


// Multiplies the field by a linear phase ramp selected by sig_locationx / sig_locationy
// (-1, 0, or +1 per axis) to shift the signal term off-axis.
__global__ void getFringe(int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* out_field, int sig_locationx, int sig_locationy,
    double ssx, double ssy, double ppx, double ppy, double pi)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < nx*ny)
    {
        cuDoubleComplex shift_phase = make_cuDoubleComplex(1, 0);

        if (sig_locationy != 0)
        {
            int r = tid / nx;
            double yy = (ssy / 2.0) - (ppy)*(double)r - ppy;

            cuDoubleComplex val = make_cuDoubleComplex(0, 0);
            if (sig_locationy == 1)
                val.y = 2.0 * pi * (yy / (4.0 * ppy));
            else
                val.y = 2.0 * pi * (-yy / (4.0 * ppy));

            exponent_complex(&val);

            shift_phase = cuCmul(shift_phase, val);
        }

        if (sig_locationx != 0)
        {
            int c = tid % nx;
            double xx = (-ssx / 2.0) - (ppx)*(double)c - ppx;

            cuDoubleComplex val = make_cuDoubleComplex(0, 0);
            if (sig_locationx == -1)
                val.y = 2.0 * pi * (-xx / (4.0 * ppx));
            else
                val.y = 2.0 * pi * (xx / (4.0 * ppx));

            exponent_complex(&val);
            shift_phase = cuCmul(shift_phase, val);
        }

        out_field[tid] = cuCmul(in_field[tid], shift_phase);
    }
}

// Depth quantization: converts the 8-bit depth map to a physical depth in
// [near_depth, far_depth] and, for pixels falling into the current layer's interval
// [d1, d2), accumulates the 1-based layer id (dtr + 1) into depth_index_gpu.
__global__ void change_depth_quan_kernel(double* depth_index_gpu, unsigned char* dimg_src_gpu, int pnx, int pny,
    int dtr, double d1, double d2, double num_depth, double far_depth, double near_depth)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < pnx*pny) {

        int tdepth;
        double dmap_src = double(dimg_src_gpu[tid]) / 255.0;
        double dmap = (1.0 - dmap_src)*(far_depth - near_depth) + near_depth;

        if (dtr < num_depth - 1)
            tdepth = (dmap >= d1 ? 1 : 0) * (dmap < d2 ? 1 : 0);
        else
            tdepth = (dmap >= d1 ? 1 : 0) * (dmap <= d2 ? 1 : 0);   // last layer includes its upper bound

        depth_index_gpu[tid] = depth_index_gpu[tid] + (double)(tdepth * (dtr + 1));
    }
}

189 extern "C"
190 void cudaFFT(CUstream_st* stream, int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* output_field, int direction, bool bNormalized)
191 {
192  unsigned int nblocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
193  int N = nx* ny;
194  fftShift << <nblocks, kBlockThreads, 0, stream >> >(N, nx, ny, in_field, output_field, false);
195 
196  cufftHandle plan;
197 
198  // fft
199  if (cufftPlan2d(&plan, ny, nx, CUFFT_Z2Z) != CUFFT_SUCCESS)
200  {
201  //LOG("FAIL in creating cufft plan");
202  return;
203  };
204 
205  cufftResult result;
206 
207  if (direction == -1)
208  result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_FORWARD);
209  else
210  result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_INVERSE);
211 
212  if (result != CUFFT_SUCCESS)
213  {
214  //LOG("------------------FAIL: execute cufft, code=%s", result);
215  return;
216  }
217 
218  if (cudaDeviceSynchronize() != cudaSuccess) {
219  //LOG("Cuda error: Failed to synchronize\n");
220  return;
221  }
222 
223  fftShift << < nblocks, kBlockThreads, 0, stream >> >(N, nx, ny, in_field, output_field, bNormalized);
224 
225  cufftDestroy(plan);
226 
227 }
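
// A minimal host-side sketch of using cudaFFT for a forward / inverse round trip on a
// pnx x pny field. It assumes d_in and d_out are device buffers of pnx*pny
// cufftDoubleComplex elements and that `stream` is a valid CUDA stream; the function and
// buffer names here are illustrative only.
static void exampleFFTRoundTrip(CUstream_st* stream, int pnx, int pny,
    cufftDoubleComplex* d_in, cufftDoubleComplex* d_out)
{
    // forward FFT (direction == -1); the shifted spectrum ends up in d_out
    cudaFFT(stream, pnx, pny, d_in, d_out, -1, false);

    // inverse FFT (any other direction value) with 1/(pnx*pny) normalization;
    // the reconstructed field ends up in d_in
    cudaFFT(stream, pnx, pny, d_out, d_in, 1, true);
}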

extern "C"
void cudaDepthHoloKernel(CUstream_st* stream, int pnx, int pny, cufftDoubleComplex* u_o_gpu, unsigned char* img_src_gpu, unsigned char* dimg_src_gpu, double* depth_index_gpu,
    int dtr, double rand_phase_val_a, double rand_phase_val_b, double carrier_phase_delay_a, double carrier_phase_delay_b, int flag_change_depth_quan, unsigned int default_depth_quan)
{
    dim3 grid((pnx*pny + kBlockThreads - 1) / kBlockThreads, 1, 1);
    depth_sources_kernel << <grid, kBlockThreads, 0, stream >> >(u_o_gpu, img_src_gpu, dimg_src_gpu, depth_index_gpu,
        dtr, rand_phase_val_a, rand_phase_val_b, carrier_phase_delay_a, carrier_phase_delay_b, pnx, pny, flag_change_depth_quan, default_depth_quan);
}

238 extern "C"
239 void cudaPropagation_AngularSpKernel(CUstream_st* stream, int pnx, int pny, cufftDoubleComplex* input_d, cufftDoubleComplex* u_complex,
240  double ppx, double ppy, double ssx, double ssy, double lambda, double params_k, double propagation_dist)
241 {
242  dim3 grid((pnx*pny + kBlockThreads - 1) / kBlockThreads, 1, 1);
243  propagation_angularsp_kernel << <grid, kBlockThreads, 0, stream >> >(input_d, u_complex, pnx, pny, ppx, ppy, ssx, ssy, lambda, params_k, propagation_dist);
244 
245 }
246 
247 extern "C"
248 void cudaCropFringe(CUstream_st* stream, int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* out_field, int cropx1,int cropx2, int cropy1, int cropy2)
249 {
250  unsigned int nblocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
251 
252  cropFringe << < nblocks, kBlockThreads, 0, stream >> > (nx, ny, in_field, out_field, cropx1, cropx2, cropy1, cropy2);
253 
254 }
255 
256 extern "C"
257 void cudaGetFringe(CUstream_st* stream, int pnx, int pny, cufftDoubleComplex* in_field, cufftDoubleComplex* out_field, int sig_locationx, int sig_locationy,
258  double ssx, double ssy, double ppx, double ppy, double PI)
259 {
260  unsigned int nblocks = (pnx*pny + kBlockThreads - 1) / kBlockThreads;
261 
262  getFringe << < nblocks, kBlockThreads, 0, stream >> > (pnx, pny, in_field, out_field, sig_locationx, sig_locationy, ssx, ssy, ppx, ppy, PI);
263 
264 }
265 
266 extern "C"
267 void cudaChangeDepthQuanKernel(CUstream_st* stream, int pnx, int pny, double* depth_index_gpu, unsigned char* dimg_src_gpu,
268  int dtr, double d1, double d2, double params_num_of_depth, double params_far_depthmap, double params_near_depthmap)
269 {
270  dim3 grid((pnx*pny + kBlockThreads - 1) / kBlockThreads, 1, 1);
271  change_depth_quan_kernel << <grid, kBlockThreads, 0, stream >> > (depth_index_gpu, dimg_src_gpu, pnx, pny,
272  dtr, d1, d2, params_num_of_depth, params_far_depthmap, params_near_depthmap);
273 
274 }
275
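
// A hedged sketch of how a host-side driver might chain the wrappers above for one frame:
// quantize the depth map into layers, build each layer's complex field, transform it to
// the frequency domain, and accumulate the propagated spectrum in u_complex. The loop
// structure, the unit phase factors, and the buffer / parameter names (u_tmp, layer_begin,
// layer_end, layer_dist, ...) are assumptions made for this example, not the library's
// documented calling sequence.
static void exampleDepthmapPass(CUstream_st* stream, int pnx, int pny, int num_of_depth,
    cufftDoubleComplex* u_o_gpu, cufftDoubleComplex* u_tmp, cufftDoubleComplex* u_complex,
    unsigned char* img_src_gpu, unsigned char* dimg_src_gpu, double* depth_index_gpu,
    const double* layer_begin, const double* layer_end, const double* layer_dist,
    double far_depth, double near_depth,
    double ppx, double ppy, double ssx, double ssy, double lambda, double params_k)
{
    size_t field_bytes = sizeof(cufftDoubleComplex) * pnx * pny;

    // start from a clean depth index map and an empty accumulated spectrum
    cudaMemsetAsync(depth_index_gpu, 0, sizeof(double) * pnx * pny, stream);
    cudaMemsetAsync(u_complex, 0, field_bytes, stream);

    for (int dtr = 0; dtr < num_of_depth; ++dtr)
    {
        // mark pixels whose physical depth falls into this layer's interval;
        // the kernel stores the 1-based layer id (dtr + 1)
        cudaChangeDepthQuanKernel(stream, pnx, pny, depth_index_gpu, dimg_src_gpu,
            dtr, layer_begin[dtr], layer_end[dtr], (double)num_of_depth, far_depth, near_depth);

        // build the layer field; unit random-phase and carrier-phase factors are used
        // here only to keep the example short
        cudaMemsetAsync(u_o_gpu, 0, field_bytes, stream);
        cudaDepthHoloKernel(stream, pnx, pny, u_o_gpu, img_src_gpu, dimg_src_gpu, depth_index_gpu,
            dtr + 1, 1.0, 0.0, 1.0, 0.0, 1, (unsigned int)num_of_depth);

        // layer field -> shifted spectrum (u_tmp), then propagate and accumulate
        cudaFFT(stream, pnx, pny, u_o_gpu, u_tmp, -1, false);
        cudaPropagation_AngularSpKernel(stream, pnx, pny, u_tmp, u_complex,
            ppx, ppy, ssx, ssy, lambda, params_k, layer_dist[dtr]);
    }
    // u_complex now holds the accumulated spectrum; an inverse cudaFFT and the
    // cudaCropFringe / cudaGetFringe steps would typically follow.
}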