1 #include <cuda_runtime.h>
5 #include <device_launch_parameters.h>
8 static const int kBlockThreads = 512;
10 __global__ void fftShift(int N, int nx, int ny, cufftDoubleComplex* input, cufftDoubleComplex* output, bool bNormailzed)
12 int tid = threadIdx.x + blockIdx.x*blockDim.x;
15 if (bNormailzed == true)
23 int ti = i - nx / 2; if (ti<0) ti += nx;
24 int tj = j - ny / 2; if (tj<0) tj += ny;
26 int oindex = tj*nx + ti;
29 output[tid].x = input[oindex].x / normalF;
30 output[tid].y = input[oindex].y / normalF;
32 tid += blockDim.x * gridDim.x;
36 __device__ void exponent_complex(cuDoubleComplex* val)
38 double exp_val = exp(val->x);
41 sincos(val->y, &sin_v, &cos_v);
43 val->x = exp_val * cos_v;
44 val->y = exp_val * sin_v;
49 __global__ void depth_sources_kernel(cufftDoubleComplex* u_o_gpu, unsigned char* img_src_gpu, unsigned char* dimg_src_gpu, double* depth_index_gpu,
50 int dtr, double rand_phase_val_a, double rand_phase_val_b, double carrier_phase_delay_a, double carrier_phase_delay_b, int pnx, int pny,
51 int FLAG_CHANGE_DEPTH_QUANTIZATION, unsigned int DEFAULT_DEPTH_QUANTIZATION)
53 int tid = threadIdx.x + blockIdx.x*blockDim.x;
57 double img = ((double)img_src_gpu[tid]) / 255.0;
59 if (FLAG_CHANGE_DEPTH_QUANTIZATION == 1)
60 depth_idx = depth_index_gpu[tid];
62 depth_idx = (double)DEFAULT_DEPTH_QUANTIZATION - (double)dimg_src_gpu[tid];
64 double alpha_map = ((double)img_src_gpu[tid] > 0.0 ? 1.0 : 0.0);
66 u_o_gpu[tid].x = img * alpha_map * (depth_idx == (double)dtr ? 1.0 : 0.0);
68 cuDoubleComplex tmp1 = cuCmul(make_cuDoubleComplex(rand_phase_val_a, rand_phase_val_b), make_cuDoubleComplex(carrier_phase_delay_a, carrier_phase_delay_b));
69 u_o_gpu[tid] = cuCmul(u_o_gpu[tid], tmp1);
75 __global__ void propagation_angularsp_kernel(cufftDoubleComplex* input_d, cufftDoubleComplex* u_complex, int pnx, int pny,
76 double ppx, double ppy, double ssx, double ssy, double lambda, double params_k, double propagation_dist)
78 int tid = threadIdx.x + blockIdx.x*blockDim.x;
85 double fxx = (-1.0 / (2.0*ppx)) + (1.0 / ssx) * (double)x;
86 double fyy = (1.0 / (2.0*ppy)) - (1.0 / ssy) - (1.0 / ssy) * (double)y;
88 double sval = sqrt(1 - (lambda*fxx)*(lambda*fxx) - (lambda*fyy)*(lambda*fyy));
89 sval *= params_k * propagation_dist;
91 cuDoubleComplex kernel = make_cuDoubleComplex(0, sval);
92 exponent_complex(&kernel);
94 int prop_mask = ((fxx * fxx + fyy * fyy) < (params_k *params_k)) ? 1 : 0;
96 cuDoubleComplex u_frequency = make_cuDoubleComplex(0, 0);
98 u_frequency = cuCmul(kernel,input_d[tid]);
100 u_complex[tid] = cuCadd(u_complex[tid], u_frequency);
106 __global__ void cropFringe(int nx, int ny, cufftDoubleComplex* in_filed, cufftDoubleComplex* out_filed, int cropx1, int cropx2, int cropy1, int cropy2)
108 int tid = threadIdx.x + blockIdx.x*blockDim.x;
115 if (x >= cropx1 && x <= cropx2 && y >= cropy1 && y <= cropy2)
116 out_filed[tid] = in_filed[tid];
121 __global__ void getFringe(int nx, int ny, cufftDoubleComplex* in_filed, cufftDoubleComplex* out_filed, int sig_locationx, int sig_locationy,
122 double ssx, double ssy, double ppx, double ppy, double pi)
124 int tid = threadIdx.x + blockIdx.x*blockDim.x;
128 cuDoubleComplex shift_phase = make_cuDoubleComplex(1, 0);
130 if (sig_locationy != 0)
134 double yy = (ssy / 2.0) - (ppy)*(double)r - ppy;
136 cuDoubleComplex val = make_cuDoubleComplex(0,0);
137 if (sig_locationy == 1)
138 val.y = 2.0 * pi * (yy / (4.0 * ppy));
140 val.y = 2.0 * pi * (-yy / (4.0 * ppy));
142 exponent_complex(&val);
144 shift_phase = cuCmul(shift_phase,val);
147 if (sig_locationx != 0)
151 double xx = (-ssx / 2.0) - (ppx)*(double)c - ppx;
153 cuDoubleComplex val = make_cuDoubleComplex(0, 0);
154 if (sig_locationx == -1)
155 val.y = 2.0 * pi * (-xx / (4.0 * ppx));
157 val.y = 2.0 * pi * (xx / (4.0 * ppx));
159 exponent_complex(&val);
160 shift_phase = cuCmul(shift_phase, val);
163 out_filed[tid] = cuCmul(in_filed[tid], shift_phase);
168 __global__ void change_depth_quan_kernel(double* depth_index_gpu, unsigned char* dimg_src_gpu, int pnx, int pny,
169 int dtr, double d1, double d2, double num_depth, double far_depth, double near_depth)
171 int tid = threadIdx.x + blockIdx.x*blockDim.x;
176 double dmap_src = double(dimg_src_gpu[tid]) / 255.0;
177 double dmap = (1.0 - dmap_src)*(far_depth - near_depth) + near_depth;
179 if (dtr < num_depth - 1)
180 tdepth = (dmap >= d1 ? 1 : 0) * (dmap < d2 ? 1 : 0);
182 tdepth = (dmap >= d1 ? 1 : 0) * (dmap <= d2 ? 1 : 0);
184 depth_index_gpu[tid] = depth_index_gpu[tid] + (double)(tdepth * (dtr + 1));
190 void cudaFFT(CUstream_st* stream, int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* output_field, int direction, bool bNormalized)
192 unsigned int nblocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
194 fftShift << <nblocks, kBlockThreads, 0, stream >> >(N, nx, ny, in_field, output_field, false);
199 if (cufftPlan2d(&plan, ny, nx, CUFFT_Z2Z) != CUFFT_SUCCESS)
201 //LOG("FAIL in creating cufft plan");
208 result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_FORWARD);
210 result = cufftExecZ2Z(plan, output_field, in_field, CUFFT_INVERSE);
212 if (result != CUFFT_SUCCESS)
214 //LOG("------------------FAIL: execute cufft, code=%s", result);
218 if (cudaDeviceSynchronize() != cudaSuccess) {
219 //LOG("Cuda error: Failed to synchronize\n");
223 fftShift << < nblocks, kBlockThreads, 0, stream >> >(N, nx, ny, in_field, output_field, bNormalized);
230 void cudaDepthHoloKernel(CUstream_st* stream, int pnx, int pny, cufftDoubleComplex* u_o_gpu, unsigned char* img_src_gpu, unsigned char* dimg_src_gpu, double* depth_index_gpu,
231 int dtr, double rand_phase_val_a, double rand_phase_val_b, double carrier_phase_delay_a, double carrier_phase_delay_b, int flag_change_depth_quan, unsigned int default_depth_quan)
233 dim3 grid((pnx*pny + kBlockThreads - 1) / kBlockThreads, 1, 1);
234 depth_sources_kernel << <grid, kBlockThreads, 0, stream >> >(u_o_gpu, img_src_gpu, dimg_src_gpu, depth_index_gpu,
235 dtr, rand_phase_val_a, rand_phase_val_b, carrier_phase_delay_a, carrier_phase_delay_b, pnx, pny, flag_change_depth_quan, default_depth_quan);
239 void cudaPropagation_AngularSpKernel(CUstream_st* stream, int pnx, int pny, cufftDoubleComplex* input_d, cufftDoubleComplex* u_complex,
240 double ppx, double ppy, double ssx, double ssy, double lambda, double params_k, double propagation_dist)
242 dim3 grid((pnx*pny + kBlockThreads - 1) / kBlockThreads, 1, 1);
243 propagation_angularsp_kernel << <grid, kBlockThreads, 0, stream >> >(input_d, u_complex, pnx, pny, ppx, ppy, ssx, ssy, lambda, params_k, propagation_dist);
248 void cudaCropFringe(CUstream_st* stream, int nx, int ny, cufftDoubleComplex* in_field, cufftDoubleComplex* out_field, int cropx1,int cropx2, int cropy1, int cropy2)
250 unsigned int nblocks = (nx*ny + kBlockThreads - 1) / kBlockThreads;
252 cropFringe << < nblocks, kBlockThreads, 0, stream >> > (nx, ny, in_field, out_field, cropx1, cropx2, cropy1, cropy2);
257 void cudaGetFringe(CUstream_st* stream, int pnx, int pny, cufftDoubleComplex* in_field, cufftDoubleComplex* out_field, int sig_locationx, int sig_locationy,
258 double ssx, double ssy, double ppx, double ppy, double PI)
260 unsigned int nblocks = (pnx*pny + kBlockThreads - 1) / kBlockThreads;
262 getFringe << < nblocks, kBlockThreads, 0, stream >> > (pnx, pny, in_field, out_field, sig_locationx, sig_locationy, ssx, ssy, ppx, ppy, PI);
267 void cudaChangeDepthQuanKernel(CUstream_st* stream, int pnx, int pny, double* depth_index_gpu, unsigned char* dimg_src_gpu,
268 int dtr, double d1, double d2, double params_num_of_depth, double params_far_depthmap, double params_near_depthmap)
270 dim3 grid((pnx*pny + kBlockThreads - 1) / kBlockThreads, 1, 1);
271 change_depth_quan_kernel << <grid, kBlockThreads, 0, stream >> > (depth_index_gpu, dimg_src_gpu, pnx, pny,
272 dtr, d1, d2, params_num_of_depth, params_far_depthmap, params_near_depthmap);