56 this->
field = fftw_alloc_complex(this->params.
nmesh);
65 this->field_s = fftw_alloc_complex(this->params.
nmesh);
85#if defined(TRV_USE_OMP) && defined(TRV_USE_FFTWOMP)
87 fftw_plan_with_nthreads(omp_get_max_threads());
91 bool import_fftw_wisdom_f =
false;
92 bool import_fftw_wisdom_b =
false;
93 bool export_fftw_wisdom_f =
false;
94 bool export_fftw_wisdom_b =
false;
95 std::FILE* fftw_wisdom_file_f =
nullptr;
96 std::FILE* fftw_wisdom_file_b =
nullptr;
102 if (fftw_wisdom_file_f !=
nullptr) {
103 import_fftw_wisdom_f =
true;
104 export_fftw_wisdom_f =
false;
108 "FFTW planner flag is set to `FFTW_PATIENT`. "
109 "Ensure that the FFTW wisdom file '%s' imported has been "
110 "generated with an equivalent or higher planner flag.",
116 import_fftw_wisdom_f =
false;
117 export_fftw_wisdom_f =
true;
120 "No FFTW wisdom file for forward transforms "
121 "could be imported: %s",
131 if (fftw_wisdom_file_b !=
nullptr) {
132 import_fftw_wisdom_b =
true;
133 export_fftw_wisdom_b =
false;
137 "FFTW planner flag is set to `FFTW_PATIENT`. "
138 "Ensure that the FFTW wisdom file '%s' imported has been "
139 "generated with an equivalent or higher planner flag.",
145 import_fftw_wisdom_b =
false;
146 export_fftw_wisdom_b =
true;
149 "No FFTW wisdom file for backward transforms "
150 "could be imported: %s",
158 if (import_fftw_wisdom_f) {
159 fftw_import_wisdom_from_filename(
165 "FFTW wisdom file for forward transforms has been imported: %s",
175 CUDA_EXEC(cudaStreamCreate(&this->custream));
182 &this->d_field,
sizeof(fft_double_complex) * this->params.
nmesh
190 auto pre_plan_f_timept = std::chrono::system_clock::now();
192#if defined(TRV_USE_HIP)
193 HIPFFT_EXEC(hipfftPlan3d(
194 &this->transform_gpu,
195 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
201 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
205#elif defined(TRV_USE_CUDA)
206 CUFFT_EXEC(cufftCreate(&this->transform_gpu));
209 CUFFT_EXEC(cufftSetStream(this->transform_gpu, this->custream));
213 CUFFT_EXEC(cufftXtSetGPUs(
214 this->transform_gpu, gpus.size(), gpus.data()
218 std::size_t workspace_sizes[gpus.size()];
219 CUFFT_EXEC(cufftMakePlan3d(
221 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
227 CUDA_EXEC(cudaMalloc(
228 &this->d_field,
sizeof(fft_double_complex) * this->params.
nmesh
231 CUFFT_EXEC(cufftXtMalloc(
232 this->transform_gpu, &this->field_desc,
233 CUFFT_XT_FORMAT_INPLACE_SHUFFLED
239 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
246 this->transform = fftw_plan_dft_3d(
247 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
248 this->field, this->field,
249 FFTW_FORWARD, this->params.fftw_planner_flag
252 auto post_plan_f_timept = std::chrono::system_clock::now();
254 double plan_f_time = std::chrono::duration<double>(
255 post_plan_f_timept - pre_plan_f_timept
257 if (import_fftw_wisdom_f && plan_f_time > 1.) {
260 "FFTW plan for forward transforms took %.3f (> 1.) seconds "
261 "despite importing wisdom file, which may have been created "
262 "under different runtime conditions. "
263 "A new wisdom file will be exported.",
267 export_fftw_wisdom_f =
true;
270 if (export_fftw_wisdom_f) {
271 fftw_export_wisdom_to_filename(
276 "FFTW wisdom file for forward transforms has been exported: %s",
283 if (import_fftw_wisdom_b) {
284 fftw_import_wisdom_from_filename(
290 "FFTW wisdom file for backward transforms has been imported: %s",
296 auto pre_plan_b_timept = std::chrono::system_clock::now();
299 this->inv_transform = fftw_plan_dft_3d(
300 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
301 this->field, this->field,
302 FFTW_BACKWARD, this->params.fftw_planner_flag
305 auto post_plan_b_timept = std::chrono::system_clock::now();
307 double plan_b_time = std::chrono::duration<double>(
308 post_plan_b_timept - pre_plan_b_timept
310 if (import_fftw_wisdom_b && plan_b_time > 1.) {
313 "FFTW plan for backward transforms took %.3f (> 1.) seconds "
314 "despite importing wisdom file, which may have been created "
315 "under different runtime conditions. "
316 "A new wisdom file will be exported.",
320 export_fftw_wisdom_b =
true;
323 if (export_fftw_wisdom_b) {
324 fftw_export_wisdom_to_filename(
329 "FFTW wisdom file for backward transforms has been exported: %s",
339 this->transform_s = fftw_plan_dft_3d(
340 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
341 this->field_s, this->field_s,
342 FFTW_FORWARD, this->params.fftw_planner_flag
346 this->plan_ini =
true;
348 if (fftw_wisdom_file_f !=
nullptr) {std::fclose(fftw_wisdom_file_f);}
349 if (fftw_wisdom_file_b !=
nullptr) {std::fclose(fftw_wisdom_file_b);}
358 this->
dk[0] = 2.*M_PI / this->params.
boxsize[0];
359 this->
dk[1] = 2.*M_PI / this->params.
boxsize[1];
360 this->
dk[2] = 2.*M_PI / this->params.
boxsize[2];
369 fftw_plan& transform, fftw_plan& inv_transform,
370 const std::string&
name
375 "MeshField constructor called with external FFTW plans in GPU mode."
378 throw std::runtime_error(
379 "MeshField constructor called with external FFTW plans in GPU mode."
391 this->
field = fftw_alloc_complex(this->params.
nmesh);
400 this->field_s = fftw_alloc_complex(this->params.
nmesh);
412 this->transform = transform;
413 this->inv_transform = inv_transform;
415 this->transform_s = transform;
417 this->plan_ext =
true;
425 this->
dk[0] = 2.*M_PI / this->params.
boxsize[0];
426 this->
dk[1] = 2.*M_PI / this->params.
boxsize[1];
427 this->
dk[2] = 2.*M_PI / this->params.
boxsize[2];
435 if (this->plan_ini) {
438 std::vector<std::size_t> workspace_sizes(gpus.size());
439#if defined(TRV_USE_HIP)
440 HIP_EXEC(hipFree(this->d_field));
446 this->
params.ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
449 HIPFFT_EXEC(hipfftDestroy(this->transform_gpu));
450#elif defined(TRV_USE_CUDA)
452 CUDA_EXEC(cudaFree(this->d_field));
454 CUFFT_EXEC(cufftXtFree(this->field_desc));
459 this->
params.ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
460 workspace_sizes.data(),
463 CUFFT_EXEC(cufftDestroy(this->transform_gpu));
466 CUDA_EXEC(cudaStreamDestroy(this->custream));
470 fftw_destroy_plan(this->transform);
471 fftw_destroy_plan(this->inv_transform);
472 if (this->
params.interlace ==
"true") {
473 fftw_destroy_plan(this->transform_s);
478 if (this->window_assign_order != -1) {
484 if (this->
field !=
nullptr) {
485 fftw_free(this->
field);
486 this->
field =
nullptr;
491 if (this->field_s !=
nullptr) {
492 fftw_free(this->field_s);
493 this->field_s =
nullptr;
502#pragma omp parallel for simd
504 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
505 this->
field[gid][0] = 0.;
506 this->
field[gid][1] = 0.;
508 if (this->
params.interlace ==
"true") {
510#pragma omp parallel for simd
512 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
513 this->field_s[gid][0] = 0.;
514 this->field_s[gid][1] = 0.;
525 return this->
field[gid];
533long long MeshField::ret_grid_index(
int i,
int j,
int k) {
535 (i *
static_cast<long long>(this->
params.
ngrid[1]) + j)
540void MeshField::shift_grid_indices_fourier(
int& i,
int& j,
int& k) {
546void MeshField::get_grid_pos_vector(
int i,
int j,
int k,
double rvec[3]) {
547 rvec[0] = (i < this->
params.ngrid[0]/2) ?
548 i * this->
dr[0] : (i - this->
params.ngrid[0]) * this->
dr[0];
549 rvec[1] = (j < this->
params.ngrid[1]/2) ?
550 j * this->
dr[1] : (j - this->
params.ngrid[1]) * this->
dr[1];
551 rvec[2] = (k < this->
params.ngrid[2]/2) ?
552 k * this->
dr[2] : (k - this->
params.ngrid[2]) * this->
dr[2];
555void MeshField::get_grid_wavevector(
int i,
int j,
int k,
double kvec[3]) {
556 kvec[0] = (i < this->
params.ngrid[0]/2) ?
557 i * this->
dk[0] : (i - this->
params.ngrid[0]) * this->
dk[0];
558 kvec[1] = (j < this->
params.ngrid[1]/2) ?
559 j * this->
dk[1] : (j - this->
params.ngrid[1]) * this->
dk[1];
560 kvec[2] = (k < this->
params.ngrid[2]/2) ?
561 k * this->
dk[2] : (k - this->
params.ngrid[2]) * this->
dk[2];
574 "Performing mesh assignment scheme '%s' to '%s'.",
575 this->
params.assignment.c_str(),
580 for (
int iaxis = 0; iaxis < 3; iaxis++) {
581 double extent = particles.
pos_max[iaxis] - particles.
pos_min[iaxis];
582 if (
params.boxsize[iaxis] < extent) {
585 "Box size in dimension %d is smaller than catalogue extents: "
587 iaxis,
params.boxsize[iaxis], extent
593 if (this->
params.assignment ==
"ngp") {
594 this->assign_weighted_field_to_mesh_ngp(particles, weights);
596 if (this->
params.assignment ==
"cic") {
597 this->assign_weighted_field_to_mesh_cic(particles, weights);
599 if (this->
params.assignment ==
"tsc") {
600 this->assign_weighted_field_to_mesh_tsc(particles, weights);
602 if (this->
params.assignment ==
"pcs") {
603 this->assign_weighted_field_to_mesh_pcs(particles, weights);
607 "Unsupported mesh assignment scheme: '%s'.",
608 this->
params.assignment.c_str()
612 "Unsupported mesh assignment scheme: '%s'.",
613 this->
params.assignment.c_str()
618void MeshField::assign_weighted_field_to_mesh_ngp(
627 const double inv_vol_cell = 1 / this->
vol_cell;
634#pragma omp parallel for
636 for (
int pid = 0; pid < particles.
ntotal; pid++) {
638 double win[order][3];
641 for (
int iaxis = 0; iaxis < 3; iaxis++) {
646 int idx_grid = int(loc_grid);
647 if (loc_grid - idx_grid >= 0.5) {
648 idx_grid = (idx_grid == this->
params.
ngrid[iaxis] - 1)
652 ijk[0][iaxis] = idx_grid;
658 for (
int iloc = 0; iloc < order; iloc++) {
659 for (
int jloc = 0; jloc < order; jloc++) {
660 for (
int kloc = 0; kloc < order; kloc++) {
661 gid = this->ret_grid_index(
662 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
664 if (0 <= gid && gid < this->
params.nmesh) {
666 this->
field[gid][0] += inv_vol_cell
667 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
669 this->
field[gid][1] += inv_vol_cell
670 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
678 if (this->
params.interlace ==
"true") {
680#pragma omp parallel for
682 for (
int pid = 0; pid < particles.
ntotal; pid++) {
684 double win[order][3];
687 for (
int iaxis = 0; iaxis < 3; iaxis++) {
689 double loc_grid = this->
params.ngrid[iaxis]
690 * particles[pid].pos[iaxis] / this->
params.boxsize[iaxis] + 0.5;
692 if (loc_grid > this->
params.ngrid[iaxis]) {
693 loc_grid -= this->
params.ngrid[iaxis];
696 int idx_grid = int(loc_grid);
697 if (loc_grid - idx_grid >= 0.5) {
698 idx_grid = (idx_grid == this->
params.ngrid[iaxis] - 1)
702 ijk[0][iaxis] = idx_grid;
707 for (
int iloc = 0; iloc < order; iloc++) {
708 for (
int jloc = 0; jloc < order; jloc++) {
709 for (
int kloc = 0; kloc < order; kloc++) {
710 gid = this->ret_grid_index(
711 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
713 if (0 <= gid && gid < this->
params.nmesh) {
715 this->field_s[gid][0] += inv_vol_cell
716 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
718 this->field_s[gid][1] += inv_vol_cell
719 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
728void MeshField::assign_weighted_field_to_mesh_cic(
737 const double inv_vol_cell = 1 / this->
vol_cell;
744#pragma omp parallel for
746 for (
int pid = 0; pid < particles.ntotal; pid++) {
748 double win[order][3];
751 for (
int iaxis = 0; iaxis < 3; iaxis++) {
753 double loc_grid = this->
params.ngrid[iaxis]
754 * particles[pid].pos[iaxis] / this->
params.boxsize[iaxis];
756 int idx_grid = int(loc_grid);
758 ijk[0][iaxis] = idx_grid;
759 ijk[1][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
763 double s = loc_grid - idx_grid;
765 win[0][iaxis] = 1. - s;
769 for (
int iloc = 0; iloc < order; iloc++) {
770 for (
int jloc = 0; jloc < order; jloc++) {
771 for (
int kloc = 0; kloc < order; kloc++) {
772 gid = this->ret_grid_index(
773 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
775 if (0 <= gid && gid < this->
params.nmesh) {
777 this->
field[gid][0] += inv_vol_cell
778 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
780 this->
field[gid][1] += inv_vol_cell
781 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
789 if (this->
params.interlace ==
"true") {
791#pragma omp parallel for
793 for (
int pid = 0; pid < particles.ntotal; pid++) {
795 double win[order][3];
798 for (
int iaxis = 0; iaxis < 3; iaxis++) {
800 double loc_grid = this->
params.ngrid[iaxis]
801 * particles[pid].pos[iaxis] / this->
params.boxsize[iaxis] + 0.5;
803 if (loc_grid > this->
params.ngrid[iaxis]) {
804 loc_grid -= this->
params.ngrid[iaxis];
807 int idx_grid = int(loc_grid);
809 ijk[0][iaxis] = idx_grid;
810 ijk[1][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
813 double s = loc_grid - idx_grid;
814 win[0][iaxis] = 1. - s;
818 for (
int iloc = 0; iloc < order; iloc++) {
819 for (
int jloc = 0; jloc < order; jloc++) {
820 for (
int kloc = 0; kloc < order; kloc++) {
821 gid = this->ret_grid_index(
822 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
824 if (0 <= gid && gid < this->
params.nmesh) {
826 this->field_s[gid][0] += inv_vol_cell
827 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
829 this->field_s[gid][1] += inv_vol_cell
830 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
839void MeshField::assign_weighted_field_to_mesh_tsc(
848 const double inv_vol_cell = 1 / this->
vol_cell;
855#pragma omp parallel for
857 for (
int pid = 0; pid < particles.ntotal; pid++) {
859 double win[order][3];
862 for (
int iaxis = 0; iaxis < 3; iaxis++) {
864 double loc_grid = this->
params.ngrid[iaxis]
865 * particles[pid].pos[iaxis] / this->
params.boxsize[iaxis];
867 int idx_grid = int(loc_grid);
869 if (loc_grid - idx_grid < 0.5) {
870 ijk[0][iaxis] = (idx_grid == 0)
871 ? this->
params.ngrid[iaxis] - 1 : idx_grid - 1;
872 ijk[1][iaxis] = idx_grid;
873 ijk[2][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
876 ijk[0][iaxis] = idx_grid;
877 ijk[1][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
879 ijk[2][iaxis] = (ijk[1][iaxis] == this->
params.ngrid[iaxis] - 1)
880 ? 0 : ijk[1][iaxis] + 1;
884 double s = loc_grid - idx_grid;
887 win[0][iaxis] = 1./2 * (1./2 - s) * (1./2 - s);
888 win[1][iaxis] = 3./4 - s * s;
889 win[2][iaxis] = 1./2 * (1./2 + s) * (1./2 + s);
892 win[0][iaxis] = 1./2 * (1./2 + s) * (1./2 + s);
893 win[1][iaxis] = 3./4 - s * s;
894 win[2][iaxis] = 1./2 * (1./2 - s) * (1./2 - s);
898 for (
int iloc = 0; iloc < order; iloc++) {
899 for (
int jloc = 0; jloc < order; jloc++) {
900 for (
int kloc = 0; kloc < order; kloc++) {
901 gid = this->ret_grid_index(
902 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
904 if (0 <= gid && gid < this->
params.nmesh) {
906 this->
field[gid][0] += inv_vol_cell
907 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
909 this->
field[gid][1] += inv_vol_cell
910 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
918 if (this->
params.interlace ==
"true") {
920#pragma omp parallel for
922 for (
int pid = 0; pid < particles.ntotal; pid++) {
924 double win[order][3];
927 for (
int iaxis = 0; iaxis < 3; iaxis++) {
929 double loc_grid = this->
params.ngrid[iaxis]
930 * particles[pid].pos[iaxis] / this->
params.boxsize[iaxis] + 0.5;
932 if (loc_grid > this->
params.ngrid[iaxis]) {
933 loc_grid -= this->
params.ngrid[iaxis];
936 int idx_grid = int(loc_grid);
938 if (loc_grid - idx_grid < 0.5) {
939 ijk[0][iaxis] = (idx_grid == 0)
940 ? this->
params.ngrid[iaxis] - 1 : idx_grid - 1;
941 ijk[1][iaxis] = idx_grid;
942 ijk[2][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
945 ijk[0][iaxis] = idx_grid;
946 ijk[1][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
947 ? 0 : ijk[0][iaxis] + 1;
948 ijk[2][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
949 ? 0 : ijk[1][iaxis] + 1;
952 double s = loc_grid - idx_grid;
955 win[0][iaxis] = 1./2 * (1./2 - s) * (1./2 - s);
956 win[1][iaxis] = 3./4 - s * s;
957 win[2][iaxis] = 1./2 * (1./2 + s) * (1./2 + s);
960 win[0][iaxis] = 1./2 * (1./2 + s) * (1./2 + s);
961 win[1][iaxis] = 3./4 - s * s;
962 win[2][iaxis] = 1./2 * (1./2 - s) * (1./2 - s);
966 for (
int iloc = 0; iloc < order; iloc++) {
967 for (
int jloc = 0; jloc < order; jloc++) {
968 for (
int kloc = 0; kloc < order; kloc++) {
969 gid = this->ret_grid_index(
970 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
972 if (0 <= gid && gid < this->
params.nmesh) {
974 this->field_s[gid][0] += inv_vol_cell
975 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
977 this->field_s[gid][1] += inv_vol_cell
978 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
987void MeshField::assign_weighted_field_to_mesh_pcs(
996 const double inv_vol_cell = 1 / this->
vol_cell;
1003#pragma omp parallel for
1005 for (
int pid = 0; pid < particles.ntotal; pid++) {
1007 double win[order][3];
1010 for (
int iaxis = 0; iaxis < 3; iaxis++) {
1012 double loc_grid = this->
params.ngrid[iaxis]
1013 * particles[pid].pos[iaxis] / this->
params.boxsize[iaxis];
1015 int idx_grid = int(loc_grid);
1017 ijk[0][iaxis] = (idx_grid == 0)
1018 ? this->
params.ngrid[iaxis] - 1 : idx_grid - 1;
1019 ijk[1][iaxis] = idx_grid;
1020 ijk[2][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
1022 ijk[3][iaxis] = (ijk[2][iaxis] == this->
params.ngrid[iaxis] - 1)
1023 ? 0 : ijk[2][iaxis] + 1;
1026 double s = loc_grid - idx_grid;
1028 win[0][iaxis] = 1./6 * (1. - s) * (1. - s) * (1. - s);
1029 win[1][iaxis] = 1./6 * (4. - 6. * s * s + 3. * s * s * s);
1030 win[2][iaxis] = 1./6 * (
1031 4. - 6. * (1. - s) * (1. - s) + 3. * (1. - s) * (1. - s) * (1. - s)
1033 win[3][iaxis] = 1./6 * s * s * s;
1036 for (
int iloc = 0; iloc < order; iloc++) {
1037 for (
int jloc = 0; jloc < order; jloc++) {
1038 for (
int kloc = 0; kloc < order; kloc++) {
1039 gid = this->ret_grid_index(
1040 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
1042 if (0 <= gid && gid < this->
params.nmesh) {
1044 this->
field[gid][0] += inv_vol_cell
1045 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
1047 this->
field[gid][1] += inv_vol_cell
1048 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
1056 if (this->
params.interlace ==
"true") {
1058#pragma omp parallel for
1060 for (
int pid = 0; pid < particles.ntotal; pid++) {
1062 double win[order][3];
1065 for (
int iaxis = 0; iaxis < 3; iaxis++) {
1067 double loc_grid = this->
params.ngrid[iaxis]
1068 * particles[pid].pos[iaxis] / this->
params.boxsize[iaxis] + 0.5;
1070 if (loc_grid > this->
params.ngrid[iaxis]) {
1071 loc_grid -= this->
params.ngrid[iaxis];
1074 int idx_grid = int(loc_grid);
1076 ijk[0][iaxis] = (idx_grid == 0)
1077 ? this->
params.ngrid[iaxis] - 1 : idx_grid - 1;
1078 ijk[1][iaxis] = idx_grid;
1079 ijk[2][iaxis] = (idx_grid == this->
params.ngrid[iaxis] - 1)
1081 ijk[3][iaxis] = (ijk[2][iaxis] == this->
params.ngrid[iaxis] - 1)
1082 ? 0 : ijk[2][iaxis] + 1;
1083 double s = loc_grid - idx_grid;
1085 win[0][iaxis] = 1./6 * (1. - s) * (1. - s) * (1. - s);
1086 win[1][iaxis] = 1./6 * (4. - 6. * s * s + 3. * s * s * s);
1087 win[2][iaxis] = 1./6 * (
1088 4. - 6. * (1. - s) * (1. - s) + 3. * (1. - s) * (1. - s) * (1. - s)
1090 win[3][iaxis] = 1./6 * s * s * s;
1093 for (
int iloc = 0; iloc < order; iloc++) {
1094 for (
int jloc = 0; jloc < order; jloc++) {
1095 for (
int kloc = 0; kloc < order; kloc++) {
1096 gid = this->ret_grid_index(
1097 ijk[iloc][0], ijk[jloc][1], ijk[kloc][2]
1099 if (0 <= gid && gid < this->
params.nmesh) {
1101 this->field_s[gid][0] += inv_vol_cell
1102 * weight[pid][0] * win[iloc][0] * win[jloc][1] * win[kloc][2];
1104 this->field_s[gid][1] += inv_vol_cell
1105 * weight[pid][1] * win[iloc][0] * win[jloc][1] * win[kloc][2];
1114double MeshField::calc_assignment_window_in_fourier(
1115 int i,
int j,
int k,
int order
1120 "Invalid window assignment order: %d. Must be non-negative.",
1124 throw trvs::InvalidParameterError(
1125 "Invalid window assignment order: %d. Must be non-negative.",
1131 if (order == this->window_assign_order) {
1132 long long idx_grid = this->ret_grid_index(i, j, k);
1133 return this->window[idx_grid];
1136 this->shift_grid_indices_fourier(i, j, k);
1138 double u_x = M_PI * i / double(this->
params.ngrid[0]);
1139 double u_y = M_PI * j / double(this->
params.ngrid[1]);
1140 double u_z = M_PI * k / double(this->
params.ngrid[2]);
1143 double wk_x = (i != 0) ? std::sin(u_x) / u_x : 1.;
1144 double wk_y = (j != 0) ? std::sin(u_y) / u_y : 1.;
1145 double wk_z = (k != 0) ? std::sin(u_z) / u_z : 1.;
1147 double wk = wk_x * wk_y * wk_z;
1149 return std::pow(wk, order);
1152void MeshField::compute_assignment_window_in_fourier(
int order) {
1156 "Invalid window assignment order: %d. Must be non-negative.",
1160 throw trvs::InvalidParameterError(
1161 "Invalid window assignment order: %d. Must be non-negative.",
1166 if (this->window_assign_order == order) {
return;}
1170 "Computing interpolation window in Fourier space "
1171 "for assignment order %d.",
1176 if (this->window_assign_order == -1) {
1177 this->window.resize(this->
params.nmesh, 0.);
1187#pragma omp parallel for collapse(3)
1189 for (
int i = 0; i < this->
params.ngrid[0]; i++) {
1190 for (
int j = 0; j < this->
params.ngrid[1]; j++) {
1191 for (
int k = 0; k < this->
params.ngrid[2]; k++) {
1192 long long idx_grid = this->ret_grid_index(i, j, k);
1193 this->window[idx_grid] = this->calc_assignment_window_in_fourier(
1200 this->window_assign_order = order;
1209 fftw_complex* unit_weight = fftw_alloc_complex(particles.
ntotal);
1215#pragma omp parallel for simd
1217 for (
int pid = 0; pid < particles.
ntotal; pid++) {
1218 unit_weight[pid][0] = 1.;
1219 unit_weight[pid][1] = 0.;
1224 fftw_free(unit_weight);
1235 double nbar = double(particles.
ntotal) / this->
vol;
1238#pragma omp parallel for simd
1240 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1241 this->
field[gid][0] -= nbar;
1249 double alpha,
int ell,
int m
1251 fftw_complex* weight_kern = fftw_alloc_complex(particles_data.
ntotal);
1257#pragma omp parallel for
1259 for (
int pid = 0; pid < particles_data.
ntotal; pid++) {
1261 los_data[pid].
pos[0], los_data[pid].
pos[1], los_data[pid].
pos[2]
1267 weight_kern[pid][0] = ylm.real() * particles_data[pid].w;
1268 weight_kern[pid][1] = ylm.imag() * particles_data[pid].w;
1273 fftw_free(weight_kern);
1278 weight_kern = fftw_alloc_complex(particles_rand.
ntotal);
1284#pragma omp parallel for
1286 for (
int pid = 0; pid < particles_rand.
ntotal; pid++) {
1288 los_rand[pid].
pos[0], los_rand[pid].
pos[1], los_rand[pid].
pos[2]
1294 weight_kern[pid][0] = ylm.real() * particles_rand[pid].w;
1295 weight_kern[pid][1] = ylm.imag() * particles_rand[pid].w;
1301 fftw_free(weight_kern);
1307#pragma omp parallel for simd
1309 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1310 this->
field[gid][0] -= alpha * field_rand[gid][0];
1311 this->
field[gid][1] -= alpha * field_rand[gid][1];
1314 if (this->
params.interlace ==
"true") {
1316#pragma omp parallel for simd
1318 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1319 this->field_s[gid][0] -= alpha * field_rand.field_s[gid][0];
1320 this->field_s[gid][1] -= alpha * field_rand.field_s[gid][1];
1327 double alpha,
int ell,
int m
1330 fftw_complex* weight_kern = fftw_alloc_complex(particles.
ntotal);
1336#pragma omp parallel for
1338 for (
int pid = 0; pid < particles.
ntotal; pid++) {
1339 double los_[3] = {los[pid].
pos[0], los[pid].
pos[1], los[pid].
pos[2]};
1344 weight_kern[pid][0] = ylm.real() * particles[pid].w;
1345 weight_kern[pid][1] = ylm.imag() * particles[pid].w;
1350 fftw_free(weight_kern);
1356#pragma omp parallel for simd
1358 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1359 this->
field[gid][0] *= alpha;
1360 this->
field[gid][1] *= alpha;
1371 fftw_complex* weight_kern = fftw_alloc_complex(particles_data.
ntotal);
1377#pragma omp parallel for
1379 for (
int pid = 0; pid < particles_data.
ntotal; pid++) {
1381 los_data[pid].
pos[0], los_data[pid].
pos[1], los_data[pid].
pos[2]
1387 ylm = std::conj(ylm);
1389 weight_kern[pid][0] = ylm.real() * std::pow(particles_data[pid].w, 2);
1390 weight_kern[pid][1] = ylm.imag() * std::pow(particles_data[pid].w, 2);
1395 fftw_free(weight_kern);
1400 weight_kern = fftw_alloc_complex(particles_rand.
ntotal);
1406#pragma omp parallel for
1408 for (
int pid = 0; pid < particles_rand.
ntotal; pid++) {
1410 los_rand[pid].
pos[0], los_rand[pid].
pos[1], los_rand[pid].
pos[2]
1416 ylm = std::conj(ylm);
1418 weight_kern[pid][0] = ylm.real() * std::pow(particles_rand[pid].w, 2);
1419 weight_kern[pid][1] = ylm.imag() * std::pow(particles_rand[pid].w, 2);
1425 fftw_free(weight_kern);
1431#pragma omp parallel for simd
1433 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1434 this->
field[gid][0] += std::pow(alpha, 2) * field_rand[gid][0];
1435 this->
field[gid][1] += std::pow(alpha, 2) * field_rand[gid][1];
1438 if (this->
params.interlace ==
"true") {
1440#pragma omp parallel for simd
1442 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1443 this->field_s[gid][0] += std::pow(alpha, 2) * field_rand.field_s[gid][0];
1444 this->field_s[gid][1] += std::pow(alpha, 2) * field_rand.field_s[gid][1];
1451 double alpha,
int ell,
int m
1454 fftw_complex* weight_kern = fftw_alloc_complex(particles.
ntotal);
1460#pragma omp parallel for
1462 for (
int pid = 0; pid < particles.
ntotal; pid++) {
1463 double los_[3] = {los[pid].
pos[0], los[pid].
pos[1], los[pid].
pos[2]};
1468 ylm = std::conj(ylm);
1470 weight_kern[pid][0] = ylm.real() * std::pow(particles[pid].w, 2);
1471 weight_kern[pid][1] = ylm.imag() * std::pow(particles[pid].w, 2);
1476 fftw_free(weight_kern);
1483#pragma omp parallel for simd
1485 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1486 this->
field[gid][0] *= std::pow(alpha, 2);
1487 this->
field[gid][1] *= std::pow(alpha, 2);
1499 "Performing Fourier transform of '%s'.", this->
name.c_str()
1505#pragma omp parallel for simd
1507 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1514#if defined(TRV_USE_HIP)
1515 trva::copy_complex_array_htod(
1518 HIPFFT_EXEC(hipfftExecZ2Z(
1519 this->transform_gpu, this->d_field, this->d_field,
1522 trva::copy_complex_array_dtoh(
1525#elif defined(TRV_USE_CUDA)
1527 trva::copy_complex_array_htod(
1530 CUFFT_EXEC(cufftXtExec(
1531 this->transform_gpu, this->d_field, this->d_field,
1534 trva::copy_complex_array_dtoh(
1538 trva::copy_complex_array_htod_mgpu(
1539 this->transform_gpu, this->field_desc, this->
field
1541 CUFFT_EXEC(cufftXtExecDescriptor(
1542 this->transform_gpu, this->field_desc, this->field_desc,
1545 trva::copy_complex_array_dtoh_mgpu(
1546 this->transform_gpu, this->field_desc, this->
field
1551 if (this->plan_ext) {
1552 fftw_execute_dft(this->transform, this->
field, this->
field);
1554 fftw_execute(this->transform);
1560 if (this->
params.interlace ==
"true") {
1562#pragma omp parallel for simd
1564 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1565 this->field_s[gid][0] *= this->
vol_cell;
1566 this->field_s[gid][1] *= this->
vol_cell;
1570#if defined(TRV_USE_HIP)
1571 trva::copy_complex_array_htod(
1572 this->field_s, this->d_field, this->
params.nmesh
1574 HIPFFT_EXEC(hipfftExecZ2Z(
1575 this->transform_gpu, this->d_field, this->d_field,
1578 trva::copy_complex_array_dtoh(
1579 this->d_field, this->field_s, this->
params.nmesh
1581#elif defined(TRV_USE_CUDA)
1583 trva::copy_complex_array_htod(
1584 this->field_s, this->d_field, this->
params.nmesh
1586 CUFFT_EXEC(cufftXtExec(
1587 this->transform_gpu, this->d_field, this->d_field,
1590 trva::copy_complex_array_dtoh(
1591 this->d_field, this->field_s, this->
params.nmesh
1594 trva::copy_complex_array_htod_mgpu(
1595 this->transform_gpu, this->field_desc, this->field_s
1597 CUFFT_EXEC(cufftXtExecDescriptor(
1598 this->transform_gpu, this->field_desc, this->field_desc,
1601 trva::copy_complex_array_dtoh_mgpu(
1602 this->transform_gpu, this->field_desc, this->field_s
1607 if (this->plan_ext) {
1608 fftw_execute_dft(this->transform_s, this->field_s, this->field_s);
1610 fftw_execute(this->transform_s);
1616#pragma omp parallel for collapse(3)
1618 for (
int i = 0; i < this->
params.ngrid[0]; i++) {
1619 for (
int j = 0; j < this->
params.ngrid[1]; j++) {
1620 for (
int k = 0; k < this->
params.ngrid[2]; k++) {
1621 long long idx_grid = this->ret_grid_index(i, j, k);
1625 m[0] = (i < this->
params.ngrid[0]/2)
1626 ?
double(i) / this->
params.ngrid[0]
1627 : double(i) / this->
params.ngrid[0] - 1;
1628 m[1] = (j < this->
params.ngrid[1]/2)
1629 ?
double(j) / this->
params.ngrid[1]
1630 : double(j) / this->
params.ngrid[1] - 1;
1631 m[2] = (k < this->
params.ngrid[2]/2)
1632 ?
double(k) / this->
params.ngrid[2]
1633 : double(k) / this->
params.ngrid[2] - 1;
1638 double arg = M_PI * (m[0] + m[1] + m[2]);
1640 this->
field[idx_grid][0] +=
1641 std::cos(arg) * this->field_s[idx_grid][0]
1642 - std::sin(arg) * this->field_s[idx_grid][1]
1644 this->field[idx_grid][1] +=
1645 std::sin(arg) * this->field_s[idx_grid][0]
1646 + std::cos(arg) * this->field_s[idx_grid][1]
1649 this->field[idx_grid][0] /= 2.;
1650 this->field[idx_grid][1] /= 2.;
1660 "Performing inverse Fourier transform of '%s'.", this->
name.c_str()
1667#pragma omp parallel for simd
1669 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1676#if defined(TRV_USE_HIP)
1677 trva::copy_complex_array_htod(
1680 HIPFFT_EXEC(hipfftExecZ2Z(
1681 this->transform_gpu, this->d_field, this->d_field,
1684 trva::copy_complex_array_dtoh(
1687#elif defined(TRV_USE_CUDA)
1689 trva::copy_complex_array_htod(
1692 CUFFT_EXEC(cufftXtExec(
1693 this->transform_gpu, this->d_field, this->d_field,
1696 trva::copy_complex_array_dtoh(
1700 trva::copy_complex_array_htod_mgpu(
1701 this->transform_gpu, this->field_desc, this->
field
1703 CUFFT_EXEC(cufftXtExecDescriptor(
1704 this->transform_gpu, this->field_desc, this->field_desc,
1707 trva::copy_complex_array_dtoh_mgpu(
1708 this->transform_gpu, this->field_desc, this->
field
1713 if (this->plan_ext) {
1714 fftw_execute_dft(this->inv_transform, this->
field, this->
field);
1716 fftw_execute(this->inv_transform);
1730 "Applying wide-angle power-law kernel to '%s'.", this->
name.c_str()
1735 const double eps_r = 1.e-6;
1738#pragma omp parallel for collapse(3)
1740 for (
int i = 0; i < this->
params.ngrid[0]; i++) {
1741 for (
int j = 0; j < this->
params.ngrid[1]; j++) {
1742 for (
int k = 0; k < this->
params.ngrid[2]; k++) {
1743 long long idx_grid = this->ret_grid_index(i, j, k);
1746 this->get_grid_pos_vector(i, j, k, rv);
1754 this->
field[idx_grid][0] *=
1755 std::pow(r_, - this->
params.i_wa - this->params.j_wa);
1756 this->
field[idx_grid][1] *=
1757 std::pow(r_, - this->
params.i_wa - this->params.j_wa);
1767 "Applying assignment compensation to '%s'.", this->
name.c_str()
1771 this->compute_assignment_window_in_fourier(this->
params.assignment_order);
1774#pragma omp parallel for collapse(3)
1776 for (
int i = 0; i < this->
params.ngrid[0]; i++) {
1777 for (
int j = 0; j < this->
params.ngrid[1]; j++) {
1778 for (
int k = 0; k < this->
params.ngrid[2]; k++) {
1779 long long idx_grid = this->ret_grid_index(i, j, k);
1780 this->
field[idx_grid][0] /= this->window[idx_grid];
1781 this->
field[idx_grid][1] /= this->window[idx_grid];
1793 MeshField& field_fourier, std::vector< std::complex<double> >& ylm,
1794 double k_lower,
double k_upper,
1795 double& k_eff,
int& nmodes
1799 "Performing inverse Fourier transform to spherical harmonic weighted "
1800 "'%s' in wavenumber interval [%f, %f).",
1801 this->
name.c_str(), k_lower, k_upper
1810 this->compute_assignment_window_in_fourier(this->
params.assignment_order);
1813#pragma omp parallel for collapse(3) reduction(+:k_eff, nmodes)
1815 for (
int i = 0; i < this->
params.ngrid[0]; i++) {
1816 for (
int j = 0; j < this->
params.ngrid[1]; j++) {
1817 for (
int k = 0; k < this->
params.ngrid[2]; k++) {
1818 long long idx_grid = this->ret_grid_index(i, j, k);
1821 this->get_grid_wavevector(i, j, k, kv);
1826 if (k_lower <= k_ && k_ < k_upper) {
1827 std::complex<double> fk(
1828 field_fourier[idx_grid][0], field_fourier[idx_grid][1]
1832 fk /= this->window[idx_grid];
1835 this->
field[idx_grid][0] = (ylm[idx_grid] * fk).real();
1836 this->
field[idx_grid][1] = (ylm[idx_grid] * fk).imag();
1842 this->
field[idx_grid][0] = 0.;
1843 this->
field[idx_grid][1] = 0.;
1851#if defined(TRV_USE_HIP)
1852 trva::copy_complex_array_htod(
1855 HIPFFT_EXEC(hipfftExecZ2Z(
1856 this->transform_gpu, this->d_field, this->d_field,
1859 trva::copy_complex_array_dtoh(
1862#elif defined(TRV_USE_CUDA)
1864 trva::copy_complex_array_htod(
1867 CUFFT_EXEC(cufftXtExec(
1868 this->transform_gpu, this->d_field, this->d_field,
1871 trva::copy_complex_array_dtoh(
1875 trva::copy_complex_array_htod_mgpu(
1876 this->transform_gpu, this->field_desc, this->
field
1878 CUFFT_EXEC(cufftXtExecDescriptor(
1879 this->transform_gpu, this->field_desc, this->field_desc,
1882 trva::copy_complex_array_dtoh_mgpu(
1883 this->transform_gpu, this->field_desc, this->
field
1888 if (this->plan_ext) {
1889 fftw_execute_dft(this->inv_transform, this->
field, this->
field);
1891 fftw_execute(this->inv_transform);
1898#pragma omp parallel for simd
1900 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
1901 this->
field[gid][0] /= double(nmodes);
1902 this->
field[gid][1] /= double(nmodes);
1905 k_eff /= double(nmodes);
1910 std::vector< std::complex<double> >& ylm,
1916 "Performing inverse Fourier transform to spherical Bessel weighted "
1917 "'%s' at separation r = %f.",
1918 this->
name.c_str(), r
1924 this->compute_assignment_window_in_fourier(this->
params.assignment_order);
1934#pragma omp for collapse(3)
1936 for (
int i = 0; i < this->
params.ngrid[0]; i++) {
1937 for (
int j = 0; j < this->
params.ngrid[1]; j++) {
1938 for (
int k = 0; k < this->
params.ngrid[2]; k++) {
1939 long long idx_grid = this->ret_grid_index(i, j, k);
1942 this->get_grid_wavevector(i, j, k, kv);
1947 std::complex<double> fk(
1948 field_fourier[idx_grid][0], field_fourier[idx_grid][1]
1951 fk /= this->window[idx_grid];
1955 this->
field[idx_grid][0] =
1956 sjl_thread.
eval(k_ * r) * (ylm[idx_grid] * fk).real() / this->
vol;
1957 this->
field[idx_grid][1] =
1958 sjl_thread.
eval(k_ * r) * (ylm[idx_grid] * fk).imag() / this->
vol;
1966#if defined(TRV_USE_HIP)
1967 trva::copy_complex_array_htod(
1970 HIPFFT_EXEC(hipfftExecZ2Z(
1971 this->transform_gpu, this->d_field, this->d_field,
1974 trva::copy_complex_array_dtoh(
1977#elif defined(TRV_USE_CUDA)
1979 trva::copy_complex_array_htod(
1982 CUFFT_EXEC(cufftXtExec(
1983 this->transform_gpu, this->d_field, this->d_field,
1986 trva::copy_complex_array_dtoh(
1990 trva::copy_complex_array_htod_mgpu(
1991 this->transform_gpu, this->field_desc, this->
field
1993 CUFFT_EXEC(cufftXtExecDescriptor(
1994 this->transform_gpu, this->field_desc, this->field_desc,
1997 trva::copy_complex_array_dtoh_mgpu(
1998 this->transform_gpu, this->field_desc, this->
field
2003 if (this->plan_ext) {
2004 fftw_execute_dft(this->inv_transform, this->
field, this->
field);
2006 fftw_execute(this->inv_transform);
2021 fftw_complex* weight = fftw_alloc_complex(particles.
ntotal);
2027#if defined(__GNUC__) && !defined(__clang__)
2028#pragma omp parallel for simd
2030#pragma omp parallel for
2033 for (
int pid = 0; pid < particles.
ntotal; pid++) {
2034 weight[pid][0] = particles[pid].w;
2035 weight[pid][1] = 0.;
2047 double vol_int = 0.;
2050#if defined(__GNUC__) && !defined(__clang__)
2051#pragma omp parallel for simd reduction(+:vol_int)
2053#pragma omp parallel for reduction(+:vol_int)
2056 for (
long long gid = 0; gid < this->
params.nmesh; gid++) {
2057 vol_int += std::pow(this->
field[gid][0], order);
2062 double norm_factor = 1. / vol_int;
2077 this->params = params;
2082 this->dr[0] = this->params.
boxsize[0] / this->params.
ngrid[0];
2083 this->dr[1] = this->params.
boxsize[1] / this->params.
ngrid[1];
2084 this->dr[2] = this->params.
boxsize[2] / this->params.
ngrid[2];
2087 this->dk[0] = 2.*M_PI / this->params.
boxsize[0];
2088 this->dk[1] = 2.*M_PI / this->params.
boxsize[1];
2089 this->dk[2] = 2.*M_PI / this->params.
boxsize[2];
2092 this->vol = this->params.
volume;
2093 this->vol_cell = this->vol / double(this->params.
nmesh);
2097 this->twopt_3d = fftw_alloc_complex(this->params.
nmesh);
2101 &this->d_twopt_3d,
sizeof(fft_double_complex) * this->params.
nmesh
2115#if defined(TRV_USE_OMP) && defined(TRV_USE_FFTWOMP)
2116 fftw_plan_with_nthreads(omp_get_max_threads());
2123 CUDA_EXEC(cudaStreamCreate(&this->custream));
2128#if defined(TRV_USE_HIP)
2129 HIPFFT_EXEC(hipfftPlan3d(
2130 &this->inv_transform_gpu,
2131 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
2135 this->inv_transform_gpu,
2137 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
2141#elif defined(TRV_USE_CUDA)
2142 CUFFT_EXEC(cufftCreate(&this->inv_transform_gpu));
2145 CUFFT_EXEC(cufftSetStream(this->inv_transform_gpu, this->custream));
2149 CUFFT_EXEC(cufftXtSetGPUs(
2150 this->inv_transform_gpu, gpus.size(), gpus.data()
2154 std::size_t workspace_sizes[gpus.size()];
2155 CUFFT_EXEC(cufftMakePlan3d(
2156 this->inv_transform_gpu,
2157 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
2163 CUDA_EXEC(cudaMalloc(
2164 &this->d_twopt_3d,
sizeof(fft_double_complex) * this->params.
nmesh
2167 CUFFT_EXEC(cufftXtMalloc(
2168 this->inv_transform_gpu, &this->twopt_3d_desc,
2169 CUFFT_XT_FORMAT_INPLACE_SHUFFLED
2173 this->inv_transform_gpu,
2175 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
2182 this->inv_transform = fftw_plan_dft_3d(
2183 this->params.
ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
2184 this->twopt_3d, this->twopt_3d,
2185 FFTW_BACKWARD, this->params.fftw_planner_flag
2189 this->plan_ini =
true;
2194 if (this->alias_ini) {
2200 if (this->plan_ini) {
2203 std::vector<std::size_t> workspace_sizes(gpus.size());
2204#if defined(TRV_USE_HIP)
2205 HIP_EXEC(hipFree(this->d_twopt_3d));
2209 this->inv_transform_gpu,
2211 this->params.ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
2214 HIPFFT_EXEC(hipfftDestroy(this->inv_transform_gpu));
2215#elif defined(TRV_USE_CUDA)
2217 CUDA_EXEC(cudaFree(this->d_twopt_3d));
2219 CUFFT_EXEC(cufftXtFree(this->twopt_3d_desc));
2222 this->inv_transform_gpu,
2224 this->params.ngrid[0], this->params.ngrid[1], this->params.ngrid[2],
2225 workspace_sizes.data(),
2228 CUFFT_EXEC(cufftDestroy(this->inv_transform_gpu));
2231 CUDA_EXEC(cudaStreamDestroy(this->custream));
2235 fftw_destroy_plan(this->inv_transform);
2238 if (this->twopt_3d !=
nullptr) {
2239 fftw_free(this->twopt_3d);
2240 this->twopt_3d =
nullptr;
2249 std::fill(this->
nmodes.begin(), this->nmodes.end(), 0);
2250 std::fill(this->
npairs.begin(), this->npairs.end(), 0);
2251 std::fill(this->k.begin(), this->k.end(), 0.);
2252 std::fill(this->
r.begin(), this->r.end(), 0.);
2253 std::fill(this->
sn.begin(), this->sn.end(), 0.);
2254 std::fill(this->
pk.begin(), this->pk.end(), 0.);
2255 std::fill(this->
xi.begin(), this->xi.end(), 0.);
2258void FieldStats::resize_stats(
int num_bins){
2259 this->
nmodes.resize(num_bins);
2260 this->
npairs.resize(num_bins);
2261 this->k.resize(num_bins);
2262 this->
r.resize(num_bins);
2263 this->
sn.resize(num_bins);
2264 this->
pk.resize(num_bins);
2265 this->
xi.resize(num_bins);
2273bool FieldStats::if_fields_compatible(
2274 MeshField& field_a, MeshField& field_b
2276 bool flag_compatible =
true;
2279 for (
int iaxis = 0; iaxis < 3; iaxis++) {
2281 this->params.
boxsize[iaxis] != field_a.params.boxsize[iaxis]
2282 || this->params.boxsize[iaxis] != field_b.params.boxsize[iaxis]
2283 || this->params.ngrid[iaxis] != field_a.params.ngrid[iaxis]
2284 || this->params.ngrid[iaxis] != field_b.params.ngrid[iaxis]
2286 flag_compatible =
false;
2293 this->params.nmesh != field_a.params.nmesh
2294 || this->params.nmesh != field_b.params.nmesh
2295 || this->params.volume != field_b.params.volume
2296 || this->params.volume != field_b.params.volume
2298 flag_compatible =
false;
2301 return flag_compatible;
2304long long FieldStats::ret_grid_index(
int i,
int j,
int k) {
2305 long long idx_grid =
2306 (i *
static_cast<long long>(this->params.ngrid[1]) + j)
2307 * this->params.ngrid[2] +
k;
2311void FieldStats::shift_grid_indices_fourier(
int& i,
int& j,
int& k) {
2312 i = (i < this->params.ngrid[0]/2) ? i : i - this->params.ngrid[0];
2313 j = (j < this->params.ngrid[1]/2) ? j : j - this->params.ngrid[1];
2314 k = (
k < this->params.ngrid[2]/2) ?
k :
k - this->params.ngrid[2];
2320 double cellsizes[3];
2321 if (binning.
space ==
"config") {
2322 cellsizes[0] = this->dr[0];
2323 cellsizes[1] = this->dr[1];
2324 cellsizes[2] = this->dr[2];
2326 if (binning.
space ==
"fourier") {
2327 cellsizes[0] = this->dk[0];
2328 cellsizes[1] = this->dk[1];
2329 cellsizes[2] = this->dk[2];
2333 "Invalid binning space: '%s'.", binning.
space.c_str()
2336 throw trvs::InvalidDataError(
2337 "Invalid binning space: '%s'.", binning.
space.c_str()
2344 int lrange_upper[3], rrange_lower[3];
2345 if (binning.
space ==
"config") {
2346 for (
int iaxis = 0; iaxis < 3; iaxis++) {
2347 lrange_upper[iaxis] = std::min(
2348 std::ceil(b * this->params.ngrid[iaxis] / this->params.boxsize[iaxis]),
2349 std::ceil(this->params.ngrid[iaxis]/2.)
2351 rrange_lower[iaxis] = std::max(
2353 this->params.ngrid[iaxis]
2354 - b * this->params.ngrid[iaxis] / this->params.boxsize[iaxis]
2356 std::floor(this->params.ngrid[iaxis]/2.)
2360 if (binning.
space ==
"fourier") {
2361 for (
int iaxis = 0; iaxis < 3; iaxis++) {
2362 lrange_upper[iaxis] = std::min(
2363 std::ceil(b * this->params.boxsize[iaxis] / (2*M_PI)),
2364 std::ceil(this->params.ngrid[iaxis]/2.)
2366 rrange_lower[iaxis] = std::max(
2368 this->params.ngrid[iaxis]
2369 - b * this->params.boxsize[iaxis] / (2*M_PI)
2371 std::floor(this->params.ngrid[iaxis]/2.)
2376 auto generate_range = [](
2377 int lrange_lower,
int lrange_upper,
int rrange_lower,
int rrange_upper
2379 std::vector<int> range_vector;
2380 if (lrange_upper < rrange_lower) {
2381 for (
int i = lrange_lower; i <= lrange_upper; i++) {
2382 range_vector.push_back(i);
2384 for (
int i = rrange_lower; i <= rrange_upper; i++) {
2385 range_vector.push_back(i);
2389 for (
int i = lrange_lower; i <= rrange_upper; i++) {
2390 range_vector.push_back(i);
2393 return range_vector;
2396 std::vector<int> i_range = generate_range(
2397 0, lrange_upper[0], rrange_lower[0], params.ngrid[0] - 1
2399 std::vector<int> j_range = generate_range(
2400 0, lrange_upper[1], rrange_lower[1], params.ngrid[1] - 1
2402 std::vector<int> k_range = generate_range(
2403 0, lrange_upper[2], rrange_lower[2], params.ngrid[2] - 1
2407 trv::BinnedVectors binned_vectors;
2412#pragma omp parallel for collapse(3)
2414 for (
int i: i_range) {
2415 for (
int j: j_range) {
2416 for (
int k: k_range) {
2417 double vx = (i < this->params.ngrid[0]/2) ?
2418 i * cellsizes[0] : (i - this->params.ngrid[0]) * cellsizes[0];
2419 double vy = (j < this->params.ngrid[1]/2) ?
2420 j * cellsizes[1] : (j - this->params.ngrid[1]) * cellsizes[1];
2421 double vz = (k < this->params.ngrid[2]/2) ?
2422 k * cellsizes[2] : (k - this->params.ngrid[2]) * cellsizes[2];
2428 for (
int ibin = 0; ibin < binning.
num_bins; ibin++) {
2429 double bin_lower = binning.
bin_edges[ibin];
2430 double bin_upper = binning.
bin_edges[ibin + 1];
2431 if (bin_lower <= scale && scale < bin_upper) {
2432 binned_vectors.
indices.push_back(ibin);
2435 binned_vectors.
vecx.push_back(vx);
2436 binned_vectors.
vecy.push_back(vy);
2437 binned_vectors.
vecz.push_back(vz);
2438 binned_vectors.
count++;
2451 trv::BinnedVectors binned_vectors_sorted;
2453 binned_vectors_sorted.
count = binned_vectors.
count;
2456 binned_vectors_sorted.
indices.resize(binned_vectors.
count);
2459 binned_vectors_sorted.
vecx.resize(binned_vectors.
count);
2460 binned_vectors_sorted.
vecy.resize(binned_vectors.
count);
2461 binned_vectors_sorted.
vecz.resize(binned_vectors.
count);
2466 std::vector<int> indices_sorted =
2473#pragma omp parallel for
2475 for (
int i = 0; i < binned_vectors.
count; i++) {
2476 int idx = indices_sorted[i];
2480 binned_vectors_sorted.
vecx[i] = binned_vectors.
vecx[idx];
2481 binned_vectors_sorted.
vecy[i] = binned_vectors.
vecy[idx];
2482 binned_vectors_sorted.
vecz[i] = binned_vectors.
vecz[idx];
2486 if (save_file !=
"") {
2487 std::FILE* save_fileptr = std::fopen(save_file.c_str(),
"w");
2489 save_fileptr, this->params, binned_vectors_sorted
2491 std::fclose(save_fileptr);
2495 "Check binned-vectors file for reference: %s", save_file.c_str()
2503 return binned_vectors_sorted;
2515 this->resize_stats(kbinning.
num_bins);
2518 if (!this->if_fields_compatible(field_a, field_b)) {
2521 "Input mesh fields have incompatible physical properties."
2525 "Input mesh fields have incompatible physical properties."
2529 auto ret_grid_wavevector = [&field_a](
int i,
int j,
int k,
double kvec[3]) {
2530 field_a.get_grid_wavevector(i, j,
k, kvec);
2533 this->compute_shotnoise_aliasing();
2535 std::function<double(
int,
int,
int)> calc_shotnoise_aliasing = [
this](
2538 long long idx_grid = ret_grid_index(i, j,
k);
2539 return this->alias_sn[idx_grid];
2542 std::function<double(
int,
int,
int)> calc_win_pk, calc_win_sn;
2543 int assignment_order = this->params.assignment_order;
2544 if (this->params.interlace ==
"true") {
2545 calc_win_pk = [&field_a, &field_b, &assignment_order](
2549 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
2550 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
2552 calc_win_sn = calc_win_pk;
2554 if (this->params.interlace ==
"false") {
2555#ifndef DBG_FLAG_NOAC
2556 calc_win_sn = calc_shotnoise_aliasing;
2557 calc_win_pk = calc_win_sn;
2559 calc_win_pk = [&field_a, &field_b, &assignment_order](
2563 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
2564 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
2566 calc_win_sn = calc_shotnoise_aliasing;
2573 const int n_sample = 1e6;
2574 const double dk_sample = 1.e-5;
2575 if (kbinning.
bin_max > n_sample * dk_sample) {
2578 "Input bin range exceeds sampled range. "
2579 "Statistics in bins beyond sampled range are uncomputed."
2584 int* nmodes_sample =
new int[n_sample];
2585 double* k_sample =
new double[n_sample];
2586 double* pk_sample_real =
new double[n_sample];
2587 double* pk_sample_imag =
new double[n_sample];
2588 double* sn_sample_real =
new double[n_sample];
2589 double* sn_sample_imag =
new double[n_sample];
2590 std::complex<double>* pk_sample =
new std::complex<double>[n_sample];
2591 std::complex<double>* sn_sample =
new std::complex<double>[n_sample];
2595 for (
int i = 0; i < n_sample; i++) {
2596 nmodes_sample[i] = 0;
2598 pk_sample_real[i] = 0.;
2599 pk_sample_imag[i] = 0.;
2600 sn_sample_real[i] = 0.;
2601 sn_sample_imag[i] = 0.;
2607#pragma omp parallel for collapse(3)
2609 for (
int i = 0; i < this->params.ngrid[0]; i++) {
2610 for (
int j = 0; j < this->params.ngrid[1]; j++) {
2611 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
2612 long long idx_grid = ret_grid_index(i, j,
k);
2615 ret_grid_wavevector(i, j,
k, kv);
2619 int idx_k = int(k_ / dk_sample);
2620 if (0 <= idx_k && idx_k < n_sample) {
2621 std::complex<double> fa(field_a[idx_grid][0], field_a[idx_grid][1]);
2622 std::complex<double> fb(field_b[idx_grid][0], field_b[idx_grid][1]);
2624 std::complex<double> pk_mode = fa * std::conj(fb);
2625 std::complex<double> sn_mode =
2626 shotnoise_amp * calc_shotnoise_aliasing(i, j,
k);
2629 double win_pk = calc_win_pk(i, j,
k);
2630 double win_sn = calc_win_sn(i, j,
k);
2642 double pk_mode_real = pk_mode.real();
2643 double pk_mode_imag = pk_mode.imag();
2644 double sn_mode_real = sn_mode.real();
2645 double sn_mode_imag = sn_mode.imag();
2649 nmodes_sample[idx_k]++;
2651 k_sample[idx_k] += k_;
2653 pk_sample_real[idx_k] += pk_mode_real;
2655 pk_sample_imag[idx_k] += pk_mode_imag;
2657 sn_sample_real[idx_k] += sn_mode_real;
2659 sn_sample_imag[idx_k] += sn_mode_imag;
2665 for (
int i = 0; i < n_sample; i++) {
2666 pk_sample[i] = pk_sample_real[i] +
trvm::M_I * pk_sample_imag[i];
2667 sn_sample[i] = sn_sample_real[i] +
trvm::M_I * sn_sample_imag[i];
2671 for (
int ibin = 0; ibin < kbinning.
num_bins; ibin++) {
2672 double k_lower = kbinning.
bin_edges[ibin];
2673 double k_upper = kbinning.
bin_edges[ibin + 1];
2674 for (
int i = 0; i < n_sample; i++) {
2675 double k_ = i * dk_sample;
2676 if (k_lower <= k_ && k_ < k_upper) {
2677 this->
nmodes[ibin] += nmodes_sample[i];
2678 this->k[ibin] += k_sample[i];
2679 this->
pk[ibin] += pk_sample[i];
2680 this->
sn[ibin] += sn_sample[i];
2684 if (this->
nmodes[ibin] != 0) {
2685 this->k[ibin] /= double(this->
nmodes[ibin]);
2686 this->
pk[ibin] /= double(this->
nmodes[ibin]);
2687 this->
sn[ibin] /= double(this->
nmodes[ibin]);
2690 this->
pk[ibin] = 0.;
2691 this->
sn[ibin] = 0.;
2695 delete[] nmodes_sample;
2697 delete[] pk_sample_real;
2698 delete[] pk_sample_imag;
2699 delete[] sn_sample_real;
2700 delete[] sn_sample_imag;
2709 this->resize_stats(rbinning.
num_bins);
2713 if (!this->if_fields_compatible(field_a, field_b)) {
2716 "Input mesh fields have incompatible physical properties."
2720 "Input mesh fields have incompatible physical properties."
2724 auto ret_grid_pos_vector = [&field_a](
int i,
int j,
int k,
double rvec[3]) {
2725 field_a.get_grid_pos_vector(i, j,
k, rvec);
2728 this->compute_shotnoise_aliasing();
2730 std::function<double(
int,
int,
int)> calc_shotnoise_aliasing = [
this](
2733 long long idx_grid = ret_grid_index(i, j,
k);
2734 return this->alias_sn[idx_grid];
2737 std::function<double(
int,
int,
int)> calc_win_pk, calc_win_sn;
2738 int assignment_order = this->params.assignment_order;
2739 if (this->params.interlace ==
"true") {
2740 calc_win_pk = [&field_a, &field_b, &assignment_order](
2744 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
2745 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
2747 calc_win_sn = calc_win_pk;
2749 if (this->params.interlace ==
"false") {
2750#ifndef DBG_FLAG_NOAC
2751 calc_win_sn = calc_shotnoise_aliasing;
2752 calc_win_pk = calc_win_sn;
2754 calc_win_pk = [&field_a, &field_b, &assignment_order](
2758 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
2759 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
2761 calc_win_sn = calc_shotnoise_aliasing;
2766 for (
int i = 0; i < this->params.ngrid[0]; i++) {
2767 for (
int j = 0; j < this->params.ngrid[1]; j++) {
2768 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
2769 long long idx_grid = ret_grid_index(i, j,
k);
2771 std::complex<double> fa(field_a[idx_grid][0], field_a[idx_grid][1]);
2772 std::complex<double> fb(field_b[idx_grid][0], field_b[idx_grid][1]);
2774 std::complex<double> pk_mode = fa * std::conj(fb);
2775 std::complex<double> sn_mode =
2776 shotnoise_amp * calc_shotnoise_aliasing(i, j,
k);
2779 double win_pk = calc_win_pk(i, j,
k);
2780 double win_sn = calc_win_sn(i, j,
k);
2787 this->twopt_3d[idx_grid][0] = pk_mode.real() / this->vol;
2788 this->twopt_3d[idx_grid][1] = pk_mode.imag() / this->vol;
2795#if defined(TRV_USE_HIP)
2796 trva::copy_complex_array_htod(
2797 this->twopt_3d, this->d_twopt_3d, this->params.nmesh
2799 HIPFFT_EXEC(hipfftExecZ2Z(
2800 this->inv_transform_gpu, this->d_twopt_3d, this->d_twopt_3d,
2803 trva::copy_complex_array_dtoh(
2804 this->d_twopt_3d, this->twopt_3d, this->params.nmesh
2806#elif defined(TRV_USE_CUDA)
2808 trva::copy_complex_array_htod(
2809 this->twopt_3d, this->d_twopt_3d, this->params.nmesh
2811 CUFFT_EXEC(cufftExecZ2Z(
2812 this->inv_transform_gpu, this->d_twopt_3d, this->d_twopt_3d,
2815 trva::copy_complex_array_dtoh(
2816 this->d_twopt_3d, this->twopt_3d, this->params.nmesh
2819 trva::copy_complex_array_htod_mgpu(
2820 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d
2822 CUFFT_EXEC(cufftXtExecDescriptor(
2823 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d_desc,
2826 trva::copy_complex_array_dtoh_mgpu(
2827 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d
2832 if (this->plan_ini) {
2833 fftw_execute(this->inv_transform);
2835 fftw_execute_dft(field_a.inv_transform, this->twopt_3d, this->twopt_3d);
2843 const int n_sample = 1e6;
2844 const double dr_sample = 1.e-1;
2845 if (rbinning.
bin_max > n_sample * dr_sample) {
2848 "Input bin range exceeds sampled range. "
2849 "Statistics in bins beyond sampled range are uncomputed."
2854 int* npairs_sample =
new int[n_sample];
2855 double* r_sample =
new double[n_sample];
2856 double* xi_sample_real =
new double[n_sample];
2857 double* xi_sample_imag =
new double[n_sample];
2858 std::complex<double>* xi_sample =
new std::complex<double>[n_sample];
2862 for (
int i = 0; i < n_sample; i++) {
2863 npairs_sample[i] = 0;
2865 xi_sample_real[i] = 0.;
2866 xi_sample_imag[i] = 0.;
2872#pragma omp parallel for collapse(3)
2874 for (
int i = 0; i < this->params.ngrid[0]; i++) {
2875 for (
int j = 0; j < this->params.ngrid[1]; j++) {
2876 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
2877 long long idx_grid = ret_grid_index(i, j,
k);
2880 ret_grid_pos_vector(i, j,
k, rv);
2884 int idx_r = int(r_ / dr_sample);
2885 if (0 <= idx_r && idx_r < n_sample) {
2886 std::complex<double> xi_pair(
2887 this->twopt_3d[idx_grid][0], this->twopt_3d[idx_grid][1]
2896 double xi_pair_real = xi_pair.real();
2897 double xi_pair_imag = xi_pair.imag();
2901 npairs_sample[idx_r]++;
2903 r_sample[idx_r] += r_;
2905 xi_sample_real[idx_r] += xi_pair_real;
2907 xi_sample_imag[idx_r] += xi_pair_imag;
2913 for (
int i = 0; i < n_sample; i++) {
2914 xi_sample[i] = xi_sample_real[i] +
trvm::M_I * xi_sample_imag[i];
2918 for (
int ibin = 0; ibin < rbinning.
num_bins; ibin++) {
2919 double r_lower = rbinning.
bin_edges[ibin];
2920 double r_upper = rbinning.
bin_edges[ibin + 1];
2921 for (
int i = 0; i < n_sample; i++) {
2922 double r_ = i * dr_sample;
2923 if (r_lower <= r_ && r_ < r_upper) {
2924 this->
npairs[ibin] += npairs_sample[i];
2925 this->
r[ibin] += r_sample[i];
2926 this->
xi[ibin] += xi_sample[i];
2930 if (this->
npairs[ibin] != 0) {
2931 this->
r[ibin] /= double(this->
npairs[ibin]);
2932 this->
xi[ibin] /= double(this->
npairs[ibin]);
2936 this->
xi[ibin] = 0.;
2940 delete[] npairs_sample;
2942 delete[] xi_sample_real;
2943 delete[] xi_sample_imag;
2949 std::vector< std::complex<double> >& ylm_a,
2950 std::vector< std::complex<double> >& ylm_b,
2951 std::complex<double> shotnoise_amp,
2955 trvs::logger.debug(
"Computing uncoupled shot noise for 3PCF.");
2958 this->resize_stats(rbinning.
num_bins);
2962 if (!this->if_fields_compatible(field_a, field_b)) {
2965 "Input mesh fields have incompatible physical properties."
2969 "Input mesh fields have incompatible physical properties."
2973 auto ret_grid_pos_vector = [&field_a](
int i,
int j,
int k,
double rvec[3]) {
2974 field_a.get_grid_pos_vector(i, j,
k, rvec);
2977 this->compute_shotnoise_aliasing();
2979 std::function<double(
int,
int,
int)> calc_shotnoise_aliasing = [
this](
2982 long long idx_grid = ret_grid_index(i, j,
k);
2983 return this->alias_sn[idx_grid];
2986 std::function<double(
int,
int,
int)> calc_win_pk, calc_win_sn;
2987 int assignment_order = this->params.assignment_order;
2988 if (this->params.interlace ==
"true") {
2989 calc_win_pk = [&field_a, &field_b, &assignment_order](
2993 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
2994 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
2996 calc_win_sn = calc_win_pk;
2998 if (this->params.interlace ==
"false") {
2999#ifndef DBG_FLAG_NOAC
3000 calc_win_sn = calc_shotnoise_aliasing;
3001 calc_win_pk = calc_win_sn;
3003 calc_win_pk = [&field_a, &field_b, &assignment_order](
3007 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
3008 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
3010 calc_win_sn = calc_shotnoise_aliasing;
3016#pragma omp parallel for collapse(3)
3018 for (
int i = 0; i < this->params.ngrid[0]; i++) {
3019 for (
int j = 0; j < this->params.ngrid[1]; j++) {
3020 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
3021 long long idx_grid = ret_grid_index(i, j,
k);
3023 std::complex<double> fa(field_a[idx_grid][0], field_a[idx_grid][1]);
3024 std::complex<double> fb(field_b[idx_grid][0], field_b[idx_grid][1]);
3026 std::complex<double> pk_mode = fa * std::conj(fb);
3027 std::complex<double> sn_mode =
3028 shotnoise_amp * calc_shotnoise_aliasing(i, j,
k);
3031 double win_pk = calc_win_pk(i, j,
k);
3032 double win_sn = calc_win_sn(i, j,
k);
3039 this->twopt_3d[idx_grid][0] = pk_mode.real() / this->vol;
3040 this->twopt_3d[idx_grid][1] = pk_mode.imag() / this->vol;
3047#if defined(TRV_USE_HIP)
3048 trva::copy_complex_array_htod(
3049 this->twopt_3d, this->d_twopt_3d, this->params.nmesh
3051 HIPFFT_EXEC(hipfftExecZ2Z(
3052 this->inv_transform_gpu, this->d_twopt_3d, this->d_twopt_3d,
3055 trva::copy_complex_array_dtoh(
3056 this->d_twopt_3d, this->twopt_3d, this->params.nmesh
3058#elif defined(TRV_USE_CUDA)
3060 trva::copy_complex_array_htod(
3061 this->twopt_3d, this->d_twopt_3d, this->params.nmesh
3063 CUFFT_EXEC(cufftExecZ2Z(
3064 this->inv_transform_gpu, this->d_twopt_3d, this->d_twopt_3d,
3067 trva::copy_complex_array_dtoh(
3068 this->d_twopt_3d, this->twopt_3d, this->params.nmesh
3071 trva::copy_complex_array_htod_mgpu(
3072 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d
3074 CUFFT_EXEC(cufftXtExecDescriptor(
3075 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d_desc,
3078 trva::copy_complex_array_dtoh_mgpu(
3079 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d
3084 if (this->plan_ini) {
3085 fftw_execute(this->inv_transform);
3087 fftw_execute_dft(field_a.inv_transform, this->twopt_3d, this->twopt_3d);
3095 const int n_sample = 1e5;
3096 const double dr_sample = 1.;
3098 int* npairs_sample =
new int[n_sample];
3099 double* r_sample =
new double[n_sample];
3100 double* xi_sample_real =
new double[n_sample];
3101 double* xi_sample_imag =
new double[n_sample];
3102 std::complex<double>* xi_sample =
new std::complex<double>[n_sample];
3106 for (
int i = 0; i < n_sample; i++) {
3107 npairs_sample[i] = 0;
3109 xi_sample_real[i] = 0.;
3110 xi_sample_imag[i] = 0.;
3116#pragma omp parallel for collapse(3)
3118 for (
int i = 0; i < this->params.ngrid[0]; i++) {
3119 for (
int j = 0; j < this->params.ngrid[1]; j++) {
3120 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
3121 long long idx_grid = ret_grid_index(i, j,
k);
3124 ret_grid_pos_vector(i, j,
k, rv);
3128 int idx_r = int(r_ / dr_sample);
3129 if (0 <= idx_r && idx_r < n_sample) {
3130 std::complex<double> xi_pair(
3131 this->twopt_3d[idx_grid][0], this->twopt_3d[idx_grid][1]
3135 xi_pair *= ylm_a[idx_grid] * ylm_b[idx_grid];
3137 double xi_pair_real = xi_pair.real();
3138 double xi_pair_imag = xi_pair.imag();
3142 npairs_sample[idx_r]++;
3144 r_sample[idx_r] += r_;
3146 xi_sample_real[idx_r] += xi_pair_real;
3148 xi_sample_imag[idx_r] += xi_pair_imag;
3154 for (
int i = 0; i < n_sample; i++) {
3155 xi_sample[i] = xi_sample_real[i] +
trvm::M_I * xi_sample_imag[i];
3159 for (
int ibin = 0; ibin < rbinning.
num_bins; ibin++) {
3160 double r_lower = rbinning.
bin_edges[ibin];
3161 double r_upper = rbinning.
bin_edges[ibin + 1];
3162 for (
int i = 0; i < n_sample; i++) {
3163 double r_ = i * dr_sample;
3164 if (r_lower <= r_ && r_ < r_upper) {
3165 this->
npairs[ibin] += npairs_sample[i];
3166 this->
r[ibin] += r_sample[i];
3167 this->
xi[ibin] += xi_sample[i];
3171 if (this->
npairs[ibin] != 0) {
3172 this->
r[ibin] /= double(this->
npairs[ibin]);
3173 this->
xi[ibin] /= double(this->
npairs[ibin]);
3176 this->
xi[ibin] = 0.;
3181 double norm_factors = 1 / this->vol_cell
3182 * std::pow(-1, this->params.ell1 + this->params.ell2);
3184 for (
int ibin = 0; ibin < rbinning.
num_bins; ibin++) {
3185 if (this->
npairs[ibin] != 0) {
3186 this->
xi[ibin] *= norm_factors / double(this->
npairs[ibin]);
3191 delete[] npairs_sample;
3193 delete[] xi_sample_real;
3194 delete[] xi_sample_imag;
3198std::complex<double> \
3199FieldStats::compute_uncoupled_shotnoise_for_bispec_per_bin(
3201 std::vector< std::complex<double> >& ylm_a,
3202 std::vector< std::complex<double> >& ylm_b,
3204 std::complex<double> shotnoise_amp,
3205 double k_a,
double k_b
3209 "Computing uncoupled shot noise for bispectrum "
3210 "at wavenumbers (%f, %f).",
3217 if (!this->if_fields_compatible(field_a, field_b)) {
3220 "Input mesh fields have incompatible physical properties."
3224 "Input mesh fields have incompatible physical properties."
3228 auto ret_grid_pos_vector = [&field_a](
int i,
int j,
int k,
double rvec[3]) {
3229 field_a.get_grid_pos_vector(i, j,
k, rvec);
3232 this->compute_shotnoise_aliasing();
3234 std::function<double(
int,
int,
int)> calc_shotnoise_aliasing = [
this](
3237 long long idx_grid = ret_grid_index(i, j,
k);
3238 return this->alias_sn[idx_grid];
3241 std::function<double(
int,
int,
int)> calc_win_pk, calc_win_sn;
3242 int assignment_order = this->params.assignment_order;
3243 if (this->params.interlace ==
"true") {
3244 calc_win_pk = [&field_a, &field_b, &assignment_order](
3248 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
3249 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
3251 calc_win_sn = calc_win_pk;
3253 if (this->params.interlace ==
"false") {
3254#ifndef DBG_FLAG_NOAC
3255 calc_win_sn = calc_shotnoise_aliasing;
3256 calc_win_pk = calc_win_sn;
3258 calc_win_pk = [&field_a, &field_b, &assignment_order](
3262 field_a.calc_assignment_window_in_fourier(i, j,
k, assignment_order)
3263 * field_b.calc_assignment_window_in_fourier(i, j,
k, assignment_order);
3265 calc_win_sn = calc_shotnoise_aliasing;
3271#pragma omp parallel for collapse(3)
3273 for (
int i = 0; i < this->params.ngrid[0]; i++) {
3274 for (
int j = 0; j < this->params.ngrid[1]; j++) {
3275 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
3276 long long idx_grid = ret_grid_index(i, j,
k);
3278 std::complex<double> fa(field_a[idx_grid][0], field_a[idx_grid][1]);
3279 std::complex<double> fb(field_b[idx_grid][0], field_b[idx_grid][1]);
3281 std::complex<double> pk_mode = fa * std::conj(fb);
3282 std::complex<double> sn_mode =
3283 shotnoise_amp * calc_shotnoise_aliasing(i, j,
k);
3286 double win_pk = calc_win_pk(i, j,
k);
3287 double win_sn = calc_win_sn(i, j,
k);
3294 this->twopt_3d[idx_grid][0] = pk_mode.real() / this->vol;
3295 this->twopt_3d[idx_grid][1] = pk_mode.imag() / this->vol;
3302#if defined(TRV_USE_HIP)
3303 trva::copy_complex_array_htod(
3304 this->twopt_3d, this->d_twopt_3d, this->params.nmesh
3306 HIPFFT_EXEC(hipfftExecZ2Z(
3307 this->inv_transform_gpu, this->d_twopt_3d, this->d_twopt_3d,
3310 trva::copy_complex_array_dtoh(
3311 this->d_twopt_3d, this->twopt_3d, this->params.nmesh
3313#elif defined(TRV_USE_CUDA)
3315 trva::copy_complex_array_htod(
3316 this->twopt_3d, this->d_twopt_3d, this->params.nmesh
3318 CUFFT_EXEC(cufftExecZ2Z(
3319 this->inv_transform_gpu, this->d_twopt_3d, this->d_twopt_3d,
3322 trva::copy_complex_array_dtoh(
3323 this->d_twopt_3d, this->twopt_3d, this->params.nmesh
3326 trva::copy_complex_array_htod_mgpu(
3327 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d
3329 CUFFT_EXEC(cufftXtExecDescriptor(
3330 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d_desc,
3333 trva::copy_complex_array_dtoh_mgpu(
3334 this->inv_transform_gpu, this->twopt_3d_desc, this->twopt_3d
3339 if (this->plan_ini) {
3340 fftw_execute(this->inv_transform);
3342 fftw_execute_dft(field_a.inv_transform, this->twopt_3d, this->twopt_3d);
3349 double S_ij_k_real = 0., S_ij_k_imag = 0.;
3352#pragma omp parallel reduction(+:S_ij_k_real, S_ij_k_imag)
3360#pragma omp for collapse(3)
3362 for (
int i = 0; i < this->params.ngrid[0]; i++) {
3363 for (
int j = 0; j < this->params.ngrid[1]; j++) {
3364 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
3365 long long idx_grid = ret_grid_index(i, j,
k);
3368 ret_grid_pos_vector(i, j,
k, rv);
3372 double ja = sj_a_thread.
eval(k_a * r_);
3373 double jb = sj_b_thread.
eval(k_b * r_);
3375 std::complex<double> S_ij_k_3d(
3376 this->twopt_3d[idx_grid][0], this->twopt_3d[idx_grid][1]
3379 S_ij_k_3d *= ja * jb * ylm_a[idx_grid] * ylm_b[idx_grid];
3381 double S_ij_k_3d_real = S_ij_k_3d.real();
3382 double S_ij_k_3d_imag = S_ij_k_3d.imag();
3384 S_ij_k_real += S_ij_k_3d_real;
3385 S_ij_k_imag += S_ij_k_3d_imag;
3391 std::complex<double> S_ij_k(S_ij_k_real, S_ij_k_imag);
3393 S_ij_k *= this->vol_cell;
3403std::function<double(
int,
int,
int)> FieldStats::ret_calc_shotnoise_aliasing()
3407 return [
this](
int i,
int j,
int k) {
3408 return calc_shotnoise_aliasing_interlaced_isoapprox(
3414 if (this->params.assignment ==
"ngp") {
3415 return [
this](
int i,
int j,
int k) {
3416 return calc_shotnoise_aliasing_ngp(i, j,
k);
3419 if (this->params.assignment ==
"cic") {
3420 return [
this](
int i,
int j,
int k) {
3421 return calc_shotnoise_aliasing_cic(i, j,
k);
3424 if (this->params.assignment ==
"tsc") {
3425 return [
this](
int i,
int j,
int k) {
3426 return calc_shotnoise_aliasing_tsc(i, j,
k);
3429 if (this->params.assignment ==
"pcs") {
3430 return [
this](
int i,
int j,
int k) {
3431 return calc_shotnoise_aliasing_pcs(i, j,
k);
3437 "Invalid assignment scheme: '%s'.", this->params.assignment.c_str()
3440 throw trvs::InvalidParameterError(
3441 "Invalid assignment scheme: '%s'.", this->params.assignment.c_str()
3445void FieldStats::get_shotnoise_aliasing_sin2(
3446 int i,
int j,
int k,
double& sx2,
double& sy2,
double& sz2
3448 this->shift_grid_indices_fourier(i, j,
k);
3450 double u_x = M_PI * i / double(this->params.ngrid[0]);
3451 double u_y = M_PI * j / double(this->params.ngrid[1]);
3452 double u_z = M_PI *
k / double(this->params.ngrid[2]);
3454 sx2 = (i != 0) ? std::sin(u_x) * std::sin(u_x) : 0.;
3455 sy2 = (j != 0) ? std::sin(u_y) * std::sin(u_y) : 0.;
3456 sz2 = (
k != 0) ? std::sin(u_z) * std::sin(u_z) : 0.;
3459void FieldStats::get_shotnoise_aliasing_sin2_cos2_isoapprox(
3460 int i,
int j,
int k,
double& s2h,
double& c2h
3462 this->shift_grid_indices_fourier(i, j,
k);
3464 double u_x = M_PI * i / double(this->params.ngrid[0]);
3465 double u_y = M_PI * j / double(this->params.ngrid[1]);
3466 double u_z = M_PI *
k / double(this->params.ngrid[2]);
3468 double uhalf = std::sqrt(u_x * u_x + u_y * u_y + u_z * u_z) / 2.;
3470 s2h = std::sin(uhalf) * std::sin(uhalf);
3471 c2h = std::cos(uhalf) * std::cos(uhalf);
3475PURE
double FieldStats::calc_shotnoise_aliasing_ngp(
int i,
int j,
int k) {
3479double FieldStats::calc_shotnoise_aliasing_cic(
int i,
int j,
int k) {
3480 double sx2, sy2, sz2;
3481 this->get_shotnoise_aliasing_sin2(i, j,
k, sx2, sy2, sz2);
3483 return (1. - 2./3. * sx2) * (1. - 2./3. * sy2) * (1. - 2./3. * sz2);
3486double FieldStats::calc_shotnoise_aliasing_tsc(
int i,
int j,
int k) {
3487 double sx2, sy2, sz2;
3488 this->get_shotnoise_aliasing_sin2(i, j,
k, sx2, sy2, sz2);
3490 return (1. - sx2 + 2./15. * sx2 * sx2)
3491 * (1. - sy2 + 2./15. * sy2 * sy2)
3492 * (1. - sz2 + 2./15. * sz2 * sz2);
3495double FieldStats::calc_shotnoise_aliasing_pcs(
int i,
int j,
int k) {
3496 double sx2, sy2, sz2;
3497 this->get_shotnoise_aliasing_sin2(i, j,
k, sx2, sy2, sz2);
3499 return (1. - 4./3. * sx2 + 2./5. * sx2 * sx2 - 4./315. * sx2 * sx2 * sx2)
3500 * (1. - 4./3. * sy2 + 2./5. * sy2 * sy2 - 4./315. * sy2 * sy2 * sy2)
3501 * (1. - 4./3. * sz2 + 2./5. * sz2 * sz2 - 4./315. * sz2 * sz2 * sz2);
3504double FieldStats::calc_shotnoise_aliasing_interlaced_isoapprox(
3505 int i,
int j,
int k,
int order
3508 this->get_shotnoise_aliasing_sin2_cos2_isoapprox(i, j,
k, s2h, c2h);
3510 double cc = std::pow(c2h, order);
3517 cs = 1. - 2./3. * s2h;
3520 cs = 1. - s2h + 2./15. * s2h * s2h;
3523 cs = 1. - 4./3. * s2h + 2./5. * s2h * s2h - 4./315. * s2h * s2h * s2h;
3529void FieldStats::compute_shotnoise_aliasing() {
3530 if (this->alias_ini) {
return;}
3534 "Computing shot noise aliasing function in Fourier space "
3535 "for assignment order %d.",
3536 this->params.assignment_order
3540 this->alias_sn.resize(this->params.nmesh, 0.);
3548 std::function<double(
int,
int,
int)> calc_shotnoise_aliasing =
3549 this->ret_calc_shotnoise_aliasing();
3552#pragma omp parallel for collapse(3)
3554 for (
int i = 0; i < this->params.ngrid[0]; i++) {
3555 for (
int j = 0; j < this->params.ngrid[1]; j++) {
3556 for (
int k = 0;
k < this->params.ngrid[2];
k++) {
3557 long long idx_grid = ret_grid_index(i, j,
k);
3558 this->alias_sn[idx_grid] = calc_shotnoise_aliasing(i, j,
k);
3563 this->alias_ini =
true;
Isotropic coordinate binning.
std::vector< double > bin_edges
bin edges
std::vector< double > bin_centres
bin centres
std::string space
coordinate space
double bin_max
highest bin edge
int num_bins
number of bins
std::vector< std::complex< double > > sn
shot-noise power in bins
void reset_stats()
Reset two-point statistics.
std::vector< std::complex< double > > pk
pseudo power spectrum in bins
void compute_uncoupled_shotnoise_for_3pcf(MeshField &field_a, MeshField &field_b, std::vector< std::complex< double > > &ylm_a, std::vector< std::complex< double > > &ylm_b, std::complex< double > shotnoise_amp, trv::Binning &rbinning)
Compute binned uncoupled three-point correlation function shot noise.
~FieldStats()
Destruct two-point statistics.
std::vector< int > nmodes
number of wavevector modes in bins
FieldStats(trv::ParameterSet ¶ms, bool plan_ini=true)
Construct pseudo two-point statistics.
std::vector< double > k
average wavenumber in bins
std::vector< std::complex< double > > xi
pseudo two-point correlation function in bins
std::vector< int > npairs
number of separation pairs in bins
void compute_ylm_wgtd_2pt_stats_in_fourier(MeshField &field_a, MeshField &field_b, std::complex< double > shotnoise_amp, int ell, int m, trv::Binning &kbinning)
Compute binned two-point statistics in Fourier space.
trv::BinnedVectors record_binned_vectors(trv::Binning &binning, const std::string &save_file)
Record binned vectors given a binning scheme.
void compute_ylm_wgtd_2pt_stats_in_config(MeshField &field_a, MeshField &field_b, std::complex< double > shotnoise_amp, int ell, int m, trv::Binning &rbinning)
Compute binned two-point statistics in configuration space.
Discretely sampled field on a mesh grid from particle catalogues.
trv::ParameterSet params
parameter set
void inv_fourier_transform()
Inverse Fourier transform the (FFT-transformed) field.
void inv_fourier_transform_sjl_ylm_wgtd_field(MeshField &field_fourier, std::vector< std::complex< double > > &ylm, trvm::SphericalBesselCalculator &sjl, double r)
Inverse Fourier transform a field weighted by the spherical Bessel function and reduced spherical ha...
std::string name
field name
void assign_weighted_field_to_mesh(ParticleCatalogue &particles, fftw_complex *weights)
Assign a weighted field to a mesh by interpolation scheme.
double calc_grid_based_powlaw_norm(ParticleCatalogue &particles, int order)
void reset_density_field()
(Re-)initialise the complex field (and its shadow) on mesh.
void inv_fourier_transform_ylm_wgtd_field_band_limited(MeshField &field_fourier, std::vector< std::complex< double > > &ylm, double k_band, double dk_band, double &k_eff, int &nmodes)
Inverse Fourier transform a field weighted by the reduced spherical harmonics restricted to a wavenu...
double dr[3]
grid size in each dimension
void fourier_transform()
Fourier transform the field.
void compute_ylm_wgtd_field(ParticleCatalogue &particles_data, ParticleCatalogue &particles_rand, LineOfSight *los_data, LineOfSight *los_rand, double alpha, int ell, int m)
Compute the weighted field (fluctuations) further weighted by the reduced spherical harmonics.
void compute_unweighted_field(ParticleCatalogue &particles)
Compute the unweighted field.
void apply_wide_angle_pow_law_kernel()
Apply wide-angle correction kernel in configuration space.
void compute_ylm_wgtd_quad_field(ParticleCatalogue &particles_data, ParticleCatalogue &particles_rand, LineOfSight *los_data, LineOfSight *los_rand, double alpha, int ell, int m)
Compute the quadratic weighted field (fluctuations) further weighted by the reduced spherical harmoni...
fftw_complex * field
complex field on mesh
MeshField(trv::ParameterSet ¶ms, bool plan_ini=true, const std::string &name="mesh-field")
Construct the mesh field.
void compute_unweighted_field_fluctuations_insitu(ParticleCatalogue &particles)
Compute the unweighted field fluctuations.
double dk[3]
fundamental wavenumber in each dimension
~MeshField()
Destruct the mesh field.
double vol_cell
mesh grid cell volume
void apply_assignment_compensation()
Apply compensation needed after Fourier transform for assignment schemes.
const fftw_complex & operator[](long long gid)
Return mesh field grid cell value.
long long nmesh
number of mesh grid cells
std::string fftw_scheme
FFTW scheme: {"estimate", "measure" (default), "patient"}.
double volume
box volume (in Mpc^3/h^3)
std::string fftw_wisdom_file_b
backward-transform wisdom file path
std::string interlace
interlacing switch: {"true"/"on", "false"/"off" (default)}
int assignment_order
order of the assignment scheme
std::string fftw_wisdom_file_f
derived FFTW wisdom file paths
int ngrid[3]
grid cell number in each dimension
double boxsize[3]
box size (in Mpc/h) in each dimension
std::string use_fftw_wisdom
use FFTW wisdom: {"false" (default), <path-to-dir>}
double pos_min[3]
minimum particle coordinates
int ntotal
total number of particles
double pos_max[3]
maximum particle coordinates
Interpolated spherical Bessel function of the first kind.
double eval(double x)
Evaluate the interpolated function.
static std::complex< double > calc_reduced_spherical_harmonic(const int ell, const int m, double pos[3])
Calculate the reduced spherical harmonic.
Exception raised when the data to be operated on are invalid.
Exception raised when parameters are invalid.
void info(const char *fmt_string,...)
Emit a information-level message.
void debug(const char *fmt_string,...)
Emit a debugging-level message.
void error(const char *fmt_string,...)
Emit a warning-level message.
Mesh field (with one-point statistics) and pseudo two-point statistics.
std::vector< int > get_sorted_indices(std::vector< int > sorting_vector)
Get the sorted indices.
void print_binned_vectors_to_file(std::FILE *fileptr, trv::ParameterSet ¶ms, trv::BinnedVectors &binned_vectors)
Print binned vectors to a file.
double get_vec3d_magnitude(std::vector< double > vec)
Return the magnitude of a 3-d vector.
const std::complex< double > M_I
imaginary unit
double gbytesMem
current memory usage in gibibytes
void update_maxmem(bool gpu=false)
Update the maximum memory usage estimate.
double size_in_gb(long long num)
Return size in gibibytes.
std::vector< int > get_gpu_ids()
Get the indices of GPUs available for use.
bool fftw_wisdom_b_imported
wisdom import status for backward transform
bool is_gpu_enabled()
Check if GPU mode is enabled.
bool fftw_wisdom_f_imported
wisdom import status for forward transform
Logger logger
default logger (at NSET logging level)
float count_grid
number of grids
double gbytesMemGPU
current (GPU) memory usage in gibibytes
bool is_gpu_single()
Check if in single-GPU mode.
int count_fft
number of FFTs
int count_cgrid
number of 3-d complex grids
int count_rgrid
number of 3-d real grids
int count_ifft
number of IFFTs
void update_maxcntgrid()
Update the maximum 3-d grid counts.
std::vector< double > vecy
y-components
std::vector< double > vecz
z-components
int count
number of vectors
int num_bins
number of bins
std::vector< double > upper_edges
upper bin edges
std::vector< double > lower_edges
lower bin edges
std::vector< double > vecx
x-components
std::vector< int > indices
bin indices
double pos[3]
3-d position vector