1
|
/* FILE: fft.cc -*-Mode: c++-*-
|
2
|
*
|
3
|
* C++ code to do 1 and 2 dimensional FFT's.
|
4
|
*
|
5
|
*/
|
6
|
|
7
|
#include <string.h>
|
8
|
|
9
|
#include "nb.h"
|
10
|
#include "fft.h"
|
11
|
|
12
|
#ifdef USE_MPI
|
13
|
#include "mmsmpi.h"
|
14
|
#endif /* USE_MPI */
|
15
|
|
16
|
/* End includes */
|
17
|
|
18
|
#undef USE_COMPLEX
|
19
|
|
20
|
#ifndef OLD_CODE
|
21
|
#define CMULT(xr,xi,yr,yi,zr,zi) (zr) = (xr)*(yr)-(xi)*(yi), \
|
22
|
(zi) = (xr)*(yi)+(xi)*(yr)
|
23
|
#else
|
24
|
extern inline void CMULT(const MY_COMPLEX_REAL_TYPE &xr,
|
25
|
const MY_COMPLEX_REAL_TYPE &xi,
|
26
|
const MY_COMPLEX_REAL_TYPE &yr,
|
27
|
const MY_COMPLEX_REAL_TYPE &yi,
|
28
|
MY_COMPLEX_REAL_TYPE &zr,
|
29
|
MY_COMPLEX_REAL_TYPE &zi)
|
30
|
{
|
31
|
zr = xr*yr-xi*yi;
|
32
|
zi = xr*yi+xi*yr;
|
33
|
}
|
34
|
#endif // OLD_CODE
|
35
|
|
36
|
void FFT::ReleaseMemory(void)
|
37
|
{
|
38
|
if(vecsize>0) {
|
39
|
if(Uforward!=(MyComplex *)NULL) delete[] Uforward;
|
40
|
if(Uinverse!=(MyComplex *)NULL) delete[] Uinverse;
|
41
|
if(permindex!=(int *)NULL) delete[] permindex;
|
42
|
}
|
43
|
vecsize=0;
|
44
|
Uforward=Uinverse=(MyComplex *)NULL;
|
45
|
permindex=(int *)NULL;
|
46
|
}
|
47
|
|
48
|
FFT::~FFT(void)
|
49
|
{
|
50
|
ReleaseMemory();
|
51
|
}
|
52
|
|
53
|
void FFT::Setup(int size)
|
54
|
{
|
55
|
if(size==vecsize) return;
|
56
|
if(size<1) PlainError(1,"Error in FFT::Setup(int): "
|
57
|
"Requested length (%d) must be >0",size);
|
58
|
int k;
|
59
|
// Check that size is power of 2
|
60
|
for(k=size;k>2;k/=2)
|
61
|
if(k%2!=0) PlainError(1,"Error in FFT::Setup(int): "
|
62
|
"Requested length (%d) is not a power of 2",size);
|
63
|
ReleaseMemory();
|
64
|
vecsize=size;
|
65
|
|
66
|
// Allocate and setup MyComplex arrays
|
67
|
if((Uforward=new MyComplex[size])==0)
|
68
|
PlainError(1,"Error in FFT::Setup %s",ErrNoMem);
|
69
|
if((Uinverse=new MyComplex[size])==0)
|
70
|
PlainError(1,"Error in FFT::Setup %s",ErrNoMem);
|
71
|
#ifdef ORIG_CODE
|
72
|
double baseang= -2*PI/double(size);
|
73
|
Uforward[0]=Uinverse[0]=MyComplex(1,0);
|
74
|
double x,y;
|
75
|
for(k=1;k<size/2;k++) {
|
76
|
x = cos(baseang*k);
|
77
|
y = -sqrt(1-x*x);
|
78
|
y += (1-(x*x+y*y))/(2*y); // Tiny error correction
|
79
|
x += (1-(x*x+y*y))/(2*x);
|
80
|
Uforward[k]=Uinverse[size-k]=MyComplex(x,y);
|
81
|
Uforward[size-k]=Uinverse[k]=MyComplex(x,-y);
|
82
|
}
|
83
|
if(size>1) {
|
84
|
Uforward[size/2]=Uinverse[size/2]=MyComplex(-1,0);
|
85
|
}
|
86
|
#else
|
87
|
double baseang = -2*PI/double(size);
|
88
|
for(k=1;k<size/8;k++) {
|
89
|
double angle=k*baseang;
|
90
|
double y=sin(angle);
|
91
|
double x=cos(angle);
|
92
|
|
93
|
|
94
|
Uforward[k]=Uinverse[size-k]=MyComplex(x,y);
|
95
|
Uforward[size/2-k]=Uinverse[size/2+k]=MyComplex(-x,y);
|
96
|
Uforward[size/2+k]=Uinverse[size/2-k]=MyComplex(-x,-y);
|
97
|
Uforward[size-k]=Uinverse[k]=MyComplex(x,-y);
|
98
|
|
99
|
Uforward[size/4-k]=Uinverse[3*size/4+k]=MyComplex(-y,-x);
|
100
|
Uforward[size/4+k]=Uinverse[3*size/4-k]=MyComplex(y,-x);
|
101
|
Uforward[3*size/4-k]=Uinverse[size/4+k]=MyComplex(y,x);
|
102
|
Uforward[3*size/4+k]=Uinverse[size/4-k]=MyComplex(-y,x);
|
103
|
}
|
104
|
Uforward[0]=Uinverse[0]=MyComplex(1,0);
|
105
|
if(size>1) {
|
106
|
Uforward[size/2]=Uinverse[size/2]=MyComplex(-1,0);
|
107
|
}
|
108
|
if(size>3) {
|
109
|
Uforward[size/4]=Uinverse[3*size/4]=MyComplex(0,-1);
|
110
|
Uforward[3*size/4]=Uinverse[size/4]=MyComplex(0,1);
|
111
|
}
|
112
|
if(size>7) {
|
113
|
double x=SQRT1_2; // 1/sqrt(2)
|
114
|
double y=-x;
|
115
|
Uforward[size/8]=Uinverse[7*size/8]=MyComplex(x,y);
|
116
|
Uforward[3*size/8]=Uinverse[5*size/8]=MyComplex(-x,y);
|
117
|
Uforward[5*size/8]=Uinverse[3*size/8]=MyComplex(-x,-y);
|
118
|
Uforward[7*size/8]=Uinverse[size/8]=MyComplex(x,-y);
|
119
|
}
|
120
|
#endif // ORIG_CODE
|
121
|
|
122
|
// Allocate and setup (bit-reversal) permutation index
|
123
|
if((permindex=new int[size])==0)
|
124
|
PlainError(1,"Error in FFT::Setup %s",ErrNoMem);
|
125
|
permindex[0]=0;
|
126
|
int m,n; // The following code relies heavily on size==2^log2vecsize
|
127
|
for(k=1,n=size>>1;k<size;k++) {
|
128
|
// At each step, n is bit-reversed pattern of k
|
129
|
if(n>k) permindex[k]=n; // Swap index
|
130
|
else permindex[k]=0; // Do nothing: Index already swapped or the same
|
131
|
// Calculate next n
|
132
|
m=size>>1;
|
133
|
while(m>0 && n&m) { n-=m; m>>=1; }
|
134
|
n+=m;
|
135
|
}
|
136
|
}
|
137
|
|
138
|
void FFT::ForwardDecFreq(int size,MyComplex *vec,FFT_REAL_TYPE divisor)
|
139
|
{
|
140
|
if(divisor==0) divisor=1.0; // Default is no normalization on forward FFT
|
141
|
Setup(size);
|
142
|
BaseDecFreqForward(vec);
|
143
|
Permute(vec);
|
144
|
if(divisor!=0. && divisor!=1.) {
|
145
|
MY_COMPLEX_REAL_TYPE mult=1./divisor;
|
146
|
for(int k=0;k<size;k++) vec[k]*=mult;
|
147
|
}
|
148
|
}
|
149
|
|
150
|
void FFT::InverseDecTime(int size,MyComplex *vec,FFT_REAL_TYPE divisor)
|
151
|
{
|
152
|
if(divisor==0) divisor=(FFT_REAL_TYPE)size;
|
153
|
/// Default divisor on iFFT is 'size'
|
154
|
Setup(size);
|
155
|
Permute(vec);
|
156
|
BaseDecTimeInverse(vec);
|
157
|
if(divisor!=0. && divisor!=1.) {
|
158
|
MY_COMPLEX_REAL_TYPE mult=1./divisor;
|
159
|
for(int k=0;k<size;k++) vec[k]*=mult;
|
160
|
}
|
161
|
|
162
|
}
|
163
|
|
164
|
inline void Swap(MyComplex &a,MyComplex &b)
|
165
|
{ MyComplex c(a); a=b; b=c; }
|
166
|
|
167
|
void FFT::Permute(MyComplex *vec)
|
168
|
{ /* Bit reversal permutation */
|
169
|
int i,j;
|
170
|
for(i=0;i<vecsize;i++) {
|
171
|
if((j=permindex[i])!=0) Swap(vec[i],vec[j]);
|
172
|
}
|
173
|
}
|
174
|
|
175
|
void FFT::BaseDecFreqForward(MyComplex *vec)
|
176
|
{ // In-place forward FFT using Decimation in Frequency technique,
|
177
|
// *WITHOUT* resuffling of indices.
|
178
|
// NOTE 1: This code does not use the Complex class, because
|
179
|
// some compilers do not effectively optimize around
|
180
|
// Complex operations such as multiplication. So this
|
181
|
// routine just makes use of ordinary type "FFT_REAL_TYPE"
|
182
|
// variables, and assumes each Complex variable is
|
183
|
// actually two consecutive "MY_COMPLEX_REAL_TYPE" variables.
|
184
|
// NOTE 2: See notes in MJD's micromagnetics notebook, 11-Sep-96
|
185
|
// (p. 62) and 29-Sep-96 (p. 69).
|
186
|
// NOTE 3: This code has been optimized for performance on cascade.cam,
|
187
|
// a PentiumPro 200 MHz machine using a stock gcc 2.7.2
|
188
|
// compiler. In particular, the x86 chips suffer from a
|
189
|
// shortage of registers.
|
190
|
// NOTE 4: Some compromise made to RISC architectures on 27-May-1997,
|
191
|
// by moving all loads before any stores in main loop. As
|
192
|
// done, it hurts performance on PentiumPro-200 only a couple
|
193
|
// of percent. (mjd)
|
194
|
|
195
|
#define datat float /*Added by Guru on 11/10/2010: created data type datat; specified datat as required*/
|
196
|
typedef datat FFT_REAL_TYPE; /*Added by Guru on 11/10/2010: converted FFT_REAL_TYPE to datat from double*/
|
197
|
|
198
|
|
199
|
if(vecsize==1) return; // Nothing to do
|
200
|
|
201
|
MY_COMPLEX_REAL_TYPE *v;
|
202
|
|
203
|
MY_COMPLEX_REAL_TYPE const *const U=(MY_COMPLEX_REAL_TYPE *)Uforward;
|
204
|
MY_COMPLEX_REAL_TYPE *const dvec=(MY_COMPLEX_REAL_TYPE *)vec;
|
205
|
|
206
|
int block,blocksize,blockcount,offset,uoff1;
|
207
|
int halfbs,threehalfbs; // Half blocksize,3/2 blocksize
|
208
|
FFT_REAL_TYPE m1x,m1y,m2x,m2y,m3x,m3y;
|
209
|
FFT_REAL_TYPE x0,y0,x1,y1,x2,y2,x3,y3;
|
210
|
FFT_REAL_TYPE xs02,ys02,xd02,yd02,xs13,ys13,xd13,yd13;
|
211
|
FFT_REAL_TYPE t1x,t1y,t2x,t2y,t3x,t3y;
|
212
|
|
213
|
|
214
|
// Blocksize>4
|
215
|
for(blocksize=vecsize,blockcount=1;blocksize>4;
|
216
|
blocksize/=4,blockcount*=4) {
|
217
|
// Loop through double-step matrix multiplications
|
218
|
halfbs=blocksize/2; threehalfbs=blocksize+halfbs;
|
219
|
for(block=0,v=dvec;block<blockcount;block++,v+=2*blocksize) {
|
220
|
for(offset=0;offset<halfbs;offset+=2) {
|
221
|
uoff1=offset*blockcount;
|
222
|
m1x=U[uoff1]; m1y=U[uoff1+1];
|
223
|
m2x=U[2*uoff1]; m2y=U[2*uoff1+1];
|
224
|
m3x=U[3*uoff1]; m3y=U[3*uoff1+1];
|
225
|
|
226
|
x0=v[offset];
|
227
|
y0=v[offset+1];
|
228
|
x1=v[halfbs+offset];
|
229
|
y1=v[halfbs+offset+1];
|
230
|
x2=v[blocksize+offset];
|
231
|
y2=v[blocksize+offset+1];
|
232
|
x3=v[threehalfbs+offset];
|
233
|
y3=v[threehalfbs+offset+1];
|
234
|
|
235
|
xs02=x0+x2; xs13=x1+x3;
|
236
|
v[offset]=xs02+xs13;
|
237
|
t1x=xs02-xs13;
|
238
|
|
239
|
ys02=y0+y2; ys13=y1+y3;
|
240
|
v[offset+1]=ys02+ys13;
|
241
|
t1y=ys02-ys13;
|
242
|
|
243
|
v[halfbs+offset] =t1x*m2x-t1y*m2y;
|
244
|
v[halfbs+offset+1] =t1y*m2x+t1x*m2y;
|
245
|
|
246
|
xd02=x0-x2; yd13=y1-y3;
|
247
|
t3x=xd02-yd13; t2x=xd02+yd13;
|
248
|
yd02=y0-y2; xd13=x1-x3;
|
249
|
t3y=yd02+xd13; t2y=yd02-xd13;
|
250
|
v[blocksize+offset] =t2x*m1x-t2y*m1y;
|
251
|
v[blocksize+offset+1]=t2y*m1x+t2x*m1y;
|
252
|
v[threehalfbs+offset] =t3x*m3x-t3y*m3y;
|
253
|
v[threehalfbs+offset+1]=t3y*m3x+t3x*m3y;
|
254
|
}
|
255
|
}
|
256
|
}
|
257
|
|
258
|
// Do smallest blocks; size is either 4 or 2
|
259
|
if(blocksize==4) {
|
260
|
blockcount=vecsize/4;
|
261
|
for(block=0,v=dvec;block<blockcount;block++,v+=8) {
|
262
|
x0=v[0]; y0=v[1]; x1=v[2]; y1=v[3];
|
263
|
x2=v[4]; y2=v[5]; x3=v[6]; y3=v[7];
|
264
|
|
265
|
xs02=x0+x2;
|
266
|
xs13=x1+x3;
|
267
|
v[0]=xs02+xs13;
|
268
|
v[2]=xs02-xs13;
|
269
|
|
270
|
ys02=y0+y2;
|
271
|
ys13=y1+y3;
|
272
|
v[1]=ys02+ys13;
|
273
|
v[3]=ys02-ys13;
|
274
|
|
275
|
xd02=x0-x2;
|
276
|
yd13=y1-y3;
|
277
|
v[4]=xd02+yd13;
|
278
|
v[6]=xd02-yd13;
|
279
|
|
280
|
yd02=y0-y2;
|
281
|
xd13=x1-x3;
|
282
|
v[5]=yd02-xd13;
|
283
|
v[7]=yd02+xd13;
|
284
|
}
|
285
|
}
|
286
|
else { // blocksize==2
|
287
|
blockcount=vecsize/2;
|
288
|
for(block=0,v=dvec;block<blockcount;block++,v+=4) {
|
289
|
x0=v[0]; y0=v[1];
|
290
|
x1=v[2]; y1=v[3];
|
291
|
v[0]=x0+x1; v[2]=x0-x1;
|
292
|
v[1]=y0+y1; v[3]=y0-y1;
|
293
|
}
|
294
|
}
|
295
|
}
|
296
|
|
297
|
void FFT::BaseDecTimeInverse(MyComplex *vec)
|
298
|
{ // In-place inverse FFT using Decimation in Time technique,
|
299
|
// *WITHOUT* resuffling of indices.
|
300
|
// NOTE 1: This code does not use the Complex class, because
|
301
|
// some compilers do not effectively optimize around
|
302
|
// Complex operations such as multiplication. So this
|
303
|
// routine just makes use of ordinary type "FFT_REAL_TYPE"
|
304
|
// variables, and assumes each Complex variable is
|
305
|
// actually two consecutive "MY_COMPLEX_REAL_TYPE" variables.
|
306
|
// NOTE 2: See notes in MJD's micromagnetics notebook, 11-Sep-96
|
307
|
// (p. 62) and 29-Sep-96 (p. 69).
|
308
|
// NOTE 3: This code has been optimized for performance on cascade.cam,
|
309
|
// a PentiumPro 200 MHz machine using a stock gcc 2.7.2
|
310
|
// compiler. In particular, the x86 chips suffer from a
|
311
|
// shortage of registers.
|
312
|
// NOTE 4: Some compromise made to RISC architectures on 27-May-1997,
|
313
|
// by moving all loads before any stores in main loop. As
|
314
|
// done, it hurts performance on PentiumPro-200 only a couple
|
315
|
// of percent. (mjd)
|
316
|
|
317
|
#define datat float /*Added by Guru on 11/10/2010: created data type datat; specified datat as required*/
|
318
|
typedef datat FFT_REAL_TYPE; /*Added by Guru on 11/10/2010: converted FFT_REAL_TYPE to datat from double*/
|
319
|
|
320
|
(vecsize==1) return; // Nothing to do
|
321
|
|
322
|
MY_COMPLEX_REAL_TYPE *v;
|
323
|
|
324
|
MY_COMPLEX_REAL_TYPE const *const U=(MY_COMPLEX_REAL_TYPE *)Uinverse;
|
325
|
MY_COMPLEX_REAL_TYPE *const dvec=(MY_COMPLEX_REAL_TYPE *)vec;
|
326
|
|
327
|
int block,blocksize,blockcount,offset,uoff1;
|
328
|
int halfbs,threehalfbs; // Half blocksize,3/2 blocksize
|
329
|
FFT_REAL_TYPE m1x,m1y,m2x,m2y,m3x,m3y;
|
330
|
FFT_REAL_TYPE x0,y0,x1,y1,x2,y2,x3,y3;
|
331
|
FFT_REAL_TYPE xs01,ys01,xd01,yd01,xs23,ys23,xd23,yd23;
|
332
|
FFT_REAL_TYPE t1x,t1y,t2x,t2y,t3x,t3y;
|
333
|
|
334
|
// Do smallest blocks; size is either 4 or 2
|
335
|
if(vecsize>2) {
|
336
|
blockcount=vecsize/4;
|
337
|
for(block=0,v=dvec;block<blockcount;block++,v+=8) {
|
338
|
x0=v[0]; y0=v[1];
|
339
|
x1=v[2]; y1=v[3];
|
340
|
x2=v[4]; y2=v[5];
|
341
|
x3=v[6]; y3=v[7];
|
342
|
xs01=x0+x1; xs23=x2+x3; // See NOTE 3 above
|
343
|
v[0]=xs01+xs23; v[4]=xs01-xs23;
|
344
|
ys01=y0+y1; ys23=y2+y3;
|
345
|
v[1]=ys01+ys23; v[5]=ys01-ys23;
|
346
|
xd01=x0-x1; yd23=y2-y3;
|
347
|
v[2]=xd01-yd23; v[6]=xd01+yd23;
|
348
|
yd01=y0-y1; xd23=x2-x3;
|
349
|
v[3]=yd01+xd23; v[7]=yd01-xd23;
|
350
|
}
|
351
|
}
|
352
|
else { // vecsize==2
|
353
|
x0=dvec[0]; y0=dvec[1];
|
354
|
x1=dvec[2]; y1=dvec[3];
|
355
|
dvec[0]=x0+x1;
|
356
|
dvec[2]=x0-x1;
|
357
|
dvec[1]=y0+y1;
|
358
|
dvec[3]=y0-y1;
|
359
|
return;
|
360
|
}
|
361
|
|
362
|
// Blocksize>4
|
363
|
for(blocksize=16,blockcount=vecsize/16;blocksize<=vecsize;
|
364
|
blocksize*=4,blockcount/=4) {
|
365
|
// Loop through double-step matric multiplications
|
366
|
halfbs=blocksize/2; threehalfbs=blocksize+halfbs;
|
367
|
for(block=0,v=dvec;block<blockcount;block++,v+=2*blocksize) {
|
368
|
for(offset=0;offset<blocksize/2;offset+=2) {
|
369
|
x0=v[offset];
|
370
|
y0=v[offset+1];
|
371
|
t2x=v[blocksize+offset];
|
372
|
t2y=v[blocksize+offset+1];
|
373
|
uoff1=offset*blockcount;
|
374
|
m1x=U[uoff1]; m1y=U[uoff1+1];
|
375
|
x2=t2x*m1x-t2y*m1y;
|
376
|
y2=t2y*m1x+t2x*m1y;
|
377
|
|
378
|
m2x=U[2*uoff1]; m2y=U[2*uoff1+1];
|
379
|
t1x=v[halfbs+offset];
|
380
|
t1y=v[halfbs+offset+1];
|
381
|
x1=t1x*m2x-t1y*m2y;
|
382
|
y1=t1y*m2x+t1x*m2y;
|
383
|
|
384
|
t3x=v[threehalfbs+offset];
|
385
|
t3y=v[threehalfbs+offset+1];
|
386
|
m3x=U[3*uoff1]; m3y=U[3*uoff1+1];
|
387
|
x3=t3x*m3x-t3y*m3y;
|
388
|
y3=t3y*m3x+t3x*m3y;
|
389
|
|
390
|
xs01=x0+x1; xs23=x2+x3;
|
391
|
v[ offset ] = xs01+xs23;
|
392
|
v[ blocksize+offset ] = xs01-xs23;
|
393
|
|
394
|
ys01=y0+y1; ys23=y2+y3;
|
395
|
v[ offset+1] = ys01+ys23;
|
396
|
v[ blocksize+offset+1] = ys01-ys23;
|
397
|
|
398
|
yd01=y0-y1; xd23=x2-x3;
|
399
|
v[ halfbs +offset+1] = yd01+xd23;
|
400
|
v[threehalfbs+offset+1] = yd01-xd23;
|
401
|
|
402
|
xd01=x0-x1; yd23=y2-y3;
|
403
|
v[ halfbs +offset ] = xd01-yd23;
|
404
|
v[threehalfbs+offset ] = xd01+yd23;
|
405
|
}
|
406
|
}
|
407
|
}
|
408
|
|
409
|
if(blocksize==2*vecsize) {
|
410
|
// We still have to do one single-step matrix multiplication
|
411
|
blocksize=vecsize; v=dvec;
|
412
|
for(offset=0;offset<blocksize;offset+=2,v+=2) {
|
413
|
x0=v[0]; y0=v[1];
|
414
|
t1x=v[vecsize]; t1y=v[vecsize+1];
|
415
|
m1x=U[offset]; m1y=U[offset+1];
|
416
|
x1=t1x*m1x-t1y*m1y; y1=t1y*m1x+t1x*m1y;
|
417
|
v[0] = x0+x1;
|
418
|
v[vecsize] = x0-x1;
|
419
|
v[1] = y0+y1;
|
420
|
v[vecsize+1] = y0-y1;
|
421
|
}
|
422
|
}
|
423
|
|
424
|
}
|
425
|
|
426
|
const REALWIDE FFTReal2D::CRRCspeedratio=1.10;
|
427
|
/// Relative speed of ForwardCR as compared to ForwardRC.
|
428
|
/// If bigger than 1, then ForwardCR is faster. This will
|
429
|
/// be machine & compiler dependent...oh well.
|
430
|
|
431
|
void FFTReal2D::Setup(int size1,int size2)
|
432
|
{ // Note: This routine is also called by FFTReal2D::SetupInverse()
|
433
|
if(size1==vecsize1 && size2==vecsize2) return; // Nothing to do
|
434
|
|
435
|
// Check that sizes are powers of 2, and >= 1. Also extract
|
436
|
// base-2 log of sizes
|
437
|
if(size1<1) PlainError(1,"Error in FFTReal2D::Setup(int): "
|
438
|
"Requested size1 (%d) must be >=1",size1);
|
439
|
if(size2<1) PlainError(1,"Error in FFTReal2D::Setup(int): "
|
440
|
"Requested size2 (%d) must be >=1",size2);
|
441
|
|
442
|
int k;
|
443
|
if(size1==1) {
|
444
|
logsize1=0;
|
445
|
} else {
|
446
|
for(k=size1,logsize1=1;k>2;k/=2,logsize1++)
|
447
|
if(k%2!=0) PlainError(1,"Error in FFTReal2D::Setup(int): "
|
448
|
"Requested size1 (%d) is not a power of 2",size1);
|
449
|
}
|
450
|
if(size2==1) {
|
451
|
logsize2=0;
|
452
|
} else {
|
453
|
for(k=size2,logsize2=1;k>2;k/=2,logsize2++)
|
454
|
if(k%2!=0) PlainError(1,"Error in FFTReal2D::Setup(int): "
|
455
|
"Requested size2 (%d) is not a power of 2",size2);
|
456
|
}
|
457
|
|
458
|
// Allocate new space
|
459
|
ReleaseMemory();
|
460
|
scratch=new MyComplex[OC_MAX(size1,size2)];
|
461
|
scratchb=new MyComplex[OC_MAX(size1,size2)];
|
462
|
vecsize1=size1; vecsize2=size2;
|
463
|
}
|
464
|
|
465
|
void FFTReal2D::SetupInverse(int size1,int size2)
|
466
|
{
|
467
|
if(size1==vecsize1 && size2==vecsize2 && workarr!=NULL)
|
468
|
return; // Nothing to do
|
469
|
Setup(size1,size2);
|
470
|
if(workarr!=NULL) { delete[] workarr[0]; delete[] workarr; } // Safety
|
471
|
int rowcount=(vecsize1/2)+1;
|
472
|
workarr=new MyComplex*[rowcount];
|
473
|
workarr[0]=new MyComplex[rowcount*vecsize2];
|
474
|
for(int i=1;i<rowcount;i++) workarr[i]=workarr[i-1]+vecsize2;
|
475
|
}
|
476
|
|
477
|
void FFTReal2D::ReleaseMemory()
|
478
|
{
|
479
|
if(vecsize1==0 || vecsize2==0) return;
|
480
|
delete[] scratch; scratch=NULL;
|
481
|
delete[] scratchb; scratchb=NULL;
|
482
|
if(workarr!=NULL) {
|
483
|
delete[] workarr[0];
|
484
|
delete[] workarr;
|
485
|
workarr=NULL;
|
486
|
}
|
487
|
vecsize1=0; vecsize2=0;
|
488
|
fft1.ReleaseMemory(); fft2.ReleaseMemory();
|
489
|
}
|
490
|
|
491
|
void FFTReal2D::FillOut(int csize1,int csize2,MyComplex** carr)
|
492
|
{ // This routine assumes carr is a top-half-filled DFT of
|
493
|
// a real function, and fills in the bottom half using the
|
494
|
// relation
|
495
|
// carr[csize1-i][csize2-j]=conj(carr[i][j])
|
496
|
// for i>csize1/2, with the second indices interpreted 'mod csize2'.
|
497
|
int i,j;
|
498
|
for(i=1;i<csize1/2;i++) {
|
499
|
carr[csize1-i][0]=conj(carr[i][0]);
|
500
|
for(j=1;j<csize2;j++) {
|
501
|
carr[csize1-i][j]=conj(carr[i][csize2-j]);
|
502
|
}
|
503
|
}
|
504
|
|
505
|
}
|
506
|
|
507
|
void FFTReal2D::ForwardCR(int rsize1,int rsize2,
|
508
|
const double* const* rarr,
|
509
|
int csize1,int csize2,MyComplex** carr)
|
510
|
{
|
511
|
Setup(OC_MAX(1,2*(csize1-1)),csize2); // Safety
|
512
|
if(vecsize1<2 || vecsize2<2)
|
513
|
PlainError(1,"Error in FFTReal2D::ForwardCR(...): "
|
514
|
"Full array dimensions (%dx%d) must be both >=2",
|
515
|
vecsize1,vecsize2);
|
516
|
|
517
|
int i,j;
|
518
|
FFT_REAL_TYPE x1,x2,y1,y2;
|
519
|
FFT_REAL_TYPE xb1,xb2,yb1,yb2;
|
520
|
|
521
|
// Do FFT on columns, 2 at a time
|
522
|
for(j=0;j+3<rsize2;j+=4) {
|
523
|
// Pack into MyComplex scratch array
|
524
|
for(i=0;i<rsize1;i++) {
|
525
|
scratch[i]=MyComplex(rarr[i][j],rarr[i][j+1]);
|
526
|
scratchb[i]=MyComplex(rarr[i][j+2],rarr[i][j+3]);
|
527
|
}
|
528
|
// Zero pad scratch space
|
529
|
for(i=rsize1;i<vecsize1;i++) scratch[i]=MyComplex(0.,0.);
|
530
|
for(i=rsize1;i<vecsize1;i++) scratchb[i]=MyComplex(0.,0.);
|
531
|
// Do complex FFT
|
532
|
fft1.ForwardDecFreq(vecsize1,scratchb);
|
533
|
fft1.ForwardDecFreq(vecsize1,scratch);
|
534
|
// Unpack into top half of 2D complex array
|
535
|
// Rows 0 & vecsize1/2 are real-valued, so pack them together
|
536
|
// into row 0 (row 0 as real part, row vecsize1/2 as imag. part).
|
537
|
carr[0][j] =MyComplex(scratch[0].real(),scratch[vecsize1/2].real());
|
538
|
carr[0][j+1]=MyComplex(scratch[0].imag(),scratch[vecsize1/2].imag());
|
539
|
carr[0][j+2]=MyComplex(scratchb[0].real(),scratchb[vecsize1/2].real());
|
540
|
carr[0][j+3]=MyComplex(scratchb[0].imag(),scratchb[vecsize1/2].imag());
|
541
|
for(i=1;i<vecsize1/2;i++) { // ASSUMES vecsize1 is even!
|
542
|
x1=scratch[i].real()/2; y1=scratch[i].imag()/2;
|
543
|
x2=scratch[vecsize1-i].real()/2; y2=scratch[vecsize1-i].imag()/2;
|
544
|
xb1=scratchb[i].real()/2; yb1=scratchb[i].imag()/2;
|
545
|
xb2=scratchb[vecsize1-i].real()/2; yb2=scratchb[vecsize1-i].imag()/2;
|
546
|
carr[i][j] =MyComplex(x1+x2,y1-y2);
|
547
|
carr[i][j+1] =MyComplex(y1+y2,x2-x1);
|
548
|
carr[i][j+2] =MyComplex(xb1+xb2,yb1-yb2);
|
549
|
carr[i][j+3] =MyComplex(yb1+yb2,xb2-xb1);
|
550
|
}
|
551
|
}
|
552
|
// Case rsize2 not divisible by 4
|
553
|
for(;j<rsize2;j+=2) {
|
554
|
// Pack into complex scratch array
|
555
|
if(j+1<rsize2) {
|
556
|
for(i=0;i<rsize1;i++) scratch[i]=MyComplex(rarr[i][j],rarr[i][j+1]);
|
557
|
}
|
558
|
else { // rsize2 == 1 mod 2.
|
559
|
for(i=0;i<rsize1;i++) scratch[i]=MyComplex(rarr[i][j],0.);
|
560
|
}
|
561
|
for(i=rsize1;i<vecsize1;i++) scratch[i]=MyComplex(0.,0.);
|
562
|
fft1.ForwardDecFreq(vecsize1,scratch);
|
563
|
carr[0][j] =MyComplex(scratch[0].real(),scratch[vecsize1/2].real());
|
564
|
carr[0][j+1]=MyComplex(scratch[0].imag(),scratch[vecsize1/2].imag());
|
565
|
for(i=1;i<vecsize1/2;i++) { // ASSUMES vecsize1 is even!
|
566
|
x1=scratch[i].real()/2; y1=scratch[i].imag()/2;
|
567
|
x2=scratch[vecsize1-i].real()/2; y2=scratch[vecsize1-i].imag()/2;
|
568
|
carr[i][j] =MyComplex(x1+x2,y1-y2);
|
569
|
carr[i][j+1] =MyComplex(y1+y2,x2-x1);
|
570
|
}
|
571
|
}
|
572
|
// Zero-pad remaining columns
|
573
|
if(rsize2<csize2) {
|
574
|
for(i=0;i<csize1;i++) for(j=rsize2;j<csize2;j++)
|
575
|
carr[i][j]=MyComplex(0.,0.);
|
576
|
// Note: One _may_ be able to gain a few percent speedup
|
577
|
// by using the 'memcpy' C-library routine.
|
578
|
}
|
579
|
|
580
|
// Do FFT on top half of rows
|
581
|
for(i=0;i<csize1-1;i++) fft2.ForwardDecFreq(csize2,carr[i]);
|
582
|
|
583
|
// Pull out row 0 & row csize1-1 from (packed) row 0
|
584
|
carr[csize1-1][0] = MyComplex(carr[0][0].imag(),0.);
|
585
|
carr[0][0] = MyComplex(carr[0][0].real(),0.);
|
586
|
for(j=1;j<csize2/2;j++) {
|
587
|
x1=carr[0][j].real()/2; y1=carr[0][j].imag()/2;
|
588
|
x2=carr[0][csize2-j].real()/2; y2=carr[0][csize2-j].imag()/2;
|
589
|
MyComplex temp1(x1+x2,y1-y2);
|
590
|
MyComplex temp2(y1+y2,x2-x1);
|
591
|
carr[0][j] = temp1;
|
592
|
carr[0][csize2-j] = conj(temp1);
|
593
|
carr[csize1-1][j] = temp2;
|
594
|
carr[csize1-1][csize2-j] = conj(temp2);
|
595
|
}
|
596
|
carr[csize1-1][csize2/2] = MyComplex(carr[0][csize2/2].imag(),0.);
|
597
|
carr[0][csize2/2] = MyComplex(carr[0][csize2/2].real(),0.);
|
598
|
}
|
599
|
|
600
|
void FFTReal2D::ForwardRC(int rsize1,int rsize2,
|
601
|
const double* const* rarr,
|
602
|
int csize1,int csize2,MyComplex** carr)
|
603
|
{
|
604
|
Setup(OC_MAX(1,2*(csize1-1)),csize2); // Safety
|
605
|
if(vecsize1<2 || vecsize2<2)
|
606
|
PlainError(1,"Error in FFTReal2D::ForwardRC(...): "
|
607
|
"Full array dimensions (%dx%d) must be both >=2",
|
608
|
vecsize1,vecsize2);
|
609
|
|
610
|
int i,j;
|
611
|
FFT_REAL_TYPE x1,x2,y1,y2;
|
612
|
FFT_REAL_TYPE xb1,xb2,yb1,yb2;
|
613
|
|
614
|
// Do row FFT's
|
615
|
for(i=0;i+1<rsize1;i+=2) {
|
616
|
// Pack 'MyComplex' row
|
617
|
for(j=0;j<rsize2;j++)
|
618
|
carr[i/2][j]=MyComplex(rarr[i][j],rarr[i+1][j]);
|
619
|
for(j=rsize2;j<csize2;j++) carr[i/2][j]=MyComplex(0.,0.); // Zero pad
|
620
|
// Do FFT
|
621
|
fft2.ForwardDecFreq(csize2,carr[i/2]);
|
622
|
}
|
623
|
for(;i<rsize1;i+=2) { // In case rsize1 == 1 mod 2
|
624
|
for(j=0;j<rsize2;j++)
|
625
|
carr[i/2][j]=MyComplex(rarr[i][j],0.);
|
626
|
for(j=rsize2;j<csize2;j++) carr[i/2][j]=MyComplex(0.,0.); // Zero pad
|
627
|
// Do FFT
|
628
|
fft2.ForwardDecFreq(csize2,carr[i/2]);
|
629
|
}
|
630
|
// Any remaining rows are zero padding on the fly during
|
631
|
// the column FFT's (see below).
|
632
|
|
633
|
// Do column FFT's
|
634
|
// Do column 0 and csize2/2, making use of the fact that
|
635
|
// these 2 columns are 'real'.
|
636
|
for(i=0;i<(rsize1+1)/2;i++) {
|
637
|
x1=carr[i][0].real(); x2=carr[i][0].imag();
|
638
|
y1=carr[i][csize2/2].real(); y2=carr[i][csize2/2].imag();
|
639
|
scratch[2*i] = MyComplex(x1,y1);
|
640
|
scratch[(2*i)+1] = MyComplex(x2,y2);
|
641
|
}
|
642
|
for(i*=2;i<vecsize1;i++) scratch[i]=MyComplex(0.,0.); // Zero pad
|
643
|
fft1.ForwardDecFreq(vecsize1,scratch);
|
644
|
carr[0][0] = MyComplex(scratch[0].real(),0.);
|
645
|
carr[0][csize2/2] = MyComplex(scratch[0].imag(),0.);
|
646
|
for(i=1;i<csize1-1;i++) {
|
647
|
x1=scratch[i].real()/2; y1=scratch[i].imag()/2;
|
648
|
x2=scratch[vecsize1-i].real()/2; y2=scratch[vecsize1-i].imag()/2;
|
649
|
carr[i][0] = MyComplex(x1+x2,y1-y2);
|
650
|
carr[i][csize2/2] = MyComplex(y1+y2,x2-x1);
|
651
|
}
|
652
|
carr[csize1-1][0] = MyComplex(scratch[csize1-1].real(),0.);
|
653
|
carr[csize1-1][csize2/2] = MyComplex(scratch[csize1-1].imag(),0.);
|
654
|
|
655
|
// Do remaining columns
|
656
|
for(j=1;j+1<csize2/2;j+=2) {
|
657
|
for(i=0;i<(rsize1+1)/2;i++) {
|
658
|
x1 =carr[i][j].real()/2; y1 =carr[i][j].imag()/2;
|
659
|
xb1=carr[i][j+1].real()/2; yb1=carr[i][j+1].imag()/2;
|
660
|
xb2=carr[i][csize2-1-j].real()/2; yb2=carr[i][csize2-1-j].imag()/2;
|
661
|
x2 =carr[i][csize2-j].real()/2; y2 =carr[i][csize2-j].imag()/2;
|
662
|
scratch[2*i] = MyComplex(x1+x2,y1-y2);
|
663
|
scratch[(2*i)+1] = MyComplex(y1+y2,x2-x1);
|
664
|
scratchb[2*i] = MyComplex(xb1+xb2,yb1-yb2);
|
665
|
scratchb[(2*i)+1] = MyComplex(yb1+yb2,xb2-xb1);
|
666
|
}
|
667
|
for(i*=2;i<vecsize1;i++)
|
668
|
scratch[i]= scratchb[i]=MyComplex(0.,0.); // Zero pad
|
669
|
fft1.ForwardDecFreq(vecsize1,scratchb);
|
670
|
fft1.ForwardDecFreq(vecsize1,scratch);
|
671
|
carr[0][j]=scratch[0]; carr[0][csize2-j]=conj(scratch[0]);
|
672
|
carr[0][j+1]=scratchb[0]; carr[0][csize2-1-j]=conj(scratchb[0]);
|
673
|
for(i=1;i<csize1;i++) {
|
674
|
carr[i][j]=scratch[i];
|
675
|
carr[i][j+1]=scratchb[i];
|
676
|
carr[i][csize2-1-j]=conj(scratchb[vecsize1-i]);
|
677
|
carr[i][csize2-j]=conj(scratch[vecsize1-i]);
|
678
|
}
|
679
|
}
|
680
|
// There should be 1 column left over
|
681
|
if(j<csize2/2) {
|
682
|
for(i=0;i<(rsize1+1)/2;i++) {
|
683
|
x1=carr[i][j].real()/2; y1=carr[i][j].imag()/2;
|
684
|
x2=carr[i][csize2-j].real()/2; y2=carr[i][csize2-j].imag()/2;
|
685
|
scratch[2*i] = MyComplex(x1+x2,y1-y2);
|
686
|
scratch[(2*i)+1] = MyComplex(y1+y2,x2-x1);
|
687
|
}
|
688
|
for(i*=2;i<vecsize1;i++) scratch[i]=MyComplex(0.,0.); // Zero pad
|
689
|
fft1.ForwardDecFreq(vecsize1,scratch);
|
690
|
carr[0][j]=scratch[0]; carr[0][csize2-j]=conj(scratch[0]);
|
691
|
for(i=1;i<csize1;i++) {
|
692
|
carr[i][j]=scratch[i];
|
693
|
carr[i][csize2-j]=conj(scratch[vecsize1-i]);
|
694
|
}
|
695
|
}
|
696
|
}
|
697
|
|
698
|
void FFTReal2D::InverseRC(int csize1,int csize2,
|
699
|
const MyComplex* const* carr,
|
700
|
int rsize1,int rsize2,double** rarr)
|
701
|
{
|
702
|
SetupInverse(OC_MAX(1,2*(csize1-1)),csize2); // Safety.
|
703
|
if(vecsize1<2 || vecsize2<2)
|
704
|
PlainError(1,"Error in FFTReal2D::InverseRC(...): "
|
705
|
"Full array dimensions (%dx%d) must be both >=2",
|
706
|
vecsize1,vecsize2);
|
707
|
|
708
|
int i,j;
|
709
|
FFT_REAL_TYPE x1,y1,x2,y2;
|
710
|
FFT_REAL_TYPE xb1,yb1,xb2,yb2;
|
711
|
|
712
|
// Do row inverse FFT's
|
713
|
// Handle the first & csize1'th row specially. These rows are
|
714
|
// the DFT's of real sequences, so they each satisfy the conjugate
|
715
|
// symmetry condition
|
716
|
workarr[0][0]=MyComplex(carr[0][0].real(),carr[csize1-1][0].real());
|
717
|
for(j=1;j<csize2/2;j++) {
|
718
|
x1=carr[0][j].real(); y1=carr[0][j].imag();
|
719
|
x2=carr[csize1-1][j].real(); y2=carr[csize1-1][j].imag();
|
720
|
workarr[0][j] = MyComplex(x1-y2,x2+y1);
|
721
|
workarr[0][csize2-j] = MyComplex(x1+y2,x2-y1);
|
722
|
}
|
723
|
workarr[0][csize2/2]=MyComplex(carr[0][csize2/2].real(),
|
724
|
carr[csize1-1][csize2/2].real());
|
725
|
fft2.InverseDecTime(csize2,workarr[0],1.);
|
726
|
|
727
|
// iFFT the remaining rows
|
728
|
for(i=1;i<csize1-1;i++) {
|
729
|
for(j=0;j<csize2;j++) workarr[i][j]=carr[i][j];
|
730
|
fft2.InverseDecTime(csize2,workarr[i],1.);
|
731
|
}
|
732
|
|
733
|
// Now do iFFT's on columns. These are conj. symmetric, so we
|
734
|
// process them 2 at a time. Also, recall the 1st row of workarr
|
735
|
// contains the iFFT's of the 1st and csize1'th row of the given carr.
|
736
|
for(j=0;j+3<rsize2;j+=4) {
|
737
|
scratch[0]=
|
738
|
MyComplex(workarr[0][j].real(),workarr[0][j+1].real());
|
739
|
scratch[csize1-1]=
|
740
|
MyComplex(workarr[0][j].imag(),workarr[0][j+1].imag());
|
741
|
scratchb[0]=
|
742
|
MyComplex(workarr[0][j+2].real(),workarr[0][j+3].real());
|
743
|
scratchb[csize1-1]=
|
744
|
MyComplex(workarr[0][j+2].imag(),workarr[0][j+3].imag());
|
745
|
for(i=1;i<csize1-1;i++) {
|
746
|
x1 =workarr[i][j].real(); y1 =workarr[i][j].imag();
|
747
|
x2 =workarr[i][j+1].real(); y2 =workarr[i][j+1].imag();
|
748
|
xb1=workarr[i][j+2].real(); yb1=workarr[i][j+2].imag();
|
749
|
xb2=workarr[i][j+3].real(); yb2=workarr[i][j+3].imag();
|
750
|
scratch[i] = MyComplex(x1-y2,x2+y1);
|
751
|
scratch[vecsize1-i] = MyComplex(x1+y2,x2-y1);
|
752
|
scratchb[i] = MyComplex(xb1-yb2,xb2+yb1);
|
753
|
scratchb[vecsize1-i] = MyComplex(xb1+yb2,xb2-yb1);
|
754
|
}
|
755
|
fft1.InverseDecTime(vecsize1,scratchb,FFT_REAL_TYPE(vecsize1*vecsize2));
|
756
|
fft1.InverseDecTime(vecsize1,scratch,FFT_REAL_TYPE(vecsize1*vecsize2));
|
757
|
for(i=0;i<rsize1;i++) {
|
758
|
rarr[i][j] =scratch[i].real();
|
759
|
rarr[i][j+1]=scratch[i].imag();
|
760
|
rarr[i][j+2]=scratchb[i].real();
|
761
|
rarr[i][j+3]=scratchb[i].imag();
|
762
|
}
|
763
|
}
|
764
|
// Remaining columns if rsize2 is not divisible by 4. OTOH, csize2
|
765
|
// *is* divisible by 2, so we can assume workarr[i][j+1] exists.
|
766
|
for(;j<rsize2;j+=2) {
|
767
|
scratch[0]=
|
768
|
MyComplex(workarr[0][j].real(),workarr[0][j+1].real());
|
769
|
scratch[csize1-1]=
|
770
|
MyComplex(workarr[0][j].imag(),workarr[0][j+1].imag());
|
771
|
for(i=1;i<csize1-1;i++) {
|
772
|
x1 =workarr[i][j].real(); y1 =workarr[i][j].imag();
|
773
|
x2 =workarr[i][j+1].real(); y2 =workarr[i][j+1].imag();
|
774
|
scratch[i] = MyComplex(x1-y2,x2+y1);
|
775
|
scratch[vecsize1-i] = MyComplex(x1+y2,x2-y1);
|
776
|
}
|
777
|
fft1.InverseDecTime(vecsize1,scratch,FFT_REAL_TYPE(vecsize1*vecsize2));
|
778
|
for(i=0;i<rsize1;i++) {
|
779
|
rarr[i][j] =scratch[i].real();
|
780
|
if(j+1<rsize2) rarr[i][j+1]=scratch[i].imag();
|
781
|
}
|
782
|
}
|
783
|
}
|
784
|
|
785
|
void FFTReal2D::InverseCR(int csize1,int csize2,
|
786
|
const MyComplex* const* carr,
|
787
|
int rsize1,int rsize2,double** rarr)
|
788
|
{
|
789
|
SetupInverse(OC_MAX(1,2*(csize1-1)),csize2); // Safety
|
790
|
if(vecsize1<2 || vecsize2<2)
|
791
|
PlainError(1,"Error in FFTReal2D::InverseCR(...): "
|
792
|
"Full array dimensions (%dx%d) must be both >=2",
|
793
|
vecsize1,vecsize2);
|
794
|
|
795
|
int i,j;
|
796
|
FFT_REAL_TYPE x1,y1,x2,y2,xb1,yb1,xb2,yb2;
|
797
|
|
798
|
// Column iFFT's
|
799
|
// Handle the first & csize2/2'th column specially. These cols are
|
800
|
// the DFT's of real sequences, so they each satisfy the conjugate
|
801
|
// symmetry condition
|
802
|
scratch[0]=MyComplex(carr[0][0].real(),carr[0][csize2/2].real());
|
803
|
for(i=1;i<csize1-1;i++) {
|
804
|
x1=carr[i][0].real(); y1=carr[i][0].imag();
|
805
|
x2=carr[i][csize2/2].real(); y2=carr[i][csize2/2].imag();
|
806
|
scratch[i] = MyComplex(x1-y2,x2+y1);
|
807
|
scratch[vecsize1-i] = MyComplex(x1+y2,x2-y1);
|
808
|
}
|
809
|
scratch[csize1-1]=MyComplex(carr[csize1-1][0].real(),
|
810
|
carr[csize1-1][csize2/2].real());
|
811
|
fft1.InverseDecTime(vecsize1,scratch,1);
|
812
|
for(i=0;i<vecsize1;i+=2) { // ASSUMES vecsize1 is even
|
813
|
// See packing note below.
|
814
|
workarr[i/2][0] = MyComplex(scratch[i].real(),scratch[i+1].real());
|
815
|
workarr[i/2][csize2/2] = MyComplex(scratch[i].imag(),scratch[i+1].imag());
|
816
|
}
|
817
|
//
|
818
|
// Do remaining column iFFT's, two at a time for better memory
|
819
|
// access locality.
|
820
|
for(j=1;j+1<csize2/2;j+=2) {
|
821
|
scratch[0]=carr[0][j];
|
822
|
scratchb[0]=carr[0][j+1];
|
823
|
for(i=1;i<csize1-1;i++) {
|
824
|
scratch[i]=carr[i][j];
|
825
|
scratchb[i]=carr[i][j+1];
|
826
|
scratchb[vecsize1-i]=conj(carr[i][csize2-1-j]);
|
827
|
scratch[vecsize1-i]=conj(carr[i][csize2-j]);
|
828
|
}
|
829
|
scratch[csize1-1]=carr[csize1-1][j];
|
830
|
scratchb[csize1-1]=carr[csize1-1][j+1];
|
831
|
fft1.InverseDecTime(vecsize1,scratchb,1.);
|
832
|
fft1.InverseDecTime(vecsize1,scratch,1.);
|
833
|
// Pack into workarr. Rows will be conjugate symmetric, so we
|
834
|
// can pack two rows into 1 via r[k]+i.r[k+1] -> workarr[k/2].
|
835
|
for(i=0;i<rsize1;i+=2) {
|
836
|
// CAREFUL! The above 'rsize1' bound may depend on how the
|
837
|
// iFFT's are calculated in the 'Row iFFT's' code section,
|
838
|
// and how 'i' is initialized.
|
839
|
x1=scratch[i].real(); y1=scratch[i].imag();
|
840
|
x2=scratch[i+1].real(); y2=scratch[i+1].imag();
|
841
|
xb1=scratchb[i].real(); yb1=scratchb[i].imag();
|
842
|
xb2=scratchb[i+1].real(); yb2=scratchb[i+1].imag();
|
843
|
workarr[i/2][j] = MyComplex(x1-y2,x2+y1);
|
844
|
workarr[i/2][j+1] = MyComplex(xb1-yb2,xb2+yb1);
|
845
|
workarr[i/2][csize2-j-1] = MyComplex(xb1+yb2,xb2-yb1);
|
846
|
workarr[i/2][csize2-j] = MyComplex(x1+y2,x2-y1);
|
847
|
}
|
848
|
}
|
849
|
// There should be 1 column left over
|
850
|
if((j=(csize2/2)-1)%2==1) {
|
851
|
// Column (csize2/2)-1 *not* processed above
|
852
|
scratch[0]=carr[0][j];
|
853
|
for(i=1;i<csize1-1;i++) {
|
854
|
scratch[i]=carr[i][j];
|
855
|
scratch[vecsize1-i]=conj(carr[i][csize2-j]);
|
856
|
}
|
857
|
scratch[csize1-1]=carr[csize1-1][j];
|
858
|
fft1.InverseDecTime(vecsize1,scratch,1);
|
859
|
for(i=0;i<rsize1;i+=2) {
|
860
|
// CAREFUL! The above 'rsize1' bound may depend on how the
|
861
|
// iFFT's are calculated in the 'Row iFFT's' code section,
|
862
|
// and how 'i' is initialized.
|
863
|
x1=scratch[i].real(); y1=scratch[i].imag();
|
864
|
x2=scratch[i+1].real(); y2=scratch[i+1].imag();
|
865
|
workarr[i/2][j] = MyComplex(x1-y2,x2+y1);
|
866
|
workarr[i/2][csize2-j] = MyComplex(x1+y2,x2-y1);
|
867
|
}
|
868
|
}
|
869
|
|
870
|
// Row iFFT's
|
871
|
for(i=0;i<rsize1;i+=2) {
|
872
|
fft2.InverseDecTime(vecsize2,workarr[i/2],
|
873
|
FFT_REAL_TYPE(vecsize1*vecsize2));
|
874
|
for(j=0;j<rsize2;j++) rarr[i][j] = workarr[i/2][j].real();
|
875
|
if(i+1<rsize1) {
|
876
|
for(j=0;j<rsize2;j++) rarr[i+1][j] = workarr[i/2][j].imag();
|
877
|
}
|
878
|
}
|
879
|
}
|
880
|
|
881
|
void FFTReal2D::Forward1D(int rsize1,int rsize2,
|
882
|
const double* const* rarr,
|
883
|
int csize1,int csize2,MyComplex** carr)
|
884
|
{
|
885
|
// The ForwardRC/CR routines assume full array dimensions >1.
|
886
|
// This routine handles the special case where (at least) one of
|
887
|
// the dimension is 1, which degenerates into a simple 1D FFT.
|
888
|
if(csize1==1) { // Single row FFT
|
889
|
int j;
|
890
|
for(j=0;j<rsize2;j++)
|
891
|
carr[0][j]=MyComplex(rarr[0][j],0.);
|
892
|
for(;j<csize2;j++)
|
893
|
carr[0][j]=MyComplex(0.,0.);
|
894
|
fft2.ForwardDecFreq(csize2,carr[0]);
|
895
|
} else if(csize2==1) { // Single column FFT
|
896
|
int i;
|
897
|
for(i=0;i<rsize1;i++)
|
898
|
scratch[i]=MyComplex(rarr[i][0],0.0);
|
899
|
for(;i<vecsize1;i++)
|
900
|
scratch[i]=MyComplex(0.0,0.0);
|
901
|
fft1.ForwardDecFreq(vecsize1,scratch);
|
902
|
for(i=0;i<csize1;i++)
|
903
|
carr[i][0]=scratch[i]; // Last half, from csize1 to vecsize1
|
904
|
/// is conj. sym. since input is real.
|
905
|
} else {
|
906
|
PlainError(1,"Error in FFTReal2D::Forward1D(...): "
|
907
|
"One array dimension (of %dx%d) must be==1",
|
908
|
vecsize1,vecsize2);
|
909
|
}
|
910
|
}
|
911
|
|
912
|
void FFTReal2D::Inverse1D(int csize1,int csize2,
|
913
|
const MyComplex* const* carr,
|
914
|
int rsize1,int rsize2,double** rarr)
|
915
|
{
|
916
|
// The InverseRC/CR routines assume full array dimensions >1.
|
917
|
// This routine handles the special case where (at least) one of
|
918
|
// the dimension is 1, which degenerates into a simple 1D FFT.
|
919
|
if(csize1==1) { // Single row iFFT
|
920
|
int j;
|
921
|
for(j=0;j<csize2;j++)
|
922
|
scratch[j]=carr[0][j];
|
923
|
fft2.InverseDecTime(csize2,scratch);
|
924
|
for(j=0;j<rsize2;j++)
|
925
|
rarr[0][j]=scratch[j].real();
|
926
|
} else if(csize2==1) { // Single column iFFT
|
927
|
int i;
|
928
|
scratch[0]=carr[0][0];
|
929
|
for(i=1;i<csize1-1;i++) {
|
930
|
scratch[i]=carr[i][0];
|
931
|
scratch[vecsize1-i]=conj(carr[i][0]); // Last half obtained
|
932
|
/// by using conjugate symmetry of real data FFT.
|
933
|
}
|
934
|
scratch[csize1-1]=carr[csize1-1][0];
|
935
|
fft2.InverseDecTime(vecsize1,scratch);
|
936
|
for(i=0;i<rsize1;i++)
|
937
|
rarr[i][0]=scratch[i].real();
|
938
|
} else {
|
939
|
PlainError(1,"Error in FFTReal2D::Inverse1D(...): "
|
940
|
"One array dimension (of %dx%d) must be==1",
|
941
|
vecsize1,vecsize2);
|
942
|
}
|
943
|
}
|
944
|
|
945
|
void FFTReal2D::Forward(int rsize1,int rsize2,
|
946
|
const double* const* rarr,
|
947
|
int csize1,int csize2,MyComplex** carr)
|
948
|
{
|
949
|
if(csize2<rsize2)
|
950
|
PlainError(1,"Error in FFTRealD::Forward(int,int,OC_REAL8,**,int,int,"
|
951
|
"MyComplex**): csize2 (=%d) *must* be >= rsize2 (=%d)\n",
|
952
|
csize2,rsize2);
|
953
|
|
954
|
if(csize1<(rsize1/2)+1)
|
955
|
PlainError(1,"Error in FFTRealD::Forward(int,int,double**,int,int,"
|
956
|
"MyComplex**): csize1 (=%d) *must* be >= (rsize1/2)+1"
|
957
|
" (=%d)\n",csize1,(rsize1/2)+1);
|
958
|
|
959
|
Setup(OC_MAX(1,2*(csize1-1)),csize2);
|
960
|
if(vecsize1<1 || vecsize2<1) return; // Nothing to do
|
961
|
|
962
|
// Check for 1D degenerate cases
|
963
|
if(vecsize1==1 || vecsize2==1) {
|
964
|
Forward1D(rsize1,rsize2,rarr,csize1,csize2,carr);
|
965
|
} else {
|
966
|
// Determine which Forward routine to call (ForwardCR or ForwardRC)
|
967
|
// (Use double arithmetic to protect against integer overflow. If
|
968
|
// the two times are very close, then the choice doesn't really
|
969
|
// matter.)
|
970
|
// 1) Estimated (proportional) time for ForwardCR
|
971
|
REALWIDE crtime=REALWIDE(vecsize1*vecsize2)*REALWIDE(logsize2)
|
972
|
+ REALWIDE(vecsize1*rsize2)*REALWIDE(logsize1);
|
973
|
// 2) Estimated (proportional) time for ForwardRC
|
974
|
REALWIDE rctime=REALWIDE(rsize1*vecsize2)*REALWIDE(logsize2)
|
975
|
+ REALWIDE(vecsize1*vecsize2)*REALWIDE(logsize1);
|
976
|
// Introduce empirical adjustment factor
|
977
|
rctime*=CRRCspeedratio;
|
978
|
if(crtime<=rctime) ForwardCR(rsize1,rsize2,rarr,csize1,csize2,carr);
|
979
|
else ForwardRC(rsize1,rsize2,rarr,csize1,csize2,carr);
|
980
|
}
|
981
|
}
|
982
|
|
983
|
void FFTReal2D::Inverse(int csize1,int csize2,
|
984
|
const MyComplex* const* carr,
|
985
|
int rsize1,int rsize2,double** rarr)
|
986
|
{
|
987
|
if(csize2<rsize2)
|
988
|
PlainError(1,"Error in FFTRealD::Inverse(int,int,double**,"
|
989
|
"int,int,MyComplex**): csize2 (=%d) *must* be >="
|
990
|
" rsize2 (=%d)\n",csize2,rsize2);
|
991
|
|
992
|
if(csize1<(rsize1/2)+1)
|
993
|
PlainError(1,"Error in FFTRealD::Inverse(int,int,double**,"
|
994
|
"int,int,MyComplex**): csize1 (=%d) *must* be >="
|
995
|
" (rsize1/2)+1 (=%d)\n",csize1,(rsize1/2)+1);
|
996
|
|
997
|
SetupInverse(OC_MAX(1,2*(csize1-1)),csize2);
|
998
|
if(vecsize1<1 || vecsize2<1) return; // Nothing to do
|
999
|
|
1000
|
// Check for 1D degenerate cases
|
1001
|
if(vecsize1==1 || vecsize2==1) {
|
1002
|
Inverse1D(csize1,csize2,carr,rsize1,rsize2,rarr);
|
1003
|
} else {
|
1004
|
// Determine which Inverse routine to call (InverseRC or InverseCR)
|
1005
|
// (Use double arithmetic to protect against integer overflow.
|
1006
|
// If the two times are very close, then the choice doesn't really
|
1007
|
// matter.)
|
1008
|
// 1) Estimated (proportional) time for InverseRC (==ForwardCR)
|
1009
|
REALWIDE irctime=REALWIDE(vecsize1*vecsize2)*REALWIDE(logsize2)
|
1010
|
+ REALWIDE(vecsize1*rsize2)*REALWIDE(logsize1);
|
1011
|
// 2) Estimated (proportional) time for InverseCR (==ForwardRC)
|
1012
|
REALWIDE icrtime=REALWIDE(rsize1*vecsize2)*REALWIDE(logsize2)
|
1013
|
+ REALWIDE(vecsize1*vecsize2)*REALWIDE(logsize1);
|
1014
|
// Introduce empirical adjustment factor
|
1015
|
icrtime*=CRRCspeedratio;
|
1016
|
if(irctime<=icrtime) InverseRC(csize1,csize2,carr,rsize1,rsize2,rarr);
|
1017
|
else InverseCR(csize1,csize2,carr,rsize1,rsize2,rarr);
|
1018
|
}
|
1019
|
}
|
1020
|
|
1021
|
|
1022
|
#ifdef USE_MPI
|
1023
|
|
1024
|
static FFT fft1_mpi,fft2_mpi;
|
1025
|
static int vecsize1_mpi(0),vecsize2_mpi(0);
|
1026
|
static MyComplex* scratch_mpi(NULL);
|
1027
|
static MyComplex** workarr_mpi(NULL);
|
1028
|
|
1029
|
FFTReal2D_mpi::FFTReal2D_mpi()
|
1030
|
{
|
1031
|
}
|
1032
|
|
1033
|
void SetupMemory_mpi(int size1,int size2)
|
1034
|
{
|
1035
|
if(size1!=vecsize1_mpi) {
|
1036
|
if(scratch_mpi!=NULL) delete[] scratch_mpi;
|
1037
|
vecsize1_mpi=size1;
|
1038
|
scratch_mpi=new MyComplex[vecsize1_mpi];
|
1039
|
}
|
1040
|
if(size2!=vecsize2_mpi) {
|
1041
|
if(workarr_mpi!=NULL) {
|
1042
|
delete[] workarr_mpi[0];
|
1043
|
delete[] workarr_mpi;
|
1044
|
}
|
1045
|
vecsize2_mpi=size2;
|
1046
|
int rowcount=(vecsize1_mpi/2)+1;
|
1047
|
workarr_mpi=new MyComplex*[rowcount];
|
1048
|
workarr_mpi[0]=new MyComplex[rowcount*vecsize2_mpi];
|
1049
|
for(int i=1;i<rowcount;i++)
|
1050
|
workarr_mpi[i]=workarr_mpi[i-1]+vecsize2_mpi;
|
1051
|
}
|
1052
|
}
|
1053
|
|
1054
|
void ReleaseMemory_mpi()
|
1055
|
{
|
1056
|
fft1_mpi.ReleaseMemory();
|
1057
|
fft2_mpi.ReleaseMemory();
|
1058
|
if(scratch_mpi!=NULL) delete[] scratch_mpi;
|
1059
|
scratch_mpi=NULL;
|
1060
|
vecsize1_mpi=0;
|
1061
|
if(workarr_mpi!=NULL) {
|
1062
|
delete[] workarr_mpi[0];
|
1063
|
delete[] workarr_mpi;
|
1064
|
}
|
1065
|
workarr_mpi=NULL;
|
1066
|
vecsize2_mpi=0;
|
1067
|
}
|
1068
|
|
1069
|
void FFTReal2D_mpi::ReleaseMemory()
|
1070
|
{
|
1071
|
Mmsolve_MpiWakeUp(ReleaseMemory_mpi); // On slaves
|
1072
|
ReleaseMemory_mpi(); // On master
|
1073
|
}
|
1074
|
|
1075
|
static int vecsize1_mpi_b(0),vecsize2_mpi_b(0);
|
1076
|
static MyComplex **work1_mpi(NULL),**work2_mpi(NULL);
|
1077
|
|
1078
|
static void SetupMemory_mpi_base_b(int size1,int size2)
|
1079
|
{
|
1080
|
int i;
|
1081
|
if(size1!=vecsize1_mpi_b || size2!=vecsize2_mpi_b) {
|
1082
|
if(work1_mpi!=NULL) {
|
1083
|
delete[] work1_mpi[0];
|
1084
|
delete[] work1_mpi;
|
1085
|
}
|
1086
|
if(work2_mpi!=NULL) {
|
1087
|
delete[] work2_mpi[0];
|
1088
|
delete[] work2_mpi;
|
1089
|
}
|
1090
|
vecsize1_mpi_b=size1;
|
1091
|
vecsize2_mpi_b=size2;
|
1092
|
work1_mpi=new MyComplex*[vecsize1_mpi_b];
|
1093
|
work1_mpi[0]=new MyComplex[vecsize1_mpi_b*vecsize2_mpi_b];
|
1094
|
for(i=1;i<vecsize1_mpi_b;i++)
|
1095
|
work1_mpi[i]=work1_mpi[i-1]+vecsize2_mpi_b;
|
1096
|
int rowcount=vecsize2_mpi_b/2;
|
1097
|
work2_mpi=new MyComplex*[rowcount];
|
1098
|
work2_mpi[0]=new MyComplex[rowcount*vecsize1_mpi_b];
|
1099
|
for(i=1;i<rowcount;i++)
|
1100
|
work2_mpi[i]=work2_mpi[i-1]+vecsize1_mpi_b;
|
1101
|
}
|
1102
|
}
|
1103
|
|
1104
|
static void SetupMemory_mpi_slave_b()
|
1105
|
{
|
1106
|
int size1,size2;
|
1107
|
MPI_Bcast(&size1,1,MPI_INT,0,MPI_COMM_WORLD);
|
1108
|
MPI_Bcast(&size2,1,MPI_INT,0,MPI_COMM_WORLD);
|
1109
|
if(size1<1 || size2<1)
|
1110
|
PlainError(1,"Error propagating array size info in "
|
1111
|
"SetupMemory_mpi_slave_b(): size1=%d, size2=%d",
|
1112
|
size1,size2);
|
1113
|
SetupMemory_mpi_base_b(size1,size2);
|
1114
|
}
|
1115
|
|
1116
|
static void SetupMemory_mpi_master_b(int size1,int size2)
|
1117
|
{
|
1118
|
Mmsolve_MpiWakeUp(SetupMemory_mpi_slave_b);
|
1119
|
MPI_Bcast(&size1,1,MPI_INT,0,MPI_COMM_WORLD);
|
1120
|
MPI_Bcast(&size2,1,MPI_INT,0,MPI_COMM_WORLD);
|
1121
|
SetupMemory_mpi_base_b(size1,size2);
|
1122
|
}
|
1123
|
|
1124
|
void ForwardFFT1_mpi_slave_b()
|
1125
|
{
|
1126
|
// Get data
|
1127
|
int rowcount,colcount;
|
1128
|
colcount=vecsize2_mpi_b;
|
1129
|
MPI_Request request[1];
|
1130
|
MPI_Status status[1];
|
1131
|
MPI_Irecv(&rowcount,1,MPI_INT,0,1,MPI_COMM_WORLD,request);
|
1132
|
MPI_Waitall(1,request,status);
|
1133
|
MPI_Irecv(work1_mpi[0],rowcount*colcount,MMS_COMPLEX,0,2,
|
1134
|
MPI_COMM_WORLD,request);
|
1135
|
MPI_Waitall(1,request,status);
|
1136
|
|
1137
|
// Transform rows
|
1138
|
for(int i=0;i<rowcount;i++) {
|
1139
|
fft1_mpi.ForwardDecFreq(colcount,work1_mpi[i]);
|
1140
|
}
|
1141
|
|
1142
|
// Return results
|
1143
|
MPI_Isend(work1_mpi[0],rowcount*colcount,MMS_COMPLEX,0,3,
|
1144
|
MPI_COMM_WORLD,request);
|
1145
|
MPI_Waitall(1,request,status);
|
1146
|
}
|
1147
|
|
1148
|
void ForwardFFT2_mpi_slave_b()
|
1149
|
{
|
1150
|
// Get data
|
1151
|
int rowcount,colcount;
|
1152
|
colcount=vecsize1_mpi_b;
|
1153
|
MPI_Request request[1];
|
1154
|
MPI_Status status[1];
|
1155
|
MPI_Irecv(&rowcount,1,MPI_INT,0,1,MPI_COMM_WORLD,request);
|
1156
|
MPI_Waitall(1,request,status);
|
1157
|
MPI_Irecv(work2_mpi[0],rowcount*colcount,MMS_COMPLEX,0,2,
|
1158
|
MPI_COMM_WORLD,request);
|
1159
|
MPI_Waitall(1,request,status);
|
1160
|
|
1161
|
// Transform rows
|
1162
|
for(int i=0;i<rowcount;i++) {
|
1163
|
fft2_mpi.ForwardDecFreq(colcount,work2_mpi[i]);
|
1164
|
}
|
1165
|
|
1166
|
// Return results
|
1167
|
MPI_Isend(work2_mpi[0],rowcount*colcount,MMS_COMPLEX,0,3,
|
1168
|
MPI_COMM_WORLD,request);
|
1169
|
MPI_Waitall(1,request,status);
|
1170
|
}
|
1171
|
|
1172
|
void FFTReal2D_mpi::Forward(int rsize1,int rsize2,
|
1173
|
const double* const* rarr,
|
1174
|
int csize1,int csize2,MyComplex** carr)
|
1175
|
{ // Computes FFT of rsize1 x rsize2 double** rarr, leaving result in
|
1176
|
// csize1 x csize2 MyComplex** carr. rsize2 *must* be <= csize2, and
|
1177
|
// rsize1 *must* be <= 2*(csize1-1). This routine returns only the
|
1178
|
// top half +1 of the transform. The bottom half is given by
|
1179
|
// carr[2*(csize1-1)-i][csize2-j]=conj(carr[i][j])
|
1180
|
// for i>=csize1, with the second indices interpreted 'mod csize2'.
|
1181
|
// The dimensions csize2 and 2*(csize1-1) *must* be powers of 2,
|
1182
|
// but rsize1 and rsize2 do not. On import, the rarr will be
|
1183
|
// implicitly zero-padded as necessary to fit into the specified
|
1184
|
// output array carr. To get a non-periodic transform, set
|
1185
|
//
|
1186
|
// 2*(csize1-1) to 2*(first power of 2 >= rsize1), and
|
1187
|
// csize2 to 2*(first power of 2 >= rsize2).
|
1188
|
|
1189
|
int i,j;
|
1190
|
FFT_REAL_TYPE x1,y1,x2,y2;
|
1191
|
int vecsize1=OC_MAX(1,2*(csize1-1));
|
1192
|
int vecsize2=csize2;
|
1193
|
for(i=vecsize1;i>2;i/=2)
|
1194
|
if(i%2!=0) PlainError(1,"Error in FFTReal2D_mpi::Forward(int): "
|
1195
|
"Requested csize1 - 1 (%d - 1) is not a power of 2",
|
1196
|
csize1);
|
1197
|
for(j=vecsize2;j>2;j/=2)
|
1198
|
if(j%2!=0) PlainError(1,"Error in FFTReal2D_mpi::Forward(int): "
|
1199
|
"Requested csize2 (%d) is not a power of 2",
|
1200
|
csize2);
|
1201
|
if(vecsize1==0 || vecsize2==0) return; // Nothing to do
|
1202
|
SetupMemory_mpi_master_b(vecsize1,vecsize2);
|
1203
|
|
1204
|
// Copy input data into packed complex array
|
1205
|
for(i=0;i<rsize1/2;i++) {
|
1206
|
for(j=0;j<rsize2;j++)
|
1207
|
work1_mpi[i][j]=MyComplex(rarr[2*i][j],rarr[2*i+1][j]);
|
1208
|
for(;j<vecsize2;j++)
|
1209
|
work1_mpi[i][j]=MyComplex(0,0); // Zero pad
|
1210
|
}
|
1211
|
if(2*i<rsize1) {
|
1212
|
// Odd number of rows. Pack last with zeros.
|
1213
|
for(j=0;j<rsize2;j++)
|
1214
|
work1_mpi[i][j]=MyComplex(rarr[2*i][j],0.0);
|
1215
|
for(;j<vecsize2;j++)
|
1216
|
work1_mpi[i][j]=MyComplex(0,0); // Zero pad
|
1217
|
}
|
1218
|
|
1219
|
// Do FFT across rows
|
1220
|
int proc,proc_count,base_size,base_orphan,whole_size,chunk_size;
|
1221
|
proc_count=mms_mpi_size;
|
1222
|
Mmsolve_MpiWakeUp(ForwardFFT1_mpi_slave_b);
|
1223
|
whole_size=(rsize1+1)/2;
|
1224
|
base_size=whole_size/proc_count;
|
1225
|
base_orphan=whole_size%proc_count;
|
1226
|
i=0;
|
1227
|
chunk_size=base_size+(0<base_orphan?1:0);
|
1228
|
int msg_count=3*(proc_count-1);
|
1229
|
MPI_Request *request=new MPI_Request[msg_count];
|
1230
|
MPI_Status *status=new MPI_Status[msg_count];
|
1231
|
for(proc=1;proc<proc_count;proc++) {
|
1232
|
i+=chunk_size;
|
1233
|
chunk_size=base_size+(proc<base_orphan?1:0);
|
1234
|
MPI_Isend(&chunk_size,1,MPI_INT,proc,1,MPI_COMM_WORLD,
|
1235
|
request+3*(proc-1));
|
1236
|
MPI_Isend(work1_mpi[i],chunk_size*vecsize2,MMS_COMPLEX,proc,2,
|
1237
|
MPI_COMM_WORLD,request+3*(proc-1)+1);
|
1238
|
MPI_Irecv(work1_mpi[i],chunk_size*vecsize2,MMS_COMPLEX,proc,3,
|
1239
|
MPI_COMM_WORLD,request+3*(proc-1)+2);
|
1240
|
}
|
1241
|
chunk_size=base_size+(0<base_orphan?1:0);
|
1242
|
for(i=0;i<chunk_size;i++) {
|
1243
|
fft1_mpi.ForwardDecFreq(vecsize2,work1_mpi[i]);
|
1244
|
}
|
1245
|
MPI_Waitall(msg_count,request,status);
|
1246
|
|
1247
|
// Unpack and transpose to prepare for FFT in cross dimension.
|
1248
|
// We fill the first row of work2_mpi with the first and middle
|
1249
|
// columns from unpacked work1_mpi, because these are real-valued.
|
1250
|
for(i=0;i<(rsize1+1)/2;i++) {
|
1251
|
work2_mpi[0][2*i]
|
1252
|
=MyComplex(work1_mpi[i][0].real(),work1_mpi[i][vecsize2/2].real());
|
1253
|
work2_mpi[0][2*i+1]
|
1254
|
=MyComplex(work1_mpi[i][0].imag(),work1_mpi[i][vecsize2/2].imag());
|
1255
|
}
|
1256
|
for(i=2*i;i<vecsize1;i++) work2_mpi[0][i]=MyComplex(0.,0.); // Zero pad
|
1257
|
// Process rest of the rows.
|
1258
|
for(j=1;j<vecsize2/2;j++) {
|
1259
|
for(i=0;i<(rsize1+1)/2;i++) {
|
1260
|
x1=work1_mpi[i][j].real()/2.;
|
1261
|
y1=work1_mpi[i][j].imag()/2.;
|
1262
|
x2=work1_mpi[i][vecsize2-j].real()/2.;
|
1263
|
y2=work1_mpi[i][vecsize2-j].imag()/2.;
|
1264
|
work2_mpi[j][2*i]=MyComplex(x1+x2,y1-y2);
|
1265
|
work2_mpi[j][2*i+1]=MyComplex(y1+y2,x2-x1);
|
1266
|
}
|
1267
|
for(i=2*i;i<vecsize1;i++) work2_mpi[j][i]=MyComplex(0.,0.); // Zero pad
|
1268
|
}
|
1269
|
|
1270
|
|
1271
|
// Do FFT's on transposed matrix
|
1272
|
Mmsolve_MpiWakeUp(ForwardFFT2_mpi_slave_b);
|
1273
|
whole_size=vecsize2/2;
|
1274
|
base_size=whole_size/proc_count;
|
1275
|
base_orphan=whole_size%proc_count;
|
1276
|
i=0;
|
1277
|
chunk_size=base_size+(0<base_orphan?1:0);
|
1278
|
msg_count=3*(proc_count-1);
|
1279
|
for(proc=1;proc<proc_count;proc++) {
|
1280
|
i+=chunk_size;
|
1281
|
chunk_size=base_size+(proc<base_orphan?1:0);
|
1282
|
MPI_Isend(&chunk_size,1,MPI_INT,proc,1,MPI_COMM_WORLD,
|
1283
|
request+3*(proc-1));
|
1284
|
MPI_Isend(work2_mpi[i],chunk_size*vecsize1,MMS_COMPLEX,proc,2,
|
1285
|
MPI_COMM_WORLD,request+3*(proc-1)+1);
|
1286
|
MPI_Irecv(work2_mpi[i],chunk_size*vecsize1,MMS_COMPLEX,proc,3,
|
1287
|
MPI_COMM_WORLD,request+3*(proc-1)+2);
|
1288
|
}
|
1289
|
chunk_size=base_size+(0<base_orphan?1:0);
|
1290
|
for(i=0;i<chunk_size;i++) {
|
1291
|
fft2_mpi.ForwardDecFreq(vecsize1,work2_mpi[i]);
|
1292
|
}
|
1293
|
MPI_Waitall(msg_count,request,status);
|
1294
|
delete[] request;
|
1295
|
delete[] status;
|
1296
|
|
1297
|
|
1298
|
// Un-transpose and pack into carr.
|
1299
|
// First row of work2_mpi needs to be handled separately, because
|
1300
|
// of unique packing described above.
|
1301
|
carr[0][0]=MyComplex(work2_mpi[0][0].real(),0.);
|
1302
|
carr[0][vecsize2/2]=MyComplex(work2_mpi[0][0].imag(),0.);
|
1303
|
carr[vecsize1/2][0]=MyComplex(work2_mpi[0][vecsize1/2].real(),0.);
|
1304
|
carr[vecsize1/2][vecsize2/2]=MyComplex(work2_mpi[0][vecsize1/2].imag(),0.);
|
1305
|
for(i=1;i<vecsize1/2;i++) {
|
1306
|
x1=work2_mpi[0][i].real()/2.;
|
1307
|
y1=work2_mpi[0][i].imag()/2.;
|
1308
|
x2=work2_mpi[0][vecsize1-i].real()/2.;
|
1309
|
y2=work2_mpi[0][vecsize1-i].imag()/2.;
|
1310
|
carr[i][0]=MyComplex(x1+x2,y1-y2);
|
1311
|
carr[i][vecsize2/2]=MyComplex(y1+y2,x2-x1);
|
1312
|
}
|
1313
|
// Process remaining rows
|
1314
|
for(j=1;j<vecsize2/2;j++) {
|
1315
|
carr[0][j]=work2_mpi[j][0];
|
1316
|
carr[0][csize2-j]=conj(work2_mpi[j][0]);
|
1317
|
for(i=1;i<vecsize1/2;i++)
|
1318
|
carr[i][j]=work2_mpi[j][i];
|
1319
|
carr[i][j]=work2_mpi[j][i];
|
1320
|
carr[i][csize2-j]=conj(work2_mpi[j][i]);
|
1321
|
for(i=csize1;i<vecsize1;i++)
|
1322
|
carr[vecsize1-i][csize2-j]=conj(work2_mpi[j][i]);
|
1323
|
}
|
1324
|
}
|
1325
|
|
1326
|
void FFTReal2D_mpi::Inverse(int csize1,int csize2,
|
1327
|
const MyComplex* const* carr,
|
1328
|
int rsize1,int rsize2,double** rarr)
|
1329
|
{
|
1330
|
// Initialization
|
1331
|
int i,j;
|
1332
|
int vecsize1=OC_MAX(1,2*(csize1-1));
|
1333
|
int vecsize2=csize2;
|
1334
|
FFT_REAL_TYPE x1,y1,x2,y2; // FFT unpacking scratch vars
|
1335
|
for(i=vecsize1;i>2;i/=2)
|
1336
|
if(i%2!=0) PlainError(1,"Error in FFTReal2D_mpi::Inverse(int): "
|
1337
|
"Requested csize1 - 1 (%d - 1) is not a power of 2",
|
1338
|
csize1);
|
1339
|
for(j=vecsize2;j>2;j/=2)
|
1340
|
if(j%2!=0) PlainError(1,"Error in FFTReal2D_mpi::Inverse(int): "
|
1341
|
"Requested csize2 (%d) is not a power of 2",
|
1342
|
csize2);
|
1343
|
if(vecsize1==0 || vecsize2==0) return; // Nothing to do
|
1344
|
SetupMemory_mpi(vecsize1,vecsize2);
|
1345
|
|
1346
|
|
1347
|
// Do row inverse FFT's
|
1348
|
// Handle the first & csize1'th row specially. These rows are
|
1349
|
// the DFT's of real sequences, so they each satisfy the conjugate
|
1350
|
// symmetry condition
|
1351
|
workarr_mpi[0][0]=MyComplex(carr[0][0].real(),carr[csize1-1][0].real());
|
1352
|
for(j=1;j<csize2/2;j++) {
|
1353
|
x1=carr[0][j].real(); y1=carr[0][j].imag();
|
1354
|
x2=carr[csize1-1][j].real(); y2=carr[csize1-1][j].imag();
|
1355
|
workarr_mpi[0][j] = MyComplex(x1-y2,x2+y1);
|
1356
|
workarr_mpi[0][csize2-j] = MyComplex(x1+y2,x2-y1);
|
1357
|
}
|
1358
|
workarr_mpi[0][csize2/2]=MyComplex(carr[0][csize2/2].real(),
|
1359
|
carr[csize1-1][csize2/2].real());
|
1360
|
fft2_mpi.InverseDecTime(csize2,workarr_mpi[0],1.);
|
1361
|
|
1362
|
// iFFT the remaining rows
|
1363
|
for(i=1;i<csize1-1;i++) {
|
1364
|
for(j=0;j<csize2;j++) workarr_mpi[i][j]=carr[i][j];
|
1365
|
fft2_mpi.InverseDecTime(csize2,workarr_mpi[i],1.);
|
1366
|
}
|
1367
|
|
1368
|
// Now do iFFT's on columns. These are conj. symmetric, so we
|
1369
|
// process them 2 at a time. Also, recall the 1st row of workarr
|
1370
|
// contains the iFFT's of the 1st and csize1'th row of the given carr.
|
1371
|
// Note that csize is guaranteed divisible by 2, so if rsize2 is odd
|
1372
|
// then rsize2+1<=csize2.
|
1373
|
for(j=0;j<rsize2;j+=2) {
|
1374
|
scratch_mpi[0]=
|
1375
|
MyComplex(workarr_mpi[0][j].real(),workarr_mpi[0][j+1].real());
|
1376
|
scratch_mpi[csize1-1]=
|
1377
|
MyComplex(workarr_mpi[0][j].imag(),workarr_mpi[0][j+1].imag());
|
1378
|
for(i=1;i<csize1-1;i++) {
|
1379
|
x1 =workarr_mpi[i][j].real();
|
1380
|
y1 =workarr_mpi[i][j].imag();
|
1381
|
x2 =workarr_mpi[i][j+1].real();
|
1382
|
y2 =workarr_mpi[i][j+1].imag();
|
1383
|
scratch_mpi[i] = MyComplex(x1-y2,x2+y1);
|
1384
|
scratch_mpi[vecsize1-i] = MyComplex(x1+y2,x2-y1);
|
1385
|
}
|
1386
|
fft1_mpi.InverseDecTime(vecsize1,scratch_mpi,
|
1387
|
FFT_REAL_TYPE(vecsize1*vecsize2));
|
1388
|
if(j+1<rsize2) {
|
1389
|
for(i=0;i<rsize1;i++) {
|
1390
|
rarr[i][j]=scratch_mpi[i].real();
|
1391
|
rarr[i][j+1]=scratch_mpi[i].imag();
|
1392
|
}
|
1393
|
} else {
|
1394
|
for(i=0;i<rsize1;i++) rarr[i][j]=scratch_mpi[i].real();
|
1395
|
}
|
1396
|
}
|
1397
|
}
|
1398
|
|
1399
|
|
1400
|
#endif /* USE_MPI */
|