Actual source code: agmresorthog.c

#define PETSCKSP_DLL

#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 * This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors distributed across several processes. These processes are organized in a virtual ring.
 * Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331.
 *
 * Initial author: R. B. SIDJE
 * Modified: G.-A. Atenekeng-Kahou, 2008
 * Modified: D. NUENTSA WAKAM, 2011
 */
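
/*
 * The main routines are KSPAGMRESRoddecInitNeighboor (set up the virtual ring of processes),
 * KSPAGMRESRoddec (QR factorization of the basis: a Householder QR on each local block, followed
 * by Givens rotations passed along the ring to eliminate the stacked local R factors), and
 * KSPAGMRESRodvec (apply the resulting orthogonal factor Q to a vector).
 */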

/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal*, PetscReal*, PetscReal*, PetscInt);

PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
{
  MPI_Comm       comm;
  KSP_AGMRES     *agmres = (KSP_AGMRES*)(ksp->data);
  PetscMPIInt    First, Last, rank, size;

  PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm);
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm);
  MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm);

  /* Ring neighbors: the first process wraps around to Last on its left, the last process wraps around to First on its right */
  if (rank == Last) {
    agmres->Ileft  = rank - 1;
    agmres->Iright = First;
  } else if (rank == First) {
    agmres->Ileft  = Last;
    agmres->Iright = rank + 1;
  } else {
    agmres->Ileft  = rank - 1;
    agmres->Iright = rank + 1;
  }
  agmres->rank  = rank;
  agmres->size  = size;
  agmres->First = First;
  agmres->Last  = Last;
  return(0);
}

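/*
 * Build (make_r == 1) or rebuild (make_r == 0) a Givens rotation.
 * On entry with make_r == 1, (*c, *s) hold the pair (a, b); on exit, (*c, *s) define the rotation
 * that annihilates b against a, and *r holds a compact one-scalar encoding of that rotation
 * (a variant of the storage scheme used by the BLAS *rotg routines), so the rotation can be kept
 * in the matrix entry it has just zeroed.  With make_r == 0, (*c, *s) are reconstructed from *r
 * (up to a common sign).
 */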
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  return(0);
}

/*
 * Compute the QR factorization of the Krylov basis vectors
 * Input:
 *  - the vectors are available in agmres->vecs (alias VEC_V)
 *  - nvec : the number of vectors
 * Output:
 *  - agmres->Qloc : product of elementary reflectors for the QR of the (local part) of the vectors.
 *  - agmres->sgn : signs of the rotations
 *  - agmres->tloc : scalar factors of the elementary reflectors.
 */
PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
{
  KSP_AGMRES     *agmres  = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc    = agmres->Qloc;
  PetscScalar    *sgn     = agmres->sgn;
  PetscScalar    *tloc    = agmres->tloc;
  PetscReal      *wbufptr = agmres->wbufptr;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First;
  PetscMPIInt    Last     = agmres->Last;
  PetscBLASInt   pas, len, bnloc, bpos;
  PetscInt       nloc, d, i, j, k;
  PetscInt       pos;
  PetscReal      c, s, rho, Ajj, val, tt, old;
  PetscScalar    *col;
  MPI_Status     status;
  PetscBLASInt   N = MAXKSPSIZE + 1;

  PetscObjectGetComm((PetscObject)ksp,&comm);
  PetscLogEventBegin(KSP_AGMRESRoddec,ksp,0,0,0);
  PetscArrayzero(agmres->Rloc, N*N);
  /* check input arguments */
  if (nvec < 1) SETERRQ(PetscObjectComm((PetscObject)ksp),PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscBLASIntCast(nloc,&bnloc);
  if (nvec > nloc) SETERRQ(PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater than or equal to the number of columns");
  pas = 1;
  /* Copy the vectors of the basis */
  for (j = 0; j < nvec; j++) {
    VecGetArray(VEC_V(j), &col);
    PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnloc, col, &pas, &Qloc[j*nloc], &pas));
    VecRestoreArray(VEC_V(j), &col);
  }
  /* Each process performs a local QR on its own block */
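  /*
   * For column j, the Householder vector is stored below the diagonal of Qloc (its unit leading
   * entry is not kept: the diagonal slot finally receives the diagonal entry rho of the local R
   * factor), and its scalar factor goes in tloc[j].
   */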
  for (j = 0; j < nvec; j++) {
    len = nloc - j;
    Ajj = Qloc[j*nloc+j];
    PetscStackCallBLAS("BLASnrm2",rho = -PetscSign(Ajj) * BLASnrm2_(&len, &(Qloc[j*nloc+j]), &pas));
    if (rho == 0.0) tloc[j] = 0.0;
    else {
      tloc[j] = (Ajj - rho) / rho;
      len     = len - 1;
      val     = 1.0 / (Ajj - rho);
      PetscStackCallBLAS("BLASscal",BLASscal_(&len, &val, &(Qloc[j*nloc+j+1]), &pas));
      Qloc[j*nloc+j] = 1.0;
      len            = len + 1;
      for (k = j + 1; k < nvec; k++) {
        PetscStackCallBLAS("BLASdot",tt = tloc[j] * BLASdot_(&len, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
        PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&len, &tt, &(Qloc[j*nloc+j]), &pas, &(Qloc[k*nloc+j]), &pas));
      }
      Qloc[j*nloc+j] = rho;
    }
  }
  /* Annihilate the unwanted entries of the stacked local R factors, diagonal by diagonal */
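  /*
   * Each process (except First) receives the current row block from its left neighbor, folds its
   * own local R into it with Givens rotations (stored compactly, via KSPAGMRESRoddecGivens, in the
   * entries they annihilate), and forwards the updated rows to its right neighbor; the global R
   * is assembled in Rloc on the Last process.
   */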
  for (d = 0; d < nvec; d++) {
    len = nvec - d;
    if (rank == First) {
      PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(Qloc[d*nloc+d]), &bnloc, &(wbufptr[d]), &pas));
      MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
    } else {
      MPI_Recv(&(wbufptr[d]), len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status);
      /* Elimination of Rloc(1,d) */
      c = wbufptr[d];
      s = Qloc[d*nloc];
      KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
      /* Apply the Givens rotation */
      for (k = d; k < nvec; k++) {
        old          = wbufptr[k];
        wbufptr[k]   = c * old - s * Qloc[k*nloc];
        Qloc[k*nloc] = s * old + c * Qloc[k*nloc];
      }
      Qloc[d*nloc] = rho;
      if (rank != Last) {
        MPI_Send(&(wbufptr[d]), len, MPIU_SCALAR, rank + 1, agmres->tag, comm);
      }
      /* zero out the d-th diagonal of Rloc */
      for (j = d + 1; j < nvec; j++) {
        /* elimination of Rloc[i][j] */
        i = j - d;
        c = Qloc[j*nloc+i-1];
        s = Qloc[j*nloc+i];
        KSPAGMRESRoddecGivens(&c, &s, &rho, 1);
        for (k = j; k < nvec; k++) {
          old              = Qloc[k*nloc+i-1];
          Qloc[k*nloc+i-1] = c * old - s * Qloc[k*nloc+i];
          Qloc[k*nloc+i]   = s * old + c * Qloc[k*nloc+i];
        }
        Qloc[j*nloc+i] = rho;
      }
      if (rank == Last) {
        PetscStackCallBLAS("BLAScopy",BLAScopy_(&len, &(wbufptr[d]), &pas, RLOC(d,d), &N));
        for (k = d + 1; k < nvec; k++) *RLOC(k,d) = 0.0;
      }
    }
  }

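  /* On the last process, make the diagonal of R nonnegative; the signs are saved in sgn and reapplied in KSPAGMRESRodvec */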
  if (rank == Last) {
    for (d = 0; d < nvec; d++) {
      pos = nvec - d;
      PetscBLASIntCast(pos,&bpos);
      sgn[d] = PetscSign(*RLOC(d,d));
      PetscStackCallBLAS("BLASscal",BLASscal_(&bpos, &(sgn[d]), RLOC(d,d), &N));
    }
  }
  /* Broadcast Rloc to all other processes
   * NWD: should not be needed
   */
  MPI_Bcast(agmres->Rloc,N*N,MPIU_SCALAR,Last,comm);
  PetscLogEventEnd(KSP_AGMRESRoddec,ksp,0,0,0);
  return(0);
}

/*
 * Computes Out <-- Q * In where Q is the orthogonal matrix from KSPAGMRESRoddec
 * Input:
 *  - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
 *  - In : input vector (size nvec)
 * Output:
 *  - Out : PETSc vector (distributed as the basis vectors)
 */
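/*
 * The rotations generated in KSPAGMRESRoddec are applied in reverse order: first the ring Givens
 * rotations, rebuilt from their compact storage with KSPAGMRESRoddecGivens(..., 0), then the local
 * Householder reflectors stored in Qloc and tloc.
 */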
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES     *agmres  = (KSP_AGMRES*) ksp->data;
  MPI_Comm       comm;
  PetscScalar    *Qloc    = agmres->Qloc;
  PetscScalar    *sgn     = agmres->sgn;
  PetscScalar    *tloc    = agmres->tloc;
  PetscMPIInt    rank     = agmres->rank;
  PetscMPIInt    First    = agmres->First, Last = agmres->Last;
  PetscMPIInt    Iright   = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar    *y, *zloc;
  PetscInt       nloc, d, len, i, j;
  PetscBLASInt   bnvec, pas, blen;
  PetscInt       dpt;
  PetscReal      c, s, rho, zp, zq, yd=0.0, tt;
  MPI_Status     status;

  PetscBLASIntCast(nvec,&bnvec);
  PetscObjectGetComm((PetscObject)ksp,&comm);
  pas = 1;
  VecGetLocalSize(VEC_V(0), &nloc);
  PetscMalloc1(nvec, &y);
  PetscArraycpy(y, In, nvec);
  VecGetArray(Out, &zloc);

  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
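  /* zloc accumulates the local rows of Out; on a single process Q reduces to the local Householder factors, so the (zero-padded) input can be copied in directly */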
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscStackCallBLAS("BLAScopy",BLAScopy_(&bnvec, y, &pas, &(zloc[0]), &pas));
  else {
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        MPI_Recv(&(zloc[d]), 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
      } else {
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          KSPAGMRESRoddecGivens(&c, &s, &(Qloc[j * nloc + i]), 0);
          zp        = zloc[i-1];
          zq        = zloc[i];
          zloc[i-1] =  c * zp + s * zq;
          zloc[i]   = -s * zp + c * zq;
        }
        KSPAGMRESRoddecGivens(&c, &s, &(Qloc[d * nloc]), 0);
        if (rank == Last) {
          zp      = y[d];
          zq      = zloc[0];
          y[d]    =  c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&(y[d]), 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        } else {
          MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status);
          zp      = yd;
          zq      = zloc[0];
          yd      =  c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm);
        }
      }
    }
  }
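  /* Apply the local Householder reflectors stored in Qloc/tloc, last one first, to the local part of Out */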
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) {
      len       = nloc - j;
      PetscBLASIntCast(len,&blen);
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&blen, &tt, &(Qloc[dpt]), &pas, &(zloc[j]), &pas));
      Qloc[dpt] = rho;
    }
  }
  VecRestoreArray(Out, &zloc);
  PetscFree(y);
  return(0);
}