Actual source code: gs.c


  2: /***********************************gs.c***************************************

  4: Author: Henry M. Tufo III

  6: e-mail: hmt@cs.brown.edu

  8: snail-mail:
  9: Division of Applied Mathematics
 10: Brown University
 11: Providence, RI 02912

 13: Last Modification:
 14: 6.21.97
 15: ************************************gs.c**************************************/

 17: /***********************************gs.c***************************************
 18: File Description:
 19: -----------------

 21: ************************************gs.c**************************************/

 23: #include <../src/ksp/pc/impls/tfs/tfs.h>

 25: /* default length of number of items via tree - doubles if exceeded */
 26: #define TREE_BUF_SZ 2048;
 27: #define GS_VEC_SZ   1

 29: /***********************************gs.c***************************************
 30: Type: struct gather_scatter_id
 31: ------------------------------

 33: ************************************gs.c**************************************/
 34: typedef struct gather_scatter_id {
 35:   PetscInt    id;
 36:   PetscInt    nel_min;
 37:   PetscInt    nel_max;
 38:   PetscInt    nel_sum;
 39:   PetscInt    negl;
 40:   PetscInt    gl_max;
 41:   PetscInt    gl_min;
 42:   PetscInt    repeats;
 43:   PetscInt    ordered;
 44:   PetscInt    positive;
 45:   PetscScalar *vals;

 47:   /* bit mask info */
 48:   PetscInt *my_proc_mask;
 49:   PetscInt mask_sz;
 50:   PetscInt *ngh_buf;
 51:   PetscInt ngh_buf_sz;
 52:   PetscInt *nghs;
 53:   PetscInt num_nghs;
 54:   PetscInt max_nghs;
 55:   PetscInt *pw_nghs;
 56:   PetscInt num_pw_nghs;
 57:   PetscInt *tree_nghs;
 58:   PetscInt num_tree_nghs;

 60:   PetscInt num_loads;

 62:   /* repeats == true -> local info */
 63:   PetscInt nel;         /* number of unique elememts */
 64:   PetscInt *elms;       /* of size nel */
 65:   PetscInt nel_total;
 66:   PetscInt *local_elms; /* of size nel_total */
 67:   PetscInt *companion;  /* of size nel_total */

 69:   /* local info */
 70:   PetscInt num_local_total;
 71:   PetscInt local_strength;
 72:   PetscInt num_local;
 73:   PetscInt *num_local_reduce;
 74:   PetscInt **local_reduce;
 75:   PetscInt num_local_gop;
 76:   PetscInt *num_gop_local_reduce;
 77:   PetscInt **gop_local_reduce;

 79:   /* pairwise info */
 80:   PetscInt    level;
 81:   PetscInt    num_pairs;
 82:   PetscInt    max_pairs;
 83:   PetscInt    loc_node_pairs;
 84:   PetscInt    max_node_pairs;
 85:   PetscInt    min_node_pairs;
 86:   PetscInt    avg_node_pairs;
 87:   PetscInt    *pair_list;
 88:   PetscInt    *msg_sizes;
 89:   PetscInt    **node_list;
 90:   PetscInt    len_pw_list;
 91:   PetscInt    *pw_elm_list;
 92:   PetscScalar *pw_vals;

 94:   MPI_Request *msg_ids_in;
 95:   MPI_Request *msg_ids_out;

 97:   PetscScalar *out;
 98:   PetscScalar *in;
 99:   PetscInt    msg_total;

101:   /* tree - crystal accumulator info */
102:   PetscInt max_left_over;
103:   PetscInt *pre;
104:   PetscInt *in_num;
105:   PetscInt *out_num;
106:   PetscInt **in_list;
107:   PetscInt **out_list;

109:   /* new tree work*/
110:   PetscInt    tree_nel;
111:   PetscInt    *tree_elms;
112:   PetscScalar *tree_buf;
113:   PetscScalar *tree_work;

115:   PetscInt tree_map_sz;
116:   PetscInt *tree_map_in;
117:   PetscInt *tree_map_out;

119:   /* current memory status */
120:   PetscInt gl_bss_min;
121:   PetscInt gl_perm_min;

123:   /* max segment size for PCTFS_gs_gop_vec() */
124:   PetscInt vec_sz;

126:   /* hack to make paul happy */
127:   MPI_Comm PCTFS_gs_comm;

129: } PCTFS_gs_id;

131: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135: static PCTFS_gs_id *gsi_new(void);
136: static PetscErrorCode set_tree(PCTFS_gs_id *gs);

138: /* same for all but vector flavor */
139: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140: /* vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

143: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

149: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

152: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

156: /* global vars */
157: /* from comm.c module */

159: static PetscInt num_gs_ids = 0;

161: /* should make this dynamic ... later */
162: static PetscInt msg_buf    =MAX_MSG_BUF;
163: static PetscInt vec_sz     =GS_VEC_SZ;
164: static PetscInt *tree_buf  =NULL;
165: static PetscInt tree_buf_sz=0;
166: static PetscInt ntree      =0;

168: /***************************************************************************/
169: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170: {
172:   vec_sz = size;
173:   return(0);
174: }

176: /******************************************************************************/
177: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
178: {
180:   msg_buf = buf_size;
181:   return(0);
182: }

184: /******************************************************************************/
185: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
186: {
187:   PCTFS_gs_id    *gs;
188:   MPI_Group      PCTFS_gs_group;
189:   MPI_Comm       PCTFS_gs_comm;

192:   /* ensure that communication package has been initialized */
193:   PCTFS_comm_init();

195:   /* determines if we have enough dynamic/semi-static memory */
196:   /* checks input, allocs and sets gd_id template            */
197:   gs = gsi_check_args(elms,nel,level);

199:   /* only bit mask version up and working for the moment    */
200:   /* LATER :: get int list version working for sparse pblms */
201:   gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);

203:   MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
204:   MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
205:   MPI_Group_free(&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);

207:   gs->PCTFS_gs_comm=PCTFS_gs_comm;

209:   return(gs);
210: }

212: /******************************************************************************/
213: static PCTFS_gs_id *gsi_new(void)
214: {
216:   PCTFS_gs_id    *gs;
217:   gs   = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
218:   PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
219:   return(gs);
220: }

222: /******************************************************************************/
223: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
224: {
225:   PetscInt       i, j, k, t2;
226:   PetscInt       *companion, *elms, *unique, *iptr;
227:   PetscInt       num_local=0, *num_to_reduce, **local_reduce;
228:   PetscInt       oprs[]   = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
229:   PetscInt       vals[sizeof(oprs)/sizeof(oprs[0])-1];
230:   PetscInt       work[sizeof(oprs)/sizeof(oprs[0])-1];
231:   PCTFS_gs_id    *gs;

234:   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
235:   if (nel<0)    SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");

237:   if (nel==0) { PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }

239:   /* get space for gs template */
240:   gs     = gsi_new();
241:   gs->id = ++num_gs_ids;

243:   /* hmt 6.4.99                                            */
244:   /* caller can set global ids that don't participate to 0 */
245:   /* PCTFS_gs_init ignores all zeros in elm list                 */
246:   /* negative global ids are still invalid                 */
247:   for (i=j=0; i<nel; i++) {
248:     if (in_elms[i]!=0) j++;
249:   }

251:   k=nel; nel=j;

253:   /* copy over in_elms list and create inverse map */
254:   elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
255:   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));

257:   for (i=j=0; i<k; i++) {
258:     if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
259:   }

261:   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");

263:   /* pre-pass ... check to see if sorted */
264:   elms[nel] = INT_MAX;
265:   iptr      = elms;
266:   unique    = elms+1;
267:   j         =0;
268:   while (*iptr!=INT_MAX) {
269:     if (*iptr++>*unique++) { j=1; break; }
270:   }

272:   /* set up inverse map */
273:   if (j) {
274:     PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
275:     PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
276:   } else { PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
277:   elms[nel] = INT_MIN;

279:   /* first pass */
280:   /* determine number of unique elements, check pd */
281:   for (i=k=0; i<nel; i+=j) {
282:     t2 = elms[i];
283:     j  = ++i;

285:     /* clump 'em for now */
286:     while (elms[j]==t2) j++;

288:     /* how many together and num local */
289:     if (j-=i) { num_local++; k+=j; }
290:   }

292:   /* how many unique elements? */
293:   gs->repeats = k;
294:   gs->nel     = nel-k;

296:   /* number of repeats? */
297:   gs->num_local        = num_local;
298:   num_local           += 2;
299:   gs->local_reduce     = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
300:   gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));

302:   unique         = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
303:   gs->elms       = unique;
304:   gs->nel_total  = nel;
305:   gs->local_elms = elms;
306:   gs->companion  = companion;

308:   /* compess map as well as keep track of local ops */
309:   for (num_local=i=j=0; i<gs->nel; i++) {
310:     k            = j;
311:     t2           = unique[i] = elms[j];
312:     companion[i] = companion[j];

314:     while (elms[j]==t2) j++;

316:     if ((t2=(j-k))>1) {
317:       /* number together */
318:       num_to_reduce[num_local] = t2++;

320:       iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));

322:       /* to use binary searching don't remap until we check intersection */
323:       *iptr++ = i;

325:       /* note that we're skipping the first one */
326:       while (++k<j) *(iptr++) = companion[k];
327:       *iptr = -1;
328:     }
329:   }

331:   /* sentinel for ngh_buf */
332:   unique[gs->nel]=INT_MAX;

334:   /* for two partition sort hack */
335:   num_to_reduce[num_local]   = 0;
336:   local_reduce[num_local]    = NULL;
337:   num_to_reduce[++num_local] = 0;
338:   local_reduce[num_local]    = NULL;

340:   /* load 'em up */
341:   /* note one extra to hold NON_UNIFORM flag!!! */
342:   vals[2] = vals[1] = vals[0] = nel;
343:   if (gs->nel>0) {
344:     vals[3] = unique[0];
345:     vals[4] = unique[gs->nel-1];
346:   } else {
347:     vals[3] = INT_MAX;
348:     vals[4] = INT_MIN;
349:   }
350:   vals[5] = level;
351:   vals[6] = num_gs_ids;

353:   /* GLOBAL: send 'em out */
354:   PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);

356:   /* must be semi-pos def - only pairwise depends on this */
357:   /* LATER - remove this restriction */
358:   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
359:   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");

361:   gs->nel_min = vals[0];
362:   gs->nel_max = vals[1];
363:   gs->nel_sum = vals[2];
364:   gs->gl_min  = vals[3];
365:   gs->gl_max  = vals[4];
366:   gs->negl    = vals[4]-vals[3]+1;

368:   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");

370:   /* LATER :: add level == -1 -> program selects level */
371:   if (vals[5]<0) vals[5]=0;
372:   else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
373:   gs->level = vals[5];

375:   return(gs);
376: }

378: /******************************************************************************/
379: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
380: {
381:   PetscInt       i, nel, *elms;
382:   PetscInt       t1;
383:   PetscInt       **reduce;
384:   PetscInt       *map;

388:   /* totally local removes ... PCTFS_ct_bits == 0 */
389:   get_ngh_buf(gs);

391:   if (gs->level) set_pairwise(gs);
392:   if (gs->max_left_over) set_tree(gs);

394:   /* intersection local and pairwise/tree? */
395:   gs->num_local_total      = gs->num_local;
396:   gs->gop_local_reduce     = gs->local_reduce;
397:   gs->num_gop_local_reduce = gs->num_local_reduce;

399:   map = gs->companion;

401:   /* is there any local compression */
402:   if (!gs->num_local) {
403:     gs->local_strength = NONE;
404:     gs->num_local_gop  = 0;
405:   } else {
406:     /* ok find intersection */
407:     map    = gs->companion;
408:     reduce = gs->local_reduce;
409:     for (i=0, t1=0; i<gs->num_local; i++, reduce++) {
410:       if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
411:         t1++;
412:         if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
413:         gs->num_local_reduce[i] *= -1;
414:       }
415:       **reduce=map[**reduce];
416:     }

418:     /* intersection is empty */
419:     if (!t1) {
420:       gs->local_strength = FULL;
421:       gs->num_local_gop  = 0;
422:     } else { /* intersection not empty */
423:       gs->local_strength = PARTIAL;

425:       PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);

427:       gs->num_local_gop        = t1;
428:       gs->num_local_total      =  gs->num_local;
429:       gs->num_local           -= t1;
430:       gs->gop_local_reduce     = gs->local_reduce;
431:       gs->num_gop_local_reduce = gs->num_local_reduce;

433:       for (i=0; i<t1; i++) {
434:         if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
435:         gs->num_gop_local_reduce[i] *= -1;
436:         gs->local_reduce++;
437:         gs->num_local_reduce++;
438:       }
439:       gs->local_reduce++;
440:       gs->num_local_reduce++;
441:     }
442:   }

444:   elms = gs->pw_elm_list;
445:   nel  = gs->len_pw_list;
446:   for (i=0; i<nel; i++) elms[i] = map[elms[i]];

448:   elms = gs->tree_map_in;
449:   nel  = gs->tree_map_sz;
450:   for (i=0; i<nel; i++) elms[i] = map[elms[i]];

452:   /* clean up */
453:   free((void*) gs->local_elms);
454:   free((void*) gs->companion);
455:   free((void*) gs->elms);
456:   free((void*) gs->ngh_buf);
457:   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
458:   return(0);
459: }

461: /******************************************************************************/
462: static PetscErrorCode place_in_tree(PetscInt elm)
463: {
464:   PetscInt *tp, n;

467:   if (ntree==tree_buf_sz) {
468:     if (tree_buf_sz) {
469:       tp           = tree_buf;
470:       n            = tree_buf_sz;
471:       tree_buf_sz<<=1;
472:       tree_buf     = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
473:       PCTFS_ivec_copy(tree_buf,tp,n);
474:       free(tp);
475:     } else {
476:       tree_buf_sz = TREE_BUF_SZ;
477:       tree_buf    = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
478:     }
479:   }

481:   tree_buf[ntree++] = elm;
482:   return(0);
483: }

485: /******************************************************************************/
486: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
487: {
488:   PetscInt       i, j, npw=0, ntree_map=0;
489:   PetscInt       p_mask_size, ngh_buf_size, buf_size;
490:   PetscInt       *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
491:   PetscInt       *ngh_buf, *buf1, *buf2;
492:   PetscInt       offset, per_load, num_loads, or_ct, start, end;
493:   PetscInt       *ptr1, *ptr2, i_start, negl, nel, *elms;
494:   PetscInt       oper=GL_B_OR;
495:   PetscInt       *ptr3, *t_mask, level, ct1, ct2;

499:   /* to make life easier */
500:   nel   = gs->nel;
501:   elms  = gs->elms;
502:   level = gs->level;

504:   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
505:   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
506:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

508:   /* allocate space for masks and info bufs */
509:   gs->nghs       = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
510:   gs->pw_nghs    = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
511:   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
512:   t_mask         = (PetscInt*) malloc(p_mask_size);
513:   gs->ngh_buf    = ngh_buf = (PetscInt*) malloc(ngh_buf_size);

515:   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
516:   /* had thought I could exploit rendezvous threshold */

518:   /* default is one pass */
519:   per_load      = negl  = gs->negl;
520:   gs->num_loads = num_loads = 1;
521:   i             = p_mask_size*negl;

523:   /* possible overflow on buffer size */
524:   /* overflow hack                    */
525:   if (i<0) i=INT_MAX;

527:   buf_size = PetscMin(msg_buf,i);

529:   /* can we do it? */
530:   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);

532:   /* get PCTFS_giop buf space ... make *only* one malloc */
533:   buf1 = (PetscInt*) malloc(buf_size<<1);

535:   /* more than one gior exchange needed? */
536:   if (buf_size!=i) {
537:     per_load      = buf_size/p_mask_size;
538:     buf_size      = per_load*p_mask_size;
539:     gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
540:   }

542:   /* convert buf sizes from #bytes to #ints - 32 bit only! */
543:   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);

545:   /* find PCTFS_giop work space */
546:   buf2 = buf1+buf_size;

548:   /* hold #ints needed for processor masks */
549:   gs->mask_sz=p_mask_size;

551:   /* init buffers */
552:   PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
553:   PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
554:   PCTFS_ivec_zero(ngh_buf,ngh_buf_size);

556:   /* HACK reset tree info */
557:   tree_buf    = NULL;
558:   tree_buf_sz = ntree = 0;

560:   /* ok do it */
561:   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
562:     /* identity for bitwise or is 000...000 */
563:     PCTFS_ivec_zero(buf1,buf_size);

565:     /* load msg buffer */
566:     for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
567:       offset = (offset-start)*p_mask_size;
568:       PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
569:     }

571:     /* GLOBAL: pass buffer */
572:     PCTFS_giop(buf1,buf2,buf_size,&oper);

574:     /* unload buffer into ngh_buf */
575:     ptr2=(elms+i_start);
576:     for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
577:       /* I own it ... may have to pairwise it */
578:       if (j==*ptr2) {
579:         /* do i share it w/anyone? */
580:         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
581:         /* guess not */
582:         if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }

584:         /* i do ... so keep info and turn off my bit */
585:         PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
586:         PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
587:         PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);

589:         /* is it to be done pairwise? */
590:         if (--ct1<=level) {
591:           npw++;

593:           /* turn on high bit to indicate pw need to process */
594:           *ptr2++ |= TOP_BIT;
595:           PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
596:           ptr1    += p_mask_size;
597:           continue;
598:         }

600:         /* get set for next and note that I have a tree contribution */
601:         /* could save exact elm index for tree here -> save a search */
602:         ptr2++; ptr1+=p_mask_size; ntree_map++;
603:       } else { /* i don't but still might be involved in tree */

605:         /* shared by how many? */
606:         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));

608:         /* none! */
609:         if (ct1<2) continue;

611:         /* is it going to be done pairwise? but not by me of course!*/
612:         if (--ct1<=level) continue;
613:       }
614:       /* LATER we're going to have to process it NOW */
615:       /* nope ... tree it */
616:       place_in_tree(j);
617:     }
618:   }

620:   free((void*)t_mask);
621:   free((void*)buf1);

623:   gs->len_pw_list = npw;
624:   gs->num_nghs    = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));

626:   /* expand from bit mask list to int list and save ngh list */
627:   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
628:   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);

630:   gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));

632:   oper         = GL_MAX;
633:   ct1          = gs->num_nghs;
634:   PCTFS_giop(&ct1,&ct2,1,&oper);
635:   gs->max_nghs = ct1;

637:   gs->tree_map_sz  = ntree_map;
638:   gs->max_left_over=ntree;

640:   free((void*)p_mask);
641:   free((void*)sh_proc_mask);
642:   return(0);
643: }

645: /******************************************************************************/
646: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
647: {
648:   PetscInt       i, j;
649:   PetscInt       p_mask_size;
650:   PetscInt       *p_mask, *sh_proc_mask, *tmp_proc_mask;
651:   PetscInt       *ngh_buf, *buf2;
652:   PetscInt       offset;
653:   PetscInt       *msg_list, *msg_size, **msg_nodes, nprs;
654:   PetscInt       *pairwise_elm_list, len_pair_list=0;
655:   PetscInt       *iptr, t1, i_start, nel, *elms;
656:   PetscInt       ct;

660:   /* to make life easier */
661:   nel          = gs->nel;
662:   elms         = gs->elms;
663:   ngh_buf      = gs->ngh_buf;
664:   sh_proc_mask = gs->pw_nghs;

666:   /* need a few temp masks */
667:   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
668:   p_mask        = (PetscInt*) malloc(p_mask_size);
669:   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);

671:   /* set mask to my PCTFS_my_id's bit mask */
672:   PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);

674:   p_mask_size /= sizeof(PetscInt);

676:   len_pair_list   = gs->len_pw_list;
677:   gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));

679:   /* how many processors (nghs) do we have to exchange with? */
680:   nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));

682:   /* allocate space for PCTFS_gs_gop() info */
683:   gs->pair_list = msg_list  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
684:   gs->msg_sizes = msg_size  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
685:   gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));

687:   /* init msg_size list */
688:   PCTFS_ivec_zero(msg_size,nprs);

690:   /* expand from bit mask list to int list */
691:   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);

693:   /* keep list of elements being handled pairwise */
694:   for (i=j=0; i<nel; i++) {
695:     if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
696:   }
697:   pairwise_elm_list[j] = -1;

699:   gs->msg_ids_out       = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
700:   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
701:   gs->msg_ids_in        = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
702:   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
703:   gs->pw_vals           = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);

705:   /* find who goes to each processor */
706:   for (i_start=i=0; i<nprs; i++) {
707:     /* processor i's mask */
708:     PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);

710:     /* det # going to processor i */
711:     for (ct=j=0; j<len_pair_list; j++) {
712:       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
713:       PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
714:       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
715:     }
716:     msg_size[i] = ct;
717:     i_start     = PetscMax(i_start,ct);

719:     /*space to hold nodes in message to first neighbor */
720:     msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));

722:     for (j=0;j<len_pair_list;j++) {
723:       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
724:       PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
725:       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
726:     }
727:     *iptr = -1;
728:   }
729:   msg_nodes[nprs] = NULL;

731:   j                  = gs->loc_node_pairs=i_start;
732:   t1                 = GL_MAX;
733:   PCTFS_giop(&i_start,&offset,1,&t1);
734:   gs->max_node_pairs = i_start;

736:   i_start            = j;
737:   t1                 = GL_MIN;
738:   PCTFS_giop(&i_start,&offset,1,&t1);
739:   gs->min_node_pairs = i_start;

741:   i_start            = j;
742:   t1                 = GL_ADD;
743:   PCTFS_giop(&i_start,&offset,1,&t1);
744:   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;

746:   i_start = nprs;
747:   t1      = GL_MAX;
748:   PCTFS_giop(&i_start,&offset,1,&t1);
749:   gs->max_pairs = i_start;

751:   /* remap pairwise in tail of gsi_via_bit_mask() */
752:   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
753:   gs->out       = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
754:   gs->in        = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);

756:   /* reset malloc pool */
757:   free((void*)p_mask);
758:   free((void*)tmp_proc_mask);
759:   return(0);
760: }

762: /* to do pruned tree just save ngh buf copy for each one and decode here!
763: ******************************************************************************/
764: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
765: {
766:   PetscInt i, j, n, nel;
767:   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

770:   /* local work ptrs */
771:   elms = gs->elms;
772:   nel  = gs->nel;

774:   /* how many via tree */
775:   gs->tree_nel     = n = ntree;
776:   gs->tree_elms    = tree_elms = iptr_in = tree_buf;
777:   gs->tree_buf     = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
778:   gs->tree_work    = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
779:   j                = gs->tree_map_sz;
780:   gs->tree_map_in  = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
781:   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));

783:   /* search the longer of the two lists */
784:   /* note ... could save this info in get_ngh_buf and save searches */
785:   if (n<=nel) {
786:     /* bijective fct w/remap - search elm list */
787:     for (i=0; i<n; i++) {
788:       if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
789:     }
790:   } else {
791:     for (i=0; i<nel; i++) {
792:       if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
793:     }
794:   }

796:   /* sentinel */
797:   *iptr_in = *iptr_out = -1;
798:   return(0);
799: }

801: /******************************************************************************/
802: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs,  PetscScalar *vals)
803: {
804:   PetscInt    *num, *map, **reduce;
805:   PetscScalar tmp;

808:   num    = gs->num_gop_local_reduce;
809:   reduce = gs->gop_local_reduce;
810:   while ((map = *reduce++)) {
811:     /* wall */
812:     if (*num == 2) {
813:       num++;
814:       vals[map[1]] = vals[map[0]];
815:     } else if (*num == 3) { /* corner shared by three elements */
816:       num++;
817:       vals[map[2]] = vals[map[1]] = vals[map[0]];
818:     } else if (*num == 4) { /* corner shared by four elements */
819:       num++;
820:       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
821:     } else { /* general case ... odd geoms ... 3D*/
822:       num++;
823:       tmp = *(vals + *map++);
824:       while (*map >= 0) *(vals + *map++) = tmp;
825:     }
826:   }
827:   return(0);
828: }

830: /******************************************************************************/
831: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
832: {
833:   PetscInt    *num, *map, **reduce;
834:   PetscScalar tmp;

837:   num    = gs->num_local_reduce;
838:   reduce = gs->local_reduce;
839:   while ((map = *reduce)) {
840:     /* wall */
841:     if (*num == 2) {
842:       num++; reduce++;
843:       vals[map[1]] = vals[map[0]] += vals[map[1]];
844:     } else if (*num == 3) { /* corner shared by three elements */
845:       num++; reduce++;
846:       vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
847:     } else if (*num == 4) { /* corner shared by four elements */
848:       num++; reduce++;
849:       vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
850:     } else { /* general case ... odd geoms ... 3D*/
851:       num++;
852:       tmp = 0.0;
853:       while (*map >= 0) tmp += *(vals + *map++);

855:       map = *reduce++;
856:       while (*map >= 0) *(vals + *map++) = tmp;
857:     }
858:   }
859:   return(0);
860: }

862: /******************************************************************************/
863: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
864: {
865:   PetscInt    *num, *map, **reduce;
866:   PetscScalar *base;

869:   num    = gs->num_gop_local_reduce;
870:   reduce = gs->gop_local_reduce;
871:   while ((map = *reduce++)) {
872:     /* wall */
873:     if (*num == 2) {
874:       num++;
875:       vals[map[0]] += vals[map[1]];
876:     } else if (*num == 3) { /* corner shared by three elements */
877:       num++;
878:       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
879:     } else if (*num == 4) { /* corner shared by four elements */
880:       num++;
881:       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
882:     } else { /* general case ... odd geoms ... 3D*/
883:       num++;
884:       base = vals + *map++;
885:       while (*map >= 0) *base += *(vals + *map++);
886:     }
887:   }
888:   return(0);
889: }

891: /******************************************************************************/
892: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
893: {
894:   PetscInt       i;

898:   MPI_Comm_free(&gs->PCTFS_gs_comm);
899:   if (gs->nghs) free((void*) gs->nghs);
900:   if (gs->pw_nghs) free((void*) gs->pw_nghs);

902:   /* tree */
903:   if (gs->max_left_over) {
904:     if (gs->tree_elms) free((void*) gs->tree_elms);
905:     if (gs->tree_buf) free((void*) gs->tree_buf);
906:     if (gs->tree_work) free((void*) gs->tree_work);
907:     if (gs->tree_map_in) free((void*) gs->tree_map_in);
908:     if (gs->tree_map_out) free((void*) gs->tree_map_out);
909:   }

911:   /* pairwise info */
912:   if (gs->num_pairs) {
913:     /* should be NULL already */
914:     if (gs->ngh_buf) free((void*) gs->ngh_buf);
915:     if (gs->elms) free((void*) gs->elms);
916:     if (gs->local_elms) free((void*) gs->local_elms);
917:     if (gs->companion) free((void*) gs->companion);

919:     /* only set if pairwise */
920:     if (gs->vals) free((void*) gs->vals);
921:     if (gs->in) free((void*) gs->in);
922:     if (gs->out) free((void*) gs->out);
923:     if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
924:     if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
925:     if (gs->pw_vals) free((void*) gs->pw_vals);
926:     if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
927:     if (gs->node_list) {
928:       for (i=0;i<gs->num_pairs;i++) {
929:         if (gs->node_list[i])  {
930:           free((void*) gs->node_list[i]);
931:         }
932:       }
933:       free((void*) gs->node_list);
934:     }
935:     if (gs->msg_sizes) free((void*) gs->msg_sizes);
936:     if (gs->pair_list) free((void*) gs->pair_list);
937:   }

939:   /* local info */
940:   if (gs->num_local_total>=0) {
941:     for (i=0;i<gs->num_local_total+1;i++) {
942:       if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
943:     }
944:   }

946:   /* if intersection tree/pairwise and local isn't empty */
947:   if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
948:   if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);

950:   free((void*) gs);
951:   return(0);
952: }

954: /******************************************************************************/
955: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
956: {

960:   switch (*op) {
961:   case '+':
962:     PCTFS_gs_gop_vec_plus(gs,vals,step);
963:     break;
964:   default:
965:     PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);
966:     PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");
967:     PCTFS_gs_gop_vec_plus(gs,vals,step);
968:     break;
969:   }
970:   return(0);
971: }

973: /******************************************************************************/
974: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
975: {
977:   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");

979:   /* local only operations!!! */
980:   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);

982:   /* if intersection tree/pairwise and local isn't empty */
983:   if (gs->num_local_gop) {
984:     PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);

986:     /* pairwise */
987:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);

989:     /* tree */
990:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);

992:     PCTFS_gs_gop_vec_local_out(gs,vals,step);
993:   } else { /* if intersection tree/pairwise and local is empty */
994:     /* pairwise */
995:     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);

997:     /* tree */
998:     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
999:   }
1000:   return(0);
1001: }

1003: /******************************************************************************/
1004: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1005: {
1006:   PetscInt    *num, *map, **reduce;
1007:   PetscScalar *base;

1010:   num    = gs->num_local_reduce;
1011:   reduce = gs->local_reduce;
1012:   while ((map = *reduce)) {
1013:     base = vals + map[0] * step;

1015:     /* wall */
1016:     if (*num == 2) {
1017:       num++; reduce++;
1018:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1019:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1020:     } else if (*num == 3) { /* corner shared by three elements */
1021:       num++; reduce++;
1022:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1023:       PCTFS_rvec_add (base,vals+map[2]*step,step);
1024:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1025:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1026:     } else if (*num == 4) { /* corner shared by four elements */
1027:       num++; reduce++;
1028:       PCTFS_rvec_add (base,vals+map[1]*step,step);
1029:       PCTFS_rvec_add (base,vals+map[2]*step,step);
1030:       PCTFS_rvec_add (base,vals+map[3]*step,step);
1031:       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1032:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1033:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1034:     } else { /* general case ... odd geoms ... 3D */
1035:       num++;
1036:       while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);

1038:       map = *reduce;
1039:       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);

1041:       reduce++;
1042:     }
1043:   }
1044:   return(0);
1045: }

1047: /******************************************************************************/
1048: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1049: {
1050:   PetscInt    *num, *map, **reduce;
1051:   PetscScalar *base;

1054:   num    = gs->num_gop_local_reduce;
1055:   reduce = gs->gop_local_reduce;
1056:   while ((map = *reduce++)) {
1057:     base = vals + map[0] * step;

1059:     /* wall */
1060:     if (*num == 2) {
1061:       num++;
1062:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1063:     } else if (*num == 3) { /* corner shared by three elements */
1064:       num++;
1065:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1066:       PCTFS_rvec_add(base,vals+map[2]*step,step);
1067:     } else if (*num == 4) { /* corner shared by four elements */
1068:       num++;
1069:       PCTFS_rvec_add(base,vals+map[1]*step,step);
1070:       PCTFS_rvec_add(base,vals+map[2]*step,step);
1071:       PCTFS_rvec_add(base,vals+map[3]*step,step);
1072:     } else { /* general case ... odd geoms ... 3D*/
1073:       num++;
1074:       while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1075:     }
1076:   }
1077:   return(0);
1078: }

1080: /******************************************************************************/
1081: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1082: {
1083:   PetscInt    *num, *map, **reduce;
1084:   PetscScalar *base;

1087:   num    = gs->num_gop_local_reduce;
1088:   reduce = gs->gop_local_reduce;
1089:   while ((map = *reduce++)) {
1090:     base = vals + map[0] * step;

1092:     /* wall */
1093:     if (*num == 2) {
1094:       num++;
1095:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1096:     } else if (*num == 3) { /* corner shared by three elements */
1097:       num++;
1098:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1099:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1100:     } else if (*num == 4) { /* corner shared by four elements */
1101:       num++;
1102:       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1103:       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1104:       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1105:     } else { /* general case ... odd geoms ... 3D*/
1106:       num++;
1107:       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1108:     }
1109:   }
1110:   return(0);
1111: }

1113: /******************************************************************************/
1114: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1115: {
1116:   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1117:   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1118:   PetscInt       *pw, *list, *size, **nodes;
1119:   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1120:   MPI_Status     status;
1121:   PetscBLASInt   i1 = 1,dstep;

1125:   /* strip and load s */
1126:   msg_list    = list     = gs->pair_list;
1127:   msg_size    = size     = gs->msg_sizes;
1128:   msg_nodes   = nodes    = gs->node_list;
1129:   iptr        = pw       = gs->pw_elm_list;
1130:   dptr1       = dptr3    = gs->pw_vals;
1131:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1132:   msg_ids_out = ids_out  = gs->msg_ids_out;
1133:   dptr2                  = gs->out;
1134:   in1=in2                = gs->in;

1136:   /* post the receives */
1137:   /*  msg_nodes=nodes; */
1138:   do {
1139:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1140:         second one *list and do list++ afterwards */
1141:     MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1142:     list++;msg_ids_in++;
1143:     in1 += *size++ *step;
1144:   } while (*++msg_nodes);
1145:   msg_nodes=nodes;

1147:   /* load gs values into in out gs buffers */
1148:   while (*iptr >= 0) {
1149:     PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1150:     dptr3+=step;
1151:     iptr++;
1152:   }

1154:   /* load out buffers and post the sends */
1155:   while ((iptr = *msg_nodes++)) {
1156:     dptr3 = dptr2;
1157:     while (*iptr >= 0) {
1158:       PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1159:       dptr2+=step;
1160:       iptr++;
1161:     }
1162:     MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1163:     msg_size++; msg_list++;msg_ids_out++;
1164:   }

1166:   /* tree */
1167:   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);

1169:   /* process the received data */
1170:   msg_nodes=nodes;
1171:   while ((iptr = *nodes++)) {
1172:     PetscScalar d1 = 1.0;

1174:     /* Should I check the return value of MPI_Wait() or status? */
1175:     /* Can this loop be replaced by a call to MPI_Waitall()? */
1176:     MPI_Wait(ids_in, &status);
1177:     ids_in++;
1178:     while (*iptr >= 0) {
1179:       PetscBLASIntCast(step,&dstep);
1180:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1181:       in2+=step;
1182:       iptr++;
1183:     }
1184:   }

1186:   /* replace vals */
1187:   while (*pw >= 0) {
1188:     PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1189:     dptr1+=step;
1190:     pw++;
1191:   }

1193:   /* clear isend message handles */
1194:   /* This changed for clarity though it could be the same */

1196:   /* Should I check the return value of MPI_Wait() or status? */
1197:   /* Can this loop be replaced by a call to MPI_Waitall()? */
1198:   while (*msg_nodes++) {
1199:     MPI_Wait(ids_out, &status);
1200:     ids_out++;
1201:   }
1202:   return(0);
1203: }

1205: /******************************************************************************/
1206: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1207: {
1208:   PetscInt       size, *in, *out;
1209:   PetscScalar    *buf, *work;
1210:   PetscInt       op[] = {GL_ADD,0};
1211:   PetscBLASInt   i1   = 1;
1213:   PetscBLASInt   dstep;

1216:   /* copy over to local variables */
1217:   in   = gs->tree_map_in;
1218:   out  = gs->tree_map_out;
1219:   buf  = gs->tree_buf;
1220:   work = gs->tree_work;
1221:   size = gs->tree_nel*step;

1223:   /* zero out collection buffer */
1224:   PCTFS_rvec_zero(buf,size);

1226:   /* copy over my contributions */
1227:   while (*in >= 0) {
1228:     PetscBLASIntCast(step,&dstep);
1229:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1230:   }

1232:   /* perform fan in/out on full buffer */
1233:   /* must change PCTFS_grop to handle the blas */
1234:   PCTFS_grop(buf,work,size,op);

1236:   /* reset */
1237:   in  = gs->tree_map_in;
1238:   out = gs->tree_map_out;

1240:   /* get the portion of the results I need */
1241:   while (*in >= 0) {
1242:     PetscBLASIntCast(step,&dstep);
1243:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1244:   }
1245:   return(0);
1246: }

1248: /******************************************************************************/
1249: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1250: {

1254:   switch (*op) {
1255:   case '+':
1256:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1257:     break;
1258:   default:
1259:     PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);
1260:     PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1261:     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1262:     break;
1263:   }
1264:   return(0);
1265: }

1267: /******************************************************************************/
1268: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1269: {
1271:   /* if there's nothing to do return */
1272:   if (dim<=0) return(0);

1274:   /* can't do more dimensions then exist */
1275:   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);

1277:   /* local only operations!!! */
1278:   if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);

1280:   /* if intersection tree/pairwise and local isn't empty */
1281:   if (gs->num_local_gop) {
1282:     PCTFS_gs_gop_local_in_plus(gs,vals);

1284:     /* pairwise will do tree inside ... */
1285:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1286:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);

1288:     PCTFS_gs_gop_local_out(gs,vals);
1289:   } else { /* if intersection tree/pairwise and local is empty */
1290:     /* pairwise will do tree inside */
1291:     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1292:     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1293:   }
1294:   return(0);
1295: }

1297: /******************************************************************************/
1298: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1299: {
1300:   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1301:   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1302:   PetscInt       *pw, *list, *size, **nodes;
1303:   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1304:   MPI_Status     status;
1305:   PetscInt       i, mask=1;

1309:   for (i=1; i<dim; i++) { mask<<=1; mask++; }

1311:   /* strip and load s */
1312:   msg_list    = list     = gs->pair_list;
1313:   msg_size    = size     = gs->msg_sizes;
1314:   msg_nodes   = nodes    = gs->node_list;
1315:   iptr        = pw       = gs->pw_elm_list;
1316:   dptr1       = dptr3    = gs->pw_vals;
1317:   msg_ids_in  = ids_in   = gs->msg_ids_in;
1318:   msg_ids_out = ids_out  = gs->msg_ids_out;
1319:   dptr2       = gs->out;
1320:   in1         = in2      = gs->in;

1322:   /* post the receives */
1323:   /*  msg_nodes=nodes; */
1324:   do {
1325:     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1326:         second one *list and do list++ afterwards */
1327:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1328:       MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1329:       list++; msg_ids_in++;in1 += *size++;
1330:     } else { list++; size++; }
1331:   } while (*++msg_nodes);

1333:   /* load gs values into in out gs buffers */
1334:   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

1336:   /* load out buffers and post the sends */
1337:   msg_nodes=nodes;
1338:   list     = msg_list;
1339:   while ((iptr = *msg_nodes++)) {
1340:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1341:       dptr3 = dptr2;
1342:       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1343:       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1344:       /* is msg_ids_out++ correct? */
1345:       MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1346:       msg_size++;list++;msg_ids_out++;
1347:     } else {list++; msg_size++;}
1348:   }

1350:   /* do the tree while we're waiting */
1351:   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);

1353:   /* process the received data */
1354:   msg_nodes=nodes;
1355:   list     = msg_list;
1356:   while ((iptr = *nodes++)) {
1357:     if ((PCTFS_my_id|mask)==(*list|mask)) {
1358:       /* Should I check the return value of MPI_Wait() or status? */
1359:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1360:       MPI_Wait(ids_in, &status);
1361:       ids_in++;
1362:       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1363:     }
1364:     list++;
1365:   }

1367:   /* replace vals */
1368:   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

1370:   /* clear isend message handles */
1371:   /* This changed for clarity though it could be the same */
1372:   while (*msg_nodes++) {
1373:     if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1374:       /* Should I check the return value of MPI_Wait() or status? */
1375:       /* Can this loop be replaced by a call to MPI_Waitall()? */
1376:       MPI_Wait(ids_out, &status);
1377:       ids_out++;
1378:     }
1379:     msg_list++;
1380:   }
1381:   return(0);
1382: }

1384: /******************************************************************************/
1385: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1386: {
1387:   PetscInt    size;
1388:   PetscInt    *in, *out;
1389:   PetscScalar *buf, *work;
1390:   PetscInt    op[] = {GL_ADD,0};

1393:   in   = gs->tree_map_in;
1394:   out  = gs->tree_map_out;
1395:   buf  = gs->tree_buf;
1396:   work = gs->tree_work;
1397:   size = gs->tree_nel;

1399:   PCTFS_rvec_zero(buf,size);

1401:   while (*in >= 0) *(buf + *out++) = *(vals + *in++);

1403:   in  = gs->tree_map_in;
1404:   out = gs->tree_map_out;

1406:   PCTFS_grop_hc(buf,work,size,op,dim);

1408:   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1409:   return(0);
1410: }