Actual source code: sfhip.hip.cpp

#include <../src/vec/is/sf/impls/basic/sfpack.h>
#include <petscpkg_version.h>

/* Complex support is disabled here because of compilation issues on SPOCK */
#undef PETSC_HAVE_COMPLEX

/* Map a thread id to an index in root/leaf space through a series of 3D subdomains. See PetscSFPackOpt. */
__device__ static inline PetscInt MapTidToIndex(const PetscInt *opt,PetscInt tid)
{
  PetscInt        i,j,k,m,n,r;
  const PetscInt  *offset,*start,*dx,*dy,*X,*Y;

  n      = opt[0];
  offset = opt + 1;
  start  = opt + n + 2;
  dx     = opt + 2*n + 2;
  dy     = opt + 3*n + 2;
  X      = opt + 5*n + 2;
  Y      = opt + 6*n + 2;
  for (r=0; r<n; r++) {if (tid < offset[r+1]) break;}
  m = (tid - offset[r]);
  k = m/(dx[r]*dy[r]);
  j = (m - k*dx[r]*dy[r])/dx[r];
  i = m - k*dx[r]*dy[r] - j*dx[r];

  return (start[r] + k*X[r]*Y[r] + j*X[r] + i);
}
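
/* Worked example (a sketch with made-up numbers, not from any real SF): take a single subdomain
   (n=1) of local size dx=2, dy=2 cut out of a parent domain with X=4, Y=3, start=0 and
   offset={0,8}. Then tid=5 gives m=5, k=5/(2*2)=1, j=(5-4)/2=0, i=5-4-0=1, so the returned
   index is 0 + 1*4*3 + 0*4 + 1 = 13, i.e., entry (i,j,k)=(1,0,1) of the parent domain. */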

/*====================================================================================*/
/*  Templated HIP kernels for pack/unpack. The Op can be regular or atomic            */
/*====================================================================================*/

/* Suppose the user calls PetscSFReduce(sf,unit,...) and <unit> is an MPI datatype made of 16 PetscReals. Then
   <Type> is PetscReal, which is the primitive type we operate on;
   <bs>   is 16, which says <unit> contains 16 primitive types;
   <BS>   is 8, which is the maximal SIMD width we will try to vectorize operations on <unit> with;
   <EQ>   is 0, which is (bs == BS ? 1 : 0).

  If instead <unit> has 8 PetscReals, then bs=8, BS=8, EQ=1, rendering MBS below a compile-time constant.
  For the common case in VecScatter, bs=1, BS=1, EQ=1, MBS=1, and the inner for-loops below are fully unrolled.
*/
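/* Example instantiations (a sketch; the actual choices are made in PetscSFLinkSetUp_HIP below):
     d_Pack<PetscReal,8,0> handles units of 16 PetscReals: M = bs/BS = 2 and MBS = 16 at run time;
     d_Pack<PetscReal,8,1> handles units of exactly 8 PetscReals: M = 1 and MBS = 8 at compile time;
     d_Pack<PetscReal,1,1> handles the common VecScatter case bs = 1.
*/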
template<class Type,PetscInt BS,PetscInt EQ>
__global__ static void d_Pack(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,const Type *data,Type *buf)
{
  PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables the compiler's const-propagation */
  const PetscInt  MBS = M*BS;  /* MBS=bs. We turn MBS into a compile-time const when EQ=1. */

  for (; tid<count; tid += grid_size) {
    /* opt != NULL ==> idx == NULL, i.e., the indices have patterns but are not contiguous;
       opt == NULL && idx == NULL ==> the indices are contiguous;
     */
    t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
    s = tid*MBS;
    for (i=0; i<MBS; i++) buf[s+i] = data[t+i];
  }
}
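
/* Note the grid-stride loop above: with grid_size = gridDim.x*blockDim.x threads, thread tid packs
   elements tid, tid+grid_size, tid+2*grid_size, ... This lets the host-side wrappers below cap the
   number of blocks at the device's resident-thread capacity while still covering all <count>
   elements. The same loop structure is used by every kernel in this file. */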

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_UnpackAndOp(PetscInt bs,PetscInt count,PetscInt start,const PetscInt *opt,const PetscInt *idx,Type *data,const Type *buf)
{
  PetscInt        i,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    t = (opt? MapTidToIndex(opt,tid) : (idx? idx[tid] : start+tid))*MBS;
    s = tid*MBS;
    for (i=0; i<MBS; i++) op(data[t+i],buf[s+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_FetchAndOp(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,Type *leafbuf)
{
  PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
    l = tid*MBS;
    for (i=0; i<MBS; i++) leafbuf[l+i] = op(rootdata[r+i],leafbuf[l+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_ScatterAndOp(PetscInt bs,PetscInt count,PetscInt srcx,PetscInt srcy,PetscInt srcX,PetscInt srcY,PetscInt srcStart,const PetscInt* srcIdx,const Type *src,PetscInt dstx,PetscInt dsty,PetscInt dstX,PetscInt dstY,PetscInt dstStart,const PetscInt *dstIdx,Type *dst)
{
  PetscInt        i,j,k,s,t,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    if (!srcIdx) { /* src is either contiguous or 3D */
      k = tid/(srcx*srcy);
      j = (tid - k*srcx*srcy)/srcx;
      i = tid - k*srcx*srcy - j*srcx;
      s = srcStart + k*srcX*srcY + j*srcX + i;
    } else {
      s = srcIdx[tid];
    }

    if (!dstIdx) { /* dst is either contiguous or 3D */
      k = tid/(dstx*dsty);
      j = (tid - k*dstx*dsty)/dstx;
      i = tid - k*dstx*dsty - j*dstx;
      t = dstStart + k*dstX*dstY + j*dstX + i;
    } else {
      t = dstIdx[tid];
    }

    s *= MBS;
    t *= MBS;
    for (i=0; i<MBS; i++) op(dst[t+i],src[s+i]);
  }
}

template<class Type,class Op,PetscInt BS,PetscInt EQ>
__global__ static void d_FetchAndOpLocal(PetscInt bs,PetscInt count,PetscInt rootstart,const PetscInt *rootopt,const PetscInt *rootidx,Type *rootdata,PetscInt leafstart,const PetscInt *leafopt,const PetscInt *leafidx,const Type *leafdata,Type *leafupdate)
{
  PetscInt        i,l,r,tid = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscInt  grid_size = gridDim.x * blockDim.x;
  const PetscInt  M = (EQ) ? 1 : bs/BS, MBS = M*BS;
  Op              op;

  for (; tid<count; tid += grid_size) {
    r = (rootopt? MapTidToIndex(rootopt,tid) : (rootidx? rootidx[tid] : rootstart+tid))*MBS;
    l = (leafopt? MapTidToIndex(leafopt,tid) : (leafidx? leafidx[tid] : leafstart+tid))*MBS;
    for (i=0; i<MBS; i++) leafupdate[l+i] = op(rootdata[r+i],leafdata[l+i]);
  }
}

/*====================================================================================*/
/*                             Regular operations on device                           */
/*====================================================================================*/
template<typename Type> struct Insert {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = y;             return old;}};
template<typename Type> struct Add    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x += y;             return old;}};
template<typename Type> struct Mult   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x *= y;             return old;}};
template<typename Type> struct Min    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMin(x,y); return old;}};
template<typename Type> struct Max    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = PetscMax(x,y); return old;}};
template<typename Type> struct LAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x && y;        return old;}};
template<typename Type> struct LOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x || y;        return old;}};
template<typename Type> struct LXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = !x != !y;      return old;}};
template<typename Type> struct BAND   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x & y;         return old;}};
template<typename Type> struct BOR    {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x | y;         return old;}};
template<typename Type> struct BXOR   {__device__ Type operator() (Type& x,Type y) const {Type old = x; x  = x ^ y;         return old;}};
template<typename Type> struct Minloc {
  __device__ Type operator() (Type& x,Type y) const {
    Type old = x;
    if (y.a < x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b,y.b);
    return old;
  }
};
template<typename Type> struct Maxloc {
  __device__ Type operator() (Type& x,Type y) const {
    Type old = x;
    if (y.a > x.a) x = y;
    else if (y.a == x.a) x.b = min(x.b,y.b); /* See MPI MAXLOC */
    return old;
  }
};
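
/* Semantics sketch: each regular Op above updates x in place and returns x's old value, e.g.

     Add<double> op;
     double x = 1.0, old = op(x,2.0); // now x == 3.0 and old == 1.0

   d_UnpackAndOp discards the old value, while d_FetchAndOp/d_FetchAndOpLocal store it into the
   leaf buffer. */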

/*====================================================================================*/
/*                             Atomic operations on device                            */
/*====================================================================================*/

/*
  Atomic Insert (exchange) operations

  See the CUDA version for comments.
*/
#if PETSC_PKG_HIP_VERSION_LT(4,4,0)
__device__ static double atomicExch(double* address,double val) {return __longlong_as_double(atomicExch((ullint*)address,__double_as_longlong(val)));}
#endif

__device__ static llint atomicExch(llint* address,llint val) {return (llint)(atomicExch((ullint*)address,(ullint)val));}

template<typename Type> struct AtomicInsert {__device__ Type operator() (Type& x,Type y) const {return atomicExch(&x,y);}};

#if defined(PETSC_HAVE_COMPLEX)
#if defined(PETSC_USE_REAL_DOUBLE)
template<> struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    PetscComplex         old;
    double               *xp = (double*)&x,*yp = (double*)&y,*zp = (double*)&old; /* write into old's two components, not past it */
    AtomicInsert<double> op;
    zp[0] = op(xp[0],yp[0]);
    zp[1] = op(xp[1],yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
#elif defined(PETSC_USE_REAL_SINGLE)
template<> struct AtomicInsert<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    /* A single-precision complex is 8 bytes, so both halves can be exchanged with one double-sized atomic */
    double               *xp = (double*)&x,*yp = (double*)&y;
    AtomicInsert<double> op;
    return op(xp[0],yp[0]);
  }
};
#endif
#endif

/*
  Atomic add operations
*/
__device__ static llint atomicAdd(llint* address,llint val) {return (llint)atomicAdd((ullint*)address,(ullint)val);}

template<typename Type> struct AtomicAdd {__device__ Type operator() (Type& x,Type y) const {return atomicAdd(&x,y);}};

template<> struct AtomicAdd<double> {
  __device__ double operator() (double& x,double y) const {
    /* The CUDA version does more checks, which may also be needed here */
    return atomicAdd(&x,y);
  }
};

template<> struct AtomicAdd<float> {
  __device__ float operator() (float& x,float y) const {
    /* The CUDA version does more checks, which may also be needed here */
    return atomicAdd(&x,y);
  }
};

#if defined(PETSC_HAVE_COMPLEX)
template<> struct AtomicAdd<PetscComplex> {
  __device__ PetscComplex operator() (PetscComplex& x,PetscComplex y) const {
    PetscComplex         old;
    PetscReal            *xp = (PetscReal*)&x,*yp = (PetscReal*)&y,*zp = (PetscReal*)&old; /* write into old's two components, not past it */
    AtomicAdd<PetscReal> op;
    zp[0] = op(xp[0],yp[0]);
    zp[1] = op(xp[1],yp[1]);
    return old; /* The returned value may not be atomic. It can be a mix of the two ops. The caller should discard it. */
  }
};
#endif

/*
  Atomic Mult operations:

  HIP has no atomicMult at all, so we build our own with atomicCAS
 */
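/* The compare-and-swap pattern used below (and for the Min/Max, bitwise, and logical atomics) is,
   in sketch form:

     old = *address;
     do {
       assumed = old;                                          // snapshot the current value
       old     = atomicCAS(address, assumed, f(assumed,val));  // install f(snapshot,val) only if *address still equals the snapshot
     } while (assumed != old);                                 // retry if another thread got in between

   atomicCAS returns the value it actually found at *address, so the loop exits exactly when our
   update won. */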
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMult(double* address, double val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    /* Other threads can access and modify the value of *address_as_ull after the read above and before the write below */
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(val*__longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMult(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(val*__int_as_float(assumed)));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif

__device__ static int atomicMult(int* address,int val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, val*assumed);
  } while (assumed != old);
  return (int)old;
}

__device__ static llint atomicMult(llint* address,llint val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val*(llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template<typename Type> struct AtomicMult {__device__ Type operator() (Type& x,Type y) const {return atomicMult(&x,y);}};

/*
  Atomic Min/Max operations

  See the CUDA version for comments.
 */
#if PETSC_PKG_HIP_VERSION_LT(4,4,0)
#if defined(PETSC_USE_REAL_DOUBLE)
__device__ static double atomicMin(double* address, double val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMin(val,__longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__device__ static double atomicMax(double* address, double val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, __double_as_longlong(PetscMax(val,__longlong_as_double(assumed))));
  } while (assumed != old);
  return __longlong_as_double(old);
}
#elif defined(PETSC_USE_REAL_SINGLE)
__device__ static float atomicMin(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMin(val,__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}

__device__ static float atomicMax(float* address,float val)
{
  int *address_as_int = (int*)(address);
  int old = *address_as_int, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_int, assumed, __float_as_int(PetscMax(val,__int_as_float(assumed))));
  } while (assumed != old);
  return __int_as_float(old);
}
#endif
#endif

/* As of ROCm 3.10, llint atomicMin/Max(llint*, llint) is not supported */
__device__ static llint atomicMin(llint* address,llint val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMin(val,(llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicMax(llint* address,llint val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(PetscMax(val,(llint)assumed)));
  } while (assumed != old);
  return (llint)old;
}

template<typename Type> struct AtomicMin {__device__ Type operator() (Type& x,Type y) const {return atomicMin(&x,y);}};
template<typename Type> struct AtomicMax {__device__ Type operator() (Type& x,Type y) const {return atomicMax(&x,y);}};

/*
  Atomic bitwise operations

  As of ROCm 3.10, llint atomicAnd/Or/Xor(llint*, llint) is not supported
*/
__device__ static llint atomicAnd(llint* address,llint val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val & (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicOr(llint* address,llint val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val | (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

__device__ static llint atomicXor(llint* address,llint val)
{
  ullint *address_as_ull = (ullint*)(address);
  ullint old = *address_as_ull, assumed;
  do {
    assumed = old;
    old     = atomicCAS(address_as_ull, assumed, (ullint)(val ^ (llint)assumed));
  } while (assumed != old);
  return (llint)old;
}

template<typename Type> struct AtomicBAND {__device__ Type operator() (Type& x,Type y) const {return atomicAnd(&x,y);}};
template<typename Type> struct AtomicBOR  {__device__ Type operator() (Type& x,Type y) const {return atomicOr (&x,y);}};
template<typename Type> struct AtomicBXOR {__device__ Type operator() (Type& x,Type y) const {return atomicXor(&x,y);}};

/*
  Atomic logical operations:

  Like CUDA, HIP has no atomic logical operations at all. We support them on integer types.
*/

/* A template without a definition makes any instantiation not using the given specializations
   erroneous at compile time, which is what we want since we only support 32-bit and 64-bit integers.
 */
template<typename Type,class Op,int size/* sizeof(Type) */> struct AtomicLogical;
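
/* For example, AtomicLogical<int,land<int>,4> and AtomicLogical<llint,lor<llint>,8> pick up the
   specializations below, while a hypothetical AtomicLogical<short,land<short>,2> (not used in this
   file) would fail to compile, since only sizes 4 and 8 are defined. */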

template<typename Type,class Op>
struct AtomicLogical<Type,Op,4> {
  __device__ Type operator()(Type& x,Type y) const {
    int *address_as_int = (int*)(&x);
    int old = *address_as_int, assumed;
    Op op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_int, assumed, (int)(op((Type)assumed,y)));
    } while (assumed != old);
    return (Type)old;
  }
};

template<typename Type,class Op>
struct AtomicLogical<Type,Op,8> {
  __device__ Type operator()(Type& x,Type y) const {
    ullint *address_as_ull = (ullint*)(&x);
    ullint old = *address_as_ull, assumed;
    Op op;
    do {
      assumed = old;
      old     = atomicCAS(address_as_ull, assumed, (ullint)(op((Type)assumed,y)));
    } while (assumed != old);
    return (Type)old;
  }
};

/* Note that land/lor/lxor below are different from LAND etc. above: here we pass arguments by value and return the result of the op (not the old value) */
template<typename Type> struct land {__device__ Type operator()(Type x, Type y) {return x && y;}};
template<typename Type> struct lor  {__device__ Type operator()(Type x, Type y) {return x || y;}};
template<typename Type> struct lxor {__device__ Type operator()(Type x, Type y) {return (!x != !y);}};

template<typename Type> struct AtomicLAND {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,land<Type>,sizeof(Type)> op; return op(x,y);}};
template<typename Type> struct AtomicLOR  {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lor<Type> ,sizeof(Type)> op; return op(x,y);}};
template<typename Type> struct AtomicLXOR {__device__ Type operator()(Type& x,Type y) const {AtomicLogical<Type,lxor<Type>,sizeof(Type)> op; return op(x,y);}};

/*====================================================================================*/
/*  Wrapper functions of HIP kernels. Function pointers are stored in 'link'          */
/*====================================================================================*/
template<typename Type,PetscInt BS,PetscInt EQ>
static PetscErrorCode Pack(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *data,void *buf)
{
  hipError_t         err;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  const PetscInt     *iarray=opt ? opt->array : NULL;

  if (!count) return(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_Pack<Type,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(const Type*)data,(Type*)buf);
  err = hipGetLastError();CHKERRHIP(err);
  return(0);
}
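
/* Launch arithmetic, sketched with made-up numbers: for count = 1,000,000 and nthreads = 256,
   nblocks starts as (1000000+255)/256 = 3907; on a device whose maxResidentThreadsPerGPU is, say,
   81920, it is then clamped to 81920/256 = 320 blocks, and the grid-stride loop in d_Pack covers
   the remaining elements. The wrappers below all follow the same pattern. */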

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode UnpackAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,const void *buf)
{
  hipError_t         cerr;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  const PetscInt     *iarray=opt ? opt->array : NULL;

  if (!count) return(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_UnpackAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(Type*)data,(const Type*)buf);
  cerr = hipGetLastError();CHKERRHIP(cerr);
  return(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode FetchAndOp(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *data,void *buf)
{
  hipError_t         cerr;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  const PetscInt     *iarray=opt ? opt->array : NULL;

  if (!count) return(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,start,iarray,idx,(Type*)data,(Type*)buf);
  cerr = hipGetLastError();CHKERRHIP(cerr);
  return(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode ScatterAndOp(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
{
  hipError_t         cerr;
  PetscInt           nthreads=256;
  PetscInt           nblocks=(count+nthreads-1)/nthreads;
  PetscInt           srcx=0,srcy=0,srcX=0,srcY=0,dstx=0,dsty=0,dstX=0,dstY=0;

  if (!count) return(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);

  /* The 3D shape of the source subdomain may differ from that of the destination, which makes it difficult to use HIP 3D grids and blocks */
  if (srcOpt)       {srcx = srcOpt->dx[0]; srcy = srcOpt->dy[0]; srcX = srcOpt->X[0]; srcY = srcOpt->Y[0]; srcStart = srcOpt->start[0]; srcIdx = NULL;}
  else if (!srcIdx) {srcx = srcX = count; srcy = srcY = 1;}

  if (dstOpt)       {dstx = dstOpt->dx[0]; dsty = dstOpt->dy[0]; dstX = dstOpt->X[0]; dstY = dstOpt->Y[0]; dstStart = dstOpt->start[0]; dstIdx = NULL;}
  else if (!dstIdx) {dstx = dstX = count; dsty = dstY = 1;}

  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_ScatterAndOp<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,srcx,srcy,srcX,srcY,srcStart,srcIdx,(const Type*)src,dstx,dsty,dstX,dstY,dstStart,dstIdx,(Type*)dst);
  cerr = hipGetLastError();CHKERRHIP(cerr);
  return(0);
}

/* Specialization for Insert since we may use hipMemcpyAsync */
template<typename Type,PetscInt BS,PetscInt EQ>
static PetscErrorCode ScatterAndInsert(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst)
{
  PetscErrorCode    ierr;
  hipError_t        cerr;

  if (!count) return(0);
  /* src and dst are contiguous */
  if ((!srcOpt && !srcIdx) && (!dstOpt && !dstIdx) && src != dst) {
    cerr = hipMemcpyAsync((Type*)dst+dstStart*link->bs,(const Type*)src+srcStart*link->bs,count*link->unitbytes,hipMemcpyDeviceToDevice,link->stream);CHKERRHIP(cerr);
  } else {
    ierr = ScatterAndOp<Type,Insert<Type>,BS,EQ>(link,count,srcStart,srcOpt,srcIdx,src,dstStart,dstOpt,dstIdx,dst);CHKERRQ(ierr);
  }
  return(0);
}

template<typename Type,class Op,PetscInt BS,PetscInt EQ>
static PetscErrorCode FetchAndOpLocal(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate)
{
  hipError_t        cerr;
  PetscInt          nthreads=256;
  PetscInt          nblocks=(count+nthreads-1)/nthreads;
  const PetscInt    *rarray = rootopt ? rootopt->array : NULL;
  const PetscInt    *larray = leafopt ? leafopt->array : NULL;

  if (!count) return(0);
  nblocks = PetscMin(nblocks,link->maxResidentThreadsPerGPU/nthreads);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(d_FetchAndOpLocal<Type,Op,BS,EQ>), dim3(nblocks), dim3(nthreads), 0, link->stream, link->bs,count,rootstart,rarray,rootidx,(Type*)rootdata,leafstart,larray,leafidx,(const Type*)leafdata,(Type*)leafupdate);
  cerr = hipGetLastError();CHKERRHIP(cerr);
  return(0);
}

/*====================================================================================*/
/*  Init various types and instantiate pack/unpack function pointers                  */
/*====================================================================================*/
template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_RealType(PetscSFLink link)
{
  /* Pack/unpack for remote communication */
  link->d_Pack              = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert   = UnpackAndOp     <Type,Insert<Type>      ,BS,EQ>;
  link->d_UnpackAndAdd      = UnpackAndOp     <Type,Add<Type>         ,BS,EQ>;
  link->d_UnpackAndMult     = UnpackAndOp     <Type,Mult<Type>        ,BS,EQ>;
  link->d_UnpackAndMin      = UnpackAndOp     <Type,Min<Type>         ,BS,EQ>;
  link->d_UnpackAndMax      = UnpackAndOp     <Type,Max<Type>         ,BS,EQ>;
  link->d_FetchAndAdd       = FetchAndOp      <Type,Add<Type>         ,BS,EQ>;

  /* Scatter for local communication */
  link->d_ScatterAndInsert  = ScatterAndInsert<Type                   ,BS,EQ>; /* Has special optimizations */
  link->d_ScatterAndAdd     = ScatterAndOp    <Type,Add<Type>         ,BS,EQ>;
  link->d_ScatterAndMult    = ScatterAndOp    <Type,Mult<Type>        ,BS,EQ>;
  link->d_ScatterAndMin     = ScatterAndOp    <Type,Min<Type>         ,BS,EQ>;
  link->d_ScatterAndMax     = ScatterAndOp    <Type,Max<Type>         ,BS,EQ>;
  link->d_FetchAndAddLocal  = FetchAndOpLocal <Type,Add<Type>         ,BS,EQ>;

  /* Atomic versions when there are data-race possibilities */
  link->da_UnpackAndInsert  = UnpackAndOp     <Type,AtomicInsert<Type>,BS,EQ>;
  link->da_UnpackAndAdd     = UnpackAndOp     <Type,AtomicAdd<Type>   ,BS,EQ>;
  link->da_UnpackAndMult    = UnpackAndOp     <Type,AtomicMult<Type>  ,BS,EQ>;
  link->da_UnpackAndMin     = UnpackAndOp     <Type,AtomicMin<Type>   ,BS,EQ>;
  link->da_UnpackAndMax     = UnpackAndOp     <Type,AtomicMax<Type>   ,BS,EQ>;
  link->da_FetchAndAdd      = FetchAndOp      <Type,AtomicAdd<Type>   ,BS,EQ>;

  link->da_ScatterAndInsert = ScatterAndOp    <Type,AtomicInsert<Type>,BS,EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp    <Type,AtomicAdd<Type>   ,BS,EQ>;
  link->da_ScatterAndMult   = ScatterAndOp    <Type,AtomicMult<Type>  ,BS,EQ>;
  link->da_ScatterAndMin    = ScatterAndOp    <Type,AtomicMin<Type>   ,BS,EQ>;
  link->da_ScatterAndMax    = ScatterAndOp    <Type,AtomicMax<Type>   ,BS,EQ>;
  link->da_FetchAndAddLocal = FetchAndOpLocal <Type,AtomicAdd<Type>   ,BS,EQ>;
}
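
/* Dispatch sketch (hedged; the exact call chain lives in the SF core, not in this file): a
   PetscSFReduceBegin/End with op = MPI_SUM on PetscReal data ends up calling link->d_UnpackAndAdd,
   or link->da_UnpackAndAdd when several leaves may update the same root concurrently and atomics
   are required. */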

/* We use this templated class so that we can specialize it for char integers */
template<typename Type,PetscInt BS,PetscInt EQ,PetscInt size/*sizeof(Type)*/>
struct PackInit_IntegerType_Atomic {
  static void Init(PetscSFLink link)
  {
    link->da_UnpackAndInsert  = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
    link->da_UnpackAndAdd     = UnpackAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
    link->da_UnpackAndMult    = UnpackAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
    link->da_UnpackAndMin     = UnpackAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
    link->da_UnpackAndMax     = UnpackAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
    link->da_UnpackAndLAND    = UnpackAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
    link->da_UnpackAndLOR     = UnpackAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
    link->da_UnpackAndLXOR    = UnpackAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
    link->da_UnpackAndBAND    = UnpackAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
    link->da_UnpackAndBOR     = UnpackAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
    link->da_UnpackAndBXOR    = UnpackAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
    link->da_FetchAndAdd      = FetchAndOp <Type,AtomicAdd<Type>   ,BS,EQ>;

    link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
    link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>   ,BS,EQ>;
    link->da_ScatterAndMult   = ScatterAndOp<Type,AtomicMult<Type>  ,BS,EQ>;
    link->da_ScatterAndMin    = ScatterAndOp<Type,AtomicMin<Type>   ,BS,EQ>;
    link->da_ScatterAndMax    = ScatterAndOp<Type,AtomicMax<Type>   ,BS,EQ>;
    link->da_ScatterAndLAND   = ScatterAndOp<Type,AtomicLAND<Type>  ,BS,EQ>;
    link->da_ScatterAndLOR    = ScatterAndOp<Type,AtomicLOR<Type>   ,BS,EQ>;
    link->da_ScatterAndLXOR   = ScatterAndOp<Type,AtomicLXOR<Type>  ,BS,EQ>;
    link->da_ScatterAndBAND   = ScatterAndOp<Type,AtomicBAND<Type>  ,BS,EQ>;
    link->da_ScatterAndBOR    = ScatterAndOp<Type,AtomicBOR<Type>   ,BS,EQ>;
    link->da_ScatterAndBXOR   = ScatterAndOp<Type,AtomicBXOR<Type>  ,BS,EQ>;
    link->da_FetchAndAddLocal = FetchAndOpLocal<Type,AtomicAdd<Type>,BS,EQ>;
  }
};

/* See the CUDA version */
template<typename Type,PetscInt BS,PetscInt EQ>
struct PackInit_IntegerType_Atomic<Type,BS,EQ,1> {
  static void Init(PetscSFLink link) {/* Intentionally empty, leaving these function pointers NULL: there is no 8-bit atomicCAS to build char atomics on */}
};

template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_IntegerType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_UnpackAndAdd    = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_UnpackAndMult   = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_UnpackAndMin    = UnpackAndOp<Type,Min<Type>   ,BS,EQ>;
  link->d_UnpackAndMax    = UnpackAndOp<Type,Max<Type>   ,BS,EQ>;
  link->d_UnpackAndLAND   = UnpackAndOp<Type,LAND<Type>  ,BS,EQ>;
  link->d_UnpackAndLOR    = UnpackAndOp<Type,LOR<Type>   ,BS,EQ>;
  link->d_UnpackAndLXOR   = UnpackAndOp<Type,LXOR<Type>  ,BS,EQ>;
  link->d_UnpackAndBAND   = UnpackAndOp<Type,BAND<Type>  ,BS,EQ>;
  link->d_UnpackAndBOR    = UnpackAndOp<Type,BOR<Type>   ,BS,EQ>;
  link->d_UnpackAndBXOR   = UnpackAndOp<Type,BXOR<Type>  ,BS,EQ>;
  link->d_FetchAndAdd     = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_ScatterAndMin    = ScatterAndOp<Type,Min<Type>   ,BS,EQ>;
  link->d_ScatterAndMax    = ScatterAndOp<Type,Max<Type>   ,BS,EQ>;
  link->d_ScatterAndLAND   = ScatterAndOp<Type,LAND<Type>  ,BS,EQ>;
  link->d_ScatterAndLOR    = ScatterAndOp<Type,LOR<Type>   ,BS,EQ>;
  link->d_ScatterAndLXOR   = ScatterAndOp<Type,LXOR<Type>  ,BS,EQ>;
  link->d_ScatterAndBAND   = ScatterAndOp<Type,BAND<Type>  ,BS,EQ>;
  link->d_ScatterAndBOR    = ScatterAndOp<Type,BOR<Type>   ,BS,EQ>;
  link->d_ScatterAndBXOR   = ScatterAndOp<Type,BXOR<Type>  ,BS,EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;
  PackInit_IntegerType_Atomic<Type,BS,EQ,sizeof(Type)>::Init(link);
}

#if defined(PETSC_HAVE_COMPLEX)
template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_ComplexType(PetscSFLink link)
{
  link->d_Pack             = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_UnpackAndAdd     = UnpackAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_UnpackAndMult    = UnpackAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_FetchAndAdd      = FetchAndOp <Type,Add<Type>   ,BS,EQ>;

  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  link->d_ScatterAndAdd    = ScatterAndOp<Type,Add<Type>   ,BS,EQ>;
  link->d_ScatterAndMult   = ScatterAndOp<Type,Mult<Type>  ,BS,EQ>;
  link->d_FetchAndAddLocal = FetchAndOpLocal<Type,Add<Type>,BS,EQ>;

  link->da_UnpackAndInsert = UnpackAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_UnpackAndAdd    = UnpackAndOp<Type,AtomicAdd<Type>,BS,EQ>;
  link->da_UnpackAndMult   = NULL; /* Not implemented yet */
  link->da_FetchAndAdd     = NULL; /* The return value of atomicAdd on complex is not atomic */

  link->da_ScatterAndInsert = ScatterAndOp<Type,AtomicInsert<Type>,BS,EQ>;
  link->da_ScatterAndAdd    = ScatterAndOp<Type,AtomicAdd<Type>,BS,EQ>;
}
#endif

typedef signed char                      SignedChar;
typedef unsigned char                    UnsignedChar;
typedef struct {int a;      int b;     } PairInt;
typedef struct {PetscInt a; PetscInt b;} PairPetscInt;
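
/* Following MPI's MINLOC/MAXLOC convention, .a holds the value being compared and .b its location
   (e.g., a rank or an index). For instance, reducing {5,3} with {5,1} under Maxloc yields {5,1},
   since equal values keep the smaller location. */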

template<typename Type>
static void PackInit_PairType(PetscSFLink link)
{
  link->d_Pack            = Pack<Type,1,1>;
  link->d_UnpackAndInsert = UnpackAndOp<Type,Insert<Type>,1,1>;
  link->d_UnpackAndMaxloc = UnpackAndOp<Type,Maxloc<Type>,1,1>;
  link->d_UnpackAndMinloc = UnpackAndOp<Type,Minloc<Type>,1,1>;

  link->d_ScatterAndInsert = ScatterAndOp<Type,Insert<Type>,1,1>;
  link->d_ScatterAndMaxloc = ScatterAndOp<Type,Maxloc<Type>,1,1>;
  link->d_ScatterAndMinloc = ScatterAndOp<Type,Minloc<Type>,1,1>;
  /* Atomics for pair types are not implemented yet */
}

template<typename Type,PetscInt BS,PetscInt EQ>
static void PackInit_DumbType(PetscSFLink link)
{
  link->d_Pack             = Pack<Type,BS,EQ>;
  link->d_UnpackAndInsert  = UnpackAndOp<Type,Insert<Type>,BS,EQ>;
  link->d_ScatterAndInsert = ScatterAndInsert<Type,BS,EQ>;
  /* Atomics for dumb types are not implemented yet */
}

/* Some device-specific utilities */
static PetscErrorCode PetscSFLinkSyncDevice_HIP(PetscSFLink link)
{
  hipError_t cerr;
  cerr = hipDeviceSynchronize();CHKERRHIP(cerr);
  return(0);
}

static PetscErrorCode PetscSFLinkSyncStream_HIP(PetscSFLink link)
{
  hipError_t cerr;
  cerr = hipStreamSynchronize(link->stream);CHKERRHIP(cerr);
  return(0);
}

static PetscErrorCode PetscSFLinkMemcpy_HIP(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void *src,size_t n)
{
  enum hipMemcpyKind kinds[2][2] = {{hipMemcpyHostToHost,hipMemcpyHostToDevice},{hipMemcpyDeviceToHost,hipMemcpyDeviceToDevice}};

  if (n) {
    if (PetscMemTypeHost(dstmtype) && PetscMemTypeHost(srcmtype)) { /* Handle HostToHost separately so that pure-CPU code won't call the HIP runtime */
      PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);
    } else {
      int stype = PetscMemTypeDevice(srcmtype) ? 1 : 0;
      int dtype = PetscMemTypeDevice(dstmtype) ? 1 : 0;
      hipError_t cerr = hipMemcpyAsync(dst,src,n,kinds[stype][dtype],link->stream);CHKERRHIP(cerr);
    }
  }
  return(0);
}

PetscErrorCode PetscSFMalloc_HIP(PetscMemType mtype,size_t size,void** ptr)
{
  if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscMalloc(size,ptr);CHKERRQ(ierr);}
  else if (PetscMemTypeDevice(mtype)) {
    if (!PetscHIPInitialized) {PetscErrorCode ierr = PetscHIPInitializeCheck();CHKERRQ(ierr);}
    hipError_t err = hipMalloc(ptr,size);CHKERRHIP(err);
  } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  return(0);
}

PetscErrorCode PetscSFFree_HIP(PetscMemType mtype,void* ptr)
{
  if (PetscMemTypeHost(mtype)) {PetscErrorCode ierr = PetscFree(ptr);CHKERRQ(ierr);}
  else if (PetscMemTypeDevice(mtype)) {hipError_t err = hipFree(ptr);CHKERRHIP(err);}
  else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
  return(0);
}

/* Destructor when the link uses MPI for communication on a HIP device */
static PetscErrorCode PetscSFLinkDestroy_MPI_HIP(PetscSF sf,PetscSFLink link)
{
  hipError_t    cerr;

  for (int i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    cerr = hipFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRHIP(cerr);
    cerr = hipFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRHIP(cerr);
  }
  return(0);
}

/*====================================================================================*/
/*                Main driver to init MPI datatype on device                          */
/*====================================================================================*/

/* Some fields of link are initialized by PetscSFPackSetUp_Host. This routine only does what is needed on the device */
PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  hipError_t     cerr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  if (link->deviceinited) return(0);
  MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);
  MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction, per the MPI standard */
  MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);
  MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
  MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
#if defined(PETSC_HAVE_COMPLEX)
  MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
#endif
  MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
  MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);

  if (is2Int) {
    PackInit_PairType<PairInt>(link);
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType<PairPetscInt>(link);
  } else if (nPetscReal) {
    if      (nPetscReal == 8) PackInit_RealType<PetscReal,8,1>(link); else if (nPetscReal%8 == 0) PackInit_RealType<PetscReal,8,0>(link);
    else if (nPetscReal == 4) PackInit_RealType<PetscReal,4,1>(link); else if (nPetscReal%4 == 0) PackInit_RealType<PetscReal,4,0>(link);
    else if (nPetscReal == 2) PackInit_RealType<PetscReal,2,1>(link); else if (nPetscReal%2 == 0) PackInit_RealType<PetscReal,2,0>(link);
    else if (nPetscReal == 1) PackInit_RealType<PetscReal,1,1>(link); else if (nPetscReal%1 == 0) PackInit_RealType<PetscReal,1,0>(link);
  } else if (nPetscInt && sizeof(PetscInt) == sizeof(llint)) {
    if      (nPetscInt == 8) PackInit_IntegerType<llint,8,1>(link); else if (nPetscInt%8 == 0) PackInit_IntegerType<llint,8,0>(link);
    else if (nPetscInt == 4) PackInit_IntegerType<llint,4,1>(link); else if (nPetscInt%4 == 0) PackInit_IntegerType<llint,4,0>(link);
    else if (nPetscInt == 2) PackInit_IntegerType<llint,2,1>(link); else if (nPetscInt%2 == 0) PackInit_IntegerType<llint,2,0>(link);
    else if (nPetscInt == 1) PackInit_IntegerType<llint,1,1>(link); else if (nPetscInt%1 == 0) PackInit_IntegerType<llint,1,0>(link);
  } else if (nInt) {
    if      (nInt == 8) PackInit_IntegerType<int,8,1>(link); else if (nInt%8 == 0) PackInit_IntegerType<int,8,0>(link);
    else if (nInt == 4) PackInit_IntegerType<int,4,1>(link); else if (nInt%4 == 0) PackInit_IntegerType<int,4,0>(link);
    else if (nInt == 2) PackInit_IntegerType<int,2,1>(link); else if (nInt%2 == 0) PackInit_IntegerType<int,2,0>(link);
    else if (nInt == 1) PackInit_IntegerType<int,1,1>(link); else if (nInt%1 == 0) PackInit_IntegerType<int,1,0>(link);
  } else if (nSignedChar) {
    if      (nSignedChar == 8) PackInit_IntegerType<SignedChar,8,1>(link); else if (nSignedChar%8 == 0) PackInit_IntegerType<SignedChar,8,0>(link);
    else if (nSignedChar == 4) PackInit_IntegerType<SignedChar,4,1>(link); else if (nSignedChar%4 == 0) PackInit_IntegerType<SignedChar,4,0>(link);
    else if (nSignedChar == 2) PackInit_IntegerType<SignedChar,2,1>(link); else if (nSignedChar%2 == 0) PackInit_IntegerType<SignedChar,2,0>(link);
    else if (nSignedChar == 1) PackInit_IntegerType<SignedChar,1,1>(link); else if (nSignedChar%1 == 0) PackInit_IntegerType<SignedChar,1,0>(link);
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8) PackInit_IntegerType<UnsignedChar,8,1>(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType<UnsignedChar,8,0>(link);
    else if (nUnsignedChar == 4) PackInit_IntegerType<UnsignedChar,4,1>(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType<UnsignedChar,4,0>(link);
    else if (nUnsignedChar == 2) PackInit_IntegerType<UnsignedChar,2,1>(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType<UnsignedChar,2,0>(link);
    else if (nUnsignedChar == 1) PackInit_IntegerType<UnsignedChar,1,1>(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType<UnsignedChar,1,0>(link);
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8) PackInit_ComplexType<PetscComplex,8,1>(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType<PetscComplex,8,0>(link);
    else if (nPetscComplex == 4) PackInit_ComplexType<PetscComplex,4,1>(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType<PetscComplex,4,0>(link);
    else if (nPetscComplex == 2) PackInit_ComplexType<PetscComplex,2,1>(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType<PetscComplex,2,0>(link);
    else if (nPetscComplex == 1) PackInit_ComplexType<PetscComplex,1,1>(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType<PetscComplex,1,0>(link);
#endif
  } else {
    MPI_Aint lb,nbyte;
    MPI_Type_get_extent(unit,&lb,&nbyte);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
      if      (nbyte == 4) PackInit_DumbType<char,4,1>(link); else if (nbyte%4 == 0) PackInit_DumbType<char,4,0>(link);
      else if (nbyte == 2) PackInit_DumbType<char,2,1>(link); else if (nbyte%2 == 0) PackInit_DumbType<char,2,0>(link);
      else if (nbyte == 1) PackInit_DumbType<char,1,1>(link); else if (nbyte%1 == 0) PackInit_DumbType<char,1,0>(link);
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8) PackInit_DumbType<int,8,1>(link); else if (nInt%8 == 0) PackInit_DumbType<int,8,0>(link);
      else if (nInt == 4) PackInit_DumbType<int,4,1>(link); else if (nInt%4 == 0) PackInit_DumbType<int,4,0>(link);
      else if (nInt == 2) PackInit_DumbType<int,2,1>(link); else if (nInt%2 == 0) PackInit_DumbType<int,2,0>(link);
      else if (nInt == 1) PackInit_DumbType<int,1,1>(link); else if (nInt%1 == 0) PackInit_DumbType<int,1,0>(link);
    }
  }
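
  /* A dispatch example with a made-up unit: if <unit> is a contiguous MPI datatype of 16 MPIU_REALs,
     then nPetscReal = 16 above; 16 != 8 but 16 % 8 == 0, so PackInit_RealType<PetscReal,8,0>(link)
     is chosen, i.e., Type = PetscReal, BS = 8, EQ = 0, and the kernels run with bs = 16 at run time. */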

  if (!sf->maxResidentThreadsPerGPU) { /* Not initialized */
    int                    device;
    struct hipDeviceProp_t props;
    cerr = hipGetDevice(&device);CHKERRHIP(cerr);
    cerr = hipGetDeviceProperties(&props,device);CHKERRHIP(cerr);
    sf->maxResidentThreadsPerGPU = props.maxThreadsPerMultiProcessor*props.multiProcessorCount;
  }
  link->maxResidentThreadsPerGPU = sf->maxResidentThreadsPerGPU;

  link->stream       = PetscDefaultHipStream;
  link->Destroy      = PetscSFLinkDestroy_MPI_HIP;
  link->SyncDevice   = PetscSFLinkSyncDevice_HIP;
  link->SyncStream   = PetscSFLinkSyncStream_HIP;
  link->Memcpy       = PetscSFLinkMemcpy_HIP;
  link->deviceinited = PETSC_TRUE;
  return(0);
}