Actual source code: sbaij2.c
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <../src/mat/impls/dense/seq/dense.h>
4: #include <../src/mat/impls/sbaij/seq/sbaij.h>
5: #include <petsc/private/kernels/blockinvert.h>
6: #include <petscbt.h>
7: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqSBAIJ(Mat A, PetscInt is_max, IS is[], PetscInt ov)
10: {
11: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
12: PetscInt brow, i, j, k, l, mbs, n, *nidx, isz, bcol, bcol_max, start, end, *ai, *aj, bs, *nidx2;
13: const PetscInt *idx;
14: PetscBT table_out, table_in;
17: mbs = a->mbs;
18: ai = a->i;
19: aj = a->j;
20: bs = A->rmap->bs;
21: PetscBTCreate(mbs, &table_out);
22: PetscMalloc1(mbs + 1, &nidx);
23: PetscMalloc1(A->rmap->N + 1, &nidx2);
24: PetscBTCreate(mbs, &table_in);
26: for (i = 0; i < is_max; i++) { /* for each is */
27: isz = 0;
28: PetscBTMemzero(mbs, table_out);
30: /* Extract the indices, assume there can be duplicate entries */
31: ISGetIndices(is[i], &idx);
32: ISGetLocalSize(is[i], &n);
34: /* Enter these into the temp arrays i.e mark table_out[brow], enter brow into new index */
35: bcol_max = 0;
36: for (j = 0; j < n; ++j) {
37: brow = idx[j] / bs; /* convert the indices into block indices */
39: if (!PetscBTLookupSet(table_out, brow)) {
40: nidx[isz++] = brow;
41: if (bcol_max < brow) bcol_max = brow;
42: }
43: }
44: ISRestoreIndices(is[i], &idx);
45: ISDestroy(&is[i]);
47: k = 0;
48: for (j = 0; j < ov; j++) { /* for each overlap */
49: /* set table_in for lookup - only mark entries that are added onto nidx in (j-1)-th overlap */
50: PetscBTMemzero(mbs, table_in);
51: for (l = k; l < isz; l++) PetscBTSet(table_in, nidx[l]);
53: n = isz; /* length of the updated is[i] */
54: for (brow = 0; brow < mbs; brow++) {
55: start = ai[brow];
56: end = ai[brow + 1];
57: if (PetscBTLookup(table_in, brow)) { /* brow is on nidx - row search: collect all bcol in this brow */
58: for (l = start; l < end; l++) {
59: bcol = aj[l];
60: if (!PetscBTLookupSet(table_out, bcol)) {
61: nidx[isz++] = bcol;
62: if (bcol_max < bcol) bcol_max = bcol;
63: }
64: }
65: k++;
66: if (k >= n) break; /* for (brow=0; brow<mbs; brow++) */
67: } else { /* brow is not on nidx - col serach: add brow onto nidx if there is a bcol in nidx */
68: for (l = start; l < end; l++) {
69: bcol = aj[l];
70: if (bcol > bcol_max) break;
71: if (PetscBTLookup(table_in, bcol)) {
72: if (!PetscBTLookupSet(table_out, brow)) nidx[isz++] = brow;
73: break; /* for l = start; l<end ; l++) */
74: }
75: }
76: }
77: }
78: } /* for each overlap */
80: /* expand the Index Set */
81: for (j = 0; j < isz; j++) {
82: for (k = 0; k < bs; k++) nidx2[j * bs + k] = nidx[j] * bs + k;
83: }
84: ISCreateGeneral(PETSC_COMM_SELF, isz * bs, nidx2, PETSC_COPY_VALUES, is + i);
85: }
86: PetscBTDestroy(&table_out);
87: PetscFree(nidx);
88: PetscFree(nidx2);
89: PetscBTDestroy(&table_in);
90: return 0;
91: }
93: /* Bseq is non-symmetric SBAIJ matrix, only used internally by PETSc.
94: Zero some ops' to avoid invalid usse */
95: PetscErrorCode MatSeqSBAIJZeroOps_Private(Mat Bseq)
96: {
97: MatSetOption(Bseq, MAT_SYMMETRIC, PETSC_FALSE);
98: Bseq->ops->mult = NULL;
99: Bseq->ops->multadd = NULL;
100: Bseq->ops->multtranspose = NULL;
101: Bseq->ops->multtransposeadd = NULL;
102: Bseq->ops->lufactor = NULL;
103: Bseq->ops->choleskyfactor = NULL;
104: Bseq->ops->lufactorsymbolic = NULL;
105: Bseq->ops->choleskyfactorsymbolic = NULL;
106: Bseq->ops->getinertia = NULL;
107: return 0;
108: }
110: /* same as MatCreateSubMatrices_SeqBAIJ(), except cast Mat_SeqSBAIJ */
111: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ_Private(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
112: {
113: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *c;
114: PetscInt *smap, i, k, kstart, kend, oldcols = a->nbs, *lens;
115: PetscInt row, mat_i, *mat_j, tcol, *mat_ilen;
116: const PetscInt *irow, *icol;
117: PetscInt nrows, ncols, *ssmap, bs = A->rmap->bs, bs2 = a->bs2;
118: PetscInt *aj = a->j, *ai = a->i;
119: MatScalar *mat_a;
120: Mat C;
121: PetscBool flag;
124: ISGetIndices(isrow, &irow);
125: ISGetIndices(iscol, &icol);
126: ISGetLocalSize(isrow, &nrows);
127: ISGetLocalSize(iscol, &ncols);
129: PetscCalloc1(1 + oldcols, &smap);
130: ssmap = smap;
131: PetscMalloc1(1 + nrows, &lens);
132: for (i = 0; i < ncols; i++) smap[icol[i]] = i + 1;
133: /* determine lens of each row */
134: for (i = 0; i < nrows; i++) {
135: kstart = ai[irow[i]];
136: kend = kstart + a->ilen[irow[i]];
137: lens[i] = 0;
138: for (k = kstart; k < kend; k++) {
139: if (ssmap[aj[k]]) lens[i]++;
140: }
141: }
142: /* Create and fill new matrix */
143: if (scall == MAT_REUSE_MATRIX) {
144: c = (Mat_SeqSBAIJ *)((*B)->data);
147: PetscArraycmp(c->ilen, lens, c->mbs, &flag);
149: PetscArrayzero(c->ilen, c->mbs);
150: C = *B;
151: } else {
152: MatCreate(PetscObjectComm((PetscObject)A), &C);
153: MatSetSizes(C, nrows * bs, ncols * bs, PETSC_DETERMINE, PETSC_DETERMINE);
154: MatSetType(C, ((PetscObject)A)->type_name);
155: MatSeqSBAIJSetPreallocation(C, bs, 0, lens);
156: }
157: c = (Mat_SeqSBAIJ *)(C->data);
158: for (i = 0; i < nrows; i++) {
159: row = irow[i];
160: kstart = ai[row];
161: kend = kstart + a->ilen[row];
162: mat_i = c->i[i];
163: mat_j = c->j + mat_i;
164: mat_a = c->a + mat_i * bs2;
165: mat_ilen = c->ilen + i;
166: for (k = kstart; k < kend; k++) {
167: if ((tcol = ssmap[a->j[k]])) {
168: *mat_j++ = tcol - 1;
169: PetscArraycpy(mat_a, a->a + k * bs2, bs2);
170: mat_a += bs2;
171: (*mat_ilen)++;
172: }
173: }
174: }
175: /* sort */
176: {
177: MatScalar *work;
179: PetscMalloc1(bs2, &work);
180: for (i = 0; i < nrows; i++) {
181: PetscInt ilen;
182: mat_i = c->i[i];
183: mat_j = c->j + mat_i;
184: mat_a = c->a + mat_i * bs2;
185: ilen = c->ilen[i];
186: PetscSortIntWithDataArray(ilen, mat_j, mat_a, bs2 * sizeof(MatScalar), work);
187: }
188: PetscFree(work);
189: }
191: /* Free work space */
192: ISRestoreIndices(iscol, &icol);
193: PetscFree(smap);
194: PetscFree(lens);
195: MatAssemblyBegin(C, MAT_FINAL_ASSEMBLY);
196: MatAssemblyEnd(C, MAT_FINAL_ASSEMBLY);
198: ISRestoreIndices(isrow, &irow);
199: *B = C;
200: return 0;
201: }
203: PetscErrorCode MatCreateSubMatrix_SeqSBAIJ(Mat A, IS isrow, IS iscol, MatReuse scall, Mat *B)
204: {
205: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
206: IS is1, is2;
207: PetscInt *vary, *iary, nrows, ncols, i, bs = A->rmap->bs, count, maxmnbs;
208: const PetscInt *irow, *icol;
210: ISGetIndices(isrow, &irow);
211: ISGetIndices(iscol, &icol);
212: ISGetLocalSize(isrow, &nrows);
213: ISGetLocalSize(iscol, &ncols);
215: /* Verify if the indices corespond to each element in a block
216: and form the IS with compressed IS */
217: maxmnbs = PetscMax(a->mbs, a->nbs);
218: PetscMalloc2(maxmnbs, &vary, maxmnbs, &iary);
219: PetscArrayzero(vary, a->mbs);
220: for (i = 0; i < nrows; i++) vary[irow[i] / bs]++;
222: count = 0;
223: for (i = 0; i < nrows; i++) {
224: PetscInt j = irow[i] / bs;
225: if ((vary[j]--) == bs) iary[count++] = j;
226: }
227: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is1);
229: PetscArrayzero(vary, a->nbs);
230: for (i = 0; i < ncols; i++) vary[icol[i] / bs]++;
232: count = 0;
233: for (i = 0; i < ncols; i++) {
234: PetscInt j = icol[i] / bs;
235: if ((vary[j]--) == bs) iary[count++] = j;
236: }
237: ISCreateGeneral(PETSC_COMM_SELF, count, iary, PETSC_COPY_VALUES, &is2);
238: ISRestoreIndices(isrow, &irow);
239: ISRestoreIndices(iscol, &icol);
240: PetscFree2(vary, iary);
242: MatCreateSubMatrix_SeqSBAIJ_Private(A, is1, is2, scall, B);
243: ISDestroy(&is1);
244: ISDestroy(&is2);
246: if (isrow != iscol) {
247: PetscBool isequal;
248: ISEqual(isrow, iscol, &isequal);
249: if (!isequal) MatSeqSBAIJZeroOps_Private(*B);
250: }
251: return 0;
252: }
254: PetscErrorCode MatCreateSubMatrices_SeqSBAIJ(Mat A, PetscInt n, const IS irow[], const IS icol[], MatReuse scall, Mat *B[])
255: {
256: PetscInt i;
258: if (scall == MAT_INITIAL_MATRIX) PetscCalloc1(n + 1, B);
260: for (i = 0; i < n; i++) MatCreateSubMatrix_SeqSBAIJ(A, irow[i], icol[i], scall, &(*B)[i]);
261: return 0;
262: }
264: /* -------------------------------------------------------*/
265: /* Should check that shapes of vectors and matrices match */
266: /* -------------------------------------------------------*/
268: PetscErrorCode MatMult_SeqSBAIJ_2(Mat A, Vec xx, Vec zz)
269: {
270: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
271: PetscScalar *z, x1, x2, zero = 0.0;
272: const PetscScalar *x, *xb;
273: const MatScalar *v;
274: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
275: const PetscInt *aj = a->j, *ai = a->i, *ib;
276: PetscInt nonzerorow = 0;
278: VecSet(zz, zero);
279: if (!a->nz) return 0;
280: VecGetArrayRead(xx, &x);
281: VecGetArray(zz, &z);
283: v = a->a;
284: xb = x;
286: for (i = 0; i < mbs; i++) {
287: n = ai[1] - ai[0]; /* length of i_th block row of A */
288: x1 = xb[0];
289: x2 = xb[1];
290: ib = aj + *ai;
291: jmin = 0;
292: nonzerorow += (n > 0);
293: if (*ib == i) { /* (diag of A)*x */
294: z[2 * i] += v[0] * x1 + v[2] * x2;
295: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
296: v += 4;
297: jmin++;
298: }
299: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
300: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
301: for (j = jmin; j < n; j++) {
302: /* (strict lower triangular part of A)*x */
303: cval = ib[j] * 2;
304: z[cval] += v[0] * x1 + v[1] * x2;
305: z[cval + 1] += v[2] * x1 + v[3] * x2;
306: /* (strict upper triangular part of A)*x */
307: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
308: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
309: v += 4;
310: }
311: xb += 2;
312: ai++;
313: }
315: VecRestoreArrayRead(xx, &x);
316: VecRestoreArray(zz, &z);
317: PetscLogFlops(8.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
318: return 0;
319: }
321: PetscErrorCode MatMult_SeqSBAIJ_3(Mat A, Vec xx, Vec zz)
322: {
323: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
324: PetscScalar *z, x1, x2, x3, zero = 0.0;
325: const PetscScalar *x, *xb;
326: const MatScalar *v;
327: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
328: const PetscInt *aj = a->j, *ai = a->i, *ib;
329: PetscInt nonzerorow = 0;
331: VecSet(zz, zero);
332: if (!a->nz) return 0;
333: VecGetArrayRead(xx, &x);
334: VecGetArray(zz, &z);
336: v = a->a;
337: xb = x;
339: for (i = 0; i < mbs; i++) {
340: n = ai[1] - ai[0]; /* length of i_th block row of A */
341: x1 = xb[0];
342: x2 = xb[1];
343: x3 = xb[2];
344: ib = aj + *ai;
345: jmin = 0;
346: nonzerorow += (n > 0);
347: if (*ib == i) { /* (diag of A)*x */
348: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
349: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
350: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
351: v += 9;
352: jmin++;
353: }
354: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
355: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
356: for (j = jmin; j < n; j++) {
357: /* (strict lower triangular part of A)*x */
358: cval = ib[j] * 3;
359: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
360: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
361: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
362: /* (strict upper triangular part of A)*x */
363: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
364: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
365: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
366: v += 9;
367: }
368: xb += 3;
369: ai++;
370: }
372: VecRestoreArrayRead(xx, &x);
373: VecRestoreArray(zz, &z);
374: PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
375: return 0;
376: }
378: PetscErrorCode MatMult_SeqSBAIJ_4(Mat A, Vec xx, Vec zz)
379: {
380: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
381: PetscScalar *z, x1, x2, x3, x4, zero = 0.0;
382: const PetscScalar *x, *xb;
383: const MatScalar *v;
384: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
385: const PetscInt *aj = a->j, *ai = a->i, *ib;
386: PetscInt nonzerorow = 0;
388: VecSet(zz, zero);
389: if (!a->nz) return 0;
390: VecGetArrayRead(xx, &x);
391: VecGetArray(zz, &z);
393: v = a->a;
394: xb = x;
396: for (i = 0; i < mbs; i++) {
397: n = ai[1] - ai[0]; /* length of i_th block row of A */
398: x1 = xb[0];
399: x2 = xb[1];
400: x3 = xb[2];
401: x4 = xb[3];
402: ib = aj + *ai;
403: jmin = 0;
404: nonzerorow += (n > 0);
405: if (*ib == i) { /* (diag of A)*x */
406: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
407: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
408: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
409: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
410: v += 16;
411: jmin++;
412: }
413: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
414: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
415: for (j = jmin; j < n; j++) {
416: /* (strict lower triangular part of A)*x */
417: cval = ib[j] * 4;
418: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
419: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
420: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
421: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
422: /* (strict upper triangular part of A)*x */
423: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
424: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
425: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
426: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
427: v += 16;
428: }
429: xb += 4;
430: ai++;
431: }
433: VecRestoreArrayRead(xx, &x);
434: VecRestoreArray(zz, &z);
435: PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
436: return 0;
437: }
439: PetscErrorCode MatMult_SeqSBAIJ_5(Mat A, Vec xx, Vec zz)
440: {
441: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
442: PetscScalar *z, x1, x2, x3, x4, x5, zero = 0.0;
443: const PetscScalar *x, *xb;
444: const MatScalar *v;
445: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
446: const PetscInt *aj = a->j, *ai = a->i, *ib;
447: PetscInt nonzerorow = 0;
449: VecSet(zz, zero);
450: if (!a->nz) return 0;
451: VecGetArrayRead(xx, &x);
452: VecGetArray(zz, &z);
454: v = a->a;
455: xb = x;
457: for (i = 0; i < mbs; i++) {
458: n = ai[1] - ai[0]; /* length of i_th block row of A */
459: x1 = xb[0];
460: x2 = xb[1];
461: x3 = xb[2];
462: x4 = xb[3];
463: x5 = xb[4];
464: ib = aj + *ai;
465: jmin = 0;
466: nonzerorow += (n > 0);
467: if (*ib == i) { /* (diag of A)*x */
468: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
469: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
470: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
471: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
472: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
473: v += 25;
474: jmin++;
475: }
476: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
477: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
478: for (j = jmin; j < n; j++) {
479: /* (strict lower triangular part of A)*x */
480: cval = ib[j] * 5;
481: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
482: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
483: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
484: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
485: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
486: /* (strict upper triangular part of A)*x */
487: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
488: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
489: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
490: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
491: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
492: v += 25;
493: }
494: xb += 5;
495: ai++;
496: }
498: VecRestoreArrayRead(xx, &x);
499: VecRestoreArray(zz, &z);
500: PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
501: return 0;
502: }
504: PetscErrorCode MatMult_SeqSBAIJ_6(Mat A, Vec xx, Vec zz)
505: {
506: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
507: PetscScalar *z, x1, x2, x3, x4, x5, x6, zero = 0.0;
508: const PetscScalar *x, *xb;
509: const MatScalar *v;
510: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
511: const PetscInt *aj = a->j, *ai = a->i, *ib;
512: PetscInt nonzerorow = 0;
514: VecSet(zz, zero);
515: if (!a->nz) return 0;
516: VecGetArrayRead(xx, &x);
517: VecGetArray(zz, &z);
519: v = a->a;
520: xb = x;
522: for (i = 0; i < mbs; i++) {
523: n = ai[1] - ai[0]; /* length of i_th block row of A */
524: x1 = xb[0];
525: x2 = xb[1];
526: x3 = xb[2];
527: x4 = xb[3];
528: x5 = xb[4];
529: x6 = xb[5];
530: ib = aj + *ai;
531: jmin = 0;
532: nonzerorow += (n > 0);
533: if (*ib == i) { /* (diag of A)*x */
534: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
535: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
536: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
537: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
538: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
539: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
540: v += 36;
541: jmin++;
542: }
543: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
544: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
545: for (j = jmin; j < n; j++) {
546: /* (strict lower triangular part of A)*x */
547: cval = ib[j] * 6;
548: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
549: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
550: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
551: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
552: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
553: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
554: /* (strict upper triangular part of A)*x */
555: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
556: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
557: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
558: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
559: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
560: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
561: v += 36;
562: }
563: xb += 6;
564: ai++;
565: }
567: VecRestoreArrayRead(xx, &x);
568: VecRestoreArray(zz, &z);
569: PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
570: return 0;
571: }
573: PetscErrorCode MatMult_SeqSBAIJ_7(Mat A, Vec xx, Vec zz)
574: {
575: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
576: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7, zero = 0.0;
577: const PetscScalar *x, *xb;
578: const MatScalar *v;
579: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
580: const PetscInt *aj = a->j, *ai = a->i, *ib;
581: PetscInt nonzerorow = 0;
583: VecSet(zz, zero);
584: if (!a->nz) return 0;
585: VecGetArrayRead(xx, &x);
586: VecGetArray(zz, &z);
588: v = a->a;
589: xb = x;
591: for (i = 0; i < mbs; i++) {
592: n = ai[1] - ai[0]; /* length of i_th block row of A */
593: x1 = xb[0];
594: x2 = xb[1];
595: x3 = xb[2];
596: x4 = xb[3];
597: x5 = xb[4];
598: x6 = xb[5];
599: x7 = xb[6];
600: ib = aj + *ai;
601: jmin = 0;
602: nonzerorow += (n > 0);
603: if (*ib == i) { /* (diag of A)*x */
604: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
605: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
606: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
607: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
608: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
609: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
610: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
611: v += 49;
612: jmin++;
613: }
614: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
615: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
616: for (j = jmin; j < n; j++) {
617: /* (strict lower triangular part of A)*x */
618: cval = ib[j] * 7;
619: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
620: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
621: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
622: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
623: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
624: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
625: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
626: /* (strict upper triangular part of A)*x */
627: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
628: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
629: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
630: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
631: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
632: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
633: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
634: v += 49;
635: }
636: xb += 7;
637: ai++;
638: }
639: VecRestoreArrayRead(xx, &x);
640: VecRestoreArray(zz, &z);
641: PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow) - nonzerorow);
642: return 0;
643: }
645: /*
646: This will not work with MatScalar == float because it calls the BLAS
647: */
648: PetscErrorCode MatMult_SeqSBAIJ_N(Mat A, Vec xx, Vec zz)
649: {
650: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
651: PetscScalar *z, *z_ptr, *zb, *work, *workt, zero = 0.0;
652: const PetscScalar *x, *x_ptr, *xb;
653: const MatScalar *v;
654: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
655: const PetscInt *idx, *aj, *ii;
656: PetscInt nonzerorow = 0;
658: VecSet(zz, zero);
659: if (!a->nz) return 0;
660: VecGetArrayRead(xx, &x);
661: VecGetArray(zz, &z);
663: x_ptr = x;
664: z_ptr = z;
666: aj = a->j;
667: v = a->a;
668: ii = a->i;
670: if (!a->mult_work) PetscMalloc1(A->rmap->N + 1, &a->mult_work);
671: work = a->mult_work;
673: for (i = 0; i < mbs; i++) {
674: n = ii[1] - ii[0];
675: ncols = n * bs;
676: workt = work;
677: idx = aj + ii[0];
678: nonzerorow += (n > 0);
680: /* upper triangular part */
681: for (j = 0; j < n; j++) {
682: xb = x_ptr + bs * (*idx++);
683: for (k = 0; k < bs; k++) workt[k] = xb[k];
684: workt += bs;
685: }
686: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
687: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
689: /* strict lower triangular part */
690: idx = aj + ii[0];
691: if (n && *idx == i) {
692: ncols -= bs;
693: v += bs2;
694: idx++;
695: n--;
696: }
698: if (ncols > 0) {
699: workt = work;
700: PetscArrayzero(workt, ncols);
701: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
702: for (j = 0; j < n; j++) {
703: zb = z_ptr + bs * (*idx++);
704: for (k = 0; k < bs; k++) zb[k] += workt[k];
705: workt += bs;
706: }
707: }
708: x += bs;
709: v += n * bs2;
710: z += bs;
711: ii++;
712: }
714: VecRestoreArrayRead(xx, &x);
715: VecRestoreArray(zz, &z);
716: PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow) * bs2 - nonzerorow);
717: return 0;
718: }
720: PetscErrorCode MatMultAdd_SeqSBAIJ_1(Mat A, Vec xx, Vec yy, Vec zz)
721: {
722: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
723: PetscScalar *z, x1;
724: const PetscScalar *x, *xb;
725: const MatScalar *v;
726: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
727: const PetscInt *aj = a->j, *ai = a->i, *ib;
728: PetscInt nonzerorow = 0;
729: #if defined(PETSC_USE_COMPLEX)
730: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
731: #else
732: const int aconj = 0;
733: #endif
735: VecCopy(yy, zz);
736: if (!a->nz) return 0;
737: VecGetArrayRead(xx, &x);
738: VecGetArray(zz, &z);
739: v = a->a;
740: xb = x;
742: for (i = 0; i < mbs; i++) {
743: n = ai[1] - ai[0]; /* length of i_th row of A */
744: x1 = xb[0];
745: ib = aj + *ai;
746: jmin = 0;
747: nonzerorow += (n > 0);
748: if (n && *ib == i) { /* (diag of A)*x */
749: z[i] += *v++ * x[*ib++];
750: jmin++;
751: }
752: if (aconj) {
753: for (j = jmin; j < n; j++) {
754: cval = *ib;
755: z[cval] += PetscConj(*v) * x1; /* (strict lower triangular part of A)*x */
756: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
757: }
758: } else {
759: for (j = jmin; j < n; j++) {
760: cval = *ib;
761: z[cval] += *v * x1; /* (strict lower triangular part of A)*x */
762: z[i] += *v++ * x[*ib++]; /* (strict upper triangular part of A)*x */
763: }
764: }
765: xb++;
766: ai++;
767: }
769: VecRestoreArrayRead(xx, &x);
770: VecRestoreArray(zz, &z);
772: PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
773: return 0;
774: }
776: PetscErrorCode MatMultAdd_SeqSBAIJ_2(Mat A, Vec xx, Vec yy, Vec zz)
777: {
778: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
779: PetscScalar *z, x1, x2;
780: const PetscScalar *x, *xb;
781: const MatScalar *v;
782: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
783: const PetscInt *aj = a->j, *ai = a->i, *ib;
784: PetscInt nonzerorow = 0;
786: VecCopy(yy, zz);
787: if (!a->nz) return 0;
788: VecGetArrayRead(xx, &x);
789: VecGetArray(zz, &z);
791: v = a->a;
792: xb = x;
794: for (i = 0; i < mbs; i++) {
795: n = ai[1] - ai[0]; /* length of i_th block row of A */
796: x1 = xb[0];
797: x2 = xb[1];
798: ib = aj + *ai;
799: jmin = 0;
800: nonzerorow += (n > 0);
801: if (n && *ib == i) { /* (diag of A)*x */
802: z[2 * i] += v[0] * x1 + v[2] * x2;
803: z[2 * i + 1] += v[2] * x1 + v[3] * x2;
804: v += 4;
805: jmin++;
806: }
807: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
808: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
809: for (j = jmin; j < n; j++) {
810: /* (strict lower triangular part of A)*x */
811: cval = ib[j] * 2;
812: z[cval] += v[0] * x1 + v[1] * x2;
813: z[cval + 1] += v[2] * x1 + v[3] * x2;
814: /* (strict upper triangular part of A)*x */
815: z[2 * i] += v[0] * x[cval] + v[2] * x[cval + 1];
816: z[2 * i + 1] += v[1] * x[cval] + v[3] * x[cval + 1];
817: v += 4;
818: }
819: xb += 2;
820: ai++;
821: }
822: VecRestoreArrayRead(xx, &x);
823: VecRestoreArray(zz, &z);
825: PetscLogFlops(4.0 * (a->nz * 2.0 - nonzerorow));
826: return 0;
827: }
829: PetscErrorCode MatMultAdd_SeqSBAIJ_3(Mat A, Vec xx, Vec yy, Vec zz)
830: {
831: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
832: PetscScalar *z, x1, x2, x3;
833: const PetscScalar *x, *xb;
834: const MatScalar *v;
835: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
836: const PetscInt *aj = a->j, *ai = a->i, *ib;
837: PetscInt nonzerorow = 0;
839: VecCopy(yy, zz);
840: if (!a->nz) return 0;
841: VecGetArrayRead(xx, &x);
842: VecGetArray(zz, &z);
844: v = a->a;
845: xb = x;
847: for (i = 0; i < mbs; i++) {
848: n = ai[1] - ai[0]; /* length of i_th block row of A */
849: x1 = xb[0];
850: x2 = xb[1];
851: x3 = xb[2];
852: ib = aj + *ai;
853: jmin = 0;
854: nonzerorow += (n > 0);
855: if (n && *ib == i) { /* (diag of A)*x */
856: z[3 * i] += v[0] * x1 + v[3] * x2 + v[6] * x3;
857: z[3 * i + 1] += v[3] * x1 + v[4] * x2 + v[7] * x3;
858: z[3 * i + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
859: v += 9;
860: jmin++;
861: }
862: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
863: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864: for (j = jmin; j < n; j++) {
865: /* (strict lower triangular part of A)*x */
866: cval = ib[j] * 3;
867: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3;
868: z[cval + 1] += v[3] * x1 + v[4] * x2 + v[5] * x3;
869: z[cval + 2] += v[6] * x1 + v[7] * x2 + v[8] * x3;
870: /* (strict upper triangular part of A)*x */
871: z[3 * i] += v[0] * x[cval] + v[3] * x[cval + 1] + v[6] * x[cval + 2];
872: z[3 * i + 1] += v[1] * x[cval] + v[4] * x[cval + 1] + v[7] * x[cval + 2];
873: z[3 * i + 2] += v[2] * x[cval] + v[5] * x[cval + 1] + v[8] * x[cval + 2];
874: v += 9;
875: }
876: xb += 3;
877: ai++;
878: }
880: VecRestoreArrayRead(xx, &x);
881: VecRestoreArray(zz, &z);
883: PetscLogFlops(18.0 * (a->nz * 2.0 - nonzerorow));
884: return 0;
885: }
887: PetscErrorCode MatMultAdd_SeqSBAIJ_4(Mat A, Vec xx, Vec yy, Vec zz)
888: {
889: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
890: PetscScalar *z, x1, x2, x3, x4;
891: const PetscScalar *x, *xb;
892: const MatScalar *v;
893: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
894: const PetscInt *aj = a->j, *ai = a->i, *ib;
895: PetscInt nonzerorow = 0;
897: VecCopy(yy, zz);
898: if (!a->nz) return 0;
899: VecGetArrayRead(xx, &x);
900: VecGetArray(zz, &z);
902: v = a->a;
903: xb = x;
905: for (i = 0; i < mbs; i++) {
906: n = ai[1] - ai[0]; /* length of i_th block row of A */
907: x1 = xb[0];
908: x2 = xb[1];
909: x3 = xb[2];
910: x4 = xb[3];
911: ib = aj + *ai;
912: jmin = 0;
913: nonzerorow += (n > 0);
914: if (n && *ib == i) { /* (diag of A)*x */
915: z[4 * i] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
916: z[4 * i + 1] += v[4] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
917: z[4 * i + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[14] * x4;
918: z[4 * i + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
919: v += 16;
920: jmin++;
921: }
922: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
923: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
924: for (j = jmin; j < n; j++) {
925: /* (strict lower triangular part of A)*x */
926: cval = ib[j] * 4;
927: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4;
928: z[cval + 1] += v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4;
929: z[cval + 2] += v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4;
930: z[cval + 3] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4;
931: /* (strict upper triangular part of A)*x */
932: z[4 * i] += v[0] * x[cval] + v[4] * x[cval + 1] + v[8] * x[cval + 2] + v[12] * x[cval + 3];
933: z[4 * i + 1] += v[1] * x[cval] + v[5] * x[cval + 1] + v[9] * x[cval + 2] + v[13] * x[cval + 3];
934: z[4 * i + 2] += v[2] * x[cval] + v[6] * x[cval + 1] + v[10] * x[cval + 2] + v[14] * x[cval + 3];
935: z[4 * i + 3] += v[3] * x[cval] + v[7] * x[cval + 1] + v[11] * x[cval + 2] + v[15] * x[cval + 3];
936: v += 16;
937: }
938: xb += 4;
939: ai++;
940: }
942: VecRestoreArrayRead(xx, &x);
943: VecRestoreArray(zz, &z);
945: PetscLogFlops(32.0 * (a->nz * 2.0 - nonzerorow));
946: return 0;
947: }
949: PetscErrorCode MatMultAdd_SeqSBAIJ_5(Mat A, Vec xx, Vec yy, Vec zz)
950: {
951: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
952: PetscScalar *z, x1, x2, x3, x4, x5;
953: const PetscScalar *x, *xb;
954: const MatScalar *v;
955: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
956: const PetscInt *aj = a->j, *ai = a->i, *ib;
957: PetscInt nonzerorow = 0;
959: VecCopy(yy, zz);
960: if (!a->nz) return 0;
961: VecGetArrayRead(xx, &x);
962: VecGetArray(zz, &z);
964: v = a->a;
965: xb = x;
967: for (i = 0; i < mbs; i++) {
968: n = ai[1] - ai[0]; /* length of i_th block row of A */
969: x1 = xb[0];
970: x2 = xb[1];
971: x3 = xb[2];
972: x4 = xb[3];
973: x5 = xb[4];
974: ib = aj + *ai;
975: jmin = 0;
976: nonzerorow += (n > 0);
977: if (n && *ib == i) { /* (diag of A)*x */
978: z[5 * i] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
979: z[5 * i + 1] += v[5] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
980: z[5 * i + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
981: z[5 * i + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[23] * x5;
982: z[5 * i + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
983: v += 25;
984: jmin++;
985: }
986: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
987: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
988: for (j = jmin; j < n; j++) {
989: /* (strict lower triangular part of A)*x */
990: cval = ib[j] * 5;
991: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5;
992: z[cval + 1] += v[5] * x1 + v[6] * x2 + v[7] * x3 + v[8] * x4 + v[9] * x5;
993: z[cval + 2] += v[10] * x1 + v[11] * x2 + v[12] * x3 + v[13] * x4 + v[14] * x5;
994: z[cval + 3] += v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5;
995: z[cval + 4] += v[20] * x1 + v[21] * x2 + v[22] * x3 + v[23] * x4 + v[24] * x5;
996: /* (strict upper triangular part of A)*x */
997: z[5 * i] += v[0] * x[cval] + v[5] * x[cval + 1] + v[10] * x[cval + 2] + v[15] * x[cval + 3] + v[20] * x[cval + 4];
998: z[5 * i + 1] += v[1] * x[cval] + v[6] * x[cval + 1] + v[11] * x[cval + 2] + v[16] * x[cval + 3] + v[21] * x[cval + 4];
999: z[5 * i + 2] += v[2] * x[cval] + v[7] * x[cval + 1] + v[12] * x[cval + 2] + v[17] * x[cval + 3] + v[22] * x[cval + 4];
1000: z[5 * i + 3] += v[3] * x[cval] + v[8] * x[cval + 1] + v[13] * x[cval + 2] + v[18] * x[cval + 3] + v[23] * x[cval + 4];
1001: z[5 * i + 4] += v[4] * x[cval] + v[9] * x[cval + 1] + v[14] * x[cval + 2] + v[19] * x[cval + 3] + v[24] * x[cval + 4];
1002: v += 25;
1003: }
1004: xb += 5;
1005: ai++;
1006: }
1008: VecRestoreArrayRead(xx, &x);
1009: VecRestoreArray(zz, &z);
1011: PetscLogFlops(50.0 * (a->nz * 2.0 - nonzerorow));
1012: return 0;
1013: }
1015: PetscErrorCode MatMultAdd_SeqSBAIJ_6(Mat A, Vec xx, Vec yy, Vec zz)
1016: {
1017: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1018: PetscScalar *z, x1, x2, x3, x4, x5, x6;
1019: const PetscScalar *x, *xb;
1020: const MatScalar *v;
1021: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1022: const PetscInt *aj = a->j, *ai = a->i, *ib;
1023: PetscInt nonzerorow = 0;
1025: VecCopy(yy, zz);
1026: if (!a->nz) return 0;
1027: VecGetArrayRead(xx, &x);
1028: VecGetArray(zz, &z);
1030: v = a->a;
1031: xb = x;
1033: for (i = 0; i < mbs; i++) {
1034: n = ai[1] - ai[0]; /* length of i_th block row of A */
1035: x1 = xb[0];
1036: x2 = xb[1];
1037: x3 = xb[2];
1038: x4 = xb[3];
1039: x5 = xb[4];
1040: x6 = xb[5];
1041: ib = aj + *ai;
1042: jmin = 0;
1043: nonzerorow += (n > 0);
1044: if (n && *ib == i) { /* (diag of A)*x */
1045: z[6 * i] += v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
1046: z[6 * i + 1] += v[6] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
1047: z[6 * i + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
1048: z[6 * i + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
1049: z[6 * i + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[34] * x6;
1050: z[6 * i + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1051: v += 36;
1052: jmin++;
1053: }
1054: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1055: PetscPrefetchBlock(v + 36 * n, 36 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1056: for (j = jmin; j < n; j++) {
1057: /* (strict lower triangular part of A)*x */
1058: cval = ib[j] * 6;
1059: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6;
1060: z[cval + 1] += v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4 + v[10] * x5 + v[11] * x6;
1061: z[cval + 2] += v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4 + v[16] * x5 + v[17] * x6;
1062: z[cval + 3] += v[18] * x1 + v[19] * x2 + v[20] * x3 + v[21] * x4 + v[22] * x5 + v[23] * x6;
1063: z[cval + 4] += v[24] * x1 + v[25] * x2 + v[26] * x3 + v[27] * x4 + v[28] * x5 + v[29] * x6;
1064: z[cval + 5] += v[30] * x1 + v[31] * x2 + v[32] * x3 + v[33] * x4 + v[34] * x5 + v[35] * x6;
1065: /* (strict upper triangular part of A)*x */
1066: z[6 * i] += v[0] * x[cval] + v[6] * x[cval + 1] + v[12] * x[cval + 2] + v[18] * x[cval + 3] + v[24] * x[cval + 4] + v[30] * x[cval + 5];
1067: z[6 * i + 1] += v[1] * x[cval] + v[7] * x[cval + 1] + v[13] * x[cval + 2] + v[19] * x[cval + 3] + v[25] * x[cval + 4] + v[31] * x[cval + 5];
1068: z[6 * i + 2] += v[2] * x[cval] + v[8] * x[cval + 1] + v[14] * x[cval + 2] + v[20] * x[cval + 3] + v[26] * x[cval + 4] + v[32] * x[cval + 5];
1069: z[6 * i + 3] += v[3] * x[cval] + v[9] * x[cval + 1] + v[15] * x[cval + 2] + v[21] * x[cval + 3] + v[27] * x[cval + 4] + v[33] * x[cval + 5];
1070: z[6 * i + 4] += v[4] * x[cval] + v[10] * x[cval + 1] + v[16] * x[cval + 2] + v[22] * x[cval + 3] + v[28] * x[cval + 4] + v[34] * x[cval + 5];
1071: z[6 * i + 5] += v[5] * x[cval] + v[11] * x[cval + 1] + v[17] * x[cval + 2] + v[23] * x[cval + 3] + v[29] * x[cval + 4] + v[35] * x[cval + 5];
1072: v += 36;
1073: }
1074: xb += 6;
1075: ai++;
1076: }
1078: VecRestoreArrayRead(xx, &x);
1079: VecRestoreArray(zz, &z);
1081: PetscLogFlops(72.0 * (a->nz * 2.0 - nonzerorow));
1082: return 0;
1083: }
1085: PetscErrorCode MatMultAdd_SeqSBAIJ_7(Mat A, Vec xx, Vec yy, Vec zz)
1086: {
1087: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1088: PetscScalar *z, x1, x2, x3, x4, x5, x6, x7;
1089: const PetscScalar *x, *xb;
1090: const MatScalar *v;
1091: PetscInt mbs = a->mbs, i, n, cval, j, jmin;
1092: const PetscInt *aj = a->j, *ai = a->i, *ib;
1093: PetscInt nonzerorow = 0;
1095: VecCopy(yy, zz);
1096: if (!a->nz) return 0;
1097: VecGetArrayRead(xx, &x);
1098: VecGetArray(zz, &z);
1100: v = a->a;
1101: xb = x;
1103: for (i = 0; i < mbs; i++) {
1104: n = ai[1] - ai[0]; /* length of i_th block row of A */
1105: x1 = xb[0];
1106: x2 = xb[1];
1107: x3 = xb[2];
1108: x4 = xb[3];
1109: x5 = xb[4];
1110: x6 = xb[5];
1111: x7 = xb[6];
1112: ib = aj + *ai;
1113: jmin = 0;
1114: nonzerorow += (n > 0);
1115: if (n && *ib == i) { /* (diag of A)*x */
1116: z[7 * i] += v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
1117: z[7 * i + 1] += v[7] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
1118: z[7 * i + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
1119: z[7 * i + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
1120: z[7 * i + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
1121: z[7 * i + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[47] * x7;
1122: z[7 * i + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1123: v += 49;
1124: jmin++;
1125: }
1126: PetscPrefetchBlock(ib + jmin + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1127: PetscPrefetchBlock(v + 49 * n, 49 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1128: for (j = jmin; j < n; j++) {
1129: /* (strict lower triangular part of A)*x */
1130: cval = ib[j] * 7;
1131: z[cval] += v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4 + v[4] * x5 + v[5] * x6 + v[6] * x7;
1132: z[cval + 1] += v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5 + v[12] * x6 + v[13] * x7;
1133: z[cval + 2] += v[14] * x1 + v[15] * x2 + v[16] * x3 + v[17] * x4 + v[18] * x5 + v[19] * x6 + v[20] * x7;
1134: z[cval + 3] += v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4 + v[25] * x5 + v[26] * x6 + v[27] * x7;
1135: z[cval + 4] += v[28] * x1 + v[29] * x2 + v[30] * x3 + v[31] * x4 + v[32] * x5 + v[33] * x6 + v[34] * x7;
1136: z[cval + 5] += v[35] * x1 + v[36] * x2 + v[37] * x3 + v[38] * x4 + v[39] * x5 + v[40] * x6 + v[41] * x7;
1137: z[cval + 6] += v[42] * x1 + v[43] * x2 + v[44] * x3 + v[45] * x4 + v[46] * x5 + v[47] * x6 + v[48] * x7;
1138: /* (strict upper triangular part of A)*x */
1139: z[7 * i] += v[0] * x[cval] + v[7] * x[cval + 1] + v[14] * x[cval + 2] + v[21] * x[cval + 3] + v[28] * x[cval + 4] + v[35] * x[cval + 5] + v[42] * x[cval + 6];
1140: z[7 * i + 1] += v[1] * x[cval] + v[8] * x[cval + 1] + v[15] * x[cval + 2] + v[22] * x[cval + 3] + v[29] * x[cval + 4] + v[36] * x[cval + 5] + v[43] * x[cval + 6];
1141: z[7 * i + 2] += v[2] * x[cval] + v[9] * x[cval + 1] + v[16] * x[cval + 2] + v[23] * x[cval + 3] + v[30] * x[cval + 4] + v[37] * x[cval + 5] + v[44] * x[cval + 6];
1142: z[7 * i + 3] += v[3] * x[cval] + v[10] * x[cval + 1] + v[17] * x[cval + 2] + v[24] * x[cval + 3] + v[31] * x[cval + 4] + v[38] * x[cval + 5] + v[45] * x[cval + 6];
1143: z[7 * i + 4] += v[4] * x[cval] + v[11] * x[cval + 1] + v[18] * x[cval + 2] + v[25] * x[cval + 3] + v[32] * x[cval + 4] + v[39] * x[cval + 5] + v[46] * x[cval + 6];
1144: z[7 * i + 5] += v[5] * x[cval] + v[12] * x[cval + 1] + v[19] * x[cval + 2] + v[26] * x[cval + 3] + v[33] * x[cval + 4] + v[40] * x[cval + 5] + v[47] * x[cval + 6];
1145: z[7 * i + 6] += v[6] * x[cval] + v[13] * x[cval + 1] + v[20] * x[cval + 2] + v[27] * x[cval + 3] + v[34] * x[cval + 4] + v[41] * x[cval + 5] + v[48] * x[cval + 6];
1146: v += 49;
1147: }
1148: xb += 7;
1149: ai++;
1150: }
1152: VecRestoreArrayRead(xx, &x);
1153: VecRestoreArray(zz, &z);
1155: PetscLogFlops(98.0 * (a->nz * 2.0 - nonzerorow));
1156: return 0;
1157: }
1159: PetscErrorCode MatMultAdd_SeqSBAIJ_N(Mat A, Vec xx, Vec yy, Vec zz)
1160: {
1161: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1162: PetscScalar *z, *z_ptr = NULL, *zb, *work, *workt;
1163: const PetscScalar *x, *x_ptr, *xb;
1164: const MatScalar *v;
1165: PetscInt mbs = a->mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2, ncols, k;
1166: const PetscInt *idx, *aj, *ii;
1167: PetscInt nonzerorow = 0;
1169: VecCopy(yy, zz);
1170: if (!a->nz) return 0;
1171: VecGetArrayRead(xx, &x);
1172: x_ptr = x;
1173: VecGetArray(zz, &z);
1174: z_ptr = z;
1176: aj = a->j;
1177: v = a->a;
1178: ii = a->i;
1180: if (!a->mult_work) PetscMalloc1(A->rmap->n + 1, &a->mult_work);
1181: work = a->mult_work;
1183: for (i = 0; i < mbs; i++) {
1184: n = ii[1] - ii[0];
1185: ncols = n * bs;
1186: workt = work;
1187: idx = aj + ii[0];
1188: nonzerorow += (n > 0);
1190: /* upper triangular part */
1191: for (j = 0; j < n; j++) {
1192: xb = x_ptr + bs * (*idx++);
1193: for (k = 0; k < bs; k++) workt[k] = xb[k];
1194: workt += bs;
1195: }
1196: /* z(i*bs:(i+1)*bs-1) += A(i,:)*x */
1197: PetscKernel_w_gets_w_plus_Ar_times_v(bs, ncols, work, v, z);
1199: /* strict lower triangular part */
1200: idx = aj + ii[0];
1201: if (n && *idx == i) {
1202: ncols -= bs;
1203: v += bs2;
1204: idx++;
1205: n--;
1206: }
1207: if (ncols > 0) {
1208: workt = work;
1209: PetscArrayzero(workt, ncols);
1210: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs, ncols, x, v, workt);
1211: for (j = 0; j < n; j++) {
1212: zb = z_ptr + bs * (*idx++);
1213: for (k = 0; k < bs; k++) zb[k] += workt[k];
1214: workt += bs;
1215: }
1216: }
1218: x += bs;
1219: v += n * bs2;
1220: z += bs;
1221: ii++;
1222: }
1224: VecRestoreArrayRead(xx, &x);
1225: VecRestoreArray(zz, &z);
1227: PetscLogFlops(2.0 * (a->nz * 2.0 - nonzerorow));
1228: return 0;
1229: }
1231: PetscErrorCode MatScale_SeqSBAIJ(Mat inA, PetscScalar alpha)
1232: {
1233: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)inA->data;
1234: PetscScalar oalpha = alpha;
1235: PetscBLASInt one = 1, totalnz;
1237: PetscBLASIntCast(a->bs2 * a->nz, &totalnz);
1238: PetscCallBLAS("BLASscal", BLASscal_(&totalnz, &oalpha, a->a, &one));
1239: PetscLogFlops(totalnz);
1240: return 0;
1241: }
1243: PetscErrorCode MatNorm_SeqSBAIJ(Mat A, NormType type, PetscReal *norm)
1244: {
1245: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1246: const MatScalar *v = a->a;
1247: PetscReal sum_diag = 0.0, sum_off = 0.0, *sum;
1248: PetscInt i, j, k, bs = A->rmap->bs, bs2 = a->bs2, k1, mbs = a->mbs, jmin, jmax, nexti, ik, *jl, *il;
1249: const PetscInt *aj = a->j, *col;
1251: if (!a->nz) {
1252: *norm = 0.0;
1253: return 0;
1254: }
1255: if (type == NORM_FROBENIUS) {
1256: for (k = 0; k < mbs; k++) {
1257: jmin = a->i[k];
1258: jmax = a->i[k + 1];
1259: col = aj + jmin;
1260: if (jmax - jmin > 0 && *col == k) { /* diagonal block */
1261: for (i = 0; i < bs2; i++) {
1262: sum_diag += PetscRealPart(PetscConj(*v) * (*v));
1263: v++;
1264: }
1265: jmin++;
1266: }
1267: for (j = jmin; j < jmax; j++) { /* off-diagonal blocks */
1268: for (i = 0; i < bs2; i++) {
1269: sum_off += PetscRealPart(PetscConj(*v) * (*v));
1270: v++;
1271: }
1272: }
1273: }
1274: *norm = PetscSqrtReal(sum_diag + 2 * sum_off);
1275: PetscLogFlops(2.0 * bs2 * a->nz);
1276: } else if (type == NORM_INFINITY || type == NORM_1) { /* maximum row/column sum */
1277: PetscMalloc3(bs, &sum, mbs, &il, mbs, &jl);
1278: for (i = 0; i < mbs; i++) jl[i] = mbs;
1279: il[0] = 0;
1281: *norm = 0.0;
1282: for (k = 0; k < mbs; k++) { /* k_th block row */
1283: for (j = 0; j < bs; j++) sum[j] = 0.0;
1284: /*-- col sum --*/
1285: i = jl[k]; /* first |A(i,k)| to be added */
1286: /* jl[k]=i: first nozero element in row i for submatrix A(1:k,k:n) (active window)
1287: at step k */
1288: while (i < mbs) {
1289: nexti = jl[i]; /* next block row to be added */
1290: ik = il[i]; /* block index of A(i,k) in the array a */
1291: for (j = 0; j < bs; j++) {
1292: v = a->a + ik * bs2 + j * bs;
1293: for (k1 = 0; k1 < bs; k1++) {
1294: sum[j] += PetscAbsScalar(*v);
1295: v++;
1296: }
1297: }
1298: /* update il, jl */
1299: jmin = ik + 1; /* block index of array a: points to the next nonzero of A in row i */
1300: jmax = a->i[i + 1];
1301: if (jmin < jmax) {
1302: il[i] = jmin;
1303: j = a->j[jmin];
1304: jl[i] = jl[j];
1305: jl[j] = i;
1306: }
1307: i = nexti;
1308: }
1309: /*-- row sum --*/
1310: jmin = a->i[k];
1311: jmax = a->i[k + 1];
1312: for (i = jmin; i < jmax; i++) {
1313: for (j = 0; j < bs; j++) {
1314: v = a->a + i * bs2 + j;
1315: for (k1 = 0; k1 < bs; k1++) {
1316: sum[j] += PetscAbsScalar(*v);
1317: v += bs;
1318: }
1319: }
1320: }
1321: /* add k_th block row to il, jl */
1322: col = aj + jmin;
1323: if (jmax - jmin > 0 && *col == k) jmin++;
1324: if (jmin < jmax) {
1325: il[k] = jmin;
1326: j = a->j[jmin];
1327: jl[k] = jl[j];
1328: jl[j] = k;
1329: }
1330: for (j = 0; j < bs; j++) {
1331: if (sum[j] > *norm) *norm = sum[j];
1332: }
1333: }
1334: PetscFree3(sum, il, jl);
1335: PetscLogFlops(PetscMax(mbs * a->nz - 1, 0));
1336: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for this norm yet");
1337: return 0;
1338: }
1340: PetscErrorCode MatEqual_SeqSBAIJ(Mat A, Mat B, PetscBool *flg)
1341: {
1342: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b = (Mat_SeqSBAIJ *)B->data;
1344: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1345: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs) || (a->nz != b->nz)) {
1346: *flg = PETSC_FALSE;
1347: return 0;
1348: }
1350: /* if the a->i are the same */
1351: PetscArraycmp(a->i, b->i, a->mbs + 1, flg);
1352: if (!*flg) return 0;
1354: /* if a->j are the same */
1355: PetscArraycmp(a->j, b->j, a->nz, flg);
1356: if (!*flg) return 0;
1358: /* if a->a are the same */
1359: PetscArraycmp(a->a, b->a, (a->nz) * (A->rmap->bs) * (A->rmap->bs), flg);
1360: return 0;
1361: }
1363: PetscErrorCode MatGetDiagonal_SeqSBAIJ(Mat A, Vec v)
1364: {
1365: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1366: PetscInt i, j, k, row, bs, ambs, bs2;
1367: const PetscInt *ai, *aj;
1368: PetscScalar *x, zero = 0.0;
1369: const MatScalar *aa, *aa_j;
1371: bs = A->rmap->bs;
1374: aa = a->a;
1375: ambs = a->mbs;
1377: if (A->factortype == MAT_FACTOR_CHOLESKY || A->factortype == MAT_FACTOR_ICC) {
1378: PetscInt *diag = a->diag;
1379: aa = a->a;
1380: ambs = a->mbs;
1381: VecGetArray(v, &x);
1382: for (i = 0; i < ambs; i++) x[i] = 1.0 / aa[diag[i]];
1383: VecRestoreArray(v, &x);
1384: return 0;
1385: }
1387: ai = a->i;
1388: aj = a->j;
1389: bs2 = a->bs2;
1390: VecSet(v, zero);
1391: if (!a->nz) return 0;
1392: VecGetArray(v, &x);
1393: for (i = 0; i < ambs; i++) {
1394: j = ai[i];
1395: if (aj[j] == i) { /* if this is a diagonal element */
1396: row = i * bs;
1397: aa_j = aa + j * bs2;
1398: for (k = 0; k < bs2; k += (bs + 1), row++) x[row] = aa_j[k];
1399: }
1400: }
1401: VecRestoreArray(v, &x);
1402: return 0;
1403: }
1405: PetscErrorCode MatDiagonalScale_SeqSBAIJ(Mat A, Vec ll, Vec rr)
1406: {
1407: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1408: PetscScalar x;
1409: const PetscScalar *l, *li, *ri;
1410: MatScalar *aa, *v;
1411: PetscInt i, j, k, lm, M, m, mbs, tmp, bs, bs2;
1412: const PetscInt *ai, *aj;
1413: PetscBool flg;
1415: if (ll != rr) {
1416: VecEqual(ll, rr, &flg);
1418: }
1419: if (!ll) return 0;
1420: ai = a->i;
1421: aj = a->j;
1422: aa = a->a;
1423: m = A->rmap->N;
1424: bs = A->rmap->bs;
1425: mbs = a->mbs;
1426: bs2 = a->bs2;
1428: VecGetArrayRead(ll, &l);
1429: VecGetLocalSize(ll, &lm);
1431: for (i = 0; i < mbs; i++) { /* for each block row */
1432: M = ai[i + 1] - ai[i];
1433: li = l + i * bs;
1434: v = aa + bs2 * ai[i];
1435: for (j = 0; j < M; j++) { /* for each block */
1436: ri = l + bs * aj[ai[i] + j];
1437: for (k = 0; k < bs; k++) {
1438: x = ri[k];
1439: for (tmp = 0; tmp < bs; tmp++) (*v++) *= li[tmp] * x;
1440: }
1441: }
1442: }
1443: VecRestoreArrayRead(ll, &l);
1444: PetscLogFlops(2.0 * a->nz);
1445: return 0;
1446: }
1448: PetscErrorCode MatGetInfo_SeqSBAIJ(Mat A, MatInfoType flag, MatInfo *info)
1449: {
1450: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1452: info->block_size = a->bs2;
1453: info->nz_allocated = a->bs2 * a->maxnz; /*num. of nonzeros in upper triangular part */
1454: info->nz_used = a->bs2 * a->nz; /*num. of nonzeros in upper triangular part */
1455: info->nz_unneeded = info->nz_allocated - info->nz_used;
1456: info->assemblies = A->num_ass;
1457: info->mallocs = A->info.mallocs;
1458: info->memory = 0; /* REVIEW ME */
1459: if (A->factortype) {
1460: info->fill_ratio_given = A->info.fill_ratio_given;
1461: info->fill_ratio_needed = A->info.fill_ratio_needed;
1462: info->factor_mallocs = A->info.factor_mallocs;
1463: } else {
1464: info->fill_ratio_given = 0;
1465: info->fill_ratio_needed = 0;
1466: info->factor_mallocs = 0;
1467: }
1468: return 0;
1469: }
1471: PetscErrorCode MatZeroEntries_SeqSBAIJ(Mat A)
1472: {
1473: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1475: PetscArrayzero(a->a, a->bs2 * a->i[a->mbs]);
1476: return 0;
1477: }
1479: PetscErrorCode MatGetRowMaxAbs_SeqSBAIJ(Mat A, Vec v, PetscInt idx[])
1480: {
1481: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1482: PetscInt i, j, n, row, col, bs, mbs;
1483: const PetscInt *ai, *aj;
1484: PetscReal atmp;
1485: const MatScalar *aa;
1486: PetscScalar *x;
1487: PetscInt ncols, brow, bcol, krow, kcol;
1491: bs = A->rmap->bs;
1492: aa = a->a;
1493: ai = a->i;
1494: aj = a->j;
1495: mbs = a->mbs;
1497: VecSet(v, 0.0);
1498: VecGetArray(v, &x);
1499: VecGetLocalSize(v, &n);
1501: for (i = 0; i < mbs; i++) {
1502: ncols = ai[1] - ai[0];
1503: ai++;
1504: brow = bs * i;
1505: for (j = 0; j < ncols; j++) {
1506: bcol = bs * (*aj);
1507: for (kcol = 0; kcol < bs; kcol++) {
1508: col = bcol + kcol; /* col index */
1509: for (krow = 0; krow < bs; krow++) {
1510: atmp = PetscAbsScalar(*aa);
1511: aa++;
1512: row = brow + krow; /* row index */
1513: if (PetscRealPart(x[row]) < atmp) x[row] = atmp;
1514: if (*aj > i && PetscRealPart(x[col]) < atmp) x[col] = atmp;
1515: }
1516: }
1517: aj++;
1518: }
1519: }
1520: VecRestoreArray(v, &x);
1521: return 0;
1522: }
1524: PetscErrorCode MatMatMultSymbolic_SeqSBAIJ_SeqDense(Mat A, Mat B, PetscReal fill, Mat C)
1525: {
1526: MatMatMultSymbolic_SeqDense_SeqDense(A, B, 0.0, C);
1527: C->ops->matmultnumeric = MatMatMultNumeric_SeqSBAIJ_SeqDense;
1528: return 0;
1529: }
1531: PetscErrorCode MatMatMult_SeqSBAIJ_1_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1532: {
1533: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1534: PetscScalar *z = c;
1535: const PetscScalar *xb;
1536: PetscScalar x1;
1537: const MatScalar *v = a->a, *vv;
1538: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1539: #if defined(PETSC_USE_COMPLEX)
1540: const int aconj = A->hermitian == PETSC_BOOL3_TRUE;
1541: #else
1542: const int aconj = 0;
1543: #endif
1545: for (i = 0; i < mbs; i++) {
1546: n = ii[1] - ii[0];
1547: ii++;
1548: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1549: PetscPrefetchBlock(v + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1550: jj = idx;
1551: vv = v;
1552: for (k = 0; k < cn; k++) {
1553: idx = jj;
1554: v = vv;
1555: for (j = 0; j < n; j++) {
1556: xb = b + (*idx);
1557: x1 = xb[0 + k * bm];
1558: z[0 + k * cm] += v[0] * x1;
1559: if (*idx != i) c[(*idx) + k * cm] += (aconj ? PetscConj(v[0]) : v[0]) * b[i + k * bm];
1560: v += 1;
1561: ++idx;
1562: }
1563: }
1564: z += 1;
1565: }
1566: return 0;
1567: }
1569: PetscErrorCode MatMatMult_SeqSBAIJ_2_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1570: {
1571: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1572: PetscScalar *z = c;
1573: const PetscScalar *xb;
1574: PetscScalar x1, x2;
1575: const MatScalar *v = a->a, *vv;
1576: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1578: for (i = 0; i < mbs; i++) {
1579: n = ii[1] - ii[0];
1580: ii++;
1581: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1582: PetscPrefetchBlock(v + 4 * n, 4 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1583: jj = idx;
1584: vv = v;
1585: for (k = 0; k < cn; k++) {
1586: idx = jj;
1587: v = vv;
1588: for (j = 0; j < n; j++) {
1589: xb = b + 2 * (*idx);
1590: x1 = xb[0 + k * bm];
1591: x2 = xb[1 + k * bm];
1592: z[0 + k * cm] += v[0] * x1 + v[2] * x2;
1593: z[1 + k * cm] += v[1] * x1 + v[3] * x2;
1594: if (*idx != i) {
1595: c[2 * (*idx) + 0 + k * cm] += v[0] * b[2 * i + k * bm] + v[1] * b[2 * i + 1 + k * bm];
1596: c[2 * (*idx) + 1 + k * cm] += v[2] * b[2 * i + k * bm] + v[3] * b[2 * i + 1 + k * bm];
1597: }
1598: v += 4;
1599: ++idx;
1600: }
1601: }
1602: z += 2;
1603: }
1604: return 0;
1605: }
1607: PetscErrorCode MatMatMult_SeqSBAIJ_3_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1608: {
1609: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1610: PetscScalar *z = c;
1611: const PetscScalar *xb;
1612: PetscScalar x1, x2, x3;
1613: const MatScalar *v = a->a, *vv;
1614: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1616: for (i = 0; i < mbs; i++) {
1617: n = ii[1] - ii[0];
1618: ii++;
1619: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1620: PetscPrefetchBlock(v + 9 * n, 9 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1621: jj = idx;
1622: vv = v;
1623: for (k = 0; k < cn; k++) {
1624: idx = jj;
1625: v = vv;
1626: for (j = 0; j < n; j++) {
1627: xb = b + 3 * (*idx);
1628: x1 = xb[0 + k * bm];
1629: x2 = xb[1 + k * bm];
1630: x3 = xb[2 + k * bm];
1631: z[0 + k * cm] += v[0] * x1 + v[3] * x2 + v[6] * x3;
1632: z[1 + k * cm] += v[1] * x1 + v[4] * x2 + v[7] * x3;
1633: z[2 + k * cm] += v[2] * x1 + v[5] * x2 + v[8] * x3;
1634: if (*idx != i) {
1635: c[3 * (*idx) + 0 + k * cm] += v[0] * b[3 * i + k * bm] + v[3] * b[3 * i + 1 + k * bm] + v[6] * b[3 * i + 2 + k * bm];
1636: c[3 * (*idx) + 1 + k * cm] += v[1] * b[3 * i + k * bm] + v[4] * b[3 * i + 1 + k * bm] + v[7] * b[3 * i + 2 + k * bm];
1637: c[3 * (*idx) + 2 + k * cm] += v[2] * b[3 * i + k * bm] + v[5] * b[3 * i + 1 + k * bm] + v[8] * b[3 * i + 2 + k * bm];
1638: }
1639: v += 9;
1640: ++idx;
1641: }
1642: }
1643: z += 3;
1644: }
1645: return 0;
1646: }
1648: PetscErrorCode MatMatMult_SeqSBAIJ_4_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1649: {
1650: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1651: PetscScalar *z = c;
1652: const PetscScalar *xb;
1653: PetscScalar x1, x2, x3, x4;
1654: const MatScalar *v = a->a, *vv;
1655: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1657: for (i = 0; i < mbs; i++) {
1658: n = ii[1] - ii[0];
1659: ii++;
1660: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1661: PetscPrefetchBlock(v + 16 * n, 16 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1662: jj = idx;
1663: vv = v;
1664: for (k = 0; k < cn; k++) {
1665: idx = jj;
1666: v = vv;
1667: for (j = 0; j < n; j++) {
1668: xb = b + 4 * (*idx);
1669: x1 = xb[0 + k * bm];
1670: x2 = xb[1 + k * bm];
1671: x3 = xb[2 + k * bm];
1672: x4 = xb[3 + k * bm];
1673: z[0 + k * cm] += v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
1674: z[1 + k * cm] += v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
1675: z[2 + k * cm] += v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
1676: z[3 + k * cm] += v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
1677: if (*idx != i) {
1678: c[4 * (*idx) + 0 + k * cm] += v[0] * b[4 * i + k * bm] + v[4] * b[4 * i + 1 + k * bm] + v[8] * b[4 * i + 2 + k * bm] + v[12] * b[4 * i + 3 + k * bm];
1679: c[4 * (*idx) + 1 + k * cm] += v[1] * b[4 * i + k * bm] + v[5] * b[4 * i + 1 + k * bm] + v[9] * b[4 * i + 2 + k * bm] + v[13] * b[4 * i + 3 + k * bm];
1680: c[4 * (*idx) + 2 + k * cm] += v[2] * b[4 * i + k * bm] + v[6] * b[4 * i + 1 + k * bm] + v[10] * b[4 * i + 2 + k * bm] + v[14] * b[4 * i + 3 + k * bm];
1681: c[4 * (*idx) + 3 + k * cm] += v[3] * b[4 * i + k * bm] + v[7] * b[4 * i + 1 + k * bm] + v[11] * b[4 * i + 2 + k * bm] + v[15] * b[4 * i + 3 + k * bm];
1682: }
1683: v += 16;
1684: ++idx;
1685: }
1686: }
1687: z += 4;
1688: }
1689: return 0;
1690: }
1692: PetscErrorCode MatMatMult_SeqSBAIJ_5_Private(Mat A, PetscScalar *b, PetscInt bm, PetscScalar *c, PetscInt cm, PetscInt cn)
1693: {
1694: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1695: PetscScalar *z = c;
1696: const PetscScalar *xb;
1697: PetscScalar x1, x2, x3, x4, x5;
1698: const MatScalar *v = a->a, *vv;
1699: PetscInt mbs = a->mbs, i, *idx = a->j, *ii = a->i, j, *jj, n, k;
1701: for (i = 0; i < mbs; i++) {
1702: n = ii[1] - ii[0];
1703: ii++;
1704: PetscPrefetchBlock(idx + n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1705: PetscPrefetchBlock(v + 25 * n, 25 * n, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1706: jj = idx;
1707: vv = v;
1708: for (k = 0; k < cn; k++) {
1709: idx = jj;
1710: v = vv;
1711: for (j = 0; j < n; j++) {
1712: xb = b + 5 * (*idx);
1713: x1 = xb[0 + k * bm];
1714: x2 = xb[1 + k * bm];
1715: x3 = xb[2 + k * bm];
1716: x4 = xb[3 + k * bm];
1717: x5 = xb[4 + k * cm];
1718: z[0 + k * cm] += v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
1719: z[1 + k * cm] += v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
1720: z[2 + k * cm] += v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
1721: z[3 + k * cm] += v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
1722: z[4 + k * cm] += v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
1723: if (*idx != i) {
1724: c[5 * (*idx) + 0 + k * cm] += v[0] * b[5 * i + k * bm] + v[5] * b[5 * i + 1 + k * bm] + v[10] * b[5 * i + 2 + k * bm] + v[15] * b[5 * i + 3 + k * bm] + v[20] * b[5 * i + 4 + k * bm];
1725: c[5 * (*idx) + 1 + k * cm] += v[1] * b[5 * i + k * bm] + v[6] * b[5 * i + 1 + k * bm] + v[11] * b[5 * i + 2 + k * bm] + v[16] * b[5 * i + 3 + k * bm] + v[21] * b[5 * i + 4 + k * bm];
1726: c[5 * (*idx) + 2 + k * cm] += v[2] * b[5 * i + k * bm] + v[7] * b[5 * i + 1 + k * bm] + v[12] * b[5 * i + 2 + k * bm] + v[17] * b[5 * i + 3 + k * bm] + v[22] * b[5 * i + 4 + k * bm];
1727: c[5 * (*idx) + 3 + k * cm] += v[3] * b[5 * i + k * bm] + v[8] * b[5 * i + 1 + k * bm] + v[13] * b[5 * i + 2 + k * bm] + v[18] * b[5 * i + 3 + k * bm] + v[23] * b[5 * i + 4 + k * bm];
1728: c[5 * (*idx) + 4 + k * cm] += v[4] * b[5 * i + k * bm] + v[9] * b[5 * i + 1 + k * bm] + v[14] * b[5 * i + 2 + k * bm] + v[19] * b[5 * i + 3 + k * bm] + v[24] * b[5 * i + 4 + k * bm];
1729: }
1730: v += 25;
1731: ++idx;
1732: }
1733: }
1734: z += 5;
1735: }
1736: return 0;
1737: }
1739: PetscErrorCode MatMatMultNumeric_SeqSBAIJ_SeqDense(Mat A, Mat B, Mat C)
1740: {
1741: Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1742: Mat_SeqDense *bd = (Mat_SeqDense *)B->data;
1743: Mat_SeqDense *cd = (Mat_SeqDense *)C->data;
1744: PetscInt cm = cd->lda, cn = B->cmap->n, bm = bd->lda;
1745: PetscInt mbs, i, bs = A->rmap->bs, j, n, bs2 = a->bs2;
1746: PetscBLASInt bbs, bcn, bbm, bcm;
1747: PetscScalar *z = NULL;
1748: PetscScalar *c, *b;
1749: const MatScalar *v;
1750: const PetscInt *idx, *ii;
1751: PetscScalar _DOne = 1.0;
1753: if (!cm || !cn) return 0;
1757: b = bd->v;
1758: MatZeroEntries(C);
1759: MatDenseGetArray(C, &c);
1760: switch (bs) {
1761: case 1:
1762: MatMatMult_SeqSBAIJ_1_Private(A, b, bm, c, cm, cn);
1763: break;
1764: case 2:
1765: MatMatMult_SeqSBAIJ_2_Private(A, b, bm, c, cm, cn);
1766: break;
1767: case 3:
1768: MatMatMult_SeqSBAIJ_3_Private(A, b, bm, c, cm, cn);
1769: break;
1770: case 4:
1771: MatMatMult_SeqSBAIJ_4_Private(A, b, bm, c, cm, cn);
1772: break;
1773: case 5:
1774: MatMatMult_SeqSBAIJ_5_Private(A, b, bm, c, cm, cn);
1775: break;
1776: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1777: PetscBLASIntCast(bs, &bbs);
1778: PetscBLASIntCast(cn, &bcn);
1779: PetscBLASIntCast(bm, &bbm);
1780: PetscBLASIntCast(cm, &bcm);
1781: idx = a->j;
1782: v = a->a;
1783: mbs = a->mbs;
1784: ii = a->i;
1785: z = c;
1786: for (i = 0; i < mbs; i++) {
1787: n = ii[1] - ii[0];
1788: ii++;
1789: for (j = 0; j < n; j++) {
1790: if (*idx != i) PetscCallBLAS("BLASgemm", BLASgemm_("T", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * i, &bbm, &_DOne, c + bs * (*idx), &bcm));
1791: PetscCallBLAS("BLASgemm", BLASgemm_("N", "N", &bbs, &bcn, &bbs, &_DOne, v, &bbs, b + bs * (*idx++), &bbm, &_DOne, z, &bcm));
1792: v += bs2;
1793: }
1794: z += bs;
1795: }
1796: }
1797: MatDenseRestoreArray(C, &c);
1798: PetscLogFlops((2.0 * (a->nz * 2.0 - a->nonzerorowcnt) * bs2 - a->nonzerorowcnt) * cn);
1799: return 0;
1800: }