/** \file opennl/superlu/smyblas2.c * \ingroup opennl */ /* * -- SuperLU routine (version 2.0) -- * Univ. of California Berkeley, Xerox Palo Alto Research Center, * and Lawrence Berkeley National Lab. * November 15, 1997 * */ /* * File name: smyblas2.c * Purpose: * Level 2 BLAS operations: solves and matvec, written in C. * Note: * This is only used when the system lacks an efficient BLAS library. */ /* * Solves a dense UNIT lower triangular system. The unit lower * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). * The solution will be returned in the rhs vector. */ /* local prototypes*/ void slsolve ( int, int, float *, float *); void susolve ( int, int, float *, float *); void smatvec ( int, int, int, float *, float *, float *); void slsolve ( int ldm, int ncol, float *M, float *rhs ) { int k; float x0, x1, x2, x3, x4, x5, x6, x7; float *M0; register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; register int firstcol = 0; M0 = &M[0]; while ( firstcol < ncol - 7 ) { /* Do 8 columns */ Mki0 = M0 + 1; Mki1 = Mki0 + ldm + 1; Mki2 = Mki1 + ldm + 1; Mki3 = Mki2 + ldm + 1; Mki4 = Mki3 + ldm + 1; Mki5 = Mki4 + ldm + 1; Mki6 = Mki5 + ldm + 1; Mki7 = Mki6 + ldm + 1; x0 = rhs[firstcol]; x1 = rhs[firstcol+1] - x0 * *Mki0++; x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - x3 * *Mki3++; x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - x3 * *Mki3++ - x4 * *Mki4++; x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ - x6 * *Mki6++; rhs[++firstcol] = x1; rhs[++firstcol] = x2; rhs[++firstcol] = x3; rhs[++firstcol] = x4; rhs[++firstcol] = x5; rhs[++firstcol] = x6; rhs[++firstcol] = x7; ++firstcol; for (k = firstcol; k < ncol; k++) rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ - x6 * *Mki6++ - x7 * *Mki7++; M0 += 8 * ldm + 8; } while ( firstcol < ncol - 3 ) { /* Do 4 columns */ Mki0 = M0 + 1; Mki1 = Mki0 + ldm + 1; Mki2 = Mki1 + ldm + 1; Mki3 = Mki2 + ldm + 1; x0 = rhs[firstcol]; x1 = rhs[firstcol+1] - x0 * *Mki0++; x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; rhs[++firstcol] = x1; rhs[++firstcol] = x2; rhs[++firstcol] = x3; ++firstcol; for (k = firstcol; k < ncol; k++) rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ - x3 * *Mki3++; M0 += 4 * ldm + 4; } if ( firstcol < ncol - 1 ) { /* Do 2 columns */ Mki0 = M0 + 1; Mki1 = Mki0 + ldm + 1; x0 = rhs[firstcol]; x1 = rhs[firstcol+1] - x0 * *Mki0++; rhs[++firstcol] = x1; ++firstcol; for (k = firstcol; k < ncol; k++) rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; } } /* * Solves a dense upper triangular system. The upper triangular matrix is * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned * in the rhs vector. */ void susolve ( ldm, ncol, M, rhs ) int ldm; /* in */ int ncol; /* in */ float *M; /* in */ float *rhs; /* modified */ { float xj; int jcol, j, irow; jcol = ncol - 1; for (j = 0; j < ncol; j++) { xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */ rhs[jcol] = xj; for (irow = 0; irow < jcol; irow++) rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */ jcol--; } } /* * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec. * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[]. */ void smatvec ( ldm, nrow, ncol, M, vec, Mxvec ) int ldm; /* in -- leading dimension of M */ int nrow; /* in */ int ncol; /* in */ float *M; /* in */ float *vec; /* in */ float *Mxvec; /* in/out */ { float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; float *M0; register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; register int firstcol = 0; int k; M0 = &M[0]; while ( firstcol < ncol - 7 ) { /* Do 8 columns */ Mki0 = M0; Mki1 = Mki0 + ldm; Mki2 = Mki1 + ldm; Mki3 = Mki2 + ldm; Mki4 = Mki3 + ldm; Mki5 = Mki4 + ldm; Mki6 = Mki5 + ldm; Mki7 = Mki6 + ldm; vi0 = vec[firstcol++]; vi1 = vec[firstcol++]; vi2 = vec[firstcol++]; vi3 = vec[firstcol++]; vi4 = vec[firstcol++]; vi5 = vec[firstcol++]; vi6 = vec[firstcol++]; vi7 = vec[firstcol++]; for (k = 0; k < nrow; k++) Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + vi2 * *Mki2++ + vi3 * *Mki3++ + vi4 * *Mki4++ + vi5 * *Mki5++ + vi6 * *Mki6++ + vi7 * *Mki7++; M0 += 8 * ldm; } while ( firstcol < ncol - 3 ) { /* Do 4 columns */ Mki0 = M0; Mki1 = Mki0 + ldm; Mki2 = Mki1 + ldm; Mki3 = Mki2 + ldm; vi0 = vec[firstcol++]; vi1 = vec[firstcol++]; vi2 = vec[firstcol++]; vi3 = vec[firstcol++]; for (k = 0; k < nrow; k++) Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + vi2 * *Mki2++ + vi3 * *Mki3++ ; M0 += 4 * ldm; } while ( firstcol < ncol ) { /* Do 1 column */ Mki0 = M0; vi0 = vec[firstcol++]; for (k = 0; k < nrow; k++) Mxvec[k] += vi0 * *Mki0++; M0 += ldm; } }