diff -r 000000000000 -r 2f259fa3e83a ode/src/fastlsolve.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ode/src/fastlsolve.c Tue Feb 02 01:00:49 2010 +0200 @@ -0,0 +1,298 @@ +/* generated code, do not edit. */ + +#include + +/* solve L*X=B, with B containing 1 right hand sides. + * L is an n*n lower triangular matrix with ones on the diagonal. + * L is stored by rows and its leading dimension is lskip. + * B is an n*1 matrix that contains the right hand sides. + * B is stored by columns and its leading dimension is also lskip. + * B is overwritten with X. + * this processes blocks of 4*4. + * if this is in the factorizer source file, n must be a multiple of 4. + */ + +EXPORT_C void dSolveL1 (const dReal *L, dReal *B, int n, int lskip1) +{ + /* declare variables - Z matrix, p and q vectors, etc */ + dReal Z11,Z21,Z31,Z41,p1,q1,p2,p3,p4,*ex; + const dReal *ell; + int lskip2,lskip3,i,j; + /* compute lskip values */ + lskip2 = 2*lskip1; + lskip3 = 3*lskip1; + /* compute all 4 x 1 blocks of X */ + for (i=0; i <= n-4; i+=4) { + /* compute all 4 x 1 block of X, from rows i..i+4-1 */ + /* set the Z matrix to 0 */ + Z11=0; + Z21=0; + Z31=0; + Z41=0; + ell = L + i*lskip1; + ex = B; + /* the inner loop that computes outer products and adds them to Z */ + for (j=i-12; j >= 0; j -= 12) { + /* load p and q values */ + p1=ell[0]; + q1=ex[0]; + p2=ell[lskip1]; + p3=ell[lskip2]; + p4=ell[lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[1]; + q1=ex[1]; + p2=ell[1+lskip1]; + p3=ell[1+lskip2]; + p4=ell[1+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[2]; + q1=ex[2]; + p2=ell[2+lskip1]; + p3=ell[2+lskip2]; + p4=ell[2+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[3]; + q1=ex[3]; + p2=ell[3+lskip1]; + p3=ell[3+lskip2]; + p4=ell[3+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[4]; + q1=ex[4]; + p2=ell[4+lskip1]; + p3=ell[4+lskip2]; + p4=ell[4+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[5]; + q1=ex[5]; + p2=ell[5+lskip1]; + p3=ell[5+lskip2]; + p4=ell[5+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[6]; + q1=ex[6]; + p2=ell[6+lskip1]; + p3=ell[6+lskip2]; + p4=ell[6+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[7]; + q1=ex[7]; + p2=ell[7+lskip1]; + p3=ell[7+lskip2]; + p4=ell[7+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[8]; + q1=ex[8]; + p2=ell[8+lskip1]; + p3=ell[8+lskip2]; + p4=ell[8+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[9]; + q1=ex[9]; + p2=ell[9+lskip1]; + p3=ell[9+lskip2]; + p4=ell[9+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[10]; + q1=ex[10]; + p2=ell[10+lskip1]; + p3=ell[10+lskip2]; + p4=ell[10+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* load p and q values */ + p1=ell[11]; + q1=ex[11]; + p2=ell[11+lskip1]; + p3=ell[11+lskip2]; + p4=ell[11+lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* advance pointers */ + ell += 12; + ex += 12; + /* end of inner loop */ + } + /* compute left-over iterations */ + j += 12; + for (; j > 0; j--) { + /* load p and q values */ + p1=ell[0]; + q1=ex[0]; + p2=ell[lskip1]; + p3=ell[lskip2]; + p4=ell[lskip3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + Z21 += dMUL(p2,q1); + Z31 += dMUL(p3,q1); + Z41 += dMUL(p4,q1); + /* advance pointers */ + ell += 1; + ex += 1; + } + /* finish computing the X(i) block */ + Z11 = ex[0] - Z11; + ex[0] = Z11; + p1 = ell[lskip1]; + Z21 = ex[1] - Z21 - dMUL(p1,Z11); + ex[1] = Z21; + p1 = ell[lskip2]; + p2 = ell[1+lskip2]; + Z31 = ex[2] - Z31 - dMUL(p1,Z11) - dMUL(p2,Z21); + ex[2] = Z31; + p1 = ell[lskip3]; + p2 = ell[1+lskip3]; + p3 = ell[2+lskip3]; + Z41 = ex[3] - Z41 - dMUL(p1,Z11) - dMUL(p2,Z21) - dMUL(p3,Z31); + ex[3] = Z41; + /* end of outer loop */ + } + /* compute rows at end that are not a multiple of block size */ + for (; i < n; i++) { + /* compute all 1 x 1 block of X, from rows i..i+1-1 */ + /* set the Z matrix to 0 */ + Z11=0; + ell = L + i*lskip1; + ex = B; + /* the inner loop that computes outer products and adds them to Z */ + for (j=i-12; j >= 0; j -= 12) { + /* load p and q values */ + p1=ell[0]; + q1=ex[0]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[1]; + q1=ex[1]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[2]; + q1=ex[2]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[3]; + q1=ex[3]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[4]; + q1=ex[4]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[5]; + q1=ex[5]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[6]; + q1=ex[6]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[7]; + q1=ex[7]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[8]; + q1=ex[8]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[9]; + q1=ex[9]; + /* compute outer product and add it to the Z matrix */ + Z11 +=dMUL( p1,q1); + /* load p and q values */ + p1=ell[10]; + q1=ex[10]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* load p and q values */ + p1=ell[11]; + q1=ex[11]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* advance pointers */ + ell += 12; + ex += 12; + /* end of inner loop */ + } + /* compute left-over iterations */ + j += 12; + for (; j > 0; j--) { + /* load p and q values */ + p1=ell[0]; + q1=ex[0]; + /* compute outer product and add it to the Z matrix */ + Z11 += dMUL(p1,q1); + /* advance pointers */ + ell += 1; + ex += 1; + } + /* finish computing the X(i) block */ + Z11 = ex[0] - Z11; + ex[0] = Z11; + } +}