diff -r 000000000000 -r 2f259fa3e83a ode/src/lcp.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ode/src/lcp.cpp Tue Feb 02 01:00:49 2010 +0200 @@ -0,0 +1,1276 @@ +/************************************************************************* + * * + * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. * + * All rights reserved. Email: russ@q12.org Web: www.q12.org * + * * + * This library is free software; you can redistribute it and/or * + * modify it under the terms of EITHER: * + * (1) The GNU Lesser General Public License as published by the Free * + * Software Foundation; either version 2.1 of the License, or (at * + * your option) any later version. The text of the GNU Lesser * + * General Public License is included with this library in the * + * file LICENSE.TXT. * + * (2) The BSD-style license that is included with this library in * + * the file LICENSE-BSD.TXT. * + * * + * This library is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files * + * LICENSE.TXT and LICENSE-BSD.TXT for more details. * + * * + *************************************************************************/ + +/* + + +THE ALGORITHM +------------- + +solve A*x = b+w, with x and w subject to certain LCP conditions. +each x(i),w(i) must lie on one of the three line segments in the following +diagram. each line segment corresponds to one index set : + + w(i) + /|\ | : + | | : + | |i in N : + w>0 | |state[i]=0 : + | | : + | | : i in C + w=0 + +-----------------------+ + | : | + | : | + w<0 | : |i in N + | : |state[i]=1 + | : | + | : | + +-------|-----------|-----------|----------> x(i) + lo 0 hi + +the Dantzig algorithm proceeds as follows: + for i=1:n + * if (x(i),w(i)) is not on the line, push x(i) and w(i) positive or + negative towards the line. as this is done, the other (x(j),w(j)) + for j= 0. this makes the algorithm a bit +simpler, because the starting point for x(i),w(i) is always on the dotted +line x=0 and x will only ever increase in one direction, so it can only hit +two out of the three line segments. + + +NOTES +----- + +this is an implementation of "lcp_dantzig2_ldlt.m" and "lcp_dantzig_lohi.m". +the implementation is split into an LCP problem object (dLCP) and an LCP +driver function. most optimization occurs in the dLCP object. + +a naive implementation of the algorithm requires either a lot of data motion +or a lot of permutation-array lookup, because we are constantly re-ordering +rows and columns. to avoid this and make a more optimized algorithm, a +non-trivial data structure is used to represent the matrix A (this is +implemented in the fast version of the dLCP object). + +during execution of this algorithm, some indexes in A are clamped (set C), +some are non-clamped (set N), and some are "don't care" (where x=0). +A,x,b,w (and other problem vectors) are permuted such that the clamped +indexes are first, the unclamped indexes are next, and the don't-care +indexes are last. this permutation is recorded in the array `p'. +initially p = 0..n-1, and as the rows and columns of A,x,b,w are swapped, +the corresponding elements of p are swapped. + +because the C and N elements are grouped together in the rows of A, we can do +lots of work with a fast dot product function. if A,x,etc were not permuted +and we only had a permutation array, then those dot products would be much +slower as we would have a permutation array lookup in some inner loops. + +A is accessed through an array of row pointers, so that element (i,j) of the +permuted matrix is A[i][j]. this makes row swapping fast. for column swapping +we still have to actually move the data. + +during execution of this algorithm we maintain an L*D*L' factorization of +the clamped submatrix of A (call it `AC') which is the top left nC*nC +submatrix of A. there are two ways we could arrange the rows/columns in AC. + +(1) AC is always permuted such that L*D*L' = AC. this causes a problem + when a row/column is removed from C, because then all the rows/columns of A + between the deleted index and the end of C need to be rotated downward. + this results in a lot of data motion and slows things down. +(2) L*D*L' is actually a factorization of a *permutation* of AC (which is + itself a permutation of the underlying A). this is what we do - the + permutation is recorded in the vector C. call this permutation A[C,C]. + when a row/column is removed from C, all we have to do is swap two + rows/columns and manipulate C. + +*/ + +#include +#include "lcp.h" +#include +#include +#include "mat.h" // for testing +#include // for testing + +//*************************************************************************** +// code generation parameters + +#define dLCP_FAST // use fast dLCP object +#define dUSE_MALLOC_FOR_ALLOCA + +// option 1 : matrix row pointers (less data copying) +#define ROWPTRS +#define ATYPE dReal ** +#define AROW(i) (A[i]) + +// option 2 : no matrix row pointers (slightly faster inner loops) +//#define NOROWPTRS +//#define ATYPE dReal * +//#define AROW(i) (A+(i)*nskip) + +// use protected, non-stack memory allocation system + +#ifdef dUSE_MALLOC_FOR_ALLOCA +extern unsigned int dMemoryFlag; + +#define ALLOCA(t,v,s) t* v = (t*) malloc(s) +#define UNALLOCA(t) free(t) + +#else + +#define ALLOCA(t,v,s) t* v =(t*)dALLOCA16(s) +#define UNALLOCA(t) /* nothing */ + +#endif + +#define NUB_OPTIMIZATIONS + +//*************************************************************************** + +// swap row/column i1 with i2 in the n*n matrix A. the leading dimension of +// A is nskip. this only references and swaps the lower triangle. +// if `do_fast_row_swaps' is nonzero and row pointers are being used, then +// rows will be swapped by exchanging row pointers. otherwise the data will +// be copied. + +static void swapRowsAndCols (ATYPE A, int n, int i1, int i2, int /*nskip*/, + int do_fast_row_swaps) +{ + int i; + + +# ifdef ROWPTRS + for (i=i1+1; i 0) { + memcpy (tmprow,A+i1*nskip,i1*sizeof(dReal)); + memcpy (A+i1*nskip,A+i2*nskip,i1*sizeof(dReal)); + memcpy (A+i2*nskip,tmprow,i1*sizeof(dReal)); + } + for (i=i1+1; i nub + if (nub < n) { + for (k=0; k<100; k++) { + int i1,i2; + do { + i1 = dRandInt(n-nub)+nub; + i2 = dRandInt(n-nub)+nub; + } + while (i1 > i2); + //printf ("--> %d %d\n",i1,i2); + swapProblem (A,x,b,w,lo,hi,p,state,findex,n,i1,i2,nskip,0); + } + } + */ + + // permute the problem so that *all* the unbounded variables are at the + // start, i.e. look for unbounded variables not included in `nub'. we can + // potentially push up `nub' this way and get a bigger initial factorization. + // note that when we swap rows/cols here we must not just swap row pointers, + // as the initial factorization relies on the data being all in one chunk. + // variables that have findex >= 0 are *not* considered to be unbounded even + // if lo=-inf and hi=inf - this is because these limits may change during the + // solution process. + + for (k=nub; k= 0) continue; + if (lo[k]==-dInfinity && hi[k]==dInfinity) { + swapProblem (A,x,b,w,lo,hi,p,state,findex,n,nub,k,nskip,0); + nub++; + } + } + + // if there are unbounded variables at the start, factorize A up to that + // point and solve for x. this puts all indexes 0..nub-1 into C. + if (nub > 0) { + for (k=0; k nub such that all findex variables are at the end + if (findex) { + int num_at_end = 0; + for (k=n-1; k >= nub; k--) { + if (findex[k] >= 0) { + swapProblem (A,x,b,w,lo,hi,p,state,findex,n,k,n-1-num_at_end,nskip,1); + num_at_end++; + } + } + } + + // print info about indexes + /* + for (k=0; k 0) { + // ell,Dell were computed by solve1(). note, ell = D \ L1solve (L,A(i,C)) + for (j=0; j 0) { + dReal *aptr = AROW(i); +# ifdef NUB_OPTIMIZATIONS + // if nub>0, initial part of aptr unpermuted + for (j=0; j 0) { + for (int i=0; i 0) { + dReal *aptr = AROW(i); +# ifdef NUB_OPTIMIZATIONS + // if nub>0, initial part of aptr[] is guaranteed unpermuted + for (j=0; j 0) { + for (j=0; j= 0) { + lcp.transfer_i_to_N (i); + } + else { + for (;;) { + // compute: delta_x(C) = -A(C,C)\A(C,i) + dSetZero (delta_x,n); + lcp.solve1 (delta_x,i); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (dMemoryFlag == d_MEMORY_OUT_OF_MEMORY) { + UNALLOCA(dummy); + UNALLOCA(C); + UNALLOCA(p); + UNALLOCA(Arows); + UNALLOCA(tmp); + UNALLOCA(ell); + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + return; + } +#endif + delta_x[i] = REAL(1.0); + + // compute: delta_w = A*delta_x + dSetZero (delta_w,n); + lcp.pN_equals_ANC_times_qC (delta_w,delta_x); + lcp.pN_plusequals_ANi (delta_w,i); + delta_w[i] = lcp.AiC_times_qC (i,delta_x) + lcp.Aii(i); + + // find index to switch + int si = i; // si = switch index + int si_in_N = 0; // set to 1 if si in N + dReal s = -dDIV(w[i],delta_w[i]); + + if (s <= 0) { + if (i < (n-1)) { + dSetZero (x+i,n-i); + dSetZero (w+i,n-i); + } + goto done; + } + + for (k=0; k < lcp.numN(); k++) { + if (delta_w[lcp.indexN(k)] < 0) { + dReal s2 = -dDIV(w[lcp.indexN(k)],delta_w[lcp.indexN(k)]); + if (s2 < s) { + s = s2; + si = lcp.indexN(k); + si_in_N = 1; + } + } + } + for (k=0; k < lcp.numC(); k++) { + if (delta_x[lcp.indexC(k)] < 0) { + dReal s2 = -dDIV(x[lcp.indexC(k)],delta_x[lcp.indexC(k)]); + if (s2 < s) { + s = s2; + si = lcp.indexC(k); + si_in_N = 0; + } + } + } + + // apply x = x + s * delta_x + lcp.pC_plusequals_s_times_qC (x,s,delta_x); + x[i] += s; + lcp.pN_plusequals_s_times_qN (w,s,delta_w); + w[i] += dMUL(s,delta_w[i]); + + // switch indexes between sets if necessary + if (si==i) { + w[i] = 0; + lcp.transfer_i_to_C (i); + break; + } + if (si_in_N) { + w[si] = 0; + lcp.transfer_i_from_N_to_C (si); + } + else { + x[si] = 0; + lcp.transfer_i_from_C_to_N (si); + } + } + } + } + + done: + lcp.unpermute(); + + UNALLOCA (L); + UNALLOCA (d); + UNALLOCA (delta_x); + UNALLOCA (delta_w); + UNALLOCA (Dell); + UNALLOCA (ell); + UNALLOCA (tmp); + UNALLOCA (Arows); + UNALLOCA (p); + UNALLOCA (C); + UNALLOCA (dummy); +} + +//*************************************************************************** +// an optimized Dantzig LCP driver routine for the lo-hi LCP problem. + +void dSolveLCP (int n, dReal *A, dReal *x, dReal *b, + dReal *w, int nub, dReal *lo, dReal *hi, int *findex) +{ + + int i,k,hit_first_friction_index = 0; + int nskip = dPAD(n); + + // if all the variables are unbounded then we can just factor, solve, + // and return + if (nub >= n) { + dFactorLDLT (A,w,n,nskip); // use w for d + dSolveLDLT (A,w,b,n,nskip); + memcpy (x,b,n*sizeof(dReal)); + dSetZero (w,n); + + return; + } + ALLOCA (dReal,L,n*nskip*sizeof(dReal)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (L == NULL) { + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (dReal,d,n*sizeof(dReal)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (d == NULL) { + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (dReal,delta_x,n*sizeof(dReal)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (delta_x == NULL) { + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (dReal,delta_w,n*sizeof(dReal)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (delta_w == NULL) { + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (dReal,Dell,n*sizeof(dReal)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (Dell == NULL) { + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (dReal,ell,n*sizeof(dReal)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (ell == NULL) { + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (dReal*,Arows,n*sizeof(dReal*)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (Arows == NULL) { + UNALLOCA(ell); + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (int,p,n*sizeof(int)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (p == NULL) { + UNALLOCA(Arows); + UNALLOCA(ell); + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + ALLOCA (int,C,n*sizeof(int)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (C == NULL) { + UNALLOCA(p); + UNALLOCA(Arows); + UNALLOCA(ell); + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + + int dir; + dReal dirf; + + // for i in N, state[i] is 0 if x(i)==lo(i) or 1 if x(i)==hi(i) + ALLOCA (int,state,n*sizeof(int)); +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (state == NULL) { + UNALLOCA(C); + UNALLOCA(p); + UNALLOCA(Arows); + UNALLOCA(ell); + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + dMemoryFlag = d_MEMORY_OUT_OF_MEMORY; + return; + } +#endif + + // create LCP object. note that tmp is set to delta_w to save space, this + // optimization relies on knowledge of how tmp is used, so be careful! + dLCP *lcp=new dLCP(n,nub,A,x,b,w,lo,hi,L,d,Dell,ell,delta_w,state,findex,p,C,Arows); + nub = lcp->getNub(); + + // loop over all indexes nub..n-1. for index i, if x(i),w(i) satisfy the + // LCP conditions then i is added to the appropriate index set. otherwise + // x(i),w(i) is driven either +ve or -ve to force it to the valid region. + // as we drive x(i), x(C) is also adjusted to keep w(C) at zero. + // while driving x(i) we maintain the LCP conditions on the other variables + // 0..i-1. we do this by watching out for other x(i),w(i) values going + // outside the valid region, and then switching them between index sets + // when that happens. + + for (i=nub; i= 0) { + // un-permute x into delta_w, which is not being used at the moment + for (k=0; kAiC_times_qC (i,x) + lcp->AiN_times_qN (i,x) - b[i]; + + // if lo=hi=0 (which can happen for tangential friction when normals are + // 0) then the index will be assigned to set N with some state. however, + // set C's line has zero size, so the index will always remain in set N. + // with the "normal" switching logic, if w changed sign then the index + // would have to switch to set C and then back to set N with an inverted + // state. this is pointless, and also computationally expensive. to + // prevent this from happening, we use the rule that indexes with lo=hi=0 + // will never be checked for set changes. this means that the state for + // these indexes may be incorrect, but that doesn't matter. + + // see if x(i),w(i) is in a valid region + if (lo[i]==0 && w[i] >= 0) { + lcp->transfer_i_to_N (i); + state[i] = 0; + } + else if (hi[i]==0 && w[i] <= 0) { + lcp->transfer_i_to_N (i); + state[i] = 1; + } + else if (w[i]==0) { + // this is a degenerate case. by the time we get to this test we know + // that lo != 0, which means that lo < 0 as lo is not allowed to be +ve, + // and similarly that hi > 0. this means that the line segment + // corresponding to set C is at least finite in extent, and we are on it. + // NOTE: we must call lcp->solve1() before lcp->transfer_i_to_C() + lcp->solve1 (delta_x,i,0,1); + +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (dMemoryFlag == d_MEMORY_OUT_OF_MEMORY) { + UNALLOCA(state); + UNALLOCA(C); + UNALLOCA(p); + UNALLOCA(Arows); + UNALLOCA(ell); + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + return; + } +#endif + + lcp->transfer_i_to_C (i); + } + else { + // we must push x(i) and w(i) + for (;;) { + // find direction to push on x(i) + if (w[i] <= 0) { + dir = 1; + dirf = REAL(1.0); + } + else { + dir = -1; + dirf = REAL(-1.0); + } + + // compute: delta_x(C) = -dir*A(C,C)\A(C,i) + lcp->solve1 (delta_x,i,dir); + +#ifdef dUSE_MALLOC_FOR_ALLOCA + if (dMemoryFlag == d_MEMORY_OUT_OF_MEMORY) { + UNALLOCA(state); + UNALLOCA(C); + UNALLOCA(p); + UNALLOCA(Arows); + UNALLOCA(ell); + UNALLOCA(Dell); + UNALLOCA(delta_w); + UNALLOCA(delta_x); + UNALLOCA(d); + UNALLOCA(L); + return; + } +#endif + + // note that delta_x[i] = dirf, but we wont bother to set it + + // compute: delta_w = A*delta_x ... note we only care about + // delta_w(N) and delta_w(i), the rest is ignored + lcp->pN_equals_ANC_times_qC (delta_w,delta_x); + lcp->pN_plusequals_ANi (delta_w,i,dir); + delta_w[i] = lcp->AiC_times_qC (i,delta_x) + dMUL(lcp->Aii(i),dirf); + + // find largest step we can take (size=s), either to drive x(i),w(i) + // to the valid LCP region or to drive an already-valid variable + // outside the valid region. + + int cmd = 1; // index switching command + int si = 0; // si = index to switch if cmd>3 + dReal s = -dDIV(w[i],delta_w[i]); + if (dir > 0) { + if (hi[i] < dInfinity) { + dReal s2 = dDIV((hi[i]-x[i]),dirf); // step to x(i)=hi(i) + if (s2 < s) { + s = s2; + cmd = 3; + } + } + } + else { + if (lo[i] > -dInfinity) { + dReal s2 = dDIV((lo[i]-x[i]),dirf); // step to x(i)=lo(i) + if (s2 < s) { + s = s2; + cmd = 2; + } + } + } + + for (k=0; k < lcp->numN(); k++) { + if ((state[lcp->indexN(k)]==0 && delta_w[lcp->indexN(k)] < 0) || + (state[lcp->indexN(k)]!=0 && delta_w[lcp->indexN(k)] > 0)) { + // don't bother checking if lo=hi=0 + if (lo[lcp->indexN(k)] == 0 && hi[lcp->indexN(k)] == 0) continue; + dReal s2 = -dDIV(w[lcp->indexN(k)],delta_w[lcp->indexN(k)]); + if (s2 < s) { + s = s2; + cmd = 4; + si = lcp->indexN(k); + } + } + } + + for (k=nub; k < lcp->numC(); k++) { + if (delta_x[lcp->indexC(k)] < 0 && lo[lcp->indexC(k)] > -dInfinity) { + dReal s2 = dDIV((lo[lcp->indexC(k)]-x[lcp->indexC(k)]),delta_x[lcp->indexC(k)]); + if (s2 < s) { + s = s2; + cmd = 5; + si = lcp->indexC(k); + } + } + if (delta_x[lcp->indexC(k)] > 0 && hi[lcp->indexC(k)] < dInfinity) { + dReal s2 = dDIV((hi[lcp->indexC(k)]-x[lcp->indexC(k)]),delta_x[lcp->indexC(k)]); + if (s2 < s) { + s = s2; + cmd = 6; + si = lcp->indexC(k); + } + } + } + + //static char* cmdstring[8] = {0,"->C","->NL","->NH","N->C", + // "C->NL","C->NH"}; + //printf ("cmd=%d (%s), si=%d\n",cmd,cmdstring[cmd],(cmd>3) ? si : i); + + // if s <= 0 then we've got a problem. if we just keep going then + // we're going to get stuck in an infinite loop. instead, just cross + // our fingers and exit with the current solution. + if (s <= 0) { + if (i < (n-1)) { + dSetZero (x+i,n-i); + dSetZero (w+i,n-i); + } + goto done; + } + + // apply x = x + s * delta_x + lcp->pC_plusequals_s_times_qC (x,s,delta_x); + x[i] += dMUL(s,dirf); + + // apply w = w + s * delta_w + lcp->pN_plusequals_s_times_qN (w,s,delta_w); + w[i] += dMUL(s,delta_w[i]); + + // switch indexes between sets if necessary + switch (cmd) { + case 1: // done + w[i] = 0; + lcp->transfer_i_to_C (i); + break; + case 2: // done + x[i] = lo[i]; + state[i] = 0; + lcp->transfer_i_to_N (i); + break; + case 3: // done + x[i] = hi[i]; + state[i] = 1; + lcp->transfer_i_to_N (i); + break; + case 4: // keep going + w[si] = 0; + lcp->transfer_i_from_N_to_C (si); + break; + case 5: // keep going + x[si] = lo[si]; + state[si] = 0; + lcp->transfer_i_from_C_to_N (si); + break; + case 6: // keep going + x[si] = hi[si]; + state[si] = 1; + lcp->transfer_i_from_C_to_N (si); + break; + } + + if (cmd <= 3) break; + } + } + } + + done: + lcp->unpermute(); + delete lcp; + + UNALLOCA (L); + UNALLOCA (d); + UNALLOCA (delta_x); + UNALLOCA (delta_w); + UNALLOCA (Dell); + UNALLOCA (ell); + UNALLOCA (Arows); + UNALLOCA (p); + UNALLOCA (C); + UNALLOCA (state); +}