/*
 dpr1fact - Factor d[iag] p[lus] r[ank] 1:
    [p,perm,beta,betajc,d,dep,ordered] = dpr1fact(x, d, rowperm,colperm,xsuper,
        dep,maxu)
    Computes fi and d such that
       diag(d_IN) + x*x' = 
(PI_{i=1}^n L(p_OUT^i,beta_i)) * diag(d_OUT) * (PI_{i=1}^n L(p_OUT^i,beta_i))'
    where L(p,beta) = eye(n) + tril(p*beta',-1).
    
 dep_OUT is the subset of dep where the rows/columns remain dependent.
 ordered(k) = 1 if p(:,k) has been reordered, with permutation in p.ir.
 We reorder if otherwise |p(i,k)*beta(j,k)| > maxu.

    This file is part of SeDuMi 1.03BETA
    Copyright (C) 1999 Jos F. Sturm
    Dept. Quantitative Economics, Maastricht University, the Netherlands.
    Affiliations up to SeDuMi 1.02 (AUG1998):
      CRL, McMaster University, Canada.
      Supported by the Netherlands Organization for Scientific Research (NWO).
  
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
  
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
  
    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

*/

#include <math.h>
#include <string.h>
#include "mex.h"
#include "blksdp.h"

/* 
  [p,beta,betajc,d,dep,ordered] = dpr1fact(x, d, rowperm,colperm,xsuper,
   dep, maxu)
*/
#define P_OUT myplhs[0]
#define PERM_OUT myplhs[1]
#define BETA_OUT myplhs[2]
#define BETAJC_OUT myplhs[3]
#define D_OUT myplhs[4]
#define DEP_OUT myplhs[5]
#define ORDERED_OUT myplhs[6]
#define NPAROUT 7

#define X_IN prhs[0]
#define D_IN prhs[1]
#define ROWPERM_IN prhs[2]
#define COLPERM_IN prhs[3]
#define XSUPER_IN prhs[4]
#define DEPROWS_IN prhs[5]
#define MAXU_IN prhs[6]
#define NPARIN 7

/* --------------------------------------
   FLOAT COMPARE: FOR SORTING A FLOAT ARRAY
   -------------------------------------- */
typedef struct{
  double r;
  int i;
} iandr;

/* From blksdp.h
typedef int (*COMPFUN)(const void *pa,const void *pb);
*/
#define rirsortdec(vec,n)  qsort((void *)(vec), (n), \
			    sizeof(iandr), (COMPFUN) rircmpdec);

int rircmpdec(const iandr *a, const iandr *b)
{
   return( (a->r < b->r) - (a->r > b->r)  );
}

#define isortinc(vec,n)  qsort((void *)(vec), (n), \
			    sizeof(int), (COMPFUN) icmpinc);

int icmpinc(const int *a, const int *b)
{
   return( (*a > *b) - (*a < *b)  );
}

/* ============================================================
   DPR1FACT-subroutines: Compact Cholesky for X = diag(d) + p*p'.
   several versions, to allow sequential or permuted ordering.
   ============================================================ */

/* ************************************************************
   dpr1fact - Compact Cholesky for X = diag(d) + p*p' to
       X = L(p,beta) * diag(d_OUT) * L(p,beta)'
       where L(p,beta) = eye(m) + tril(p*beta',-1)
   INPUT:
     n - Order of beta. n = min(m,idep), where idep is the
       1st entry where d(idep) = 0 on input. Caller then needs to finish by
       pivoting on idep by itself.
     t - Initial t: set t = 1 for D+p*p', set t = -1 for D-p*p'.
     mu - mu(m) = 0, mu(i) = max(psqr(i+1:mk)), for i=1:mk-1.
     maxu - Controls stability check: we postpone rows such that
       max(abs(L)) <= maxu.
   UPDATED:
     d    - Length n vector: the diagonal entries. On input, the old ones,
          d(1:n) > 0. On output the updated ones after the factorization.
          Remain positive if t > 0.
     fi   - on input, contains the vector x (=p.^2),
          on output it is such that beta(j) = p(j) / fi(j), for
          j not in ph2psqr.i.
   OUTPUT
     ph2psqr - The postponed rows j, with corresponding psqr(j). Controled
       by maxu.
   REMARK:
     Since L=eye(m)+tril(p*beta'), beta(n-1) and fi(n-1) are useful only
     if m > n: it'll be used in rows n:m-1.
   RETURNS: nph2, number of postponed nodes = length(ph2psqr).
   ************************************************************ */
int dpr1fact(double *fi, double *d, iandr *ph2psqr, double *pt, const int n,
             const double *mu, const double maxu)
{
  int nph2;
  double dj,fij, muph2, t;
  iandr p2j;

/* ------------------------------------------------------------
   fi(j) = x(j) + t*d(j),  d_new(j) = fi(j)/t,  tnew = fi(j)/d_old(j)
   Store j in p2j.i
   ------------------------------------------------------------ */ 
  t = *pt;
  nph2 = 0;
  muph2 = 0.0;                 /* muph2 = max(psqr(postponed_nodes)) */
  for(p2j.i = 0; p2j.i < n; p2j.i++){
    dj = d[p2j.i];
    p2j.r = fi[p2j.i];                    /* p2j = {j, p_j^2} */
    fij = p2j.r + t*dj;                  /* fi(j) = p_j^2 + t*d_j */
    if(p2j.r * MAX(muph2, mu[p2j.i]) <= SQR(maxu * fij)){
      fi[p2j.i] = fij;                 /* pivot j is stable */
      d[p2j.i]  = fij / t;             /* d(j;NEW) = d_j + (p_j^2 / t). */
      t         = fij / dj;            /* Compute new t for next iter. */
    }
    else{
      ph2psqr[nph2++] = p2j;            /* Postpone to phase 2 */
      muph2 = MAX(muph2, p2j.r);        /* max(ph2psqr.r) */
    }
  }
  *pt = t;
  return nph2;
}


/* ************************************************************
   dpr1factperm - Compact Cholesky for X = diag(d) + p*p' to
       X = L(p,beta) * diag(d_OUT) * L(p,beta)'
       where L(p,beta) = eye(m) + tril(p*beta',-1).
       Follows the sequence given in "perm"; realligns accepted pivots
       from start of "perm", stores rejected ones in ph2psqr.
   INPUT:
     n - Order of beta.  n = min(m,idep), where idep is the
       1st entry where d(idep) = 0 on input. Caller then needs to finish by
       pivoting on idep by itself.
     t - Initial t: set t = 1 for D+p*p', set t = -1 for D-p*p'.
     maxu - Controls stability check: we postpone rows such that
       max(abs(L)) <= maxu.
     mu - max(psqr(perm[i+1:m-1])) for all i=1:n (n <= m). NB: in perm-order.
   UPDATED:
     perm - pivot sequence. Evaluate pivots perm(0:n-1). On output,
       perm(0:n-nph2-1) are the accepted pivots. 
     d    - Length n vector: the diagonal entries. On input, the old ones,
          d(1:n) > 0. On output the updated ones after the factorization.
          Remain positive if t > 0.
     fi   - on input, contains the vector x (=p.^2),
          on output s.t. beta(j) = p(j) / fi(j) for j=perm[0:n-nph2-1].
   OUTPUT
     ph2psqr - The postponed rows j, with corresponding psqr(j). Controled
       by maxu.
   REMARK:
     Since L=eye(m)+tril(p*beta'), beta(n-1) and fi(n-1) are useful only
     if m > n: it'll be used in rows n:m-1.
   RETURNS: nph2, number of postponed nodes = length(ph2psqr).
   ************************************************************ */
int dpr1factperm(double *fi, double *d, iandr *ph2psqr, double *pt,
                 int *perm, const int n, const double *mu, const double maxu)
{
  int i, jnz, nph2;
  double dj,fij, muph2, t;
  iandr p2j;

/* ------------------------------------------------------------
   fi(j) = x(j) + t*d(j),  d_new(j) = fi(j)/t,  tnew = fi(j)/d_old(j)
   Store j in p2j.i
   ------------------------------------------------------------ */ 
  t = *pt;
  nph2 = 0;
  muph2 = 0.0;
  jnz = 0;         /* index into perm_OUT, for accepted pivots */
  for(i = 0; i < n; i++){
    p2j.i = perm[i];
    dj = d[p2j.i];
    p2j.r = fi[p2j.i];                    /* p2j = {j, p_j^2} */
    fij = p2j.r + t*dj;                  /* fi(j) = p_j^2 + t*d_j */
    if(p2j.r * MAX(muph2, mu[i]) <= SQR(maxu * fij)){
      fi[p2j.i] = fij;                   /* pivot j is stable */
      perm[jnz++] = p2j.i;
      d[p2j.i]  = fij / t;             /* d(j;NEW) = d_j + (p_j^2 / t). */
      t         = fij / dj;            /* Compute new t for next iter. */
    }
    else{
      ph2psqr[nph2++] = p2j;            /* Postpone to phase 2 */
      muph2 = MAX(muph2, p2j.r);        /* max(ph2psqr.r) */
    }
  }
  mxAssert(jnz + nph2 == n, "");
  *pt = t;
  return nph2;
}

/* ************************************************************
   TIME CRITICAL PROCEDURE fromto -- x(i:n) = i:n
   ************************************************************ */
void fromto(int *x, int i, const int n)
{
  for(; i < n; i++)
    x[i] = i;
}

/* ************************************************************
   ph2dpr1fact - Compact Cholesky for X = diag(d) + p*p' to
       X = L(p,beta) * diag(d_OUT) * L(p,beta)'
       where L(p,beta) = eye(m) + tril(p*beta',-1)
   INPUT:
     n - Order of psqr (number of phase-2 rows).
     t - Initial t: output from 1st phase; is mon. incr.
       t >= 1 for D+p*p', whereas -1 <= t < 0 for D-p*p'.
   UPDATED:
     psqr - Contains the sparse vector (p.^2), where the row-indices
          are the postponed row numbers. On output, the r-values are
          replaced by fi (so that beta = p ./ fi).
     d    - the diagonal entries. On input, the old ones,
          on output the updated ones after the factorization.
          Only those with psqr.i-indices are changed (should be
          all positive already on input).
   REMARK:
     Since L=eye(m)+tril(p*beta'), beta(n-1) and fi(n-1) are useful only
     if m > n: it'll be used in rows n:m-1.
   ************************************************************ */
void ph2dpr1fact(iandr *psqr, double *d, double *pt, const int n)
{
  int j, jnz;
  double dj,fij,t;
  t = *pt;
/* ------------------------------------------------------------
   fi(j) = x(j) + t*d(j),  d_new(j) = fi(j)/t,  tnew = fi(j)/d_old(j)
   ------------------------------------------------------------ */ 
  for(jnz = 0; jnz < n; jnz++){
    j = (psqr+jnz)->i;
    dj = d[j];
    fij = ((psqr+jnz)->r += t*dj);        /* fi(j) = p_j^2 + t*d_j */
    d[j]    = fij / t;             /* d(j;NEW) = d_j + (p_j^2 / t). */
    t       = fij / dj;            /* Compute new t for next iter. */
  }
  *pt = t;
}

/* ============================================================
   MAIN routine for Compact Cholesky for X = diag(d) + p*p'.
   redirects to the dpr1fact subroutines.
   ============================================================ */

/* ************************************************************
   PROCEDURE dodpr1fact - Factors diag +/- rank-1:
     (D+(1/t)*p*p')(perm) = L * diag(d_NEW(perm)) * L',
     L = I+tril(p(perm)*beta',-1).
   INPUT
     p    - length m. We've to factor diag(d)+ (1/t) * p*p'.
     t    - scalar: 1 for adding p*p', -1 for subtracting p*p'.
     maxu - scalar >= 1: The factor L(p,beta) = I+tril(p(perm)*beta',-1)
       will be such that max(abs(L)) <= maxu by choosing perm-ordering.
     m    - length(p).
   UPDATED
     d    - length m. The diagonal. This factors
       diag(d_OLD)+(1/t)*p*p' = L(p,beta) * diag(d_NEW) * L(p,beta)'
   OUTPUT
     beta - Length <= m (actual length returned in *pm).
     perm - Length m. Only written if RETURN=1, which means that the
       original ordering was not maxu-stable. Pivot ordering on p,d.
     pn   -  *pn = length(beta) <= m; n<m only if there are dependent rows.
     dep  - Length ndep+1. Lists rows i where d(i) == 0. Indices are
       ascending, and dep[ndep] >= m is tail of this list. On output,
       one entry may be removed, and stored in dep[ndep_OLD].
     *pndep - Cardinality of dep. May be decremented on output, if a
       dependency could be removed, i.e. if t > 0 and p(dep) != 0.
   WORK
     psqr - length m float working array, for p.^2 and later "fi".
     riwork - length m working array for storing postponed
       rows (rowno and psqr(rowno)), which have to be sorted.
   RETURNS 1 if reordered rows into perm; 0 means that we used
     the sequential 0:m-1 ordering.
   CAUTION: If t < 0, one dependency may be added by the
       rank-1 subtraction. The caller should therefore call findnewdep
       afterwards (for t < 0).
   ************************************************************ */
char dodpr1fact(double *beta, int *perm, double *d, double t, const double *p,
                const int m, int *pn, int *dep, int *pndep,
                const double maxu, double *psqr, iandr *riwork)
{
  int ndep, n, i, j, nph2, nextj, idep;
  double psqrdep, h;
  double *mu;
  char deldep;

  ndep = *pndep;
/* ------------------------------------------------------------
   Use beta temporarily as mu(1:m), which lists max(psqr(i+1:m)).
   mu will be used only to select stable pivots, before writing beta.
   ------------------------------------------------------------ */
  mu = beta;
/* ------------------------------------------------------------
   Let psqr = p(1:m).^2
   ------------------------------------------------------------ */
  realHadamard(psqr, p, p, m);
/* ------------------------------------------------------------
   Case A: d(1:mk) > 0 (no dep). Then n = m.
   ------------------------------------------------------------ */
  if(dep[0] >= m){
    *pn = m;
/* ------------------------------------------------------------
   Let mu(m) = 0, mu(i) = max(psqr(i+1:mk)), for i=1:mk-1.
   ------------------------------------------------------------ */
    for(h = 0.0, i = m - 1; i >= 0; i--){
      mu[i] = h;
      h = MAX(h, psqr[i]);
    }
/* ------------------------------------------------------------
   1st round: pivot sequentially on 1:m, skipping instable ones.
   ------------------------------------------------------------ */
    nph2 = dpr1fact(psqr, d, riwork, &t, m, mu, maxu);
/* ------------------------------------------------------------
   Write results 1st round: beta = p ./ psqr.
   ------------------------------------------------------------ */
    if(!nph2){                  /* all 1:m handled */
      realHadadiv(beta, p, psqr, m);
      return 0;
    }
    else{                       /* skipped riwork.i */
      for(i = 0, j = 0; i < nph2; i++){
        nextj = (riwork+i)->i;
        fromto(perm, j, nextj);       /* perm[j-i:nextj-i] = j:nextj */
        realHadadiv(beta + j, p + j, psqr + j, nextj - j);
        j = nextj + 1;              /* skip nextj == (riwork+i)->i */
        --perm; --beta;             /* keep j valid index */
      }
      fromto(perm, j, m);       /* perm[j-i:nextj-i] = j:nextj */
      realHadadiv(beta + j, p + j, psqr + j, m - j);
      perm += m;           /* point just behind accepted pivots */
      beta += m;
/* ------------------------------------------------------------
   Sort rejected nodes in decreasing order of p.^2.
   ------------------------------------------------------------ */
      rirsortdec(riwork, nph2);
/* ------------------------------------------------------------
   2nd round factorization: ordered.
   ------------------------------------------------------------ */
      ph2dpr1fact(riwork, d, &t, nph2);
      for(i = 0; i < nph2; i++){
        j = (riwork+i)->i;
        perm[i] = j;
        beta[i] = p[j] / (riwork+i)->r;
      }
      return 1;
    } /* if nph2 > 0 */
  } /* if !dep */
/* ------------------------------------------------------------
   If d(1:mk) is NOT positive:
   Let (j,psqrdep) = max{psqr(i) | d(i)==0.0, i=1:m}
   ------------------------------------------------------------ */
  else{
    psqrdep = 0.0;
    for(i = 0; dep[i] < m; i++)
      if(psqr[dep[i]] > psqrdep){
        j = i;
        psqrdep = psqr[dep[i]];
      }
    mxAssert(i <= ndep, "");
/* ------------------------------------------------------------
   Threshold h = maxu^2 * psqrdep
   If all psqr>h have been factorized, we'll pivot on dep[k], if
   t * psqrdep > 0 (otherwise we view this as being zero).
   ------------------------------------------------------------ */
    if(psqrdep > 0.0){        /* we'll remove dependency at idep=dep[j] */
      idep = dep[j];
/* ------------------------------------------------------------
   If psqrdep>0, we can remove dependency idep=dep[j].
   Let dep[j:ndep-1] = dep[j+1:ndep] (incl tail dep[ndep]), then
   let dep[ndep] = idep, and --ndep. For Lorentz cones, removed
   dependencies may get dependent again at the t=-1 step.
   ------------------------------------------------------------ */
      if(t > 0.0){
        deldep = 1;
        memmove(dep+j, dep+j+1, (ndep - j) * sizeof(int));
        h = SQR(maxu) * psqrdep;
        dep[ndep] = idep;                /* remember removed dependency */
        *pndep = --ndep;
      }
/* ------------------------------------------------------------
   If we're subtracting a rank-1 factor (t<0), then psqrdep should
   be zero (up to rounding errors)
   ------------------------------------------------------------ */
      else{                  /* D - p*p' should be psd, so */
        h = psqrdep;         /* we've to round [0,psqrdep] to 0 */
        deldep = 0;
      }
    }
    else{
      idep = dep[0];           /* psqr(dep) == 0: remains dependent */
      h = 0.0;
      deldep = 0;
    }
/* ------------------------------------------------------------
   PARTITION: perm = [find(psqr > h), idep, remainder].
   Then let n be j = length(find(psqr > h)).
   Temporarily use nph2 = m-length(remainder).
   ------------------------------------------------------------ */
    for(i = 0, j = 0, nph2 = m; i < idep; i++)
      if(psqr[i] > h)
        perm[j++] = i;
      else
        perm[--nph2] = i;
    for(++i; i < m; i++)               /* skip over i = idep */
      if(psqr[i] > h)
        perm[j++] = i;
      else
        perm[--nph2] = i;
    mxAssert(j == nph2-1,"");
    perm[j] = idep;                     /* finally insert idep */
    n = j;                       /* length(find(psqr > h)) */
    *pn = j + deldep;            /* cardinality of beta */
/* ------------------------------------------------------------
   Now h=max(psqr(perm(n+1:m))).
   Let mu(i) = max(psqr(perm(i+1:m))).
   ------------------------------------------------------------ */
    for(i = n - 1; i >= 0; i--){
      mu[i] = h;
      h = MAX(h, psqr[perm[i]]);
    }
/* ------------------------------------------------------------
   1st round: pivot sequentially on perm(1:n), skipping instable ones.
   The stable pivots are re-alligned at start of perm.
   ------------------------------------------------------------ */
    nph2 = dpr1factperm(psqr, d, riwork, &t, perm, n, mu, maxu);
/* ------------------------------------------------------------
   Write results 1st round: beta = p(perm(1:n-nph2)) ./ psqr(perm(1:n-nph2)).
   ------------------------------------------------------------ */
    n -= nph2;          /* cardinality 1st round */
    for(i = 0; i < n; i++){
      j = perm[i];
      beta[i] = p[j] / psqr[j];
    }
    perm += n;         /* handled 1st round */
    beta += n;
/* ------------------------------------------------------------
   Sort rejected nodes in decreasing order of p.^2.
   ------------------------------------------------------------ */
    if(nph2){
      rirsortdec(riwork, nph2);
/* ------------------------------------------------------------
   2nd round factorization: ordered.
   ------------------------------------------------------------ */
      ph2dpr1fact(riwork, d, &t, nph2);
      for(i = 0; i < nph2; i++){
        j = (riwork+i)->i;
        perm[i] = j;
        beta[i] = p[j] / (riwork+i)->r;
      }
    }
/* ------------------------------------------------------------
   If psqrdep > 0, we can now finish off the factorization by
   pivoting on idep == perm[nph2]:
   d_new(i) = p_i^2/t, beta = 1/p_i.
   ------------------------------------------------------------ */
    if(deldep){
      d[idep] = psqr[idep] / t;
      beta[nph2] = 1.0 / p[idep];
    }
  }
  return 1;
}

/* ************************************************************
   PROCEDURE findnewdep
   INPUT
     ndep    - Number of dependent nodes, d[dep[0:ndep-1]] == 0.
     maxndep - dep is length maxndep+1. dep[ndep+1:maxndep] are previously
          removed dependencies.
     d       - length m vector, m = dep[ndep].
   UPDATED
     dep - length maxndep+1 array. If d[dep[i]] <= 0 for some i > ndep,
       then dep[i] is inserted into dep(0:ndep), so that dep(0:ndep+1) remains
       sorted.
   RETURNS 1 if ndep has to be incremented, i.e. an entry of
     dep(ndep+1:maxndep) is inserted into dep(0:ndep). Otherwise returns 0.
   ************************************************************ */
int findnewdep(int *dep, const int ndep, const int maxndep, const double *d)
{
  int i, j, idep;

  for(i = ndep + 1; i <= maxndep; i++)
    if(d[dep[i]] <= 0.0)
      break;
  if(i <= maxndep){
    idep = dep[i];
    for(j = 0; dep[j] < idep; j++);
    memmove(dep+j+1, dep+j, (i - j) * sizeof(int));
    dep[j] = idep;
    return 1;
  }
  else
    return 0;
}

/* ============================================================
   PRODFORMFACT does a dpr1fact for each rank-1 update.
   ============================================================ */

/* ************************************************************
   PROCEDURE prodformfact
   INPUT
     xsuper - column k consists of rows 0:xsuper(k+1)-1.
     n      - number of (dense) columns
     nadd   - number of rank-1 adds (D+pk*pk'). The remaining p-columns
        are subtracted (D-pk*pk').
     maxu   - max_k(max abs(Lk)) will be at most maxu. Rows may be
      reordered to achieve this.
   UPDATED
     p  - Length(p) = sum(xsuper). On input, contains the dense columns
       as in X = diag(d) + P*P'. On output, a product-form forward solve
       has been made to p(:,2:n).
     d  - length m nonnegative vector. On input, the diagonal w/o dense
       columns. On output, the diagonal in the final product form Cholesky.
     dep    - Length ndep+1 list of entries where d(i)=0; dep(0) < dep(1)...;
        dep[ndep] = xsuper[n], the tail.
     pndep  - length of dep, may be decreased on output, if dependencies
       are removed by adding the rank-1 updates..
   OUTPUT
     permPr - length(p), contains a stable pivot ordering, for those
       columns where ordered[j]=1.
     beta   - Length length(p). Such that L_k = eye(m) + tril(pk * betak, -1).
     betajc - Length n+1. start of betak. nnz(beta) <= nnz(p).
     ordered - length n. Ordered[j]==1 iff the rows of column j are
       reordered for numerical stability (controled by maxu).
   WORK
     perm   - length m=xsuper[n] integer working array.
     fwork  - length m=xsuper[n] float working array.
     riwork - length m=xsuper[n] (i,r)-working array.
   ************************************************************ */
void prodformfact(double *p, double *permPr, double *beta, int *betajc,
                  double *d, char *ordered, const int *xsuper,
                  const int nadd, const int n, int *dep, int *pndep,
                  const double maxu, int *perm, double *fwork, iandr *riwork)
{
  int k, mk, nk, j, inz, maxndep;
  double *betak, *pk, *pj;
  double t;
  char useperm;
/* ------------------------------------------------------------
   Initialize. inz points to next avl. place in beta,
   perm is used to store pivot ordering,
   t = 1 means adding rank-1. We switch to t=-1 for k >= nadd.
   ------------------------------------------------------------ */
  inz = 0;
  t = 1.0;
  maxndep = *pndep;
/* ------------------------------------------------------------
   For all columns k, mk = length(pk), nk = length(betak).
   ------------------------------------------------------------ */
  for(k = 0, pk = p; k < n; k++){
    betajc[k] = inz;
    mk = xsuper[k+1];
    betak = beta + inz;
    pk += xsuper[k];
    if(k == nadd)
      t = -1.0;
    useperm = dodpr1fact(betak, perm, d, t, pk, mk, &nk, dep, pndep, maxu,
                         fwork, riwork);
    ordered[k] = useperm;
    if(t < 0.0)
      *pndep += findnewdep(dep,*pndep,maxndep,d);
/* ------------------------------------------------------------
   Forward solve on columns p(k+1:n)
   ------------------------------------------------------------ */
    if(useperm){
      for(j = k+1, pj = pk; j < n; j++){               /* with pivoting */
        pj += xsuper[j];
        fwipr1o(pj, perm, pk, betak, mk, nk);
      }
      for(j = 0; j < mk; j++)
        permPr[j] = perm[j];                           /* C-style perm */
      permPr += mk;
    }
    else
      for(j = k+1, pj = pk; j < n; j++){               /* without pivoting */
        pj += xsuper[j];
        fwipr1(pj, pk, betak, mk, nk);
      }
/* ------------------------------------------------------------
   Point to next column
   ------------------------------------------------------------ */
    inz += nk;
  }
/* ------------------------------------------------------------
   In total, we wrote inz <= length(p) nonzeros in beta.
   ------------------------------------------------------------ */
  betajc[n] = inz;
#ifdef DO_SUPER_SAFE
/* ------------------------------------------------------------
   If t < 0, then let dep = find(d<=0), and d(dep) = 0.
   Note: length(d) = m = xsuper[n].
   ------------------------------------------------------------ */
  if(t < 0){
    mk = xsuper[n];
    inz = 0;
    for(j = 0; j < mk; j++)
      if(d[j] <= 0.0){
        d[j] = 0.0;
        dep[inz++] = j;
        if(inz > maxndep)
          mexErrMsgTxt("Fatal numerical error in dpr1fact.");
      }
    *pndep = inz;
  }
#endif
}

/* ============================================================
   MAIN: MEXFUNCTION
   ============================================================ */
/* ************************************************************
   PROCEDURE mexFunction - Entry for Matlab
   ************************************************************ */
void mexFunction(const int nlhs, mxArray *plhs[],
  const int nrhs, const mxArray *prhs[])
{
  mxArray *myplhs[NPAROUT];
  int m,n,ndep,i,j,nadd, permj, pnnz, permnnz, m1, maxnnz;
  char *ordered;
  int *xsuper,*dep, *colperm, *invrowperm, *betajc, *iwork;
  double *beta, *d, *depPr,*betajcPr, *pj, *orderedPr, *fwork,
    *p, *permPr;
  const double *lab, *rowpermPr, *colpermPr, *xsuperPr;
  iandr *riwork;
  double maxu;
  jcir x;
/* ------------------------------------------------------------
   Check for proper number of arguments
   ------------------------------------------------------------ */
  if(nrhs < NPARIN)
    mexErrMsgTxt("dpr1fact requires more input arguments");
  if(nlhs > NPAROUT)
    mexErrMsgTxt("dpr1fact produces less output arguments");
/* ------------------------------------------------------------
   Get inputs (x, lab=d, rowperm,colperm,xsuper, dep, maxu)
   ------------------------------------------------------------ */
  m = mxGetM(X_IN);                              /* x */
  n = mxGetN(X_IN);
  if(!mxIsSparse(X_IN))
    mexErrMsgTxt("x should be sparse.");
  x.jc = mxGetJc(X_IN);
  x.ir = mxGetIr(X_IN);
  x.pr = mxGetPr(X_IN);
  if( mxGetM(D_IN) * mxGetN(D_IN) != m)               /* d */
    mexErrMsgTxt("Size mismatch d.");
  lab = mxGetPr(D_IN);
  if( mxGetM(ROWPERM_IN) * mxGetN(ROWPERM_IN) != m)   /* rowperm */
    mexErrMsgTxt("Size mismatch rowperm.");
  rowpermPr = mxGetPr(ROWPERM_IN);
  if( mxGetM(COLPERM_IN) * mxGetN(COLPERM_IN) != n)   /* colperm */
    mexErrMsgTxt("Size mismatch colperm.");
  colpermPr = mxGetPr(COLPERM_IN);
  nadd = mxGetM(XSUPER_IN) * mxGetN(XSUPER_IN) - 1;   /* xsuper */
  if(nadd > n || nadd < 0)
    mexErrMsgTxt("Size mismatch xsuper.");
  xsuperPr = mxGetPr(XSUPER_IN);
  depPr = mxGetPr(DEPROWS_IN);                        /* dep */
  ndep = mxGetM(DEPROWS_IN) * mxGetN(DEPROWS_IN);
  maxu = mxGetScalar(MAXU_IN);
/* ------------------------------------------------------------
   Allocate working arrays:
   dep(ndep+1), xsuper(n+1), betajc(n+1), ordered(n)
   iwork(m+n), fwork(m), riwork(m).
   ------------------------------------------------------------ */
  m1 = MAX(m,1);                                     /* avoid alloc to 0 */
  dep     = (int *) mxCalloc(ndep+1, sizeof(int));
  xsuper  = (int *) mxCalloc(n+1, sizeof(int));
  betajc  = (int *) mxCalloc(n+1, sizeof(int));
  iwork   = (int *) mxCalloc(m1 + n, sizeof(int)); 
  ordered = (char *) mxCalloc(MAX(n,1), sizeof(char));       /* boolean */
  fwork   = (double *) mxCalloc(m1, sizeof(double));   /* float */
  riwork  = (iandr *) mxCalloc(m1, sizeof(iandr));     /* (i,r) */
/* ------------------------------------------------------------
   Convert xsuper to integer. Append xsuper[nadd] up to entry n.
   ------------------------------------------------------------ */
  for(i = 0; i <= nadd; i++){
    j = xsuperPr[i];
    xsuper[i] = --j;
  }
  while(i <= n)
    xsuper[i++] = j;   /*  The phase-2 cols are all length xsuper[nadd]. */
  mxAssert(xsuper[0] == 0, "");
/* ------------------------------------------------------------
   Let pnnz = sum(xsuper).
   ------------------------------------------------------------ */
  for(i = 1, pnnz = 0; i <= n; i++)
    pnnz += xsuper[i];
/* ------------------------------------------------------------
   Allocate outputs p(pnnz+m),  d(m),  beta(pnnz),
   ordered(n), perm(pnnz).
   NB: The +m for p is temporary. This will avoid memory problems when
   initializing p(invperm,:) = x, if xsuper and/or rowperm are invalid.
   ------------------------------------------------------------ */
  P_OUT = mxCreateDoubleMatrix(pnnz + m,1,mxREAL);              /* p */
  p = mxGetPr(P_OUT);
  D_OUT = mxCreateDoubleMatrix(m, 1, mxREAL);                /* d */
  d = mxGetPr(D_OUT);
  BETA_OUT = mxCreateDoubleMatrix(pnnz, 1, mxREAL);       /* beta */
  beta = mxGetPr(BETA_OUT);
  ORDERED_OUT = mxCreateDoubleMatrix(n, 1, mxREAL);       /* ordered */
  orderedPr = mxGetPr(ORDERED_OUT);
  PERM_OUT =  mxCreateDoubleMatrix(pnnz, 1, mxREAL);       /* perm */
  permPr = mxGetPr(PERM_OUT);
/* ------------------------------------------------------------
   Partition iwork = [colperm(n), invrowperm(m)].
   ------------------------------------------------------------ */
  colperm = iwork;
  invrowperm = iwork + n;
/* ------------------------------------------------------------
   Convert colperm to integer
   ------------------------------------------------------------ */
  for(i = 0; i < n; i++){                 /* colperm(0:n-1) */
    j = colpermPr[i];
    colperm[i] = --j;
  }
/* ------------------------------------------------------------
   Let invrowperm(rowperm) = 0:m-1
   ------------------------------------------------------------ */
  for(i = 0; i < m; i++){
    j = rowpermPr[i];
    invrowperm[--j] = i;
  }
/* ------------------------------------------------------------
   Let d(invrowperm) = lab,  dep = [sort(invrowperm(depPr-1.0)), m]
   ------------------------------------------------------------ */
  for(i = 0; i < m; i++)
    d[invrowperm[i]] = lab[i];
  for(i = 0; i < ndep; i++){              /* dep(0:ndep-1) */
    j = depPr[i];
    dep[i] = invrowperm[--j];
  }
  isortinc(dep,ndep);
  dep[ndep] = m;           /* tail of dep */
/* ------------------------------------------------------------
   Let p(invrowperm,:) = x(:,colperm)
   ------------------------------------------------------------ */
  for(j = 0, pj = p; j < n; j++){
    pj += xsuper[j];
    permj = colperm[j];
    for(i = x.jc[permj]; i < x.jc[permj+1]; i++)
      pj[invrowperm[x.ir[i]]] = x.pr[i];
  }
/* ------------------------------------------------------------
   Realloc (shrink) p to the size it should have, i.e. pnnz
   ------------------------------------------------------------ */
  pnnz = MAX(1,pnnz);
  if((p = (double *) mxRealloc(p, pnnz * sizeof(double))) == NULL)
    mexErrMsgTxt("Memory allocation error");
  mxSetPr(P_OUT, p);
  mxSetM(P_OUT,pnnz);
/* ------------------------------------------------------------
   The actual job is done here:
   Adding nadd rank-1 updates, subtracting n-nadd rank-1 updates.
   ------------------------------------------------------------ */
  prodformfact(p, permPr, beta, betajc, d, ordered, xsuper, nadd,
               n, dep, &ndep, maxu, iwork, fwork, riwork);
/* ------------------------------------------------------------
   Realloc (shrink) beta to length betajc[n].
   ------------------------------------------------------------ */
  maxnnz = MAX(betajc[n],1);
  if((beta = (double *) mxRealloc(beta, maxnnz * sizeof(double))) == NULL)
    mexErrMsgTxt("Memory allocation error");
  mxSetPr(BETA_OUT, beta);
  mxSetM(BETA_OUT,maxnnz);
/* ------------------------------------------------------------
   Realloc (shrink) permPr to length sum{xsuper[j] | ordered[j]==1}
   ------------------------------------------------------------ */
  for(i = 0, permnnz = 0; i < n; i++)
    permnnz += ordered[i] * xsuper[i+1];
  mxAssert(permnnz <= pnnz, "");
  permnnz = MAX(1, permnnz);         /* avoid realloc to zero */
  if((permPr = (double *) mxRealloc(permPr, permnnz * sizeof(double))) == NULL)
    mexErrMsgTxt("Memory allocation error");
  mxSetPr(PERM_OUT, permPr);
  mxSetM(PERM_OUT,permnnz);
/* ------------------------------------------------------------
   Create DEP_OUT = dep(1:ndep), in Fortran floats.
   ------------------------------------------------------------ */
  DEP_OUT = mxCreateDoubleMatrix(ndep, 1, mxREAL);
  depPr = mxGetPr(DEP_OUT);
  for(i = 0; i < ndep; i++){
    j = dep[i];
    depPr[i] = ++j;
  }
/* ------------------------------------------------------------
   Create BETAJC_OUT(n+1)
   ------------------------------------------------------------ */
  BETAJC_OUT = mxCreateDoubleMatrix(n + 1, 1, mxREAL);
  betajcPr = mxGetPr(BETAJC_OUT);
  for(i = 0; i <= n; i++){
    j = betajc[i];
    betajcPr[i] = ++j;
  }
/* ------------------------------------------------------------
   Ordered to floats
   ------------------------------------------------------------ */
  for(i = 0; i < n; i++)
    orderedPr[i] = ordered[i];
/* ------------------------------------------------------------
   Release working arrays
   ------------------------------------------------------------ */
  mxFree(riwork);
  mxFree(fwork);
  mxFree(ordered);
  mxFree(iwork);
  mxFree(betajc);
  mxFree(xsuper);
  mxFree(dep);
/* ------------------------------------------------------------
   Copy requested output parameters (at least 1), release others.
   ------------------------------------------------------------ */
  i = MAX(nlhs, 1);
  memcpy(plhs,myplhs, i * sizeof(mxArray *));
  for(; i < NPAROUT; i++)
    mxDestroyArray(myplhs[i]);
}
