diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..851d8878
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+
+stenglib/Fast/fsetop.mexw64
+stenglib/Fast/fsparse.mexw64
+stenglib/Tensor/tprod.mexw64
+stenglib/Tensor/tsum.mexw64
diff --git a/startup.m b/startup.m
index 0f64f255..8c3b0716 100644
--- a/startup.m
+++ b/startup.m
@@ -8,4 +8,9 @@
 
 % path to urdme/ folders
 addpath(genpath([link 'urdme/']));
-addpath(genpath([link 'workflows/']));
\ No newline at end of file
+addpath(genpath([link 'workflows/']));
+
+% Johannes Dufva 2020-10-30
+% path to other functions used
+run('stenglib/make_all.m')
+run('stenglib/startup.m')
\ No newline at end of file
diff --git a/stenglib/Fast/clenshaw.m b/stenglib/Fast/clenshaw.m
new file mode 100644
index 00000000..6aaafbac
--- /dev/null
+++ b/stenglib/Fast/clenshaw.m
@@ -0,0 +1,43 @@
+function Y = clenshaw(A,B,C)
+%CLENSHAW Evaluation of 3-term recurrences.
+%   Y = CLENSHAW(A,B) evaluates the 3-term recurrence as defined by the
+%   matrices A and B.
+%
+%   Let [M,N] be MAX(SIZE(A),SIZE(B)). Then the output Y is M-by-N and, for
+%   1 <= j <= N, the recurrence is defined by Y(1,j) = A(1,j)+B(1,j), Y(2,j)
+%   = A(2,j)*Y(1,j)+B(2,j) and, for i > 2, Y(i,j) =
+%   A(i,j)Y(i-1,j)+B(i,j)Y(i-2,j).
+%
+%   The dimensions of the matrices A and B must either match or be
+%   singletons. Singleton dimensions are as usual silently repeated.
+%
+%   Y = CLENSHAW(A,B,C) evaluates the 3-term recurrence defined by A and B,
+%   multiplies it by the coefficients C and computes the sum of the result
+%   by means of Clenshaws summation formula. C is M-by-K-by-N and the result
+%   is K-by-N. The first dimension of C must be M while the second and third
+%   dimensions of C can be singletons.
+%
+%   Examples:
+%     % Fibonacci numbers
+%     n = 10; clenshaw(1,[0; 0; ones(n-2,1)])'
+%
+%     % Legendre polynomials
+%     x = linspace(-1,1); n = 4; j = [0.5 1:n]';
+%     A = (2*j-1)./j*x;
+%     B = (1-j)./j;
+%     y = clenshaw(A,B);
+%     figure, plot(x,y);
+%
+%     % two functions defined by Chebyshev coefficients
+%     x = linspace(-1,1,50);
+%     n = 4; j = [0; 1; 2*ones(n-1,1)];
+%     A = j*x; B = [1; 0; -ones(n-1,1)];
+%     C = [[1 0.75 0.5 0.25 0.125]' [1 -1.75 1.5 -1.25 2.125]'];
+%     figure, plot(x,clenshaw(A,B,C));
+%
+%   See also FILTER.
+
+% S. Engblom 2007-04-19 (Major revision)
+% S. Engblom 2005-07-28
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Fast/clenshaw.mexw64 b/stenglib/Fast/clenshaw.mexw64
new file mode 100644
index 00000000..6a81209f
Binary files /dev/null and b/stenglib/Fast/clenshaw.mexw64 differ
diff --git a/stenglib/Fast/frepmat.m b/stenglib/Fast/frepmat.m
new file mode 100644
index 00000000..78c01692
--- /dev/null
+++ b/stenglib/Fast/frepmat.m
@@ -0,0 +1,61 @@
+function B = frepmat(A,rep)
+%FREPMAT Fast replication of array.
+%   FREPMAT does the same job as the MATLAB-function REPMAT but runs
+%   faster since the main part of the code is written in C.
+%
+%   B = FREPMAT(A,[M N]) creates a matrix B consisting of M-by-N
+%   copies of A.
+%
+%   In general, B = FREPMAT(A,REP) repeats A in the first dimension
+%   REP(1) times, in the second dimension REP(2) times and so on for
+%   all of the sizes in REP.
+%
+%   Unlike REPMAT, missing sizes in REP are consistently defined to be
+%   1. This means that if m is a scalar, then FREPMAT(A,m) does not
+%   produce the same result as REPMAT(A,m) (which is defined to be
+%   REPMAT(A,[m m])). Instead, FREPMAT(A,m) produces the same result
+%   as would REPMAT(A,[m 1]). Also, the traditional syntax
+%   FREPMAT(A,M,N,...) is not supported.
+%
+%   Examples:
+%     a = frepmat(1,1000);            % same as ones(1000,1)
+%     b = frepmat(int8(1),[100 100]); % same as ones(100,100,'int8')
+%
+%     s = sprand(10,10,0.5);
+%     a1 = frepmat(s,[100 10]); [nnz(a1) nzmax(a1)]
+%     a2 = repmat(s,[100 10]);  [nnz(a2) nzmax(a2)] % overallocation
+%
+%   See also REPMAT.
+
+%   - For doubles (real or complex), FREPMAT is about 15-20% faster
+%   than REPMAT.
+%
+%   - For other numerical types (single precision floats, integers,
+%   characters and logicals), FREPMAT is about 50% faster than REPMAT.
+%
+%   - For sparse matrices (real, complex or logical), FREPMAT is about
+%   85-95% faster than REPMAT and allocates much less memory.
+%
+%   - For other data-types, such as cell-, structure- and
+%   function-arrays, FREPMAT is about 10% faster than REPMAT.
+
+% S. Engblom 2004-10-28
+
+if isnumeric(A) || ischar(A) || islogical(A)
+  % runs fast
+  B = mexfrepmat(A,rep);
+else
+  % general code using the traditional 'indexing' style
+  rep = rep(:);
+  lenrep = size(rep,1);
+  len = max(lenrep,tndims(A));
+  sizA = int32(tsize(A,1:len));
+  rep = [rep; ones(len-lenrep,1)];
+
+  % while building the indices the speed of mexfrepmat is exploited
+  ix = cell(1,len);
+  for i = 1:len
+    ix{i} = mexfrepmat([1:sizA(i)]',rep(i));
+  end
+  B = A(ix{:});
+end
diff --git a/stenglib/Fast/fsetop.m b/stenglib/Fast/fsetop.m
new file mode 100644
index 00000000..c3d61232
--- /dev/null
+++ b/stenglib/Fast/fsetop.m
@@ -0,0 +1,102 @@
+function [c,ia,ib] = fsetop(op,a,b)
+%FSETOP Fast set operations based on hashing.
+%   FSETOP(OP,A,B) performs the set operation OP on the columns of the
+%   arrays A and B. The main differences from the set operations
+%   implemented in MATLAB are:
+%     - FSETOP runs faster because it is based on hashing instead of on
+%     sorting.
+%
+%     - In FSETOP, the order of the elements in the input arrays are
+%     retained in the output array, in contrast to the corresponding
+%     MATLAB functions where the result is sorted. This means that all
+%     indices for new elements are strictly increasing (except for IB
+%     in the INTERSECT operation and for JA and JB in the UNION
+%     operation). Also, in case of duplications, the first element
+%     (lowest column number) is selected.
+%
+%     - FSETOP always uses the columns of its inputs as elements, like the
+%     'rows' switch in the MATLAB commands, but for columns instead of
+%     rows. In particular, column vectors are not special cases, they are
+%     treated as single elements.
+%
+%     - FSETOP supports the following data types: double, single, char,
+%     logicals, int8, uint8, int16, uint16, int32, uint32, int64, uint64
+%     and cell-arrays containing any of the above data types.
+%
+%   Cell-arrays are treated as vectors and all output indices refer to a
+%   linear ordering. The output C is always a row cell-vector and cells
+%   containing the same data but of different types or shapes are
+%   considered unequal.
+%
+%   C = FSETOP('check',A) computes hash-values for each column in A. The
+%   output C is a row-vector containing uint32-type integers. This is not
+%   a set-operation but is provided as a useful tool for computing
+%   check-sums for large data-sets in a uniform way.
+%
+%   [C,IA,IB] = FSETOP('intersect',A,B) returns the columns common to
+%   both A and B. C = A(:,IA) = B(:,IB).
+%
+%   [C,IA] = FSETOP('setdiff',A,B) returns the columns in A that are
+%   not in B. C = A(:,IA).
+%
+%   [C,IA,IB] = FSETOP('setxor',A,B) returns the columns that are not
+%   in the intersection of A and B. C = [A(:,IA) B(:,IB)].
+%
+%   [C,IA,IB,JA,JB] = FSETOP('union',A,B) returns the combined columns
+%   from A and B but with no repetitions. C = [A(:,IA) B(:,IB)] and A
+%   = C(:,JA), B = C(:,JB). Note that the outputs JA and JB are not
+%   produced by the MATLAB-function UNION.
+%
+%   [B,IA,IB] = FSETOP('unique',A) returns the same columns as in A
+%   but with no repetitions. B = A(:,IA) and A = B(:,IB).
+%
+%   [IA,IB] = FSETOP('ismember',A,B) returns a logical vector IA
+%   containing 1 where the columns of A are also columns of B and 0
+%   otherwise. IB contains the index in B of each column in A and zero if
+%   no such index exists. A(:,IA) = B(:,IB(IA)).
+%
+%   Note: two elements are considered equal if and only if their
+%   bitwise representations are identical. This can sometimes give
+%   unexpected results. For example, with doubles, -0.0 ~= 0.0 and NaN
+%   == NaN.
+%
+%   Examples:
+%     % intersection of integers
+%     a = int8(ceil(3*rand(2,10))), b = int8(ceil(3*rand(2,10)))
+%     aandb = fsetop('intersect',a,b)
+%
+%     % unique strings in cell-array
+%     strs = {'foobar' 'foo' 'foobar' 'bar' 'barfoo'}
+%     strsunq = fsetop('unique',strs)
+%
+%     % find indices ix in b = a(ix)
+%     a = randperm(6), b = ceil(6*rand(1,6))
+%     [foo,ix] = fsetop('ismember',b,a)
+%
+%     % remove indices from set of indices
+%     a = {[1 2 4] [2 3 5 6] [4 2 1] [2 3 5 6]' int32([1 2 4])};
+%     b = {[4 2 1]};
+%     c = fsetop('setdiff',a,b)
+%
+%     % union of structs
+%     a = struct('foo',1,'bar',NaN,'foobar','hello')
+%     b = struct('faa',2,'bar',Inf,'foobar','goodbye')
+%     [cf,ia,ib] = fsetop('union',fieldnames(a),fieldnames(b));
+%     ac = struct2cell(a); bc = struct2cell(b);
+%     c = cell2struct([ac(ia); bc(ib)],cf)
+%
+%     % a single 32-bit checksum from strings
+%     s = {'check','intersect','setdiff','setxor', ...
+%          'union','unique','ismember'};
+%     c = fsetop('check',fsetop('check',s)')
+%
+%   See also INTERSECT, SETDIFF, SETXOR, UNION, UNIQUE, ISMEMBER.
+
+% S. Engblom 2010-02-09 (Revision)
+% S. Engblom 2006-06-13 (Major revision)
+% S. Engblom 2005-06-21
+% Based on a concept by P-O Persson, COMSOL AB. The internal hash
+% function used is based on a code by P. Hsieh, see
+% http://www.azillionmonkeys.com/qed/hash.html.
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Fast/fsetop.mexw64 b/stenglib/Fast/fsetop.mexw64
new file mode 100644
index 00000000..db152a9a
Binary files /dev/null and b/stenglib/Fast/fsetop.mexw64 differ
diff --git a/stenglib/Fast/fsparse.m b/stenglib/Fast/fsparse.m
new file mode 100644
index 00000000..2c01914b
--- /dev/null
+++ b/stenglib/Fast/fsparse.m
@@ -0,0 +1,145 @@
+function S = fsparse(ii,jj,ss,siz,flag,nthreads)
+%FSPARSE Fast assembly of sparse matrix.
+%   FSPARSE does the same job as the MATLAB-function SPARSE but runs faster
+%   since the assembly is based on indirect addressing and on hashing rather
+%   than on sorting. FSPARSE also in general allocates the sparse matrix
+%   exactly, that is, the resulting sparse matrix S satisfies NNZ(S) =
+%   NZMAX(S). In addition, FSPARSE supports an efficient and powerful
+%   'assembly' syntax which makes it easy to create sparse banded matrices
+%   or sparse matrices arising in finite difference computations.
+%
+%   S = FSPARSE(X), where X is a full or sparse matrix, constructs a sparse
+%   matrix S by squeezing out any zero elements.
+%
+%   S = FSPARSE(II,JJ,SS) creates a sparse matrix S from triplet data
+%   (II,JJ,SS). The inputs are all matrices with either matching or
+%   singleton dimensions according to certain rules (see below). For
+%   instance, II may be 4-by-10, JJ 1-by-10 and SS 4-by-1.
+%
+%   The result is a sparse matrix formally satisfying the relation
+%   S(II(i,j),JJ(i,j)) = SS(i,j), except of course for singleton
+%   dimensions. In the example above for instance, we have the relation
+%   S(II(i,j),JJ(1,j)) = SS(i,1). Repeated indices are as usual summed
+%   together.
+%
+%   If II is IM-by-IN, JJ JM-by-JN and SS SM-by-SN, then it is required that
+%   (1) IN = JN or 1, (2) JM = IM or 1, (3) SM = IM or 1 and (4) SN = JN or
+%   1.
+%
+%   S = FSPARSE(II,JJ,SS,[M N Nzmax]) also specifies the dimensions of the
+%   output. All arguments are optional and when they are omitted M =
+%   MAX(II), N = MAX(JJ) and Nzmax = NNZ(S) are used as defaults.
+%
+%   S = FSPARSE(II,JJ,SS,SIZ,'nosort') produces a sparse matrix S
+%   where the columns are not sorted with respect to row-indices (if
+%   the input is sorted with respect to II, then the output is in
+%   order anyway). This runs faster and the resulting unordered matrix
+%   can be used in many, but not all, situations in
+%   MATLAB. Cautionary: on some platforms an unordered sparse matrix
+%   causes MATLAB to crash when performing certain operations.
+%
+%   One or both of the index-matrices II and JJ may be integers instead of
+%   doubles. This is faster still since no typecast or deep copy is
+%   needed. An 'integer' is the type which corresponds to the C declaration
+%   'int' on the platform; -- on most modern platforms this is int32.
+%
+%   Differences between FSPARSE and SPARSE:
+%     - FSPARSE generally allocates the sparse matrix so that it fits
+%     exactly. However, zero elements resulting from cancellation or from
+%     zero data causes a slight over-allocation. Thus, both FSPARSE([1 1],[1
+%     1],[1 -1]) and FSPARSE(1,1,0) allocates space for one entry.
+%
+%     - The call S = FSPARSE(X), where X is sparse, always makes a deep copy
+%     of the sparse matrix so that NNZ(S) = NZMAX(S).
+%
+%     - Integer indices are not supported by SPARSE.
+%
+%     - According to the specification above, the call FSPARSE(1:n,1,1) is
+%     not allowed. The corrected version of this example is simply
+%     FSPARSE([1:n]',1,1) where the row-index is associated with the first
+%     dimension of the input matrix.
+%
+%     - Logical sparse matrices cannot be created by FSPARSE. Use logical
+%     operations for this purpose.
+%
+%     - SPARSE allows the input (II,JJ,SS) itself to be sparse or even
+%     logical sparse. This syntax is not supported by FSPARSE.
+%
+%   Examples:
+%     N = 10;
+%     S1 = fsparse(1:N,1:N,1);    % same as speye(N)
+%     S2 = fsparse([1:N]',1:N,1); % same as sparse(ones(N));
+%
+%     % sparse matrix from triplet
+%     ii = ceil(4*rand(1,N));
+%     jj = ceil(4*rand(1,N));
+%     ss = rand(1,N);
+%     S3 = fsparse(ii,jj,ss); [nnz(S3) nzmax(S3)]
+%     s3 = sparse(ii,jj,ss);  [nnz(s3) nzmax(s3)] % over-allocation
+%
+%     % 3-point stencil
+%     S4 = fsparse([2:N-1]',[1:N-2; 2:N-1; 3:N]',[1 -2 1],[N N]);
+%
+%     % 5-point stencil on an N-by-N grid
+%     N2 = N*N; ix = reshape(1:N2,N,N);
+%     ix([1 N],:) = []; ix(:,[1 N]) = []; ix = ix(:);
+%     S5 = fsparse(ix,[ix-N ix-1 ix ix+1 ix+N],[1 1 -4 1 1],[N2 N2]);
+%
+%     % band-matrix
+%     B = rand(3,N); B(end,1) = 0; B(1,end) = 0;
+%     % same as spdiags(B',[-1 0 1],N,N):
+%     S6 = fsparse([[2:N 1]; [1:N]; [N 1:N-1]],1:N,B);
+%
+%     % circulant matrix
+%     jj = 1+mod(cumsum([0:3; ones(N-1,4)]),N);
+%     S7 = fsparse([1:N]',jj,1:4);
+%
+%   See also SPARSE, NNZ, NZMAX, SPDIAGS, FIND.
+
+
+%   Hidden argument nthreads and OpenMP-support
+%
+%   The full call currently supported is actually
+%     S = FSPARSE(II,JJ,SS,SIZ,[] | 'sort' | 'nosort',nthreads)
+%   with nthreads an integer >= 1. The default value is
+%   omp_get_max_threads(). The value of nthreads is remembered until
+%   the mex-file is cleared from memory:
+%     S1 = fsparse(...,4); % use 4 threads
+%     S2 = fsparse(...);   % continues to use 4 threads
+%     clear all;
+%     S3 = fsparse(...);   % uses omp_get_max_threads()
+%   The threaded version is a beta-release and is only supported for
+%   a subset of the syntaxes covered by FSPARSE. Also, compilation
+%   must be done under OpenMP, see MAKE.
+
+%   Hidden output timing
+%
+%   With #define FSPARSE_TIME an extra output time vector is supported
+%   as follows:
+%     [S,t] = FSPARSE(...)
+%   returns an 1-by-6 vector containing timings of the internal
+%   phases of FSPARSE.
+%     t(1) getix()-function
+%     t(2) Part 1
+%     t(3) Part 2
+%     t(4) Part 3
+%     t(5) Part 4
+%     t(6) sparse_insert()
+%   The suppport for timing is a beta-release and is only supported
+%   for a subset of the syntaxes covered by FSPARSE. See MAKE.
+
+%   Early performance observations for the serial version
+%
+%   In general, when the input indices and values are of the same
+%   sizes, FSPARSE is about 30-70% faster than SPARSE depending on the
+%   number of colliding indices, the usage of integer indices and the
+%   'nosort' option. When converting from full to sparse storage
+%   format, FSPARSE is about 50% faster than MATLAB. Finally, for the
+%   'assembly' syntax, FSPARSE is about 50-60% faster than any
+%   equivalent construction in MATLAB.
+
+% S. Engblom 2013-12-03 (Revision, OpenMP added, hidden argument nthreads).
+% S. Engblom 2010-01-13 (Minor revision, caution with 'nosort' added)
+% S. Engblom 2004-11-12
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Fast/fsparse.mexw64 b/stenglib/Fast/fsparse.mexw64
new file mode 100644
index 00000000..da844630
Binary files /dev/null and b/stenglib/Fast/fsparse.mexw64 differ
diff --git a/stenglib/Fast/mexfrepmat.mexw64 b/stenglib/Fast/mexfrepmat.mexw64
new file mode 100644
index 00000000..b7a7de27
Binary files /dev/null and b/stenglib/Fast/mexfrepmat.mexw64 differ
diff --git a/stenglib/Fast/powerseries.m b/stenglib/Fast/powerseries.m
new file mode 100644
index 00000000..7ca9702b
--- /dev/null
+++ b/stenglib/Fast/powerseries.m
@@ -0,0 +1,25 @@
+y = powerseries(c,x,tol)
+%POWERSERIES Sum power series.
+%   Y = POWERSERIES(C,X,tol) evaluates the power series with coefficients C
+%   at points X where both C and X are vectors. The output Y is the same
+%   size as X and contains Y(i) = sum_(j >= 1) C(j)*X(i)^(j-1). The sum is
+%   truncated when the last contribution is relatively less than tol.
+%
+%   Note: use
+%     warning('off','powerseries:w1');
+%   to turn the warning for inaccurate evaluation off. As an
+%   alternative, you may explicitly set C(end) = 0.
+%
+%   Example:
+%     % the complex exponential function
+%     c = 1./cumprod([1 1:10]);
+%     x = complex(linspace(-2,3),linspace(-4,6));
+%     y = powerseries(c,x,0.5e-7);
+%     figure, plot(real(x),real(y),'b',real(x),real(exp(x)),'r');
+%     legend('Series','Exp()');
+
+% S. Engblom 2007-08-17 (Minor revision)
+% S. Engblom 2007-06-13 (Minor revision)
+% S. Engblom 2007-01-22
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Fast/powerseries.mexw64 b/stenglib/Fast/powerseries.mexw64
new file mode 100644
index 00000000..178e1943
Binary files /dev/null and b/stenglib/Fast/powerseries.mexw64 differ
diff --git a/stenglib/Fast/source/clenshaw.c b/stenglib/Fast/source/clenshaw.c
new file mode 100644
index 00000000..a37c5015
--- /dev/null
+++ b/stenglib/Fast/source/clenshaw.c
@@ -0,0 +1,392 @@
+/* clenshaw.c */
+/* S. Engblom 2007-04-19 (Revision) */
+/* S. Engblom 2005-07-28 */
+
+#include <string.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+/* forward declarations */
+void SizesAndType(const mxArray **A,
+		  int *dimM,int *dimN,
+		  mxComplexity *type);
+
+void recurrence(double *prY,double *piY,int M,int N,
+		const mxArray *A,const mxArray *B);
+void clenshaw(double *prY,double *piY,int M,int N,int KC,int NC,
+	      const mxArray *A,const mxArray *B,
+	      const double *prC,const double *piC);
+
+#define ISDOUBLEMATRIX(A) (mxIsDouble(A) && !mxIsSparse(A) && \
+			   mxGetNumberOfDimensions(A) == 2)
+#define ISDOUBLETENSOR(A) (mxIsDouble(A) && !mxIsSparse(A))
+
+/*------------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  /* syntax */
+  if (nrhs < 2 || 3 < nrhs || nlhs > 1)
+    mexErrMsgIdAndTxt("clenshaw:e1","Expecting 2 or 3 inputs "
+		      "and one output.");
+
+  /* 'recurrence'-syntax */
+  if (nrhs == 2) {
+    /* chech input and allocate output */
+    int dimM,dimN;
+    mxComplexity type;
+    SizesAndType(prhs,&dimM,&dimN,&type);
+
+    /* allocate and evaluate recurrence */
+    plhs[0] = mxCreateDoubleMatrix(dimM,dimN,type);
+    recurrence(mxGetPr(plhs[0]),mxGetPi(plhs[0]),
+	       dimM,dimN,prhs[0],prhs[1]);
+  }
+  /* 'coefficient'-syntax (Clenshaw summation) */
+  else {
+    /* check input and allocate output */
+    int dimM,dimN,KC,NC;
+    mxComplexity type;
+    SizesAndType(prhs,&dimM,&dimN,&type);
+
+    /* coefficient matrix */
+    if (!ISDOUBLETENSOR(prhs[2]))
+      mexErrMsgIdAndTxt("clenshaw:e2",
+			"Expecting a double, non-sparse matrix.");
+    if (mxGetM(prhs[2]) != dimM)
+      mexErrMsgIdAndTxt("clenshaw:e4",
+			"The dimensions of the coefficients must "
+			"match the length of each recursion.");
+
+    if (mxGetNumberOfDimensions(prhs[2]) > 3)
+      mexErrMsgIdAndTxt("clenshaw:e5","Coefficient array can have at "
+			"most three dimensions.");
+    else if (mxGetNumberOfDimensions(prhs[2]) == 3) {
+      KC = mxGetDimensions(prhs[2])[1];
+      NC = mxGetDimensions(prhs[2])[2];
+    }
+    else {
+      KC = mxGetN(prhs[2]);
+      NC = 1;
+    }
+
+    if (dimN == 1)
+      dimN = NC;
+    else if (dimN != NC && NC != 1)
+      mexErrMsgIdAndTxt("clenshaw:e6","The number of recursions "
+			"must agree.");
+
+    if (mxIsComplex(prhs[2])) type = mxCOMPLEX;
+
+    /* allocate and evaluate sum */
+    plhs[0] = mxCreateDoubleMatrix(KC,dimN,type);
+    clenshaw(mxGetPr(plhs[0]),mxGetPi(plhs[0]),
+	     dimM,dimN,KC,NC,
+	     prhs[0],prhs[1],mxGetPr(prhs[2]),mxGetPi(prhs[2]));
+  }
+}
+/*------------------------------------------------------------------------*/
+void SizesAndType(const mxArray **A,
+		  int *dimM,int *dimN,
+		  mxComplexity *type)
+/* Given the arrays A[0] and A[1], this routine determines the
+   dimensions dimM and dimN of the recursions defined by this
+   input. The common type (mxREAL/mxCOMPLEX) is returned in type. */
+{
+  *dimM = *dimN = 1;
+  *type = mxREAL;
+
+  for (int j = 0; j < 2; j++) {
+    const mxArray *Aj = A[j];
+    if (!ISDOUBLEMATRIX(Aj))
+      mexErrMsgIdAndTxt("clenshaw:e2",
+			"Expecting a double, non-sparse matrix.");
+    if (*dimM == 1)
+      *dimM = mxGetM(Aj);
+    else if (*dimM != mxGetM(Aj) && mxGetM(Aj) != 1)
+      mexErrMsgIdAndTxt("clenshaw:e3",
+			"Dimensions must either match or be singletons.");
+    if (*dimN == 1)
+      *dimN = mxGetN(Aj);
+    else if (*dimN != mxGetN(Aj) && mxGetN(Aj) != 1)
+      mexErrMsgIdAndTxt("clenshaw:e3",
+			"Dimensions must either match or be singletons.");
+    if (mxIsComplex(Aj)) *type = mxCOMPLEX;
+  }
+}
+/*------------------------------------------------------------------------*/
+void recurrence(double *prY,double *piY,int M,int N,
+		const mxArray *A,const mxArray *B)
+/* Evaluates the recurrence as defined by the matrices A and B and
+   stores the result in (prY,piY), which must be allocated prior to
+   call. [M,N] = max(size(A),size(B)). All dimensions must either
+   match or be singletons. */
+{
+  const double *prA = mxGetPr(A),*piA = mxGetPi(A);
+  const double *prB = mxGetPr(B),*piB = mxGetPi(B);
+
+  /* increments for columns and rows */
+  const int ia = mxGetM(A) == M;
+  const int ja = (ia ^ mxGetN(A) == N)*((mxGetN(A) == N)-mxGetM(A));
+  const int ib = mxGetM(B) == M;
+  const int jb = (ib ^ mxGetN(B) == N)*((mxGetN(B) == N)-mxGetM(B));
+
+  /* evaluate recurrence */
+  if (piY == NULL)
+    for (int j = 0; j < N; j++) {
+      double r1 = 1.0,r2 = 1.0,r3;
+      for (int i = 0; i < M; i++) {
+	r3 = r2;
+	r2 = r1;
+	r1 = *prA*r2+*prB*r3;
+	*prY++ = r1;
+	prA += ia;
+	prB += ib;
+      }
+      prA += ja;
+      prB += jb;
+    }
+  /* the remaining 3 complex case */
+  else if (piB == NULL)
+    for (int j = 0; j < N; j++) {
+      double r1 = 1.0,r2 = 1.0,r3;
+      double i1 = 0.0,i2 = 0.0,i3;
+      for (int i = 0; i < M; i++) {
+	r3 = r2; i3 = i2;
+	r2 = r1; i2 = i1;
+	r1 = *prA*r2-*piA*i2+*prB*r3;
+	i1 = *prA*i2+*piA*r2+*prB*i3;
+	*prY++ = r1; *piY++ = i1;
+	prA += ia; piA += ia;
+	prB += ib;
+      }
+      prA += ja; piA += ja;
+      prB += jb;
+    }
+  else if (piA == NULL)
+    for (int j = 0; j < N; j++) {
+      double r1 = 1.0,r2 = 1.0,r3;
+      double i1 = 0.0,i2 = 0.0,i3;
+      for (int i = 0; i < M; i++) {
+	r3 = r2; i3 = i2;
+	r2 = r1; i2 = i1;
+	r1 = *prA*r2+*prB*r3-*piB*i3;
+	i1 = *prA*i2+*prB*i3+*piB*r3;
+	*prY++ = r1; *piY++ = i1;
+	prA += ia;
+	prB += ib; piB += ib;
+      }
+      prA += ja;
+      prB += jb; piB += jb;
+    }
+  else
+    for (int j = 0; j < N; j++) {
+      double r1 = 1.0,r2 = 1.0,r3;
+      double i1 = 0.0,i2 = 0.0,i3;
+      for (int i = 0; i < M; i++) {
+	r3 = r2; i3 = i2;
+	r2 = r1; i2 = i1;
+	r1 = *prA*r2-*piA*i2+*prB*r3-*piB*i3;
+	i1 = *prA*i2+*piA*r2+*prB*i3+*piB*r3;
+	*prY++ = r1; *piY++ = i1;
+	prA += ia; piA += ia;
+	prB += ib; piB += ib;
+      }
+      prA += ja; piA += ja;
+      prB += jb; piB += jb;
+    }
+}
+/*------------------------------------------------------------------------*/
+void clenshaw(double *prY,double *piY,int M,int N,int KC,int NC,
+	      const mxArray *A,const mxArray *B,
+	      const double *prC,const double *piC)
+/* Evaluates the recurrence as defined by the matrices A and B,
+   multiplies it by the coefficients C and stores the sum of the
+   result in (prY,piY), which must be allocated prior to call. [M,N] =
+   max(size(A),size(B),size(C,[1 3])) and C is M-by-KC-by-NC (where NC
+   is either 1 or N). The result has the dimensions KC-by-N. All
+   dimensions must either match or be singletons, except for C's first
+   dimension which must be M. */
+{
+  const double *prA = mxGetPr(A),*piA = mxGetPi(A);
+  const double *prB = mxGetPr(B),*piB = mxGetPi(B);
+
+  /* start at the end since Clenshaws algorithm runs backwards */
+  if (M > 2) {
+    prA += mxGetNumberOfElements(A)-1;
+    prB += mxGetNumberOfElements(B)-1;
+    prC += M*KC*NC-1;
+    prY += KC*N-1;
+  }
+
+  /* increments for columns and rows */
+  const int ia = mxGetM(A) == M;
+  const int ja = mxGetM(A)*(mxGetN(A) == N);
+  const int ka = ia-ia*M;
+  const int ib = mxGetM(B) == M;
+  const int jb = mxGetM(B)*(mxGetN(B) == N);
+  const int kb = ib-ib*M;
+  const int jc = -(NC != N)*M*KC;
+
+  /* all real case */
+  if (piY == NULL) {
+    /* two special cases: sum includes boundary values only */
+    if (M == 1)
+      for (int j = 0; j < N; j++) {
+	for (int k = 0; k < KC; k++) {
+	  const double R1 = prA[0]+prB[0];
+	  *prY++ = prC[0]*R1; prC++;
+	}
+	prA += ka+ja;
+	prB += kb+jb;
+	prC += jc;
+      }
+    else if (M == 2)
+      for (int j = 0; j < N; j++) {
+	for (int k = 0; k < KC; k++) {
+	  const double R2 = prA[0]+prB[0];
+	  const double R1 = prA[ia]*R2+prB[ib];
+	  *prY++ = prC[0]*R2+prC[1]*R1; prC += 2;
+	}
+	prA += ia+ka+ja;
+	prB += ib+kb+jb;
+	prC += jc;
+      }
+    /* evaluate using Clenshaw summation */ 
+    else if (M >= 3)
+      for (int j = 0; j < N; j++) {
+	for (int k = 0; k < KC; k++) {
+	  double r1 = *prC,r2,r3;
+	  prC--;
+
+	  /* initial boundary */
+	  r2 = r1;
+	  r1 = *prA*r2+*prC;
+	  prA -= ia;
+	  prC--;
+
+	  /* internal formula */
+	  for (int i = 0; i < M-3; i++) {
+	    r3 = r2;
+	    r2 = r1;
+	    r1 = *prA*r2+*prB*r3+*prC;
+	    prA -= ia;
+	    prB -= ib;
+	    prC--;
+	  }
+
+	  /* final boundary */
+	  r3 = r2;
+	  r2 = r1;
+	  r1 = *prB*r3+*prC;
+
+	  /* initial values of the recurrence */
+	  prA -= ia;
+	  prB -= 2*ib;
+	  const double R2 = prA[0]+prB[0];
+	  const double R1 = prA[ia]*R2+prB[ib];
+
+	  /* accumulated sum */
+	  *prY-- = R2*r1+R1*r2;
+
+	  prA -= ka;
+	  prB -= kb;
+	  prC--;
+	}
+	prA -= ja;
+	prB -= jb;
+	prC -= jc;
+      }
+  }
+  else {
+    /* in order to avoid a messy code the remaining 7 complex cases
+       are handled in the same way as the pure complex case */
+    const int ia2 = piA == NULL ? 0 : ia;
+    const int ja2 = piA == NULL ? 0 : ja;
+    const int ka2 = piA == NULL ? 0 : ka;
+    const int ib2 = piB == NULL ? 0 : ib;
+    const int jb2 = piB == NULL ? 0 : jb;
+    const int kb2 = piB == NULL ? 0 : kb;
+    const int ic2 = piC == NULL ? 0 : 1;
+    const int jc2 = piC == NULL ? 0 : jc;
+
+    if (M > 2) {
+      if (piA != NULL) piA += mxGetNumberOfElements(A)-1;
+      if (piB != NULL) piB += mxGetNumberOfElements(B)-1;
+      if (piC != NULL) piC += M*KC*NC-1;
+      piY += KC*N-1;
+    }
+
+    const double zero = 0.0;
+    if (piA == NULL) piA = &zero;
+    if (piB == NULL) piB = &zero;
+    if (piC == NULL) piC = &zero;
+
+    if (M == 1)
+      for (int j = 0; j < N; j++) {
+	for (int k = 0; k < KC; k++) {
+	  const double R1 = prA[0]+prB[0],I1 = piA[0]+piB[0];
+	  *prY++ = prC[0]*R1-piC[0]*I1;
+	  *piY++ = prC[0]*I1+piC[0]*R1;
+	  prC++; piC += ic2;
+	}
+	prA += ka+ja; piA += ka2+ja2;
+	prB += kb+jb; piB += kb2+jb2;
+	prC += jc; piC += jc2;
+      }
+    else if (M == 2)
+      for (int j = 0; j < N; j++) {
+	for (int k = 0; k < KC; k++) {
+	  const double R2 = prA[0]+prB[0],I2 = piA[0]+piB[0];
+	  const double R1 = prA[ia]*R2-piA[ia2]*I2+prB[ib],
+	    I1 = prA[ia]*I2+piA[ia2]*R2+piB[ib2];
+	  *prY++ = prC[0]*R2-piC[0]*I2+prC[1]*R1-piC[ic2]*I1;
+	  *piY++ = prC[0]*I2+piC[0]*R2+prC[1]*I1+piC[ic2]*R1;
+	  prC += 2; piC += 2*ic2;
+	}
+	prA += ia+ka+ja; piA += ia2+ka2+ja2;
+	prB += ib+kb+jb; piB += ib2+kb2+jb2;
+	prC += jc; piC += jc2;
+      }
+    else if (M >= 3)
+      for (int j = 0; j < N; j++) {
+	for (int k = 0; k < KC; k++) {
+	  double r1 = *prC,r2,r3;
+	  double i1 = *piC,i2,i3;
+	  prC--; piC -= ic2;
+	  r2 = r1; i2 = i1;
+	  r1 = *prA*r2-*piA*i2+*prC;
+	  i1 = *prA*i2+*piA*r2+*piC;
+	  prA -= ia; piA -= ia2;
+	  prC--; piC -= ic2;
+	  for (int i = 0; i < M-3; i++) {
+	    r3 = r2; i3 = i2;
+	    r2 = r1; i2 = i1;
+	    r1 = *prA*r2-*piA*i2+*prB*r3-*piB*i3+*prC;
+	    i1 = *prA*i2+*piA*r2+*prB*i3+*piB*r3+*piC;
+	    prA -= ia; piA -= ia2;
+	    prB -= ib; piB -= ib2;
+	    prC--; piC -= ic2;
+	  }
+	  r3 = r2; i3 = i2;
+	  r2 = r1; i2 = i1;
+	  r1 = *prB*r3-*piB*i3+*prC;
+	  i1 = *prB*i3+*piB*r3+*piC;
+	  prA -= ia; piA -= ia2;
+	  prB -= 2*ib; piB -= 2*ib2;
+	  const double R2 = prA[0]+prB[0],I2 = piA[0]+piB[0];
+	  const double R1 = prA[ia]*R2-piA[ia2]*I2+prB[ib],
+	    I1 = prA[ia]*I2+piA[ia2]*R2+piB[ib2];
+	  *prY-- = R2*r1-I2*i1+R1*r2-I1*i2;
+	  *piY-- = R2*i1+I2*r1+I1*r2+R1*i2;
+	  prA -= ka; piA -= ka2;
+	  prB -= kb; piB -= kb2;
+	  prC--; piC -= ic2;
+	}
+	prA -= ja; piA -= ja2;
+	prB -= jb; piB -= jb2;
+	prC -= jc; piC -= jc2;
+      }
+  }
+}
+/*------------------------------------------------------------------------*/
diff --git a/stenglib/Fast/source/fsetop.c b/stenglib/Fast/source/fsetop.c
new file mode 100644
index 00000000..935ed810
--- /dev/null
+++ b/stenglib/Fast/source/fsetop.c
@@ -0,0 +1,1058 @@
+/* fsetop.c */
+/* S. Engblom 2010-02-09 (Revision, extended union operation) */
+/* S. Engblom 2006-06-13 (Major revision) */
+/* S. Engblom 2005-06-02 */
+
+#include <math.h>
+// temporary fix for CC under Solaris:
+#ifndef NO_STDINT
+#include <stdint.h>
+#endif
+#include <string.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+/*------------------------------------------------------------------------*/
+
+// forward declarations
+typedef enum {CHECK = 8,INTERSECT = 4,SETDIFF = 6,SETXOR = 1,
+	      UNION = 0,UNIQUE = 7,ISMEMBER = 10,UNKNOWN = -1} HASH_OP;
+HASH_OP getop(const char *str);
+
+// hash-table for two arrays A and B containing data
+typedef struct {
+  uint8_t **hash_tb;              // pointers to raw data
+  size_t hashsz;                  // size of table
+  char *hash_id;                  // identification of element
+  uint32_t *hash_ixa,*hash_ixb;   // pointers from data into table
+  size_t na,nb,nx;                // number of A's, B's and X's
+} hashTable;
+
+// hash-table for two arrays containing mxArrays
+typedef struct {
+  mxArray **hash_tb;
+  size_t hashsz;
+  char *hash_id;
+  uint32_t *hash_ixa,*hash_ixb;
+  uint32_t *hash_iixa,*hash_iixb; // pointers back into hash_ixa, hash_ixb
+  size_t na,nb,nx;
+} mxhashTable;
+
+// hash-utility functions
+size_t hashsize(size_t len);
+uint32_t hash(const uint8_t *val,size_t nbytes,uint32_t offset);
+uint32_t hash2nd(const uint8_t *val,size_t nbytes);
+bool mx_IsEqual(const mxArray *A,const mxArray *B);
+
+// Venn-topology routines
+void hashSort(hashTable *T,
+	      const uint8_t *A,size_t Na,
+	      const uint8_t *B,size_t Nb,size_t Mbytes);
+void mxhashSort(mxhashTable *T,
+		const mxArray *A,size_t Na,
+		const mxArray *B,size_t Nb);
+
+// post-processing of hash-tables
+typedef void (*fsetopOut)(hashTable,size_t,size_t,size_t,
+			  int,mxArray **,size_t,mxClassID);
+
+void intersectOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+		  int nlhs,mxArray *plhs[],
+		  size_t,mxClassID);
+void setdiffOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+		int nlhs,mxArray *plhs[],
+		size_t,mxClassID);
+void setxorOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+	       int nlhs,mxArray *plhs[],
+	       size_t,mxClassID);
+void unionOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+	      int nlhs,mxArray *plhs[],
+	      size_t,mxClassID);
+void uniqueOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+	       int nlhs,mxArray *plhs[],
+	       size_t,mxClassID);
+void ismemberOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+		 int nlhs,mxArray *plhs[],
+		 size_t,mxClassID);
+
+// mx-versions
+typedef void (*mxfsetopOut)(mxhashTable,size_t,size_t,
+			    int,mxArray **);
+
+void mxintersectOut(mxhashTable T,size_t Na,size_t Nb,
+		    int nlhs,mxArray *plhs[]);
+void mxsetdiffOut(mxhashTable T,size_t Na,size_t Nb,
+		  int nlhs,mxArray *plhs[]);
+void mxsetxorOut(mxhashTable T,size_t Na,size_t Nb,
+		 int nlhs,mxArray *plhs[]);
+void mxunionOut(mxhashTable T,size_t Na,size_t Nb,
+		int nlhs,mxArray *plhs[]);
+void mxuniqueOut(mxhashTable T,size_t Na,size_t Nb,
+		 int nlhs,mxArray *plhs[]);
+void mxismemberOut(mxhashTable T,size_t Na,size_t Nb,
+		   int nlhs,mxArray *plhs[]);
+
+/*------------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  // fetch first argument op (read at most 9 characters)
+  char strop[10];
+  HASH_OP op; 
+
+  // determine the type of the set-operation
+  if (nrhs < 2)
+    mexErrMsgIdAndTxt("fsetop:e1","Expecting at least two inputs.");
+  if (!mxIsChar(prhs[0]))
+    mexErrMsgIdAndTxt("fsetop:e2","First argument must be a "
+		      "character array.");
+  if (mxGetString(prhs[0],strop,10) != 0 || (op = getop(strop)) == UNKNOWN)
+    mexErrMsgIdAndTxt("fsetop:e3","Unknown set operation.");
+
+  // syntax
+  if (op != CHECK && op != UNIQUE) {
+    if (nrhs != 3)
+      mexErrMsgIdAndTxt("fsetop:e5","Expecting exactly three inputs.");
+  }
+  else if (nrhs != 2)
+    mexErrMsgIdAndTxt("fsetop:e8","Expecting exactly two inputs.");
+  if (op == UNION) {
+    if (nlhs > 5)
+      mexErrMsgIdAndTxt("fsetop:e13","Expecting at most five outputs.");
+  }
+  else if (op != CHECK && op != SETDIFF && op != ISMEMBER) {
+    if (nlhs > 3)
+      mexErrMsgIdAndTxt("fsetop:e9","Expecting at most three outputs.");
+  }
+  else if (op != CHECK) {
+    if (nlhs > 2)
+      mexErrMsgIdAndTxt("fsetop:e10","Expecting at most two outputs.");
+  }
+  else if (nlhs > 1)
+    mexErrMsgIdAndTxt("fsetop:e12","Expecting at most one output."); 
+
+  // two cases
+  const bool isCell = mxIsCell(prhs[1]);
+
+  // case 1: input is numeric array
+  if (!isCell) {
+    uint8_t *A,*B = NULL;
+    size_t Na,Nb = 0,Mbytes;
+
+    // fetch second input A
+    if (!(mxIsNumeric(prhs[1]) || mxIsChar(prhs[1]) || mxIsLogical(prhs[1])) 
+	|| mxIsSparse(prhs[1]) || mxIsComplex(prhs[1]) ||
+	mxGetNumberOfDimensions(prhs[1]) > 2)
+      mexErrMsgIdAndTxt("fsetop:e4","Input argument must be a 2-D real "
+			"and non-sparse array.");
+    A = (uint8_t *)mxGetData(prhs[1]);
+
+    Mbytes = mxGetM(prhs[1])*mxGetElementSize(prhs[1]);
+    Na = mxGetN(prhs[1]);
+
+    // quick return
+    if (op == CHECK) {
+      uint32_t *C;
+      plhs[0] = mxCreateNumericMatrix(1,Na,mxUINT32_CLASS,mxREAL);
+      C = mxGetData(plhs[0]);
+      for (size_t i = 0; i < Na; i++)
+	C[i] = hash(&A[i*Mbytes],Mbytes,0);
+      return;
+    }
+
+    // fetch third input B, if any
+    if (op != UNIQUE) {
+      if (mxIsSparse(prhs[2]) || mxIsComplex(prhs[2]) ||
+	  mxGetNumberOfDimensions(prhs[2]) > 2)
+	mexErrMsgIdAndTxt("fsetop:e4","Input argument must be a 2-D real "
+			  "and non-sparse array.");
+      if (mxGetClassID(prhs[1]) != mxGetClassID(prhs[2]))
+	mexErrMsgIdAndTxt("fsetop:e6","Arguments must be "
+			  "of the same class.");
+      if (mxGetM(prhs[1]) != mxGetM(prhs[2]))
+	mexErrMsgIdAndTxt("fsetop:e7","The number of rows must match.");
+ 
+      B = (uint8_t *)mxGetData(prhs[2]);
+      Nb = mxGetN(prhs[2]);
+    }
+
+    // sort it out!
+    hashTable T;
+    if (op != ISMEMBER)
+      hashSort(&T,A,Na,B,Nb,Mbytes);
+    else
+      hashSort(&T,B,Nb,A,Na,Mbytes); // 2nd input must be the master array
+
+    // produce whatever output is asked for
+    static const fsetopOut ftab[] = {
+      &unionOut,&setxorOut,NULL,NULL,&intersectOut,NULL,
+      &setdiffOut,&uniqueOut,NULL,NULL,&ismemberOut
+    };
+
+    (*ftab[op])(T,Na,Nb,Mbytes,nlhs,plhs,
+		mxGetElementSize(prhs[1]),mxGetClassID(prhs[1]));
+
+    // dealllocate the hash table
+    mxFree(T.hash_ixb);
+    mxFree(T.hash_ixa);
+    mxFree(T.hash_id);
+    mxFree(T.hash_tb);
+  }
+  // case 2: cell-array of arrays containing real and non-sparse data
+  else {
+    mxArray *A,*B = NULL;
+    size_t Na,Nb = 0;
+
+    // fetch second input A
+    A = (mxArray *)prhs[1];
+    Na = mxGetNumberOfElements(A);
+    for (size_t i = 0; i < Na; i++) {
+      const mxArray *Ai = mxGetCell(A,i);
+      if (!(mxIsNumeric(Ai) || mxIsChar(Ai) || mxIsLogical(Ai)) ||
+	  mxIsComplex(Ai) || mxIsSparse(Ai))
+	mexErrMsgIdAndTxt("fsetop:e11","Cell-vector must contain real, "
+			  "non-sparse arrays.");
+    }
+
+    // quick return
+    if (op == CHECK) {
+      uint32_t *C;
+      plhs[0] = mxCreateNumericMatrix(1,Na,mxUINT32_CLASS,mxREAL);
+      C = mxGetData(plhs[0]);
+      for (size_t i = 0; i < Na; i++) {
+	const mxArray *Ai = mxGetCell(A,i);
+	uint32_t hashkey =  hash(mxGetData(Ai),mxGetNumberOfElements(Ai)*
+				 mxGetElementSize(Ai),mxGetClassID(Ai));
+	hashkey = hash((uint8_t *)mxGetDimensions(Ai),
+		       mxGetNumberOfDimensions(Ai)*sizeof(int),hashkey);
+	C[i] = hashkey;
+      }
+      return;
+    }
+
+    // fetch third input B, if any
+    if (op != UNIQUE) {
+      if (!mxIsCell(prhs[2]))
+	mexErrMsgIdAndTxt("fsetop:e6","Arguments must be "
+			  "of the same class.");
+
+      B = (mxArray *)prhs[2];
+      Nb = mxGetNumberOfElements(B);
+      for (size_t i = 0; i < Nb; i++) {
+	const mxArray *Bi = mxGetCell(B,i);
+	if (!(mxIsNumeric(Bi) || mxIsChar(Bi) || mxIsLogical(Bi)) ||
+	    mxIsComplex(Bi) || mxIsSparse(Bi))
+	  mexErrMsgIdAndTxt("fsetop:e11","Cell-vector must contain real, "
+			    "non-sparse arrays.");
+      }
+    }
+
+    // sort it out!
+    mxhashTable T;
+    if (op != ISMEMBER)
+      mxhashSort(&T,A,Na,B,Nb);
+    else
+      mxhashSort(&T,B,Nb,A,Na); // 2nd input must be the master array
+
+    // produce whatever output is asked for
+    static const mxfsetopOut ftab[] = {
+      &mxunionOut,&mxsetxorOut,NULL,NULL,&mxintersectOut,NULL,
+      &mxsetdiffOut,&mxuniqueOut,NULL,NULL,&mxismemberOut
+    };
+
+    (*ftab[op])(T,Na,Nb,nlhs,plhs);
+
+    // deallocate the hash table
+    mxFree(T.hash_iixb);
+    mxFree(T.hash_iixa);
+    mxFree(T.hash_ixb);
+    mxFree(T.hash_ixa);
+    mxFree(T.hash_id);
+    mxFree(T.hash_tb);
+  }
+}
+/*------------------------------------------------------------------------*/
+void intersectOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+		  int nlhs,mxArray *plhs[],
+		  size_t siz,mxClassID id)
+/* [C,IA,IB] = FSETOP('intersect',A,B) returns the columns common to
+   both A and B. C = A(:,IA) = B(:,IB). */
+{
+  uint8_t *C;
+  double *ia = NULL,*ib = NULL;
+
+  // allocate output
+  plhs[0] = mxCreateNumericMatrix(Mbytes/siz,T.nx,id,mxREAL);
+  C = (uint8_t *)mxGetData(plhs[0]);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.nx,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,T.nx,mxREAL);
+      ib = mxGetPr(plhs[2]);
+    }
+  }
+
+  for (size_t i = 0,j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'X') {
+      memcpy(&C[j*Mbytes],T.hash_tb[hashkey],Mbytes);
+      if (ia != NULL) {
+	ia[j] = i+1;
+	if (ib != NULL)
+	  // IB follows the order determined by IA
+	  T.hash_tb[hashkey] = &C[j*Mbytes];
+      }
+
+      // first unique element only
+      T.hash_id[hashkey] = 'Y';
+      j++;
+    }
+  }
+
+  if (ib != NULL)
+    for (size_t i = 0; i < Nb; i++) {
+      const uint32_t hashkey = T.hash_ixb[i];
+      if (T.hash_id[hashkey] == 'Y') {
+	ib[(T.hash_tb[hashkey]-&C[0])/(Mbytes ? Mbytes : 1)] = i+1;
+
+	// again, first unique element only
+	T.hash_id[hashkey] = 'Z';
+      }
+    }
+}
+/*------------------------------------------------------------------------*/
+void mxintersectOut(mxhashTable T,size_t Na,size_t Nb,
+		    int nlhs,mxArray *plhs[])
+{
+  mxArray *C;
+  double *ia = NULL,*ib = NULL;
+
+  C = plhs[0] = mxCreateCellMatrix(1,T.nx);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.nx,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,T.nx,mxREAL);
+      ib = mxGetPr(plhs[2]);
+    }
+  }
+
+  for (size_t i = 0,j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'X') {
+      mxSetCell(C,j,mxDuplicateArray(T.hash_tb[hashkey]));
+      if (ia != NULL) {
+	ia[j] = i+1;
+	if (ib != NULL)
+	  ib[j] = T.hash_iixb[hashkey]+1; // new construction
+      }
+
+      T.hash_id[hashkey] = 'Y';
+      j++;
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void setdiffOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+		int nlhs,mxArray *plhs[],
+		size_t siz,mxClassID id)
+/* [C,IA] = FSETOP('setdiff',A,B) returns the columns in A that are
+   not in B. C = A(:,IA). */
+{
+  uint8_t *C;
+  double *ia = NULL;
+
+  // allocate output
+  plhs[0] = mxCreateNumericMatrix(Mbytes/siz,T.na,id,mxREAL);
+  C = (uint8_t *)mxGetData(plhs[0]);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na,mxREAL);
+    ia = mxGetPr(plhs[1]);
+  }
+
+  for (size_t i = 0,j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'A') {
+      memcpy(&C[j*Mbytes],T.hash_tb[hashkey],Mbytes);
+      if (ia != NULL) ia[j] = i+1;
+
+      // first unique element only
+      T.hash_id[hashkey] = 'Y';
+      j++;
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void mxsetdiffOut(mxhashTable T,size_t Na,size_t Nb,
+		  int nlhs,mxArray *plhs[])
+{
+  mxArray *C;
+  double *ia = NULL;
+
+  C = plhs[0] = mxCreateCellMatrix(1,T.na);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na,mxREAL);
+    ia = mxGetPr(plhs[1]);
+  }
+
+  for (size_t i = 0,j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'A') {
+      mxSetCell(C,j,mxDuplicateArray(T.hash_tb[hashkey]));
+      if (ia != NULL) ia[j] = i+1;
+
+      T.hash_id[hashkey] = 'Y';
+      j++;
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void setxorOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+	       int nlhs,mxArray *plhs[],
+	       size_t siz,mxClassID id)
+/* [C,IA,IB] = FSETOP('setxor',A,B) returns the columns that are not
+   in the intersection of A and B. C = [A(:,IA) B(:,IB)]. */
+{
+  uint8_t *C;
+  double *ia = NULL,*ib = NULL;
+
+  // allocate output
+  plhs[0] = mxCreateNumericMatrix(Mbytes/siz,T.na+T.nb,id,mxREAL);
+  C = (uint8_t *)mxGetData(plhs[0]);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,T.nb,mxREAL);
+      ib = mxGetPr(plhs[2]);
+    }
+  }
+
+  size_t k = 0;
+  for (size_t i = 0,j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'A') {
+      memcpy(&C[k++*Mbytes],T.hash_tb[hashkey],Mbytes);
+      if (ia != NULL) ia[j++] = i+1;
+
+      // first unique element only
+      T.hash_id[hashkey] = 'Y';
+    }
+  }
+
+  for (size_t i = 0, j = 0; i < Nb; i++) {
+    const uint32_t hashkey = T.hash_ixb[i];
+    if (T.hash_id[hashkey] == 'B') {
+      memcpy(&C[k++*Mbytes],T.hash_tb[hashkey],Mbytes);
+      if (ib != NULL) ib[j++] = i+1;
+
+      T.hash_id[hashkey] = 'Y';
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void mxsetxorOut(mxhashTable T,size_t Na,size_t Nb,
+		 int nlhs,mxArray *plhs[])
+{
+  mxArray *C;
+  double *ia = NULL,*ib = NULL;
+
+  C = plhs[0] = mxCreateCellMatrix(1,T.na+T.nb);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,T.nb,mxREAL);
+      ib = mxGetPr(plhs[2]);
+    }
+  }
+
+  size_t k = 0;
+  for (size_t i = 0,j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'A') {
+      mxSetCell(C,k++,mxDuplicateArray(T.hash_tb[hashkey]));
+      if (ia != NULL) ia[j++] = i+1;
+
+      T.hash_id[hashkey] = 'Y';
+    }
+  }
+
+  for (size_t i = 0, j = 0; i < Nb; i++) {
+    const uint32_t hashkey = T.hash_ixb[i];
+    if (T.hash_id[hashkey] == 'B') {
+      mxSetCell(C,k++,mxDuplicateArray(T.hash_tb[hashkey]));
+      if (ib != NULL) ib[j++] = i+1;
+
+      T.hash_id[hashkey] = 'Y';
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void unionOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+	      int nlhs,mxArray *plhs[],
+	      size_t siz,mxClassID id)
+/* [C,IA,IB,JA,JB] = FSETOP('union',A,B) returns the combined columns
+   from A and B but with no repetitions. C = [A(:,IA) B(:,IB)] and A =
+   C(:,JA), B = C(:,JB). */
+{
+  uint8_t *C;
+  double *ia = NULL,*ib = NULL,*ja = NULL,*jb = NULL;
+
+  plhs[0] = mxCreateNumericMatrix(Mbytes/siz,T.na+T.nx+T.nb,id,mxREAL);
+  C = (uint8_t *)mxGetData(plhs[0]);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na+T.nx,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,T.nb,mxREAL);
+      ib = mxGetPr(plhs[2]);
+      if (nlhs > 3) {
+	plhs[3] = mxCreateDoubleMatrix(1,Na,mxREAL);
+	ja = mxGetPr(plhs[3]);
+	if (nlhs > 4) {
+	  plhs[4] = mxCreateDoubleMatrix(1,Nb,mxREAL);
+	  jb = mxGetPr(plhs[4]);
+	}
+      }
+    }
+  }
+
+  size_t k = 0;
+  for (size_t i = 0, j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'A' || T.hash_id[hashkey] == 'X') {
+      memcpy(&C[k*Mbytes],T.hash_tb[hashkey],Mbytes);
+      if (ia != NULL) {
+	ia[j++] = i+1;
+	if (ja != NULL) {
+	  ja[i] = k+1;
+	  T.hash_tb[hashkey] = &C[k*Mbytes];
+	}
+      }
+      k++;
+
+      // first unique element
+      T.hash_id[hashkey] = 'Y';
+    }
+    else if (ja != NULL && T.hash_id[hashkey] == 'Y')
+      ja[i] = (T.hash_tb[hashkey]-&C[0])/(Mbytes ? Mbytes : 1)+1;
+  }
+
+  for (size_t i = 0, j = 0; i < Nb; i++) {
+    const uint32_t hashkey = T.hash_ixb[i];
+    if (T.hash_id[hashkey] == 'B') {
+      memcpy(&C[k*Mbytes],T.hash_tb[hashkey],Mbytes);
+      if (ib != NULL) {
+	ib[j++] = i+1;
+	if (jb != NULL) {
+	  jb[i] = k+1;
+	  T.hash_tb[hashkey] = &C[k*Mbytes];
+	}
+      }
+      k++;
+
+      T.hash_id[hashkey] = 'Y';
+    }
+    else if (jb != NULL && T.hash_id[hashkey] == 'Y')
+      jb[i] = (T.hash_tb[hashkey]-&C[0])/(Mbytes ? Mbytes : 1)+1;
+  }
+}
+/*------------------------------------------------------------------------*/
+void mxunionOut(mxhashTable T,size_t Na,size_t Nb,
+		int nlhs,mxArray *plhs[])
+{
+  mxArray *C;
+  double *ia = NULL,*ib = NULL,*ja = NULL,*jb = NULL;
+
+  C = plhs[0] = mxCreateCellMatrix(1,T.na+T.nx+T.nb);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na+T.nx,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,T.nb,mxREAL);
+      ib = mxGetPr(plhs[2]);
+      if (nlhs > 3) {
+	plhs[3] = mxCreateDoubleMatrix(1,Na,mxREAL);
+	ja = mxGetPr(plhs[3]);
+	if (nlhs > 4) {
+	  plhs[4] = mxCreateDoubleMatrix(1,Nb,mxREAL);
+	  jb = mxGetPr(plhs[4]);
+	}
+      }
+    }
+  }
+
+  size_t k = 0;
+  for (size_t i = 0, j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+    if (T.hash_id[hashkey] == 'A' || T.hash_id[hashkey] == 'X') {
+      mxSetCell(C,k,mxDuplicateArray(T.hash_tb[hashkey]));
+      if (ia != NULL) {
+	ia[j++] = i+1;
+	if (ja != NULL) {
+	  ja[i] = k+1;
+	  T.hash_iixa[hashkey] = k;
+	}
+      }
+      k++;
+
+      T.hash_id[hashkey] = 'Y';
+    }
+    else if (ja != NULL && T.hash_id[hashkey] == 'Y')
+      ja[i] = T.hash_iixa[hashkey]+1;
+  }
+
+  for (size_t i = 0, j = 0; i < Nb; i++) {
+    const uint32_t hashkey = T.hash_ixb[i];
+    if (T.hash_id[hashkey] == 'B') {
+      mxSetCell(C,k,mxDuplicateArray(T.hash_tb[hashkey]));
+      if (ib != NULL) {
+	ib[j++] = i+1;
+	if (jb != NULL) {
+	  jb[i] = k+1;
+	  T.hash_iixb[hashkey] = k;
+	}
+      }
+      k++;
+
+      T.hash_id[hashkey] = 'Z';
+    }
+    else if (jb != NULL) {
+      if (T.hash_id[hashkey] == 'Y')
+	jb[i] = T.hash_iixa[hashkey]+1;
+      else if (T.hash_id[hashkey] == 'Z')
+	jb[i] = T.hash_iixb[hashkey]+1;
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void uniqueOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+	       int nlhs,mxArray *plhs[],
+	       size_t siz,mxClassID id)
+/* [B,IA,IB] = FSETOP('unique',A) returns the same columns as in A but
+   with no repetitions. B = A(:,IA) and A = B(:,IB). */
+{
+  uint8_t *B;
+  double *ia = NULL,*ib = NULL;
+
+  plhs[0] = mxCreateNumericMatrix(Mbytes/siz,T.na,id,mxREAL);
+  B = (uint8_t *)mxGetData(plhs[0]);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,Na,mxREAL);
+      ib = mxGetPr(plhs[2]);
+    }
+  }
+
+  for (size_t i = 0, j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+
+    if (T.hash_id[hashkey] == 'A') {
+      memcpy(&B[j*Mbytes],T.hash_tb[hashkey],Mbytes);
+      if (ia != NULL) {
+	ia[j] = i+1;
+	if (ib != NULL)
+	  // because IB points into B
+	  T.hash_tb[hashkey] = &B[j*Mbytes];
+      }
+
+      // first unique element only
+      T.hash_id[hashkey] = 'Y';
+      j++;
+    }
+
+    // IB does not refer to unique elements
+    if (ib != NULL) ib[i] = ((T.hash_tb[hashkey]-&B[0]))/
+      (Mbytes ? Mbytes : 1)+1;
+  }
+}
+/*------------------------------------------------------------------------*/
+void mxuniqueOut(mxhashTable T,size_t Na,size_t Nb,
+		 int nlhs,mxArray *plhs[])
+{
+  mxArray *B;
+  double *ia = NULL,*ib = NULL;
+
+  B = plhs[0] = mxCreateCellMatrix(1,T.na);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,T.na,mxREAL);
+    ia = mxGetPr(plhs[1]);
+    if (nlhs > 2) {
+      plhs[2] = mxCreateDoubleMatrix(1,Na,mxREAL);
+      ib = mxGetPr(plhs[2]);
+    }
+  }
+
+  for (size_t i = 0, j = 0; i < Na; i++) {
+    const uint32_t hashkey = T.hash_ixa[i];
+
+    if (T.hash_id[hashkey] == 'A') {
+      mxSetCell(B,j,mxDuplicateArray(T.hash_tb[hashkey]));
+      if (ia != NULL) {
+	ia[j] = i+1;
+	if (ib != NULL)
+	  T.hash_iixa[hashkey] = j;
+      }
+
+      T.hash_id[hashkey] = 'Y';
+      j++;
+    }
+
+    // new construction
+    if (ib != NULL) ib[i] = T.hash_iixa[hashkey]+1;
+  }
+}
+/*------------------------------------------------------------------------*/
+void ismemberOut(hashTable T,size_t Na,size_t Nb,size_t Mbytes,
+		 int nlhs,mxArray *plhs[],
+		 size_t siz,mxClassID id)
+/* [IA,IB] = FSETOP('ismember',A,B) returns a logical vector IA
+   containing 1 where the columns of A are also columns of B and 0
+   otherwise. IB contains the index in B of each column in A and zero
+   if no such index exists. A(:,IA) = B(:,IB(IA). */
+{
+  mxLogical *ia;
+  double *ib = NULL;
+
+  plhs[0] = mxCreateLogicalMatrix(1,Na);
+  ia = mxGetLogicals(plhs[0]);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,Na,mxREAL);
+    ib = mxGetPr(plhs[1]);
+  }
+
+  for (size_t j = 0; j < Na; j++) {
+    // the meaning of hash_ixa and hash_ixb has now switched!
+    const size_t hashkey = T.hash_ixb[j];
+    if (T.hash_id[hashkey] == 'X') {
+      ia[j] = 1;
+
+      if (ib != NULL)
+	ib[j] = (T.hash_tb[hashkey]-T.hash_tb[T.hash_ixa[0]])/
+	  (Mbytes ? Mbytes : 1)+1;
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void mxismemberOut(mxhashTable T,size_t Na,size_t Nb,
+		   int nlhs,mxArray *plhs[])
+{
+  mxLogical *ia;
+  double *ib = NULL;
+
+  plhs[0] = mxCreateLogicalMatrix(1,Na);
+  ia = mxGetLogicals(plhs[0]);
+  if (nlhs > 1) {
+    plhs[1] = mxCreateDoubleMatrix(1,Na,mxREAL);
+    ib = mxGetPr(plhs[1]);
+  }
+
+  for (size_t j = 0; j < Na; j++) {
+    const size_t hashkey = T.hash_ixb[j];
+    if (T.hash_id[hashkey] == 'X') {
+      ia[j] = 1;
+
+      // new construction
+      if (ib != NULL) ib[j] = T.hash_iixa[hashkey]+1;
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+HASH_OP getop(const char *str)
+/* Small perfect hash for the supported set operations. Note that
+   str[4] must be readable for this to work. Returns UNKNOWN on
+   failure. */
+{
+  static const char *tab[] = {
+    "union","setxor","","","intersect","",
+    "setdiff","unique","check","","ismember"
+  };
+  const unsigned hash = ((unsigned)str[4])%11;
+
+  if (str[0] != '\0' && strcmp(str,tab[hash]) == 0)
+    return hash;
+  else
+    return UNKNOWN;
+}
+/*------------------------------------------------------------------------*/
+
+// table is only allowed to become about 70% full
+const double HASH_LOAD = 1.0/0.70;
+
+size_t hashsize(size_t len)
+/* Size of hash table. This is simply
+   exp2(ceil(log2(HASH_LOAD*len))). */
+{
+  size_t ans = 1;
+
+  len = ceil(HASH_LOAD*len);
+  do ans <<= 1; while (len >>= 1);
+
+  return ans;
+}
+/*------------------------------------------------------------------------*/
+
+// should work on most platforms
+#define get16bits(d) (*((const uint16_t *)(d)))
+
+uint32_t hash(const uint8_t *val,size_t nbytes,uint32_t offset)
+/* Computes a hashkey from bytes. The input offset may be used to
+   generate a new hashkey from the same data. Use zero as the default.
+
+   The original author of this function is Paul Hsieh. See
+   http://www.azillionmonkeys.com/qed/hash.html for further details.
+*/
+{
+  uint32_t hash = offset,temp,rem;
+
+  // incorporate 4 bytes at a time
+  rem = nbytes&3;
+  nbytes >>= 2;
+
+  // main loop
+  for ( ; nbytes > 0; nbytes--, val += 4) {
+    hash += get16bits(val);
+    temp = (get16bits(val+2) << 11)^hash;
+    hash = (hash << 16)^temp;
+    hash += hash >> 11;
+  }
+
+  // handle end cases
+  switch (rem) {
+  case 3:
+    hash += get16bits(val);
+    hash ^= hash << 16;
+    hash ^= val[2] << 18;
+    hash += hash >> 11;
+    break;
+  case 2:
+    hash += get16bits(val);
+    hash ^= hash << 11;
+    hash += hash >> 17;
+    break;
+  case 1:
+    hash += val[0];
+    hash ^= hash << 10;
+    hash += hash >> 1;
+  }
+
+  // force "avalanching" of final 127 bits
+  hash ^= hash << 3;
+  hash += hash >> 5;
+  hash ^= hash << 2;
+  hash += hash >> 15;
+  hash ^= hash << 10;
+
+  return hash;
+}
+/*------------------------------------------------------------------------*/
+uint32_t hash2nd(const uint8_t *val,size_t nbytes)
+/* Computes a secondary (incremental) hashkey from bytes. */
+{
+  // a fairly independent and odd hashkey
+  return hash(val,nbytes,0x0f0f0f0f)|1;
+}
+/*------------------------------------------------------------------------*/
+bool mx_IsEqual(const mxArray *A,const mxArray *B)
+/* Bitwise equality for real non-sparse mxArrays containing data. This
+   function will not work properly for sparse arrays, cell-arrays,
+   struct-arrays or for arrays containing imaginary data. */
+{
+  // class
+  if (mxGetClassID(A) != mxGetClassID(B)) return false;
+
+  // real data
+  const int nelems = mxGetNumberOfElements(A);
+  const int sizelem = mxGetElementSize(A);
+  if (nelems != mxGetNumberOfElements(B)) return false;
+  if (memcmp(mxGetData(A),mxGetData(B),nelems*sizelem)) return false;
+
+  // shape
+  int ndims = mxGetNumberOfDimensions(A);
+  if (ndims != mxGetNumberOfDimensions(B)) return false;
+  if (memcmp(mxGetDimensions(A),mxGetDimensions(B),ndims*sizeof(int)))
+    return false;
+
+  return true;
+}
+/*------------------------------------------------------------------------*/
+void hashSort(hashTable *T,
+	      const uint8_t *A,size_t Na,
+	      const uint8_t *B,size_t Nb,size_t Mbytes)
+/* Determines the full Venn-topology of the arrays A and B using
+   hashing.
+
+   Input are the arrays A and B of sizes Mbytes-by-Na and
+   Mbytes-by-Nb.
+
+   On return, hash_tb is the hash-table pointing back into the columns
+   of A and B, hash_id contains 'A' for pointers strictly into A, 'B'
+   for pointers strictly into B and 'X' for pointers into A but with
+   equivalent elements contained in B as well. The number of A's, B's
+   and X's are returned in na, nb and nx respectively. The vectors
+   hash_ixa and hash_ixb has the same length as A and B and points
+   into hash_tb/hash_id.
+*/
+{
+  // hash table and index into hash table
+  T->hashsz = hashsize(Na+Nb);
+  T->hash_tb = mxMalloc(T->hashsz*sizeof(uint8_t *));
+  T->hash_id = mxCalloc(T->hashsz,sizeof(char));
+  T->hash_ixa = mxMalloc(Na*sizeof(uint32_t));
+  T->hash_ixb = mxMalloc(Nb*sizeof(uint32_t));
+
+  // clear counters
+  T->na = T->nb = T->nx = 0;
+
+  // hash columns of A
+  for (size_t i = 0; i < Na; i++) {
+    uint32_t hashkey = hash(&A[i*Mbytes],Mbytes,0)&(T->hashsz-1);
+
+    // collision?
+    if (T->hash_id[hashkey] != '\0' && 
+	memcmp(&A[i*Mbytes],T->hash_tb[hashkey],Mbytes)) {
+      const uint32_t hashkey2nd = hash2nd(&A[i*Mbytes],Mbytes);
+
+      // rehash until empty slot found
+      do
+	hashkey = (hashkey+hashkey2nd)&(T->hashsz-1);
+      while (T->hash_id[hashkey] != '\0' && 
+	     memcmp(&A[i*Mbytes],T->hash_tb[hashkey],Mbytes));
+    }
+
+    // fill in table
+    if (T->hash_id[hashkey] == '\0') {
+      T->hash_tb[hashkey] = &((uint8_t *)A)[i*Mbytes];
+      T->hash_id[hashkey] = 'A';
+      T->hash_ixa[i] = hashkey;
+      T->na++;
+    }
+    else
+      T->hash_ixa[i] = hashkey;
+  }
+
+  // hash columns of B
+  for (size_t i = 0; i < Nb; i++) {
+    uint32_t hashkey = hash(&B[i*Mbytes],Mbytes,0)&(T->hashsz-1);
+
+    if (T->hash_id[hashkey] != '\0' && 
+	memcmp(&B[i*Mbytes],T->hash_tb[hashkey],Mbytes)) {
+      const uint32_t hashkey2nd = hash2nd(&B[i*Mbytes],Mbytes);
+
+      do
+	hashkey = (hashkey+hashkey2nd)&(T->hashsz-1);
+      while (T->hash_id[hashkey] != '\0' && 
+	     memcmp(&B[i*Mbytes],T->hash_tb[hashkey],Mbytes));
+    }
+
+    if (T->hash_id[hashkey] == '\0') {
+      T->hash_tb[hashkey] = &((uint8_t *)B)[i*Mbytes];
+      T->hash_id[hashkey] = 'B';
+      T->hash_ixb[i] = hashkey;
+      T->nb++;
+    }
+    else {
+      T->hash_ixb[i] = hashkey;
+
+      // special case here
+      if (T->hash_id[hashkey] == 'A') {
+	T->na--;
+	T->hash_id[hashkey] = 'X';
+	T->nx++;
+      }
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void mxhashSort(mxhashTable *T,
+		const mxArray *A,size_t Na,
+		const mxArray *B,size_t Nb)
+/* Corresponding routine for two cell-arrays of mxArrays. */
+{
+  T->hashsz = hashsize(Na+Nb);
+  T->hash_tb = mxMalloc(T->hashsz*sizeof(mxArray *));
+  T->hash_id = mxCalloc(T->hashsz,sizeof(char));
+  T->hash_ixa = mxMalloc(Na*sizeof(uint32_t));
+  T->hash_ixb = mxMalloc(Nb*sizeof(uint32_t));
+
+  // inverse pointers (back into hash_ixa, hash_ixb)
+  T->hash_iixa = mxMalloc(T->hashsz*sizeof(uint32_t));
+  T->hash_iixb = mxMalloc((Nb != 0)*T->hashsz*sizeof(uint32_t));
+
+  T->na = T->nb = T->nx = 0;
+
+  for (size_t i = 0; i < Na; i++) {
+    // hashing over data and class only...
+    const mxArray *Ai = mxGetCell(A,i);
+    uint32_t hashkey = hash(mxGetData(Ai),mxGetNumberOfElements(Ai)*
+			    mxGetElementSize(Ai),mxGetClassID(Ai));
+    hashkey = hashkey&(T->hashsz-1);
+
+    if (T->hash_id[hashkey] != '\0' && 
+	!mx_IsEqual(Ai,T->hash_tb[hashkey])) {
+      // ...hashing over the shape is used as the secondary key
+      const uint32_t hashkey2nd = hash2nd((const uint8_t *)
+					  mxGetDimensions(Ai),
+					  mxGetNumberOfDimensions(Ai)*
+					  sizeof(int));
+
+      do
+	hashkey = (hashkey+hashkey2nd)&(T->hashsz-1);
+      while (T->hash_id[hashkey] != '\0' && 
+	     !mx_IsEqual(Ai,T->hash_tb[hashkey]));
+    }
+
+    if (T->hash_id[hashkey] == '\0') {
+      T->hash_tb[hashkey] = (mxArray *)Ai;
+      T->hash_id[hashkey] = 'A';
+      T->hash_ixa[i] = hashkey;
+      T->hash_iixa[hashkey] = i;
+      T->na++;
+    }
+    else
+      T->hash_ixa[i] = hashkey;
+  }
+
+  for (size_t i = 0; i < Nb; i++) {
+    const mxArray *Bi = mxGetCell(B,i);
+    uint32_t hashkey = hash(mxGetData(Bi),mxGetNumberOfElements(Bi)*
+			    mxGetElementSize(Bi),mxGetClassID(Bi));
+    hashkey = hashkey&(T->hashsz-1);
+
+    if (T->hash_id[hashkey] != '\0' && 
+	!mx_IsEqual(Bi,T->hash_tb[hashkey])) {
+      const uint32_t hashkey2nd = hash2nd((const uint8_t *)
+					  mxGetDimensions(Bi),
+					  mxGetNumberOfDimensions(Bi)*
+					  sizeof(int));
+
+      do
+	hashkey = (hashkey+hashkey2nd)&(T->hashsz-1);
+      while (T->hash_id[hashkey] != '\0' && 
+	     !mx_IsEqual(Bi,T->hash_tb[hashkey]));
+    }
+
+    if (T->hash_id[hashkey] == '\0') {
+      T->hash_tb[hashkey] = (mxArray *)Bi;
+      T->hash_id[hashkey] = 'B';
+      T->hash_ixb[i] = hashkey;
+      T->hash_iixb[hashkey] = i;
+      T->nb++;
+    }
+    else {
+      T->hash_ixb[i] = hashkey;
+
+      if (T->hash_id[hashkey] == 'A') {
+	T->na--;
+	T->hash_id[hashkey] = 'X';
+	T->hash_iixb[hashkey] = i;
+	T->nx++;
+      }
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
diff --git a/stenglib/Fast/source/fsparse.c b/stenglib/Fast/source/fsparse.c
new file mode 100644
index 00000000..88f667ca
--- /dev/null
+++ b/stenglib/Fast/source/fsparse.c
@@ -0,0 +1,1688 @@
+/* fsparse.c */
+
+/* S. Engblom 2013-12-02 (OpenMP) */
+/* S. Engblom 2010-02-02 (Minor revision) */
+/* S. Engblom 2007-05-04 (Revision) */
+/* S. Engblom 2005-05-05 (Revision) */
+/* S. Engblom 2004-10-29 */
+
+#include <math.h>
+#include <string.h>
+// temporary fix for CC under Solaris:
+#ifndef NO_STDINT
+#include <stdint.h>
+#endif
+
+#include "mex.h"
+#include "matrix.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifndef FSPARSE_TIME
+
+#define StartTime
+#define StopTime
+#define GetTime(dest)
+
+#else
+
+#include <sys/time.h>
+
+static double *time_vec; // global dummy for output time vector
+
+static struct timeval TIME_before,TIME_after;
+#define StartTime gettimeofday(&TIME_before,NULL)
+#define StopTime gettimeofday(&TIME_after,NULL)
+#define GetTime(dest) ((dest) = (double)(TIME_after.tv_sec-TIME_before.tv_sec)+ \
+		       (TIME_after.tv_usec-TIME_before.tv_usec)/1000000.0)
+
+#endif // FSPARSE_TIME
+
+// print intermediate results:
+#undef PRINT_INTERMEDIATE
+
+/*------------------------------------------------------------------------*/
+
+// forward declarations
+bool mx_IsInt(const mxArray *array_ptr);
+
+bool getix(int **ix,int M,int N,int *max,bool nocopy,const mxArray *IX);
+
+mxArray *sparse2sparse(const mxArray *S);
+mxArray *full2sparse(const mxArray *S);
+
+void squeeze(mxArray *S);
+
+void sparse_insert(mwIndex *irS,double *prS,double *piS,
+		   const int *irank,const int *rank,const mwSize *jrS,
+		   const int *ii,
+		   const double *sr,const double *si,
+		   int smod,int sdiv,int len,int M);
+void sparse_inserti(mwIndex *irS,double *prS,double *piS,
+		    const int *irank,
+		    const int *ii,int imod,
+		    const double *sr,const double *si,
+		    int smod,int sdiv,int len);
+
+mxArray *sparse(const int *ii,const int *jj,
+		const double *sr,const double *si,
+		int smod,int sdiv,
+		int len,int M,int N,int Nzmax);
+mxArray *sparse_nosort(const int *ii,const int *jj,
+		       const double *sr,const double *si,
+		       int smod,int sdiv,
+		       int len,int M,int N,int Nzmax);
+mxArray *gsparse(const int *ii,int imod,
+		 const int *jj,int jdiv,
+		 const double *sr,const double *si,
+		 int smod,int sdiv,
+		 int len,int M,int N,int Nzmax);
+mxArray *gsparse_nosort(const int *ii,int imod,
+			const int *jj,int jdiv,
+			const double *sr,const double *si,
+			int smod,int sdiv,
+			int len,int M,int N,int Nzmax);
+
+/*------------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+#ifdef FSPARSE_TIME
+  // output time vector allocated here already
+  plhs[1] = mxCreateDoubleMatrix(1,6,mxREAL);
+  time_vec = mxGetPr(plhs[1]);
+
+  // check of syntax
+  if (nrhs < 1 || nrhs == 2 || 6 < nrhs  || nlhs > 2)
+    mexErrMsgIdAndTxt("fsparse:e1",
+		      "Expecting 1, 3..6 inputs and one or two outputs.");
+#else
+  if (nrhs < 1 || nrhs == 2 || 6 < nrhs  || nlhs > 1)
+    mexErrMsgIdAndTxt("fsparse:e1",
+		      "Expecting 1, 3..6 inputs and one output.");
+#endif
+
+  // special case for one input
+  if (nrhs == 1) {
+    if (!mxIsDouble(prhs[0]))
+      mexErrMsgIdAndTxt("fsparse:e2",
+			"Single input argument must be double.");
+
+    if (mxIsSparse(prhs[0])) {
+      plhs[0] = sparse2sparse(prhs[0]);
+      return;
+    }
+    else {
+      if (mxGetNumberOfDimensions(prhs[0]) > 2)
+	mexErrMsgIdAndTxt("fsparse:e3",
+			  "Single input argument must be 2-D.");
+      plhs[0] = full2sparse(prhs[0]);
+      return;
+    }
+  }
+
+  if (!mx_IsInt(prhs[0]) && !mxIsDouble(prhs[0]) || 
+      mxIsComplex(prhs[0]) || mxIsSparse(prhs[0]) ||
+      !mx_IsInt(prhs[1]) && !mxIsDouble(prhs[1]) || 
+      mxIsComplex(prhs[1]) || mxIsSparse(prhs[1]))
+    mexErrMsgIdAndTxt("fsparse:e4",
+		      "Index argument must be real, double or integers " 
+		      "and non-sparse.");
+
+  if (!mxIsDouble(prhs[2]) || mxIsSparse(prhs[2]))
+    mexErrMsgIdAndTxt("fsparse:e5",
+		      "Value argument must be double and non-sparse.");
+
+  if (mxGetNumberOfDimensions(prhs[0]) > 2 ||
+      mxGetNumberOfDimensions(prhs[1]) > 2 ||
+      mxGetNumberOfDimensions(prhs[2]) > 2)
+    mexErrMsgIdAndTxt("fsparse:e6","Input arguments must be 2-D.");
+
+  if (nrhs > 3) {
+    if (!mxIsDouble(prhs[3]) || mxIsComplex(prhs[3]) || mxIsSparse(prhs[3]))
+      mexErrMsgIdAndTxt("fsparse:e7","Size argument must be real, "
+			"double and non-sparse.");
+    if (nrhs > 4) {
+      if (mxGetNumberOfElements(prhs[4]) != 0 && !mxIsChar(prhs[4]))
+	mexErrMsgIdAndTxt("fsparse:e8","Flag argument must be "
+			  "character array.");
+      if (nrhs > 5) {
+	// must determine the number of threads here (before any parallel region)
+	if (!mxIsDouble(prhs[5]) || 
+	    mxGetNumberOfElements(prhs[5]) != 1)
+	  mexErrMsgIdAndTxt("fsparse:e17","Number of threads must be a "
+			    "real scalar.");
+	const int nthreads = (int)mxGetScalar(prhs[5]);
+	if (nthreads < 1)
+	  mexErrMsgIdAndTxt("fsparse:e18","Number of threads must be >= 1.");
+#ifdef _OPENMP
+	omp_set_num_threads(nthreads);
+#endif // ignored otherwise
+      }
+    }
+  }
+
+  // input
+  const int Mii = mxGetM(prhs[0]),Nii = mxGetN(prhs[0]);
+  const int Mjj = mxGetM(prhs[1]),Njj = mxGetN(prhs[1]);
+  const bool nocopyii = mx_IsInt(prhs[0]),nocopyjj = mx_IsInt(prhs[1]);  
+  int *ii,*jj;
+  const int Mss = mxGetM(prhs[2]),Nss = mxGetN(prhs[2]);
+  const double *sr = mxGetPr(prhs[2]);
+  const double *si = mxGetPi(prhs[2]);
+  const int len = Mii*Njj;
+  int M = 0,N = 0,Nzmax = -1,sort = 1;
+
+  // check of 'assembly' syntax
+  if (Nii != Njj && Nii != 1 ||
+      Mjj != Mii && Mjj != 1 ||
+      Mss != Mii && Mss != 1 ||
+      Nss != Njj && Nss != 1)
+    mexErrMsgIdAndTxt("fsparse:e9","Sizes mismatch.");
+
+  // input ii and jj
+  StartTime;
+  bool ok1,ok2;
+#ifndef _OPENMP
+  ok1 = getix(&ii,Mii,Nii,&M,nocopyii,prhs[0]);
+  ok2 = getix(&jj,Mjj,Njj,&N,nocopyjj,prhs[1]);
+#else
+  // independent calls
+#pragma omp single nowait
+  ok1 = getix(&ii,Mii,Nii,&M,nocopyii,prhs[0]);
+#pragma omp single nowait
+  ok2 = getix(&jj,Mjj,Njj,&N,nocopyjj,prhs[1]);
+#pragma omp barrier
+#endif // _OPENMP
+  if (!ok1 || !ok2)
+    mexErrMsgIdAndTxt("fsparse:e10","Index argument must be "
+		      "nonnegative integers.");
+  StopTime;
+  GetTime(time_vec[0]);
+
+  // determine the input dimensions [M N Nzmax] of the output
+  if (nrhs > 3) {
+    const int szlen = mxGetNumberOfElements(prhs[3]);
+    const double *szval = mxGetPr(prhs[3]);
+
+    if (szlen > 0) {
+      if (szval[0] < 0.0 || szval[0] != ceil(szval[0]))
+	mexErrMsgIdAndTxt("fsparse:e11","Size argument must be "
+			  "nonnegative integer.");
+      if (M > szval[0])
+	mexErrMsgIdAndTxt("fsparse:e12","Index exceeds matrix dimensions.");
+      M = szval[0];
+      if (szlen > 1) {
+	if (szval[1] < 0.0 || szval[1] != ceil(szval[1]))
+	  mexErrMsgIdAndTxt("fsparse:e11","Size argument must be "
+			    "nonnegative integer.");
+	if (N > szval[1])
+	  mexErrMsgIdAndTxt("fsparse:e12","Index exceeds "
+			    "matrix dimensions.");
+	N = szval[1];
+	if (szlen > 2) {
+	  if (szval[2] < 0.0 || szval[2] != ceil(szval[2]))
+	    mexErrMsgIdAndTxt("fsparse:e13","Nzmax argument must be "
+			      "nonnegative integer.");
+	  Nzmax = szval[2];
+	  if (szlen > 3)
+	    mexErrMsgIdAndTxt("fsparse:e14","Size argument must "
+			      "contain 3 elements or less.");
+	}
+      }
+    }
+  }
+
+  // sorted/not sorted output
+  if (nrhs > 4) {
+    if (mxGetNumberOfElements(prhs[4]) == 0)
+      sort = 1;
+    else {
+      char buf[15]; // read 14 characters at most
+      if (mxGetString(prhs[4],buf,15) != 0)
+	mexErrMsgIdAndTxt("fsparse:e15","Unrecognized flag argument.");
+      if (strcmp(buf,"nosort") == 0)
+	sort = 0;
+      else if (strcmp(buf,"sort") != 0)
+	mexErrMsgIdAndTxt("fsparse:e15","Unrecognized flag argument.");
+    }
+  }
+
+  // empty case
+  if (len == 0) {
+    plhs[0] = mxCreateSparse(M,N,Nzmax == -1 ? 0 : Nzmax,
+			     si == NULL ? mxREAL : mxCOMPLEX);
+    return;
+  }
+
+  if (Nii == Njj && Mjj == Mii) {
+    /* cases when ii and jj have the same shape but (sr,si) have one
+       of 4 different shapes */
+    const int smod = Nss != Njj ? Mii : len;
+    const int sdiv = Mss != Mii ? Mii : 1;
+    if (sort)
+      plhs[0] = sparse(ii,jj,sr,si,smod,sdiv,len,
+		       M,N,Nzmax);
+    else   
+      plhs[0] = sparse_nosort(ii,jj,sr,si,smod,sdiv,len,
+			      M,N,Nzmax);
+  }
+  else {
+    // fully general case
+    const int imod = Nii != Njj ? Mii : len;
+    const int jdiv = Mjj != Mii ? Mii : 1;
+    const int smod = Nss != Njj ? Mii : len;
+    const int sdiv = Mss != Mii ? Mii : 1;
+    if (sort)
+      plhs[0] = gsparse(ii,imod,jj,jdiv,sr,si,smod,sdiv,len,
+			M,N,Nzmax);
+    else   
+      plhs[0] = gsparse_nosort(ii,imod,jj,jdiv,sr,si,smod,sdiv,len,
+			       M,N,Nzmax);
+  }
+
+  // deallocate
+  if (!nocopyii) mxFree(ii);
+  if (!nocopyjj) mxFree(jj);
+
+  // squeeze out zero elements
+  squeeze(plhs[0]);
+}
+/*------------------------------------------------------------------------*/
+bool mx_IsInt(const mxArray *array_ptr)
+/* Returns logical 1 (true) if array_ptr is a numeric array containing
+   integers (int8, int16, int32 or int64 depending on the platform),
+   and logical 0 (false) otherwise. This is useful since the test is
+   not provided in MEX.
+
+   In the name of the function, an extra underscore is used in order
+   to avoid confusing it with true MEX-functions. */
+{
+  const int id = mxGetClassID(array_ptr);
+  const size_t siz = mxGetElementSize(array_ptr);
+  
+  /* check that the class is an integer and that its size matches the
+     size of an int */
+  return (id == mxINT8_CLASS || id == mxINT16_CLASS ||
+	  id == mxINT32_CLASS || id == mxINT64_CLASS) && siz == sizeof(int);
+}
+/*------------------------------------------------------------------------*/
+bool getix(int **ix,int M,int N,int *max,bool nocopy,const mxArray *IX)
+/* Gets indices ix from mxArray IX. The dimensions are M-by-N, max is
+   set to the maximum index and nocopy defines the type of IX (double
+   or int). */
+{
+  bool ok = true;
+  int mx = *max;
+
+#ifndef _OPENMP
+  if (nocopy) {
+    // no copy
+    const int *iix = (*ix = (int *)mxGetData(IX));
+    for (int i = 0; i < M*N; i++) {
+      if (iix[i] < 1)
+	return false;
+      if (iix[i] > mx) mx = iix[i];
+    }
+  }
+  else {
+    // typecast copy
+    const double *ival = mxGetPr(IX);
+    int *iix = (*ix = mxMalloc(M*N*sizeof(int)));
+    for (int i = 0; i < M*N; i++) {
+      if (ival[i] < 1.0 || ival[i] != ceil(ival[i]))
+	return false;
+      if ((iix[i] = ival[i]) > mx) mx = ival[i];
+    }
+  }
+#else // _OPENMP
+  if (nocopy) {
+    // no copy
+    const int *iix = (*ix = (int *)mxGetData(IX));
+#pragma omp parallel shared (mx)
+{
+    int mymx = mx; // local version of mx
+#pragma omp for
+    for (int i = 0; i < M*N; i++) {
+      if (iix[i] > mymx)
+	mymx = iix[i];
+      else if (iix[i] < 1)
+	ok = false; // no harm in continuing
+    }
+
+    if (mx < mymx)
+#pragma omp critical
+      // ensure nothing changed, then make the swap:
+      if (mx < mymx) mx = mymx;
+} // end omp parallel
+  }
+  else {
+    // typecast copy
+    const double *ival = mxGetPr(IX);
+    int *iix;
+#pragma omp critical
+     // not thread-safe:
+    iix = (*ix = mxMalloc(M*N*sizeof(int)));
+#pragma omp parallel shared (mx)
+{
+    int mymx = mx; // local version of mx
+#pragma omp for
+    for (int i = 0; i < M*N; i++) {
+      if (ival[i] < 1.0 || ival[i] != ceil(ival[i]))
+	ok = false; // no harm in continuing
+      else if ((iix[i] = ival[i]) > mymx)
+	mymx = ival[i];
+    }
+
+    if (mx < mymx)
+#pragma omp critical
+      // ensure nothing changed, then make the swap:
+      if (mx < mymx) mx = mymx;
+} // end omp parallel
+  }
+#endif // _OPENMP
+  *max = mx;
+  return ok;
+}
+/*------------------------------------------------------------------------*/
+mxArray *sparse2sparse(const mxArray *S)
+/* Returns a deep copy T of a sparse matrix S. The allocation is
+   exact. */
+{
+  const mwSize *jcS = mxGetJc(S);
+  const mwIndex *irS = mxGetIr(S);
+  const double *prS = mxGetPr(S),*piS = mxGetPi(S);
+  const int N = mxGetN(S),M = mxGetM(S),Nnz = jcS[N];
+  const bool real = piS == NULL;
+  mxArray *T;
+
+  /* straightforward */
+  T = mxCreateSparse(M,N,Nnz,real ? mxREAL : mxCOMPLEX);
+  memcpy(mxGetJc(T),jcS,(N+1)*sizeof(jcS[0]));
+  memcpy(mxGetIr(T),irS,Nnz*sizeof(irS[0]));
+  memcpy(mxGetPr(T),prS,Nnz*sizeof(prS[0]));
+  if (!real) memcpy(mxGetPi(T),piS,Nnz*sizeof(piS[0]));
+
+  return T;
+}
+/*------------------------------------------------------------------------*/
+mxArray *full2sparse(const mxArray *A)
+/* Constructs a sparse matrix S from a full matrix A. */
+{
+  const double *prA = mxGetPr(A),*piA = mxGetPi(A);
+  const int N = mxGetN(A),M = mxGetM(A);
+  const bool real = piA == NULL;
+  mxArray *S;
+
+  if (real) {
+    mwSize *jcS = mxCalloc(N+1,sizeof(mwSize));
+    mwIndex *irS;
+    double *prS;
+
+    /* determine the column pointer */
+    for (int c = 1,k = 0; c <= N; c++,k += M)
+      for (int i = 0; i < M; i++)
+	if (prA[k+i] != 0.0) jcS[c]++;
+    for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+    /* allocate */
+    S = mxCreateSparse(0,0,jcS[N],mxREAL);
+    mxSetM(S,M);
+    mxSetN(S,N);
+    irS = mxGetIr(S);
+    prS = mxGetPr(S);
+
+    /* set the column pointer */
+    mxFree(mxGetJc(S));
+    mxSetJc(S,jcS);
+
+    /* copy data */
+    for (int c = 1,k = 0,dest = 0; c <= N; c++,k += M)
+      for (int i = 0; i < M; i++)
+	if (prA[k+i] != 0.0) {
+	  irS[dest] = i;
+	  prS[dest++] = prA[k+i];
+	}
+  }
+  else {
+    mwSize *jcS = mxCalloc(N+1,sizeof(mwSize));
+    mwIndex *irS;
+    double *prS,*piS;
+
+    for (int c = 1,k = 0; c <= N; c++,k += M)
+      for (int i = 0; i < M; i++)
+	if (prA[k+i] != 0.0 && piA[k+i] != 0.0) jcS[c]++;
+    for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+    S = mxCreateSparse(0,0,jcS[N],mxCOMPLEX);
+    mxSetM(S,M);
+    mxSetN(S,N);
+    irS = mxGetIr(S);
+    prS = mxGetPr(S);
+    piS = mxGetPi(S);
+
+    mxFree(mxGetJc(S));
+    mxSetJc(S,jcS);
+
+    for (int c = 1,k = 0,dest = 0; c <= N; c++,k += M)
+      for (int i = 0; i < M; i++)
+	if (prA[k+i] != 0.0 && piA[k+i] != 0.0) {
+	  irS[dest] = i;
+	  prS[dest] = prA[k+i];
+	  piS[dest++] = piA[k+i];
+	}
+  }
+
+  return S;
+}
+/*------------------------------------------------------------------------*/
+void squeeze(mxArray *S)
+/* Removes any zero elements explicitly stored in the sparse matrix
+   S. No reallocation is performed.
+
+   There is a quite complicated and potentially faster algorithm based
+   on memmove() that performs the same operation. However, benchmark
+   tests indicate that the following simple code optimizes better. */
+{
+  const int N = mxGetN(S);
+  mwSize *jcS = mxGetJc(S);
+  mwIndex *irS = mxGetIr(S);
+  double *prS = mxGetPr(S),*piS = mxGetPi(S);
+
+  if (piS == NULL) {
+    int c,i;
+
+    /* find the first zero, if any */
+    for (i = c = 0; c < N; c++)
+      for ( ; i < jcS[c+1]; i++)
+	if (prS[i] == 0.0)
+	  goto rfound0; /* a 'double break' */
+    return;
+
+  rfound0:
+    /* copy in a conservative fashion */
+    for (int dest = i++; c < N; c++) {
+      for ( ; i < jcS[c+1]; i++)
+	if (prS[i] != 0.0) {
+	  irS[dest] = irS[i];
+	  prS[dest++] = prS[i];
+	}
+      jcS[c+1] = dest;
+    }
+  }
+  else {
+    int c,i;
+    for (i = c = 0; c < N; c++)
+      for ( ; i < jcS[c+1]; i++)
+	if (prS[i] == 0.0 && piS[i] == 0.0)
+	  goto zfound0;
+    return;
+
+  zfound0:
+    for (int dest = i++; c < N; c++) {
+      for ( ; i < jcS[c+1]; i++)
+	if (prS[i] != 0.0 || piS[i] != 0.0) {
+	  irS[dest] = irS[i];
+	  prS[dest] = prS[i];
+	  piS[dest++] = piS[i];
+	}
+      jcS[c+1] = dest;
+    }
+  }
+}
+/*------------------------------------------------------------------------*/
+void sparse_insert(mwIndex *irS,double *prS,double *piS,
+		   const int *irank,const int *rank,const mwIndex *jrS,
+		   const int *ii,const double *sr,const double *si,
+		   int smod,int sdiv,int len,int M)
+/* Inserts elements into sparse matrix. Input is the sparse matrix
+   itself (irS,prS,piS), an index-table irank, the rowindices ii and
+   the values of the elements (sr,si). Four different formats of the
+   values are allowed as indicated by the parameters smod and sdiv.
+
+   Currently, input rank, jrS, and M are only used #ifdef _OPENMP and
+   for the full case 3 below. */
+{
+  const bool real = si == NULL;
+
+  switch (2*(smod == len)+(sdiv == 1)) {
+  case 3 : /* full case */
+#ifndef _OPENMP
+    if (real)
+      for (int i = 0; i < len; i++) {
+      	irS[irank[i]] = ii[i]-1;
+      	prS[irank[i]] += sr[i];
+      }
+    else
+      for (int i = 0; i < len; i++) {
+	irS[irank[i]] = ii[i]-1;
+	prS[irank[i]] += sr[i];
+	piS[irank[i]] += si[i];
+      }
+#else // _OPENMP
+    if (real) {
+
+    if (rank != NULL) {
+      /* needed since not all cases respond to _OPENMP and uses a
+	 different syntax */
+#pragma omp parallel
+{
+      const int nThreads = omp_get_num_threads();
+      const int myId = omp_get_thread_num();
+      const int rstart = 1+M*myId/nThreads;
+      const int rend = M*(myId+1)/nThreads;
+      int istart;
+      if (rstart == 1)
+	istart = 0;
+      else
+	istart = jrS[rstart-1];
+
+      if (rend >= 1) {
+        for (int i = istart; i < jrS[rend]; i++) irS[irank[i]] = ii[rank[i]]-1;
+        for (int i = istart; i < jrS[rend]; i++) prS[irank[i]] += sr[rank[i]];
+      }
+} // end parallel
+    }
+    else {
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) irS[irank[i]] = ii[i]-1;
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) prS[irank[i]] += sr[i];
+    }
+    }
+    else {
+#pragma omp parallel
+{
+  if (rank != NULL) {
+    const int nThreads = omp_get_num_threads();
+    const int myId = omp_get_thread_num();
+    const int rstart = 1+M*myId/nThreads;
+    const int rend = M*(myId+1)/nThreads;
+    int istart;
+    if (rstart == 1)
+      istart = 0;
+    else
+      istart = jrS[rstart-1];
+
+    if (rend >= 1)
+      for (int i = istart; i < jrS[rend]; i++) {
+	irS[irank[i]] = ii[rank[i]]-1;
+	prS[irank[i]] += sr[rank[i]];
+	piS[irank[i]] += si[rank[i]];
+      }
+  }
+  else {
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) irS[irank[i]] = ii[i]-1;
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) prS[irank[i]] += sr[i];
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) piS[irank[i]] += si[i];
+  }
+} // end parallel
+    }
+#endif // _OPENMP
+    break;
+
+  case 2 : /* horizontal case */
+    if (real)
+      for (int j = 0; j < len; j += sdiv) {
+	const double ssr = sr[j/sdiv];
+	for (int i = j; i < j+sdiv; i++) {
+	  irS[irank[i]] = ii[i]-1;
+	  prS[irank[i]] += ssr;
+	}
+      }
+    else
+      for (int j = 0; j < len; j += sdiv) {
+	const double ssr = sr[j/sdiv];
+	const double ssi = si[j/sdiv];
+	for (int i = j; i < j+sdiv; i++) {
+	  irS[irank[i]] = ii[i]-1;
+	  prS[irank[i]] += ssr;
+	  piS[irank[i]] += ssi;
+	}
+      }
+    break;
+
+  case 1 : /* vertical case */
+    if (real)
+      for (int i = 0; i < len; i++) {
+	irS[irank[i]] = ii[i]-1;
+	prS[irank[i]] += sr[i%smod];
+      }
+    else
+      for (int i = 0; i < len; i++) {
+	irS[irank[i]] = ii[i]-1;
+	prS[irank[i]] += sr[i%smod];
+	piS[irank[i]] += si[i%smod];
+      }
+    break;
+
+  case 0 : /* scalar case */
+#ifndef _OPENMP
+    if (real) {
+      const double ssr = sr[0];
+      for (int i = 0; i < len; i++) {
+	irS[irank[i]] = ii[i]-1;
+	prS[irank[i]] += ssr;
+      }
+    }
+    else {
+      const double ssr = sr[0];
+      const double ssi = si[0];
+      for (int i = 0; i < len; i++) {
+	irS[irank[i]] = ii[i]-1;
+	prS[irank[i]] += ssr;
+	piS[irank[i]] += ssi;
+      }
+    }
+#else // _OPENMP
+    if (real) {
+#pragma omp parallel
+{
+      const double ssr = sr[0];
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) irS[irank[i]] = ii[i]-1;
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) prS[irank[i]] += ssr;
+} // end omp parallel
+    }
+    else {
+#pragma omp parallel
+{
+      const double ssr = sr[0];
+      const double ssi = si[0];
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) irS[irank[i]] = ii[i]-1;
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) prS[irank[i]] += ssr;
+#pragma omp single nowait
+      for (int i = 0; i < len; i++) piS[irank[i]] += ssi;
+} // end omp parallel
+    }
+#endif // _OPENMP
+    break;
+  }
+}
+/*------------------------------------------------------------------------*/
+void sparse_inserti(mwIndex *irS,double *prS,double *piS,
+		    const int *irank,
+		    const int *ii,int imod,
+		    const double *sr,const double *si,
+		    int smod,int sdiv,int len)
+/* Same as sparse_insert() above except that ii is assumed to be
+   vertical. */
+{
+  const bool real = si == NULL;
+
+  switch (2*(smod == len)+(sdiv == 1)) {
+  case 3 : /* full case */
+    if (real)
+      for (int i = 0; i < len; i++) {
+	irS[irank[i]] = ii[i%imod]-1;
+	prS[irank[i]] += sr[i];
+      }
+    else
+      for (int i = 0; i < len; i++) {
+	irS[irank[i]] = ii[i%imod]-1;
+	prS[irank[i]] += sr[i];
+	piS[irank[i]] += si[i];
+      }
+    break;
+  case 2 : /* horizontal case */
+    if (real)
+      for (int j = 0; j < len; j += sdiv) {
+	const double ssr = sr[j/sdiv];
+	for (int i = 0; i < imod; i++) {
+	  irS[irank[j+i]] = ii[i]-1;
+	  prS[irank[j+i]] += ssr;
+	}
+      }
+    else
+      for (int j = 0; j < len; j += sdiv) {
+	const double ssr = sr[j/sdiv];
+	const double ssi = si[j/sdiv];
+	for (int i = 0; i < imod; i++) {
+	  irS[irank[j+i]] = ii[i]-1;
+	  prS[irank[j+i]] += ssr;
+	  piS[irank[j+i]] += ssi;
+	}
+      }
+    break;
+  case 1 : /* vertical case */
+    if (real)
+      for (int i = 0; i < imod; i++) {
+	const double ssr = sr[i];
+	const int iii = ii[i]-1;
+	for (int j = i; j < len; j += imod) {
+	  irS[irank[j]] = iii;
+	  prS[irank[j]] += ssr;
+	}
+      }
+    else
+      for (int i = 0; i < imod; i++) {
+	const double ssr = sr[i];
+	const double ssi = si[i];
+	const int iii = ii[i]-1;
+	for (int j = i; j < len; j += imod) {
+	  irS[irank[j]] = iii;
+	  prS[irank[j]] += ssr;
+	  piS[irank[j]] += ssi;
+	}
+      }
+    break;
+  case 0 : /* scalar case */
+    if (real) {
+      const double ssr = sr[0];
+      for (int j = 0; j < len; j += imod)
+	for (int i = 0; i < imod; i++) {
+	  irS[irank[j+i]] = ii[i]-1;
+	  prS[irank[j+i]] += ssr;
+	}
+    }
+    else {
+      const double ssr = sr[0];
+      const double ssi = si[0];
+      for (int j = 0; j < len; j += imod)
+	for (int i = 0; i < imod; i++) {
+	  irS[irank[j+i]] = ii[i]-1;
+	  prS[irank[j+i]] += ssr;
+	  piS[irank[j+i]] += ssi;
+	}
+    }
+    break;
+  }
+}
+/*------------------------------------------------------------------------*/
+#ifndef _OPENMP
+mxArray *sparse(const int *ii,const int *jj,
+		const double *sr,const double *si,
+		int smod,int sdiv,
+		int len,int M,int N,int Nzmax)
+/* Constructs a sparse matrix in Compressed Column Storage (CCS) from
+   triplet format [ii,jj,sr(si)]. An ordinary Matlab sparse matrix is
+   thus constructed.
+
+   Input (smod,sdiv) determine the shape of the value array sr(si) and
+   len is the length of the index arrays which both must have the same
+   shape. [M N Nzmax] determine the dimensions of the resulting
+   matrix. If Nzmax = -1, then the allocation is exact (i.e. nnz(S) =
+   nzmax(S)). Otherwise, Nzmax must be greater than or equal to the
+   number of nonzeros needed to be stored.
+
+   The memory demand of the algorithm is (at peak and for sufficiently
+   large output) <input>+int[len]+<output>. */
+{
+  // output
+  mxArray *S;
+  mwSize *jcS; // column pointer for sparse matrix S
+  int *irank;  // inverse rank array of length len
+
+  mwSize *jrS; // accumulated "pessimistic" row counter
+  int *rank;   // rank-array for rows
+  int *hcol;   // cache memory for columns
+
+  // Part 1: count and accumulate indices to rows
+  StartTime;
+  jrS = mxCalloc(M+1,sizeof(jrS[0]));
+  for (int i = 0; i < len; i++) jrS[ii[i]]++;
+  for (int r = 2; r <= M; r++) jrS[r] += jrS[r-1];
+  StopTime;
+  GetTime(time_vec[1]);
+#ifdef PRINT_INTERMEDIATE
+  mexPrintf("jrS = [");
+  for (int r = 0; r <= M; r++) mexPrintf("%d,",jrS[r]);
+  mexPrintf("]\n\n");
+#endif
+
+  // Part 2: build rank with the active use of jrS
+  StartTime;
+  rank = mxMalloc(len*sizeof(rank[0]));
+  jrS--; /* (unit-offset in ii) */
+  for (int i = 0; i < len; i++) rank[jrS[ii[i]]++] = i;
+  // rank now allows for row-wise traversal
+  StopTime;
+  GetTime(time_vec[2]);
+#ifdef PRINT_INTERMEDIATE
+  mexPrintf("rank = [");
+  for (int i = 0; i < len; i++) mexPrintf("%d,",rank[i]);
+  mexPrintf("]\n");
+  mexPrintf("jrS = [*,");
+  for (int r = 1; r <= M+1; r++) mexPrintf("%d,",jrS[r]);
+  mexPrintf("]\n\n");
+#endif
+
+  /* Part 3: loop over input and make each column unique with respect
+     to rowindices, building both an index vector irank and the final
+     column pointer at the same time */
+  StartTime;
+  jcS = mxCalloc(N+1,sizeof(jcS[0]));
+  hcol = mxCalloc(N,sizeof(hcol[0]));
+  hcol--; /* (unit-offset in jj) */
+  irank = mxMalloc(len*sizeof(irank[0]));
+  for (int row = 1,i = 0; row <= M; row++)
+    for ( ; i < jrS[row]; i++) {
+      const int ixijs = rank[i]; // index into input data triplet (ii,jj,sr)
+      const int col = jj[ixijs]; // column index
+
+      // new element?
+      if (hcol[col] < row) {
+	hcol[col] = row; // remembered by the row index
+	jcS[col]++;      // count it
+      }
+
+      // irank keeps track of where it should go
+      irank[ixijs] = jcS[col]-1;
+    }
+  mxFree(++hcol);
+  mxFree(rank);
+  mxFree(++jrS);
+  StopTime;
+  GetTime(time_vec[3]);
+#ifdef PRINT_INTERMEDIATE
+  mexPrintf("irank = [");
+  for (int i = 0; i < len; i++) mexPrintf("%d,",irank[i]);
+  mexPrintf("]\n");
+  mexPrintf("jcS = [");
+  for (int c = 0; c <= N; c++) mexPrintf("%d,",jcS[c]);
+  mexPrintf("]\n\n");
+#endif
+  
+  // Part 4: accumulate pointer to columns
+  StartTime;
+  for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+  // irank must account for the previous accumulation
+  jcS--; /* (again, unit-offset in jj) */
+  for (int i = 0; i < len; i++) irank[i] += jcS[jj[i]];
+  jcS++;
+  StopTime;
+  GetTime(time_vec[4]);
+#ifdef PRINT_INTERMEDIATE
+  mexPrintf("irank = [");
+  for (int i = 0; i < len; i++) mexPrintf("%d,",irank[i]);
+  mexPrintf("]\n");
+  mexPrintf("jcS = [");
+  for (int c = 0; c <= N; c++) mexPrintf("%d,",jcS[c]);
+  mexPrintf("]\n\n");
+#endif
+
+  // allocate output
+  if (Nzmax == -1)
+    Nzmax = jcS[N];
+  else if (Nzmax < jcS[N]) {
+    mxFree(irank);
+    mxFree(jcS);
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  }
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  // set the column pointer
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS);
+
+  // insert the data
+  StartTime;
+  sparse_insert(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  StopTime;
+  GetTime(time_vec[5]);
+
+  mxFree(irank);
+  return S;
+}
+/*------------------------------------------------------------------------*/
+#else // _OPENMP
+/*------------------------------------------------------------------------*/
+mxArray *sparse(const int *ii,const int *jj,
+		const double *sr,const double *si,
+		int smod,int sdiv,
+		int len,int M,int N,int Nzmax)
+/* This is the OpenMP-version of the sparse function above. */
+{
+  // output
+  mxArray *S;
+  mwSize **jcS; // column pointer, one per thread
+  mwSize *jcS_; // final column pointer
+  int *irank;   // inverse rank array of length len
+  int *irankP;  // permuted version of irank
+
+  mwSize **jrS; // accumulated "pessimistic" row counter
+  int *rank;    // rank-array for rows
+
+  // Part 1: count and accumulate indices to rows
+  StartTime;
+  const int nThreads = omp_get_max_threads();
+  jrS = mxMalloc((nThreads+1)*sizeof(jrS[0]));
+  for (int k = 0; k <= nThreads; k++) {
+    jrS[k] = mxCalloc(M+1,sizeof(jrS[k][0]));
+    jrS[k]--; /* (unit-offset in ii) */
+  }
+
+#pragma omp parallel
+{
+  const int myId = omp_get_thread_num();
+  const int istart = len*myId/nThreads;
+  const int iend = len*(myId+1)/nThreads;
+  for (int i = istart; i < iend; i++)
+    jrS[myId+1][ii[i]]++;
+
+#pragma omp barrier
+
+  // accumulate jrS over the threads
+#pragma omp for
+  for (int r = 1; r <= M; r++)
+    for (int k = 1; k < nThreads; k++)
+      jrS[k+1][r] += jrS[k][r];
+
+  // serial accumulation in jrS[0]
+#pragma omp single
+  for (int r = 1; r <= M; r++)
+    jrS[0][r+1] += jrS[0][r]+jrS[nThreads][r];
+
+  // determine a private jrS for each thread
+#pragma omp for
+  for (int r = 1; r <= M; r++)
+    for (int k = 1; k < nThreads; k++)
+      jrS[k][r] += jrS[0][r];
+} // end parallel
+  StopTime;
+  GetTime(time_vec[1]);
+
+  // Part 2: build rank with the active use of jrS
+  StartTime;
+  rank = mxMalloc(len*sizeof(rank[0]));
+
+#pragma omp parallel
+{
+  const int myId = omp_get_thread_num();
+  const int istart = len*myId/nThreads;
+  const int iend = len*(myId+1)/nThreads;
+  for (int i = istart; i < iend; i++)
+    rank[jrS[myId][ii[i]]++] = i;
+  // rank now allows for row-wise traversal
+} // end parallel
+  StopTime;
+  GetTime(time_vec[2]);
+
+  /* Part 3: loop over input and make each column unique with respect
+     to rowindices, building both a permuted index vector irankP and
+     the final column pointer at the same time */
+  StartTime;
+  jcS = mxMalloc((nThreads+1)*sizeof(jcS[0]));
+  for (int k = 0; k <= nThreads; k++)
+    jcS[k] = mxCalloc(N+1,sizeof(jcS[k][0]));
+  irankP = mxMalloc(len*sizeof(irankP[0]));
+  if (2*(smod == len)+(sdiv == 1) != 3)
+    // *** case not fully implemented
+    irank = mxMalloc(len*sizeof(irank[0]));
+
+#pragma omp parallel
+{
+  int *hcol; // cache memory for columns
+#pragma omp critical
+  hcol = mxCalloc(N,sizeof(hcol[0]));
+  hcol--; /* (unit-offset in jj) */
+
+  const int myId = omp_get_thread_num();
+  const int rstart = 1+M*myId/nThreads;
+  const int rend = M*(myId+1)/nThreads;
+  int istart;
+  if (rstart == 1)
+    istart = 0;
+  else
+    istart = jrS[nThreads-1][rstart-1];
+
+  // loop over segment of row indices
+  for (int row = rstart,i = istart; row <= rend; row++)
+    // loop over single row
+    for ( ; i < jrS[nThreads-1][row]; i++) {
+      const int ixijs = rank[i]; // index into input data triplet (ii,jj,sr)
+      const int col = jj[ixijs]; // column index
+
+      // new element?
+      if (hcol[col] < row) {
+      	hcol[col] = row;  // store row index
+      	jcS[myId+1][col]++; // count it
+      }
+
+      // irankP keeps track of where it should go
+      irankP[i] = jcS[myId+1][col]-1;
+    }
+#pragma omp critical
+  mxFree(++hcol);
+
+#pragma omp barrier
+
+  // accumulate jcS over the threads
+#pragma omp for
+  for (int c = 1; c <= N; c++)
+    for (int k = 1; k < nThreads; k++)
+      jcS[k+1][c] += jcS[k][c];
+
+  // serial accumulation in jcS[0]
+#pragma omp single
+{
+  for (int c = 1; c <= N; c++)
+    jcS[0][c] += jcS[0][c-1]+jcS[nThreads][c];
+  jcS[0]--; /* (unit-offset in jj) */
+}
+
+  // determine a private jcS for each thread
+#pragma omp for
+  for (int c = 1; c <= N; c++)
+    for (int k = 1; k < nThreads; k++)
+      jcS[k][c] += jcS[0][c];
+
+  // irankP must now account to these changes to jcS
+  if (rend >= 1)
+    for (int i = istart; i < jrS[nThreads-1][rend]; i++)
+      irankP[i] += jcS[myId][jj[rank[i]]];
+
+  if (2*(smod == len)+(sdiv == 1) != 3) {
+    // *** case not fully implemented
+    if (rend >= 1)
+      for (int i = istart; i < jrS[nThreads-1][rend]; i++)
+	irank[rank[i]] = irankP[i];
+  }
+} // end parallel
+  StopTime;
+  GetTime(time_vec[3]);
+
+  // Part 4: final accumulation of jcS
+  StartTime;
+  /* (nil in this version) */
+  StopTime;
+  GetTime(time_vec[4]);
+
+  // allocate output
+  jcS[0]++;
+  if (Nzmax == -1)
+    Nzmax = jcS[0][N];
+  else if (Nzmax < jcS[0][N])
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  // set the column pointer
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS[0]);
+  // free the remaining pointers
+  for (int k = 1; k <= nThreads; k++)
+    mxFree(jcS[k]);
+  mxFree(jcS);
+
+  // insert the data
+  StartTime;
+  if (2*(smod == len)+(sdiv == 1) == 3)
+    sparse_insert(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		  irankP,rank,jrS[nThreads-1],ii,sr,si,smod,sdiv,len,M);
+  else
+    sparse_insert(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		  irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  StopTime;
+  GetTime(time_vec[5]);
+
+  // *** case not fully implemented
+  if (2*(smod == len)+(sdiv == 1) != 3)
+    mxFree(irank);
+  mxFree(irankP);
+
+  return S;
+}
+#endif // _OPENMP
+/*------------------------------------------------------------------------*/
+
+/* table is only allowed to become about 70% full */
+const double HASH_LOAD = 1.0/0.70;
+
+size_t hashsize(size_t len)
+/* Size of hash table. This is simply
+   exp2(ceil(log2(HASH_LOAD*len))). */
+{
+  size_t ans = 1;
+
+  len = ceil(HASH_LOAD*len);
+  do ans <<= 1; while (len >>= 1);
+
+  return ans;
+}
+/*------------------------------------------------------------------------*/
+uint32_t hash(int ii,int jj)
+/* Computes a hashkey from an integer pair (ii,jj).
+
+   Adapted from a code by Paul Hsieh. See
+   http://www.azillionmonkeys.com/qed/hash.html for further details.
+*/
+{
+  uint32_t hash = jj,temp;
+
+  /* main loop executed twice */
+  hash += ii&0x0000FFFF;
+  temp = ((ii&0xFFFF0000) >> 5)^hash;
+  hash = (hash << 16)^temp;
+  hash += hash >> 11;
+
+  hash += jj&0x0000FFFF;
+  temp = ((jj&0xFFFF0000) >> 5)^hash;
+  hash = (hash << 16)^temp;
+  hash += hash >> 11;
+
+  /* "avalanching" of final 127 bits */
+  hash ^= hash << 3;
+  hash += hash >> 5;
+  hash ^= hash << 2;
+  hash += hash >> 15;
+  hash ^= hash << 10;
+
+  return hash;
+}
+/*------------------------------------------------------------------------*/
+mxArray *sparse_nosort(const int *ii,const int *jj,
+		       const double *sr,const double *si,
+		       int smod,int sdiv,
+		       int len,int M,int N,int Nzmax)
+/* This function does the same job as sparse() above except that the
+   result is *not* sorted w.r.t. rowindices. The sparse matrix thus
+   constructed is therefore a sparse matrix which is valid in many,
+   but not all situations in Matlab.
+
+   The allocation is still exact (i.e. nzmax(S) = nnz(S)) and the
+   memory demand is (at peak and for sufficiently large output)
+   <input>+int[N+2]+int[len]+<output>, where len is the size of the
+   input index set. */
+{
+
+#define SPS1 /* new version with hashing */
+#undef SPS2  /* updated version without hashing */
+
+#ifdef SPS1
+
+  /* output */
+  mxArray *S;
+  mwSize *jcS;
+
+  /* rank- and hash-table */
+  int *irank,*hash_tb;
+  size_t hash_sz = hashsize(len);
+
+  jcS = mxCalloc(N+1,sizeof(jcS[0]));
+  irank = mxMalloc(len*sizeof(irank[0]));
+  hash_tb = mxCalloc(hash_sz,sizeof(hash_tb[0]));
+
+  for (int i = 0; i < len; i++) {
+    const int row = ii[i],col = jj[i];
+    uint32_t hashkey = hash(row,col)&(hash_sz-1);
+
+    /* collision? */
+    if (hash_tb[hashkey] != 0 && 
+	!(row == ii[hash_tb[hashkey]-1] && 
+	  col == jj[hash_tb[hashkey]-1])) {
+      /* a fairly independent and odd (incremental) hashkey */
+      const uint32_t hashkey2nd = hash(col,row)|1;
+
+      /* rehash until empty slot found */
+      do
+	hashkey = (hashkey+hashkey2nd)&(hash_sz-1);
+      while (hash_tb[hashkey] != 0 && 
+	     !(row == ii[hash_tb[hashkey]-1] && 
+	       col == jj[hash_tb[hashkey]-1]));
+    }
+
+    /* fill in table */
+    if (hash_tb[hashkey] == 0) {
+      hash_tb[hashkey] = i+1;
+      irank[i] = jcS[col]++;
+    }
+    else
+      irank[i] = irank[hash_tb[hashkey]-1];
+  }
+  mxFree(hash_tb);
+  for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+  /* account for the accumulation of indices */
+  jcS--;
+  for (int i = 0; i < len; i++) irank[i] += jcS[jj[i]];
+  jcS++;
+
+  /* allocate output */
+  if (Nzmax == -1)
+    Nzmax = jcS[N];
+  else if (Nzmax < jcS[N]) {
+    mxFree(irank);
+    mxFree(jcS);
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  }
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  /* set the column pointer */
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS);
+
+  /* insert the data */
+  sparse_insert(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  mxFree(irank);
+
+  return S;
+
+#elif defined(SPS2)
+
+  /* output */
+  mxArray *S;
+  int *jcS;
+
+  /* rank-tables and help-pointer for rowindices */
+  int *rank,*irank,*hrow;
+
+  /* determine a "pessimistic" column pointer */
+  jcS = mxCalloc(N+2,sizeof(jcS[0]));
+  jcS++; /* allows for a fast restore */
+  for (int i = 0; i < len; i++) jcS[jj[i]]++;
+  for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+  /* produce the rank-table so that the data may be traversed in order
+     with respect to columns */
+  rank = mxMalloc(len*sizeof(rank[0]));
+  jcS--; /* restore jcS and account for unit offset in jj */
+  for (int i = 0; i < len; i++) rank[jcS[jj[i]]++] = i;
+
+  /* allocate index- and help-pointers */
+  irank = mxMalloc(len*sizeof(irank[0]));
+  hrow = mxCalloc(M,sizeof(hrow[0]));
+  hrow--; /* unit offset in ii */
+
+  /* loop over input and make each column unique with respect to
+     rowindices, building both an index vector irank and the final
+     column pointer at the same time */
+  for (int col = 1,i = 0; col <= N; col++) {
+    const int begin = jcS[col-1],end = jcS[col];
+    jcS[col] = begin;
+    for ( ; i < end; i++) {
+      const int ixijs = rank[i];
+      const int row = ii[ixijs];
+
+      /* new element; mark and count it */
+      if (hrow[row] <= begin) hrow[row] = ++jcS[col];
+
+      /* remember where it should go */
+      irank[ixijs] = hrow[row]-1;
+    }
+  }
+  mxFree(++hrow);
+  mxFree(rank);
+
+  /* allocate output */
+  if (Nzmax == -1)
+    Nzmax = jcS[N];
+  else if (Nzmax < jcS[N]) {
+    mxFree(irank);
+    mxFree(jcS);
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  }
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  /* set the column pointer */
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS);
+
+  /* insert the data */
+  sparse_insert(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  mxFree(irank);
+
+  return S;
+
+#endif
+}
+/*------------------------------------------------------------------------*/
+/*- General versions -----------------------------------------------------*/
+/*------------------------------------------------------------------------*/
+mxArray *gsparse(const int *ii,int imod,
+		 const int *jj,int jdiv,
+		 const double *sr,const double *si,
+		 int smod,int sdiv,
+		 int len,int M,int N,int Nzmax)
+/* Derived from sparse(). */
+{
+
+#define GSP1 /* new version */
+#undef GSP2  /* (very) old version */
+
+#ifdef GSP1
+
+  mxArray *S;
+  mwSize *jcS;
+  mwIndex *irS;
+
+  int *rank,*irank,*hcol;
+
+  irS = mxCalloc(M+1,sizeof(irS[0]));
+  /* for (int i = 0; i < len; i++) irS[ii[i%imod]]++; */
+  if (imod == len)
+    for (int i = 0; i < len; i++) irS[ii[i]]++;
+  else
+    for (int i = 0; i < imod; i++) irS[ii[i]] += len/imod;
+  for (int r = 2; r <= M; r++) irS[r] += irS[r-1];
+
+  rank = mxMalloc(len*sizeof(rank[0]));
+  irS--;
+  for (int i = 0; i < len; i++) rank[irS[ii[i%imod]]++] = i;
+
+  jcS = mxCalloc(N+1,sizeof(jcS[0]));
+  hcol = mxCalloc(N,sizeof(hcol[0]));
+  hcol--;
+  irank = mxMalloc(len*sizeof(irank[0]));
+  for (int row = 1,i = 0; row <= M; row++)
+    for ( ; i < irS[row]; i++) {
+      const int ixijs = rank[i];
+      const int col = jj[ixijs/jdiv];
+
+      if (hcol[col] < row) {
+	hcol[col] = row;
+	jcS[col]++;
+      }
+
+      irank[ixijs] = jcS[col]-1;
+    }
+  mxFree(++irS);
+  mxFree(rank);
+  mxFree(++hcol);
+  for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+  jcS--;
+  /* for (int i = 0; i < len; i++) irank[i] += jcS[jj[i/jdiv]]; */
+  if (jdiv == 1)
+    for (int i = 0; i < len; i++) irank[i] += jcS[jj[i]];
+  else
+    for (int c = 0, i = 0; c < len/jdiv; c++)
+      for ( ; i < (c+1)*jdiv; i++) irank[i] += jcS[jj[c]];
+  jcS++;
+
+  if (Nzmax == -1)
+    Nzmax = jcS[N];
+  else if (Nzmax < jcS[N]) {
+    mxFree(irank);
+    mxFree(jcS);
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  }
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS);
+
+  if (imod == len)
+    sparse_insert(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		  irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  else
+    sparse_inserti(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		   irank,ii,imod,sr,si,smod,sdiv,len);
+  mxFree(irank);
+
+  return S;
+
+#elif defined(GSP2)
+
+  mxArray *S;
+  int *jcS,*irS;
+
+  int *rank,*irank;
+
+  jcS = mxCalloc(N+2,sizeof(jsC[0]));
+  jcS++;
+  irS = mxCalloc(M+1,sizeof(irS[0]));
+
+  for (int i = 0; i < len; i++) {
+    jcS[jj[i/jdiv]]++;
+    irS[ii[i%imod]]++;
+  }
+  for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+  for (int r = 2; r <= M; r++) irS[r] += irS[r-1];
+
+  irank = mxMalloc(len*sizeof(irank[0]));
+  irS--;
+  for (int i = 0; i < len; i++)
+    irank[irS[ii[i%imod]]++] = i;
+  mxFree(++irS);
+
+  rank = mxMalloc(len*sizeof(rank[0]));
+  jcS--;
+  for (int i = 0; i < len; i++)
+    rank[jcS[jj[irank[i]/jdiv]]++] = irank[i];
+
+  for (int i = 0,col = jj[rank[0]/jdiv],row; ; ) {
+    const int end = jcS[col];
+
+    irank[rank[i]] = jcS[col-1];
+    jcS[col] = jcS[col-1]+1;
+    row = ii[rank[i]%imod];
+
+    for (i++; i < end; i++) {
+      const int ixijs = rank[i];
+
+      if (row < ii[ixijs%imod]) {
+	row = ii[ixijs%imod];
+	jcS[col]++;
+      }
+
+      irank[ixijs] = jcS[col]-1;
+    }
+
+    if (i < len)
+      for (col++; col < jj[rank[i]/jdiv]; col++)
+	jcS[col] = jcS[col-1];
+    else {
+      for (col++; col <= N; col++) jcS[col] = jcS[col-1];
+      break;
+    }
+  }
+  mxFree(rank);
+
+  if (Nzmax == -1)
+    Nzmax = jcS[N];
+  else if (Nzmax < jcS[N]) {
+    mxFree(irank);
+    mxFree(jcS);
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  }
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS);
+
+  if (imod == len)
+    sparse_insert(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		  irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  else
+    sparse_inserti(mxGetIr(S),mxGetPr(S),mxGetPi(S),
+		   irank,ii,imod,sr,si,smod,sdiv,len);
+  mxFree(irank);
+
+  return S;
+
+#endif
+
+}
+/*------------------------------------------------------------------------*/
+mxArray *gsparse_nosort(const int *ii,int imod,
+			const int *jj,int jdiv,
+			const double *sr,const double *si,
+			int smod,int sdiv,
+			int len,int M,int N,int Nzmax)
+/* Derived from sparse_nosort(). */
+{
+
+#define GSPS1 /* new version using hashing */
+#undef GSPS2  /* (very) old version without hashing */
+
+#ifdef GSPS1
+
+  /* output */
+  mxArray *S;
+  mwSize *jcS;
+
+  /* rank- and hash-table */
+  int *irank,*hash_tb;
+  size_t hash_sz = hashsize(len);
+
+  jcS = mxCalloc(N+1,sizeof(jcS[0]));
+  irank = mxMalloc(len*sizeof(irank[0]));
+  hash_tb = mxCalloc(hash_sz,sizeof(hash_tb[0]));
+
+  for (int i = 0; i < len; i++) {
+    const int row = ii[i%imod],col = jj[i/jdiv];
+    uint32_t hashkey = hash(row,col)&(hash_sz-1);
+
+    /* collision? */
+    if (hash_tb[hashkey] != 0 && 
+	!(row == ii[(hash_tb[hashkey]-1)%imod] && 
+	  col == jj[(hash_tb[hashkey]-1)/jdiv])) {
+      /* a fairly independent and odd (incremental) hashkey */
+      const uint32_t hashkey2nd = hash(col,row)|1;
+
+      /* rehash until empty slot found */
+      do
+	hashkey = (hashkey+hashkey2nd)&(hash_sz-1);
+      while (hash_tb[hashkey] != 0 && 
+	     !(row == ii[(hash_tb[hashkey]-1)%imod] && 
+	       col == jj[(hash_tb[hashkey]-1)/jdiv]));
+    }
+
+    /* fill in table */
+    if (hash_tb[hashkey] == 0) {
+      hash_tb[hashkey] = i+1;
+      irank[i] = jcS[col]++;
+    }
+    else
+      irank[i] = irank[hash_tb[hashkey]-1];
+  }
+  mxFree(hash_tb);
+  for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+  /* account for the accumulation of indices */
+  jcS--;
+  /* for (int i = 0; i < len; i++) irank[i] += jcS[jj[i/jdiv]]; */
+  if (jdiv == 1)
+    for (int i = 0; i < len; i++) irank[i] += jcS[jj[i]];
+  else
+    for (int c = 0, i = 0; c < len/jdiv; c++)
+      for ( ; i < (c+1)*jdiv; i++) irank[i] += jcS[jj[c]];
+  jcS++;
+
+  /* allocate output */
+  if (Nzmax == -1)
+    Nzmax = jcS[N];
+  else if (Nzmax < jcS[N]) {
+    mxFree(irank);
+    mxFree(jcS);
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  }
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  /* set the column pointer */
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS);
+
+  /* insert the data */
+  if (imod == len)
+    sparse_insert(mxGetIr(S), mxGetPr(S),mxGetPi(S),
+		  irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  else
+    sparse_inserti(mxGetIr(S), mxGetPr(S),mxGetPi(S),
+		   irank,ii,imod,sr,si,smod,sdiv,len);
+  mxFree(irank);
+
+  return S;
+
+#elif defined(GSPS2)
+
+  mxArray *S;
+  int *jcS;
+
+  int *rank,*irank,*hrow;
+
+  jcS = mxCalloc(N+2,sizeof(jsC[0]));
+  jcS++;
+  for (int i = 0; i < len; i++) jcS[jj[i/jdiv]]++;
+  for (int c = 2; c <= N; c++) jcS[c] += jcS[c-1];
+
+  rank = mxMalloc(len*sizeof(rank[0]));
+  jcS--;
+  for (int i = 0; i < len; i++) rank[jcS[jj[i/jdiv]]++] = i;
+
+  irank = mxMalloc(len*sizeof(irank[0]));
+  hrow = mxCalloc(M,sizeof(hrow[0]));
+  hrow--;
+
+  for (int i = 0,col = jj[rank[0]/jdiv]; ; ) {
+    const int begin = jcS[col-1];
+    const int end = jcS[col];
+    for (jcS[col] = begin; i < end; i++) {
+      const int row = ii[rank[i]%imod];
+
+      if (hrow[row] <= begin) hrow[row] = ++jcS[col];
+
+      irank[rank[i]] = hrow[row]-1;
+    }
+
+    if (i < len)
+      for (col++; col < jj[rank[i]/jdiv]; col++) jcS[col] = jcS[col-1];
+    else {
+      for (col++; col <= N; col++) jcS[col] = jcS[col-1];
+      break;
+    }
+  }
+  mxFree(++hrow);
+  mxFree(rank);
+
+  if (Nzmax == -1)
+    Nzmax = jcS[N];
+  else if (Nzmax < jcS[N]) {
+    mxFree(irank);
+    mxFree(jcS);
+    mexErrMsgIdAndTxt("fsparse:e16","Allocation limited by caller: "
+		      "sparse matrix does not fit.");
+  }
+  S = mxCreateSparse(0,0,Nzmax,si == NULL ? mxREAL : mxCOMPLEX);
+  mxSetM(S,M);
+  mxSetN(S,N);
+
+  mxFree(mxGetJc(S));
+  mxSetJc(S,jcS);
+
+  if (imod == len)
+    sparse_insert(mxGetIr(S), mxGetPr(S),mxGetPi(S),
+		  irank,0,0,ii,sr,si,smod,sdiv,len,M);
+  else
+    sparse_inserti(mxGetIr(S), mxGetPr(S),mxGetPi(S),
+		   irank,ii,imod,sr,si,smod,sdiv,len);
+  mxFree(irank);
+
+  return S;
+
+#endif
+
+}
+/*------------------------------------------------------------------------*/
diff --git a/stenglib/Fast/source/make.m b/stenglib/Fast/source/make.m
new file mode 100644
index 00000000..2bb8f9ea
--- /dev/null
+++ b/stenglib/Fast/source/make.m
@@ -0,0 +1,465 @@
+function make(varargin)
+%MAKE Makefile for FAST.
+%   MAKE by itself makes FAST with default arguments.
+%
+%   MAKE(...) accepts additional arguments as property/value-pairs.
+%
+%   Property     Value/{Default}     Description
+%   -----------------------------------------------------------------
+%   OpenMP       Boolean {false}     Turns OpenMP-compilation
+%                                    on/off. Only affects FSPARSE.
+%
+%   fsparseonly  Boolean {false}     Only compiles FSPARSE.
+%
+%   fsparsetime  Boolean {false}     Timing syntax for FSPARSE.
+%
+%   silent       Boolean {false}     Turns information display on/off.
+
+% Johannes Dufva 2020-10-30 (mexw64, 9.7)
+% S. Engblom 2019-01-23 (mexmaci64, mexa64, 9.6)
+% S. Engblom 2016-11-23 (spreplace)
+% S. Engblom 2015-03-23 (mexa64, 8.4)
+% S. Engblom 2015-01-19 (mexmaci64, 8.4)
+% S. Engblom 2013-12-02 (OpenMP, fsparse)
+% S. Engblom 2012-04-16 (mexmaci64, 7.11)
+% S. Engblom 2011-04-17 (mexmaci64, 7.10)
+% S. Engblom 2011-03-07 (mexa64, 7.11)
+% S. Engblom 2010-09-23 (mexs64, 7.7)
+% S. Engblom 2010-02-02 (mexa64, 7.8)
+% S. Engblom 2010-01-12 (mexmaci)
+% S. Engblom 2007-05-17 (mexs64)
+% S. Engblom 2006-11-09 (mexa64)
+% S. Engblom 2005-03-22 (mexmac)
+
+% Use '-DmwIndex=int' for platforms where mex is not automatically
+% linked against the library defining the mwIndex type.
+
+% default options
+optdef.openmp = false;
+optdef.fsparseonly = false;
+optdef.fsparsetime = false;
+optdef.silent = false;
+
+% merge defaults with actual inputs
+if nargin > 0
+  opts = struct(varargin{:});
+  fn = fieldnames(opts);
+  for i = 1:length(fn)
+    optdef = setfield(optdef,fn{i},getfield(opts,fn{i}));
+  end
+end
+opts = optdef;
+if opts.openmp
+  if ~opts.silent
+    fprintf(1,'Compiling FSPARSE with OpenMP.\n');
+  end
+end
+if opts.fsparseonly
+  if ~opts.silent
+    fprintf(1,'Compiling FSPARSE only.\n');
+  end
+end
+if opts.fsparsetime
+  FSPARSEDEF = '-DFSPARSE_TIME';
+  if ~opts.silent
+    fprintf(1,'Compiling with #define FSPARSE_TIME.\n');
+  end
+else
+  FSPARSEDEF = '';
+  if ~opts.silent
+    fprintf(1,'Compiling with #undef FSPARSE_TIME.\n');
+  end
+end
+
+% Note that the OpenMP version of fsparse is a beta-release. Not all
+% cases of the code has been parallelized. Also, the OpenMP make has
+% only been implemented for some platforms. The important lines are:
+%
+% clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/glnxa64 -lmx ' ...
+%          '-lmex'];
+% mex('-largeArrayDims',clibs, ...
+%     ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+%     '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+%     '-outdir',s,[s '/source/fsparse.c']);
+
+s = pwd;
+mx = mexext;
+ver = version;
+
+% first is the fsparse-only compilation
+if opts.fsparseonly
+  if strcmp(mx,'mexa64')
+    if ver(1) == '7'
+      if ~strncmp(ver,'7.2',3) && ~strncmp(ver,'7.8',3) && ...
+            ~strncmp(ver,'7.11',4) && ~strncmp(ver,'7.13',4)
+        warning(['Extension .' mexext [' tested with Matlab version(s) ' ...
+                            '7.2, 7.8, 7.11 and 7.13 only.']]);
+      end
+      if ~strncmp(ver,'7.11',4)
+        if strncmp(ver,'7.2',3)
+          % should be an easy fix:
+          if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+          mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+               '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+              '-outdir',s,[s '/source/fsparse.c']);
+        else
+          if opts.openmp
+            clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/glnxa64 -lmx ' ...
+                     '-lmex'];
+            mex('-largeArrayDims',clibs, ...
+                ['CFLAGS=-fopenmp -O5 -fPIC -std=c99 ' ...
+                 '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+                '-outdir',s,[s '/source/fsparse.c']);
+          else
+            mex('-largeArrayDims', ...
+                ['CFLAGS=-fPIC -O5 -std=c99 ' ...
+                 '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+                '-outdir',s,[s '/source/fsparse.c']);
+          end
+        end
+      else
+        % should be an easy fix:
+        if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+        % apparently, the linker path is not properly set up on 7.11:
+        mex('-largeArrayDims', ...
+            ['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+            ['-L' matlabroot '/sys/os/glnxa64'], ...
+            '-outdir',s,[s '/source/fsparse.c']);
+      end
+    else
+      if ~strncmp(ver,'8.4',3) && ~strncmp(version,'9.6',3)
+        warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+                 '8.4 only.']);
+      end
+      
+      % apparently, the linker path is not properly set up on 8.4 (also a
+      % soft link libstdc++.so inside [matlabroot '/sys/os/glnxa64']
+      % is required to point to the correct shared library, in this
+      % case libstdc++.so.6.0.17)
+      if opts.openmp
+        clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/glnxa64 -lmx ' ...
+                 '-lmex'];
+        mex('-largeArrayDims',clibs, ...
+            ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+            ['-L' matlabroot '/sys/os/glnxa64'], ...
+            '-outdir',s,[s '/source/fsparse.c']);
+      else
+        mex('-largeArrayDims', ...
+            ['CFLAGS=-fPIC O5 -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+            ['-L' matlabroot '/sys/os/glnxa64'], ...
+            '-outdir',s,[s '/source/fsparse.c']);
+      end
+    end
+  elseif strcmp(mx,'mexmaci64')
+    if ver(1) == '7'
+      if ~strncmp(ver,'7.10',4) && ~strncmp(ver,'7.11',4) && ...
+            ~strncmp(ver,'7.14',4)
+        warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+                 '7.10 and 7.11 only.']);
+      end
+      if opts.openmp
+        clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/maci64 -lmx ' ...
+                 '-lmex'];
+        mex('-largeArrayDims',clibs, ...
+            ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+            '-outdir',s,[s '/source/fsparse.c']);
+      else
+        mex('-largeArrayDims', ...
+            ['CC=gcc -std=c99 -fast ',FSPARSEDEF], ...
+            '-outdir',s,[s '/source/fsparse.c']);
+      end
+    else
+      if opts.openmp, warning('Compilation of OpenMP not (yet?) supported for this platform.'); end 
+      if ~strncmp(ver,'8.4',3) && ~strncmp(version,'9.6',3)
+        warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+		 '8.4 and 9.6 only.']);
+      end
+      if opts.openmp
+        % no harm in trying (await update of Clang?)
+        clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/maci64 -lmx ' ...
+                 '-lmex'];
+        mex('-largeArrayDims',clibs, ...
+            ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+            '-outdir',s,[s '/source/fsparse.c']);
+      else
+        mex('-largeArrayDims', ...
+            ['CFLAGS=-Wno-logical-op-parentheses -std=c99 ',FSPARSEDEF],'-outdir',s,[s '/source/fsparse.c']);
+      end
+    end
+  else
+    error('FSPARSE-only compilation not implemented for this platform.');
+  end
+  return;
+end
+
+% main compilation
+if strcmp(mx,'mexglx')
+  if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+  if ~strncmp(ver,'7.5',3) && ~strncmp(ver,'7.8',3)
+    warning(['Extension .' mexext [' tested with Matlab version(s) ' ...
+                        '7.5 and 7.8 only.']]);
+  end
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/clenshaw.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/fsetop.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+        '-outdir',s,[s '/source/mexfrepmat.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/powerseries.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/sppmul.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/spreplace.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+      '-outdir',s,[s '/source/fsparse.c']);
+elseif strcmp(mx,'mexa64')
+  if ver(1) == '7'
+    if ~strncmp(ver,'7.2',3) && ~strncmp(ver,'7.8',3) && ...
+          ~strncmp(ver,'7.11',4) && ~strncmp(ver,'7.13',4)
+      warning(['Extension .' mexext [' tested with Matlab version(s) ' ...
+                          '7.2, 7.8, 7.11 and 7.13 only.']]);
+    end
+    if ~strncmp(ver,'7.11',4)
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/clenshaw.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/fsetop.c']);
+      if strncmp(ver,'7.2',3)
+        % should be an easy fix:
+        if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+        mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions'], ...
+            '-outdir',s,[s '/source/mexfrepmat.c']);
+        mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+            '-outdir',s,[s '/source/fsparse.c']);
+      else
+        mex('-largeArrayDims', ...
+            ['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions'], ...
+            '-outdir',s,[s '/source/mexfrepmat.c']);
+        if opts.openmp
+          clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/glnxa64 -lmx ' ...
+                   '-lmex'];
+          mex('-largeArrayDims',clibs, ...
+              ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+               '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+              '-outdir',s,[s '/source/fsparse.c']);
+        else
+          mex('-largeArrayDims', ...
+              ['CFLAGS=-fPIC  -O5 -fno-omit-frame-pointer -std=c99 ' ...
+               '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+              '-outdir',s,[s '/source/fsparse.c']);
+        end
+      end
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/powerseries.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/sppmul.c']);
+      mex('-largeArrayDims', ...
+	  ['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/spreplace.c']);
+    else
+      % should be an easy fix:
+      if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+      % apparently, the linker path is not properly set up on 7.11:
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/clenshaw.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/fsetop.c']);
+      mex('-largeArrayDims', ...
+          ['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/mexfrepmat.c']);
+      mex('-largeArrayDims', ...
+          ['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/fsparse.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/powerseries.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/sppmul.c']);
+      mex('-largeArrayDims', ...
+	  ['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/spreplace.c']);
+    end
+  else
+    if ~strncmp(ver,'8.4',3) && ~strncmp(version,'9.6',3)
+      warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+               '8.4 and 9.6 only.']);
+    end
+    
+    % apparently, the linker path is not properly set up on 8.4 (also a
+    % soft link libstdc++.so inside [matlabroot '/sys/os/glnxa64'] is
+    % required to point to the correct shared library, in this case
+    % libstdc++.so.6.0.17)
+    mex('CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/clenshaw.c']);
+    mex('CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/fsetop.c']);
+    mex('-largeArrayDims','CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/mexfrepmat.c']);
+    mex('CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/powerseries.c']);
+    mex('-largeArrayDims','CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/sppmul.c']);
+    mex('-largeArrayDims','CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/spreplace.c']);
+
+    if opts.openmp
+      clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/glnxa64 -lmx ' ...
+               '-lmex'];
+      mex('-largeArrayDims',clibs, ...
+          ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/fsparse.c']);
+    else
+      mex('-largeArrayDims', ...
+          ['CFLAGS=-fPIC -O5 -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/fsparse.c']);
+    end
+  end
+elseif strcmp(mx,'mexmac')
+  if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+  if ~strncmp(ver,'7.0',3)
+    warning(['Extension .' mexext ' tested with Matlab version(s) 7.0 only.']);
+  end
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/clenshaw.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/fsetop.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/mexfrepmat.c']);
+  mex(['CC=gcc -std=c99',FSPARSEDEF],'-outdir',s,[s '/source/fsparse.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/powerseries.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/sppmul.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/spreplace.c']);
+elseif strcmp(mx,'mexmaci')
+  if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+  if ~strncmp(ver,'7.8',3)
+    warning(['Extension .' mexext ' tested with Matlab version(s) 7.8 only.']);
+  end
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/clenshaw.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/fsetop.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/mexfrepmat.c']);
+  mex(['CC=gcc -std=c99 -fast',FSPARSEDEF],'-outdir',s,[s '/source/fsparse.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/powerseries.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/sppmul.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/spreplace.c']);
+elseif strcmp(mx,'mexmaci64')
+  if ver(1) == '7'
+    if ~strncmp(ver,'7.10',4) && ~strncmp(ver,'7.11',4) && ...
+          ~strncmp(ver,'7.14',4)
+      warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+               '7.10 and 7.11 only.']);
+    end
+    mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/clenshaw.c']);
+    mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/fsetop.c']);
+    mex('-largeArrayDims', ...
+        'CC=gcc -std=c99 -fast','-outdir',s,[s '/source/mexfrepmat.c']);
+    if opts.openmp
+      clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/maci64 -lmx ' ...
+               '-lmex'];
+      mex('-largeArrayDims',clibs, ...
+          ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+          '-outdir',s,[s '/source/fsparse.c']);
+    else
+      mex('-largeArrayDims', ...
+          ['CC=gcc -std=c99 -fast',FSPARSEDEF],'-outdir',s,[s '/source/fsparse.c']);
+    end
+    mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/powerseries.c']);
+    mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/sppmul.c']);
+    mex('-largeArrayDims', ...
+	'CC=gcc -std=c99 -fast','-outdir',s,[s '/source/spreplace.c']);
+  else
+    if opts.openmp, warning('Compilation of OpenMP not (yet?) supported for this platform.'); end 
+    if ~strncmp(ver,'8.4',3) && ~strncmp(version,'9.6',3)
+      warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+               '8.4 and 9.6 only.']);
+    end
+    mex('CFLAGS=-Wno-parentheses -std=c99','-outdir',s,[s '/source/clenshaw.c']);
+    mex('CFLAGS= -std=c99','-outdir',s,[s '/source/fsetop.c']);
+
+    mex('-largeArrayDims', ...
+        'CFLAGS= -std=c99','-outdir',s,[s '/source/mexfrepmat.c']);
+    if opts.openmp
+      % no harm in trying (await update of Clang?)
+      clibs = ['CLIBS=-lgomp -lm -L' matlabroot '/bin/maci64 -lmx ' ...
+               '-lmex'];
+      mex('-largeArrayDims',clibs, ...
+          ['CFLAGS=-fopenmp -O5 -fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions ' FSPARSEDEF], ...
+          '-outdir',s,[s '/source/fsparse.c']);
+    else
+      mex('-largeArrayDims', ...
+          ['CFLAGS=-Wno-logical-op-parentheses -std=c99 ',FSPARSEDEF],'-outdir',s,[s '/source/fsparse.c']);
+    end
+    mex('CFLAGS= -std=c99','-outdir',s,[s '/source/powerseries.c']);
+    mex('CFLAGS= -std=c99','-outdir',s,[s '/source/sppmul.c']);
+    mex('-largeArrayDims', ...
+	'CFLAGS= -std=c99','-outdir',s,[s '/source/spreplace.c']);
+  end
+elseif strcmp(mx,'mexs64')
+  if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+  if ~strncmp(ver,'7.7',3)
+    warning(['Extension .' mexext ' tested with Matlab version(s) 7.7 only.']);
+  end
+  mex('-outdir',s,[s '/source/clenshaw.c']);
+  mex('-DNO_STDINT','-outdir',s,[s '/source/fsetop.c']);
+  mex('-largeArrayDims','-outdir',s,[s '/source/mexfrepmat.c']);
+  mex('-largeArrayDims',['-DNO_STDINT ' FSPARSEDEF], ...
+      '-outdir',s,[s '/source/fsparse.c']);
+  mex('-outdir',s,[s '/source/powerseries.c']);
+  mex('-outdir',s,[s '/source/sppmul.c']);
+  mex('-largeArrayDims','-outdir',s,[s '/source/spreplace.c']);
+elseif strcmp(mx,'mexw64')
+  if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+  mex('-outdir',s,[s '/source/clenshaw.c']);
+  mex('-outdir',s,[s '/source/fsetop.c']);
+  mex('-outdir',s,[s '/source/mexfrepmat.c']);
+  mex(FSPARSEDEF,'-outdir',s,[s '/source/fsparse.c']);
+  mex('-outdir',s,[s '/source/powerseries.c']);
+  mex('-outdir',s,[s '/source/sppmul.c']);
+  mex('-largeArrayDims','-outdir',s,[s '/source/spreplace.c']);
+else
+  warning('New platform. Trying default make.');
+  if opts.openmp, warning('OpenMP not implemented for this platform.'); end
+  mex('-outdir',s,[s '/source/clenshaw.c']);
+  mex('-outdir',s,[s '/source/fsetop.c']);
+  mex('-outdir',s,[s '/source/mexfrepmat.c']);
+  mex(FSPARSEDEF,'-outdir',s,[s '/source/fsparse.c']);
+  mex('-outdir',s,[s '/source/powerseries.c']);
+  mex('-outdir',s,[s '/source/sppmul.c']);
+  mex('-largeArrayDims','-outdir',s,[s '/source/spreplace.c']);
+end
diff --git a/stenglib/Fast/source/mexfrepmat.c b/stenglib/Fast/source/mexfrepmat.c
new file mode 100644
index 00000000..aae1825b
--- /dev/null
+++ b/stenglib/Fast/source/mexfrepmat.c
@@ -0,0 +1,300 @@
+/* mexfrepmat.c */
+
+/* S. Engblom 2010-02-02 (Minor revision) */
+/* S. Engblom 2007-05-04 (Revision) */
+/* S. Engblom 2005-05-06 (Revision) */
+/* S. Engblom 2004-10-22 */
+
+#include <math.h>
+#include <string.h>
+#include <limits.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+// rmemcpy() below assumes this:
+#if CHAR_BIT != 8
+#error "Code assumes that 1 char is 1 byte."
+#endif
+
+/*------------------------------------------------------------------------*/
+
+// forward declarations
+void replicate(const char *prA,const char *piA,
+	       int ndimA,const mwSize *sizA,int lenA,
+	       char *prB,char *piB,
+	       int ndimB,const mwSize *sizB,int lenB,int nbytes);
+void sparse_replicate(const char *prA,const char *piA,
+		      const mwIndex *irA,const mwSize *jcA,const mwSize *sizA,
+		      char *prB,char *piB,
+		      mwIndex *irB,mwSize *jcB,const mwSize *sizB,
+		      int rM,int rN,int nnzB,int nbytes);
+void rmemcpy(void *s,size_t block,size_t len);
+void r2memcpy(void *s1,void *s2,size_t block,size_t len);
+
+/*------------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  // check of syntax
+  if (nrhs != 2 || nlhs > 1)
+    mexErrMsgIdAndTxt("frepmat:e1","Expecting two inputs and one output.");
+
+  if (!(mxIsNumeric(prhs[0]) || mxIsChar(prhs[0]) || mxIsLogical(prhs[0])))
+    mexErrMsgIdAndTxt("frepmat:e2","Only numerical, character and "
+		      "logical arrays supported.");
+
+  if (!mxIsDouble(prhs[1]) || mxIsComplex(prhs[1]) || mxIsSparse(prhs[1]))
+    mexErrMsgIdAndTxt("frepmat:e3","Size argument must be real, double "
+		      "and non-sparse.");
+
+  // input A
+  const int ndimA = mxGetNumberOfDimensions(prhs[0]);
+  const mwSize *sizA = mxGetDimensions(prhs[0]);
+  const int lenA = mxGetNumberOfElements(prhs[0]);
+  const char *prA = mxGetData(prhs[0]);
+  const char *piA = mxGetImagData(prhs[0]);
+  const bool complx = piA != NULL;
+  const bool sparse = mxIsSparse(prhs[0]);
+
+  // input REP
+  const int lenREP = mxGetNumberOfElements(prhs[1]);
+  const double *prREP = mxGetPr(prhs[1]);
+
+  // output B
+  int ndimB = ndimA >= lenREP ? ndimA : lenREP;
+  mwSize sizB[ndimB];
+
+  // check input REP
+  for (int i = 0; i < lenREP; i++)
+    if (prREP[i] < 0.0 || prREP[i] != ceil(prREP[i]))
+      mexErrMsgIdAndTxt("frepmat:e4","Size argument must be nonnegative "
+			"integers.");
+
+  if (sparse && ndimB > 2)
+    mexErrMsgIdAndTxt("frepmat:e5","Sparse N-dimensional matrices are "
+		      "not supported.");
+ 
+  // determine the size of the result B
+  if (ndimB == ndimA) {
+    int i;
+    for (i = 0; i < lenREP; i++) sizB[i] = sizA[i]*prREP[i];
+    for ( ; i < ndimA; i++) sizB[i] = sizA[i];
+  }
+  else {
+    int i;
+    for (i = 0; i < ndimA; i++) sizB[i] = prREP[i]*sizA[i];
+    for ( ; i < lenREP; i++) sizB[i] = prREP[i];
+  }
+
+  // allocate and do the job
+  if (!sparse) {
+    plhs[0] = mxCreateNumericArray(ndimB,sizB,
+				   mxGetClassID(prhs[0]),
+				   complx ? mxCOMPLEX : mxREAL);
+    replicate(prA,piA,ndimA,sizA,lenA,
+	      mxGetData(plhs[0]),mxGetImagData(plhs[0]),
+	      mxGetNumberOfDimensions(plhs[0]),
+	      mxGetDimensions(plhs[0]),
+	      mxGetNumberOfElements(plhs[0]),
+	      mxGetElementSize(plhs[0]));
+  }
+  else {
+    const int rM = lenREP < 1 ? 1 : prREP[0];
+    const int rN = lenREP < 2 ? 1 : prREP[1];
+    const int nnzB = mxGetJc(prhs[0])[sizA[1]]*rM*rN;
+
+    if (mxGetClassID(prhs[0]) == mxLOGICAL_CLASS)
+      plhs[0] = mxCreateSparseLogicalMatrix(sizB[0],sizB[1],nnzB);
+    else
+      plhs[0] = mxCreateSparse(sizB[0],sizB[1],nnzB,
+			       complx ? mxCOMPLEX : mxREAL);
+
+    sparse_replicate(prA,piA,mxGetIr(prhs[0]),mxGetJc(prhs[0]),sizA,
+		     mxGetData(plhs[0]),mxGetImagData(plhs[0]),
+		     mxGetIr(plhs[0]),mxGetJc(plhs[0]),
+		     mxGetDimensions(plhs[0]),
+		     rM,rN,nnzB,mxGetElementSize(plhs[0]));
+  }
+}
+/*------------------------------------------------------------------------*/
+void replicate(const char *prA,const char *piA,
+	       int ndimA,const mwSize *sizA,int lenA,
+	       char *prB,char *piB,
+	       int ndimB,const mwSize *sizB,int lenB,int nbytes)
+/* Replicates the matrix A within the matrix B by filling B in all its
+   dimensions with repeated copies of A. Each element of both matrices
+   is assumed to occupy nbytes bytes.
+
+   Note: using char * instead of void * as done above means that it is
+   implicitly assumed that sizeof(char) == 1 (byte). If this is not
+   true, the code won't work. */
+{
+  // empty case
+  if (lenB == 0) return;
+
+  // various
+  const bool complx = piA != NULL;
+  int d1,dimcount[ndimA];
+  int strB[ndimB+1],strBsizA[ndimA];
+
+  // construct stride for B
+  strB[0] = 1;
+  for (int i = 0; i < ndimB; i++) strB[i+1] = strB[i]*sizB[i];
+
+  // frequently used product
+  for (int i = 0; i < ndimA; i++) strBsizA[i] = strB[i]*sizA[i];
+
+  // find first nonmatching dimension
+  for (d1 = 0; d1 < ndimA-1; d1++)
+    if (sizA[d1] != sizB[d1]) break;
+
+  // a single initial copy of A is placed in B
+  memset(dimcount,0,ndimA*sizeof(dimcount[0]));
+  for (int i = 0, j = 0, block = strBsizA[d1]; ; ) {
+    // copy as much as possible
+    memcpy(&prB[j*nbytes],&prA[i*nbytes],block*nbytes);
+    if (complx) memcpy(&piB[j*nbytes],&piA[i*nbytes],block*nbytes);
+
+    // increase pointer in A
+    if ((i += block) == lenA) break;
+
+    // increase pointer in B
+    for (int k = d1+1; j += strB[k],++dimcount[k] == sizA[k]; k++) {
+      dimcount[k] = 0;
+      j -= strBsizA[k];
+    }
+  }
+
+  // replicate over the first dimensions
+  memset(dimcount,0,ndimA*sizeof(dimcount[0]));  
+  for (int r = d1; r < ndimA-1; r++) {
+    const int block = strBsizA[r];
+    for (int i = 0; i < lenB; ) {
+      // copy in a recursive fashion
+      if (complx)
+	r2memcpy(&prB[i*nbytes],&piB[i*nbytes],
+		 block*nbytes,strB[r+1]*nbytes);
+      else
+	rmemcpy(&prB[i*nbytes],block*nbytes,strB[r+1]*nbytes);
+
+      // determine next adress in B
+      for (int k = r+1; i += strB[k],++dimcount[k] == sizA[k]; ) {
+	dimcount[k] = 0;
+	i -= strBsizA[k];
+	if (++k == ndimA) {
+	  i = lenB;
+	  break;
+	}
+      }
+    }
+  }
+
+  // the final dimensions are easier
+  if (complx)
+    r2memcpy(prB,piB,strBsizA[ndimA-1]*nbytes,lenB*nbytes);
+  else
+    rmemcpy(prB,strBsizA[ndimA-1]*nbytes,lenB*nbytes);
+}
+/*------------------------------------------------------------------------*/
+void sparse_replicate(const char *prA,const char *piA,
+		      const mwIndex *irA,const mwSize *jcA,
+		      const mwSize *sizA,
+		      char *prB,char *piB,
+		      mwIndex *irB,mwSize *jcB,const mwSize *sizB,
+		      int rM,int rN,int nnzB,int nbytes)
+/* Sparse replication of matrix. Works as replicate() above (and with
+   the same limitations on sizeof(char)) but for sparse matrices A and
+   B instead. */
+{
+  // empty case
+  if (nnzB == 0) return;
+
+  const bool complx = piA != NULL;
+
+  /* copy jcA to jcB and then replicate the copy rN-1 times in
+     dimension 2 */
+  memcpy(jcB,jcA,(sizA[1]+1)*sizeof(jcA[0]));
+  for (int i = 1,jj = sizA[1]+1; i < rN; i++) {
+    // add new offset for each round
+    const int offset = jcB[jj-1];
+    for (int ii = 1; ii <= sizA[1]; ii++,jj++)
+      jcB[jj] = jcA[ii]+offset;
+  }
+
+  if (rM > 1) {
+    /* replicate rM times in dimension 1 by scaling the cumulative
+       pointer to account for added elements in each column */
+    for (int jj = 1; jj <= sizB[1]; jj++) jcB[jj] *= rM;
+
+    // construct irB columnwise and copy values at the same time
+    for (int j = 0; j < sizA[1]; j++) {
+      // copy and replicate the values
+      memcpy(&prB[jcB[j]*nbytes],&prA[jcA[j]*nbytes],
+	     (jcA[j+1]-jcA[j])*nbytes);
+      if (complx) {
+	memcpy(&piB[jcB[j]*nbytes],&piA[jcA[j]*nbytes],
+	       (jcA[j+1]-jcA[j])*nbytes);
+	r2memcpy(&prB[jcB[j]*nbytes],&piB[jcB[j]*nbytes],
+		 (jcA[j+1]-jcA[j])*nbytes,(jcB[j+1]-jcB[j])*nbytes);
+      }
+      else
+	rmemcpy(&prB[jcB[j]*nbytes],(jcA[j+1]-jcA[j])*nbytes,
+		(jcB[j+1]-jcB[j])*nbytes);
+
+      /* copy irA to irB and then replicate the copy rM-1 times in
+	 dimension 1 */
+      memcpy(&irB[jcB[j]],&irA[jcA[j]],(jcA[j+1]-jcA[j])*sizeof(irA[0]));
+      for (int i = 1,jj = jcB[j]+jcA[j+1]-jcA[j]; i < rM; i++) {
+	// add new offset for each round
+	const int offset = i*sizA[0];
+	for (int ii = jcA[j]; ii < jcA[j+1]; ii++,jj++)
+	  irB[jj] = irA[ii]+offset;
+      }
+    }
+  }
+  else {
+    // faster case, no replication in dimension 1
+    memcpy(irB,irA,jcA[sizA[1]]*sizeof(irA[0]));
+    memcpy(prB,prA,jcA[sizA[1]]*nbytes);
+    if (complx)
+      memcpy(piB,piA,jcA[sizA[1]]*nbytes);
+  }
+
+  // finally replicate the copy rN-1 times in dimension 2
+  rmemcpy(irB,jcA[sizA[1]]*rM*sizeof(irB[0]),jcB[sizB[1]]*sizeof(irB[0]));
+  if (complx)
+    r2memcpy(prB,piB,jcA[sizA[1]]*rM*nbytes,
+	     jcB[sizB[1]]*nbytes);
+  else
+    rmemcpy(prB,jcA[sizA[1]]*rM*nbytes,
+	    jcB[sizB[1]]*nbytes);
+}
+/*------------------------------------------------------------------------*/
+void rmemcpy(void *s,size_t block,size_t len)
+/* Formally the same as memcpy(s+block,s,len-block), except that
+   according to ANSI-C, memcpy() does not allow overlapping
+   blocks. Assumes that sizeof(char) == 1 byte. */
+{
+  // the size of the block is doubled for each round
+  while (block < len >> 1) {
+    memcpy((char *)s+block,s,block);
+    block <<= 1;
+  }
+
+  // the remainder
+  memcpy((char *)s+block,s,len-block);
+}
+/*------------------------------------------------------------------------*/
+void r2memcpy(void *s1,void *s2,size_t block,size_t len)
+/* The same as rmemcpy(s1,block,len) followed by
+   rmemcpy(s2,block,len). Used for real and imaginary parts. */
+{
+  while (block < len >> 1) {
+    memcpy((char *)s1+block,s1,block);
+    memcpy((char *)s2+block,s2,block);
+    block <<= 1;
+  }
+  memcpy((char *)s1+block,s1,len-block);
+  memcpy((char *)s2+block,s2,len-block);
+}
+/*------------------------------------------------------------------------*/
diff --git a/stenglib/Fast/source/powerseries.c b/stenglib/Fast/source/powerseries.c
new file mode 100644
index 00000000..8e619dd5
--- /dev/null
+++ b/stenglib/Fast/source/powerseries.c
@@ -0,0 +1,133 @@
+/* powerseries.c */
+
+/* S. Engblom 2007-08-17 (Minor revision) */
+/* S. Engblom 2007-06-13 (Minor revision) */
+/* S. Engblom 2007-01-22 */
+
+#include <math.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+/* forward declaration */
+void powerseries(double *yr,double *yi,
+		 const double *cr,const double *ci,int Nc,
+		 const double *xr,const double *xi,int Nx,
+		 double tol);
+
+#define ISDOUBLEMATRIX(A) (mxIsDouble(A) && !mxIsSparse(A) && \
+			   mxGetNumberOfDimensions(A) == 2)
+#define ISREALSCALAR(A) (!mxIsCell(A) && !mxIsStruct(A) && \
+			 !mxIsComplex(A) && mxGetNumberOfElements(A) == 1)
+
+/*------------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  if (1 < nlhs || nrhs != 3)
+    mexErrMsgTxt("Expecting one output and 3 inputs.");
+
+  /* inputs C and X */
+  if (!ISDOUBLEMATRIX(prhs[0]) || !ISDOUBLEMATRIX(prhs[1]))
+    mexErrMsgTxt("First two arguments must be double matrices.");
+
+  /* input tol */
+  if (!ISREALSCALAR(prhs[2]))
+    mexErrMsgTxt("Expecting a real scalar tolerance.");
+
+  /* create output */
+  plhs[0] = mxCreateDoubleMatrix(mxGetM(prhs[1]),
+				 mxGetN(prhs[1]),
+				 mxIsComplex(prhs[0]) || 
+				 mxIsComplex(prhs[1]) ? mxCOMPLEX : mxREAL);
+
+  /* evaluate the result */
+  powerseries(mxGetPr(plhs[0]),mxGetPi(plhs[0]),
+	      mxGetPr(prhs[0]),mxGetPi(prhs[0]),
+	      mxGetNumberOfElements(prhs[0]),
+	      mxGetPr(prhs[1]),mxGetPi(prhs[1]),
+	      mxGetNumberOfElements(prhs[1]),
+	      *mxGetPr(prhs[2]));
+}
+/*------------------------------------------------------------------------*/
+void powerseries(double *yr,double *yi,
+		 const double *cr,const double *ci,int Nc,
+		 const double *xr,const double *xi,int Nx,
+		 double tol)
+/* Summation of power series. Computes y = sum_(k >= 0) c[k]*x^k where
+   y = (yr,yi), c = (cr,ci) and x = (xr,xi) and where any of the
+   imaginary parts may be NULL. The coefficient vector has length Nc
+   and the coordinate vector length Nx. It is required that y be
+   allocated and cleared before the call.*/
+{
+  bool warn = false;
+
+  if (yi == NULL)
+    for (int i = 0; i < Nx; i++) {
+      int j;
+      double powr = 1.0;
+      for (j = 0; j < Nc; j++) {
+	double termr = cr[j]*powr;
+	yr[i] += termr;
+	if (fabs(termr) < fabs(yr[i])*tol) break;
+	powr *= xr[i];
+      }
+      warn = warn || j == Nc;
+    }
+  else if (ci == NULL)
+    for (int i = 0; i < Nx; i++) {
+      int j;
+      double powr = 1.0,powi = 0.0;
+      for (j = 0; j < Nc; j++) {
+	double termr = cr[j]*powr;
+	double termi = cr[j]*powi;
+	yr[i] += termr;
+	yi[i] += termi;
+	if (fabs(termr)+fabs(termi) < (fabs(yr[i])+fabs(yi[i]))*tol)
+	  break;
+	double powr_ = powr*xr[i]-powi*xi[i];
+	double powi_ = powr*xi[i]+powi*xr[i];
+	powr = powr_;
+	powi = powi_;
+      }
+      warn = warn || j == Nc;
+    }
+  else if (xi == NULL)
+    for (int i = 0; i < Nx; i++) {
+      int j;
+      double powr = 1.0;
+      for (j = 0; j < Nc; j++) {
+	double termr = cr[j]*powr;
+	double termi = ci[j]*powr;
+	yr[i] += termr;
+	yi[i] += termi;
+	if (fabs(termr)+fabs(termi) < (fabs(yr[i])+fabs(yi[i]))*tol)
+	  break;
+	powr *= xr[i];
+      }
+      warn = warn || j == Nc;
+    }
+  else
+    for (int i = 0; i < Nx; i++) {
+      int j;
+      double powr = 1.0,powi = 0.0;
+      for (j = 0; j < Nc; j++) {
+	double termr = cr[j]*powr-ci[j]*powi;
+	double termi = cr[j]*powi+ci[j]*powr;
+	yr[i] += termr;
+	yi[i] += termi;
+	if (fabs(termr)+fabs(termi) < (fabs(yr[i])+fabs(yi[i]))*tol)
+	  break;
+	double powr_ = powr*xr[i]-powi*xi[i];
+	double powi_ = powr*xi[i]+powi*xr[i];
+	powr = powr_;
+	powi = powi_;
+      }
+      warn = warn || j == Nc;
+    }
+
+  if (warn)
+    mexWarnMsgIdAndTxt("powerseries:w1",
+		       "Series did not converge to the prescribed "
+		       "accuracy for all arguments.");
+}
+/*------------------------------------------------------------------------*/
diff --git a/stenglib/Fast/source/sppmul.c b/stenglib/Fast/source/sppmul.c
new file mode 100644
index 00000000..9ac4dfb5
--- /dev/null
+++ b/stenglib/Fast/source/sppmul.c
@@ -0,0 +1,80 @@
+/* sppmul.c */
+/* S. Engblom 2005-10-27 */
+
+#include "mex.h"
+#include "matrix.h"
+
+#define MXTYPE(A,B) (mxIsComplex(A) || mxIsComplex(B) ? \
+		     mxCOMPLEX : mxREAL)
+
+/*-----------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  /* check of syntax */
+  if (nrhs != 3 || nlhs > 1)
+    mexErrMsgTxt("Expecting three inputs and one output.");
+
+  /* input */
+  if (!mxIsSparse(prhs[0]))
+    mexErrMsgTxt("Expecting a sparse array as the first input.");
+  const int M = mxGetM(prhs[0]),N = mxGetN(prhs[0]);
+  const mwSize *jc = mxGetJc(prhs[0]);
+  const mwIndex *ir = mxGetIr(prhs[0]);
+
+  if (!mxIsDouble(prhs[1]) || mxGetNumberOfElements(prhs[1]) != jc[N])
+    mexErrMsgTxt("Expecting a matching double array as "
+		 "the second input.");
+  const double *prv = mxGetPr(prhs[1]),*piv = mxGetPi(prhs[1]);
+
+  if (!mxIsDouble(prhs[2]) || mxGetNumberOfDimensions(prhs[2]) > 2 || 
+      mxGetM(prhs[2]) != N)
+    mexErrMsgTxt("Expecting a double matrix with matching first "
+		 "dimension as the third input.");
+  const int K = mxGetN(prhs[2]);
+  const double *prX = mxGetPr(prhs[2]),*piX = mxGetPi(prhs[2]);
+
+  /* allocate output */
+  plhs[0] = mxCreateDoubleMatrix(M,K,MXTYPE(prhs[1],prhs[2]));
+  double *prY = mxGetPr(plhs[0]),*piY = mxGetPi(plhs[0]);
+
+  /* evaluate the product */
+  if (piY == NULL)
+    for (int k = 0; k < K; k++) {
+      for (int i = 0; i < N; i++)
+	for (mwSize j = jc[i]; j < jc[i+1]; j++)
+	  prY[ir[j]] += prv[j]*prX[i];
+      prY += M;
+      prX += N;
+    }
+  else if (piX == NULL)
+    for (int k = 0; k < K; k++) {
+      for (int i = 0; i < N; i++)
+	for (mwSize j = jc[i]; j < jc[i+1]; j++) {
+	  prY[ir[j]] += prv[j]*prX[i];
+	  piY[ir[j]] += piv[j]*prX[i];
+	}
+      prY += M; piY += M;
+      prX += N;
+    }
+  else if (piv == NULL)
+    for (int k = 0; k < K; k++) {
+      for (int i = 0; i < N; i++)
+	for (mwSize j = jc[i]; j < jc[i+1]; j++) {
+	  prY[ir[j]] += prv[j]*prX[i];
+	  piY[ir[j]] += prv[j]*piX[i];
+	}
+      prY += M; piY += M;
+      prX += N; piX += N;
+    }
+  else
+    for (int k = 0; k < K; k++) {
+      for (int i = 0; i < N; i++)
+	for (mwSize j = jc[i]; j < jc[i+1]; j++) {
+	  prY[ir[j]] += prv[j]*prX[i]-piv[j]*piX[i];
+	  piY[ir[j]] += piv[j]*prX[i]+piv[j]*prX[i];
+	}
+      prY += M; piY += M;
+      prX += N; piX += N;
+    }
+}
+/*-----------------------------------------------------------------------*/
diff --git a/stenglib/Fast/source/spreplace.c b/stenglib/Fast/source/spreplace.c
new file mode 100644
index 00000000..85ef3277
--- /dev/null
+++ b/stenglib/Fast/source/spreplace.c
@@ -0,0 +1,38 @@
+/* spreplace.c */
+/* S. Engblom 2016-11-23 */
+
+#include <string.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+#define MXTYPE(A) (mxIsComplex(A) ? mxCOMPLEX : mxREAL)
+
+/*-----------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  /* check of syntax */
+  if (nrhs != 2 || nlhs > 1)
+    mexErrMsgTxt("Expecting two inputs and one output.");
+
+  /* input */
+  if (!mxIsSparse(prhs[0]))
+    mexErrMsgTxt("Expecting a sparse array as the first input.");
+  const int M = mxGetM(prhs[0]),N = mxGetN(prhs[0]);
+  const mwSize *jc = mxGetJc(prhs[0]);
+  const mwIndex *ir = mxGetIr(prhs[0]);
+
+  if (!mxIsDouble(prhs[1]) || mxGetNumberOfElements(prhs[1]) != jc[N])
+    mexErrMsgTxt("Expecting a matching double array as "
+		 "the second input.");
+  const double *prx = mxGetPr(prhs[1]),*pix = mxGetPi(prhs[1]);
+
+  /* allocate and create output */
+  plhs[0] = mxCreateSparse(M,N,jc[N],MXTYPE(prhs[1]));
+  memcpy(mxGetJc(plhs[0]),jc,(N+1)*sizeof(jc[0]));
+  memcpy(mxGetIr(plhs[0]),ir,jc[N]*sizeof(ir[0]));
+  memcpy(mxGetPr(plhs[0]),prx,jc[N]*sizeof(double));
+  if (pix)
+    memcpy(mxGetPi(plhs[0]),pix,jc[N]*sizeof(double));
+}
+/*-----------------------------------------------------------------------*/
diff --git a/stenglib/Fast/sppmul.m b/stenglib/Fast/sppmul.m
new file mode 100644
index 00000000..6141da6a
--- /dev/null
+++ b/stenglib/Fast/sppmul.m
@@ -0,0 +1,15 @@
+function y = sppmul(S,v,x)
+%SPPMUL Sparse pattern multiply.
+%   Y = SPPMUL(S,V,X), where S is a sparse matrix and X is a full
+%   matrix, computes the product Y = S*X but with the nonzero elements
+%   of S replaced by those in the vector V. SPPMUL is thus formally
+%   equivalent to the code
+%      [i,j,s] = find(S);
+%      Y = sparse(i,j,v,size(S,1),size(S,2))*X;
+%   Of course, the intermediate sparse matrix is never actually 
+%   constructed. It is required that NNZ(S) = PROD(SIZE(V)) and
+%   that SIZE(S,2) = SIZE(X,1).
+
+% S. Engblom 2005-10-27
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Fast/sppmul.mexw64 b/stenglib/Fast/sppmul.mexw64
new file mode 100644
index 00000000..93274d7b
Binary files /dev/null and b/stenglib/Fast/sppmul.mexw64 differ
diff --git a/stenglib/Fast/spreplace.m b/stenglib/Fast/spreplace.m
new file mode 100644
index 00000000..2af88bcc
--- /dev/null
+++ b/stenglib/Fast/spreplace.m
@@ -0,0 +1,18 @@
+function T = spreplace(S,x)
+%SPREPLACE Sparse pattern replace.
+%   T = SPREPLACE(S,X), where S is a sparse matrix and X a vector,
+%   replaces the nonzero elements of S with X. SPREPLACE is thus
+%   formally equivalent to the code
+%      [i,j,s] = find(S);
+%      T = sparse(i,j,x,size(S,1),size(S,2));
+%   It is required that NNZ(S) = PROD(SIZE(X)).
+%
+%   Example:
+%      S = sprand(30,30,0.2);
+%      F = spreplace(S,exp(nonzeros(S)));
+%
+%   See also SPPMUL, FSPARSE.
+
+% S. Engblom 2016-11-23
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Fast/spreplace.mexw64 b/stenglib/Fast/spreplace.mexw64
new file mode 100644
index 00000000..b80c2d0f
Binary files /dev/null and b/stenglib/Fast/spreplace.mexw64 differ
diff --git a/stenglib/Fast/startup.m b/stenglib/Fast/startup.m
new file mode 100644
index 00000000..8b778f35
--- /dev/null
+++ b/stenglib/Fast/startup.m
@@ -0,0 +1,9 @@
+%STARTUP Add paths to FAST stuff.
+
+% S. Engblom 2005-03-22
+
+s = pwd;
+addpath([s '/source']);
+if exist('test','dir')
+  addpath([s '/test']);
+end
diff --git a/stenglib/Misc/repeat.m b/stenglib/Misc/repeat.m
new file mode 100644
index 00000000..5396e87b
--- /dev/null
+++ b/stenglib/Misc/repeat.m
@@ -0,0 +1,18 @@
+function c = repeat(a,b)
+%REPEAT Repeat elements of vector.
+%   C = REPEAT(A,B) returns A repeated by B, see the example
+%   below. Both arguments are assumed to be row vectors of the same
+%   length and B must contain nonnegative integers only.
+%
+%   No error-checking is performed.
+%
+%   Example:
+%     a = [0 1 pi inf nan -nan];
+%     b = [1 4 0   2   3    0 ];
+%     c = repeat(a,b) % returns c = [0 1 1 1 1 inf inf nan nan nan]
+
+% S. Engblom 2005-10-14
+
+% can't beat this!
+ix = cumsum(sparse(1,cumsum([1 b]),1));
+c = a(ix(1:end-1));
diff --git a/stenglib/Misc/startup.m b/stenglib/Misc/startup.m
new file mode 100644
index 00000000..eb2f226b
--- /dev/null
+++ b/stenglib/Misc/startup.m
@@ -0,0 +1,6 @@
+%STARTUP Add paths to MISC stuff.
+
+% S. Engblom 2010-02-10
+
+s = pwd;
+% (do nothing)
diff --git a/stenglib/Misc/sudsolve.m b/stenglib/Misc/sudsolve.m
new file mode 100644
index 00000000..c1483841
--- /dev/null
+++ b/stenglib/Misc/sudsolve.m
@@ -0,0 +1,137 @@
+function [S,ix] = sudsolve(G)
+%SUDSOLVE Solve Sudoku puzzle.
+%   S = SUDSOLVE(G) solves the Sudoku puzzle with initial conditions
+%   given by G. The algorithm used is a recursive greedy selection
+%   procedure.
+%
+%   [S,IX] = SUDSOLVE(G) does the same thing but also returns the
+%   index IX of the Sudoku. The index is defined as the highest branch
+%   point taken by the algorithm and indicates the degree of
+%   difficulty. For most human solvable Sudokus, IX <= 2.
+%
+%   Example:
+%     % a Sudoku of medium difficulty:
+%     G = zeros(9,9);
+%     G([1 2 6 10 13 14 16 18 24 25 29 36 37 41 ...
+%        45 46 53 57 58 64 66 68 69 72 76 80 81]) = ...
+%     [7 5 8 3 4 9 2 6 1 4 6 8 5 1 3 1 2 1 6 8 6 2 9 7 7 6 1];
+%     % (Published in Sydsvenska Dagbladet, Malmoe, Sweden, 2005-07-13)
+%     S = sudsolve(G)
+%
+%     % a rather tricky one:
+%     M = zeros(9,9);
+%     M([2 5 8 10 18 21 25 31 33 37 41 ...
+%        45 49 51 57 61 64 72 74 77 80]) = ...
+%     [6 8 4 2 3 4 7 8 7 3 1 4 6 5 5 6 4 2 3 6 8];
+%     [S,ix] = sudsolve(M) % takes some time
+%     % (From Cleve's corner) 
+
+% S. Engblom 2010-02-25 (Added Moler's example)
+% S. Engblom 2005-07-15
+
+% used for tracking branches
+stack = cell(3,0);
+kmax = 0;
+
+% start with all possibilities
+S = true(9,9,9);
+
+% insert given values G
+[i,j,v] = find(G);
+
+% check G
+if any(v ~= ceil(v)) || any(v < 1) || any(9 < v)
+  error('Illegal initial conditions.');
+end
+if ndims(G) > 2 || any(size(G) ~= 9)
+  error('Illegal initial conditions.');
+end
+
+for k = 1:size(v,1)
+  ii = i(k); jj = j(k); vv = v(k);
+
+  if ~S(ii,jj,vv), error('Inconsistent initial conditions.'); end
+
+  % set individual number
+  S(ii,jj,:) = false;
+
+  % remove possibilities in same column...
+  S(:,jj,vv) = false;
+
+  % ...same row...
+  S(ii,:,vv) = false;
+
+  % ...and same 3-by-3 square
+  I = ceil(ii/3); I = 3*(I-1)+1:3*I;
+  J = ceil(jj/3); J = 3*(J-1)+1:3*J;
+  S(I,J,vv) = false;
+end
+
+% remove possibilities in a greedy fashion
+while true
+  % no more possibilities?
+  while ~any(S(:))
+    % done/unsolvable/pop stack?
+    if all(G(:))
+      % the answer is in G...
+      S = G;
+      if nargout > 1, ix = kmax; end
+      return;
+    elseif size(stack,2) == 0
+      error('Sudoku is unsolvable.');
+    else
+      S = stack{1,end};
+      G = stack{2,end};
+      kmax = stack{3,end};
+      stack = stack(:,1:end-1);
+    end
+  end
+
+  % find the branch with the lowest complexity
+  for k = 1:9
+    [ii,jj] = find(sum(S,3) == k,1);
+    if ~isempty(ii), break; end
+  end % will always succeed
+  kmax = max(k,kmax);
+
+  % increase stack if branch is taken
+  if k > 1
+    v = find(S(ii,jj,:));
+    for l = 1:k
+      vv = v(l);
+
+      % S and G must be saved
+      SS = S; GG = G;
+
+      % make the move
+      GG(ii,jj) = vv;
+      SS(ii,jj,:) = false;
+      SS(:,jj,vv) = false;
+      SS(ii,:,vv) = false;
+      I = ceil(ii/3); I = 3*(I-1)+1:3*I;
+      J = ceil(jj/3); J = 3*(J-1)+1:3*J;
+      SS(I,J,vv) = false;
+
+      % increase stack
+      stack(:,end+1) = {SS; GG; kmax};
+    end
+
+    % continue with the front of the stack
+    S = stack{1,end};
+    G = stack{2,end};
+    kmax = stack{3,end};
+    stack = stack(:,1:end-1);
+  else
+    % no need for stack
+    vv = find(S(ii,jj,:));
+
+    % make the move
+    G(ii,jj) = vv;
+    S(ii,jj,:) = false;
+    S(:,jj,vv) = false;
+    S(ii,:,vv) = false;
+    I = ceil(ii/3); I = 3*(I-1)+1:3*I;
+    J = ceil(jj/3); J = 3*(J-1)+1:3*J;
+    S(I,J,vv) = false;
+  end
+end
diff --git a/stenglib/README.md b/stenglib/README.md
new file mode 100644
index 00000000..0ec5ba0d
--- /dev/null
+++ b/stenglib/README.md
@@ -0,0 +1,232 @@
+# stenglib
+Stefan Engblom's Matlab libraries - packages for daily use.
+
+## License statement for stenglib
+
+You may download all of **stenglib** and
+use, modify and redistribute it in any way you like. A
+redistributor must fully attribute the authorship and make a good
+effort to cite the original location of the software. A researcher
+making *critical* use of the software in research is requested to 
+acknowledge this in publications related to the
+research. A company may use the code in software products provided
+that the original location and the author is clearly cited.
+
+All code provided here comes with absolutely **no warranty**
+and **no support** whatsoever is given.
+
+There are a lot of freeware available on the net. Do **not**
+download unless you agree to the above license.
+
+## About stenglib
+
+**stenglib** is loosely divided into 5 sub-packages, with few dependencies in between them:
+* Tensor
+* Fast
+* Scicomp
+* Utils
+* Misc
+
+For contact details, see [stenglib.org](http://www.stenglib.org).
+I welcome bug reports and comments. Please *do not* send support requests.
+
+## Tensor
+
+Originally, I made the **Tensor** package
+because I had the need to easily, efficiently and consistently
+manage multi-dimensional arrays in Matlab. *Examples:* given a
+matrix and a vector, how do you scale each row in the matrix by
+the vector? How can you multiply a 3-D array with a matrix? The
+package is useful to anyone who writes code for (pseudo-) spectral
+methods, FEM, or who uses multi-dimensional arrays or tensor
+notation a lot.
+
+* **tndims** Number of dimensions.
+  (Depend: 0, status: stable)
+  [tndims.m](Tensor/tndims.m)
+  [tndims.c](Tensor/source/tndims.c)
+
+* **tsize** Size of array.
+  (Depend: 0, status: stable)
+  [tsize.m](Tensor/tsize.m)
+  [tsize.c](Tensor/source/tsize.c)
+
+* **tsum** Tensor summation.
+  (Depend: 0, status: stable)
+  [tsum.m](Tensor/tsum.m)
+  [tsum.c](Tensor/source/tsum.c)
+
+* **tprod** Tensor product.
+  Based on a concept by D. Bertilsson, [COMSOL](http://www.comsol.com).
+  (Depend: 0, status: stable)
+  [tprod.m](Tensor/tprod.m)
+  [tprod.c](Tensor/source/tprod.c)
+  
+There is also a [make.m](Tensor/source/make.m) available.
+It will work on several, but not all, platforms.
+
+## Fast
+
+The routines in the **Fast** package exist
+because some things just take too much time in Matlab. *Examples:*
+replicate a data set in different dimensions (a.k.a. repmat),
+assemble a sparse matrix, or evaluate set operations. These
+routines should be of general interest to programmer in the
+scientific computing community.
+
+* **frepmat** Fast replication of array.
+  (Depend: Tensor/{tndims,tsize} (weakly), status: stable)
+  [frepmat.m](Fast/frepmat.m)
+  [mexfrepmat.c](Fast/sourcee/mexfrepmat.c)
+
+* **fsparse** Fast assembly of sparse matrix.
+  (Depend: 0, status: stable but not completely settled) 
+  [fsparse.m](Fast/fsparse.m)
+  [fsparse.c](Fast/source/fsparse.c)
+  
+There is now a *parallel* **fsparse** version available. A
+paper describing the algorithm is *S. Engblom, D. Lukarski:
+Fast Matlab compatible sparse assembly on multicore computers*,
+in *Parallel Comput.* 56:1--17 (2016) [(doi)](http://dx.doi.org/10.1016/j.parco.2016.04.001).
+*Fact:* the **fsparse**-code has been selected as the base for the sparse assembly routines
+in [PARALUTION](http://www.paralution.com).
+
+* **clenshaw** Evaluation of 3-term recurrences.
+  (Depend: 0, status: stable)
+  [clenshaw.m](Fast/clenshaw.m)
+  [clenshaw.c](Fast/source/clenshaw.c)
+
+* **fsetop** Fast set operations based on hashing. Based on
+  a concept by [P.-O. Persson](http://www.mit.edu/~persson) and a
+  hash-function by [P. Hsieh](http://www.azillionmonkeys.com/qed/hash.html)
+  (Depend: 0, status: stable)
+  [fsetop.m](Fast/fsetop.m)
+  [fsetop.c](Fast/source/fsetop.c)
+  
+* **sppmul** Sparse pattern multiply.
+  (Depend: 0, status: stable) Download:
+  [sppmul.m](Fast/sppmul.m)
+  [sppmul.c](Fast/source/sppmul.c)
+
+* **powerseries** Sum power series.
+  (Depend: 0, status: stable)
+  [powerseries.m](Fast/powerseries.m)
+  [powerseries.c](Fast/source/powerseries.c)
+
+As before there is a [make.m](Fast/source/make.m) available which you will probably have to modify.
+
+## Scicomp
+
+In **Scicomp** I've assembled some solvers from different areas within scientific computing: two solvers for
+nonlinear problems and an implementation of the Nelder-Mead simplex algorithm. I have also put three routines 
+for Gaussian quadratures with respect to discrete measures in this package.
+
+* **rtsafe** Scalar nonlinear solver.
+  (Depend: 0, status: stable)
+  [rtsafe.m](Scicomp/rtsafe.m)
+  
+* **ainsolve** Solver for large sets of nonlinear equations.
+  (Depend: 0, status: experimental)
+  [ainsolve.m](Scicomp/ainsolve.m)
+
+* **nmsimplex** Nelder-Meads simplex-algorithm.
+  (Depend: Fast/frepmat, Utils/parseopts, status: stable)
+  [nmsimplex.m](Scicomp/nmsimplex.m)
+
+* **ode1s** ODE-solver: split-step Euler method with digital control.
+  (Depend: Fast/{frepmat,fsetop}, Utils/{parseopts,report}, status: stable, but not finalized)
+  [ode1s.m](Scicomp/ode1s.m)
+
+* **gaussqd** Gaussian quadrature for discrete measures.
+  (Depend: Tensor/tprod, Fast/fsparse, status: stable)
+  [gaussqd.m](Scicomp/gaussqd.m)
+
+* **gausspd** Gaussian polynomial for discrete measures.
+  (Depend: Tensor/{tsum,tprod}, Fast/clenshaw, status: stable)
+  [gausspd.m](Scicomp/gausspd.m)
+  
+* **poch** Pochhammer's function.
+  (Depend: 0, status: stable)
+  [poch.m](Scicomp/poch.m)
+
+## Utils
+
+In the package **Utils** I've collected various
+routines for performing everyday tasks. Examples include
+generating LaTeX-arrays from matrices, .gif-animations, small
+perfect hash functions and removing files ending with a tilde (!).
+
+* **arr2latex** LaTeX-table from matrix.
+(Depend: Fast/frepmat, Utils/parseopts, status: stable)
+[arr2latex.m](Utils/arr2latex.m)
+
+* **assignopts** Assign options.
+(Depend: 0, status: stable)
+[assignopts.m](Utils/assignopts.m)
+
+* **parseopts** Parse options.
+(Depend: Fast/fsetop, status: stable)
+[parseopts.m](Utils/parseopts.m)
+
+* **matmerge** Merge .mat-files.
+(Depend: Fast/fsetop, Tensor/{tsize,tndims}, status: stable but kind of a hack)
+[matmerge.m](Utils/matmerge.m)
+
+* **consistency** Local truncation error of linear multistep method.
+(Depend: 0, status: stable)
+[consistency.m](Utils/consistency.m)
+
+* **stability** Plot of stability region for linear multistep method.
+(Depend: 0, status: stable)
+[stability.m](Utils/stability.m)
+
+* **perfecthash** Perfect hash-function from strings.
+(Depend: Tensor/tsum, Fast/fsparse, status: this is a hack, really)
+[perfecthash.m](Utils/perfecthash.m)
+
+* **movie2gif** Create GIF animation from MATLAB movie.
+(Depend: imwrite-function in Matlab, status: stable)
+[movie2gif.m](Utils/movie2gif.m)
+
+* **report** Report progress of solver.
+(Depend: 0, status: stable)
+[report.m](Utils/report.m)
+
+* **runtest** General test facility.
+(Depend: 0, status: stable)
+[runtest.m](Utils/runtest.m)
+
+* **rmtilde** Remove files ending with a tilde ('~').
+(Depend: 0, status: stable)
+[rmtilde.m](Utils/rmtilde.m)
+
+* **connect** Connectivity information for points.
+(Depend: 0, status: stable)
+[connect.m](Utils/connect.m)
+
+* **ndop** General N-dimensional operator.
+(Depend: Tensor/{tndims,tsum,tprod}, Fast/frepmat, status: stable)
+[ndop.m](Utils/ndop.m)
+
+* **spblock** Sparse matrix from blocks.
+(Depend: Fast/fsparse, status: stable)
+[spblock.m](Utils/spblock.m)
+
+* **spym** Visualize magnitude of elements in matrix.
+(Depend: 0, status: this is a hack)
+[spym.m](Utils/spym.m)
+
+## Misc
+
+I'll put amusing routines in **Misc**. For now,
+this humble package contains a sudoku solver and a short function
+which I personally believe is the most beautiful Matlab-code ever
+written.
+
+* **sudsolve** Solve Sudoku puzzle.
+(Depend: 0, status: stable)
+[sudsolve.m](Misc/sudsolve.m)
+
+* **repeat** Repeat elements of vector.
+(Depend: 0, status: stable)
+[repeat.m](Misc/repeat.m)
diff --git a/stenglib/Scicomp/ainsolve.m b/stenglib/Scicomp/ainsolve.m
new file mode 100644
index 00000000..fd14bbdf
--- /dev/null
+++ b/stenglib/Scicomp/ainsolve.m
@@ -0,0 +1,415 @@
+function [x,fx,flag,out,jac] = ainsolve(fun,x0,opts,varargin)
+%AINSOLVE Solver for large sets of nonlinear equations.
+%   AINSOLVE solves nonlinear equations of the form F(X) = 0, where F
+%   and X are vectors, by using either a damped Newton method (DN) or
+%   an accelerated Newton method (AN) where old search directions as
+%   well as a preconditioned Krylov space is used together with the
+%   Newton direction in order to accelerate the convergence. The
+%   inversion of the Jacobian may be avoided using an inexact variant
+%   (AIN) where the Newton direction is not used. Furthermore, there
+%   are variants (XAN/XAIN) of both accelerated algorithms in which
+%   the Krylov space is recomputed for every inner iteration. This is
+%   more expensive but may also produce a faster convergence.
+%
+%   X = AINSOLVE(FUN,X0) starts at the initial guess X0 and tries to
+%   solve the equations in FUN. FUN has the signature [Y,J] =
+%   FUN(X,...) and returns a vector of values Y together with a
+%   Jacobian J evaluated at the point X. Note that FUN will be called
+%   with both one and two output arguments which can be exploited in
+%   order to improve on the efficiency.
+%
+%   X = AINSOLVE(FUN,X0,OPTS) solves with the default parameters
+%   replaced by the values in the structure OPTS. The syntax OPTDEF =
+%   AINSOLVE returns the default options and the syntax OPTHELP =
+%   AINSOLVE('opts') produces a short description of available
+%   options.
+%
+%   X = AINSOLVE(FUN,X0,OPTS,P1,P2,...) passes the parameters P1, P2,
+%   ... directly to the function FUN; FUN(X,P1,P2,...).
+%
+%   An important option is the use of the field OPTS.prefun which
+%   specify the preconditioner for the Krylov space. The syntax works
+%   as follows; initially let M = [] and for every assembled Jacobian
+%   J, initialize the preonditioner by calling [PFUN,M] =
+%   feval(OPTS.prefun,J,M,OPTS.prepar,P1,P2,...). Then the
+%   preconditioner is defined by the call X = feval(PFUN,R,M), where X
+%   should be an approximative solution to the equation J*X = R.
+%
+%   [X,FX] = AINSOLVE(FUN,X0,...) returns the value of the objective
+%   function at X.
+%
+%   [X,FX,FLAG] = AINSOLVE(FUN,X0,...) returns a number FLAG that
+%   describes the exit condition of AINSOLVE.
+%   If FLAG is:
+%      0 then AINSOLVE converged to a solution X,
+%      1 then AINSOLVE converged to a solution X that does not satisfy
+%        the tolerance on the norm of the residual,
+%      2 then the maximum number of nonlinear iterations was reached,
+%      3 then the maximum number of function evaluations was reached.
+%
+%   The error criteria used works as follows:
+%     if all(abs(r) <= opts.atolf+opts.rtolf*abs(r0))
+%       then exit with FLAG = 0,
+%     elseif all(abs(x-xold) <= opts.atolx+opts.rtolx*abs(x))
+%       then exit with FLAG = 1.
+%   Here r0 is the initial residual and xold is the previous
+%   approximation.
+%
+%   [X,FX,FLAG,OUT] = AINSOLVE(FUN,X0,...) returns a structure OUT
+%   with the number of iterations taken in OUT.iterations, the number
+%   of function (Jacobian) evaluations in OUT.funcCount (OUT.jacCount)
+%   and the algorithm used in OUT.algorithm.
+%
+%   [X,FX,FLAG,OUT,JAC] = AINSOLVE(FUN,X0,...) also returns the
+%   Jacobian JAC of FUN at X.
+%
+%   Cautionary: this is an experimental version with very little error
+%   checking.
+%
+%   See also FSOLVE.
+
+% S. Engblom 2010-09-03 (Minor revision)
+% S. Engblom 2004-10-11 (Revision)
+% S. Engblom 2004-01-26
+
+% *** Use parseopts!
+% *** Option jacmul
+% *** Use of numjac
+% *** For some combinations, only the action of J to a vector v is
+% needed. This is just (F(u+h*v)-F(u))/h for a suitable small h.
+% *** Check the angle between the latest step and the one just taken
+% before adding it to the trace space? Alternative: update a
+% Gram-Schmidt orthogonalization? How should the LS-problem really be
+% solved? GMRES?
+
+% special call: existing options
+if nargin == 1 && ischar(fun) && strcmpi(fun,'opts')
+  optdef.maxnlinit = 'integer > 0 {500}';
+  optdef.maxfunevals = 'integer > 0 {1000}';
+  optder.report = '{''on''} | ''off''';
+  optdef.atolx = 'scalar {1e-8}';
+  optdef.rtolx = 'scalar {1e-6}';
+  optdef.atolf = 'scalar {1e-8}';
+  optdef.rtolf = 'scalar {1e-6}';
+  optdef.algorithm = ['''xain'' | {''ain''} | ' ...
+                      '''xan'' | ''an'' | ''dn'''];
+  optdef.linesearch = '{''bisect''} | ''poly''';
+  optdef.tracedim = 'integer >= 0 {6}';
+  optdef.krylovdim = 'integer >= 0 {6}';
+  optdef.accmaxits = 'integer > 0 {5}';
+  optdef.accminfac = 'scalar in (0,1) {0.95}';
+  optdef.prefun = 'function {[]}';
+  optdef.prepar = 'argument {[]}';
+  x = optdef;
+  return;
+end
+
+% default options
+optdef.maxnlinit = 500;
+optdef.maxfunevals = 1000;
+optdef.report = 'on';
+optdef.atolx = 1e-8;
+optdef.rtolx = 1e-6;
+optdef.atolf = 1e-8;
+optdef.rtolf = 1e-6;
+optdef.algorithm = 'ain';
+optdef.linesearch = 'bisect';
+optdef.tracedim = 6;
+optdef.krylovdim = 6;
+optdef.accmaxits = 5;
+optdef.accminfac = 0.95;
+optdef.prefun = [];
+optdef.prepar = [];
+
+% special call: return defaults
+if nargin == 0, x = optdef; return; end
+
+% merge defaults with actual inputs
+if nargin > 2
+  if iscell(opts), opts = struct(opts{:}); end
+  fn = fieldnames(opts);
+  for i = 1:length(fn)
+    optdef = setfield(optdef,fn{i},getfield(opts,fn{i}));
+  end
+end
+opts = optdef;
+
+% size of the problem
+ndof = size(x0,1);
+
+% algorithm
+switch opts.algorithm
+ case 'dn', opts.alg = 1;
+ case 'an', opts.alg = 2;
+ case 'xan', opts.alg = 3;
+ case 'ain', opts.alg = 4;
+ case 'xain', opts.alg = 5;
+ otherwise, error('Unknown algorithm.');
+end
+if opts.alg ~= 1, Vt = zeros(ndof,0); end
+
+% linesearch
+switch opts.linesearch
+ case 'bisect', lines = @l_linesearch_bisect;
+ case 'poly', lines = @l_linesearch_poly;
+ otherwise, error('Unknown linesearch.');
+end
+
+% preconditioner
+hasPrecond = ~isempty(opts.prefun);
+if hasPrecond, M = []; end
+
+% evaluate the first residual and Jacobian
+[r,J] = feval(fun,x0,varargin{:});
+nfun = 1; njac = 1;
+rcmp = opts.atolf+opts.rtolf*abs(r);
+normr = norm(r);
+
+% diagnostics
+opts.report = strcmpi(opts.report,'on');
+if opts.report
+  disp(sprintf([' Iteration  Func-count  Jac-count  ' ...
+                ' |residual|     |step|\n' ...
+                ' %5.0f      %5.0f      %5.0f       ' ...
+                ' %0.6e   -'], ...
+               0,nfun,njac,normr));
+end
+
+% early return
+if all(abs(r) <= rcmp)
+  x = x0;
+  nliniter = 0;
+else
+  % nonlinear (outer) iterations
+  for nliniter = 1:opts.maxnlinit
+
+    if opts.alg == 1 % damped Newton (DN)
+      en = J\(-r);
+      [x,nfun] = l_linesearch(lines,x0,J,r,normr,en, ...
+                              nfun,fun,varargin);
+
+      % next residual and Jacobian
+      [r,J] = feval(fun,x,varargin{:});
+      nfun = nfun+1; njac = njac+1;
+      normr = norm(r);
+      step = x-x0;
+    else % accelerated variants (AN/XAN/AIN/XAIN)
+      % needed for progress report
+      xold = x0;
+
+      % compute the Newton direction (AN/XAN)
+      if opts.alg == 2 || opts.alg == 3, en = J\(-r); end
+
+      % inner iterations
+      for j = 1:opts.accmaxits
+        if j == 1 || opts.alg == 3 || opts.alg == 5
+          % create Krylov space
+          Vk = zeros(ndof,opts.krylovdim);
+          if hasPrecond
+            [prefun,M] = feval(opts.prefun,J,M,opts.prepar, ...
+                               varargin{:});
+            Vk(:,1) = feval(prefun,-r,M);
+            for j = 2:opts.krylovdim
+              Vk(:,j) = feval(prefun,J*Vk(:,j-1),M);
+            end
+          else
+            Vk(:,1) = -r;
+            for j = 2:opts.krylovdim
+              Vk(:,j) = J*Vk(:,j-1);
+            end
+          end
+
+          if opts.alg == 2 || opts.alg == 3 % (AN/XAN)
+            W = orth([Vt en Vk]); % *** sloppy
+          else % (AIN/XAIN)
+            W = orth([Vt Vk]); % *** sloppy
+          end
+        end
+
+        % restricted (local) Jacobian
+        U = J*W;
+
+        % Newton direction for local problem
+        ea = W*(U\(-r)); % *** sloppy(?)
+        [x,nfun] = l_linesearch(lines,x0,J,r,normr,ea, ...
+                                nfun,fun,varargin);
+
+        % next residual and Jacobian
+        [r,J] = feval(fun,x,varargin{:});
+        nfun = nfun+1; njac = njac+1;
+        normr0 = normr;
+        normr = norm(r);
+ 
+        % should we stop?
+        if all(abs(r) <= rcmp) || ...
+              normr > opts.accminfac*normr0 || ...
+              nfun >= opts.maxfunevals
+          break;
+        end
+        x0 = x;
+      end
+
+      % restore old solution
+      x0 = xold;
+
+      % add the step thus produced to the trace space
+      step = x-x0;
+      if size(Vt,2) == opts.tracedim
+        Vt = [Vt(:,2:end) step];
+      else
+        Vt = [Vt step];
+      end
+    end
+
+    % diagnostics
+    if opts.report
+      disp(sprintf([' %5.0f      %5.0f      %5.0f       ' ...
+                    ' %0.6e   %0.6e'], ...
+                   nliniter,nfun,njac,normr,norm(step)));
+    end
+
+    % stopping criteria
+    if  all(abs(r) <= rcmp) || ...
+          all(abs(step) <= opts.atolx+opts.rtolx*abs(x)) || ...
+          nfun >= opts.maxfunevals
+      break;
+    end
+
+    % cycle solution for next round
+    x0 = x;
+  end
+end
+
+% various outputs
+if nargout > 1
+  fx = r;
+  if nargout > 2
+    flag = 0;
+    if nfun >= opts.maxfunevals
+      flag = 3;
+    elseif nliniter >= opts.maxnlinit
+      flag = 2;
+    elseif any(abs(r) > rcmp)
+      flag = 1;
+    end
+    if nargout > 3
+      out = struct('algorithm',opts.algorithm, ...
+                   'iterations',nliniter, ...
+                   'funcCount',nfun, ...
+                   'jacCount',njac);
+      if nargout > 4
+        jac = J;
+      end
+    end
+  end
+end
+
+%--------------------------------------------------------------------------
+function [x,fc] = l_linesearch(lines,xold,J,r,normr,eo,fcold,fun,fargs)
+%L_LINESEARCH Linesearch algorithm.
+%   [X,FC] = L_LINSEARCH(LINES,XOLD,J,R,NORMR,EO,FCOLD,FUN,FARGS)
+%   performs an optimal scaling of the search direction EO and then
+%   determines X = X0+LAMBDA*EO for which the norm of the function FUN
+%   (with additional arguments FARGS) has decreased
+%   sufficiently. LINES is a linesearch algorithm, J is the Jacobian
+%   at XOLD, R and NORMR the residual and norm of the residual. FCOLD
+%   is the function count before the call; it is increased for every
+%   call to FUN and the accumulated number of calls is returned in FC.
+
+% *** scaling really needed for AN/XAN/AIN/XAIN?
+% *** factor 1/2 really needed for the norm? (kept for the sake of poly)
+
+% straightforward
+Je = J*eo;
+mrtJe = -r'*Je;
+normJe = norm(Je);
+eta = mrtJe/(normr*normJe);
+e = (mrtJe/(normJe*normJe))*eo;
+grad = -(mrtJe/normJe)^2;
+
+[x,fc] = feval(lines,xold,normr^2/2,grad,e, ...
+               fcold,fun,fargs,0.5*eta*eta);
+%--------------------------------------------------------------------------
+function [x,fc] = l_linesearch_bisect(xold,fold,dfold,e, ...
+                                      fcold,fun,fargs,eta2)
+%L_LINESEARCH_BISECT Linesearch by bisection.
+
+fc = fcold; % count the number of residuals evaluated
+lambda = 1; % start by trying the full step
+
+% backtrack by bisection
+while lambda > eps
+  x = xold+lambda*e;
+  f = norm(feval(fun,x,fargs{:}))^2/2;
+  fc = fc+1;
+  if f <= (1-lambda*eta2)*fold, return; end
+  lambda = 0.5*lambda;
+end
+
+% otherwise we are probably on a local minimum -- this should be
+% handled by the caller by monitoring the residual
+warning('Linesearch direction does not appear to be a descent.');
+%--------------------------------------------------------------------------
+function [x,fc] = l_linesearch_poly(xold,fold,dfold,e, ...
+                                    fcold,fun,fargs,eta2)
+%L_LINESEARCH_POLY Linesearch by polynomial interpolation.
+
+% first try the full Newton step...
+lambda1 = 1;
+x = xold+e;
+f1 = norm(feval(fun,x,fargs{:}))^2/2;
+fc = fcold+1;
+
+% this much decrease is sufficient
+if f1 <= (1-eta2)*fold, return; end
+
+% ...then backtrack using a quadratic form...
+lambda2 = -dfold/(2*(f1-fold-dfold));
+lambda2 = min(lambda2,0.5);
+lambda2 = max(lambda2,0.1);
+x = xold+lambda2*e;
+f2 = norm(feval(fun,x,fargs{:}))^2/2;
+fc = fc+1;
+if f2 <= (1-lambda2*eta2)*fold, return; end
+
+% ...otherwise backtrack using a cubic form
+while lambda2 > eps
+  % solve for certain cubic coefficients
+  ab = ([1 -1; -lambda1 lambda2]* ...
+        [(f2-dfold*lambda2-fold)/lambda2^2; ...
+         (f1-dfold*lambda1-fold)/lambda1^2])./(lambda2-lambda1);
+
+  % cycle the old values
+  lambda1 = lambda2;
+  f1 = f2;
+
+  % compute next lambda safely
+  if ab(1) ~= 0
+    lambda2 = ab(2)^2-3*ab(1)*dfold;
+    if lambda2 <= 0
+      lambda2 = 1;
+    elseif ab(2) <= 0
+      lambda2 = (sqrt(lambda2)-ab(2))/(3*ab(1));
+    else
+      lambda2 = -dfold/(sqrt(lambda2)+ab(2));
+    end
+  elseif ab(2) ~= 0
+    lambda2 = -dfold/(2*ab(2));
+  else
+    lambda2 = -fold/dfold;
+  end
+
+  % enforce bounds
+  lambda2 = min(lambda2,0.5*lambda1);
+  lambda2 = max(lambda2,0.1*lambda1);
+
+  x = xold+lambda2*e;
+  f2 = norm(feval(fun,x,fargs{:}))^2/2;
+  fc = fc+1;
+  if f2 <= (1-lambda2*eta2)*fold, return; end
+end
+
+% otherwise we are probably on a local minimum -- this should be
+% handled by the caller by monitoring the residual
+warning('Linesearch direction does not appear to be a descent.');
+%--------------------------------------------------------------------------
diff --git a/stenglib/Scicomp/gausspd.m b/stenglib/Scicomp/gausspd.m
new file mode 100644
index 00000000..d4bcad1e
--- /dev/null
+++ b/stenglib/Scicomp/gausspd.m
@@ -0,0 +1,96 @@
+function [A,B] = gausspd(name,x,n,p1,p2,p3)
+%GAUSSPD Orthogonal polynomials generated by discrete inner products.
+%   [A,B] = GAUSSPD(NAME,X,N,P1,P2,...) returns suitable matrices A and B
+%   for usage with CLENSHAW for the polynomials with name NAME with
+%   parameters P1, P2, ...
+%
+%   X is a vector with coordinates and N is the maximum order of the
+%   polynomials.
+%
+%   The polynomials are normalized to unit norm for computational
+%   reasons. In addition, the measure has also been normalized to unity.
+%
+%   No error-checking is performed.
+%
+%   Name          Parameter(s)    Weight
+%   -------------------------------------------------------------
+%   'charlier'    a               exp(-a) a^x/x!
+%   'krawtchouk'  p, N            bin(N,x) p^x (1-p)^(N-x)
+%                                 (x = 0..N)
+%   'meixner'     c, b            (1-c)^b c^x (b)_x/x!
+%   'chebyshev'   N               1/N (x = 0..N-1)
+%   'hahn'        a, b, c         gamma(c-a)gamma(c-b)   ...
+%                                 /gamma(c)/gamma(c-a-b) ...
+%                                 (a)_x (b)_x/(c)_x/x!
+%
+%   Here bin(N,x) is the binomial coefficients and (a)_x is Pochhammer's
+%   symbol. Use GAUSSQD to check the sanity of the parameters P1, P2, ...
+%
+%   Examples:
+%     x = linspace(0,15);
+%     n = 5; a = 5;
+%     [A,B] = gausspd('charlier',x,n,a);
+%     y = clenshaw(A,B);
+%     figure, plot(x,y); title('Charlier polynomials')
+% 
+%     x = linspace(5,20);
+%     c = 0.4; b = 3;
+%     [A,B] = gausspd('meixner',x,n,c,b);
+%     y = clenshaw(A,B);
+%     figure, plot(x,y); title('Meixner polynomials')
+%
+%   See also CLENSHAW, GAUSSQD.
+
+% S. Engblom 2006-11-06 (Revision)
+% S. Engblom 2006-01-12
+
+switch name
+ case 'charlier'
+  x = reshape(x,1,[]);
+  nn = reshape(1:n,n,1);
+  sn = sqrt(nn);
+  sa = 1./(sqrt(p1)*sn);
+  A = [ones(size(x)); ...
+       tsum(sa.*(nn+(p1-1)),-sa*x,[1],[1 2])];
+  B = -[0; [0; sn(1:end-1)]./sn];
+ case 'krawtchouk'
+  [A,B] = gausspd('meixner',x,n,-p1/(1-p1),-p2);
+ case 'meixner'
+  x = reshape(x,1,[]);
+  nn = reshape(1:n,n,1);
+  nb = sqrt((nn+p2).*(nn+1));
+  sc = sqrt(p1);
+  A = [ones(size(x)); ...
+       tsum((p1*(n+p2)+n)./(sc*nb),(p1-1)/sc./nb*x,[1],[1 2])];
+  B = -[0; [0; nb(1:end-1)]./nb];
+ case 'chebyshev'
+  [A,B] = gausspd('hahn',x,n,1-p1,1,1-p1);
+ case 'hahn'
+  x = reshape(x,1,[]);
+  nn = reshape(0:n,n+1,1);
+
+  w = p3-p1-p2;
+  ww = w-2*nn;
+  w1 = ww-1;
+  w2 = ww+1;
+
+  An = -(ww.*w1)./((w-nn).*(p1+nn).*(p2+nn));
+  % *** hopefully, the only singular special case...
+  if w == -1
+    num = nn.*(w+p1-nn).*(w+p2-nn).*w1;
+    den = (w-nn).*(p1+nn).*(p2+nn).*w2;
+    Cn = [0; num(2:end)./den(2:end)];
+  else
+    Cn = (nn.*(w+p1-nn).*(w+p2-nn).*w1)./((w-nn).*(p1+nn).*(p2+nn).*w2);
+  end
+  A = [ones(size(x)); ...
+       tsum(1+Cn(1:end-1),An(1:end-1)*x,[1],[1 2])];
+  B = -[0; Cn(1:end-1)];
+  % *** implementation of normalization is sub-optimal
+  nrm = realsqrt([1; cumprod(Cn(2:end).*An(1:end-1)./An(2:end))]);
+  A = tprod(A,[1; nrm(1:end-1)./nrm(2:end)],[1 2],[1]);
+  B = B.*[1; 1; nrm(1:end-2)./nrm(3:end)];
+ 
+ otherwise
+  error('Unknown polynomial.');
+end
diff --git a/stenglib/Scicomp/gaussqd.m b/stenglib/Scicomp/gaussqd.m
new file mode 100644
index 00000000..e5d63b66
--- /dev/null
+++ b/stenglib/Scicomp/gaussqd.m
@@ -0,0 +1,178 @@
+function [x,w,y] = gaussqd(name,n,p1,p2,p3)
+%GAUSSQD Quadratures for discrete measures.
+%   [X,W] = GAUSSQD(NAME,N,P1,P2,...) returns abscissas X and weights W
+%   for the Nth discrete quadrature associated with the polynomial NAME
+%   using the parameters P1, P2, ...
+%
+%   The resulting summation formula is
+%     sum_0^inf f(x) weight(x) = sum_i f(x(i))*w(i),
+%   where the weight-function is defined in GAUSSPD.
+%
+%   [X,W,Y] = GAUSSQD(NAME,N,P1,P2,...) additionally returns an N-by-N
+%   matrix Y containing values of the normalized polynomials of order < N
+%   in all the quadrature points X. Each column Y(:,i) contains the values
+%   of the polynomials evaluated at X(i).
+%
+%   Unlike GAUSSPD, GAUSSQD carefully checks the parameters P1, P2, ... to
+%   see if the corresponding set of polynomials makes sense. The
+%   conditions put on the coefficients are that enough moments exist and
+%   that the measure is strictly positive. In addition, the order of the
+%   polynomial must be bounded whenever the natural domain of summation is
+%   limited.
+%
+%   Reference:
+%     [1] S. Engblom: "Gaussian quadratures with respect to discrete
+%     measures". Technical Report 2006-007, Dept of Information
+%     Technology, Uppsala University, 2006. Available at
+%     http://www.it.uu.se/research.
+%
+%   Example:
+%     f = inline('(x.^5+2*x.^4+3*x.^3+4*x.^2+5*x)');
+%     [x,w] = gaussqd('charlier',2,1); sum1 = f(x)'*w
+%     [x,w] = gaussqd('charlier',3,1); sum2 = f(x)'*w
+%     [x,w] = gaussqd('charlier',4,1); sum3 = f(x)'*w
+%
+%   See also GAUSSPD.
+
+% S. Engblom 2006-01-12
+
+if ~isscalar(n) || n ~= ceil(n) || n <= 0
+  error('Order of quadrature must be a positive integer.');
+end
+
+wmsg = 'Parameter(s) out of range. Quadrature may not make sense.';
+switch name
+ case 'charlier'
+  if nargin ~= 3 || ~isscalar(p1)
+    error('Parameter of Charlier polynomial must be a single scalar.');
+  end
+  ok = p1 > 0;
+  if ~ok, warning(wmsg); end
+
+  % construct a symmetric positive definite and tridiagonal matrix J
+  nn = 1:n;
+  b = -sqrt(nn*p1);
+  b(end) = 0;
+  J = fsparse([[1 1:n-1]; nn; [2:n n]],nn, ...
+              [b([end 1:end-1]); nn+p1-1; b],[],'nosort');
+
+  % eigenvalues and -vectors
+  [y,x] = eig(full(J));
+
+  % the abscissas are the eigenvalues of J
+  x = diag(x);
+
+  % the weights can be found from the first element of each eigenvector
+  w = reshape(y(1,:).^2,[],1);
+
+  % the values of the polynomials themselves can be found from the
+  % eigenvectors (normalizing the first element)
+  if nargout > 2
+    y = tprod(y,1./y(1,:),[1 2],[3 2]);
+  end
+
+ case 'krawtchouk'
+  if nargin ~= 4 || ~isscalar(p1) || ~isscalar(p2)
+    error('Parameters of Krawtchouk polynomial must be two scalars.');
+  end
+  [x,w,y] = gaussqd('meixner',n,-p1/(1-p1),-p2);
+
+ case 'meixner'
+  if nargin ~= 4 || ~isscalar(p1) || ~isscalar(p2)
+    error('Parameters of Meixner polynomial must be two scalars.');
+  end
+
+  % Conditions on (1) convergence of the inner product, (2) the measure
+  % being positive, (3) when p2 is a non-positive integer (i.e. the
+  % measure has finite support), then n must be sufficiently small.
+  xmax = inf;
+  if p2 <= 0 && p2 == ceil(p2)
+    xmax = -p2;
+  end
+  pp = ceil([-p2 1-p2 -1-p2]);
+  pp = min(max(0,pp),xmax);
+  pp = sign(p1.*(p2+pp-1)).^pp;
+  ok = xmax < inf || abs(p1) < 1 || abs(p1) == 1 && -p2 >= 2*n;
+  ok = ok && all(pp > 0);
+  ok = ok && n <= xmax+1;
+  if ~ok, warning(wmsg); end
+
+  nn = 1:n;
+  b = -sqrt(p1*nn.*(p2+nn-1));
+  b(end) = 0;
+  J = fsparse([[1 1:n-1]; nn; [2:n n]],nn, ...
+              [b([end 1:end-1]); (p1+1)*(nn-1)+p1*p2; b],[],'nosort');
+
+  [y,x] = eig(full(J./(1-p1)));
+
+  x = diag(x);
+  w = reshape(y(1,:).^2,[],1);
+
+  if nargout > 2
+    y = tprod(y,1./y(1,:),[1 2],[3 2]);
+  end
+
+ case 'chebyshev'
+  if nargin ~= 3 || ~isscalar(p1)
+    error('Parameter of Chebyshev polynomial must be a single scalar.');
+  end
+  [x,w,y] = gaussqd('hahn',n,1-p1,1,1-p1);
+
+ case 'hahn'
+  if nargin ~= 5 || ~isscalar(p1) || ~isscalar(p2) || ~isscalar(p3)
+    error('Parameters of Hahn polynomial must be three scalars.');
+  end
+  % Conditions on (1) convergence of the inner product, (2) the measure
+  % being positive, (3) the denominator not containing zeros and (4) when
+  % p1 and/or p2 are non-positive integers (i.e. the measure has finite
+  % support), then n must be sufficiently small.
+  xmax = inf;
+  if p1 <= 0 && p1 == ceil(p1)
+    xmax = -p1;
+  end
+  if p2 <= 0 && p2 == ceil(p2)
+    xmax = min(xmax,-p2);
+  end
+  pp = [p1 p2 p3];
+  pp = ceil([-pp 1-pp -1-pp]);
+  pp = min(max(0,pp),xmax);
+  pp = sign((p1+pp-1).*(p2+pp-1).*(p3+pp-1)).^pp;
+  ok =  xmax < inf || p3-p1-p2 >= 2*n;
+  ok = ok && all(pp > 0);
+  ok = ok && (p3 > 0 || p3 == ceil(p3) && xmax <= -p3);
+  ok = ok && n <= xmax+1;
+  if ~ok, warning(wmsg); end
+
+  nn = 1:n;
+  w = p3-p1-p2;
+  w1 = w-2*nn+1;
+  if w == -1
+    % Special case for essentially the Gauss-Chebyshev formula. It's a guess
+    % that this is the only singular case.
+    a = [p1*p2/(w-1) ...
+        polyval([1-p1-p2-p3 -(1-p1-p2-p3)*w (1+w)*p1*p2],nn(2:end)-1)./ ...
+        (w1(2:end).*(w1(2:end)+2))];
+  else
+    a = polyval([1-p1-p2-p3 -(1-p1-p2-p3)*w (1+w)*p1*p2],nn-1)./(w1.*(w1+2));
+  end
+  n2 = 1:n-1;
+  w2 = w-2*n2;
+  b = [-sqrt(n2.*(w+1-n2).* ...
+             (p1+n2-1)./(w2+1).* ...
+             (p2+n2-1)./(w2+1).* ...
+             (p1+w-n2)./(w2+2).* ...
+             (p2+w-n2)./w2) 0];
+  J = fsparse([[1 1:n-1]; nn; [2:n n]],nn, ...
+              [b([end 1:end-1]); a; b],[],'nosort');
+
+  [y,x] = eig(full(J));
+
+  x = diag(x);
+  w = reshape(y(1,:).^2,[],1);
+
+  if nargout > 2
+    y = tprod(y,1./y(1,:),[1 2],[3 2]);
+  end
+ otherwise
+  error('Unknown type of polynomial.');
+end
diff --git a/stenglib/Scicomp/nmsimplex.m b/stenglib/Scicomp/nmsimplex.m
new file mode 100644
index 00000000..88eae4f4
--- /dev/null
+++ b/stenglib/Scicomp/nmsimplex.m
@@ -0,0 +1,329 @@
+function [x,fx,flag] = nmsimplex(fun,x0,opts,varargin)
+%NMSIMPLEX Nelder-Meads simplex-algorithm.
+%   X = NMSIMPLEX(FUN,X0) starts at the initial guess X0 and tries to
+%   minimize the objective function FUN. FUN has the signature Y =
+%   FUN(X,...) and returns a scalar value.
+%
+%   X = NMSIMPLEX(FUN,X0,OPTS) solves with the default parameters replaced
+%   by the values in the structure OPTS. The syntax OPTDEF = NMSIMPLEX
+%   returns the default options and the syntax OPTHELP = NMSIMPLEX('opts')
+%   produces a short description of available options (see also the table
+%   below).
+%
+%   X = NMSIMPLEX(FUN,X0,OPTS,P1,P2,...) passes the parameters P1, P2,
+%   ... directly to the function FUN; FUN(X,P1,P2,...).
+%
+%   [X,FX] = NMSIMPLEX(FUN,X0,...) returns the value of the objective
+%   function at X.
+%
+%   [X,FX,FLAG] = NMSIMPLEX(FUN,X0,...) returns a number FLAG that
+%   describes the exit condition of NMSIMPLEX.
+%   If FLAG is:
+%      0 then NMSIMPLEX converged to a solution X,
+%      1 then NMSIMPLEX halted at a flat local minimum of the objective
+%        function,
+%      2 then the maximum number of function evaluations was
+%        reached,
+%      3 then the report function returned non-zero (see below).
+%
+%   The convergence criterion used works as follows:
+%     if all(abs(xstep) <= opts.atolx+opts.rtolx*abs(xmin))
+%       then exit with FLAG = 0,
+%     elseif ystep <= opts.atolf+opts.rtolf*abs(ymin)
+%       then exit with FLAG = 1.
+%   Here (xmin,ymin) is the current best approximation and (xstep,ystep)
+%   is the step just taken.
+%
+%   Property    Value/{Default}           Description
+%   -----------------------------------------------------------------------
+%   init        Scalar {0.25} | Simplex   A scalar indicating an
+%                                         estimate of the error in the 
+%                                         initial guess used to produce the
+%                                         initial simplex. Alternatively, 
+%                                         an ND-by-ND matrix where each
+%                                         column defines a corner in the 
+%                                         initial simplex. Together with 
+%                                         the initial guess X0 this must 
+%                                         define a feasible simplex.
+%
+%   maxfunevals integer > 0 {1000}        Maximum number of function 
+%                                         evaluations.
+%
+%   report      {'on'} | 'off'            Print diagnostics.
+%   reportfun   Function {[]}             Report function, see below.
+%
+%   atolx       Scalar {1e-8}             Absolute tolerance in solution.
+%   rtolx       Scalar {1e-6}             Relative tolerance in solution.
+%
+%   atolf       Scalar {1e-14}            Absolute tolerance in function.
+%   rtolf       Scalar {1e-12}            Relative tolerance in function.
+%
+%   lb          Scalar | Vector {-Inf}    Lower bound in each dimension.
+%   ub          Scalar | Vector {Inf}     Upper bound in each dimension.
+%
+%   The report function is called according to the three syntaxes STATUS =
+%   reportfun([t(0) -1],suggested title,'init'), STATUS =
+%   reportfun(t,xsimplex,[],...), and, STATUS =
+%   reportfun(t,xsimplex,'done',...). The "time" is defined by t =
+%   -max(abs(xstep)./(opts.atolx+opts.rtolx*abs(xmin))). An occasional
+%   useful feature is that the minimization is automatically aborted
+%   whenever the returned value STATUS is non-zero.
+%
+%   Example:
+%     % Rosenbrock function
+%     f = @(x)((1-x(1,:)).^2+100*(x(2,:)-x(1,:).^2).^2);
+%     [x,y] = meshgrid(linspace(-2,2,40),linspace(-1,3));
+%     figure, contour(x,y,reshape(f([x(:)'; y(:)']),size(x)),30),
+%     hold on
+%
+%     % reporter to plot convergence (could also be an .m-file function!)
+%     iif = @(varargin)(varargin{2*find([varargin{1:2:end}],1,'first')}());
+%     simplex_report = @(t,x,s)(iif(~ischar(x), ...
+%       @()0*plot(x(1,[1:end 1]),x(2,[1:end 1]),'b.-'), ...
+%       true,@()0));
+%
+%     % solve
+%     x = nmsimplex(f,[-1 2],{'reportfun' simplex_report})
+%
+%   See also FMINSEARCH, FSOLVE, REPORT.
+
+% S. Engblom 2014-08-28 (Revision, reportfun and plot example)
+% S. Engblom 2006-09-19 (Revision)
+% S. Engblom 2006-02-03
+
+% *** stat counter of some kind?
+
+% special call: existing options
+if nargin == 1 & ischar(fun) & strcmpi(fun,'opts')
+  optdef.init = 'scalar | simplex {0.25}'
+  optdef.maxfunevals = 'integer > 0 {1000}';
+  optdef.report = '{''on''} | ''off''';
+  optdef.reportfun = 'Function {[]}';
+  optdef.atolx = 'scalar {1e-8}';
+  optdef.rtolx = 'scalar {1e-6}';
+  optdef.atolf = 'scalar {1e-14}';
+  optdef.rtolf = 'scalar {1e-12}';
+  optdef.lb = 'scalar | vector {-inf}';
+  optdef.ub = 'scalar | vector {inf}';
+  x = optdef;
+  return;
+end
+
+% default options
+optdef.init = 0.25;
+optdef.maxfunevals = 1000;
+optdef.report = 'on';
+optdef.reportfun = [];
+optdef.atolx = 1e-8;
+optdef.rtolx = 1e-6;
+optdef.atolf = 1e-14;
+optdef.rtolf = 1e-12;
+optdef.lb = -inf;
+optdef.ub = inf;
+
+% special call: return defaults
+if nargin == 0, x = optdef; return; end
+
+% parse options
+if nargin < 3
+  opts = optdef;
+else
+  opts = parseopts(optdef,opts);
+end
+
+% initial guess
+x0 = x0(:);
+nd = size(x0,1);
+opts.lb = opts.lb(:);
+if isscalar(opts.lb)
+  opts.lb = frepmat(opts.lb,nd);
+end
+opts.ub = opts.ub(:);
+if isscalar(opts.ub)
+  opts.ub = frepmat(opts.ub,nd);
+end
+if any(x0 < opts.lb) || any(opts.ub < x0)
+  error('Initial guess not feasible.');
+end
+
+nfun = 0;
+if isscalar(opts.init)
+  % try creating a non-degenerate simplex
+  r = opts.init*abs(x0)+opts.atolx;
+  x1 = tsum(x0,diag(r),[1],[1 2]);
+  y1 = zeros(1,size(x1,2));
+  for i = 1:size(x1,2)
+    x1(:,i) = min(max(x1(:,i),opts.lb),opts.ub);
+    y1(i) = feval(fun,x1(:,i),varargin{:});
+  end
+  nfun = nfun+size(x1,2);
+
+  x2 = tsum(x0,diag(-r),[1],[1 2]);
+  y2 = zeros(1,size(x2,2));
+  for i = 1:size(x2,2)
+    x2(:,i) = min(max(x2(:,i),opts.lb),opts.ub);
+    y2(i) = feval(fun,x2(:,i),varargin{:});
+  end
+  nfun = nfun+size(x2,2);
+
+  % use best corners for the initial guess
+  [foo,i] = min([y1; y2]);
+  x = [x1(:,i == 1) x2(:,i == 2)];
+  y = [y1(i == 1) y2(i == 2)];
+
+  % add user's corner...
+  x = [x x0];
+  y = [y feval(fun,x0,varargin{:})];
+  nfun = nfun+1;
+else
+  % intial simplex supplied by user
+  x = [x0 opts.init];
+  y = zeros(1,size(x,2));
+  for i = 1:size(x,2)
+    x(:,i) = min(max(x(:,i),opts.lb),opts.ub);
+    y(i) = feval(fun,x(:,i),varargin{:});
+  end
+  nfun = nfun+size(x,2);
+end
+
+if size(x,2) ~= nd+1 || rank(tsum(x(:,2:end),-x(:,1),[1 2],[1])) < nd
+  error(['Degenerate initial simplex. Either provide a feasible initial ' ...
+         'simplex or try a smaller scalar option .init.']);
+end
+
+opts.report = strcmpi(opts.report,'on');
+if opts.report
+  disp(' Iteration  Func-count   Func-value      |step|');
+end
+
+% find best and worst values
+[ymin,imin] = min(y);
+[ymax,imax] = max(y);
+xmin = x(:,imin);
+xmax = x(:,imax);
+ymax2 = max(y([1:imax-1 imax+1:end]));
+
+% setup reporter
+if ischar(opts.reportfun)
+  reportfun = str2func(opts.reportfun);
+else
+  reportfun = opts.reportfun;
+end
+if ~isempty(reportfun)
+  status = reportfun([-max(abs(xmax-xmin)./(opts.atolx+opts.rtolx*abs(xmin))) -1], ... 
+                     'Solution progress...','init');
+else
+  % void function
+  reportfun = @report;
+  status = reportfun([],[],'none');
+end
+
+nit = 0;
+while 1
+  if status ~= 0
+    fprintf('Reporter returned non-zero status = %d. Bailing out.\n',status);
+    status = reportfun(-max(abs(xmax-xmin)./(opts.atolx+opts.rtolx*abs(xmin))), ...
+                       x,'done',varargin{:});
+    x = xmin;
+    if nargout > 1
+      fx = ymin;
+      if nargout > 2
+        flag = 3;
+      end
+    end
+    return;
+  end
+  nit = nit+1;  
+
+  xcen = (-xmax+sum(x,2))/nd;
+  xref = min(max(2*xcen-xmax,opts.lb),opts.ub);
+  yref = feval(fun,xref,varargin{:});
+  nfun = nfun+1;
+
+  % form new simplex
+  if ymin > yref
+    % xref seems promising; try to extrapolate
+    xexp = min(max(2*xref-xcen,opts.lb),opts.ub);
+    yexp = feval(fun,xexp,varargin{:});
+    nfun = nfun+1;
+    if yexp < yref
+      % successful, good decrease
+      x(:,imax) = xexp;
+      y(imax) = yexp;
+    else
+      % a fair decrease anyway...
+      x(:,imax) = xref;
+      y(imax) = yref;
+    end
+  elseif ymax2 > yref
+    % intermediate cost; slight improvement
+    x(:,imax) = xref;
+    y(imax) = yref;
+  else
+    % high cost; local contraction
+    if ymax <= yref
+      x(:,imax) = 0.5*(xmax+xcen);
+    else
+      x(:,imax) = 0.5*(xref+xcen);
+    end
+
+    % new evaluation
+    y(imax) = feval(fun,x(:,imax),varargin{:});
+    nfun = nfun+1;
+
+    % still not an improvement: try multiple contraction around the
+    % best corner
+    if y(imax) > ymax
+      for i = [1:imin-1 imin+1:size(x,2)]
+        x(:,i) = 0.5*(x(:,i)+xmin);
+        y(i) = feval(fun,x(:,i),varargin{:});
+      end
+      nfun = nfun+nd;
+    end
+  end
+
+  % update best and worst values
+  [ymin,imin] = min(y);
+  [ymax,imax] = max(y);
+  xmin = x(:,imin);
+  xmax = x(:,imax);
+  ymax2 = max(y([1:imax-1 imax+1:end]));
+
+  % report progress
+  xstep = xmax-xmin;
+  ystep = ymax-ymin;
+  if opts.report
+    fprintf(1,' %5.0f       %5.0f       %0.6e    %0.6e\n', ...
+            nit,nfun,ymin,norm(xstep));
+  end
+  status = reportfun(-max(abs(xstep)./(opts.atolx+opts.rtolx*abs(xmin))),x,'');
+
+  % convergence criteria
+  ok = all(abs(xstep) <= opts.atolx+opts.rtolx*abs(x(:,imin)));
+  if ok || abs(ystep) <= opts.atolf+opts.rtolf*abs(ymin) || ...
+        nfun >= opts.maxfunevals
+    break;
+  end
+end
+
+% various outputs
+x = xmin;
+if nargout > 1
+  fx = ymin;
+  if nargout > 2
+    flag = 0;
+    if nfun >= opts.maxfunevals
+      flag = 2;
+    elseif ~ok
+      flag = 1;
+    end
+  end
+elseif nargout < 3
+  if nfun >= opts.maxfunevals
+    warning('The maximum number of function evaluations was reached.');
+  elseif ~ok
+    warning('Halted at a flat local minimum of the objective function.');
+  end
+end
+
+status = reportfun(-max(abs(xstep)./(opts.atolx+opts.rtolx*abs(xmin))),x,'done');
diff --git a/stenglib/Scicomp/ode1s.m b/stenglib/Scicomp/ode1s.m
new file mode 100644
index 00000000..7c05ded9
--- /dev/null
+++ b/stenglib/Scicomp/ode1s.m
@@ -0,0 +1,654 @@
+function [t,y] = ode1s(fun,tspan,y0,opts,varargin)
+%ODE1S Solve stiff and strongly nonlinear ODE. Split-step Euler method.
+%   [T,Y] = ODE1S(FUN,[TSTART TEND],Y0,OPTS,...) solves the ODE dY/dt
+%   = FUN(t,Y,...) from TSTART to TEND with initial data Y0. OPTS is a
+%   structure with options, see the table below.
+%
+%   For the right-hand side, the special syntax [dYimpl,dYexpl] =
+%   FUN(tnew,Ynew,told,Yold,...) is used. Here Ynew(Yold) is the
+%   latest iterate(solution) at the next(previous) step. This syntax
+%   makes it easy to implement split-step semi-implicit methods. With
+%   only one output, dYimpl = FUN(tnew,Ynew,told,Yold,...) is
+%   understood.
+%
+%   The method is thus a forward/backward Euler combination. The error
+%   is estimated by taking half-steps and a step-size control strategy
+%   based on digital filters is used.
+%
+%   SOL = ODE1S(...) is also an allowed syntax, returning a solution
+%   structure instead.
+%
+%   Field        Value/{Default}        Description
+%   -----------------------------------------------------------------------
+%   atol         Vector/scalar {1e-6}   Absolute tolerance.
+%   rtol         Vector/scalar {1e-4}   Relative tolerance.
+%
+%   adaptivity   {'eps'} | 'epus' |     Error per step, per unit step, or
+%                'eq'                   equilibrium control.
+%   errnorm1     Function {@abs}        Inner and outer error-norms, see 
+%   errnorm2     Function {@norm}       below.
+%
+%   initialstep  Scalar {0}             The initial step, zero means that
+%                                       this is determined automatically.
+%   maxstep      Scalar {inf}           Maximum step, 
+%   minstep      Scalar {0}             minimum step.
+%
+%   nlinauto     {1} | 2 | 3            Tuning of the nonlinear solver. A
+%                                       higher value means a more
+%                                       suspicious solver, prepared to take 
+%                                       more drastic measures when
+%                                       poor convergence is detected.
+%
+%   reportfun    Function {[]}          Report function, see below.
+%   info         {0} | 1 | 2            Level of information. 0:
+%                                       only serious errors, 1:
+%                                       include warnings, 2:
+%                                       include final statistics
+%                                       (not yet implemented).
+%
+%   In EPS-adaptivity, the control objective is to keep
+%     errnorm2(TOL./ERR) = 1,
+%   where in terms of the tentative solution y,
+%     TOL = max(rtol.*errnorm1(y),atol),
+%     ERR = errnorm1(error estimate).
+%   The default choice is errnorm1(z) = abs(z) and errnorm2(z) =
+%   norm(z), and is the usual component-wise control. An alternative
+%   is the less stringent "norm control" defined by errnorm1(z) =
+%   norm(z) and errnorm2(z) = abs(z). Other variants are easily
+%   constructed. Note that the norms are automatically normalized to
+%   unit measure so that errnorm2(errnorm1(ones(size(solution
+%   vector)))) = 1.
+%
+%   In EPUS-adaptivity, one rather uses
+%     TOL = h*max(rtol.*errnorm1(FUN(y)),atol),
+%   while in EQ-adaptivity,
+%     TOL = max(rtol.*errnorm1(y-y0),atol),
+%   ensuring small steps near an equilibrium.
+%
+%   The report function is called according to the three
+%   self-explaining syntaxes STATUS = reportfun([TSTART
+%   TEND],suggested title,'init'), STATUS = reportfun(t,y,[],...),
+%   and, STATUS = reportfun(t,y,'done',...). An occasional useful
+%   feature is that the integration is automatically aborted whenever
+%   the returned value STATUS is non-zero.
+%
+%   See also ODE15S, ODE23S.
+
+% S. Engblom 2012-09-18 (info added)
+% S. Engblom 2010-08-25 (Port from earlier code)
+% S. Engblom 2007-03-13
+
+yy = y0(:);
+ndof = size(yy,1);
+
+% default options
+optdef.atol = 1e-6;
+optdef.rtol = 1e-4;
+optdef.adaptivity = 'eps';
+optdef.errnorm1 = @abs;
+optdef.errnorm2 = @norm;
+optdef.initialstep = 0;
+optdef.maxstep = inf;
+optdef.minstep = 0;
+optdef.nlinauto = 1;
+optdef.reportfun = [];
+optdef.info = 0;
+
+% merge defaults with actual inputs
+if nargin > 3
+  opts = parseopts(optdef,opts);
+else
+  opts = optdef;
+end
+assignopts(opts);
+info = opts.info; % (silly, but there is a Matlab-function 'info'...)
+
+% EPS/EPUS/EQ control
+[foo,adaptivity] = fsetop('ismember',{adaptivity},{'eps' 'epus' 'eq'});
+if adaptivity == 0
+  error('Unknown type of adaptivity.');
+end
+
+% normalize the norms...
+NM = 1/feval(errnorm2,(feval(errnorm1,(ones(ndof,1)))));
+
+% setup reporter
+if ischar(reportfun)
+  reportfun = str2func(reportfun);
+end
+if ~isempty(reportfun)
+  status = reportfun([tspan(1) tspan(end)],'Solution progress...','init');
+else
+  reportfun = @report;
+  status = reportfun([],[],'none');
+end
+% *** stat counter of some kind?
+
+% step-size controller
+Const = [0 0 1];      % constant step-size
+Ho110 = [1 0 0];      % elementary controller (Deadbeat I)
+H110 = [1/3 0 0];     % I controller (convolution filter)
+PI34 = [7/10 -2/5 0]; % PI controller (Gustafsson)
+PI42 = [3/5 -1/5 0];  % PI controller for nonstiff methods
+H211PI = [1/6 1/6 0]; % 1st order LP filter of PI structure
+gain = 1/4;           % gain in [1/6 1/2] (gain = 1/2 yields Ho211)
+H211b = gain*[1 1 1]; % general purpose 1st order LP filter
+% controller used:
+ctrl = H211b;
+% *** ctrl and Mfac (below) could be options
+
+% (inverse of the) order of the scheme
+switch adaptivity
+ case 1 % EPS
+  order = 1/2; 
+ case {2 3} % EPUS/EQ
+  order = 1;
+end
+
+% rejection logic is based on step-ratios
+Mfac = 2; % rejection whenever err > Mfac*TOL
+
+% reject when FILTER(err) > Mfac*TOL, where FILTER is the product of
+% the controller and the limiters
+rhomin = l_limiter(l_lpfilter(ctrl,l_limiter(1/Mfac^order,2),1,1),1);
+
+% start-up
+cerr = 1;
+rho = 1;
+% *** direct manipulation of the controller's state is done in three
+% places in the code below when the nonlinear solver fails; it is not
+% completely clear how this should be done in the best way
+
+% build options for the nonlinear solver
+if nlinauto == 1
+  nlinopts.kappa = 10^-1.5;       % required extra precision
+  nlinopts.thetaREF = 0.4;        % target convergence factor
+  nlinopts.thetaJAC = 0.4;        % recompute Jacobian
+  nlinopts.thetaLU = 0.4;         % refactorize Jacobian
+  nlinopts.etainit = 0.9/(1-0.9); % (total) initial convergence factor
+  nlinopts.etapow = 0.8;          % power law for reuse of estimate
+  nlinopts.maxiter = 10;          % maximum number of iterations
+  nlinopts.hfac = 0.5;            % for irregular convergence
+elseif nlinauto == 2
+  nlinopts.kappa = 10^-2;
+  nlinopts.thetaREF = 0.3;
+  nlinopts.thetaJAC = 0.3;
+  nlinopts.thetaLU = 0.3;
+  nlinopts.etainit = 0.95/(1-0.95);
+  nlinopts.etapow = 0.7;
+  nlinopts.maxiter = 9;
+  nlinopts.hfac = 0.4;
+  % (nlinauto >= 2 also uses a more conservative initial guess)
+elseif nlinauto == 3
+  nlinopts.kappa = 10^-3;
+  nlinopts.thetaREF = 0.2;
+  nlinopts.thetaJAC = 0.2;
+  nlinopts.thetaLU = 0.2;
+  nlinopts.etainit = 0.95/(1-0.95);
+  nlinopts.etapow = 0.6;
+  nlinopts.maxiter = 8;
+  nlinopts.hfac = 0.3;
+else
+  error('Property ''nlinauto'' must be one of {1 2 3}.');
+end
+
+% solution and allocation logic
+tspan = tspan(:);
+nchunks = 10;
+nchunksmax = 10000;
+t = zeros(1,nchunks);
+t(1) = tspan(1);
+y = zeros(ndof,nchunks);
+y(:,1) = yy;
+i = 1;
+
+% clear persistent data
+l_implsolve(nlinopts);
+
+% initial step
+tt = tspan(1);
+tend = tspan(end);
+[h,TOL] = l_initialstep(rtol,atol,adaptivity,order, ...
+                        NM,errnorm1,errnorm2,tt,yy,fun,varargin{:});
+if initialstep > 0
+  h = initialstep;
+end
+h = max(min(h,maxstep),minstep);
+h2 = 0.5*h;
+
+while 1
+  if h2 < eps(16*(1+abs(tt)))
+    warning(sprintf(['Unable to meet integration tolerances without ' ...
+                     'reducing the step-size below the smallest ' ...
+                     'value allowed. Failure at time t = %f.'],tt));
+    t = t(1:i);
+    y = y(:,1:i);
+    break; % return solution obtained so far
+  end
+
+  % First take the full step y3 = T_1(h) using zero as the initial
+  % approximation for the implicit part.
+  [y3,rhon] = l_step(true,tt,yy,h,[], ...
+                     TOL,NM,errnorm1,errnorm2,nlinopts,fun,varargin{:});
+  if isempty(y3)
+    h = l_limiter(rhon,1)*h; % retry with smaller h
+    h2 = 0.5*h;
+    cerr = 1;
+    rho = 1;
+    if info > 0
+      disp('Nonlinear iteration converged poorly.');
+    end
+    continue;
+    % *** the continue-statement implies that minstep+maxstep are
+    % disregarded!
+  end
+
+  % Then take two half-steps T_1(h/2) and T_2(h/2) using the full step
+  % T_1(h) as a predictor. These steps are usually easier than the
+  % previous full step, so something strange is going on if this does
+  % not converge.
+
+  % y1 = T_1(h/2)
+  if nlinauto < 3
+    % interpolate using the full step
+    [y1,rhon] = l_step(false,tt,yy,h2,0.5*(yy+y3), ...
+                       TOL,NM,errnorm1,errnorm2,nlinopts,fun,varargin{:});
+  else
+    % be more suspicious...
+    [y1,rhon] = l_step(false,tt,yy,h2,[], ...
+                       TOL,NM,errnorm1,errnorm2,nlinopts,fun,varargin{:});
+  end
+  if isempty(y1)
+    h2 = l_limiter(rhon,1)*h2;
+    h = 2*h2; % (step just attempted was h2)
+    cerr = 1;
+    rho = 1;
+    if info > 0
+      warning('Nonlinear iteration did not converge!');
+    end
+    continue;
+  end
+
+  % y2 = T_2(h/2)
+  if nlinauto == 1
+    % predictor by combining the two previous steps
+    [y2,rhon] = l_step(false,tt+h2,y1,h2,y1+0.5*(y3-yy), ...
+                       TOL,NM,errnorm1,errnorm2,nlinopts,fun,varargin{:});
+  elseif nlinauto == 2
+    % using only the full step
+    [y2,rhon] = l_step(false,tt+h2,y1,h2,y3, ...
+                       TOL,NM,errnorm1,errnorm2,nlinopts,fun,varargin{:});
+  else
+    % suspicious...
+    [y2,rhon] = l_step(false,tt+h2,y1,h2,[], ...
+                       TOL,NM,errnorm1,errnorm2,nlinopts,fun,varargin{:});
+  end
+  if isempty(y2)
+    h2 = l_limiter(rhon,1)*h2;
+    h = 2*h2;
+    cerr = 1;
+    rho = 1;
+    if info > 0
+      warning('Nonlinear iteration did not converge!');
+    end
+    continue;
+  end
+
+  % Richardson's estimate...
+  err = errnorm1((y3-y2)/3);
+
+  % control objective
+  switch adaptivity
+   case 1 % EPS
+    TOL = max(rtol.*errnorm1(y2),atol);
+   case 2 % EPUS
+    [dydt,dydtexpl] = fun(tt+h,y2,tt+h,y2,varargin{:});
+    dydt = dydt+dydtexpl;
+    TOL = h*max(rtol.*errnorm1(dydt),atol);
+   case 3 % EQ
+    TOL = max(rtol.*errnorm1(y2-yy),atol);
+  end
+
+  % digital control of step-size
+  cerrold = cerr;
+  cerr = l_limiter(1/(NM*errnorm2(err./TOL)^order),2);
+  rho = l_limiter(l_lpfilter(ctrl,cerr,cerrold,rho),1);
+
+  % acceptance/rejection based on smooth rho
+  if rho >= rhomin
+    % step accepted: record the value of T_2(h/2)
+    tt = tt+h;
+    yy = y2;
+    i = i+1;
+    if i > size(t,2)
+      t = [t zeros(1,nchunks)];
+      y = [y zeros(ndof,nchunks)];
+      nchunks = min(2*nchunks,nchunksmax);
+    end
+    t(i) = tt;
+    y(:,i) = yy;
+    status = reportfun(tt,yy,[],varargin{:});
+
+    % normal end of algorithm is here
+    if tt-tend > -eps(32*abs(tt))
+      status = reportfun(tt,yy,'done',varargin{:});
+      t = t(1:i);
+      y = y(:,1:i);
+      break;
+    elseif status ~= 0
+      fprintf('Reporter returned non-zero status = %d. Bailing out.\n',status);
+      status = reportfun(tt,yy,'done',varargin{:});
+      t = t(1:i);
+      y = y(:,1:i);
+      break;
+    end
+  else
+    if info > 0
+      disp('Failed integration tolerance.');
+      fprintf('   [h = %g, err = %f, cerr = %f, rho = %f]\n', ...
+              h,1/(NM*errnorm2(err./TOL)^order),cerr,rho);
+    end
+  end
+  h = rho*h;
+
+  % in these two cases (not the preferred regime!) we must ensure that
+  % the controller gets at least some kind of 'reasonable' information
+  if h > maxstep
+    rho = rho*maxstep/h;
+    h = maxstep;
+    cerr = 1;
+  elseif h < minstep
+    rho = rho*minstep/h;
+    h = minstep;
+    cerr = 1;
+  end
+  h2 = 0.5*h;
+end
+
+% output solution structure
+if nargout < 2
+  t = struct('x',t,'y',y,'solver','ode1s');
+end
+
+%-------------------------------------------------------------------------
+function [h,TOL] = l_initialstep(rtol,atol,adaptivity,order, ...
+                                 NM,errnorm1,errnorm2,t0,y0,fun,varargin)
+%L_INITIALSTEP Starting step-size and tolerance.
+%   [H,TOL] = L_INITIALSTEP(RTOL,ATOL,...) provides a starting
+%   step-size whenever none is supplied. A value of the effective
+%   tolerance TOL is also returned. It "usually gives a good guess for
+%   the initial step-size... (or at least avoids a very bad choice)"
+%   [1, p. 169]. The algorithm is described in [1, Chap. II.4,
+%   p. 169].
+%
+% Reference:
+%   [1] E. Hairer, S. P. Norsett and G. Wanner: "Solving Ordinary
+%   Differential Equations I, Nonstiff Problems", 2nd ed, Springer
+%   (1993).
+
+% h0 by ensuring that an explicit Euler step is small
+[dydt,dydtexpl] = fun(t0,y0,t0,y0,varargin{:});
+dydt0 = dydt+dydtexpl;
+% *** evaluation can be reused later
+
+d0 = errnorm1(y0);
+TOL = max(rtol.*d0,atol);
+d0 = NM*errnorm2(d0./TOL);
+d1 = NM*errnorm2(errnorm1(dydt0)./TOL);
+if d0 >= 1e-5 && d1 >= 1e-5
+  h0 = 0.01*d0/d1;
+else
+  h0 = 1e-6;
+end
+
+% take step and estimate the 2nd derivative
+y1 = y0+h0*dydt0;
+[dydt,dydtexpl] = fun(t0+h0,y1,t0+h0,y1,varargin{:});
+dydt1 = dydt+dydtexpl;
+d2 = NM*errnorm2(errnorm1(dydt1-dydt0)./(TOL*h0));
+
+% update TOL (used in the implicit step)
+switch adaptivity
+ case 2 % EPUS
+  TOL = h0*max(rtol.*errnorm1(dydt0),atol);
+ case 3 % EQ
+  TOL = max(rtol.*errnorm1(y1-y0),atol);
+end
+
+% h1 from simple error estimate
+d3 = max(d1,d2);
+if d3 > 1e-15
+  h1 = (0.01/d3)^order;
+else
+  h1 = max(1e-6,h0*1e-3);
+end
+
+h = min(100*h0,h1);
+% or otherwise integration might be aborted immediately:
+h = max(h,eps(1024*(1+abs(t0))));
+
+%-------------------------------------------------------------------------
+function [y,rhon] = l_step(fullstep,t0,y0,h,yest,TOL, ...
+                           NM,errnorm1,errnorm2, ...
+                           nlinopts,fun,varargin)
+%L_STEP One step with the split-step Euler method.
+%   [Y,RHON] = L_STEP(FULLSTEP,T0,Y0,H,YEST,...) returns the step from
+%   [T0,Y0] to [T0+H,Y] with a split-step Euler method using YEST as
+%   an initial guess for the implicit solver.
+%
+%   See L_IMPLSOLVE.
+
+if ~isempty(yest)
+  ydelta = yest-y0;
+else
+  ydelta = zeros(size(y0));
+end
+
+% step: implicit step by solving a nonlinear system
+[ydelta,rhon] = l_implsolve(fullstep,ydelta,t0,h,y0,TOL, ...
+                            NM,errnorm1,errnorm2, ...
+                            nlinopts,fun,varargin{:});
+if isempty(ydelta)
+  % nonlinear iteration did not converge, empty return
+  y = [];
+  return;
+end
+
+% assemble step
+y = y0+ydelta;
+
+%-------------------------------------------------------------------------
+function [ydelta,rhon] = l_implsolve(fullstep,ydelta,t0,h,y0,TOL, ...
+                                     NM,errnorm1,errnorm2, ...
+                                     nlinopts,fun,varargin)
+%L_IMPLSOLVE Solver for split-step Euler method.
+%   [YDELTA,RHON] = L_IMPLSOLVE(FULLSTEP,YDELTA,T0,H,Y0,[TOL,...],
+%   NLINOPTS,FUN,...) solves the equation YDELTA =
+%   H*F(T0,Y0)+H*G(T0+H,Y0+YDELTA) for YDELTA. FULLSTEP is true if H
+%   is a full step, false for half-steps. The split in
+%   implicit/explicit parts is input as [G,F] =
+%   FUN(T0+H,T0+YDELTA,T0,Y0,...). Empty YDELTA is returned whenever
+%   poor convergence is detected, in which case RHON < 1 is a
+%   suggested step-size reduction factor.
+%
+%   The implementation follows rather closely the prescription in [2,
+%   Chap. IV.8, pp. 118--121], but has also been augmented with ideas
+%   from [1].
+%
+% Reference:
+%   [1] K. Gustafsson and G. S�derlind: "Control Strategies for the
+%   Iterative Solution of Nonlinear Equations in ODE Solvers", SIAM
+%   J. Sci. Comput. 18(1):23--40 (1997).
+%   [2] E. Hairer and G. Wanner: "Solving Ordinary Differential
+%   Equations II, Stiff and Differential-Algebraic Problems", 2nd ed,
+%   Springer (1996).
+
+% control of Jacobian evaluations and refactorizations
+persistent J wksp tJAC L U P hLU eta theta L2 U2 P2;
+
+if nargin == 1
+  % clear persistent data
+  nlinopts = fullstep;
+  J = [];
+  wksp = [];
+  eta = nlinopts.etainit;
+  theta = eta/(1+eta);
+  return;
+elseif isempty(eta)
+  % previous attempt did not converge well
+  eta = nlinopts.etainit;
+  theta = eta/(1+eta);
+end
+
+% parameters of the solver
+rhon = 1;                           % only changed in case of problems
+eta = max(eta,eps)^nlinopts.etapow; % convergence factor (reused on success)
+kappa = nlinopts.kappa;             % extra accuracy
+maxiter = nlinopts.maxiter;         % max number of iterations
+
+% The equation solved is now
+%
+%   ydelta = h*F(t0,y0)+h*G(t0+h,y0+ydelta,[t0 y0]),
+%
+% where [G,F] = fun(t0+h,y0+ydelta,t0,y0,varargin{:}) is the
+% [implicit,explicit] part.
+
+% first residual
+[dydt,dydtexpl] = fun(t0+h,y0+ydelta,t0,y0,varargin{:});
+
+% full/half-step
+if fullstep
+  % new Jacobian and/or factorization
+  if isempty(J) || theta*hLU-abs(h-hLU) > nlinopts.thetaJAC*hLU
+    % convergence was slow previously despite a not very large difference
+    % in step-size (or there is no saved Jacobian)
+    [J,wksp] = numjac(fun,t0+h,y0+ydelta,dydt,{0 1},wksp,0,[],[], ...
+                      t0,y0,varargin{:});
+    % *** user-defined Jacobian (exact and numerical)
+    tJAC = t0; % (formed with data available up to t = t0)
+    [L,U,P] = lu(eye(size(J))-h*J);
+    hLU = h;
+    P2 = []; % force refactorization at the half-step
+  elseif isempty(P) || abs(h-hLU) > nlinopts.thetaLU*hLU
+    % anticipated convergence failure due to a step-size change:
+    % refactorize the Jacobian (or a new factorization was explicitly
+    % called for)
+    [L,U,P] = lu(eye(size(J))-h*J);
+    hLU = h;
+    P2 = [];
+  end
+else
+  % when taking a half-step, the previous full step always
+  % converged and hence we only care about the factorization
+  if isempty(P2)
+    [L2,U2,P2] = lu(eye(size(J))-h*J);
+  end
+  % During the iterations below we let the half-step change the
+  % convergence indicators theta and eta since they should normally be
+  % about the same. If this is not the case and the half-step is
+  % rejected, then the full step is rejected automatically.
+end
+ 
+% first Newton-step
+res = ydelta-h*dydtexpl-h*dydt;
+if fullstep
+  delta = U\(L\(P*res));
+else
+  delta = U2\(L2\(P2*res));
+end
+ydelta = ydelta-delta;
+ndelta = NM*errnorm2(errnorm1(delta)./TOL);
+
+% early return
+if eta*ndelta <= kappa
+  return;
+end
+
+for i = 2:maxiter
+  % new update and residual
+  res =  ydelta-h*dydtexpl-h*fun(t0+h,y0+ydelta,t0,y0,varargin{:});
+  if fullstep
+    delta = U\(L\(P*res));
+  else
+    delta = U2\(L2\(P2*res));
+  end
+  ydelta = ydelta-delta;
+
+  % check convergence
+  ndeltaold = ndelta;
+  ndelta = NM*errnorm2(errnorm1(delta)./TOL);
+  theta = ndelta/ndeltaold;
+  eta = theta/(1-theta);
+  
+  if theta >= 1 || theta^(maxiter-i)*ndelta > kappa*(1-theta)
+    % failure: diverging or not converging fast enough
+    break;
+  elseif eta*ndelta <= kappa
+    % converged, return solution
+    return;
+  end
+end
+
+% iteration not terminated successfully, take action
+if theta >= 1
+  % diverging, suggest a smaller step
+  rhon = nlinopts.thetaREF/theta;
+elseif tJAC == t0
+  % slow convergence despite a fresh Jacobian
+  if theta > nlinopts.thetaREF
+    % suggest a smaller step
+    rhon = nlinopts.thetaREF/theta;
+  else
+    % rare: irregular convergence, shrink step by an arbitrary factor
+    rhon = nlinopts.hfac;
+  end
+elseif ~fullstep
+  % rare: half-step, irregular convergence despite a successful full
+  % step?
+  rhon = nlinopts.hfac;
+end
+
+% note: rejecting a half-step implies also a new full step and hence
+% we postpone handling the Jacobian to after a smaller step has been
+% tried
+if fullstep
+  % handle the Jacobian
+  if tJAC ~= t0
+    J = []; % force a new evaluation
+  else
+    P = []; % force refactorization
+  end
+end
+
+ydelta = []; % step failed
+eta = [];    % don't reuse convergence indicators 
+
+%-------------------------------------------------------------------------
+function u = l_limiter(u,kappa)
+%L_LIMITER Step-size ratio limiter.
+%   U = L_LIMITER(U,KAPPA) applies an arctangent limiter parametrized
+%   by KAPPA to U.
+%
+% Reference:
+%   [1] G. S�derlind, L. Wang: "Adaptive time-stepping and
+%   computational stability", J. Comput. Appl. Math. 185:225--243 (2006).
+
+u = 1+kappa*atan((u-1)/kappa);
+
+%-------------------------------------------------------------------------
+function rho = l_lpfilter(fir,cerr,cerrold,rho)
+%L_LPFILTER Low-pass 2nd order digital filter.
+%   RHO = L_LPFILTER(FIR,CERR,CERROLD,RHO) evaluates the digital
+%   filter FIR = [k*beta_1 k*beta_2 alpha_2] using scaled controls
+%   (CERR,CERROLD) and previous ratio RHO.
+% 
+%   The proposed next ratio is returned.
+%
+% Reference:
+%   [1] G. S�derlind: "Digital Filters in Adaptive Time-Stepping",
+%   ACM Trans. Math. Software, 29:1--26 (2003).
+
+% immediate
+rho = prod([cerr cerrold 1/rho].^fir);
+
+%-------------------------------------------------------------------------
diff --git a/stenglib/Scicomp/poch.m b/stenglib/Scicomp/poch.m
new file mode 100644
index 00000000..78df397a
--- /dev/null
+++ b/stenglib/Scicomp/poch.m
@@ -0,0 +1,40 @@
+function y = poch(a,x,b)
+%POCH Pochhammer's function.
+%  Y = POCH(A,X), for a scalar A and a vector X, computes the value of
+%  GAMMA(A+X)./GAMMA(A). The output Y is a vector the same size as X.
+%
+%  If A is a vector, the ouput is Y(i) = prod_i POCH(A(i),X).
+%
+%  Y = POCH(A,B,X) is the same as POCH(A,X)./POCH(B,X).
+%
+%  See also GAMMA, GAMMALN, PSI.
+
+% S. Engblom 2006-01-12
+
+if nargin == 3
+  t = x(:);
+  x = b(:);
+  a = a(:); b = t;
+  y = exp(sum(l_gammaln(tsum(a,x,[1],[2])),1)- ...
+          sum(l_gammaln(tsum(b,x,[1],[2])),1)- ...
+          sum(l_gammaln(a))+sum(l_gammaln(b)));
+else
+  a = a(:);
+  x = x(:);
+  y = exp(sum(l_gammaln(tsum(a,x,[1],[2])),1)- ...
+          sum(l_gammaln(a)));
+end
+
+%--------------------------------------------------------------------------
+function y = l_gammaln(x)
+%L_GAMMALN Fix for bug in GAMMALN.
+%   Annoyingly, GAMMALN is incorrectly imlemented. This function is a
+%   patch for the case when there might be negative elements in
+%   X. Still, complex values of X are not allowed.
+
+isneg = find(x < 0);
+x(isneg) = 1-x(isneg);
+y = gammaln(x);
+y(isneg) = -y(isneg)+log(pi./sin(pi*(x(isneg))));
+
+%--------------------------------------------------------------------------
diff --git a/stenglib/Scicomp/rtsafe.m b/stenglib/Scicomp/rtsafe.m
new file mode 100644
index 00000000..e69e35bc
--- /dev/null
+++ b/stenglib/Scicomp/rtsafe.m
@@ -0,0 +1,104 @@
+function x = rtsafe(x1,f1,x2,f2,fun,fargs,tol,maxits)
+%RTSAFE Scalar nonlinear solver.
+%   X = RTSAFE(X1,F1,X2,F2,FUN,FARGS,TOL,MAXITS) computes a solution X
+%   to the equation FUN(X) = 0. FUN has the signature [F,DF] =
+%   FUN(X,FARGS{:}) and should compute both function values and
+%   derivatives. On entry, X1 and X2 must bracket a root and the
+%   function values F1 and F2 must therefore satisfy F1.*F2 <= 0. Only
+%   the signs of F1 and F2 need to be correct in order for the
+%   algorithm to work properly.
+%
+%   If there are more than one root, RTSAFE will find one of them
+%   (including possible singularities). If FUN is identical to zero on
+%   the whole interval [X1 X2], RTSAFE will converge to the midpoint.
+%
+%   FARGS is a cell-array and should contain additional inputs to
+%   FUN. An empty cell is the default.
+%  
+%   TOL = [RTOL ATOL] is the absolute and relative tolerance. If TOL
+%   is a scalar, then RTOL = TOL and ATOL = 1e-2*TOL are used. The
+%   default is TOL = [10*eps 0.1*eps].
+%
+%   MAXITS is the maximum number of iterations allowed. The default is
+%   100.
+%
+%   The method used is a fail-safe combination of Newtons method and
+%   bisection. Only limited error-checking is performed.
+
+% S. Engblom 2004-03-03
+
+if any(f1.*f2 > 0), error('Roots must be bracketed.'); end
+
+% defaults
+if nargin < 8
+  maxits = 100;
+  if nargin < 7
+    tol = 10*eps;
+    if nargin < 6
+      fargs = {};
+    end
+  end
+end
+
+% absolute/relative tolerance
+rtol = tol(1);
+if prod(size(tol)) == 1
+  atol = 1e-2*rtol;
+else
+  atol = tol(2);
+end
+
+% form initial brackets according to the following rules:
+%   xl(f2 < 0 f1 < 0 f2 == 0 f1 == 0) = [x2 x1 x2 x1]
+%   xh(f1 > 0 f2 > 0 f1 == 0 f2 == 0) = [x1 x2 x1 x2]
+% (the brackets has to be formed exactly as written)
+xl = zeros(size(x1));
+xh = zeros(size(x2));
+ix1 = find(f1 < 0);
+ix2 = find(f2 < 0);
+xl(ix2) = x2(ix2); xl(ix1) = x1(ix1);
+ix1 = find(f1 > 0);
+ix2 = find(f2 > 0);
+xh(ix1) = x1(ix1); xh(ix2) = x2(ix2);
+
+ix1 = find(f1 == 0);
+ix2 = find(f2 == 0);
+xl(ix2) = x2(ix2); xl(ix1) = x1(ix1);
+xh(ix1) = x1(ix1); xh(ix2) = x2(ix2);
+
+% evaluation in midpoint
+x = 0.5*(xl+xh);
+[f,df] = feval(fun,x,fargs{:});
+
+% step and 'step before last'
+dxold = xh-xl;
+dx = dxold;
+
+for i = 1:maxits
+  % prefer bisection whenever the Newton step jumps out of brackets or
+  % when the size of the interval is not decreasing fast enough
+  ix1 = (((x-xh).*df-f).*((x-xl).*df-f) >= 0 | ...
+         abs(2*f) > abs(dxold.*df));
+  ix2 = find(~ix1);
+  ix1 = find(ix1);
+
+  dxold = dx;
+
+  % bisection step
+  dx(ix1) = 0.5*(xh(ix1)-xl(ix1));
+  x(ix1) = xl(ix1)+dx(ix1);
+
+  % Newton step
+  dx(ix2) = f(ix2)./df(ix2);
+  x(ix2) = x(ix2)-dx(ix2);
+
+  if all(abs(dx) <= rtol*abs(x)+atol), return; end
+
+  % new evaluation and brackets
+  [f,df] = feval(fun,x,fargs{:});
+  ix1 = find(f <= 0);
+  ix2 = find(f >= 0);
+  xl(ix1) = x(ix1);
+  xh(ix2) = x(ix2);
+end
+error('Maximum number of iterations reached.');
diff --git a/stenglib/Scicomp/startup.m b/stenglib/Scicomp/startup.m
new file mode 100644
index 00000000..3b2b4498
--- /dev/null
+++ b/stenglib/Scicomp/startup.m
@@ -0,0 +1,8 @@
+%STARTUP Add paths to SCICOMP stuff.
+
+% S. Engblom 2008-06-17
+
+s = pwd;
+if exist('test','dir')
+  addpath([s '/test']);
+end
diff --git a/stenglib/Tensor/source/make.m b/stenglib/Tensor/source/make.m
new file mode 100644
index 00000000..0be2c8f8
--- /dev/null
+++ b/stenglib/Tensor/source/make.m
@@ -0,0 +1,174 @@
+function make
+%MAKE Makefile for TENSOR.
+
+% Johannes Dufva 2020-10-30 (mexw64, 9.7)
+% S. Engblom 2019-01-23 (mexmaci64, mexa64, 9.6)
+% S. Engblom 2015-03-20 (mexa64, 8.4)
+% S. Engblom 2015-01-19 (mexmaci64, 8.4)
+% S. Engblom 2012-04-16 (mexmaci64, 7.11)
+% S. Engblom 2011-04-17 (mexmaci64, 7.10)
+% S. Engblom 2011-03-07 (mexa64, 7.11)
+% S. Engblom 2010-09-23 (mexs64, 7.7)
+% S. Engblom 2010-02-02 (mexa64, 7.8)
+% S. Engblom 2010-01-12 (mexmaci)
+% S. Engblom 2007-05-17 (mexs64)
+% S. Engblom 2006-11-09 (mexa64)
+% S. Engblom 2005-04-10 (mexmac)
+
+% Use '-DBLASINT=size_t' for the (bad!) platforms where the 'int' in
+% the declaration of BLAS subroutines is in fact a 'size_t' and
+% sizeof(size_t) > sizeof(int).
+
+s = pwd;
+mx = mexext;
+ver = version;
+
+if strcmp(mx,'mexglx')
+  if ~strncmp(version,'7.5',3) && ~strncmp(version,'7.8',3)
+    warning(['Extension .' mexext [' tested with Matlab version(s) ' ...
+                        '7.5 and 7.8 only.']]);
+  end
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/tndims.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/tsize.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,[s '/source/tsum.c']);
+  mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+       '-D_GNU_SOURCE -pthread -fexceptions'], ...
+      '-outdir',s,'-lmwblas',[s '/source/tprod.c']);
+elseif strcmp(mx,'mexa64')
+  v = version;
+  if v(1) == '7'
+    if ~strncmp(version,'7.2',3) && ~strncmp(version,'7.8',3) && ...
+          ~strncmp(version,'7.11',4) && ~strncmp(version,'7.13',4)
+      warning(['Extension .' mexext [' tested with Matlab version(s) ' ...
+                          '7.2, 7.8, 7.11 and 7.13 only.']]);
+    end
+    if ~strncmp(version,'7.11',4)
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/tndims.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/tsize.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          '-outdir',s,[s '/source/tsum.c']);
+      if strncmp(version,'7.2',3)
+        mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions'], ...
+            '-outdir',s,[s '/source/tprod.c']);
+      else
+        mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 ' ...
+             '-D_GNU_SOURCE -pthread -fexceptions -DBLASINT=size_t'], ...
+            '-outdir',s,'-lmwblas',[s '/source/tprod.c']);
+      end
+    else
+      % apparently, the linker path is not properly set up on 7.11:
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/tndims.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/tsize.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,[s '/source/tsum.c']);
+      mex(['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+           '-D_GNU_SOURCE -pthread -fexceptions -DBLASINT=size_t'], ...
+          ['-L' matlabroot '/sys/os/glnxa64'], ...
+          '-outdir',s,'-lmwblas',[s '/source/tprod.c']);
+    end
+  else
+    if ~strncmp(version,'8.4',3) && ~strncmp(version,'9.6',3)
+      warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+               '8.4 and 9.6 only.']);
+    end
+      
+    % apparently, the linker path is not properly set up on 8.4 (also a
+    % soft link libstdc++.so inside [matlabroot '/sys/os/glnxa64'] is
+    % required to point to the correct shared library, in this case
+    % libstdc++.so.6.0.17)
+    mex('CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/tndims.c']);
+    mex('CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/tsize.c']);
+    mex('CFLAGS=-fPIC -std=c99 -O3',['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,[s '/source/tsum.c']);
+    mex('CFLAGS=-fPIC -std=c99 -O3 -DBLASINT=size_t', ...
+        ['-L' matlabroot '/sys/os/glnxa64'], ...
+        '-outdir',s,'-lmwblas',[s '/source/tprod.c']);
+  end
+elseif strcmp(mx,'mexmac')
+  if ~strncmp(version,'7.0',3)
+    warning(['Extension .' mexext ' tested with Matlab version(s) 7.0 only.']);
+  end
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/tndims.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/tsize.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/tsum.c']);
+  mex('CC=gcc -std=c99','-outdir',s,[s '/source/tprod.c']);
+elseif strcmp(mx,'mexmaci')
+  if ~strncmp(version,'7.8',3)
+    warning(['Extension .' mexext ' tested with Matlab version(s) 7.8 only.']);
+  end
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/tndims.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/tsize.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/tsum.c']);
+  mex('CC=gcc -std=c99 -fast','-outdir',s,'-lmwblas', ...
+      [s '/source/tprod.c']);
+elseif strcmp(mx,'mexmaci64')
+  v = version;
+  if v(1) == '7'
+    if ~strncmp(version,'7.10',4) && ~strncmp(version,'7.11',4) && ...
+          ~strncmp(version,'7.14',4)
+      warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+               '7.10 and 7.11 only.']);
+    end
+    mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/tndims.c']);
+    mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/tsize.c']);
+    mex('CC=gcc -std=c99 -fast','-outdir',s,[s '/source/tsum.c']);
+    mex('CC=gcc -std=c99 -fast -DBLASINT=size_t', ...
+        '-outdir',s,'-lmwblas', ...
+        [s '/source/tprod.c']);
+  else
+    if ~strncmp(version,'8.4',3) && ~strncmp(version,'9.6',3)
+      warning(['Extension .' mexext ' tested with Matlab version(s) ' ...
+               '8.4 and 9.6 only.']);
+    end
+    mex('CFLAGS= -std=c99','-outdir',s,[s '/source/tndims.c']);
+    mex('CFLAGS=-Wno-logical-op-parentheses -std=c99','-outdir',s,[s '/source/tsize.c']);
+    mex('CFLAGS=-Wno-logical-op-parentheses -std=c99','-outdir',s,[s '/source/tsum.c']);
+    mex('CFLAGS= -std=c99 -DBLASINT=size_t', ...
+        '-outdir',s,'-lmwblas', ...
+        [s '/source/tprod.c']);
+  end
+elseif strcmp(mx,'mexs64')
+  if ~strncmp(version,'7.7',3)
+    warning(['Extension .' mexext ' tested with Matlab version(s) 7.7 only.']);
+  end
+  mex('-outdir',s,[s '/source/tndims.c']);
+  mex('-outdir',s,[s '/source/tsize.c']);
+  mex('-outdir',s,[s '/source/tsum.c']);
+  mex('-outdir',s,[s '/source/tprod.c']); 
+  
+elseif strcmp(mx,'mexw64')
+  mex('-outdir',s,[s '/source/tndims.c']);	  cflags = 'CFLAGS= -lmwblas -std=c99 ';
+  mex('-outdir',s,[s '/source/tsize.c']);	  mex(cflags, '-outdir',s,[s '/source/tndims.c']);
+  mex('-outdir',s,[s '/source/tsum.c']);	  mex(cflags, '-outdir',s,[s '/source/tsize.c']);
+  mex('-lmwblas','CFLAGS= -D"dgemm_=dgemm" ','-outdir',s,[s '/source/tprod.c']); 	  mex(cflags, '-outdir',s,[s '/source/tsum.c']);
+  mex([cflags '-D"dgemm_=dgemm" -D"BLASINT=size_t" '],'-outdir',s,[s '/source/tprod.c']); % '-lmwblas',
+
+else
+  warning('New platform. Trying default make.');
+  mex('-outdir',s,[s '/source/tndims.c']);
+  mex('-outdir',s,[s '/source/tsize.c']);
+  mex('-outdir',s,[s '/source/tsum.c']);
+  mex('-outdir',s,[s '/source/tprod.c']); 
+end
diff --git a/stenglib/Tensor/source/tndims.c b/stenglib/Tensor/source/tndims.c
new file mode 100644
index 00000000..4a0daf8d
--- /dev/null
+++ b/stenglib/Tensor/source/tndims.c
@@ -0,0 +1,25 @@
+/* tndims.c */
+/* S. Engblom 2005-04-10 */
+
+#include "mex.h"
+#include "matrix.h"
+
+/*-----------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  /* check of syntax */
+  if (nrhs != 1 || nlhs > 1)
+    mexErrMsgIdAndTxt("tndims:e1","Expecting one input and one output.");
+
+  /* input */
+  mwSize ndimA = mxGetNumberOfDimensions(prhs[0]);
+
+  /* adjust */
+  if (ndimA == 2)
+    ndimA -= (mxGetN(prhs[0]) == 1)*(1+(mxGetM(prhs[0]) == 1));
+
+  /* output */
+  plhs[0] = mxCreateDoubleMatrix(1,1,mxREAL);
+  *mxGetPr(plhs[0]) = (double)ndimA;
+}
+/*-----------------------------------------------------------------------*/
diff --git a/stenglib/Tensor/source/tprod.c b/stenglib/Tensor/source/tprod.c
new file mode 100644
index 00000000..a8ebbbca
--- /dev/null
+++ b/stenglib/Tensor/source/tprod.c
@@ -0,0 +1,298 @@
+/* tprod.c */
+
+/* S. Engblom 2010-09-23 (Minor revision) */
+/* S. Engblom 2010-02-02 (Minor revision) */
+/* S. Engblom 2005-04-29 */
+
+#include <string.h>
+#include <math.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+/* Note: All integer arguments in calls to the BLAS function dgemm()
+   will be casted according to (int *)&BLASINT. This is to provide for
+   the (bad!) platforms where sizeof(int) < sizeof(size_t) and the
+   binary library uses size_t while the declaration is erroneously
+   int. For such cases you will have to manually define BLASINT as
+   size_t when calling the compiler. */
+#ifndef BLASINT
+#define BLASINT int
+#endif
+
+// declaration of BLAS-kernel
+void dgemm_(const char *,const char *,
+	    const int *,const int *,const int *,const double *,
+	    const double *,const int *,
+	    const double *,const int *,
+	    const double *,double *,const int *);
+
+// forward declarations
+void checkix(const double *ix,mwSize len,int *min,int *max);
+void sortix(const double *ix,mwSize len,int edg,int *in);
+void logicalsize(const mxArray *A,mwSize nda,mwSize *sizA);
+void permute(const mxArray *A,
+	     const int *p1,const int*p2,const int *p3,int len,
+	     mxArray **Ap);
+
+/*-----------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  // check of syntax
+  if (nrhs != 4 || nlhs > 1)
+    mexErrMsgIdAndTxt("tprod:e9",
+		      "Expecting four inputs and one output.");
+
+  // check of input
+  if (!mxIsDouble(prhs[0]) || mxIsSparse(prhs[0]) ||
+      !mxIsDouble(prhs[1]) || mxIsSparse(prhs[1]))
+    mexErrMsgIdAndTxt("tprod:e8",
+		      "Expecting a double, non-sparse array.");
+  if (!mxIsDouble(prhs[2]) || mxIsComplex(prhs[2]) || 
+      mxIsSparse(prhs[2]) || !mxIsDouble(prhs[3]) || 
+      mxIsComplex(prhs[3]) || mxIsSparse(prhs[3]))
+    mexErrMsgIdAndTxt("tprod:e10",
+		      "Index vector must be real, double and non-sparse.");
+
+  // inputs IA and IB in *unit offset*
+  const double *ia = mxGetPr(prhs[2])-1;
+  const double *ib = mxGetPr(prhs[3])-1;
+  const mwSize nda = mxGetNumberOfElements(prhs[2]);
+  const mwSize ndb = mxGetNumberOfElements(prhs[3]);
+  int min = 0,max = 0;
+
+  // perform some checks and determine the span of the indices
+  checkix(ia,nda,&min,&max);
+  checkix(ib,ndb,&min,&max);
+
+  // sanity check
+  if (max-min > nda+ndb)
+    mexErrMsgIdAndTxt("tprod:e7","Index vector is not contiguous or "
+                      "a summation index is missing.");
+
+  /* Sort the indices out. The vectors ina and inb points back into ia
+     and ib (in *unit offset*) according to the absolute ordering. */
+  const int edg = min;
+  const int len = max-min+1;
+  const int ndc = max; // the number of dimensions of the output
+  int ina[len],inb[len];
+
+  sortix(ia,nda,edg,memset(ina,0,len*sizeof(ina[0])));
+  sortix(ib,ndb,edg,memset(inb,0,len*sizeof(inb[0])));
+
+  /* Set operations: the permutations contain indices pointing back
+     into ia and ib in *unit offset* for the private parts (pa1 and
+     pb1), the negative common parts (pa2 and pb2) and the positive
+     common parts (pa3 and pb3). */
+  int pa1[len],pa2[len],pa3[len];
+  int pb1[len],pb2[len],pb3[len];
+
+  memset(pa1,0,len*sizeof(pa1[0])); memset(pb1,0,len*sizeof(pb1[0]));
+  memset(pa2,0,len*sizeof(pa2[0])); memset(pb2,0,len*sizeof(pb2[0]));
+  memset(pa3,0,len*sizeof(pa3[0])); memset(pb3,0,len*sizeof(pb3[0]));
+  for (int i = 0,ia1 = 0,ib1 = 0,i2 = 0,i3 = 0; i < len; i++) {
+    const int ix = i+edg;
+
+    // a setdiff produces the private part
+    if (ina[i] && !inb[i]) {
+      if (ix < 0)
+        mexErrMsgIdAndTxt("tprod:e3","Summation index is missing.");
+      pa1[ia1++] = ina[i];
+    }
+    // setdiff...
+    else if (!ina[i] && inb[i]) {
+      if (ix < 0)
+        mexErrMsgIdAndTxt("tprod:e3","Summation index is missing.");
+      pb1[ib1++] = inb[i];
+    }
+    // an intersection produces the (positive and negative) common part
+    else if (ina[i] && inb[i]) {
+      if (ix < 0) {
+        pa2[i2] = ina[i]; pb2[i2++] = inb[i];
+      }
+      else {
+        pa3[i3] = ina[i]; pb3[i3++] = inb[i];
+      }
+    }
+    else if (ix != 0)
+      mexErrMsgIdAndTxt("tprod:e4","Index vector must be contiguous.");
+  }
+
+  // determine the logical size of the inputs A and B
+  mwSize sizA[nda],sizB[ndb];
+  logicalsize(prhs[0],nda,sizA);
+  logicalsize(prhs[1],ndb,sizB);
+
+  // check common sizes
+  for (int i = 0; pa2[i] != 0; i++)
+    if (sizA[pa2[i]-1] != sizB[pb2[i]-1])
+      mexErrMsgIdAndTxt("tprod:e6","Dimensions must agree.");
+  for (int i = 0; pa3[i] != 0; i++)
+    if (sizA[pa3[i]-1] != sizB[pb3[i]-1])
+      mexErrMsgIdAndTxt("tprod:e6","Dimensions must agree.");
+
+  // permute to an order suitable for computations
+  mxArray *Ap,*Bp;
+
+  permute(prhs[0],pa1,pa2,pa3,nda,&Ap);
+  const double *prA = mxGetPr(Ap);
+  const double *piA = mxGetPi(Ap);
+
+  permute(prhs[1],pb1,pb2,pb3,ndb,&Bp);
+  const double *prB = mxGetPr(Bp);
+  const double *piB = mxGetPi(Bp);
+
+  // allocate the (permuted) output C
+  mwSize sizC[ndc > 2 ? ndc : 2];
+  sizC[0] = sizC[1] = 1; /* singletons */
+  BLASINT m = 1,n = 1,k = 1,l = 1;
+  {
+    // the size of C, including the sizes of the various blocks
+    int j = 0;
+    for (int i = 0; pa1[i] != 0; i++) m *= (sizC[j++] = sizA[pa1[i]-1]);
+    for (int i = 0; pa2[i] != 0; i++) k *= sizA[pa2[i]-1];
+    for (int i = 0; pb1[i] != 0; i++) n *= (sizC[j++] = sizB[pb1[i]-1]);
+    for (int i = 0; pa3[i] != 0; i++) l *= (sizC[j++] = sizA[pa3[i]-1]);
+  }
+  mxArray *Cp = mxCreateNumericArray(ndc > 2 ? ndc : 2,sizC,
+				     mxDOUBLE_CLASS,
+				     piA != NULL || piB != NULL ? 
+				     mxCOMPLEX : mxREAL);
+  double *prC = mxGetPr(Cp);
+  double *piC = mxGetPi(Cp);
+
+  /* Evaluate the product as a series of matrix products, C(:,:,i) =
+     A(:,:,i).'*B(:,:,i), i = 1..l. This corresponds to the canonical
+     form C = tprod(A,B,[-1 1 3],[-1 2 3]) where A is k-by-m-by-l, B
+     is k-by-n-by-l and C is m-by-n-by-l. */
+  const double one = 1.0,minusone = -1.0;
+  const char *t1 = "T",*t2 = "N";
+
+  // real x real, complex x real, real x complex, complex x complex
+  if (piC == NULL)
+    for (int jc = 0,ja = 0,jb = 0; jc < m*n*l; ) {
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+	     &prA[ja],(int *)&k,&prB[jb],(int *)&k,&one,&prC[jc],(int *)&m);
+      jc += m*n; ja += m*k; jb += k*n;
+    }
+  else if (piB == NULL)
+    for (int jc = 0,ja = 0,jb = 0; jc < m*n*l; ) {
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+	     &prA[ja],(int *)&k,&prB[jb],(int *)&k,&one,&prC[jc],(int *)&m);
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+             &piA[ja],(int *)&k,&prB[jb],(int *)&k,&one,&piC[jc],(int *)&m);
+      jc += m*n; ja += m*k; jb += k*n;
+    }
+  else if (piA == NULL)
+    for (int jc = 0,ja = 0,jb = 0; jc < m*n*l; ) {
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+	     &prA[ja],(int *)&k,&prB[jb],(int *)&k,&one,&prC[jc],(int *)&m);
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+	     &prA[ja],(int *)&k,&piB[jb],(int *)&k,&one,&piC[jc],(int *)&m);
+      jc += m*n; ja += m*k; jb += k*n;
+    }
+  else
+    for (int jc = 0,ja = 0,jb = 0; jc < m*n*l; ) {
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+	     &prA[ja],(int *)&k,&prB[jb],(int *)&k,&one,&prC[jc],(int *)&m);
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+             &piA[ja],(int *)&k,&prB[jb],(int *)&k,&one,&piC[jc],(int *)&m);
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&one,
+	     &prA[ja],(int *)&k,&piB[jb],(int *)&k,&one,&piC[jc],(int *)&m);
+      dgemm_(t1,t2,(int *)&m,(int *)&n,(int *)&k,&minusone,
+	     &piA[ja],(int *)&k,&piB[jb],(int *)&k,&one,&prC[jc],(int *)&m);
+      jc += m*n; ja += m*k; jb += k*n;
+    }
+
+  // can deallocate permuted versions of A and B now
+  mxDestroyArray(Bp);
+  mxDestroyArray(Ap);
+
+  // permute according to input
+  mxArray *prhs2[2];
+  prhs2[0] = Cp;
+  prhs2[1] = mxCreateDoubleMatrix(1,ndc > 2 ? ndc : 2,mxREAL);
+  {
+    // note: unit offset
+    int j = 1;
+    double *p = mxGetPr(prhs2[1])-1;
+    p[1] = 1.0; p[2] = 2.0; // singletons 
+
+    for (int i = 0; pa1[i] != 0; i++) p[(int)ia[pa1[i]]] = j++;
+    for (int i = 0; pb1[i] != 0; i++) p[(int)ib[pb1[i]]] = j++;
+    for (int i = 0; pa3[i] != 0; i++) p[(int)ia[pa3[i]]] = j++;
+  }
+  mexCallMATLAB(1,plhs,2,prhs2,"permute");
+  mxDestroyArray(prhs2[1]);
+  mxDestroyArray(Cp);
+}
+/*-----------------------------------------------------------------------*/
+void checkix(const double *ix,mwSize len,int *min,int *max)
+/* Checks the indices ix[1..len] and adjusts min and max
+   accordingly. */
+{
+  for (int i = 1; i <= len; i++) {
+    if (ix[i] == 0.0 || ix[i] != ceil(ix[i]) || isinf(ix[i]))
+      mexErrMsgIdAndTxt("tprod:e1",
+			"Index vector must contain nonzero integers.");
+
+    if (ix[i] < *min) *min = (int)ix[i];
+    else if (*max < ix[i]) *max = (int)ix[i];
+  }
+}
+/*-----------------------------------------------------------------------*/
+void sortix(const double *ix,mwSize len,int edg,int *in)
+/* Sorts the indices ix[1..len] and determines a rank-table in. The
+   smallest index is edg and in is assumed to be allocated and cleared
+   prior to call. */
+{
+  for (int i = 1; i <= len; i++) {
+    const int ii =  (int)ix[i]-edg;
+
+    if (in[ii] != 0)
+      mexErrMsgIdAndTxt("tprod:e2","Indices must be distinct.");
+    in[ii] = i;
+  }
+}
+/*-----------------------------------------------------------------------*/
+void logicalsize(const mxArray *A,mwSize nda,mwSize *sizA)
+/* Determines the size sizA of the array A. The sizes of all
+   dimensions i, 1 <= i <= nda are included in sizA. */
+{
+  mwSize ndimA = mxGetNumberOfDimensions(A);
+  const mwSize *msizA = mxGetDimensions(A);
+
+  // mask for Matlab's stupid convention
+  if (ndimA == 2) ndimA -= (msizA[1] == 1)*(1+(msizA[0] == 1));
+  if (nda < ndimA)
+    mexErrMsgIdAndTxt("tprod:e5","Wrong size of index vector.");
+
+  memcpy(sizA,msizA,ndimA*sizeof(sizA[0]));
+  for (int i = ndimA; i < nda; i++) sizA[i] = 1;
+}
+/*-----------------------------------------------------------------------*/
+void permute(const mxArray *A,
+	     const int *p1,const int*p2,const int *p3,int len,
+	     mxArray **Ap)
+/* Permutes the array A according to the ordering p = [p2 p1 p3] which
+   must be of total length len. The result is stored in Ap. */
+{
+  // input to Matlab
+  mxArray *prhs[2];
+  prhs[0] = (mxArray *)A;
+  prhs[1] = mxCreateDoubleMatrix(1,len > 2 ? len : 2,mxREAL);
+
+  int j = 0;
+  double *p = mxGetPr(prhs[1]);
+  p[0] = 1.0; p[1] = 2.0; // singletons
+
+  // construct the permutation p
+  for (int i = 0; p2[i] != 0; i++) p[j++] = (double)p2[i];
+  for (int i = 0; p1[i] != 0; i++) p[j++] = (double)p1[i];
+  for (int i = 0; p3[i] != 0; i++) p[j++] = (double)p3[i];
+
+  // call Matlab and clean up
+  mexCallMATLAB(1,Ap,2,prhs,"permute");
+  mxDestroyArray(prhs[1]);
+}
+/*-----------------------------------------------------------------------*/
diff --git a/stenglib/Tensor/source/tsize.c b/stenglib/Tensor/source/tsize.c
new file mode 100644
index 00000000..7975dc74
--- /dev/null
+++ b/stenglib/Tensor/source/tsize.c
@@ -0,0 +1,60 @@
+/* tsize.c */
+/* S. Engblom 2005-04-10 */
+
+#include <math.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+/*-----------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  /* check of syntax */
+  if (nrhs != 1 && nrhs != 2 || nlhs > 1)
+    mexErrMsgIdAndTxt("tsize:e1",
+      "Expecting one or two inputs and one output.");
+
+  /* input */
+  mwSize ndimA = mxGetNumberOfDimensions(prhs[0]);
+  const mwSize *sizA = mxGetDimensions(prhs[0]);
+
+  /* adjust the number of dimensions */
+  if (ndimA == 2)
+    ndimA -= (sizA[1] == 1)*(1+(sizA[0] == 1));
+
+  /* full or indexed version */
+  if (nrhs == 1) {
+    /* output */
+    plhs[0] = mxCreateDoubleMatrix(1,ndimA,mxREAL);
+    double *siz = mxGetPr(plhs[0]);
+    for (int i = 0; i < ndimA; i++)
+      siz[i] = (double)sizA[i];
+  }
+  else {
+    if (!mxIsDouble(prhs[1]) || mxIsComplex(prhs[1]) || 
+        mxIsSparse(prhs[1]))
+      mexErrMsgIdAndTxt("tsize:e2",
+        "Dimension argument must be real, double and non-sparse.");
+
+    /* input DIMS */
+    const mwSize lenDIMS = mxGetNumberOfElements(prhs[1]);
+    const double *prDIMS = mxGetPr(prhs[1]);
+
+    /* check input DIMS */
+    for (int i = 0; i < lenDIMS; i++)
+      if (prDIMS[i] < 1.0 || prDIMS[i] != ceil(prDIMS[i]))
+        mexErrMsgIdAndTxt("tsize:e3",
+          "Size argument must be nonnegative integers.");
+
+    /* output */
+    plhs[0] = mxCreateDoubleMatrix(1,lenDIMS,mxREAL);
+    double *siz = mxGetPr(plhs[0]);
+    for (int i = 0; i < lenDIMS; i++) {
+      if (prDIMS[i] <= ndimA)
+        siz[i] = (double)sizA[(int)prDIMS[i]-1];
+      else
+        siz[i] = 1.0;
+    }
+  }
+}
+/*-----------------------------------------------------------------------*/
diff --git a/stenglib/Tensor/source/tsum.c b/stenglib/Tensor/source/tsum.c
new file mode 100644
index 00000000..236b10ab
--- /dev/null
+++ b/stenglib/Tensor/source/tsum.c
@@ -0,0 +1,498 @@
+/* tsum.c */
+/* S. Engblom 2005-08-26 */
+
+#include <string.h>
+#include <math.h>
+
+#include "mex.h"
+#include "matrix.h"
+
+/* forward declarations */
+void checkix(const double *ix,mwSize len,int *max);
+void sortix(const double *ix,mwSize len,int *in);
+void logicalsize(const mxArray *A,mwSize nda,mwSize *sizA);
+void permute(mxArray **Ap,const mxArray *A,
+	     const int *p1,const int *p2,int len);
+void sum(int dim,double *prB,double *piB,
+	 double *prA,double *piA,double **prC,double **piC,
+	 mwSize ndims,mwSize *siz,mwSize *str);
+
+#define MAX(a,b) ((a) >= (b) ? (a) : (b))
+
+#define MXTYPE(A,B) (mxIsComplex(A) || mxIsComplex(B) ? \
+		     mxCOMPLEX : mxREAL)
+
+#define ISDOUBLETENSOR(A)     (mxIsDouble(A) && !mxIsSparse(A))
+#define ISDOUBLEREALTENSOR(A) (mxIsDouble(A) && !mxIsSparse(A) && \
+			       !mxIsComplex(A))
+
+/*-----------------------------------------------------------------------*/
+void mexFunction(int nlhs,mxArray *plhs[],int nrhs,const mxArray *prhs[])
+{
+  /* check of syntax */
+  if (nrhs != 2 && nrhs != 4 || nlhs > 1)
+    mexErrMsgIdAndTxt("tsum:e1","Expecting two or four inputs and "
+		      "one output.");
+
+  /* tensor-sum of two arrays */
+  if (nrhs == 4) {
+    /* check of input */
+    if (!ISDOUBLETENSOR(prhs[0]) || !ISDOUBLETENSOR(prhs[1]))
+      mexErrMsgIdAndTxt("tsum:e2","Expecting a double, "
+			"non-sparse array.");
+    if (!ISDOUBLEREALTENSOR(prhs[2]) || !ISDOUBLEREALTENSOR(prhs[3]))
+      mexErrMsgIdAndTxt("tsum:e3","Index vector must be real, "
+			"double and non-sparse.");
+
+    /* inputs IA and IB in *unit offset* */
+    const double *ia = mxGetPr(prhs[2])-1;
+    const double *ib = mxGetPr(prhs[3])-1;
+    const mwSize nda = mxGetNumberOfElements(prhs[2]);
+    const mwSize ndb = mxGetNumberOfElements(prhs[3]);
+    int max = 0; /* the number of dimensions of the output */
+
+    /* perform some checks and determine the span of the indices */
+    checkix(ia,nda,&max); checkix(ib,ndb,&max);
+
+    /* sanity check */
+    if (max > nda+ndb)
+      mexErrMsgIdAndTxt("tsum:e5","Index vector must be contiguous.");
+
+    /* Sort the indices out. The vectors ina and inb points back into ia
+       and ib (in *unit offset*) according to the absolute ordering. */
+    int ina[max],inb[max];
+
+    sortix(ia,nda,memset(ina,0,max*sizeof(ina[0])));
+    sortix(ib,ndb,memset(inb,0,max*sizeof(inb[0])));
+
+    /* Set operations: the permutations contain indices pointing back
+       into ia and ib in *unit offset* for the private parts (pa1 and
+       pb1) and the common parts (pa2 and pb2). */
+    int pa1[max+1],pa2[max+1],pb1[max+1],pb2[max+1];
+
+    memset(pa1,0,(max+1)*sizeof(pa1[0])); memset(pa2,0,(max+1)*sizeof(pa2[0]));
+    memset(pb1,0,(max+1)*sizeof(pb1[0])); memset(pb2,0,(max+1)*sizeof(pb2[0]));
+    for (int i = 0,ia1 = 0,ib1 = 0,i2 = 0; i < max; i++) {
+
+      /* a setdiff produces the private part */
+      if (ina[i] && !inb[i])
+	pa1[ia1++] = ina[i];
+      /* setdiff... */
+      else if (!ina[i] && inb[i])
+	pb1[ib1++] = inb[i];
+      /* an intersection produces the common part */
+      else if (ina[i] && inb[i]) {
+	pa2[i2] = ina[i]; pb2[i2++] = inb[i];
+      }
+      else
+	mexErrMsgIdAndTxt("tsum:e5","Index vector must be contiguous.");
+    }
+
+    /* determine the logical size of the inputs A and B */
+    mwSize sizA[nda],sizB[ndb];
+    logicalsize(prhs[0],nda,sizA); logicalsize(prhs[1],ndb,sizB);
+
+    /* permute to an order suitable for computations */
+    mxArray *Ap,*Bp;
+
+    permute(&Ap,prhs[0],pa2,pa1,nda);
+    const double *prA = mxGetPr(Ap),*piA = mxGetPi(Ap);
+
+    permute(&Bp,prhs[1],pb2,pb1,ndb);
+    const double *prB = mxGetPr(Bp),*piB = mxGetPi(Bp);
+
+    /* allocate the (permuted) output C (here prhs2[0] as it will be
+       permuted by prhs2[1] in the end) */
+    mxArray *prhs2[2];
+    prhs2[1] = mxCreateDoubleMatrix(1,MAX(2,max),mxREAL);
+    mwSize sizC[MAX(2,max)]; sizC[0] = sizC[1] = 1; /* singletons */
+    int k = 1,m = 1,n = 1;
+    {
+      /* build the dimensions of C, including the sizes of the various
+	 blocks along with the final permutation at the same time */
+      int j = 0;
+      double *p = mxGetPr(prhs2[1])-1; /* unit offset */
+      p[1] = 1.0; p[2] = 2.0; /* singletons */
+
+      for (int i = 0; pa2[i] != 0; i++) {
+	/* check common sizes */
+	if (sizA[pa2[i]-1] != sizB[pb2[i]-1])
+	  mexErrMsgIdAndTxt("tsum:e8","Dimensions must agree.");
+	k *= (sizC[j++] = sizA[pa2[i]-1]);
+	p[(int)ia[pa2[i]]] = j;
+      }
+      for (int i = 0; pb1[i] != 0; i++) {
+	m *= (sizC[j++] = sizB[pb1[i]-1]);
+	p[(int)ib[pb1[i]]] = j;
+      }
+      for (int i = 0; pa1[i] != 0; i++) {
+	n *= (sizC[j++] = sizA[pa1[i]-1]);
+	p[(int)ia[pa1[i]]] = j;
+      }
+    }
+    prhs2[0] = mxCreateNumericArray(MAX(2,max),sizC,mxDOUBLE_CLASS,
+				    MXTYPE(prhs[0],prhs[1]));
+    double *prC = mxGetPr(prhs2[0]),*piC = mxGetPi(prhs2[0]);
+
+    /* Evaluate the sum on the canonical form C = tsum(A,B,[1 3],[1
+       2]), where A is k-by-n, B is k-by-m and C is k-by-m-by-n. */
+
+    /* real + real, complex + real, real + complex, complex + complex */
+    if (piA == NULL || piB == NULL) {
+      for (int ja = 0; ja < n; ja++, prA += k, prB -= k*m)
+	for (int jb = 0; jb < m; jb++, prA -= k) {
+	  for (int ii = 0; ii < k; ii++)
+	    *prC++ = (*prA++)+(*prB++);
+	}
+
+      /* straightforward copy of imaginary parts, if any */
+      if (piA != NULL)
+	for (int ja = 0; ja < n; ja++, piA += k)
+	  for (int jb = 0; jb < m; jb++, piC += k)
+	    memcpy(piC,piA,k*sizeof(double));
+      else if (piB != NULL)
+	for (int ja = 0; ja < n; ja++, piC += k*m)
+	  memcpy(piC,piB,k*m*sizeof(double));
+    }
+    else
+      /* same loop as in the real case, rewritten for typographical
+	 reasons */
+      for (int ja = 0; ja < n; ja++) {
+	for (int jb = 0; jb < m; jb++) {
+	  for (int ii = 0; ii < k; ii++) {
+	    *prC++ = (*prA++)+(*prB++);
+	    *piC++ = (*piA++)+(*piB++);
+	  }
+	  prA -= k; piA -= k;
+	}
+	prA += k; piA += k;
+	prB -= k*m; piB -= k*m;
+      }
+
+    /* deallocate permuted versions of A and B */
+    mxDestroyArray(Bp); mxDestroyArray(Ap);
+
+    /* permute according to input */
+    mexCallMATLAB(1,plhs,2,prhs2,"permute");
+    mxDestroyArray(prhs2[1]); mxDestroyArray(prhs2[0]);
+  }
+  /* tensor-sum of one array */
+  else {
+    /* check of input */
+    if (!ISDOUBLETENSOR(prhs[0]))
+      mexErrMsgIdAndTxt("tsum:e2","Expecting a double, "
+			"non-sparse array.");
+    if (!ISDOUBLEREALTENSOR(prhs[1]))
+      mexErrMsgIdAndTxt("tsum:e3","Index vector must be real, "
+			"double and non-sparse.");
+
+    /* input IA */
+    const double *ia = mxGetPr(prhs[1]);
+    const mwSize nda = mxGetNumberOfElements(prhs[1]);
+
+    for (int i = 0; i < nda; i++)
+      if (ia[i] == 0.0 || ia[i] != ceil(ia[i]))
+	mexErrMsgIdAndTxt("tsum:e9","Index vector must contain "
+			  "nonzero integers.");
+
+    if (mxGetNumberOfElements(prhs[0]) == 0) {
+      /* empty dimensions is special since the output might be larger
+	 than the input (a sum over an empty dimension produces a
+	 singleton dimension) */
+      const mwSize ndimA = mxGetNumberOfDimensions(prhs[0]);
+      mwSize sizA[ndimA];
+      memcpy(sizA,mxGetDimensions(prhs[0]),ndimA*sizeof(sizA[0]));
+
+      for (int i = 0; i < nda; i++) {
+	const int dim = abs((int)ia[i]);
+	/* fix for a GCC-bug in abs() -- see sum() below */
+	if (0 < dim && dim <= ndimA) sizA[dim-1] = 1;
+      }
+      plhs[0] = mxCreateNumericArray(ndimA,sizA,mxDOUBLE_CLASS,
+				     mxIsComplex(prhs[1]) ? mxCOMPLEX 
+				     : mxREAL);
+      return;
+    }
+    else if (nda == 0) {
+      plhs[0] = mxDuplicateArray(prhs[0]);
+      return;
+    }
+
+    /* input and working arrays A and B */
+    double *prA = mxGetPr(prhs[0]),*piA = mxGetPi(prhs[0]);
+    double *prB,*piB = NULL;
+
+    /* dimensions and stride, to be updated */
+    const mwSize ndimA = mxGetNumberOfDimensions(prhs[0]);
+    mwSize sizA[ndimA],strA[ndimA+1];
+    memcpy(sizA,mxGetDimensions(prhs[0]),ndimA*sizeof(sizA[0]));
+    strA[0] = 1;
+    for (int i = 0; i < ndimA; i++) strA[i+1] = strA[i]*sizA[i];
+
+    /* B is A summed along the first dimension in IA */
+    {
+      const int dim = abs((int)ia[0]);
+      int remove = 1;
+      /* fix for a GCC-bug in abs() -- see sum() below */
+      if (0 < dim && dim <= ndimA) remove = sizA[dim-1];
+      prB = mxMalloc(strA[ndimA]/remove*sizeof(double));
+      if (piA != NULL) piB = mxMalloc(strA[ndimA]/remove*sizeof(double));
+    }
+
+    /* must be saved: */
+    double *prB_ = prB,*piB_ = piB;
+
+    /* sum over all given dimensions */
+    sum((int)ia[0],prB,piB,prA,piA,NULL,NULL,ndimA,sizA,strA);
+    for (int i = 1; i < nda; i++)
+      sum((int)ia[i],NULL,NULL,prB,piB,&prB,&piB,ndimA,sizA,strA);
+
+    /* account for negative indices and reallocate the final sum */
+    memmove(prB_,prB,strA[ndimA]*sizeof(double)); prB = prB_;
+    prB = mxRealloc(prB,strA[ndimA]*sizeof(double));    
+    if (piB != NULL) {
+      memmove(piB_,piB,strA[ndimA]*sizeof(double)); piB = piB_;
+      piB = mxRealloc(piB,strA[ndimA]*sizeof(double));
+    }
+    mxArray *B = mxCreateDoubleMatrix(0,0,piB != NULL ? mxCOMPLEX 
+				      : mxREAL);
+    mxFree(mxGetPr(B)); mxSetPr(B,prB);
+    if (piB != NULL) {
+      mxFree(mxGetPi(B)); mxSetPi(B,piB);
+    }
+    mxSetDimensions(B,sizA,ndimA);
+    plhs[0] = B;
+  }
+}
+/*-----------------------------------------------------------------------*/
+void checkix(const double *ix,mwSize len,int *max)
+/* Checks the indices ix[1..len] and adjusts max accordingly. */
+{
+  for (int i = 1; i <= len; i++) {
+    if (ix[i] <= 0.0 || ix[i] != ceil(ix[i]) || isinf(ix[i]))
+      mexErrMsgIdAndTxt("tsum:e4","Index vector must contain "
+			"positive integers.");
+
+    if (*max < ix[i]) *max = (int)ix[i];
+  }
+}
+/*-----------------------------------------------------------------------*/
+void sortix(const double *ix,mwSize len,int *in)
+/* Sorts the indices ix[1..len] and determines a rank-table, in, which
+   is assumed to be allocated and cleared prior to call. */
+{
+  for (int i = 1; i <= len; i++) {
+    const int ii = (int)ix[i]-1;
+
+    if (in[ii] != 0)
+      mexErrMsgIdAndTxt("tsum:e6","Indices must be distinct.");
+    in[ii] = i;
+  }
+}
+/*-----------------------------------------------------------------------*/
+void logicalsize(const mxArray *A,mwSize nda,mwSize *sizA)
+/* Determines the size sizA of the array A. The sizes of all
+   dimensions i, 1 <= i <= nda are included in sizA. */
+{
+  mwSize ndimA = mxGetNumberOfDimensions(A);
+  const mwSize *msizA = mxGetDimensions(A);
+
+  /* mask for Matlab's stupid convention */
+  if (ndimA == 2) ndimA -= (msizA[1] == 1)*(1+(msizA[0] == 1));
+  if (nda < ndimA)
+    mexErrMsgIdAndTxt("tsum:e7","Wrong size of index vector.");
+
+  memcpy(sizA,msizA,ndimA*sizeof(sizA[0]));
+  for (int i = ndimA; i < nda; i++) sizA[i] = 1;
+}
+/*-----------------------------------------------------------------------*/
+void permute(mxArray **Ap,const mxArray *A,
+	     const int *p1,const int *p2,int len)
+/* Permutes the array A according to the ordering p = [p1 p2] which
+   must be of total length len. The result is stored in Ap. */
+{
+  /* input to Matlab */
+  mxArray *prhs[2];
+  prhs[0] = (mxArray *)A;
+  prhs[1] = mxCreateDoubleMatrix(1,MAX(2,len),mxREAL);
+
+  int j = 0;
+  double *p = mxGetPr(prhs[1]);
+  p[0] = 1.0; p[1] = 2.0; /* singletons */
+
+  /* construct the permutation p */
+  for (int i = 0; p1[i] != 0; i++) p[j++] = (double)p1[i];
+  for (int i = 0; p2[i] != 0; i++) p[j++] = (double)p2[i];
+
+  /* call Matlab and clean up */
+  mexCallMATLAB(1,Ap,2,prhs,"permute");
+  mxDestroyArray(prhs[1]);
+}
+/*-----------------------------------------------------------------------*/
+void sum(int dim,double *prB,double *piB,
+	 double *prA,double *piA,double **prC,double **piC,
+	 mwSize ndims,mwSize *siz,mwSize *str)
+/* Sums the array (prA,piA) along the dimension dim (dim must be
+   nonzero and the corresponding dimension is not allowed to be of
+   vanishing width). The resulting array is placed in (prB,piB) and
+   contains on exit the newly created sum with siz[dim-1] = 1 (siz and
+   str are thus updated). Use prB = piB = NULL to perform the sum
+   within the original memory (prA,piA). In this case, the result
+   starts at (prC,piC) which is somewhere inside (prA,piA). The input
+   dim may be negative, indicating summation backwards. */
+{
+  const bool fwd = dim >= 0;
+  dim = abs(dim);
+
+  /* early returns (plus a small fix for GCC-problems with abs(the
+     most negative number)) */
+  if (dim < 0 || ndims < dim || siz[dim-1] <= 1) {
+    if (prB == NULL) {
+      *prC = prA;
+      *piC = piA;
+    }
+    else {
+      memcpy(prB,prA,str[ndims]*sizeof(double));
+      if (piA != NULL) memcpy(piB,piA,str[ndims]*sizeof(double));
+    }
+    return;
+  }
+
+  if (piA == NULL) {
+    if (fwd) {
+      /* forward summation */
+      if (prB == NULL) *prC = prB = prA;
+
+      /* summing along the first non-singleton dimension is faster */
+      if (str[dim-1] == 1)
+	for (int i = 0; i < str[ndims]; i += str[dim], prB++) {
+	  *prB = *prA++;
+	  for (int j = 1; j < siz[dim-1]; j++)
+	    *prB += *prA++;
+	}
+      else
+	for (int i = 0; i < str[ndims]; i += str[dim], prB += str[dim-1]) {
+	  memmove(prB,prA,str[dim-1]*sizeof(double));
+	  prA += str[dim-1];
+	  for (int j = 1; j < siz[dim-1]; j++)
+	    for (int k = 0; k < str[dim-1]; k++)
+	      prB[k] += *prA++;
+	}
+    }
+    else {
+      /* backward summation */
+      const bool inside = prB == NULL;
+      if (inside)
+	prB = prA += str[ndims];
+      else {
+	prB += str[ndims]/siz[dim-1];
+	prA += str[ndims];
+      }
+
+      /* it's still easier to sum along the first non-singleton... */
+      if (str[dim-1] == 1) {
+	for (int i = 0; i < str[ndims]; i += str[dim]) {
+	  *--prB = *--prA;
+	  for (int j = 1; j < siz[dim-1]; j++)
+	    *prB += *--prA;
+	}
+	/* the sum has been computed in the wrong memory area */
+	if (inside) *prC = prB;
+      }
+      else {
+	for (int i = 0; i < str[ndims]; i += str[dim]) {
+	  prB -= str[dim-1];
+	  prA -= str[dim-1];
+	  memmove(prB,prA,str[dim-1]*sizeof(double));
+	  for (int j = 1; j < siz[dim-1]; j++)
+	    for (int k = str[dim-1]; --k >= 0; )
+	      prB[k] += *--prA;
+	}
+	if (inside) *prC = prB;
+      }
+    }
+  }
+  else {
+    /* the complex case follows analogously */
+    if (fwd) {
+      if (prB == NULL) {
+	*prC = prB = prA;
+	*piC = piB = piA;
+      }
+
+      if (str[dim-1] == 1)
+	for (int i = 0; i < str[ndims]; i += str[dim], prB++, piB++) {
+	  *prB = *prA++; *piB = *piA++;
+	  for (int j = 1; j < siz[dim-1]; j++) {
+	    *prB += *prA++;
+	    *piB += *piA++;
+	  }
+	}
+      else
+	for (int i = 0; i < str[ndims]; i += str[dim], 
+	       prB += str[dim-1], piB += str[dim-1]) {
+	  memmove(prB,prA,str[dim-1]*sizeof(double));
+	  prA += str[dim-1];
+	  memmove(piB,piA,str[dim-1]*sizeof(double));
+	  piA += str[dim-1];
+	  for (int j = 1; j < siz[dim-1]; j++)
+	    for (int k = 0; k < str[dim-1]; k++) {
+	      prB[k] += *prA++;
+	      piB[k] += *piA++;
+	    }
+	}
+    }
+    else {
+      const bool inside = prB == NULL;
+      if (inside) {
+	prB = prA += str[ndims];
+	piB = piA += str[ndims];
+      }
+      else {
+	prB += str[ndims]/siz[dim-1];
+	prA += str[ndims];
+	piB += str[ndims]/siz[dim-1];
+	piA += str[ndims];
+      }
+
+      if (str[dim-1] == 1) {
+	for (int i = 0; i < str[ndims]; i += str[dim]) {
+	  *--prB = *--prA;
+	  *--piB = *--piA;
+	  for (int j = 1; j < siz[dim-1]; j++) {
+	    *prB += *--prA;
+	    *piB += *--piA;
+	  }
+	}
+	if (inside) {
+	  *prC = prB;
+	  *piC = piB;
+	}
+      }
+      else {
+	for (int i = 0; i < str[ndims]; i += str[dim]) {
+	  prB -= str[dim-1];
+	  prA -= str[dim-1];
+	  memmove(prB,prA,str[dim-1]*sizeof(double));
+	  piB -= str[dim-1];
+	  piA -= str[dim-1];
+	  memmove(piB,piA,str[dim-1]*sizeof(double));
+	  for (int j = 1; j < siz[dim-1]; j++)
+	    for (int k = str[dim-1]; --k >= 0; ) {
+	      prB[k] += *--prA;
+	      piB[k] += *--piA;
+	    }
+	}
+	if (inside) {
+	  *prC = prB;
+	  *piC = piB;
+	}
+      }
+    }
+  }
+
+  /* update the dimensions */
+  siz[dim-1] = 1;
+  for (int i = dim-1; i < ndims; i++) str[i+1] = str[i]*siz[i];
+}
+/*-----------------------------------------------------------------------*/
diff --git a/stenglib/Tensor/startup.m b/stenglib/Tensor/startup.m
new file mode 100644
index 00000000..1865f967
--- /dev/null
+++ b/stenglib/Tensor/startup.m
@@ -0,0 +1,9 @@
+%STARTUP Add paths to TENSOR stuff.
+
+% S. Engblom 2005-04-10
+
+s = pwd;
+if exist('test','dir')
+  addpath([s '/test']);
+end
+addpath([s '/source']);
diff --git a/stenglib/Tensor/tndims.m b/stenglib/Tensor/tndims.m
new file mode 100644
index 00000000..daed17e1
--- /dev/null
+++ b/stenglib/Tensor/tndims.m
@@ -0,0 +1,17 @@
+%TNDIMS Number of dimensions.
+%   N = TNDIMS(A) returns the number of dimensions of the array A. TNDIMS
+%   works just like the MATLAB function NDIMS except that scalars and
+%   column vectors are detected correctly, see the examples below. This is
+%   useful since the equivalent construction using NDIMS and SIZE is quite
+%   ugly.
+%
+%   Examples:
+%     [tndims(rand(2,2)) ndims(rand(2,2))]
+%     [tndims(rand(2,1)) ndims(rand(2,1))]
+%     [tndims(rand(1,1)) ndims(rand(1,1))]
+%
+%   See also NDIMS, SIZE, TSIZE.
+
+% S. Engblom 2005-04-10
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Tensor/tndims.mexw64 b/stenglib/Tensor/tndims.mexw64
new file mode 100644
index 00000000..4eada8ed
Binary files /dev/null and b/stenglib/Tensor/tndims.mexw64 differ
diff --git a/stenglib/Tensor/tprod.m b/stenglib/Tensor/tprod.m
new file mode 100644
index 00000000..a2bf528e
--- /dev/null
+++ b/stenglib/Tensor/tprod.m
@@ -0,0 +1,80 @@
+function c = tprod(a,b,ia,ib)
+%TPROD Tensor product.
+%   C = TPROD(A,B,IA,IB) computes a general tensor product of the arrays A
+%   and B. The supported tensor product can be described as a general
+%   multiplication of the elements in A and B where some indices may be
+%   equal and/or summed over. The mapping from input indices to output
+%   indices, as well as how to sum, is described by the vectors IA and IB.
+%
+%   This function is best explained by an example. Let A be a
+%   4-dimensional array and let B be a 3-dimensional array. Then
+%     C = TPROD(A,B,[2 -1 1 -2],[-2 2 -1])
+%   creates a 2-dimensional array (matrix) C in the following way.
+%   First, the product
+%     D(j2,j1,i1,i2) = A(i2,j1,i1,j2)*B(j2,i2,j1)
+%   is formed. This is a 4-dimensional array D, where j1, j2, i1, i2
+%   denote index variables. D is the tensor (outer) product of A and B,
+%   followed by a permutation of the indices and setting some indices
+%   equal. Of course, it is assumed that SIZE(A,1) = SIZE(B,2), SIZE(A,2)
+%   = SIZE(B,3), and SIZE(A,4) = SIZE(B,1). Second, we sum over the index
+%   variables corresponding to negative numbers (j1 and j2):
+%     C(i1,i2) = sum of D(j2,j1,i1,i2)
+%   where the indices j1 and j2 (independently) run through all their
+%   possible values.
+%
+%   The arguments are assumed to have the following format:
+%     - A, B, and C are (real or complex) arrays of doubles.
+%
+%     - IA and IB are vectors of doubles, containing nonzero integers. The
+%     length of IA(IB) has to be equal to the number of dimensions of
+%     A(B). However, since MATLAB automatically removes trailing singleton
+%     dimensions, A is padded with singleton dimensions if the number of
+%     dimensions of A is less than the length of IA (and similarly for B).
+%     Unlike MATLAB, TPROD supports 0-D and 1-D tensors (scalars and
+%     column vectors), so the index vectors may contain less than 2
+%     elements.
+%
+%     - The numbers in IA(IB) have to be distinct.
+%
+%     - If a number occurs both in IA and IB, it is required that the
+%     corresponding dimensions in A and B have the same size.
+%
+%     - If a negative number occurs in A(B), it must also occur in B(A).
+%
+%     - It is assumed that the union of the numbers in IA and IB together
+%     with 0 form a contiguous sequence of integers.
+%
+%   For optimal performance of TPROD you should try to have the vectors
+%   IA, IB and [IA IB] in order.
+%
+%   Examples:
+%     % the tensor (outer) product of the matrices A and B
+%     A = rand(3,4); B = rand(2,5);
+%     C1 = tprod(A,B,[1 2],[3 4]);
+%
+%     % the Kronecker product of the matrices A and B
+%     C2 = reshape(tprod(A,B,[2 4],[1 3]),size(A).*size(B));
+%
+%     % the ordinary matrix product of the matrices A and B
+%     A = rand(3,4); B = rand(4,2);
+%     C3 = tprod(A,B,[1 -1],[-1 2]);
+%
+%     % the element-wise product of the matrices A and B
+%     A = rand(3,5); B = rand(3,5);
+%     C4 = tprod(A,B,[1 2],[1 2]);
+%
+%     % the transpose of the matrix A
+%     C5 = tprod(A,1,[2 1],[]);
+%
+%     % the sum of all entries in the matrix A
+%     C6 = tprod(A,ones(size(A)),[-1 -2],[-1 -2]);
+%
+%     % the sum of all diagonal entries in the matrix A (the trace)
+%     C7 = tprod(A,eye(size(A)),[-1 -2],[-1 -2]);
+%
+%   See also KRON, MTIMES, TIMES.
+
+% S. Engblom 2005-04-04
+% Based on a concept by D. Bertilsson, COMSOL AB.
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Tensor/tprod.mexw64 b/stenglib/Tensor/tprod.mexw64
new file mode 100644
index 00000000..a6b3d5a3
Binary files /dev/null and b/stenglib/Tensor/tprod.mexw64 differ
diff --git a/stenglib/Tensor/tsize.m b/stenglib/Tensor/tsize.m
new file mode 100644
index 00000000..625cf24c
--- /dev/null
+++ b/stenglib/Tensor/tsize.m
@@ -0,0 +1,21 @@
+%TSIZE Size of array.
+%   SIZ = TSIZE(A) returns the size of the array A. Unlike the MATLAB
+%   function SIZE, the size of SIZ itself is 1-by-TNDIMS(A) (see this
+%   function), so that scalars and column vectors are treated in a
+%   consistent way.
+%
+%   SIZ = TSIZE(A,DIMS) returns the size along the dimensions DIMS. Unlike
+%   the corresponding syntax using SIZE, DIMS may be a vector.
+%
+%   Examples:
+%     [tsize(rand(2,2)) size(rand(2,2))]
+%     [tsize(rand(2,1)) size(rand(2,1))]
+%     [tsize(rand(1,1)) size(rand(1,1))]
+%
+%     tsize(rand(4,3,2,1),1:4)
+%
+%   See also TNDIMS, SIZE.
+
+% S. Engblom 2005-04-10
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Tensor/tsize.mexw64 b/stenglib/Tensor/tsize.mexw64
new file mode 100644
index 00000000..859b584a
Binary files /dev/null and b/stenglib/Tensor/tsize.mexw64 differ
diff --git a/stenglib/Tensor/tsum.m b/stenglib/Tensor/tsum.m
new file mode 100644
index 00000000..6e4f17ab
--- /dev/null
+++ b/stenglib/Tensor/tsum.m
@@ -0,0 +1,74 @@
+function c = tsum(a,b,ia,ib)
+%TSUM Tensor summation.
+%   C = TSUM(A,B,IA,IB) computes a general sum of the arrays A and B in
+%   the order specified by the vectors IA and IB.
+%
+%   This syntax is best explained by an example. Let A be a 3-dimensional
+%   array and let B be a 2-dimensional array. Then
+%     C = TSUM(A,B,[1 3 2],[3 1])
+%   creates a 3-dimensional array C by forming the sum
+%     C(i1,i2,i3) = A(i1,i3,i2)+B(i3,i1).
+%   Of course, it is assumed that SIZE(A,1) = SIZE(B,2) and SIZE(A,2) =
+%   SIZE(B,1). The output C has the dimensions
+%   SIZE(A,1)-by-size(A,3)-by-SIZE(A,2).
+%
+%   The arguments are assumed to have the following format:
+%     - A, B, and C are (real or complex) arrays of doubles.
+%
+%     - IA and IB are vectors of doubles, containing positive
+%     integers. The length of IA(IB) has to be equal to the number of
+%     dimensions of A(B). However, since MATLAB automatically removes
+%     trailing singleton dimensions, A is padded with singleton dimensions
+%     if the number of dimensions of A is less than the length of IA (and
+%     similarly for B). Unlike MATLAB, TSUM supports 0-D and 1-D tensors
+%     (scalars and column vectors), so the index vectors may contain less
+%     than 2 elements.
+%
+%     - The numbers in IA(IB) have to be distinct.
+%
+%     - If a number occurs both in IA and IB, it is required that the
+%     corresponding dimensions in A and B have the same size.
+%
+%     - It is assumed that the union of the numbers in IA and IB form a
+%     contiguous sequence of integers.
+%
+%   For optimal performance of TSUM you should try to have the vectors IA,
+%   IB and [IA IB] in order.
+%
+%   C = TSUM(A,IA) sums the array A along the dimensions specified in the
+%   ordered vector IA. This is an extension of the MATLAB-function SUM,
+%   which only allows for summation along one dimension. After summing,
+%   the dimensions specified in IA become singleton dimensions of C; you
+%   may efficiently remove them by using the MATLAB-function SQUEEZE.
+%
+%   As a further extension of MATLABs SUM-function, negative numbers in IA
+%   may be used to force backward summation of the corresponding
+%   dimensions.
+%
+%   Examples:
+%     % the "outer sum" of the vectors v and w
+%     v = rand(3,1); w = rand(4,1);
+%     C1 = tsum(v,w,[1],[2]);
+%
+%     % the symmetric part of the matrix A
+%     A = rand(4);
+%     C2 = 0.5*tsum(A,A,[1 2],[2 1]);
+%
+%     % the transpose of the matrix A
+%     C3 = tsum(A,0,[2 1],[]);
+%
+%     % add i to row i of A
+%     C4 = tsum(A,1:4,[1 2],[3 1]);
+%
+%     % sum dimensions 1 and 3 of B
+%     B = rand(3,4,5);
+%     C5 = tsum(B,[1 3]);
+%
+%     % sum backwards
+%     C6 = tsum([1:100000].^-4,-2);
+%
+%   See also TPROD, SUM, SQUEEZE.
+
+% S. Engblom 2005-08-26
+
+error('.MEX-file not found on path.');
diff --git a/stenglib/Tensor/tsum.mexw64 b/stenglib/Tensor/tsum.mexw64
new file mode 100644
index 00000000..4abd0d0a
Binary files /dev/null and b/stenglib/Tensor/tsum.mexw64 differ
diff --git a/stenglib/Utils/arr2latex.m b/stenglib/Utils/arr2latex.m
new file mode 100644
index 00000000..7965447e
--- /dev/null
+++ b/stenglib/Utils/arr2latex.m
@@ -0,0 +1,232 @@
+function S = arr2latex(T,fmt,varargin)
+%ARR2LATEX LaTeX-table from matrix.
+%   S = ARR2LATEX(T,FMT) creates a LaTeX-table S from a matrix T and a
+%   format string FMT.
+%
+%   T must be a matrix containing scalars that SPRINTF
+%   accepts. Currently, complex matrices are not supported.
+%
+%   FMT must be a cell-matrix of the same size as T. Singleton
+%   dimensions are, however, automatically expanded. ARR2LATEX
+%   supports all format-flags as supported by SPRINTF but the
+%   percent-character should not be included as it is added
+%   automatically.
+%
+%   Additionally, flags ending with a dollar-sign (see the second
+%   example below) expands the 'scientific' notation so that
+%   '6.022e23' becomes '$6.022 \cdot 10^{23}$' instead. Also, 'n_'
+%   reflects n whitespaces in a row and is useful for typesetting
+%   empty entries.
+%
+%   S = ARR2LATEX(T,FMT,...) allows for options to be passed; the
+%   table below explains the available options.
+%
+%   Property    Value/{Default}           Description
+%   -----------------------------------------------------------------------
+%   collabel,   Cell-vector with          Label of columns and rows.
+%   rowlabel    strings {''}
+%
+%   breakrow    Integer {74}              Where to insert a newline.
+%
+%   hline       {'on'} | 'off'            Insert a \hline between
+%                                         each row.
+%
+%   posinf      String {'$\infty$'}       Symbols of plus/minus infinity
+%   neginf      String {'$-\infty$'}      and NaN's. Use empty to type
+%   nan         String {'-'}              them using SPRINTFs defaults.
+%
+%   caption     String {''}               The caption of the table.
+%
+%   centering   {'on'} | 'off'            Use \centering.
+%
+%   colspec     String {'|c| ... |c|'}    The format of each column.
+%
+%   label       String {''}               The label of the table.
+%
+%   pos         String {'htp'}            The position of the table.
+%
+%   Examples:
+%     % a small table of the BesselJ-function
+%     A = [0 1:5; ...
+%          [1:4]' besselj([1:5],[1:4]')];
+%     fmt = [{'1_'} frepmat({'7d'},[1 5]); ...
+%            frepmat([{'d'} frepmat({'7.4f'},[1 5])],[4])];
+%     S = arr2latex(A,fmt,'colspec','|c||r|r|r|r|r|','hline','off')
+%
+%     % the Gamma-function
+%     A = [10:15; gamma(10:15)]';
+%     fmt = {'3d' '6.4e$'};
+%     T = arr2latex(A,fmt,'colspec','|c|l|', ...
+%                         'collabel',{'$x$' '$\Gamma(x)$'}, ...
+%                         'hline','off')
+%
+%   See also SPRINTF.
+
+% S. Engblom 2005-09-22
+
+% checks...
+if ~isreal(T) || ndims(T) > 2
+  error('Real, two-dimensional matrix expected.');
+end
+if ndims(fmt) > 2 || ~isa(fmt,'cell') || ...
+      ~all(all(cellfun('isclass',fmt,'char') & ...
+               cellfun('ndims',fmt) == 2 & ...
+               cellfun('size',fmt,1) == 1))
+  error('Cell-matrix containing row strings expected.');
+end
+
+% expand fmt to match T
+rep = size(T)./size(fmt);
+if any(rep ~= ceil(rep))
+  error('The sizes of the table and the format strings must match.');
+end
+fmt = frepmat(fmt,rep);
+
+% parse options
+opts = struct('collabel','', ...
+              'rowlabel','', ...
+              'breakrow',74, ...
+              'hline','on', ...
+              'posinf','$\infty$', ...
+              'neginf','$-\infty', ...
+              'nan','  -', ...
+              'caption','', ...
+              'centering','on', ...
+              'colspec',['|' frepmat('c|',[1 size(T,2)])], ...
+              'label','', ...
+              'pos','htp');
+[opts,got] = parseopts(opts,varargin);
+if strcmpi(opts.hline,'on')
+  opts.hline = '\t\\hline\n';
+else
+  opts.hline = '';
+end
+if ~isempty(opts.label)
+  opts.label = ['\\label{' opts.label '}\n'];
+end
+if strcmpi(opts.centering,'on')
+  opts.centering = '\\centering\n';
+else
+  opts.centering = '';
+end
+% rowlabels? add one column!
+if ~isempty(opts.rowlabel) && ~got.colspec
+  opts.colspec = ['|c|' opts.colspec];
+end
+if ~isempty(opts.caption)
+  opts.caption = ['\\caption{' l_string(opts.caption) '}\n'];
+end
+
+% the beginning...
+S = ['\\begin{table}[' opts.pos ']\n' ...
+     opts.centering ...
+     '\\begin{tabular}{' opts.colspec '}\n' ...
+     '\t\\hline\n'];
+
+% column labels
+if ~isempty(opts.collabel)
+  s = '\t';
+  for j = 1:size(T,2)
+    s = [s l_string(opts.collabel{j}) ' & '];
+  end
+  s(end-1:end) = '';
+  s = l_break(s,opts.breakrow);
+  s = [s '\t\\\\\n\t\\hline\n'];
+  S = [S s];
+end
+
+% ...the body...
+for i = 1:size(T,1)
+  s = '\t';
+  if ~isempty(opts.rowlabel)
+    s = [s l_string(opts.rowlabel{i}) ' & '];
+  end
+  for j = 1:size(T,2)
+    ss = l_format(fmt{i,j},T(i,j));
+
+    % signs for infinity and NaN's, if any -- try to keep the tabs 
+    % of the table whenever possible
+    if ~isempty(opts.posinf) && T(i,j) == Inf
+      ss = [frepmat(' ',[1 max(size(ss,2)-size(opts.posinf,2),0)]) ...
+            l_string(opts.posinf)];
+    elseif ~isempty(opts.neginf) && T(i,j) == -Inf
+      ss = [frepmat(' ',[1 max(size(ss,2)-size(opts.neginf,2),0)]) ...
+            l_string(opts.neginf)];
+    elseif ~isempty(opts.nan) && isnan(T(i,j))
+      ss = [frepmat(' ',[1 max(size(ss,2)-size(opts.nan,2),0)]) ...
+            l_string(opts.nan)];
+    end
+    s = [s ss ' & '];
+  end
+  s(end-1:end) = '';
+
+  % possibly insert breaks
+  s = l_break(s,opts.breakrow);
+
+  % new row
+  s = [s '\t\\\\\n' opts.hline];
+  S = [S s];
+end
+
+% always produce a 'bounding box'
+if isempty(opts.hline), S = [S '\t\\hline\n']; end
+
+% ...and the end!
+S = [S ...
+     '\\end{tabular}\n' ...
+     opts.caption ...
+     opts.label ...
+     '\\end{table}'];
+
+% let sprintf do the rest
+S = sprintf(S);
+
+%--------------------------------------------------------------------------
+function s = l_string(t)
+%L_STRING Formatting one string.
+%   S = L_FORMAT(T) formats the string T. Care is taken to insert a
+%   double '\\' whenever a single slash is encountered.
+
+s = t(sort([find(t == '\') 1:size(t,2)]));
+
+%--------------------------------------------------------------------------
+function s = l_break(t,n)
+%L_BREAK Break rows.
+%   S = L_BREAK(T,N) tries to insert new lines in the string T at
+%   every Nth character position. The new lines are always inserted
+%   after a '&'-character and a double tab is used so as to keep the
+%   text readable.
+
+ii = find(t == '&');
+jj = find(diff(mod(ii,n)) < 0);
+if ~isempty(jj)
+  jj = ii(jj);
+  s = t(1:jj(1));
+  for k = [jj+1; jj(2:end) size(t,2)]
+    s = [s '\n\t\t' t(k(1):k(2))];
+  end
+else
+  s = t;
+end
+
+%--------------------------------------------------------------------------
+function s = l_format(f,t)
+%L_FORMAT Formatting one scalar.
+%   S = L_FORMAT(F,T) formats the scalar T using the flag F and
+%   returns the result in the string S.
+
+if f(end) == '_'
+  s = frepmat(' ',[1 str2num(f(1:end-1))]);
+elseif f(end) == '$'
+  s = sprintf(['%' f(1:end-1)],t);
+  i = find(s == 'e',1,'first');
+  if ~isempty(i)
+    s1 = s(1:i-1); s2 = s(i+1:end);
+    % masks for 'e-08' ('-8'), 'e+01' ('1'), 'e+00' ('0')
+    s2 = num2str(str2num(s2));
+    s = ['$' s1 ' \\cdot 10^{' s2 '}$'];
+  end
+else
+  s = sprintf(['%' f],t);
+end
+%--------------------------------------------------------------------------
diff --git a/stenglib/Utils/assignopts.m b/stenglib/Utils/assignopts.m
new file mode 100644
index 00000000..9d1c780c
--- /dev/null
+++ b/stenglib/Utils/assignopts.m
@@ -0,0 +1,46 @@
+function assignopts(opts)
+%ASSIGNOPTS Assign options.
+%   ASSIGNOPTS(OPTS) assigns the options in OPTS. This is useful
+%   whenever you prefer to use 'name' rather than 'OPTS.name'
+%   throughout the code. OPTS can either be a structure or a
+%   cell-vector with property/value pairs.
+%
+%   Example:
+%     % caller's site:
+%     opts.order = 2;
+%
+%     % inside fun(opts, ...)
+%     def.rho = 0.01;
+%     def.method = 'lagrange';
+%     def.order = 4;
+%     opts = parseopts(def,opts);
+%     assignopts(opts);
+%
+%     % use the variables directly
+%     order % note: 2
+%     method
+%
+%   See also PARSEOPTS, ASSIGNIN.
+
+% S. Engblom 2010-08-25
+
+if isstruct(opts)
+  if any(size(opts) ~= 1)
+    error('Expecting a scalar structure.');
+  end
+  field = fieldnames(opts);
+  val = struct2cell(opts);
+elseif iscell(opts)
+  field = reshape(opts(1:2:end),[],1);
+  val = reshape(opts(2:2:end),[],1);
+  if size(field,1) ~= size(val,1)
+    error('Cell-vector must contain property/value pairs.');
+  end
+else
+  error('Options must either be a struct or a cell-vector.');
+end
+
+% immediate
+for i = 1:size(field,1)
+  assignin('caller',field{i},val{i});
+end
diff --git a/stenglib/Utils/connect.m b/stenglib/Utils/connect.m
new file mode 100644
index 00000000..fd141fc3
--- /dev/null
+++ b/stenglib/Utils/connect.m
@@ -0,0 +1,113 @@
+function S = connect(z,tol,exact)
+%CONNECT Connectivity information for points.
+%   S = CONNECT(Z,tol) determines closest neighbours for all the (complex) 
+%   coordinates Z so that find(S(:,i)) contains a complete list of indices
+%   to elements Z(j) that satisfy abs(Z(j)-Z(i)) <= tol.
+%
+%   S = CONNECT(Z,tol,0) returns a slightly less sharp result. The list is
+%   still complete but generally contains more elements. All elements Z(j)
+%   are guaranteed to satisfy abs(Z(j)-Z(i)) <= 2*sqrt(2)*tol ~ 2.82*tol.
+%
+%   The output S is an N-by-N sparse matrix, where N is the number of
+%   coordinates.
+%
+%   Cautionary: if tol is small compared to the largest bounding box
+%   containing all points in Z, a lot of memory is allocated.
+%
+%   Example:
+%     % 10000 random points
+%     np = 10000;
+%     z = 2*complex(rand(np,1),rand(np,1))-(1+1i);
+%
+%     S = connect(z,0.5e-1);
+%     S0 = connect(z,0.5e-1,0); % faster but less sharp
+%
+%     figure, plot(z,'b.'); hold on
+%     i = find(S0(:,1)); plot(z(i),'r.');
+%     i = find(S(:,1)); plot(z(i),'c.'); plot(z(1),'k*');
+%     j = find(S0(:,2)); plot(z(j),'r.');
+%     j = find(S(:,2)); plot(z(j),'c.'); plot(z(2),'k*');
+%     axis equal, axis([-1 1 -1 1]);
+
+% S. Engblom 2009-09-24 (Minor revision)
+% S. Engblom 2008-04-29
+
+% check input
+if ~isscalar(tol) || tol <= 0
+  error('Invalid input.');
+end
+
+% column vector
+z = z(:); N = size(z,1);
+
+% extract coordinates
+x = real(z); y = imag(z);
+
+% bounding box
+xmin = min(x); hx = max(x)-xmin;
+ymin = min(y); hy = max(y)-ymin;
+
+% size of each box
+heff = 2*tol;
+
+% number of boxes
+nx = max(1,ceil(hx/heff)); ny = max(1,ceil(hy/heff));
+NB = nx*ny;
+
+% a possibly slightly larger bounding box
+hx = nx*heff; hy = ny*heff;
+
+% change to local coordinates in [0,1]
+x = (x-xmin)/hx; y = (y-ymin)/hy;
+
+% index into primary mesh
+i1 = 1+min(floor(x*nx),nx-1);
+j1 = min(floor(y*ny),ny-1);
+
+% index into shifted mesh
+i2 = 1+min(floor(x*nx+0.5),nx);
+j2 = min(floor(y*ny+0.5),ny);
+
+% assemble the result
+ix = repmat(1:N,1,4);
+jx = [i1 i1+NB i2+2*NB+nx i2+3*NB+nx+ny]+ ...
+     [j1*nx j2*nx j1*(nx+1) j2*(nx+1)];
+S = sparse(ix,jx,1);
+S = S*S';
+
+% optionally improve the result
+if nargin < 3 || exact
+  [ix,jx] = find(S);
+  mask = find(abs(z(ix)-z(jx)) <= tol);
+  S = sparse(ix(mask),jx(mask),1,N,N);
+end
+
+% this code resembles the multipole mesh more closely but is
+% slightly less effective:
+
+% % size of each box
+% heff = tol/2;
+% 
+% % number of boxes
+% nx = max(1,ceil(hx/heff)); ny = max(1,ceil(hy/heff));
+% NB = nx*ny;
+% 
+% % a possibly slightly larger bounding box
+% hx = nx*heff; hy = ny*heff;
+% 
+% % change to local coordinates in [0,1]
+% x = (x-xmin)/hx; y = (y-ymin)/hy;
+% 
+% % index into an imagined nx-by-ny matrix
+% ii = 1+min(floor(x*nx),nx-1);
+% jj = min(floor(y*ny),ny-1);
+% ii = [max(ii-1,1); ii; min(ii+1,nx)];
+% jj = [max(jj-1,0) jj min(jj+1,ny-1)];
+% ii = repmat(ii,1,3);
+% jj = repmat(jj,3,1);
+% 
+% % assemble the result
+% ix = repmat(1:N,1,9);
+% jx = ii(:)+jj(:)*nx;
+% S = sparse(ix,jx,1,N,NB); % N-by-NB (coordinate-to-box)
+% S = S*S'; % N-by-N (coordinate-to-coordinate in the same box)
diff --git a/stenglib/Utils/consistency.m b/stenglib/Utils/consistency.m
new file mode 100644
index 00000000..b89393a6
--- /dev/null
+++ b/stenglib/Utils/consistency.m
@@ -0,0 +1,97 @@
+function [p,c] = consistency(rho,sigma,N)
+%CONSISTENCY Local truncation error of linear multistep method.
+%   [P,C] = CONSISTENCY(RHO,SIGMA) determines the local truncation
+%   error for the linear multistep method defined by the polynomials
+%   RHO and SIGMA. For the ODE Y'= F(t,Y) the method is given by
+%   RHO(E)*Y(n) = h*SIGMA(E)*F(t(n),Y(n)), where E is the forward
+%   shift operator.
+%
+%   The local error is returned as an integer P (the order) and a
+%   length-2 vector C (error constants). The local error is then to
+%   leading order given by C(1)*h^(P+1)*D^(P+1)*Y(n), where D =
+%   d/dt. The error constant is given by C(2) = C(1)/sum(sigma).
+%
+%   P = CONSISTENCY(RHO,SIGMA) returns a residual polynomial P such
+%   that the local error is given by POLYVAL(P,DELTA)*Y(n), where the
+%   discrete difference DELTA = E-1 = exp(hD)-1 = hD+(hD)^2/2+...
+%
+%   P = CONSISTENCY(RHO,SIGMA,N) returns the first N terms, where N =
+%   max(numel(RHO),numel(SIGMA))+2 is the default.
+%
+%   Examples:
+%     % explicit Euler
+%     [p,c] = consistency([1 -1],[0 1])
+%
+%     % BDF2
+%     p1 = consistency([3/2 -2 1/2],[1 0 0])
+%
+%     % "explicit" BDF2
+%     p2 = consistency([1 -4/3 1/3],2/3*[0 2 -1])
+%
+%     % 2nd order Adams-Bashforth
+%     p3 = consistency([1 -1 0],[0 3/2 -1/2])
+%
+%     % (note that the actual error constants differ more than
+%     % perhaps expected judging from p2 and p3:)
+%     [p,c2] = consistency([1 -4/3 1/3],2/3*[0 2 -1])
+%     [p,c3] = consistency([1 -1 0],[0 3/2 -1/2])
+%
+%     % trapezoidal rule
+%     p4 = consistency([1 -1],1/2*[1 1],10)'
+%
+%   See also STABILITY, POLYVAL.
+
+% S. Engblom 2010-01-31
+
+% input
+rlen = numel(rho);
+slen = numel(sigma);
+len = max(rlen,slen);
+if nargin < 3, N = len+2; end
+
+% code becomes much slicker if the constant term is first:
+rho = [reshape(rho(end:-1:1),1,[]) zeros(1,len-rlen) ];
+sigma = [reshape(sigma(end:-1:1),1,[]) zeros(1,len-slen)];
+
+% series for log(1+x)
+logpoly = [0 1./(1:N-1)]; % enough for terms 1..N
+logpoly(1:2:end) = -logpoly(1:2:end);
+% (understood with x = DELTA = exp(hD)-1, logpoly = hD)
+
+% re-expand rho and sigma around x = 1
+sigma1 = zeros(1,len);
+rho1 = zeros(1,len);
+fac = 1;
+dfac = 1:len;
+for d = 1:len
+  % derivatives at x = 1
+  sigma1(d) = sum(sigma)/fac;
+  sigma = sigma(2:end).*dfac(1:end-d);
+  rho1(d) = sum(rho)/fac;
+  rho = rho(2:end).*dfac(1:end-d);
+  fac = d*fac;
+end
+
+% residual polynomial
+p = [rho1 zeros(1,N-1)]-conv(logpoly,sigma1);
+if nargout > 1
+  ip = find(abs(p) > eps(1000*(norm(rho1,1)+norm(sigma1,1))),1,'first');
+  c = [p(ip) p(ip)/sigma1(1)];
+  p = ip-2;
+else
+  p = p(1:N);
+
+  % Stirling transformation (in case one prefers a residual polynomial
+  % in hD rather than in DELTA = exph(hD)-1)
+  %v = [1 zeros(1,N-2)];
+  %q = [p(1) zeros(1,N-1)];
+  %p = p(2:N);
+  %for i = 2:N
+  %  q(i) = v*p';
+  %  v = (1:N-1)/i.*(v+[0 v(1:end-1)]);
+  %end
+  %p = q;
+
+  % Matlab convention:
+  p = p(end:-1:1);
+end
diff --git a/stenglib/Utils/matmerge.m b/stenglib/Utils/matmerge.m
new file mode 100644
index 00000000..aac2057b
--- /dev/null
+++ b/stenglib/Utils/matmerge.m
@@ -0,0 +1,57 @@
+function matmerge(dest,file1,file2)
+%MATMERGE Merge .mat-files.
+%   MATMERGE(DEST,FILE1,FILE2) Loads the two files FILE1 and FILE2 and
+%   examines the variables. The intersection of variable names is
+%   determined and the values of all those variables are concatenated
+%   along the first non-matching dimension (or last dimension). The
+%   resulting variables are then stored in the created file DEST. It
+%   is considered an error if DEST exists.
+%
+%   MATMERGE(DEST,FILE1) allows for an existing file DEST. Hence this
+%   syntax can be thought of as DEST += FILE1.
+%
+%   MATMERGE is a simple design to aid with distributed computations. No
+%   attempt in merging non-array data or making the data unique has been
+%   made.
+
+% S. Engblom 2012-04-01
+
+% new file or appending to dest
+if ~strcmp(dest,'.mat')
+  dest = [dest '.mat'];
+end
+if nargin > 2
+  if exist(dest,'file')
+    error('matmerge:e1','Will not overwrite existing file.');
+  end
+else
+  if ~exist(dest,'file')
+    error('matmerge:e2','Cannot merge with non-existing file.');
+  end
+  file2 = file1;
+  file1 = dest;
+end
+
+% load files
+s1 = load(file1);
+n1 = fieldnames(s1);
+s2 = load(file2);
+n2 = fieldnames(s2);
+
+% intersection of variable names
+n3 = fsetop('intersect',n1,n2);
+s3 = struct;
+
+% merge data
+for i = 1:size(n3,2)
+  temp1 = s1.(n3{i});
+  temp2 = s2.(n3{i});
+  effdim = max(tndims(temp1),tndims(temp2));
+  sz1 = tsize(temp1,1:effdim);
+  sz2 = tsize(temp2,1:effdim);
+  catdim = find([sz1 0] ~= [sz2 1],1,'first');
+  % this wont work if temp1 and temp2 doesn't match
+  s3.(n3{i}) = cat(catdim,temp1,temp2);
+end
+
+save(dest,'-struct','s3');
diff --git a/stenglib/Utils/movie2gif.m b/stenglib/Utils/movie2gif.m
new file mode 100644
index 00000000..43ef08a4
--- /dev/null
+++ b/stenglib/Utils/movie2gif.m
@@ -0,0 +1,77 @@
+function movie2gif(M,cdata,file,varargin)
+%MOVIE2GIF Create GIF animation from MATLAB movie.
+%   MOVIE2GIF works like the MATLAB function MOVIE2AVI except that a
+%   GIF animation is created instead of an AVI movie. This usually
+%   saves a lot of space for simpler movies.
+%
+%   MOVIE2GIF(M,CDATA,FILE,...) creates a GIF animation from the movie
+%   M using colors in the cell-vector CDATA and writes the result to
+%   FILE. The extension '.gif' will be added to FILE if it doesn't
+%   already have an extension.
+%
+%   Usually, CDATA kan be obtained from M itself. For instance, CDATA
+%   = {M(1).cdata} might work in many situations, meaning that only
+%   the colors of the first frame are used.
+%
+%   Property     Value               Description
+%   -----------------------------------------------------------------------
+%   comment      String or cell-     Comment(s) to add to the image.
+%                array of strings
+%
+%   delaytime    Scalar in [0,655]   Specifies the delay in seconds
+%                                    before displaying the next image.
+% 
+%   loopcount    Integer in          Specifies the number of times to repeat
+%                [0..65535] | Inf    the animation. If loopcount = 0,
+%                                    the animation will be played
+%                                    once, if loopcount = 1, the
+%                                    animation will be played twice,
+%                                    and so on.
+%
+%   For more available properties, see the GIF-section in the help-text for 
+%   IMWRITE.
+%
+%   Example:
+%     tspan = linspace(0,2*pi,30); tspan(end) = [];
+%     x = linspace(-pi,pi);
+%     figure, j = 0;
+%     M = struct('cdata',{},'colormap',{});
+%     for t = tspan
+%       j = j+1;
+%       y1 = sin(x+t); y2 = cos(x-t);
+%       plot(x,y1,'b',x,y2,'r');
+%       M(j) = getframe;
+%     end
+%
+%     % use cdata from first two frames only
+%     movie2gif(M,{M(1:2).cdata},'test.gif','delaytime',0.05,'loopcount',inf);
+%
+%   See also MOVIE2AVI, AVIREAD, IMWRITE, PRIVATE/WRITEGIF.
+
+% S. Engblom 2010-09-01 (Revision, explicitly adding extension .gif)
+% S. Engblom 2010-01-14 (Revision, argument cdata)
+% S. Engblom 2007-05-08
+
+% straightforward once you know how to do it...
+sz = size(M(1).cdata);
+gif = zeros([sz(1:2) 1 size(M,2)],'uint8');
+
+if ~all(cellfun('isempty',{M.colormap}))
+  warning('Colormap of movie ignored.');
+end
+
+% find single color map from cdata-cells provided
+[foo,map] = rgb2ind(cat(1,cdata{:}),256);
+if size(map,1) < 2
+  warning(['Obtained a colormap with less than two colors. ' ...
+           'Try to provide more colors.']);
+end
+
+% compute GIF-frames
+for j = 1:size(M,2)
+  gif(:,:,:,j) = rgb2ind(M(j).cdata,map);
+end
+
+% final write
+if all(file ~= '.'), file = [file '.gif']; end
+imwrite(gif,map,file,'gif',varargin{:});
diff --git a/stenglib/Utils/ndop.m b/stenglib/Utils/ndop.m
new file mode 100644
index 00000000..d7a3a9e7
--- /dev/null
+++ b/stenglib/Utils/ndop.m
@@ -0,0 +1,198 @@
+function varargout = ndop(M,ix,sz,varargin)
+%NDOP General N-dimensional operator.
+%   S = NDOP(M,IX,SZ) constructs a sparse matrix S representing the
+%   N-dimensional computational molecule M in a way such that the result of
+%   M acting on a vector X is given by the sparse matrix-vector product S*X.
+%
+%   M is an N-dimensional full matrix with M(IX(1),...,IX(N)) being regarded
+%   as the target point of the molecule.
+%
+%   SZ is a vector of length N specifying the size of the N-dimensional
+%   rectangular computational domain.
+%
+%   S = NDOP(M,IX,SZ,L,R) is a syntax for variable coefficients and
+%   additionally performs left and right scaling by vectors L and R. The
+%   result is the same as if S had been multiplied from the left and right
+%   by diagonal matrices with L and R as diagonals. Empty arguments should
+%   be used when one-sided scaling is requested.
+%
+%   S = NDOP(M,IX,SZ,IP) is a syntax for boundary conditions and uses the
+%   cell-vector IP of length N to specify the points where the molecule
+%   should be applied. If IP{j} is empty, all points in that dimension are
+%   mapped -- similar to MATLABs colon-notation.
+%
+%   S = NDOP(M,IX,SZ,[]) is a useful shorthand for mapping all points using
+%   periodicity at all boundaries, see the remark on boundaries below.
+%
+%   S = NDOP(M,IX,SZ,IP,L,R) additionally performs scaling as before. Again,
+%   empty arguments should be used for one-sided scaling.
+%
+%   [II,JJ,SS] = NDOP(M,IX,SZ,IP,L,R,I,J,S) is a special fast syntax for
+%   boundary conditions. It does not assemble the resulting sparse matrix
+%   but rather keeps it in the triplet-format for later assembly via a call
+%   to FSPARSE. The result is concatenated with the obligatory inputs I, J
+%   and S.
+%
+%   Boundaries: in the first case, boundary points in X are mapped to zero;
+%   boundary points in X are all those points such that applying M would
+%   cause an 'index out of bounds'-error. In the 'boundary syntax' however,
+%   periodicity is used in order to be able to handle periodic boundary
+%   conditions.
+%
+%   On return, S is a sparse square matrix of size PROD(SZ). The enumeration
+%   of the solution vector X follows that of M, that is, the weights for
+%   X(j-1) and X(j+1) when computing X(j) is given by
+%   M(IX(1)-1,IX(2),...,IX(N)) and M(IX(1)+1,IX(2),...,IX(N))
+%   respectively. Hence, "matrix-coordinates" rather than Cartesian
+%   coordinates are used.
+%
+%   Example:
+%     % classical 2nd order accurate Laplace-operator
+%     % on the (N-by-N) unit square
+%     N = 50; h = 1/(N-1);
+%     M = [0 1 0; 1 -4 1; 0 1 0]/h^2;
+%     S = ndop(M,[2 2],[N N]);
+%
+%     % Dirichlet BCs at all boundaries
+%     BC = ndop(1,[1 1],[N N],{[1 N] []})+...   % lower/upper
+%          ndop(1,[1 1],[N N],{[2:N-1] [1 N]}); % left/right
+%
+%     % solve -Laplace(u) = 1 with u = 0 on the boundary
+%     f = ones(N); f([1 N],:) = 0; f(:,[1 N]) = 0;
+%     u = (-S+BC)\f(:);
+%     figure, surf(reshape(u,N,N));
+%
+%   See also FSPARSE, SPBLOCK.
+
+% S. Engblom 2007-01-29 (Major revision)
+% S. Engblom 2004-10-29 (Revision, changed to frepmat)
+% S. Engblom 2003-03-26
+
+% determine dimensionality
+ix = ix(:); sz = sz(:);
+ndim = size(ix,1);
+if size(ix,1) ~= size(sz,1)
+  error('ndop:e1','Ambiguous dimension of input.');
+elseif tndims(M) > ndim
+  error('ndop:e2','Molecule has incorrect dimension.');
+end
+
+% scaling arguments
+if nargin == 5
+  L = varargin{1}(:);
+  R = varargin{2}(:);
+elseif nargin == 6 || nargin == 9
+  L = varargin{2}(:);
+  R = varargin{3}(:);
+else
+  L = [];
+  R = [];
+end
+
+% find on M
+[imm,foo,ss] = find(M(:));
+imm = imm(:); % fix for bug in find when M is zero scalar
+ss = ss(:).';
+
+% compute indices
+imr = zeros(size(imm,1),ndim);
+str = [1 cumprod(tsize(M,1:ndim))];
+imm = imm-1;
+for j = ndim:-1:1,
+  imr(:,j) = floor(imm/str(j))+1-ix(j); % use ix as origin
+  imm = rem(imm,str(j));
+end
+
+% stride for the (imagined) N-dimensional solution block
+str = [1; cumprod(sz)];
+
+% check of scaling
+if ~isempty(L) && numel(L) ~= str(end)
+  error('ndop:e3','Wrong size of scaling argument.');
+end
+if ~isempty(R) && numel(R) ~= str(end)
+  error('ndop:e3','Wrong size of scaling argument.');
+end
+
+% normal syntax: apply molecule only where possible
+if nargout <= 1 && (nargin == 3 || nargin == 5)
+  % compute relative indices to DOFs coupled to each DOF (there will typically
+  % be a zero somewhere in relind representing the target point of the
+  % molecule)
+  relind = imr*str(1:end-1);
+
+  % build rowindices by finding all interior points: the location of the
+  % target point together with the index of the extreme points of the
+  % molecule determine which DOFs are inner DOFs
+  first = max([-min(imr,[],1); zeros(1,ndim)],[],1)';
+  last = sz-1+min([-max(imr,[],1); zeros(1,ndim)],[],1)';
+  first = first.*str(1:end-1);
+  last = last.*str(1:end-1);
+
+  ii = 1;
+  for j = 1:ndim
+    % the rank of ii increases by 1
+    ii = tsum(ii(:),[first(j):str(j):last(j)]',1,2);
+  end
+
+  % final assembly
+  ii = ii(:);
+  jj = tsum(ii,relind,1,2);
+% 'boundary' or 'periodic' syntax
+elseif nargout <= 1 && (nargin == 4 || nargin == 6) || ...
+      (nargout == 3 && nargin == 9)
+  ip = varargin{1}(:);
+  if isempty(ip)
+    % support of all-periodic case
+    ip = cell(ndim,1);
+    ipsz = sz;
+  elseif size(ip,1) ~= ndim
+    error('ndop:e1','Ambiguous dimension of input.');
+  else
+    ipsz = cellfun('prodofsize',ip)';
+    ipsz(ipsz == 0) = sz(ipsz == 0); % 'colon'-notation
+  end
+
+  % build jj by indexing according to input ip, assembling all indices per row
+  % at the same time (this is the last dimension in jj)
+  jj = ones(size(imm,1),1);
+  ii = 1;
+  for j = 1:ndim
+    if isempty(ip{j}), ip{j} = 1:sz(j); end
+    jj = tsum(jj,mod(tsum(ip{j}(:)-1,imr(:,j),1,2),sz(j))*str(j),...
+              [1:j-1 j+1],[j j+1]);
+    ii = tsum(ii(:),(ip{j}(:)-1)*str(j),1,2);
+  end
+
+  % final assembly
+  ii = ii(:);
+  jj = reshape(jj,[],size(imr,1));
+else
+  error('ndop:e4','Unknown syntax.');
+end
+
+% scaling (if any)
+if ~isempty(L)
+  ss = L(ii)*ss;
+  if ~isempty(R)
+    ss = ss.*reshape(R(jj),size(ss));
+  end
+elseif ~isempty(R)
+  ss = tprod(ss,reshape(R(jj),size(jj)),[3 2],[1 2]);
+end
+
+% output
+if nargout <= 1
+  varargout{1} = fsparse(ii,jj,ss,[str(end) str(end)]);
+else
+  % cannot use the 'row' format of fsparse for this output
+  if size(ii,2) ~= size(jj,2)
+    ii = frepmat(ii,[1 size(jj,2)]);
+  end
+  if size(ss,1) ~= size(ii,1)
+    ss = frepmat(ss,size(ii,1));
+  end
+  varargout(1:3) = {[varargin{4}(:); ii(:)] ...
+                    [varargin{5}(:); jj(:)] ...
+                    [varargin{6}(:); ss(:)]};
+end
diff --git a/stenglib/Utils/parseopts.m b/stenglib/Utils/parseopts.m
new file mode 100644
index 00000000..1dc3aca1
--- /dev/null
+++ b/stenglib/Utils/parseopts.m
@@ -0,0 +1,84 @@
+function  [opts,got] = parseopts(optdef,optin)
+%PARSEOPTS Parse options.
+%   [OPTS,GOT] = PARSEOPTS(OPTDEF,OPTIN) merges the options OPTIN with
+%   the default options OPTDEF and produces the options structure
+%   OPTS. OPTDEF and OPTIN can either be structures or cell-vectors
+%   with property/value pairs.
+%
+%   Fields in OPTDEF that are structures are recursively parsed, see
+%   the example below.
+%
+%   The optional output GOT is a structure containing the same fields
+%   as OPTS. The values of each field are boolean variables indicating
+%   whether the corresponging field was received in OPTIN or not.
+%
+%   It is considered an error for OPTIN to contain options not in
+%   OPTDEF.
+%
+%   Example:
+%     f1 = struct('delta',0.5,'rho',0.8);
+%     f2 = struct('delta',0.2,'rho',0.1);
+%     od = struct('nlin','yes','theta',0.1,'relax','on', ...
+%                 'filter1',f1,'filter2',f2);
+%     oi = {'theta' 0.25 'relax' 'off' 'filter1' {'rho' 0.2}};
+%     [opts,got] = parseopts(od,oi)
+%
+%   See also ASSIGNOPTS.
+
+% S. Engblom 2005-07-21
+
+% extract fields and values
+if isstruct(optdef)
+  if any(size(optdef) ~= 1)
+    error('Expecting a scalar structure.');
+  end
+  dfield = fieldnames(optdef);
+  dval = struct2cell(optdef);
+elseif iscell(optdef)
+  dfield = reshape(optdef(1:2:end),[],1);
+  dval = reshape(optdef(2:2:end),[],1);
+  if size(dfield,1) ~= size(dval,1)
+    error('Cell-vector must contain property/value pairs.');
+  end
+else
+  error('Default options must either be a struct or a cell-vector.');
+end
+
+if isstruct(optin)
+  if any(size(optin) ~= 1)
+    error('Expecting a scalar structure.');
+  end
+  ifield = fieldnames(optin);
+  ival = struct2cell(optin);
+elseif iscell(optin)
+  ifield = reshape(optin(1:2:end),[],1);
+  ival = reshape(optin(2:2:end),[],1);
+  if size(ifield,1) ~= size(ival,1)
+    error('Cell-vector must contain property/value pairs.');
+  end
+else
+  error('Input options must either be a struct or a cell-vector.');
+end
+
+% check for illegal fields
+d = fsetop('setdiff',ifield,dfield);
+if ~isempty(d)
+  error(sprintf('Unsupported option "%s".',d{1}));
+end
+
+% merge and create structure of options
+[sfield,ii,id] = fsetop('union',ifield,dfield);
+opts = cell2struct([ival(ii); dval(id)],sfield(:));
+got = cell2struct([frepmat({true},size(ii,2)); ...
+                   frepmat({false},size(id,2))],sfield(:));
+
+% recursive call for fields containing structures
+is = find(cellfun('isclass',dval,'struct'))';
+for i = is
+  f = dfield{i};
+  if got.(f)
+    [opts.(f),got.(f)] = parseopts(dval{i},opts.(f));
+  else
+    [opts.(f),got.(f)] = parseopts(dval{i},{});
+  end
+end
diff --git a/stenglib/Utils/perfecthash.m b/stenglib/Utils/perfecthash.m
new file mode 100644
index 00000000..c1fcda97
--- /dev/null
+++ b/stenglib/Utils/perfecthash.m
@@ -0,0 +1,228 @@
+function hash = perfecthash(s)
+%PERFECTHASH Perfect hash-function from strings.
+%   HASH = PERFECTHASH(S) with S a cell-vector of strings tries to 
+%   determine a perfect hash-function. A few simple models for such 
+%   functions are tried and any function determined in this way is 
+%   returned as a string HASH in C pseudo-code.
+%
+%   Note: PERFECTHASH only indexes characters with index <= maximum length 
+%   of any of the strings. You must explicitly increase the length of the 
+%   strings to index other characters. See the second example below.
+%
+%   Example:
+%      s = {'check','intersect','setdiff','setxor', ...
+%           'union','unique','ismember'};
+%      perfecthash(s)
+%
+%      % case when increasing the length of the strings might help
+%      s = {'if' 'else' 'elseif' 'while' 'for' 'switch' 'case' ...
+%           'end' 'return' 'break' 'continue' 'function'};
+%      perfecthash(s) % void
+%
+%      % add strlen(s{i}) and ending '\0'
+%      sz = cellfun('prodofsize',s);
+%      for i = 1:numel(s), s{i} = [char(sz(i)) s{i} char(0)]; end
+%      perfecthash(s)
+%      % (here str[0] is strlen(str))
+
+% S. Engblom 2016-10-07 (Improved handling of non-printable characters)
+% S. Engblom 2011-06-27 (enum+char*[] output, bug for 3-ary hashes fixed)
+% S. Engblom 2008-04-25 (Minor revision)
+% S. Engblom 2007-05-25
+
+% to be built
+hash = '';
+
+% construct character array c with strings as rows
+c = char(s);
+slen = min(cellfun('prodofsize',s));
+nums = size(c,1);
+% consistently using doubles (not optimal -- but convenient)
+c = double(c(:,1:slen));
+
+% enums are formatted in printable uppercases, strings are case-sensitives
+S = s;
+for i = 1:nums
+  prints1 = find(33 <= s{i} & s{i} <= 126);
+  prints2 = ('a' <= s{i} & s{i} <= 'z') | ...
+            ('A' <= s{i} & s{i} <= 'Z') | ...
+            ('0' <= s{i} & s{i} <= '9');
+  s{i} = s{i}(prints1); % be more generous here
+  S{i} = upper(S{i});
+  S{i}(~prints2) = '_';
+end
+
+% try these sizes for the hash-tables
+siz = nums:2*nums;
+
+% factors to try
+muls = 1:20;
+
+% 1st order hash-functions
+for sz = siz
+  cc = mod(c,sz);
+  ii = all(fsparse(cc+1,1:slen,1) <= 1,1);
+  if any(ii)
+    for i = find(ii)
+      % output function
+      str = ['typedef enum {' frepmat('%s = %d,',[1 size(c,1)])];
+      str = [str(1:end-1) '} PROP;\n'];
+      [foo,is] = sort(cc(:,i));
+      t = [S(is); mat2cell(cc(is,i)',1,ones(1,size(c,1)))];
+      hash = [hash sprintf(str,t{:})];
+
+      t = frepmat({'NULL%s,'},[1 sz]);
+      t(cc(:,i)+1) = {'&get%s,'};
+      t = cell2mat(t);
+      str = ['static const getfun_t getval[] = {' t];
+      str = [str(1:end-1) '};\n'];
+      t = frepmat({''},[1 sz]);
+      t(cc(:,i)+1) = S;
+      hash = [hash sprintf(str,t{:})];
+
+      t = frepmat({'NULL%s,'},[1 sz]);
+      t(cc(:,i)+1) = {'&%s,'};
+      t = cell2mat(t);
+      str = ['void *val[] = {' t];
+      str = [str(1:end-1) '};\n'];
+      t = frepmat({''},[1 sz]);
+      t(cc(:,i)+1) = S;
+      hash = [hash sprintf(str,t{:})];
+
+      str = ['static const char *tab[] = {' frepmat('"%s",',[1 sz])];
+      str = [str(1:end-1) '};\n'];
+      t = frepmat({''},[1 sz]);
+      t(cc(:,i)+1) = s;
+      hash = [hash sprintf(str,t{:})];
+      hash = [hash sprintf('hash(str) = str[%d] %% %d;\n\n',i-1,sz)];
+    end
+    % no point in continuing here
+    siz = nums:sz-1;
+    break;
+  end
+end
+
+% 2nd order functions
+
+% *** use of symmetricity could reduce time by a factor of 2
+found = false;
+for sz = siz
+  for mul1 = muls
+    for mul2 = muls
+      cc = reshape(tsum(mul1*c,mul2*c,[1 2],[1 3]),nums,[]);
+      cc = mod(cc,sz);
+      ii = all(fsparse(cc+1,1:slen^2,1) <= 1,1);
+      if any(ii)
+	[i1,i2] = ind2sub([slen slen],find(ii));
+	ix = i1 <= i2;
+	if any(ix)
+	  i1 = i1(ix); i2 = i2(ix);
+          for i = 1:numel(i1)
+            j = (i2(i)-1)*slen+i1(i);
+            str = ['typedef enum {' frepmat('%s = %d,',[1 size(c,1)])];
+            str = [str(1:end-1) '} PROP;\n'];
+            [foo,is] = sort(cc(:,j));
+            t = [S(is); ...
+                 mat2cell(cc(is,j)',1,ones(1,size(c,1)))];
+            hash = [hash sprintf(str,t{:})];
+            
+            t = frepmat({'NULL%s,'},[1 sz]);
+            t(cc(:,j)+1) = {'&get%s,'};
+            t = cell2mat(t);
+            str = ['static const getfun_t getval[] = {' t];
+            str = [str(1:end-1) '};\n'];
+            t = frepmat({''},[1 sz]);
+            t(cc(:,j)+1) = S;
+            hash = [hash sprintf(str,t{:})];
+
+            t = frepmat({'NULL%s,'},[1 sz]);
+            t(cc(:,j)+1) = {'&%s,'};
+            t = cell2mat(t);
+            str = ['void *val[] = {' t];
+            str = [str(1:end-1) '};\n'];
+            t = frepmat({''},[1 sz]);
+            t(cc(:,j)+1) = S;
+            hash = [hash sprintf(str,t{:})];
+
+            str = ['static const char *tab[] = {' frepmat('"%s",',[1 sz])];
+            str = [str(1:end-1) '};\n'];
+            t = frepmat({''},[1 sz]);
+            t(cc(:,j)+1) = s;
+            hash = [hash sprintf(str,t{:})];
+            hash = [hash ...
+                     sprintf(['hash(str) = ' ...
+		       '((str[%d] * %d)+(str[%d] * %d)) %% %d;\n\n'], ...
+	               i1(i)-1,mul1,i2(i)-1,mul2,sz)];
+          end
+          siz = nums:sz-1;
+          found = true;
+          break;
+        end
+      end
+    end
+    if found, break; end
+  end
+  if found, break; end
+end
+
+% 3rd order functions
+
+% *** use of symmetricity could reduce time by a factor of 6
+for sz = siz
+  for mul1 = muls
+    for mul2 = muls
+      for mul3 = muls
+        cc = reshape(tsum(mul1*c,mul2*c,[1 2],[1 3]),nums,[]);
+        cc = reshape(tsum(cc,mul3*c,[1 2],[1 3]),nums,[]);
+        cc = mod(cc,sz);
+        ii = all(fsparse(cc+1,1:slen^3,1) <= 1,1);
+        if any(ii)
+	  [i1,i2,i3] = ind2sub([slen slen slen],find(ii));
+          ix = i1 <= i2 & i2 <= i3;
+          if any(ix)
+            i1 = i1(ix); i2 = i2(ix); i3 = i3(ix);
+            for i = 1:numel(i1)
+              j = ((i3(i)-1)*slen+(i2(i)-1))*slen+i1(i);
+              str = ['typedef enum {' frepmat('%s = %d,',[1 size(c,1)])];
+              str = [str(1:end-1) '} PROP;\n'];
+              [foo,is] = sort(cc(:,j));
+              t = [S(is); mat2cell(cc(is,j)',1,ones(1,size(c,1)))];
+              hash = [hash sprintf(str,t{:})];
+
+              t = frepmat({'NULL%s,'},[1 sz]);
+              t(cc(:,j)+1) = {'&get%s,'};
+              t = cell2mat(t);
+              str = ['static const getfun_t getval[] = {' t];
+              str = [str(1:end-1) '};\n'];
+              t = frepmat({''},[1 sz]);
+              t(cc(:,j)+1) = S;
+              hash = [hash sprintf(str,t{:})];
+
+              t = frepmat({'NULL%s,'},[1 sz]);
+              t(cc(:,j)+1) = {'&%s,'};
+              t = cell2mat(t);
+              str = ['void *val[] = {' t];
+              str = [str(1:end-1) '};\n'];
+              t = frepmat({''},[1 sz]);
+              t(cc(:,j)+1) = S;
+              hash = [hash sprintf(str,t{:})];
+
+              str = ['static const char *tab[] = {' frepmat('"%s",',[1 sz])];
+              str = [str(1:end-1) '};\n'];
+              t = frepmat({''},[1 sz]);
+              t(cc(:,j)+1) = s;
+              hash = [hash sprintf(str,t{:})];
+
+              hash = [hash ...
+                       sprintf(['hash(str) = ' ...
+		         '((str[%d] * %d)+(str[%d] * %d)+(str[%d] * %d)) ' ...
+                         '%% %d;\n'], ...
+		         i1(i)-1,mul1,i2(i)-1,mul2,i3(i)-1,mul3,sz)];
+            end
+            return;
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/stenglib/Utils/report.m b/stenglib/Utils/report.m
new file mode 100644
index 00000000..4c1c8ace
--- /dev/null
+++ b/stenglib/Utils/report.m
@@ -0,0 +1,98 @@
+function status = report(t,title,s,varargin)
+%REPORT Report progress of solver.
+%   The function REPORT is designed to report progress of any type of solver
+%   and for Matlab's ODE-solvers in particular.
+%
+%   Only one reporter may be active at any given time. Use WAITBAR
+%   directly to achieve multiple reporters.
+%
+%   STATUS = REPORT([T0 Tend],TITLE,'init',...) sets up the progress
+%   reporter. T0 is the initial time/initial accuracy and Tend the final
+%   time/requested accuracy. The character array TITLE is the title of the 
+%   reporter; if TITLE is a non-character array or empty, the default used 
+%   is 'Solution progress'. This somewhat singular behavior is due to 
+%   considerations of compatibility with Matlab's ODE-solvers.
+%
+%   STATUS = REPORT([T0 Tend],'timeleft','init',...) continuously
+%   measures time and reports an estimate of the time until
+%   completion.
+%
+%   STATUS = REPORT([],[],'none') causes the reporter to be inactive until
+%   reinitialized again. This is useful for avoiding if-clauses before
+%   calling REPORT.
+%
+%   STATUS = REPORT(T,[],'',...) reports the progress T, where T0 <= T <=
+%   Tend.
+%
+%   STATUS = REPORT([],[],'',...) relies on the 'timeleft'-syntax and
+%   updates the estimated time without computing a new estimate. This
+%   is useful when the number of tasks are limited, but each task
+%   takes a considerable amount of time.
+%
+%   STATUS = REPORT([],[],'done',...) finishes the reporter.
+%
+%   STATUS = 0 is returned in all syntaxes.
+%
+%   See also WAITBAR.
+
+% S. Engblom 2009-03-06 (Minor revision)
+% S. Engblom 2007-04-05
+
+persistent T0 Tend percent hwait t0 est lastupdate;
+
+if isempty(s) && ~isempty(hwait)
+  if isempty(t)
+    if ~isempty(est) && ~isempty(lastupdate)
+      % blind update (relying on est and lastupdate being properly initiated)
+      est = est-toc(lastupdate);
+      hrs = floor(est/3600);
+      mins = floor(est/60)-60*hrs;
+      waitbar(percent/100,hwait, ...
+              sprintf('Estimated time left: %d hour(s), %d minute(s).', ...
+                      hrs,mins));
+      lastupdate = tic;
+    end
+    % call ignored otherwise
+  else
+    % main use
+    now = round((t(end)-T0)/(Tend-T0)*100);
+    if now > percent
+      percent = now;
+      if isempty(t0) || percent < eps
+        waitbar(percent/100,hwait);
+      else
+        t1 = toc(t0);
+        est = (100-percent)/percent*t1;
+        hrs = floor(est/3600);
+        mins = floor(est/60)-60*hrs;
+        waitbar(percent/100,hwait, ...
+                sprintf('Estimated time left: %d hour(s), %d minute(s).', ...
+                        hrs,mins));
+      end
+    end
+    lastupdate = tic;
+  end
+elseif strcmp(s,'init')
+  T0 = t(1);
+  Tend = t(end);
+  t0 = [];
+  est = [];
+  lastupdate = [];
+  percent = 0;
+  try, close(hwait); catch, end
+  if isempty(title) || ~isa(title,'char')
+    title = 'Solution progress';
+  elseif strcmp(title,'timeleft')
+    title = 'Estimated time left: ';
+    t0 = tic;
+  end
+  hwait = waitbar(0.0,title);
+else % 'none' and 'done' empties hwait
+  try, close(hwait); catch, end
+  hwait = [];
+  t0 = [];
+  est = [];
+  lastupdate = [];
+end
+
+status = 0;
diff --git a/stenglib/Utils/rmtilde.m b/stenglib/Utils/rmtilde.m
new file mode 100644
index 00000000..8e64d09b
--- /dev/null
+++ b/stenglib/Utils/rmtilde.m
@@ -0,0 +1,51 @@
+function nrem = rmtilde(wd)
+%RMTILDE Remove files ending with a tilde ('~').
+%   NREM = RMTILDE Recursively removes all files with names ending
+%   with a tilde, starting from the current directory. The total
+%   number of removed files is returned.
+%
+%   NREM = RMTILDE(WD) does the same thing, but starts from the
+%   directory WD instead.
+%
+%   In any of the syntaxes, filenames or directories starting with a
+%   period are left unaffected.
+%
+%   Cautionary: the function is recursive and may run slowly in
+%   certain situations.
+%
+%   See also DELETE, RECYCLE, RMDIR.
+
+% S. Engblom 2005-06-16
+
+% remember callers working directory
+cwd = pwd;
+nrem = 0;
+
+% input
+if nargin == 0, wd = cwd; end
+
+% change working directory
+cd(wd);
+
+% fetch all files/directories
+d = dir;
+
+% loop over all of them
+for i = 1:size(d,1)
+  % avoid all names starting with a period
+  if d(i).name(1) ~= '.'
+    if d(i).isdir
+      % directory: recursive call
+      fprintf(1,'Searching directory %s...\n',d(i).name);
+      nrem = nrem+rmtilde(d(i).name);
+    elseif d(i).name(end) == '~'
+      % file ending with '~': remove it
+      delete(d(i).name);
+      nrem = nrem+1;
+      fprintf(1,'Removed file: %s.\n',d(i).name);
+    end
+  end
+end
+
+% switch back to callers directory
+cd(cwd);
diff --git a/stenglib/Utils/runtest.m b/stenglib/Utils/runtest.m
new file mode 100644
index 00000000..839035e9
--- /dev/null
+++ b/stenglib/Utils/runtest.m
@@ -0,0 +1,60 @@
+function allok = runtest(idstr,funs,funnames,varargin)
+%RUNTEST General test facility.
+%   OK = RUNTEST(IDSTR,FUNS,FUNNAMES,...) calls the functions in the
+%   cell-vector FUNS, one by one, and displays appropriate
+%   messages. FUNNAMES is a cell-vector with corresponding names of
+%   the functions and IDSTR is an identifying string which appears
+%   first in all messages produces by RUNTEST (use empty to supress
+%   all such messages). Additional arguments, if any, are passed to
+%   the functions.
+%
+%   Each function in FUNS should be written so that it returns 1 for
+%   'passed' and zero for 'failed'.
+%
+%   Example:
+%     runtest('Medium tests (myfun)', ...
+%       {@test1 @test2},{'Test #1' 'Test #2'});
+
+% S. Engblom 2013-12-03 (empty IDSTR, output OK)
+% S. Engblom 2012-05-25 (Additional arguments)
+% S. Engblom 2004-10-29
+
+% input
+ntest = size(funs,2);
+if size(funnames,2) ~= ntest, error('Input does not match.'); end
+
+% run tests
+nf = 0;
+allok = 1;
+for i = 1:size(funs,2)
+  try
+    ok = feval(funs{i},varargin{:});
+    allok = allok && ok;
+    if ~ok
+      if ~isempty(idstr)
+        fprintf(1,'%s: %s failed.\n',idstr,funnames{i});
+      end
+      nf = nf+1;
+    else
+      if ~isempty(idstr)
+        fprintf(1,'%s: %s passed.\n',idstr,funnames{i});
+      end
+    end
+  catch
+    allok = 0;
+    if ~isempty(idstr)
+      fprintf(1,'%s: Error caught in %s.\n%s\n', ...
+              idstr,funnames{i},lasterr);
+    end
+    nf = nf+1;
+  end
+end
+
+% concluding diagnostics
+if ~isempty(idstr)
+  if nf == 0
+    fprintf(1,'%s passed: %d test(s).\n',idstr,ntest);
+   else
+     fprintf(1,'%s failed: %d test(s) passed, %d failed.\n',idstr,ntest-nf,nf);
+  end
+end
diff --git a/stenglib/Utils/spblock.m b/stenglib/Utils/spblock.m
new file mode 100644
index 00000000..d8299f2e
--- /dev/null
+++ b/stenglib/Utils/spblock.m
@@ -0,0 +1,88 @@
+function S = spblock(i,j,v,mn)
+%SPBLOCK Sparse matrix from blocks.
+%   S = SPBLOCK(I,J,V,[M N]) assembles an M-by-N sparse matrix S from
+%   the index vectors I and J and the cell vector V containing sparse
+%   matrices. The assembly is performed in blocks so that for each
+%   index k, the block V{k} is placed with the upper left corner in
+%   (I(k),J(k)) in S. Multiple indices are as usual summed together,
+%   see FSPARSE. Also, single index or blocks are automatically
+%   expanded so as to match the other inputs.
+%
+%   S = SPBLOCK(I,J,V) does the same thing except that the size of the
+%   result is determined automatically as the extreme bottom right corner of
+%   any of the translated matrices in V.
+%
+%   Example:
+%     A = sprand(3,3,0.5);
+%     S1 = spblock([1 4 7],[1 4 7],{A,-2*A,A});
+%     S2 = kron([1 0 0; 0 -2 0; 0 0 1],A);      % does the same thing
+%
+%     % a different example
+%     B = sprand(3,2,0.8);
+%     S3 = spblock([1 1 4],[1 4 4],{A,B,A});
+%     S4 = [A B sparse(3,1); sparse(3,3) A];    % ditto
+%
+%   See also FSPARSE, FREPMAT, KRON, NDOP, CAT.
+
+% S. Engblom 2012-01-24 (Revision)
+% S. Engblom 2007-01-29 (Minor revision)
+% S. Engblom 2004-05-24
+
+if nargin ~= 3 & nargin ~= 4
+  error('spblock:e2','Unknown syntax.');
+end
+
+% normalize input
+i = i(:)-1; isz = size(i,1);
+j = j(:)-1; jsz = size(j,1);
+v = v(:);   vsz = size(v,1);
+len = max([isz jsz vsz]);
+
+if isz ~= len
+  if isz ~= 1
+    error('spblock:e1', ...
+          'Index vectors must match the number of sparse blocks.');
+  end
+  i = frepmat(i,len);
+end
+if jsz ~= len
+  if jsz ~= 1
+    error('spblock:e1', ...
+          'Index vectors must match the number of sparse blocks.');
+  end
+  j = frepmat(j,len);
+end
+if vsz ~= len
+  if vsz ~= 1
+    error('spblock:e3', ...
+          'The sparse blocks must match the index vectors.');
+  end
+  v = frepmat(v,len);
+end
+
+% determine (an upper bound to) the number of nonzeros
+nz = [0; cumsum(cellfun(@nnz,v))];
+sz = [cellfun('size',v,1) cellfun('size',v,2)];
+tnz = nz(end);
+
+% allocate indices and values
+ii = zeros(tnz,1);
+jj = zeros(tnz,1);
+ss = zeros(tnz,1);
+
+% find on all blocks and build indices and values
+for k = 1:len
+  inds = nz(k)+1:nz(k+1);
+  [ii(inds),jj(inds),ss(inds)] = find(v{k});
+  ii(inds) = ii(inds)+i(k);
+  jj(inds) = jj(inds)+j(k);
+end
+
+% the size of the resulting sparse matrix
+if nargin < 4
+  sz = sz+[i j];
+  mn = max(sz);
+end
+
+% assemble the result
+S = fsparse(ii,jj,ss,mn);
diff --git a/stenglib/Utils/stability.m b/stenglib/Utils/stability.m
new file mode 100644
index 00000000..22ccc46d
--- /dev/null
+++ b/stenglib/Utils/stability.m
@@ -0,0 +1,43 @@
+function h = stability(rho,sigma,varargin)
+%STABILITY Plot of stability region for linear multistep method.
+%   STABILITY(RHO,SIGMA) plots the unit root boundary locus for the
+%   linear multistep method defined by the polynomials RHO and
+%   SIGMA. For the ODE Y'= F(t,Y) the method is given by RHO(E)*Y(n) =
+%   h*SIGMA(E)*F(t(n),Y(n)), where E is the forward shift operator.
+%
+%   STABILITY(RHO,SIGMA,N) uses N points on the unit circle, with N =
+%   200 the default.
+%
+%   H = STABILITY(RHO,SIGMA,...) returns a handle H to the plot and
+%   sends any additional arguments to PLOT.
+%
+%   Examples:
+%     % explicit/implicit Euler
+%     figure, hold on,
+%     stability([1 -1],[0 1],'r');
+%     stability([1 -1],[1 0],'r--');
+%
+%     % BDF2
+%     stability([3/2 -2 1/2],[1 0 0],'b--');
+%     axis equal, hold off,
+%     legend('Euler Fwd','Euler Bwd','BDF2');
+%
+%   See also CONSISTENCY, PLOT, POLYVAL.
+
+% S. Engblom 2010-01-30 (Minor modifications)
+% G. S�derlind 2010-01-29
+
+% straightforward
+if nargin < 3 || ~isnumeric(varargin{1})
+  N = 200;
+else
+  N = varargin{1};
+  varargin = varargin(2:end);
+end
+z = exp(1i*linspace(0,2*pi,N+1));
+y = polyval(rho,z)./polyval(sigma,z);
+if nargout > 0
+  h = plot(real(y),imag(y),varargin{:});
+else
+  plot(real(y),imag(y),varargin{:});
+end
diff --git a/stenglib/Utils/startup.m b/stenglib/Utils/startup.m
new file mode 100644
index 00000000..60322f52
--- /dev/null
+++ b/stenglib/Utils/startup.m
@@ -0,0 +1,10 @@
+%STARTUP Add paths to UTILS stuff.
+
+% S. Engblom 2010-02-10 (Revision)
+% S. Engblom 2005-03-22
+
+s = pwd;
+if exist('test','dir')
+  addpath([s '/test']);
+end
+
diff --git a/stenglib/make_all.m b/stenglib/make_all.m
new file mode 100644
index 00000000..9dcd4ca9
--- /dev/null
+++ b/stenglib/make_all.m
@@ -0,0 +1,13 @@
+%MAKE_ALL Make for all of stenglib/.
+
+% S. Engblom 2010-09-23
+
+cd Tensor
+startup
+make
+
+cd ../Fast
+startup
+make
+
+cd ..
\ No newline at end of file
diff --git a/stenglib/startup.m b/stenglib/startup.m
new file mode 100644
index 00000000..e5932207
--- /dev/null
+++ b/stenglib/startup.m
@@ -0,0 +1,10 @@
+%STARTUP Add paths to stenglib/.
+
+% S. Engblom 2005-03-22
+
+s = pwd;
+addpath([s '/Tensor']);
+addpath([s '/Fast']);
+addpath([s '/Utils']);
+addpath([s '/Scicomp']);
+addpath([s '/Misc']);
diff --git a/urdme/src/nsm/mexmake_nsm.m b/urdme/src/nsm/mexmake_nsm.m
index dd05abcb..120c6ba0 100755
--- a/urdme/src/nsm/mexmake_nsm.m
+++ b/urdme/src/nsm/mexmake_nsm.m
@@ -10,6 +10,7 @@ function mexmake_nsm(propensity_file,~)
 
 % global defines, if any
 define = [];
+definew = [];
 
 % path = location of this make
 path = mfilename('fullpath');
@@ -35,6 +36,7 @@ function mexmake_nsm(propensity_file,~)
           [path '../inline.c'] ...
           [path '../report.c']};
 define = [define '-DMALLOC\(n\)=mxMalloc\(n\) -DFREE\(p\)=mxFree\(p\)'];
+definew = [definew '-D"MALLOC(n)=mxMalloc(n)" -D"FREE(p)=mxFree(p)" -D"srand48(k)=srand(k)" -D"drand48()=rand()"'];
 
 % mex extension
 mx = mexext;
@@ -51,6 +53,11 @@ function mexmake_nsm(propensity_file,~)
   cflags = 'CFLAGS= -std=c99 ';
   mex('-silent','-largeArrayDims',[cflags define], ...
       include{:},link{:},source{:});
+elseif strcmp(mx,'mexw64')
+   cflags = ['CFLAGS=-fPIC -fno-omit-frame-pointer -std=c99 -O3 ' ...
+            '-D_GNU_SOURCE -pthread -fexceptions '];
+  mex('-v','-largeArrayDims',[cflags definew], ... % [mexflags define],
+      include{:},link{:},source{:});
 else
   error(['Platform not yet supported. Your MEX file extension is ' ...
          mx '. Please edit mexmake_nsm.m to allow for this extension.']);
diff --git a/urdme/src/nsm/nsm.c b/urdme/src/nsm/nsm.c
index c3c7e5c9..7b0fffdd 100644
--- a/urdme/src/nsm/nsm.c
+++ b/urdme/src/nsm/nsm.c
@@ -139,7 +139,7 @@ The output is a matrix U (Ndofs X length(tspan)).
 {
   double tt;
   double rdelta,rrdelta;
-  double rand,cum,old;
+  double randd,cum,old;
   double *srrate,*rrate;
   double *sdrate,*Ddiag;
   double *rtimes;
@@ -259,16 +259,16 @@ The output is a matrix U (Ndofs X length(tspan)).
 
       /* First check if it is a reaction or a diffusion event. */
       totrate = srrate[subvol]+sdrate[subvol];
-      rand = drand48();
+      randd = drand48();
 
-      if (rand*totrate <= srrate[subvol]) {
+      if (randd*totrate <= srrate[subvol]) {
 	/* Reaction event. */
 	event = -1;
 
 	/* a) Determine the reaction re that did occur (direct SSA). */
-	rand *= totrate;
+	randd *= totrate;
 	for (re = 0, cum = rrate[subvol*Mreactions]; 
-	     re < Mreactions && rand > cum; 
+	     re < Mreactions && randd > cum; 
 	     re++, cum += rrate[subvol*Mreactions+re]);
 
 	/* elaborate floating point fix: */
@@ -319,10 +319,10 @@ The output is a matrix U (Ndofs X length(tspan)).
 	event = 1;
 
 	/* a) Determine which species... */
-	rand *= totrate;        
-	rand -= srrate[subvol];
+	randd *= totrate;        
+	randd -= srrate[subvol];
 	for (spec = 0, dof = subvol*Mspecies, cum = Ddiag[dof]*xx[dof]; 
-	     spec < Mspecies && rand > cum;
+	     spec < Mspecies && randd > cum;
 	     spec++, cum += Ddiag[dof+spec]*xx[dof+spec]);
 
 	/* elaborate floating point fix: */
@@ -344,11 +344,11 @@ The output is a matrix U (Ndofs X length(tspan)).
 
 	/* b) and then the direction of diffusion. */
 	col = dof+spec;
-	rand = drand48()*Ddiag[col];
+	randd = drand48()*Ddiag[col];
 
 	/* Search for diffusion direction. */
 	for (i = jcD[col], cum = 0.0; i < jcD[col+1]; i++)
-	  if (irD[i] != col && (cum += prD[i]) > rand) break;
+	  if (irD[i] != col && (cum += prD[i]) > randd) break;
 
 	/* simple floating point fix: */
 	if (i >= jcD[col+1]) i = jcD[col+1]-1;
diff --git a/workflows/DLCM/experiments/1_basic_test/basic_test.m b/workflows/DLCM/experiments/1_basic_test/basic_test.m
index bef77ede..c713a58e 100644
--- a/workflows/DLCM/experiments/1_basic_test/basic_test.m
+++ b/workflows/DLCM/experiments/1_basic_test/basic_test.m
@@ -47,6 +47,8 @@
 % diffusive pressure rate
 Drate = 1;
 
+mesh_type = 1
+
 % fetch discretization (mesh_type = 1 or 2)
 if ~exist('mesh_type','var'), error('Must define mesh_type.'); end
 [P,E,T,gradquotient] = basic_mesh(mesh_type,Nvoxels);
diff --git a/workflows/DLCM/experiments/1_basic_test/growth.m b/workflows/DLCM/experiments/1_basic_test/growth.m
index 0d8b8034..94c0c9ed 100644
--- a/workflows/DLCM/experiments/1_basic_test/growth.m
+++ b/workflows/DLCM/experiments/1_basic_test/growth.m
@@ -72,6 +72,17 @@
 % $$$   patch('Faces',R(jj,:),'Vertices',V,'FaceColor',[1 0 0]);
 % $$$   drawnow;
 
+    % visualization (somewhat slow)
+    figure(1), clf,
+    patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9]);
+    hold on,
+    axis([-1 1 -1 1]); axis square, axis off
+    ii = find(U == 1);
+    patch('Faces',R(ii,:),'Vertices',V,'FaceColor',[0 1 0]);
+    jj = find(U > 1);
+    patch('Faces',R(jj,:),'Vertices',V,'FaceColor',[1 0 0]);
+    drawnow;
+
   % classify the DOFs
 
   adof = find(U);
diff --git a/workflows/DLCM/experiments/1_basic_test/mechanics.m b/workflows/DLCM/experiments/1_basic_test/mechanics.m
index ca0690ae..76778746 100644
--- a/workflows/DLCM/experiments/1_basic_test/mechanics.m
+++ b/workflows/DLCM/experiments/1_basic_test/mechanics.m
@@ -33,6 +33,8 @@
 % BC1 >= BC2 is the only thing that makes sense. And normalizing one
 % of them to 0 sets the pressure unit.
 
+mesh_type = 1
+
 % fetch discretization (mesh_type = 1 or 2)
 if ~exist('mesh_type','var'), error('Must define mesh_type.'); end
 [P,E,T,gradquotient] = basic_mesh(mesh_type,Nvoxels);
@@ -59,7 +61,6 @@
 U(ii) = 0; % an empty bottom strip
 
 % classify the DOFs
-
 neigh = full(sum(N,2));
 adof = find(U);
 bdof_m = find(N*(U > 0) < neigh & U == 1);
@@ -114,10 +115,10 @@
                numel(sdof_m));
 moves = full(gradquotient*grad);
 
-% $$$ % pressure plot
-% $$$ figure(3), clf,
-% $$$ Pr_ = full(U); Pr_(Adof) = Pr;
-% $$$ trisurf(T(1:3,:)',P(1,:),P(2,:),Pr_);
+% pressure plot
+figure(3), clf,
+Pr_ = full(U); Pr_(Adof) = Pr;
+trisurf(T(1:3,:)',P(1,:),P(2,:),Pr_);
 
 % visualize rates
 figure(mesh_type), clf,
diff --git a/workflows/DLCM/experiments/1_basic_test/schematics.m b/workflows/DLCM/experiments/1_basic_test/schematics.m
index 86a8b284..cfc7233b 100644
--- a/workflows/DLCM/experiments/1_basic_test/schematics.m
+++ b/workflows/DLCM/experiments/1_basic_test/schematics.m
@@ -12,6 +12,8 @@
 % diffusive pressure rate
 Drate = 1;
 
+mesh_type = 1;
+
 % fetch discretization (mesh_type = 1 or 2)
 if ~exist('mesh_type','var'), error('Must define mesh_type.'); end
 [P,E,T,gradquotient] = basic_mesh(mesh_type,Nvoxels);
@@ -42,6 +44,7 @@
       'FaceColor',graphics_color('vermillion'));
 
 % "The Method"
+neigh = full(sum(N,2));
 adof = find(U);
 bdof_m = find(N*(U > 0) < neigh & U == 1);
 sdof = find(U > 1);
diff --git a/workflows/DLCM/experiments/2_avascular_tumour/AvascularTumour.m b/workflows/DLCM/experiments/2_avascular_tumour/AvascularTumour.m
index 8dbf8652..0d746152 100644
--- a/workflows/DLCM/experiments/2_avascular_tumour/AvascularTumour.m
+++ b/workflows/DLCM/experiments/2_avascular_tumour/AvascularTumour.m
@@ -20,9 +20,9 @@
 % S. Engblom 2017-02-11
 
 % simulation interval
-Tend = 1000;
+Tend = 11;
 tspan = linspace(0,Tend,101);
-report(tspan,'timeleft','init'); % (this estimator gets seriously confused!)
+%report(tspan,'timeleft','init'); % (this estimator gets seriously confused!)
 
 % The user specified cutoff and rate parameters for the proliferation,  
 % death, degradation and consumption rules.
@@ -264,10 +264,9 @@
   % update the visited sites
   VU = VU | U;
 end
-
 report(tt,U,'done');
 
-return;
+% return;
 
 % create a GIF animation
 
@@ -280,14 +279,19 @@
   hold on,
   axis([-1 1 -1 1]); axis square, axis off
   ii = find(Usave{i} == 1);
-  patch('Faces',R(ii,:),'Vertices',V, ...
+  single = patch('Faces',R(ii,:),'Vertices',V, ...
         'FaceColor',graphics_color('bluish green'));
+  
   ii = find(Usave{i} == 2);
-  patch('Faces',R(ii,:),'Vertices',V, ...
+  double = patch('Faces',R(ii,:),'Vertices',V, ...
         'FaceColor',graphics_color('vermillion'));
+  
   ii = find(Usave{i} == -1);
-  patch('Faces',R(ii,:),'Vertices',V, ...
+  dead = patch('Faces',R(ii,:),'Vertices',V, ...
         'FaceColor',[0 0 0]);
+    
+  legend([single, double, dead],'single','double', 'dead');
+  
   title(sprintf('Time = %d, Ncells = %d',tspan(i),full(sum(abs(Usave{i})))));
   drawnow;
   M(i) = getframe(gcf);
@@ -313,6 +317,7 @@
 ylim([0 max(y)]);
 xlabel('time')
 ylabel('N cells')
+legend('total', 'dead','double','single');
 
 return;
 
diff --git a/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/AvascularTumour.m b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/AvascularTumour.m
new file mode 100644
index 00000000..84b55858
--- /dev/null
+++ b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/AvascularTumour.m	
@@ -0,0 +1,334 @@
+% Simulation of an avascular tumour model.
+%
+%   Avascular tumour growth: An initial circular population of cells (one
+%   per voxel) lie in a domain rich in oxygen. Cells consume oxygen at
+%   a constant rate, cons. Cells occupying a voxel with oxygen above
+%   cutoff_prol proliferate at a rate r_prol. Cells occupying
+%   voxels with an oxygen concentration below cutoff_die die at a
+%   rate r_die. Dead cells degrade and stop occupying space at a
+%   rate r_degrade. Voxels with cell concentrations above 1 are 
+%   over-occupied and cause pressure, which will cause movement of cells.
+
+% M.C. Jayaweera & A. Graf Brolund 2021-01(revision)
+% S. Engblom 2017-12-27 (revision)
+% D. B. Wilson 2017-09-05
+% S. Engblom 2017-02-11
+
+clear;
+clc;
+close all;
+
+%% Initial experiment setup
+% cells live in a square of Nvoxels-by-Nvoxels
+Nvoxels = 121; % odd so the BC for oxygen can by centered
+
+% fetch Cartesian discretization
+[P,E,T,gradquotient] = basic_mesh(1,Nvoxels); 
+[V,R] = mesh2dual(P,E,T,'voronoi');
+
+D = 1; % D_rate, the rate with which cells move in the domain. 
+        % currently the rate is the same for visited voxels and non-visited
+
+% simulation interval
+Tend = 101;                  % final time step
+tspan = linspace(0,Tend,101);
+timescaling=0.005;          % time scaling
+
+% report(tspan,'timeleft','init'); %(this estimator gets seriously confused!)
+
+% the user specified cutoff and rate parameters for the proliferation,
+% death, degradation and consumption rules.
+cons = 0.0015;        % consumption of oxygen by cells
+cutoff_prol = 0.65;   % the minimum amount of oxygen for proliferation
+r_prol = 0.125;       % rate of proliferation of singly occupied voxels
+cutoff_die = 0.55;    % the maximum amount of oxygen where cells can die
+r_die = 0.125;        % rate of death
+r_degrade = 0.01;     % rate of degradation for already dead cells
+cutoff_bdof = 0.1;    % lower bound for bdof
+cutoff_remain = 0.01; % the minimum amount of living cells in a voxel
+cutoff_deg = 0.0001;  % the minimum amount of dead cells in a voxel
+
+% initial population: circular blob of living cells
+start_value = 1; % cell concentrations in the initial blob
+radius = 0.05;
+r = sqrt(P(1,:).^2+P(2,:).^2);
+ii = find(r < radius); % radius of the initial blob
+U = fsparse(ii(:),1,start_value,[Nvoxels^2 1]); % intialize
+U_new = fsparse(ii(:),1,start_value,[Nvoxels^2 1]);
+U_dead = fsparse(ii(:),1,0,[Nvoxels^2 1]);      
+U_deadnew = fsparse(ii(:),1,0,[Nvoxels^2 1]);   
+
+% boundary conditions
+OBC1 = 0; % BC for the oxygen equation for unvisited boundary
+OBC2 = 0; % BC for the visited boundary
+    
+% assemble minus the Laplacian on this grid (ignoring BCs), the voxel
+% volume vector, and the sparse neighbor matrix
+[L,dM,N] = dt_operators(P,T);       %N gives the neighbours 
+neigh = full(sum(N,2));
+
+N_vec = zeros(size(N,1),4);         % neighbour matrix used to find sdof_m
+for k = 1:size(N,1)
+    temp = find(N(k,:));
+    if length(temp) == 2
+        temp = [temp, temp(1),temp(2)];
+    elseif length(temp) == 3
+        temp = [temp, temp(1)];
+    end
+    N_vec(k,:) = temp;
+end
+
+% dofs for the sources at the extreme outer circular boundary
+[xc,yc] = getmidpointcircle(1/2*(Nvoxels+1),1/2*(Nvoxels+1),1/2*(Nvoxels-1));
+irem = find(xc < 1 | yc < 1 | xc > Nvoxels | yc > Nvoxels);
+xc(irem) = [];
+yc(irem) = [];
+extdof = find(sparse(xc,yc,1,Nvoxels,Nvoxels));
+
+% visit marker matrix: 1 for voxels who have been occupied 
+VU = (U ~= 0);
+
+% representation of solution: cell-vector of sparse matrices
+Usave = cell(1,numel(tspan));
+Usave{1} = U;
+Udsave = cell(1,numel(tspan));
+Udsave{1} = U_dead;
+
+% keeps track of oxygen and dofs for figures and understanding
+Oxysave = cell(1,numel(tspan));
+bdofsave = cell(1,numel(tspan));
+sdofsave = cell(1,numel(tspan));
+sdofbsave = cell(1,numel(tspan));
+
+tt = tspan(1);
+i = 1;
+
+La = struct('X',0,'L',0,'U',0,'p',0,'q',0,'R',0);
+OLa = struct('X',0,'L',0,'U',0,'p',0,'q',0,'R',0);
+
+% oxygen Laplacian
+OLa.X = L;
+OLai = fsparse(extdof,extdof,1,size(OLa.X));
+OLa.X = OLa.X-OLai*OLa.X+OLai;   
+
+[OLa.L,OLa.U,OLa.p,OLa.q,OLa.R] = lu(OLa.X,'vector');
+
+
+%% Time loop
+while tt <= tspan(end)
+    %% Initialise U and U_dead and classify the DOFs
+    U = U_new;
+    U_dead = U_deadnew;
+    U_and_U_dead = U | U_dead;
+  
+    adof = find(U_and_U_dead); % all filled voxels 
+    % singularly occupied voxels on the boundary: 
+    bdof_m = find(N*(U_and_U_dead ~= 0) < neigh & (U > cutoff_bdof & ...
+        U <= 1));
+    sdof = find(U > 1); % sdof on the boundary
+    sdof_b = find(N*(U_and_U_dead ~=0) < neigh & (U > 1));
+    % voxels with more than concentration 1 in them which may move, 
+    % with a voxel containing less number of cells next to it:
+    sdof_m = find(U - min(U(N_vec),[],2) > 0 & U>1);
+    % empty voxels touching occupied ones 
+    Idof = (N*(U_and_U_dead ~= 0) > 0 & U_and_U_dead == 0); 
+    idof1 = find(Idof & ~VU);         % "external" OBC1
+    idof2 = find(Idof & VU);          % "internal" OBC2
+    idof = find(Idof);
+    ddof = find(U_dead > 0);          % degrading voxels
+    
+    % "All DOFs" = adof + idof, like the "hull of adof"
+    Adof = [adof; idof];
+    % The above will be enumerated within U, a Nvoxels^2-by-1 sparse
+    % matrix. Determine also a local enumeration, eg. [1 2 3
+    % ... numel(Adof)].
+
+    Adof_ = (1:numel(Adof))';
+    [bdof_m_,sdof_,sdof_m_,idof1_,idof2_,idof_,adof_, sdof_b_,ddof_] = ...          
+       map(Adof_,Adof,bdof_m,sdof,sdof_m,idof1,idof2,idof,adof,sdof_b,ddof);
+    
+    %% Calculate Pressure and Oxygen systems
+    
+    % pressure Laplacian
+    La.X = L(Adof,Adof);
+    % remove emtpy voxels touching occupied ones
+    Lai = fsparse(idof_,idof_,1,size(La.X)); 
+    La.X = La.X-Lai*La.X+Lai;
+    [La.L,La.U,La.p,La.q,La.R] = lu(La.X,'vector');
+
+    % RHS source term proportional to the over-occupancy and BCs
+    Pr = full(fsparse(sdof_,1,(U(sdof)-1)./dM(sdof),...% equilibrium at U=1
+        [size(La.X,1) 1]));     % RHS first...
+    Pr(La.q) = La.U\(La.L\(La.R(:,La.p)\Pr)); % ..then the solution
+
+    % RHS source term proportional to the over-occupancy and BCs
+    Oxy = full(fsparse([extdof; adof],1, ...
+        [ones(size(extdof)); ...
+        -cons*full(U(adof)./dM(adof))], ... 
+        [size(OLa.X,1) 1]));
+    Oxy(OLa.q) = OLa.U\(OLa.L\(OLa.R(:,OLa.p)\Oxy));
+
+    %% Movement calculations
+    
+    % movement of cells in sources, sdof_m  
+    rates_sdof = zeros(length(Adof),1);
+    [ii,jj_] = find(N(sdof_m,Adof)); % neighbours
+    % pressure difference between cells and its neighbours
+    Pr_diff__ = max(Pr(sdof_m_(ii))-Pr(jj_),0); 
+    grad_sdof = fsparse(ii,1,Pr_diff__*D, numel(sdof_m));
+    rates_sdof(sdof_m_) = -gradquotient*grad_sdof; % sources lose cells
+    grad_N = fsparse(jj_,1, Pr_diff__*D, numel(Adof));
+    rates_sdof = rates_sdof + gradquotient*grad_N; % neighbours gain cells
+            
+    % movement of cells on the boundary, not over-occupied, bdof_m
+    % this loop is similar to what is done above for sdof and can be 
+    % rewritten in a similar fashion. Since this calculation is less 
+    % common, it is not a big concern for efficiency
+    rates_bdof = zeros(length(Adof),1);
+    for ind=1:length(bdof_m_)
+        ix = bdof_m(ind);
+        ix_ = bdof_m_(ind);
+
+        jx_ = find(N(ix,Adof)); % neighbours to the bdof
+        jx_ = jx_(U_and_U_dead(Adof(jx_)) == 0); % empty neighbours
+        Pr_diff = max(Pr(ix_)-Pr(jx_),0); % pressure difference
+
+        rates_bdof(ix_) = -sum(D*Pr_diff); % loses cells
+        rates_bdof(jx_) = rates_bdof(jx_) + D*Pr_diff; % gain cells
+    end
+
+
+    %%  Change calculation
+    %Change calculation of proliferation, death and degradation
+
+    % proliferation
+    ind_prol = find((Oxy > cutoff_prol)); % index of proliferating cells
+    prol_conc = r_prol*U(ind_prol); % concentration born in each time step
+
+    % death
+    ind_die = find(Oxy < cutoff_die); % index of dying cells
+    dead_conc = r_die*U(ind_die); % concentration dying in each time step
+
+    % degradation 
+    degrade_conc = U_deadnew(ddof)*r_degrade; % degraded in each time step
+
+%%  Calculate timestep dt  
+
+    ind_rates_sdof_n = find(rates_sdof(sdof_m_)<0); % affected voxels
+    ind_rates_bdof_n = find(rates_bdof(bdof_m_)<0);
+
+    % find the largest possible time step while avoiding U<0
+    dt_death = U_new(ind_die)./(dead_conc);
+    dt_sdof = U_new(sdof_m(ind_rates_sdof_n))./ ...
+        (-rates_sdof(sdof_m_(ind_rates_sdof_n)));
+    dt_bdof = U_new(bdof_m(ind_rates_bdof_n))./ ...
+        (-rates_bdof(bdof_m_(ind_rates_bdof_n)));
+
+    dt_unscaled = (min([dt_death; dt_sdof; dt_bdof;(0.1*Tend)]));
+    dt = dt_unscaled*timescaling; % scale dt smaller
+
+    %%  Report back and save time series of current states 
+    if tspan(i+1) < tt+dt
+        iend = i+find(tspan(i+1:end) < tt+dt,1,'last');
+
+        % save relevant values 
+        Usave(i+1:iend) = {U};
+        Udsave(i+1:iend) = {U_dead};
+
+        Oxysave(i+1:iend) = {Oxy};
+        bdofsave(i+1:iend) = {bdof_m};
+        sdofsave(i+1:iend) = {sdof_m};
+        sdofbsave{i+1:iend} = {sdof_b};
+
+        i = iend;
+    end
+
+    %%  Euler forward step
+
+    %Proliferation
+    U_new(ind_prol)=U_new(ind_prol)+prol_conc*dt;
+
+    %Death
+    U_new(ind_die) = U_new(ind_die) - dead_conc*dt;
+    U_deadnew(ind_die) = U_deadnew(ind_die) + dead_conc*dt;
+    ind_cutoff =  find(U_new < cutoff_remain & (Oxy < cutoff_die));    
+    U_new(ind_cutoff) = 0; % remove cells below cutoff_remain 
+
+    % Degradation
+    U_deadnew(ddof) = U_deadnew(ddof) - degrade_conc*dt;
+    U_deadnew(U_deadnew < cutoff_deg) = 0; % remove cells below cutoff_deg
+
+    % movement of cells in sources, sdof_m
+    U_new(Adof) = U_new(Adof) + rates_sdof.*dt;
+
+    % movement of cells in boundary voxels, bdof_m
+    U_new(Adof) = U_new(Adof) + rates_bdof*dt;
+
+
+    %% Step in time
+    tt = tt+dt; 
+%     report(tt,U,'');
+    
+    % update the visited sites
+    VU = VU | U_new;
+end
+
+%report(tt,U,'done');
+
+% return;
+
+%% Create a GIF-animation
+
+% create a GIF animation
+Tumour = struct('cdata',{},'colormap',{});
+fig=figure(1);
+clf,
+
+Umat=full(cell2mat(Usave));
+colorbar;
+caxis([0 max(max(Umat))])
+colorlabel('Concentration of cells, U')
+
+for i = 1:numel(Usave)        
+    % background
+    patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9], ...
+        'EdgeColor','none');
+
+    hold on,
+    axis([-1 1 -1 1]); axis square, axis off
+
+    % colour living voxels after concentration level
+    ii = find(Usave{i}>0);
+    c = Umat(ii,i);
+    patch('Faces',R(ii,:),'Vertices',V,'FaceVertexCData',c, ... 
+        'FaceColor','flat');     
+
+    % colour (fully) dead voxels black
+    ii = find(Usave{i} == 0 & Udsave{i} > 0);
+    p_dead = patch('Faces',R(ii,:),'Vertices',V, ...
+        'FaceColor',[0 0 0]);
+    %legend(p_dead,'dead')
+
+    title(sprintf('Time = %d',tspan(i)));
+    drawnow;
+    Tumour(i) = getframe(gcf);
+
+    % saves the GIF
+    movie2gif(Tumour,{Tumour.cdata},'Tumour.gif', ...
+             'delaytime',0.1,'loopcount',0);
+
+end
+
+%% SAVE DATA
+saveData = struct('U', {U}, 'VU', {VU}, 'Usave', {Udsave}, 'tspan', {tspan}, ...
+    'R', {R}, 'V', {V}, 'N', {N}, 'Udsave', {Udsave}, ...
+    'Pr', {Pr}, 'Adof', {Adof},'adof', {adof}, 'adof_', {adof_}, ...
+    'idof', {idof},'idof_', {idof_}, 'Nvoxels',{Nvoxels}, ...
+    'P', {P}, 'bdof_m', {bdof_m}, 'bdof_m_', {bdof_m_}, ...
+    'sdof_m', {sdof_m},'sdof_m_', {sdof_m_}, 'gradquotient', {gradquotient}, ...
+    'Tend', {Tend});
+filename = "Data";
+filename_saveData = filename + ".mat";
+save(filename_saveData,'-struct','saveData');
+
+
diff --git a/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Experiments/AvascularTumour_Relaxation.m b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Experiments/AvascularTumour_Relaxation.m
new file mode 100644
index 00000000..5cfa961e
--- /dev/null
+++ b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Experiments/AvascularTumour_Relaxation.m	
@@ -0,0 +1,244 @@
+% Simulation of an avascular tumour model.
+%
+%   Avascular tumour growth, relaxation experiment: An initial circular 
+%   population of cells lie crowded together (concentration>1). When 
+%   released, the pressure exerted between the cells will have them move 
+%   out in the domain.
+%
+%   No proliferation, death, or degradation is allowed.  
+
+% C. Jayaweera & A. Graf Brolund 2021-01 (revision)
+% S. Engblom 2017-12-27 (revision)
+% D. B. Wilson 2017-09-05
+% S. Engblom 2017-02-11
+
+clear;
+clc;
+close all;
+
+% cells live in a square of Nvoxels-by-Nvoxels
+Nvoxels = 121; % odd so the BC for oxygen can by centered
+
+% fetch Cartesian discretization
+[P,E,T,gradquotient] = basic_mesh(1,Nvoxels);  %gradquotient=1 for... 
+[V,R] = mesh2dual(P,E,T,'voronoi');             %cartesian mesh
+
+D = 1; % D_rate, the rate with which cells move in the domain. 
+        % currently the rate is the same for visited voxels and non-visited
+
+% simulation interval
+Tend = 100;
+tspan = linspace(0,Tend,101);
+timescaling = 0.005;
+
+% initial population: circular blob of living cells    
+start_value = 10; % cell concentrations in the initial blob
+radius = 0.07;
+r = sqrt(P(1,:).^2+P(2,:).^2);
+ii = find(r < radius); % radius of the initial blob
+U = fsparse(ii(:),1,start_value,[Nvoxels^2 1]); % initialize
+U_new = fsparse(ii(:),1,start_value,[Nvoxels^2 1]);
+U_dead = fsparse(ii(:),1,0,[Nvoxels^2 1]); 
+U_deadnew = fsparse(ii(:),1,0,[Nvoxels^2 1]); 
+
+% parameters
+cutoff_bdof = 0.1;    % lower bound for bdof
+
+% boundary conditions
+OBC1 = 0; % BC for the oxygen equation for unvisited boundary
+OBC2 = 0; % BC for the visited boundary
+
+% assemble minus the Laplacian on this grid (ignoring BCs), the voxel
+% volume vector, and the sparse neighbor matrix
+[L,dM,N] = dt_operators(P,T);       %N gives the neighbours 
+neigh = full(sum(N,2));
+
+N_vec = zeros(size(N,1),4);          % neighbour matrix used to find sdof_m
+for k = 1:size(N,1)
+    temp = find(N(k,:));
+    if length(temp) == 2
+        temp = [temp, temp(1),temp(2)];
+    elseif length(temp) == 3
+        temp = [temp, temp(1)];
+    end
+    N_vec(k,:) = temp;
+end
+
+% visit marker matrix: 1 for voxels who have been occupied
+VU = (U ~= 0);
+
+% representation of solution: cell-vector of sparse matrices
+Usave = cell(1,numel(tspan));
+Usave{1} = U;
+Udsave = cell(1,numel(tspan));
+Udsave{1} = U_dead;
+
+% keeps track of dofs for figures and understanding
+bdofsave = cell(1,numel(tspan));
+sdofsave = cell(1,numel(tspan));
+sdofbsave = cell(1,numel(tspan));
+
+tt = tspan(1);
+i = 1;
+La = struct('X',0,'L',0,'U',0,'p',0,'q',0,'R',0);
+
+while tt <= tspan(end)
+    U = U_new;
+    U_dead = U_deadnew;
+    
+    %% Init U and U_dead and classify the DOFs
+    U = U_new;
+    U_dead = U_deadnew;
+    U_and_U_dead = U | U_dead;
+  
+    %Classification of the DOFs
+    
+    adof = find(U_and_U_dead); % all filled voxels 
+    % singularly occupied voxels on the boundary: 
+    bdof_m = find(N*(U_and_U_dead ~= 0) < neigh & (U > cutoff_bdof & ...
+        U <= 1));
+    sdof = find(U > 1); % source voxels,concentration more than 1
+    % sdof on the boundary
+    sdof_b = find(N*(U_and_U_dead ~=0) < neigh & (U > 1)); 
+    % voxels with more than concentration 1 in them which may move, 
+    % with a voxel containing a lower concentrations next to it:
+    sdof_m = find(U - min(U(N_vec),[],2) > 0 & U>1);
+    % empty voxels touching occupied ones  
+    Idof = (N*(U_and_U_dead ~= 0) > 0 & U_and_U_dead == 0);      
+    idof1 = find(Idof & ~VU);         % "external" OBC1
+    idof2 = find(Idof & VU);          % "internal" OBC2
+    idof = find(Idof);
+    
+    % "All DOFs" = adof + idof, like the "hull of adof"
+    Adof = [adof; idof];
+    % The above will be enumerated within U, a Nvoxels^2-by-1 sparse
+    % matrix. Determine also a local enumeration, eg. [1 2 3
+    % ... numel(Adof)].
+
+    Adof_ = (1:numel(Adof))';
+    [bdof_m_,sdof_,sdof_m_,idof1_,idof2_,idof_,adof_, sdof_b_] = ...          
+      map(Adof_,Adof,bdof_m,sdof,sdof_m,idof1,idof2,idof,adof,sdof_b);
+    
+    %% Calculate Pressure
+
+    % pressure Laplacian
+    La.X = L(Adof,Adof);
+    %remove emtpy voxels touching occupied ones
+    Lai = fsparse(idof_,idof_,1,size(La.X)); 
+    La.X = La.X-Lai*La.X+Lai;
+    [La.L,La.U,La.p,La.q,La.R] = lu(La.X,'vector');
+
+    % RHS source term proportional to the over-occupancy and BCs
+    Pr = full(fsparse(sdof_,1,(U(sdof)-1)./dM(sdof),... %equilibrium at U=1
+        [size(La.X,1) 1]));   % RHS first...                           
+    Pr(La.q) = La.U\(La.L\(La.R(:,La.p)\Pr)); % ..then the solution
+
+    %% Movement calculations
+    
+    % movement of cells in sources, sdof_m
+    rates_sdof = zeros(length(Adof),1);
+    [ii,jj_] = find(N(sdof_m,Adof)); % neighbours
+    % pressure difference between cells and its neighbours    
+    Pr_diff__ = max(Pr(sdof_m_(ii))-Pr(jj_),0);   
+    grad_sdof = fsparse(ii,1,Pr_diff__*D, numel(sdof_m));
+    rates_sdof(sdof_m_) = -gradquotient*grad_sdof; % sources lose cells
+    grad_N = fsparse(jj_,1, Pr_diff__*D, numel(Adof));
+    rates_sdof = rates_sdof + gradquotient*grad_N; % neighbours gain cells
+                         
+    % movement of cells on the boundary, not over-occupied, bdof_m
+    % this loop is similar to what is done above for sdof and can be 
+    % rewritten in a similar fashion. Since this calculation is less 
+    % common, it is not a big concern for efficiency
+    rates_bdof = zeros(length(Adof),1);
+    for ind=1:length(bdof_m_)
+        ix = bdof_m(ind);
+        ix_ = bdof_m_(ind);
+
+        jx_ = find(N(ix,Adof)); % neighbour to the voxel
+        jx_ = jx_(U_and_U_dead(Adof(jx_)) == 0); % empty neighbours
+        Pr_diff = max(Pr(ix_)-Pr(jx_),0); % pressure difference
+
+        rates_bdof(ix_) = -sum(D*Pr_diff); % loses cells        
+        rates_bdof(jx_) = rates_bdof(jx_) + D*Pr_diff; % gain cells
+    end
+ 
+    %%  Calculate timestep dt 
+    
+    ind_rates_sdof_n = find(rates_sdof(sdof_m_)<0); % affected voxels
+    ind_rates_bdof_n = find(rates_bdof(bdof_m_)<0);
+
+    % find the largest possible time step while avoiding U<0
+    dt_sdof = U_new(sdof_m(ind_rates_sdof_n))./ ... 
+        (-rates_sdof(sdof_m_(ind_rates_sdof_n)));
+    dt_bdof = U_new(bdof_m(ind_rates_bdof_n))./ ... 
+        (-rates_bdof(bdof_m_(ind_rates_bdof_n)));
+
+    dt = min([dt_sdof; dt_bdof;(0.1*Tend)])*timescaling; % scale dt smaller
+   
+    %% Report back and save time series of current states 
+
+    if tspan(i+1) < tt+dt
+        iend = i+find(tspan(i+1:end) < tt+dt,1,'last');
+
+        % save relevant values 
+        Usave(i+1:iend) = {U};
+        Udsave(i+1:iend) = {U_dead};
+
+        bdofsave(i+1:iend) = {bdof_m};
+        sdofsave(i+1:iend) = {sdof_m};
+        sdofbsave{i+1:iend} = {sdof_b};
+
+        i = iend;
+    end
+    
+    %% Euler forward step
+    
+    % movement of cells in sources, sdof_m
+    U_new(Adof) = U_new(Adof) + rates_sdof.*dt;
+
+    % movement of cells in boundary voxels, bdof_m
+    U_new(Adof) = U_new(Adof) + rates_bdof*dt;
+    
+    tt = tt+dt;
+%     report(tt,U,'');
+    
+    % update the visited sites
+    VU = VU | U;
+end
+% report(tt,U,'done');
+
+%% Create a GIF animation
+Mnormal = struct('cdata',{},'colormap',{});
+figure(1), clf,
+
+Umat=full(cell2mat(Usave));
+colorbar
+caxis([0 max(max(Umat))])
+colorlabel('Concentration of cells, U')
+for i = 1:numel(Usave)
+    % background
+    patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9], ...
+        'EdgeColor','none');
+    hold on,
+    axis([-1 1 -1 1]); axis square, axis off
+    
+    % colour living voxels after concentration level
+    ii = find(Usave{i}>0);
+    c = Umat(ii,i);
+    patch('Faces',R(ii,:),'Vertices',V,'FaceVertexCData',c, ... 
+        'FaceColor','flat');     
+    
+    % color (fully) dead voxels black
+    ii = find(Usave{i} == 0 & Udsave{i} > 0);
+    p_dead = patch('Faces',R(ii,:),'Vertices',V, ...
+        'FaceColor',[0 0 0]);
+    legend(p_dead,'dead')
+    
+    title(sprintf('Time = %d',tspan(i)));
+    drawnow;
+    Mnormal(i) = getframe(gcf);
+end
+
+% save the GIF
+movie2gif(Mnormal,{Mnormal([1:2 end]).cdata},'Tumour.gif', ...
+          'delaytime',0.1,'loopcount',0);
diff --git a/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Experiments/AvascularTumour_StillGrowth.m b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Experiments/AvascularTumour_StillGrowth.m
new file mode 100644
index 00000000..cf31993c
--- /dev/null
+++ b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Experiments/AvascularTumour_StillGrowth.m	
@@ -0,0 +1,242 @@
+% Simulation of an avascular tumour model.
+%
+%   Avascular tumour growth, still growth experiment: An initial circular
+%   population cells (one per voxel) lie in a domain rich in oxygen. Cells 
+%   consume oxygen at a constant rate, cons. Cells occupying a voxel with 
+%   oxygen above cutoff_prol proliferate at a rate r_prol. Cells occupying
+%   voxels with an oxygen concentration below cutoff_die die at a
+%   rate r_die. Dead cells degrade and stop occupying space at a
+%   rate r_degrade.
+%   
+%   No movement of the cells is allowed. Pressure between cells is
+%   calculated but does not cause any actions.
+
+% C. Jayaweera & A. Graf Brolund 2021-01 (revision)
+% S. Engblom 2017-12-27 (revision)
+% D. B. Wilson 2017-09-05
+% S. Engblom 2017-02-11
+
+clear;
+clc;
+close all;
+
+% cells live in a square of Nvoxels-by-Nvoxels
+Nvoxels = 121; % odd so the BC for oxygen can by centered
+
+% fetch Cartesian discretization
+[P,E,T,gradquotient] = basic_mesh(1,Nvoxels); 
+[V,R] = mesh2dual(P,E,T,'voronoi');
+
+D = 1; % D_rate, the rate with which cells move in the domain. 
+        % currently the rate is the same for visited voxels and non-visited
+
+% simulation interval
+Tend = 100;
+tspan = linspace(0,Tend,101);
+timescaling = 0.005;
+
+% report(tspan,'timeleft','init'); % (this estimator gets seriously confused!)
+
+% The user specified cutoff and rate parameters for the proliferation,
+% death, degradation and consumption rules.
+cons = 0.0015;        % consumption of oxygen by cells
+cutoff_prol = 0.65;   % the minimum amount of oxygen for proliferation
+r_prol = 0.125;       % rate of proliferation of singly occupied voxels
+cutoff_die = 0.55;    % the maximum amount of oxygen where cells can die
+r_die = 0.125;        % rate of death
+r_degrade = 0.01;     % rate of degradation for already dead cells
+cutoff_deg = 0.0001;  % the minimum amount of dead cells in a voxel
+cutoff_remain = 0.01; % the minimum amount of alive cells in a voxel
+
+% Initial population: circular blob of living cells
+start_value = 1; % cell concentrations in the initial blob
+radius = 0.25;
+r = sqrt(P(1,:).^2+P(2,:).^2);
+ii = find(r < radius); % radius of the initial blob
+U = fsparse(ii(:),1,start_value,[Nvoxels^2 1]); % intialize
+U_new = fsparse(ii(:),1,start_value,[Nvoxels^2 1]);
+U_dead = fsparse(ii(:),1,0,[Nvoxels^2 1]);      
+U_deadnew = fsparse(ii(:),1,0,[Nvoxels^2 1]);  
+
+% boundary conditions
+OBC1 = 0; % BC for the oxygen equation for unvisited boundary
+OBC2 = 0; % BC for the visited boundary
+
+% assemble minus the Laplacian on this grid (ignoring BCs), the voxel
+% volume vector, and the sparse neighbor matrix
+[L,dM,N] = dt_operators(P,T);       %N gives the neighbours 
+neigh = full(sum(N,2));
+
+% dofs for the sources at the extreme outer circular boundary
+[xc,yc] = getmidpointcircle(1/2*(Nvoxels+1),1/2*(Nvoxels+1),1/2*(Nvoxels-1));
+irem = find(xc < 1 | yc < 1 | xc > Nvoxels | yc > Nvoxels);
+xc(irem) = [];
+yc(irem) = [];
+extdof = find(sparse(xc,yc,1,Nvoxels,Nvoxels));
+
+% visit marker matrix: 1 for voxels who have been occupied
+VU = (U ~= 0);
+
+% representation of solution: cell-vector of sparse matrices
+Usave = cell(1,numel(tspan));
+Usave{1} = U;
+Udsave = cell(1,numel(tspan));
+Udsave{1} = U_dead;
+
+% for keeping track of the oxygen
+Oxysave = cell(1,numel(tspan));
+
+tt = tspan(1);
+i = 1;
+La = struct('X',0,'L',0,'U',0,'p',0,'q',0,'R',0);
+OLa = struct('X',0,'L',0,'U',0,'p',0,'q',0,'R',0);
+
+% oxygen Laplacian
+OLa.X = L;
+OLai = fsparse(extdof,extdof,1,size(OLa.X));
+OLa.X = OLa.X-OLai*OLa.X+OLai;   
+[OLa.L,OLa.U,OLa.p,OLa.q,OLa.R] = lu(OLa.X,'vector');
+
+while tt <= tspan(end)
+    U = U_new;
+    U_dead = U_deadnew;
+    
+    %% Init U and U_dead and classify the DOFs
+    U = U_new;
+    U_dead = U_deadnew;
+    U_and_U_dead = U | U_dead;
+  
+    %Classification of the DOFs
+    adof = find(U_and_U_dead); % all filled voxels 
+    sdof = find(U > 1); % source voxels,concentration more than 1
+    % empty voxels touching occupied ones 
+    Idof = (N*(U_and_U_dead ~= 0) > 0 & U_and_U_dead == 0);          
+    idof1 = find(Idof & ~VU); % "external" OBC1
+    idof2 = find(Idof & VU); % "internal" OBC2
+    idof = find(Idof);
+    ddof = find(U_dead > 0); % degrading voxels
+    
+    % "All DOFs" = adof + idof, like the "hull of adof"
+    Adof = [adof; idof];
+    % The above will be enumerated within U, a Nvoxels^2-by-1 sparse
+    % matrix. Determine also a local enumeration, eg. [1 2 3
+    % ... numel(Adof)].
+
+    Adof_ = (1:numel(Adof))';
+    [sdof_,idof1_,idof2_,idof_,adof_,ddof_] = ...          
+       map(Adof_,Adof,sdof,idof1,idof2,idof,adof,ddof);
+     
+    %% Calculate Pressure and Oxygen systems
+    
+    % pressure Laplacian
+    La.X = L(Adof,Adof);
+    %remove emtpy voxels touching occupied ones
+    Lai = fsparse(idof_,idof_,1,size(La.X)); 
+    La.X = La.X-Lai*La.X+Lai;
+    [La.L,La.U,La.p,La.q,La.R] = lu(La.X,'vector');
+
+    % RHS source term proportional to the over-occupancy and BCs
+    Pr = full(fsparse(sdof_,1,(U(sdof)-1)./dM(sdof), ...%equilibrium at U=1
+        [size(La.X,1) 1]));     % RHS first...
+    Pr(La.q) = La.U\(La.L\(La.R(:,La.p)\Pr)); % ..then the solution
+
+    % RHS source term proportional to the over-occupancy and BCs
+    Oxy = full(fsparse([extdof; adof],1, ...
+        [ones(size(extdof)); ...
+        -cons*full(U(adof)./dM(adof))], ... 
+        [size(OLa.X,1) 1]));
+    Oxy(OLa.q) = OLa.U\(OLa.L\(OLa.R(:,OLa.p)\Oxy));
+    
+    %%  Change calculation
+    % proliferation, death and degradation
+
+    %proliferation
+    ind_prol = find((Oxy > cutoff_prol));   %index of proliferating cells
+    prol_conc = r_prol*U(ind_prol);
+
+    %death
+    ind_die = find(Oxy < cutoff_die);   %index for dying cells
+    dead_conc = r_die*U(ind_die);
+
+    %degradation
+    degrade_conc = U_deadnew(ddof)*r_degrade; 
+    
+    %%  Calculate time step dt    
+    % find the largest possible time step while avoiding U<0
+    dt_death = U_new(ind_die)./(dead_conc); 
+
+    dt = min([dt_death;(0.1*Tend)])*timescaling; % scale dt smaller
+   
+    %% Report back and save time series of current states
+ 
+    if tspan(i+1) < tt+dt
+        iend = i+find(tspan(i+1:end) < tt+dt,1,'last');
+
+        % save relevant values 
+        Usave(i+1:iend) = {U};
+        Udsave(i+1:iend) = {U_dead};
+
+        Oxysave(i+1:iend) = {Oxy};
+
+        i = iend;
+    end
+    
+    %% Euler steps
+    
+    %Proliferation
+    U_new(ind_prol) = U_new(ind_prol)+prol_conc*dt;
+
+    %Death
+    U_new(ind_die) = U_new(ind_die) - dead_conc*dt;
+    U_deadnew(ind_die) = U_deadnew(ind_die) + dead_conc*dt;
+    ind_cutoff =  find(U_new < cutoff_remain & (Oxy < cutoff_die));
+    U_new(ind_cutoff) = 0; % remove cells below cutoff_remain
+    
+    % Degradation
+    U_deadnew(ddof) = U_deadnew(ddof) - degrade_conc*dt;
+    U_deadnew(U_deadnew < cutoff_deg) = 0; % remove cells below cutoff_deg
+    
+    %% Step in time    
+    tt = tt+dt;
+%     report(tt,U,'');
+    
+    % update the visited sites
+    VU = VU | U;
+end
+% report(tt,U,'done'); 
+
+%% Create a GIF animation
+Mnormal = struct('cdata',{},'colormap',{});
+figure(1), clf, 
+
+Umat=full(cell2mat(Usave));
+colorbar
+caxis([min(min(Umat)) max(max(Umat))])
+colorlabel('Concentration of cells, U')
+for i = 1:numel(Usave)
+    % background
+    patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9], ...
+        'EdgeColor','none');
+    hold on,
+    axis([-1 1 -1 1]); axis square, axis off
+    
+    % colour living voxels after concentration level
+    ii = find(Usave{i}>0);
+    c = Umat(ii,i);
+    patch('Faces',R(ii,:),'Vertices',V,'FaceVertexCData',c, ... 
+        'FaceColor','flat');     
+
+    % color (fully) dead voxels black
+    ii = find(Usave{i} == 0 & Udsave{i} > 0);
+    p_dead = patch('Faces',R(ii,:),'Vertices',V, ...
+        'FaceColor',[0 0 0]);
+    legend(p_dead,'dead')
+    
+    title(sprintf('Time = %d',tspan(i)));
+    drawnow;
+    Mnormal(i) = getframe(gcf);
+end
+
+% save the GIF
+movie2gif(Mnormal,{Mnormal([1:2 end]).cdata},'Tumour.gif', ...
+          'delaytime',0.1,'loopcount',0);
\ No newline at end of file
diff --git a/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/TumorGraphics.m b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/TumorGraphics.m
new file mode 100644
index 00000000..b22266db
--- /dev/null
+++ b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/TumorGraphics.m	
@@ -0,0 +1,172 @@
+% Can be run after AvascularTumour for additional visuals 
+
+snapshotfigure = 0; %creates a gif and 5 snapshot figures
+doffigure = 1;      %visualizes the sdof's and bdof's as well as the concentration in other voxels
+deadfigure = 1;     %visualizes the dead cells in every voxel 
+oxyfigure = 1;      %visualizes the concentration oxygen in the grid
+
+%% Snapshot figure
+if snapshotfigure == 1
+    % create a GIF animation
+    Tumour = struct('cdata',{},'colormap',{});
+    fig=figure(1);
+    clf,
+
+    Umat=full(cell2mat(Usave));
+    colorbar;
+    caxis([0 max(max(Umat))])
+    colorlabel('Concentration of cells, U')
+
+    snapshot = 0;    %Save 5 snapshots 
+
+    for i = 1:numel(Usave)        
+        % background
+        patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9], ...
+            'EdgeColor','none');
+
+        hold on,
+        axis([-1 1 -1 1]); axis square, axis off
+
+        % colour living voxels after concentration level
+        ii = find(Usave{i}>0);
+        c = Umat(ii,i);
+        patch('Faces',R(ii,:),'Vertices',V,'FaceVertexCData',c, ... 
+            'FaceColor','flat');     
+
+        % colour (fully) dead voxels black
+        ii = find(Usave{i} == 0 & Udsave{i} > 0);
+        p_dead = patch('Faces',R(ii,:),'Vertices',V, ...
+            'FaceColor',[0 0 0]);
+        %legend(p_dead,'dead')
+
+        title(sprintf('Time = %d',tspan(i)));
+        drawnow;
+        Tumour(i) = getframe(gcf);
+
+        %Save 5 snapshots of the tumor progression
+        if i~= [1 ceil([0.24 0.49 0.74 1]*numel(Usave))]
+        elseif i==1
+            filename = 'T=1.png';
+            print(fig,filename,'-painters','-dpng');
+        else
+            ii=ceil(i*(Tend/numel(Usave)));
+            filename = ['T=' num2str(ii) '.png'];
+            print(fig,filename,'-painters','-dpng');
+        end
+
+
+        % saves the GIF
+        movie2gif(Tumour,{Tumour([1:2 end]).cdata},'Tumour.gif', ...
+                 'delaytime',0.1,'loopcount',0);
+
+    end
+end
+
+%% dof figure
+if doffigure==1
+    % create a GIF animation
+    Mdof = struct('cdata',{},'colormap',{});
+    figure(2), clf,
+
+    Umat=full(cell2mat(Usave));
+    colorbar
+    caxis([0 max(max(Umat))])
+    colorlabel('Concentration of cells, u')
+    for i = 1:numel(Usave)
+
+        patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9], ...
+            'EdgeColor','none');
+        hold on,
+        axis([-1 1 -1 1]); axis square, axis off
+
+        ii = find(Usave{i}>0);
+        c = Umat(ii,i);
+        patch('Faces',R(ii,:),'Vertices',V,'FaceVertexCData',c,'FaceColor','flat');    
+        
+        % colour the different dof types
+        p_bdof = patch('Faces',R(bdofsave{i},:),'Vertices',V, ...
+            'FaceColor','cyan'); 
+
+        p_sdof = patch('Faces',R(sdofsave{i},:),'Vertices',V, ...
+            'FaceColor','magenta'); 
+
+        % colour (fully) dead voxels black
+        ii = find(Usave{i} == 0 & Udsave{i} >0);
+        p_dead = patch('Faces',R(ii,:),'Vertices',V, ...
+            'FaceColor',[0 0 0]);
+        legend([p_bdof,p_sdof,p_dead],'bdof','sdof','dead')
+
+        title(sprintf('Time = %d',tspan(i)));
+
+        drawnow;
+        Mdof(i) = getframe(gcf);
+    end
+
+    % saves the GIF
+    movie2gif(Mdof,{Mdof([1:2 end]).cdata},'TumourMdof.gif', ...
+              'delaytime',0.1,'loopcount',0);
+end
+    
+    
+%% oxygen figure
+if oxyfigure==1
+    % create a GIF animation
+    Mnormal = struct('cdata',{},'colormap',{});
+    figure(3), clf,
+
+    Oxymat=full(cell2mat(Oxysave));
+    colorbar
+    caxis([min(min(Oxymat)) max(max(Oxymat))])
+    colorlabel('Concentration of oxygen, c')
+    for i = 1:numel(Usave)
+
+        patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9], ...
+            'EdgeColor','none');
+        hold on,
+        axis([-1 1 -1 1]); axis square, axis off
+
+        % Colour of oxygen concentration
+        ii = find(Oxysave{i}>0);
+        c = Oxymat(ii,i);
+        patch('Faces',R(ii,:),'Vertices',V,'FaceVertexCData',c,'FaceColor','flat');     
+
+        title(sprintf('Time = %d',tspan(i)));
+        drawnow;
+        Mnormal(i) = getframe(gcf);
+    end
+    % saves the GIF
+    movie2gif(Mnormal,{Mnormal([1:2 end]).cdata},'Oxynormal.gif', ...
+              'delaytime',0.1,'loopcount',0);
+end
+
+%% Dead figure
+if deadfigure==1
+    Mdead = struct('cdata',{},'colormap',{});
+    figure(4), clf,
+
+    Udmat=cell2mat(Udsave);
+    cmat = full(Udmat/max(max(Udmat)));
+    caxis([1 2])
+    colorbar;
+    colormap 'gray'
+
+    for i = 1:numel(Udsave)
+        patch('Faces',R,'Vertices',V,'FaceColor',[0.9 0.9 0.9], ...
+            'EdgeColor','none');
+        hold on,
+        axis([-1 1 -1 1]); axis square, axis off
+
+        % colour (fully) dead voxels black
+        ii = find(Udsave{i}>0);
+        c = cmat(ii,i);
+        patch('Faces',R(ii,:),'Vertices',V,'FaceVertexCData',c,'FaceColor','flat');    
+
+        title(sprintf('Time = %d',tspan(i)));
+        drawnow;
+        Mdead(i) = getframe(gcf);
+    end
+    % saves the GIF
+    movie2gif(Mdead,{Mdead([1:2 end]).cdata},'TumourMDead.gif', ...
+              'delaytime',0.1,'loopcount',0);
+end
+
diff --git a/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/colorlabel.m b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/colorlabel.m
new file mode 100644
index 00000000..5ae66268
--- /dev/null
+++ b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/colorlabel.m	
@@ -0,0 +1,75 @@
+function h=colorlabel(varargin)
+% Usage: h=colorlabel([hcb],labstring,[options])
+% Given a string "labstring", colorlabel finds the colorbar on the current
+% figure and labels the appropriate axis with that string. The handle of
+% the axis label is returned as "h". If the figure contains more than one
+% colorbar, provide the graphics handle "hcb" of the one that is to be 
+% labeled. You many also specify properties for the label object, e.g.,
+% colorlabel('x (cm)','FontSize',8,'FontWeight','bold').
+%
+%   Example:
+%      load clown
+%      imagesc(X)
+%      colorbar
+%      colorlabel('pixel brightness')
+
+% Written 7 February 2011 by Douglas H. Kelley, dhk [at] dougandneely.com.
+
+if nargin<1
+    error(['Usage: ' mfilename '([hcb],labstring,[options])'])
+end
+if ischar(varargin{1}) % expect colorlabel(labstring,...)
+    labstring=varargin{1};
+    hcb=findobj(gcf,'tag','Colorbar','-or','tag','MapColorbar', ...
+        '-or','tag','ColorWheel');
+    if numel(varargin)<2
+        argin=[varargin cell(1,2-numel(varargin))];
+    else
+        argin=varargin;
+    end
+    options=argin(2:end);
+else % expect colorlabel(hcb,labstring,...)
+    hcb=varargin{1};
+    labstring=varargin{2};
+    if numel(varargin)<3
+        argin=[varargin cell(1,3-numel(varargin))];
+    else
+        argin=varargin;
+    end
+    options=argin(3:end);
+end
+if numel(hcb)~=1
+    error('Please specify exactly one colorbar.')
+end
+if ~ishandle(hcb)
+    error(['Sorry, ' num2str(hcb) ' is not a valid graphics handle.'])
+elseif ~strcmp(get(hcb,'tag'),'Colorbar') && ...
+        ~strcmp(get(hcb,'tag'),'MapColorbar') && ...
+        ~strcmp(get(hcb,'tag'),'ColorWheel')
+    error(['Sorry, ' num2str(hcb) ...
+        ' is not a Colorbar, MapColorbar, or ColorWheel.']);
+end
+if strcmp(get(hcb,'tag'),'Colorbar')
+    if strfind(get(hcb,'Orientation'),'Horizontal') % horizontal colorbar
+        hh=get(hcb,'xlabel');
+    else % vertical colorbar
+        hh=get(hcb,'ylabel');
+    end
+elseif strcmp(get(hcb,'tag'),'MapColorbar') % guess orientation from size of colorbar image
+    c0=get(findobj(hcb,'type','image'),'cdata');
+    if size(c0,1)<size(c0,2) % guess horizontal 
+        hh=get(hcb,'xlabel');
+    else % guess vertical 
+        hh=get(hcb,'ylabel');
+    end
+else % ColorWheel: always set title
+    hh=get(hcb,'title');
+end
+if numel(options)>1 || ~isempty(options{:}) % if there are options, ...
+    set(hh,'string',labstring,'visible','on',options{:});
+else
+    set(hh,'string',labstring,'visible','on');
+end
+if nargout>0
+    h=hh; % return handle if requested
+end
diff --git a/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/license.txt b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/license.txt
new file mode 100644
index 00000000..8622b937
--- /dev/null
+++ b/workflows/DLCM/experiments/2_avascular_tumour/Project in SciComp - grp20/PDE-Proj/Utils/license.txt	
@@ -0,0 +1,24 @@
+Copyright (c) 2011, Douglas H. Kelley
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the distribution
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/workflows/DLCM/utils/map.m b/workflows/DLCM/utils/map.m
index 7e586245..d9138873 100644
--- a/workflows/DLCM/utils/map.m
+++ b/workflows/DLCM/utils/map.m
@@ -20,5 +20,5 @@
 str = cumsum([1 cellfun('prodofsize',varargin)]);
 varargout = cell(1,numel(varargin));
 for i = 1:numel(varargout)
-  varargout{i} = L(ix(str(i):str(i+1)-1));
+    varargout{i} = L(ix(str(i):str(i+1)-1));
 end
diff --git a/workflows/T=1.png b/workflows/T=1.png
new file mode 100644
index 00000000..62ddb6e3
Binary files /dev/null and b/workflows/T=1.png differ
diff --git a/workflows/T=11.png b/workflows/T=11.png
new file mode 100644
index 00000000..749ae968
Binary files /dev/null and b/workflows/T=11.png differ
diff --git a/workflows/T=3.png b/workflows/T=3.png
new file mode 100644
index 00000000..7c946070
Binary files /dev/null and b/workflows/T=3.png differ
diff --git a/workflows/T=6.png b/workflows/T=6.png
new file mode 100644
index 00000000..b5fc9728
Binary files /dev/null and b/workflows/T=6.png differ
diff --git a/workflows/T=9.png b/workflows/T=9.png
new file mode 100644
index 00000000..1a2b85d1
Binary files /dev/null and b/workflows/T=9.png differ
diff --git a/workflows/Tumour.gif b/workflows/Tumour.gif
new file mode 100644
index 00000000..3489e778
Binary files /dev/null and b/workflows/Tumour.gif differ