diff --git a/SpMat.h b/SpMat.h
index 07377c7b19f38ff9fb82750a8dc3d8db653f405a..ae81cc6fb155164c2ff91d1d8e3f1f7d8bed6210 100644
--- a/SpMat.h
+++ b/SpMat.h
@@ -25,6 +25,8 @@
 #include <vector>
 #include <fstream>
 #include <iomanip>
+#include <thread>
+#include <chrono>
 #include "armawrap/newmat.h"
 #include "cg.h"
 #include "bicg.h"
@@ -89,16 +91,18 @@ template<class T>
 class SpMat
 {
 public:
-  SpMat() : _m(0), _n(0), _nz(0), _ri(0), _val(0), _pw(false) {}
-  SpMat(unsigned int m, unsigned int n) : _m(m), _n(n), _nz(0), _ri(n), _val(n), _pw(false) {}
-  SpMat(unsigned int m, unsigned int n, const unsigned int *irp, const unsigned int *jcp, const double *sp);
-  SpMat(const NEWMAT::GeneralMatrix& M);
-  SpMat(const std::string& fname);
+  SpMat() : _m(0), _n(0), _nz(0), _ri(0), _val(0), _pw(false), _nt(1) {}
+  SpMat(unsigned int m, unsigned int n, unsigned int nt=1) : _m(m), _n(n), _nz(0), _ri(n), _val(n), _pw(false), _nt(nt) {}
+  SpMat(unsigned int m, unsigned int n, const unsigned int *irp, const unsigned int *jcp, const double *sp, unsigned int nt=1);
+  SpMat(const NEWMAT::GeneralMatrix& M, unsigned int nt=1);
+  SpMat(const std::string& fname, unsigned int nt=1);
   ~SpMat() {}
 
   unsigned int Nrows() const {return(_m);}
   unsigned int Ncols() const {return(_n);}
   unsigned int NZ() const {return(_nz);}
+  unsigned int Nthreads() const {return(_nt);}
+  void SetNthreads(unsigned int nt) {_nt=nt;}
 
   NEWMAT::ReturnMatrix AsNEWMAT() const;
   void Save(const std::string&   fname,
@@ -118,7 +122,7 @@ public:
 
   void Set(unsigned int r, unsigned int c, const T& v) {here(r,c) = v;}             // Set a single value
   void SetColumn(unsigned int c, const NEWMAT::ColumnVector& col, double eps=0.0);  // Set a whole column (obliterating what was there before)
-  SpMat<T>& MultiplyColumns(const NEWMAT::Matrix& vals);                             // Multiply each column of a matrix by a value
+  SpMat<T>& MultiplyColumns(const NEWMAT::Matrix& vals);                            // Multiply each column of a matrix by a value
   void AddTo(unsigned int r, unsigned int c, const T& v) {here(r,c) += v;}          // Add value to a single (possibly existing) value
 
   SpMat<T>& operator+=(const SpMat& M)
@@ -131,18 +135,18 @@ public:
     if (same_sparsity(M)) return(add_same_sparsity_mat_to_me(M,-1));
     else return(add_diff_sparsity_mat_to_me(M,-1));
   }
-  const NEWMAT::ReturnMatrix operator*(const NEWMAT::ColumnVector& x) const;       // Multiplication with column vector
-  const NEWMAT::ReturnMatrix trans_mult(const NEWMAT::ColumnVector& x) const;      // Multiplication of transpose with column vector
+  const NEWMAT::ReturnMatrix operator*(const NEWMAT::ColumnVector& x) const;       // Multiplication with column vector. Has multi-thread support.
+  const NEWMAT::ReturnMatrix trans_mult(const NEWMAT::ColumnVector& x) const;      // Multiplication of transpose with column vector. Has multi-thread support.
   const NEWMAT::ReturnMatrix TransMult(const NEWMAT::ColumnVector& x) const {
     return(trans_mult(x));                                                         // Duplication for compatibility with IML++
   }
-  const SpMat<T> TransMult(const SpMat<T>& B) const { return(this->t()*B); }       // Multiplication of transpose(*this) with sparse matrix B
+  const SpMat<T> TransMult(const SpMat<T>& B) const { return(this->t()*B); }       // Multiplication of transpose(*this) with sparse matrix B. Has multi-thread support.
   SpMat<T>& operator*=(double s);                                                  // Multiplication of self with scalar
   SpMat<T> operator-(const SpMat<T>& M) const {return(SpMat<T>(M) *= -1.0);}       // Unary minus
   SpMat<T>& operator|=(const SpMat<T>& rh);                                        // Hor concat to right
   SpMat<T>& operator&=(const SpMat<T>& bh);                                        // Vert concat below
 
-  const SpMat<T> TransMultSelf() const {return(this->t() * *this);}                 // Returns transpose(*this)*(*this)
+  const SpMat<T> TransMultSelf() const {return(this->t() * *this);}                 // Returns transpose(*this)*(*this). Has multi-thread support.
 
   const SpMat<T> t() const;                                                        // Returns transpose(*this).
 
@@ -182,9 +186,12 @@ public:
   ColumnIterator end(unsigned int col) const { if (col>_n) throw SpMatException("ColumnIterator: col out of range"); return(ColumnIterator(*this,col,true)); }
 
   template<class TT>
-  friend const SpMat<TT> operator*(const SpMat<TT>& lh, const SpMat<TT>& rh);      // Multiplication of two sparse matrices
+  friend const SpMat<TT> operator*(const SpMat<TT>& lh, const SpMat<TT>& rh);      // Multiplication of two sparse matrices. Has multi-thread support.
 
-  NEWMAT::ReturnMatrix SolveForx(const NEWMAT::ColumnVector&           b,          // Solve for x in b=(*this)*x
+  template<class TT>
+  friend bool operator==(const SpMat<TT>& lh, const SpMat<TT>& rh);
+
+  NEWMAT::ReturnMatrix SolveForx(const NEWMAT::ColumnVector&           b,          // Solve for x in b=(*this)*x . Has multi-thread support.
                                  MatrixType                            type = UNKNOWN,
                                  double                                tol = 1e-4,
                                  unsigned int                          miter = 200,
@@ -209,12 +216,13 @@ protected:
   T& here(unsigned int r, unsigned int c);
 
 private:
-  unsigned int                              _m;
-  unsigned int                              _n;
-  unsigned long                             _nz;
-  std::vector<std::vector<unsigned int> >   _ri;
-  std::vector<std::vector<T> >              _val;
-  bool                                      _pw;   // Print Warnings
+  unsigned int                              _m;        /// No. of rows
+  unsigned int                              _n;        /// No. of columns
+  unsigned long                             _nz;       /// No. of non-zero elements
+  std::vector<std::vector<unsigned int> >   _ri;       /// Vector of vectors (one per column) of row-indicies
+  std::vector<std::vector<T> >              _val;      /// Vector of vectors (one per column) of values
+  bool                                      _pw;       /// Print Warnings
+  unsigned int                              _nt;       /// Number of threads
 
   bool found(const std::vector<unsigned int>&  ri, unsigned int key, int& pos) const;
   void insert(std::vector<unsigned int>& vec, int indx, unsigned int val);
@@ -223,6 +231,37 @@ private:
   SpMat<T>& add_same_sparsity_mat_to_me(const SpMat<T>& M, double s);
   SpMat<T>& add_diff_sparsity_mat_to_me(const SpMat<T>& M, double s);
   bool is_sorted() const;
+  bool is_odd(int numerator) const { return(numerator%2==1); }
+  std::vector<unsigned int> columns_per_thread(bool balance=true) const;
+  void mul_by_vec_helper(// Input
+			 unsigned int                                   first,
+			 unsigned int                                   last,
+			 unsigned int                                   tid,       // Thread number
+			 std::vector<std::thread>&                      threads,   // All threads
+			 const double                                   *xp,
+			 // Output
+			 std::vector<double *>                          vec_bp) const;
+  void trans_mult_helper(// Input
+			 unsigned int first,
+			 unsigned int last,
+			 const double *xp,
+			 // Output
+			 double       *bp) const;
+  void matrix_mul_helper(// Input
+			 unsigned int    first,
+			 unsigned int    last,
+			 const SpMat<T>& lh,
+			 // Output
+			 SpMat<T>&       out) const;
+  void add_same_sparsity_helper(unsigned int    first,
+				unsigned int    last,
+				const SpMat<T>& M,
+				double          s);
+  void add_diff_sparsity_helper(unsigned int    first,
+				unsigned int    last,
+				const SpMat<T>& M,
+				double          s,
+				unsigned int&   nz);
 };
 
 //@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@ -346,8 +385,8 @@ private:
 /////////////////////////////////////////////////////////////////////
 
 template<class T>
-SpMat<T>::SpMat(unsigned int m, unsigned int n, const unsigned int *irp, const unsigned int *jcp, const double *sp)
-  : _m(m), _n(n), _nz(0), _ri(n), _val(n), _pw(false)
+SpMat<T>::SpMat(unsigned int m, unsigned int n, const unsigned int *irp, const unsigned int *jcp, const double *sp, unsigned int nt)
+: _m(m), _n(n), _nz(0), _ri(n), _val(n), _pw(false), _nt(nt)
 {
   _nz = jcp[n];
   unsigned long nz = 0;
@@ -376,8 +415,8 @@ SpMat<T>::SpMat(unsigned int m, unsigned int n, const unsigned int *irp, const u
 /////////////////////////////////////////////////////////////////////
 
 template<class T>
-SpMat<T>::SpMat(const NEWMAT::GeneralMatrix& M)
-  : _m(M.Nrows()), _n(M.Ncols()), _nz(0), _ri(M.Ncols()), _val(M.Ncols()), _pw(false)
+SpMat<T>::SpMat(const NEWMAT::GeneralMatrix& M, unsigned int nt)
+: _m(M.Nrows()), _n(M.Ncols()), _nz(0), _ri(M.Ncols()), _val(M.Ncols()), _pw(false), _nt(nt)
 {
   double *m = static_cast<double *>(M.Store());
 
@@ -412,8 +451,8 @@ SpMat<T>::SpMat(const NEWMAT::GeneralMatrix& M)
 /////////////////////////////////////////////////////////////////////
 
 template<class T>
-SpMat<T>::SpMat(const std::string&  fname)
-: _m(0), _n(0), _nz(0), _ri(0), _val(0), _pw(false)
+SpMat<T>::SpMat(const std::string&  fname, unsigned int nt)
+: _m(0), _n(0), _nz(0), _ri(0), _val(0), _pw(false), _nt(nt)
 {
   // First read data into (nz+1)x3 NEWMAT matrix
   NEWMAT::Matrix rcv;
@@ -612,7 +651,7 @@ NEWMAT::ReturnMatrix SpMat<T>::SolveForx(const NEWMAT::ColumnVector&
     status = BiCG(*this,x,b,*M,liter,tol);
     break;
   default:
-    throw SpMatException("SolveForx: No idea how you got here. But you shouldn't be here, punk.");
+    throw SpMatException("SolveForx: No idea how you got here. But you shouldn't be here.");
   }
 
   if (status && _pw) {
@@ -635,11 +674,12 @@ template<class T>
 const SpMat<T> SpMat<T>::t() const
 {
   SpMat<T>         t_mat(_n,_m);
-  // First make a list of number of elements in each row of *this (columns of t_mat)
+  // First make a list of number of elements in each row of *this (columns of t_mat).
   std::vector<unsigned int> no_per_row(_m,static_cast<unsigned int>(0));
   for (unsigned int i=0; i<_n; i++) {
     for (unsigned int j=0; j<_ri[i].size(); j++) no_per_row[_ri[i][j]]++;
   }
+
   // A second pass to allocate storage
   for (unsigned int i=0; i<_m; i++) {
     t_mat._ri[i].resize(no_per_row[i]);
@@ -764,18 +804,76 @@ const NEWMAT::ReturnMatrix SpMat<T>::operator*(const NEWMAT::ColumnVector& x) co
   const double *xp = static_cast<double *>(x.Store());
   double       *bp = static_cast<double *>(b.Store());
 
-  for (unsigned int c=0; c<_n; c++) {
+  if (_nt==1) { // If we are to run single-threaded
+    for (unsigned int c=0; c<_n; c++) {
+      if (_ri[c].size()) {
+	double                            wgt = xp[c];
+	const std::vector<unsigned int>&  ri = _ri[c];
+	const std::vector<T>&             val = _val[c];
+	for (unsigned int i=0; i<ri.size(); i++) {
+	  bp[ri[i]] += static_cast<double>(wgt*val[i]);
+	}
+      }
+    }
+  }
+  else { // We are to run multi-threaded
+    std::vector<unsigned int> col_pt = this->columns_per_thread();
+    std::vector<std::thread> threads(_nt-1); // + main thread makes _nt
+    std::unique_ptr<double[]> bp_at(new double[_m*(_nt-1)]);
+    std::vector<double *> vec_bp(_nt);
+    for (unsigned int i=0; i<_nt-1; i++) vec_bp[i] = &bp_at[i*_m];
+    vec_bp[_nt-1] = bp; // This is where the results will be summed in the end
+    for (unsigned int i=0; i<_nt-1; i++) {
+      threads[i] = std::thread(&SpMat<T>::mul_by_vec_helper,this,col_pt[i],
+			       col_pt[i+1],i,std::ref(threads),xp,std::ref(vec_bp));
+    }
+    this->mul_by_vec_helper(col_pt[_nt-1],col_pt[_nt],_nt-1,threads,xp,vec_bp);
+  }
+  b.Release();
+  return(b);
+}
+
+template<class T>
+void SpMat<T>::mul_by_vec_helper(// Input
+				 unsigned int                 first,
+				 unsigned int                 last,
+				 unsigned int                 tid,       // Thread number
+				 std::vector<std::thread>&    threads,
+				 const double                 *xp,
+				 // Output
+				 std::vector<double *>        vec_bp) const
+{
+  // static std::mutex cout_mutex;
+
+  // Do the columns that you are resonsible for
+  double *bp = vec_bp[tid];
+  std::memset(bp,0,_m*sizeof(double));
+  for (unsigned int c=first; c<last; c++) { // If we are to run single-threaded
     if (_ri[c].size()) {
       double                            wgt = xp[c];
       const std::vector<unsigned int>&  ri = _ri[c];
       const std::vector<T>&             val = _val[c];
       for (unsigned int i=0; i<ri.size(); i++) {
-        bp[ri[i]] += static_cast<double>(wgt*val[i]);
+	bp[ri[i]] += static_cast<double>(wgt*val[i]);
       }
     }
   }
-  b.Release();
-  return(b);
+  // Then add up results with other threads
+  int step = 1;                                        // How far to the left your neighbour is
+  while (step < _nt) {
+    if (is_odd(((int(_nt)-1)-int(tid))/step)) return;  // Is not a candidate
+    int neighbour = int(tid)-step;                     // Check who your neighbour to the left is
+    if (neighbour < 0) return;                         // Return if neighbour is out of bounds
+    threads[neighbour].join();                         // Make sure that neighbour has finished
+    // cout_mutex.lock();
+    // std::cout << "I am thread # " << tid << ", and I will add thread # " << neighbour << " to myself" << std::endl;
+    // cout_mutex.unlock();
+    double *my_p = vec_bp[tid];
+    double *ne_p = vec_bp[neighbour];
+    for (unsigned int i=0; i<_m; i++) my_p[i] += ne_p[i];
+    step *= 2;
+  }
+  return;
 }
 
 /////////////////////////////////////////////////////////////////////
@@ -838,19 +936,52 @@ const NEWMAT::ReturnMatrix SpMat<T>::trans_mult(const NEWMAT::ColumnVector& x) c
   const double *xp = static_cast<double *>(x.Store());
   double       *bp = static_cast<double *>(b.Store());
 
-  for (unsigned int c=0; c<_n; c++) {
+  if (_nt == 1) { // If we are to run single-threaded
+    for (unsigned int c=0; c<_n; c++) {
+      double                            res = 0.0;
+      if (_ri[c].size()) {
+	const std::vector<unsigned int>&  ri = _ri[c];
+	const std::vector<T>&             val = _val[c];
+	for (unsigned int i=0; i<ri.size(); i++) {
+	  res += val[i]*xp[ri[i]];
+	}
+      }
+      bp[c] = res;
+    }
+  }
+  else { // We are to run multi-threaded
+    std::vector<unsigned int> col_pt = this->columns_per_thread();
+    std::vector<std::thread> threads(_nt-1); // + main thread makes _nt
+    for (unsigned int i=0; i<_nt-1; i++) {
+      threads[i] = std::thread(&SpMat<T>::trans_mult_helper,this,col_pt[i],col_pt[i+1],xp,bp);
+    }
+    this->trans_mult_helper(col_pt[_nt-1],col_pt[_nt],xp,bp);
+    std::for_each(threads.begin(),threads.end(),std::mem_fn(&std::thread::join));
+  }
+  b.Release();
+  return(b);
+}
+
+template<class T>
+void SpMat<T>::trans_mult_helper(// Input
+                                 unsigned int first,
+				 unsigned int last,
+				 const double *xp,
+				 // Output
+				 double       *bp) const
+{
+  for (unsigned int c=first; c<last; c++) {
     double                            res = 0.0;
     if (_ri[c].size()) {
       const std::vector<unsigned int>&  ri = _ri[c];
       const std::vector<T>&             val = _val[c];
       for (unsigned int i=0; i<ri.size(); i++) {
-        res += val[i]*xp[ri[i]];
+	res += val[i]*xp[ri[i]];
       }
     }
     bp[c] = res;
   }
-  b.Release();
-  return(b);
+  return;
 }
 
 /////////////////////////////////////////////////////////////////////
@@ -945,20 +1076,75 @@ const SpMat<T> operator*(const SpMat<T>& lh, const SpMat<T>& rh)
   if (lh._n != rh._m) throw SpMatException("operator*: Left hand matrix must have same # of columns as right hand has rows");
 
   SpMat<T>              out(lh._m,rh._n);
-  Accumulator<T>        acc(lh._m);
+  if (rh._nt==1) { // If we are to run single threaded
+    Accumulator<T>        acc(lh._m);
+
+    for (unsigned int cr=0; cr<rh._n; cr++) {
+      acc.Reset();
+      if (rh._ri[cr].size()) {
+	const std::vector<unsigned int>&   rri = rh._ri[cr];
+	const std::vector<T>&             rval = rh._val[cr];
+	for (unsigned int i=0; i<rri.size(); i++) {
+	  if (lh._ri[rri[i]].size()) {
+	    double wgt = rval[i];
+	    const std::vector<unsigned int>&   lri = lh._ri[rri[i]];
+	    const std::vector<T>&             lval = lh._val[rri[i]];
+	    for (unsigned int j=0; j<lri.size(); j++) {
+	      acc(lri[j]) += wgt*lval[j];
+	    }
+	  }
+	}
+      }
+      out._ri[cr].resize(acc.NO());
+      out._val[cr].resize(acc.NO());
+      std::vector<unsigned int>&  ori = out._ri[cr];
+      std::vector<T>&            oval = out._val[cr];
+      for (unsigned int i=0; i<acc.NO(); i++) {
+	ori[i] = acc.ri(i);
+	oval[i] = acc.val(i);
+      }
+      out._nz += acc.NO();
+    }
+  }
+  else { // We are to run multi-threaded
+    std::vector<unsigned int> cpt = rh.columns_per_thread();
+    std::vector<std::thread> threads(rh._nt-1); // + main thread makes rh._nt
+    for (unsigned int i=0; i<rh._nt-1; i++) {
+      threads[i] = std::thread(&SpMat<T>::matrix_mul_helper,&rh,cpt[i],cpt[i+1],std::ref(lh),std::ref(out));
+    }
+    rh.matrix_mul_helper(cpt[rh._nt-1],cpt[rh._nt],lh,out);
+    std::for_each(threads.begin(),threads.end(),std::mem_fn(&std::thread::join));
+  }
 
-  for (unsigned int cr=0; cr<rh._n; cr++) {
+  return(out);
+}
+
+template<class T>
+void SpMat<T>::matrix_mul_helper(// Input
+				 unsigned int    first,
+				 unsigned int    last,
+				 const SpMat<T>& lh,
+				 // Output
+				 SpMat<T>&       out) const
+{
+  static std::mutex nz_mutex;
+
+  // N.B. this is called from the rh matrix, so this refers to that
+  Accumulator<T> acc(lh._m);
+  unsigned int   lnz=0; // Local non-zero
+
+  for (unsigned int cr=first; cr<last; cr++) {
     acc.Reset();
-    if (rh._ri[cr].size()) {
-      const std::vector<unsigned int>&   rri = rh._ri[cr];
-      const std::vector<T>&             rval = rh._val[cr];
+    if (this->_ri[cr].size()) {
+      const std::vector<unsigned int>&   rri = this->_ri[cr];
+      const std::vector<T>&             rval = this->_val[cr];
       for (unsigned int i=0; i<rri.size(); i++) {
-        if (lh._ri[rri[i]].size()) {
-          double wgt = rval[i];
-          const std::vector<unsigned int>&   lri = lh._ri[rri[i]];
-          const std::vector<T>&             lval = lh._val[rri[i]];
-          for (unsigned int j=0; j<lri.size(); j++) {
-            acc(lri[j]) += wgt*lval[j];
+	if (lh._ri[rri[i]].size()) {
+	  double wgt = rval[i];
+	  const std::vector<unsigned int>&   lri = lh._ri[rri[i]];
+	  const std::vector<T>&             lval = lh._val[rri[i]];
+	  for (unsigned int j=0; j<lri.size(); j++) {
+	    acc(lri[j]) += wgt*lval[j];
 	  }
 	}
       }
@@ -971,10 +1157,13 @@ const SpMat<T> operator*(const SpMat<T>& lh, const SpMat<T>& rh)
       ori[i] = acc.ri(i);
       oval[i] = acc.val(i);
     }
-    out._nz += acc.NO();
+    lnz += acc.NO();
   }
+  nz_mutex.lock();
+  out._nz += lnz;
+  nz_mutex.unlock();
 
-  return(out);
+  return;
 }
 
 /////////////////////////////////////////////////////////////////////
@@ -1069,6 +1258,37 @@ const SpMat<T> operator&(const SpMat<T>& th, const NEWMAT::GeneralMatrix& bh)
   return(SpMat<T>(th) &= SpMat<T>(bh));
 }
 
+/////////////////////////////////////////////////////////////////////
+//
+// Global functions for comparison
+//
+/////////////////////////////////////////////////////////////////////
+
+template<class T>
+bool operator==(const SpMat<T>& lh, const SpMat<T>& rh)
+{
+  // Start by comparing dimensions and sparsity
+  if (lh._m != rh._m || lh._n != rh._n || lh._nz != rh._nz) return(false);
+  // Next compare non-zero elements per column
+  for (unsigned int i=0; i<lh._n; i++) {
+    if (lh._ri[i].size() != rh._ri[i].size()) return(false);
+  }
+  // Finally do a detailed comparison if neccessary
+  for (unsigned int i=0; i<lh._n; i++) {
+    for (unsigned int j=0; j<lh._ri[i].size(); j++) {
+      if (lh._ri[i][j] != rh._ri[i][j] || lh._val[i][j] != rh._val[i][j]) {
+	// cout << "lh._ri[" << i << "][" << j << "] = " << lh._ri[i][j] << ", rh._ri[" << i << "][" << j << "] = " << rh._ri[i][j] << endl;
+	// cout << "lh._val[" << i << "][" << j << "] = " << lh._val[i][j] << ", rh._val[" << i << "][" << j << "] = " << rh._val[i][j] << endl;
+	return(false);
+      }
+    }
+  }
+  return(true);
+}
+
+template<class T>
+bool operator!=(const SpMat<T>& lh, const SpMat<T>& rh) { return(!(lh==rh)); }
+
 /*###################################################################
 ##
 ## Here starts protected functions
@@ -1211,16 +1431,46 @@ bool SpMat<T>::same_sparsity(const SpMat<T>& M) const
 template<class T>
 SpMat<T>& SpMat<T>::add_same_sparsity_mat_to_me(const SpMat<T>& M, double s)
 {
-  for (unsigned int c=0; c<_n; c++) {
+  if (_nt==1) { // If we are to run single threaded
+    for (unsigned int c=0; c<_n; c++) {
+      if (_val[c].size()) {
+	std::vector<T>&          val = _val[c];
+	const std::vector<T>&    Mval = M._val[c];
+	for (unsigned int i=0; i<val.size(); i++) {
+	  val[i] += s*Mval[i];
+	}
+      }
+    }
+  }
+  else { // We are to run multi threaded
+    std::vector<unsigned int> col_pt = this->columns_per_thread();
+    std::vector<std::thread> threads(_nt-1); // + main thread makes _nt
+    for (unsigned int i=0; i<_nt-1; i++) {
+      threads[i] = std::thread(&SpMat<T>::add_same_sparsity_helper,this,
+			       col_pt[i],col_pt[i+1],std::ref(M),s);
+    }
+    this->add_same_sparsity_helper(col_pt[_nt-1],col_pt[_nt],M,s);
+    std::for_each(threads.begin(),threads.end(),std::mem_fn(&std::thread::join));
+  }
+  return(*this);
+}
+
+template<class T>
+void SpMat<T>::add_same_sparsity_helper(unsigned int    first,
+					unsigned int    last,
+					const SpMat<T>& M,
+					double          s)
+{
+  for (unsigned int c=first; c<last; c++) {
     if (_val[c].size()) {
       std::vector<T>&          val = _val[c];
       const std::vector<T>&    Mval = M._val[c];
       for (unsigned int i=0; i<val.size(); i++) {
-        val[i] += s*Mval[i];
+	val[i] += s*Mval[i];
       }
     }
   }
-  return(*this);
+  return;
 }
 
 /////////////////////////////////////////////////////////////////////
@@ -1234,10 +1484,58 @@ SpMat<T>& SpMat<T>::add_diff_sparsity_mat_to_me(const SpMat<T>& M, double s)
 {
   if (_m != M._m || _n != M._n) throw SpMatException("add_diff_sparsity_mat_to_me: Size mismatch between matrices");
 
+  if (_nt==1) { // If we are to run single threaded
+    Accumulator<T> acc(_m);
+    _nz = 0;
+    for (unsigned int c=0; c<_n; c++) {
+      acc.Reset();
+      if (M._ri[c].size()) {
+	const std::vector<unsigned int>&        Mri = M._ri[c];
+	const std::vector<T>&                   Mval = M._val[c];
+	for (unsigned int i=0; i<Mri.size(); i++) {
+	  acc(Mri[i]) += s*Mval[i];
+	}
+	std::vector<unsigned int>&        ri = _ri[c];
+	std::vector<T>&                   val = _val[c];
+	for (unsigned int i=0; i<ri.size(); i++) {
+	  acc(ri[i]) += val[i];
+	}
+	ri.resize(acc.NO());
+	val.resize(acc.NO());
+	for (unsigned int i=0; i<acc.NO(); i++) {
+	  ri[i] = acc.ri(i);
+	  val[i] = acc.val(i);
+	}
+	_nz += acc.NO();
+      }
+    }
+  }
+  else { // We are to run multi-threaded
+    std::vector<unsigned int> col_pt = this->columns_per_thread(false);
+    std::vector<std::thread> threads(_nt-1); // + main thread makes _nt
+    std::vector<unsigned int> nz(_nt,0);        // Non-zero elements per thread
+    for (unsigned int i=0; i<_nt-1; i++) {
+      threads[i] = std::thread(&SpMat<T>::add_diff_sparsity_helper,this,col_pt[i],
+			       col_pt[i+1],std::ref(M),s,std::ref(nz[i]));
+    }
+    this->add_diff_sparsity_helper(col_pt[_nt-1],col_pt[_nt],M,s,nz[_nt-1]);
+    std::for_each(threads.begin(),threads.end(),std::mem_fn(&std::thread::join));
+    _nz = std::accumulate(nz.begin(),nz.end(),0u);
+  }
+  return(*this);
+}
+
+template<class T>
+void SpMat<T>::add_diff_sparsity_helper(unsigned int    first,
+					unsigned int    last,
+					const SpMat<T>& M,
+					double          s,
+					unsigned int&   nz)
+{
   Accumulator<T> acc(_m);
 
-  _nz = 0;
-  for (unsigned int c=0; c<_n; c++) {
+  nz = 0;
+  for (unsigned int c=first; c<last; c++) {
     acc.Reset();
     if (M._ri[c].size()) {
       const std::vector<unsigned int>&        Mri = M._ri[c];
@@ -1256,10 +1554,10 @@ SpMat<T>& SpMat<T>::add_diff_sparsity_mat_to_me(const SpMat<T>& M, double s)
         ri[i] = acc.ri(i);
         val[i] = acc.val(i);
       }
-      _nz += acc.NO();
+      nz += acc.NO();
     }
   }
-  return(*this);
+  return;
 }
 
 /////////////////////////////////////////////////////////////////////
@@ -1283,6 +1581,47 @@ bool SpMat<T>::is_sorted() const
   return(true);
 }
 
+/////////////////////////////////////////////////////////////////////
+//
+// Returns a vector vec where vec[i] and vec[i+1] contains the first
+// column and one past the last column respectively for thread i.
+// It is based on trying to balance the number of non-zero elements
+// between the different threads.
+//
+/////////////////////////////////////////////////////////////////////
+template<class T>
+std::vector<unsigned int> SpMat<T>::columns_per_thread(bool balance) const
+{
+  std::vector<unsigned int> cpt(_nt+1,0);
+  if (balance) { // Try to balance number of non-zero elements between threads
+    unsigned int nnz_per_thread = std::lround(static_cast<double>(_nz)/static_cast<double>(_nt));
+    unsigned int cum_nnz = 0;
+    unsigned int ti = 1;
+    for (unsigned int c=0; c<_n; c++) {
+      cum_nnz += _ri[c].size();
+      if (cum_nnz >= nnz_per_thread) {
+	if (std::abs(int(cum_nnz - nnz_per_thread)) < std::abs(int(cum_nnz - _ri[c].size() - nnz_per_thread))) { // If this c is best
+	  cpt[ti] = c+1;
+	  cum_nnz = 0;
+	}
+	else { // If previous c is best
+	  cpt[ti] = c;
+	  cum_nnz = _ri[c].size();
+	}
+	if (ti < _nt-1) ti++;
+	else break;
+      }
+    }
+    cpt[_nt] = _n;
+  }
+  else { // If we just want the same number of columns
+    double dcpt = static_cast<double>(_n) / static_cast<double>(_nt);
+    for (unsigned int i=1; i<_nt; i++) cpt[i] = static_cast<unsigned int>(std::floor(i*dcpt));
+    cpt[_nt] = _n;
+  }
+  return(cpt);
+}
+
 /*
 template<class T>
 SpMat<T>& SpMat<T>::add_diff_sparsity_mat_to_me(const SpMat<T>& M, double s)
diff --git a/bfmatrix.h b/bfmatrix.h
index 2ba2d7f6ec544a1fa948f43cf2a0b566097fe531..92ef6e39cdd64155632cbf8ed985cd0d0088c39a 100644
--- a/bfmatrix.h
+++ b/bfmatrix.h
@@ -76,6 +76,9 @@ public:
   // Basic properties
   virtual unsigned int Nrows() const = 0;
   virtual unsigned int Ncols() const = 0;
+  virtual unsigned int Nthreads() const = 0;
+  virtual void SetNthreads(unsigned int nt) = 0;
+  virtual unsigned int NZ() const = 0;
 
   // Print matrix (for debugging)
   virtual void Print(const std::string fname=std::string("")) const = 0;
@@ -137,14 +140,14 @@ public:
   // Constructors, destructor and assignment
   SparseBFMatrix()
   : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>())) {}
-  SparseBFMatrix(unsigned int m, unsigned int n)
-  : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(m,n))) {}
-  SparseBFMatrix(unsigned int m, unsigned int n, const unsigned int *irp, const unsigned int *jcp, const double *sp)
-  : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(m,n,irp,jcp,sp))) {}
+  SparseBFMatrix(unsigned int m, unsigned int n, unsigned int nt=1)
+  : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(m,n,nt))) {}
+  SparseBFMatrix(unsigned int m, unsigned int n, const unsigned int *irp, const unsigned int *jcp, const double *sp, unsigned int nt=1)
+  : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(m,n,irp,jcp,sp,nt))) {}
   SparseBFMatrix(const MISCMATHS::SpMat<T>& M)
   : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(M))) {}
-  SparseBFMatrix(const NEWMAT::Matrix& M)
-  : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(M))) {}
+  SparseBFMatrix(const NEWMAT::Matrix& M, unsigned int nt=1)
+  : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(M,nt))) {}
   SparseBFMatrix(const SparseBFMatrix<T>& M)
   : mp(std::shared_ptr<MISCMATHS::SpMat<T> >(new MISCMATHS::SpMat<T>(*(M.mp)))) {}
   virtual ~SparseBFMatrix() {}
@@ -157,9 +160,15 @@ public:
   // Access as NEWMAT::Matrix
   virtual NEWMAT::ReturnMatrix AsMatrix() const {NEWMAT::Matrix ret; ret = mp->AsNEWMAT(); ret.Release(); return(ret);}
 
+  // Access as MISCMATHS::SpMat<T> matrix. Cannot be used from pointer to base-class.
+  MISCMATHS::SpMat<T> AsSpMat() const {return(*mp);}
+
   // Basic properties
   virtual unsigned int Nrows() const {return(mp->Nrows());}
   virtual unsigned int Ncols() const {return(mp->Ncols());}
+  virtual unsigned int Nthreads() const {return(mp->Nthreads());}
+  virtual void SetNthreads(unsigned int nt) {mp->SetNthreads(nt);}
+  virtual unsigned int NZ() const {return(mp->NZ());}
 
   // Print matrix (for debugging)
   virtual void Print(const std::string fname=std::string("")) const {mp->Print(fname);}
@@ -236,6 +245,9 @@ public:
   // Basic properties
   virtual unsigned int Nrows() const {return(mp->Nrows());}
   virtual unsigned int Ncols() const {return(mp->Ncols());}
+  virtual unsigned int Nthreads() const {return(1);}
+  virtual void SetNthreads(unsigned int nt) {} // Null function
+  virtual unsigned int NZ() const {return(mp->Nrows()*mp->Ncols());}
 
   // Print matrix (for debugging)
   virtual void Print(const std::string fname=std::string("")) const;