From d4a9f9f7c42a1903f063d9abbb21a3eda2bdbdfc Mon Sep 17 00:00:00 2001
From: Moises Fernandez <moisesf@fmrib.ox.ac.uk>
Date: Thu, 28 Feb 2013 15:02:34 +0000
Subject: [PATCH] new version Levenberg: several threads per voxel

---
 CUDA/PVM_multi.cu           | 678 +++++++++++++++-----------
 CUDA/PVM_single.cu          | 633 +++++++++++++-----------
 CUDA/PVM_single_c.cu        | 927 +++++++++++++++++++-----------------
 CUDA/levenberg_marquardt.cu | 386 ++++++++-------
 4 files changed, 1474 insertions(+), 1150 deletions(-)

diff --git a/CUDA/PVM_multi.cu b/CUDA/PVM_multi.cu
index 839b19d..31dd264 100644
--- a/CUDA/PVM_multi.cu
+++ b/CUDA/PVM_multi.cu
@@ -17,42 +17,48 @@
 /////////////////////////////////////
 
 __device__ inline double isoterm_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){
-	return exp(double(-_a*log(double(1+bvals[pt]*_b))));
+	return exp(-_a*log(1+bvals[pt]*_b));
 }
 
 __device__ inline double isoterm_a_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){
-    	return  -log(double(1+bvals[pt]*_b))*exp(double(-_a*log(double(1+bvals[pt]*_b))));
+    	return  -log(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b));
 }
 
 __device__ inline double isoterm_b_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){
-      	return -_a*bvals[pt]/(1+bvals[pt]*_b)*exp(double(-_a*log(double(1+bvals[pt]*_b))));
+      	return -_a*bvals[pt]/(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b));
 }
 
 __device__ inline double anisoterm_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-  	return exp(double(-_a*log(double(1+bvals[pt]*_b*(dp*dp)))));
+  	return exp(-_a*log(1+bvals[pt]*_b*(dp*dp)));
 }
  
 __device__ inline double anisoterm_a_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-  	return -log(double(1+bvals[pt]*(dp*dp)*_b))* exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))));
+  	return -log(1+bvals[pt]*(dp*dp)*_b)* exp(-_a*log(1+bvals[pt]*(dp*dp)*_b));
 }
 
 __device__ inline double anisoterm_b_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){
   	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-  	return (-_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b)))));
+  	return (-_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b)));
 }
 
 __device__ inline double anisoterm_th_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){
+	double sinth,costh,sinph,cosph;
+	sincos(_th,&sinth,&costh);
+	sincos(_ph,&sinph,&cosph);
   	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-  	double dp1 = cos(double(_th))* (bvecs[pt]*cos(double(_ph)) + bvecs[NDIRECTIONS+pt]*sin(double(_ph))) - bvecs[(2*NDIRECTIONS)+pt]*sin(double(_th));
-	return  (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))*2*dp*dp1);	
+  	double dp1 = costh* (bvecs[pt]*cosph + bvecs[NDIRECTIONS+pt]*sinph) - bvecs[(2*NDIRECTIONS)+pt]*sinth;
+	return  (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1);	
 }
 
 __device__ inline double anisoterm_ph_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){
+	double sinth,sinph,cosph;
+	sinth=sin(_th);
+	sincos(_ph,&sinph,&cosph);
   	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-  	double dp1 = sin(double(_th))* (-bvecs[pt]*sin(double(_ph)) + bvecs[NDIRECTIONS+pt]*cos(double(_ph)));
-  	return  (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))*2*dp*dp1);
+  	double dp1 = sinth* (-bvecs[pt]*sinph + bvecs[NDIRECTIONS+pt]*cosph);
+  	return  (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1);
 }
 
 //in diffmodel.cc
@@ -79,147 +85,99 @@ __device__ void fix_fsum_PVM_multi(	//INPUT
 }
 
 //in diffmodel.cc
-__device__ void sort_PVM_multi(int nfib,int nparams,double* params)
+__device__ void sort_PVM_multi(int nfib,double* params)
 {
 	double temp_f, temp_th, temp_ph;
 	// Order vector descending using f parameters as index
 	for(int i=1; i<(nfib); i++){ 
     		for(int j=0; j<(nfib-i); j++){ 
-      			if (params[3+j*3] < params[3+i*3]){ 
+      			if (params[3+j*3] < params[3+(j+1)*3]){ 
         			temp_f = params[3+j*3];
 				temp_th = params[3+j*3+1];
 				temp_ph = params[3+j*3+2];
-        			params[3+j*3] = params[3+i*3]; 
-				params[3+j*3+1] = params[3+i*3+1]; 
-				params[3+j*3+2] = params[3+i*3+2]; 
-        			params[3+i*3] = temp_f; 
-				params[3+i*3+1] = temp_th; 
-				params[3+i*3+2] = temp_ph; 
+        			params[3+j*3] = params[3+(j+1)*3]; 
+				params[3+j*3+1] = params[3+(j+1)*3+1]; 
+				params[3+j*3+2] = params[3+(j+1)*3+2]; 
+        			params[3+(j+1)*3] = temp_f; 
+				params[3+(j+1)*3+1] = temp_th; 
+				params[3+(j+1)*3+2] = temp_ph; 
       			} 
     		} 
   	} 
 }
 
-//in diffmodels.cc -- for calculate residuals
-__device__ void  forwardModel_PVM_multi(	//INPUT
-						const double* 		p,
-						const double*		bvecs, 
-						const double*		bvals,
-						const int		nfib,
-						const int 		nparams,
-						const bool 		m_include_f0,
-						//OUTPUT
-						double*		 	predicted_signal)
-{
-  	for(int i=0;i<NDIRECTIONS;i++){
-		predicted_signal[i]=0;		//pred = 0;
-	}
-  	double val;
-  	double _a = abs(p[1]);
-  	double _b = abs(p[2]);
-  	////////////////////////////////////
-  	double fs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double sumf=0;
-	double3 x2;
-  	for(int k=0;k<nfib;k++){
-    		int kk = 3+3*k;
-	    	fs[k] = x2f_gpu(p[kk]);
-	    	sumf += fs[k];
-		x[k*3] = sin(p[kk+1])*cos(p[kk+2]);
-    		x[k*3+1] = sin(p[kk+1])*sin(p[kk+2]);
-    		x[k*3+2] = cos(p[kk+1]);
-  	}
-  	////////////////////////////////////
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		val = 0.0;
-    		for(int k=0;k<nfib;k++){
-			x2.x=x[k*3];
-			x2.y=x[k*3+1];
-			x2.z=x[k*3+2];	 
-      			val += fs[k]*anisoterm_PVM_multi(i,_a,_b,x2,bvecs,bvals);
-    		}	
-    		if (m_include_f0){
-      			double temp_f0=x2f_gpu(p[nparams-1]);
-      			predicted_signal[i] = abs(p[0])*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+val);
-    		} 
-    		else
-      			predicted_signal[i] = abs(p[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+val); 
-  		}   
-}
-
-//in diffmodels.cc -- for calculate residuals
-__device__ void get_prediction_PVM_multi(	//INPUT
-						const double*	params,
-						const double*	bvecs, 
-						const double*	bvals,
-						const int 	nfib,
-						const int 	nparams,
-						const bool 	m_include_f0,
-						//OUTPUT
-						double* 	predicted_signal)
-{
-	//m_s0-myparams[0] 	m_d-myparams[1] 	m_d_std-myparams[2]		m_f-m_th-m_ph-myparams[3,4,5,6 etc..]   	m_f0-myparams[nparams-1]
-  	double p[NPARAMS];
-  	p[0] = params[0];
-  	p[1] = params[1]*params[1]/params[2]/params[2];		//m_d*m_d/m_d_std/m_d_std;
-  	p[2] = params[2]*params[2]/params[1];			//m_d_std*m_d_std/m_d; // =1/beta
-  	for(int k=0;k<nfib;k++){
-    		int kk = 3+3*k;
-    		p[kk]   = f2x_gpu(params[kk]);
-    		p[kk+1] = params[kk+1];
-    		p[kk+2] = params[kk+2];
-  	}
-  	if (m_include_f0)
-    		p[nparams-1]=f2x_gpu(params[nparams-1]);
-  	forwardModel_PVM_multi(p,bvecs,bvals,nfib,nparams,m_include_f0,predicted_signal);
-}
-
 //cost function PVM_multi
-__device__ double cf_PVM_multi(		//INPUT
+__device__ void cf_PVM_multi(		//INPUT
 					const double*		params,
 					const double*		mdata,
 					const double*		bvecs, 
 					const double*		bvals,
 					const int 		nparams,
-					const bool 		m_include_f0)
+					const bool 		m_include_f0,
+					const int		idB,
+					double*			shared,		//shared memory
+					double* 		fs,		//shared memory
+					double*			x,		//shared memory	
+					double 			&_a,		//shared memory
+					double 			&_b,		//shared memory
+					double 			&sumf,		//shared memory
+					//OUTPUT
+					double			&cfv)
 {
-	double cfv = 0.0;
-  	double err;	
-	double _a= abs(params[1]);
-	double _b= abs(params[2]); 
-	double fs[NFIBRES];    
-	double x[NFIBRES*3];	
-	double sumf=0;
+	if(idB<NFIBRES){
+		int kk = 3+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+		fs[idB] = x2f_gpu(params[kk]);
+		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
+  	}
+	if(idB==0){
+		_a= abs(params[1]);
+		_b= abs(params[2]); 
+		cfv = 0.0;
+		sumf=0;
+		for(int k=0;k<NFIBRES;k++) sumf+= fs[k];
+	}
+
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	
+	double err;
 	double3 x2;
+	int dir_iter=idB;
 
-	for(int k=0;k<NFIBRES;k++){
-		int kk = 3+3*(k);
-		fs[k] = x2f_gpu(params[kk]);
-    		sumf += fs[k];
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
-  	}
-	for(int i=0;i<NDIRECTIONS;i++){
+	__syncthreads();
+	
+	shared[idB]=0;
+	for(int dir=0;dir<ndir;dir++){
     		err = 0.0;
     		for(int k=0;k<NFIBRES;k++){
 			x2.x=x[k*3];
 			x2.y=x[k*3+1];
 			x2.z=x[k*3+2];	 
-			err += fs[k]*anisoterm_PVM_multi(i,_a,_b,x2,bvecs,bvals); 
+			err += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,bvecs,bvals); 
     		}
 		if(m_include_f0){
 			double temp_f0=x2f_gpu(params[nparams-1]);
-			err = (abs(params[0])*(temp_f0+((1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+err)))-mdata[i];
+			err = (abs(params[0])*(temp_f0+((1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)))-mdata[dir_iter];
 		}else{
-			err = abs(params[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+err)-mdata[i];
+			err = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)-mdata[dir_iter];
 		}
-		cfv += err*err;
+		shared[idB]+= err*err;  
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}  
-	return(cfv);
-}
 
+	__syncthreads();
+
+	if(idB==0){
+		for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+			cfv+=shared[i];
+		}
+	}	
+}
 
 //gradient function PVM_multi
 __device__ void grad_PVM_multi(		//INPUT
@@ -229,66 +187,94 @@ __device__ void grad_PVM_multi(		//INPUT
 					const double*		bvals,
 					const int 		nparams,
 					const bool 		m_include_f0,
+					const int		idB,
+					double*			shared,		//shared memory
+					double* 		fs,		//shared memory
+					double*			x,		//shared memory	
+					double 			&_a,		//shared memory
+					double 			&_b,		//shared memory
+					double 			&sumf,		//shared memory
 					//OUTPUT
 					double*			grad)
-{
-  	double _a= abs(params[1]);
-  	double _b= abs(params[2]);
-  	double fs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double3 xx;		
-  	double sumf=0;
-
-  	for(int k=0;k<NFIBRES;k++){
-    		int kk = 3+3*(k);	
-    		fs[k] = x2f_gpu(params[kk]);
-    		sumf += fs[k];
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
+{	
+	if(idB<NFIBRES){
+		int kk = 3+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+		fs[idB] = x2f_gpu(params[kk]);
+		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
   	}
+	if(idB==0){
+		_a= abs(params[1]);
+		_b= abs(params[2]); 
+		sumf=0;
+		for(int k=0;k<NFIBRES;k++) sumf+= fs[k];
+		for (int p=0;p<nparams;p++) grad[p]=0;
+	}
 
-  	double J[NPARAMS];
-  	double diff;
-  	double sig;
-  
-	for (int p=0;p<nparams;p++) grad[p]=0;
+  	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
 
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		sig = 0;
-    		for(int a=0;a<nparams;a++) J[a]=0;
-    		for(int k=0;k<NFIBRES;k++){
-      			int kk = 3+3*(k);
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-      			xx.z=x[k*3+2];		
-      			sig += fs[k]*anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals);
-      			J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(i,_a,_b,xx,bvecs,bvals); 
-			J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(i,_a,_b,xx,bvecs,bvals);
-			J[kk] = abs(params[0])*(anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); 
-      			J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);  
-      			J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
-    		}
-    		if(m_include_f0){
-			double temp_f0=x2f_gpu(params[nparams-1]);
-			J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
-			sig=abs(params[0])*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals))+sig);
-    			J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(i,_a,_b,bvals);
-			J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(i,_a,_b,bvals);
-    		}else{
-	    		sig = abs(params[0]) * ((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+sig);
-	    		J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(i,_a,_b,bvals);
-	    		J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(i,_a,_b,bvals);	
-    		}
+	double J[NPARAMS];
+	double diff;
+  	double sig;
+	double3 xx;
+	int dir_iter=idB;
+
+	__syncthreads();
+
+  	for(int dir=0;dir<max_dir;dir++){
+		for (int p=0; p<nparams; p++) J[p]=0;
+		if(dir<ndir){
+    			sig = 0;
+    			for(int k=0;k<NFIBRES;k++){
+      				int kk = 3+3*(k);
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+      				xx.z=x[k*3+2];		
+      				sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
+      				J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); 
+				J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
+				J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); 
+      				J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);  
+      				J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
+    			}
+    			if(m_include_f0){
+				double temp_f0=x2f_gpu(params[nparams-1]);
+				J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
+				sig=abs(params[0])*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals))+sig);
+    				J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
+				J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);
+    			}else{
+	    			sig = abs(params[0]) * ((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig);
+	    			J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
+	    			J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);	
+    			}
     
-    		diff = sig - mdata[i];
-    		J[0] = (params[0]>0?1.0:-1.0)*sig/params[0]; 
+    			diff = sig - mdata[dir_iter];
+    			J[0] = (params[0]>0?1.0:-1.0)*sig/params[0]; 
+		}
 
-		for (int p=0;p<nparams;p++) grad[p] += 2*J[p]*diff;
+		for (int p=0;p<nparams;p++){ 
+			shared[idB]=2*J[p]*diff;
+
+			__syncthreads();
+			if(idB==0){
+				for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+					grad[p] += shared[i];
+				}
+			}
+			__syncthreads(); 
+		} 
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}
 }
 
-
 //hessian function PVM_multi 
 __device__ void hess_PVM_multi(		//INPUT
 					const double*		params,
@@ -296,75 +282,105 @@ __device__ void hess_PVM_multi(		//INPUT
 					const double*		bvals,
 					const int 		nparams,
 					const bool 		m_include_f0,
+					const int		idB,
+					double*			shared,		//shared memory
+					double* 		fs,		//shared memory
+					double*			x,		//shared memory	
+					double 			&_a,		//shared memory
+					double 			&_b,		//shared memory
+					double 			&sumf,		//shared memory
+					//OUTPUT
 					double*			hess)
 {
-  	double _a= abs(params[1]);
-  	double _b= abs(params[2]);
-  	double fs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double3 xx;
-  	double sumf=0;
- 
-  	for(int k=0;k<NFIBRES;k++){
-    		int kk = 3+3*(k);	
-    		fs[k] = x2f_gpu(params[kk]);
-    		sumf += fs[k];
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
+	if(idB<NFIBRES){
+		int kk = 3+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+		fs[idB] = x2f_gpu(params[kk]);
+		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
   	}
+	if(idB==0){
+		_a= abs(params[1]);
+		_b= abs(params[2]); 
+		sumf=0;
+		for(int k=0;k<NFIBRES;k++) sumf+= fs[k];
+		for (int p=0;p<nparams;p++){
+			for (int p2=0;p2<nparams;p2++){ 
+				hess[p*nparams+p2] = 0;
+			}
+		}
+	}
+
+  	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
 
-  	double J[NPARAMS];
+	double J[NPARAMS];
   	double sig;
+	double3 xx;
+	int dir_iter=idB; 
 
-	for (int p=0;p<nparams;p++){
-		for (int p2=0;p2<nparams;p2++){ 
-			hess[p*nparams+p2] = 0;
-		}
-	}
-  
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		sig = 0;
-    		for(int a=0;a<nparams;a++) J[a]=0;
-    		for(int k=0;k<NFIBRES;k++){
-      			int kk = 3+3*(k);
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-      			xx.z=x[k*3+2];		
-      			sig += fs[k]*anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals);
-      			double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);	
-      			J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(i,_a,_b,xx,bvecs,bvals);
-			J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(i,_a,_b,xx,bvecs,bvals);
-			J[kk] = abs(params[0])*(anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(i,_a,_b,bvals))*cov;
-      			J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
-      			J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
-    		}
-    		if(m_include_f0){
-			double temp_f0=x2f_gpu(params[nparams-1]);
-			J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
-	    		sig = abs(params[0])* (temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+sig);
-    			J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(i,_a,_b,bvals);
-			J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(i,_a,_b,bvals);
-    		}else{
-			sig = abs(params[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+sig);
-	    		J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(i,_a,_b,bvals);
-	    		J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(i,_a,_b,bvals);	
-    		}
+	__syncthreads(); 
 	
-    		J[0] = sig/params[0]; 
+  	for(int dir=0;dir<max_dir;dir++){
+		for (int p=0; p<nparams; p++) J[p]=0;
+		if(dir<ndir){
+    			sig = 0;
+    			for(int k=0;k<NFIBRES;k++){
+      				int kk = 3+3*(k);
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+      				xx.z=x[k*3+2];		
+      				sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
+      				double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);	
+      				J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
+				J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
+				J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*cov;
+      				J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
+      				J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
+    			}
+    			if(m_include_f0){
+				double temp_f0=x2f_gpu(params[nparams-1]);
+				J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
+	    			sig = abs(params[0])* (temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig);
+    				J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
+				J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);
+    			}else{
+				sig = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig);
+	    			J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
+	    			J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);	
+    			}
+	
+    			J[0] = sig/params[0]; 
+		}
 
 		for (int p=0;p<nparams;p++){
 			for (int p2=p;p2<nparams;p2++){ 
-				hess[p*nparams+p2] += 2*(J[p]*J[p2]);
+
+				shared[idB]=2*(J[p]*J[p2]);
+				__syncthreads();
+				if(idB==0){
+					for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+						hess[p*nparams+p2] += shared[i];
+					}
+				}
+				__syncthreads(); 
 			}
 		}
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}
 
-  	for (int j=0; j<nparams; j++) {
-    		for (int i=j+1; i<nparams; i++) {
-     			hess[i*nparams+j]=hess[j*nparams+i];	
-    		}
-  	}
+	if(idB==0){
+  		for (int j=0; j<nparams; j++) {
+    			for (int i=j+1; i<nparams; i++) {
+     				hess[i*nparams+j]=hess[j*nparams+i];	
+    			}
+  		}
+	}
 }
 
 //in diffmodel.cc
@@ -379,63 +395,80 @@ extern "C" __global__ void fit_PVM_multi_kernel(	//INPUT
 							//OUTPUT
 							double* 		params)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;	
-  	if (id >=nvox) { return; }	
-	
-	int nparams;
-	if (m_include_f0)
-      		nparams = nfib*3 + 4; 
-    	else
-      		nparams = nfib*3 + 3;
-
-	int nparams_single_c = nparams-1;
-
-   	double myparams[NPARAMS];
-	double myparams_aux[NPARAMS];
-   	double mydata[NDIRECTIONS];
-
-   	for(int i=0;i<NDIRECTIONS;i++){
-		mydata[i]=data[(id*NDIRECTIONS)+i];
-   	}
-
-	for(int i=0;i<nparams_single_c;i++){
-		myparams_aux[i]=params_PVM_single_c[(id*nparams_single_c)+i];
-   	}
-
-  	myparams[0] = myparams_aux[0];			//pvm1.get_s0();
-  	myparams[1] = 1.0;				//start with d=d_std
-  	for(int i=0,ii=3;i<nfib;i++,ii+=3){
-    		myparams[ii] = f2x_gpu(myparams_aux[ii-1]);
-    		myparams[ii+1] = myparams_aux[ii];
-    		myparams[ii+2] = myparams_aux[ii+1];
-  	}
-	myparams[2] = myparams_aux[1];   		//pvm1.get_d();
-  	if (m_include_f0)
-		myparams[nparams-1]=f2x_gpu(myparams_aux[nparams_single_c-1]);
+	int idB = threadIdx.x;
+	int idVOX = blockIdx.x;
+
+	__shared__ double myparams[NPARAMS];
+	__shared__ int nparams;
+
+	__shared__ double step[NPARAMS];
+	__shared__ double grad[NPARAMS];                          
+   	__shared__ double hess[NPARAMS*NPARAMS]; 	
+	__shared__ double inverse[NPARAMS];
+	__shared__ double pcf;
+	__shared__ double ncf;
+	__shared__ double lambda;
+	__shared__ double cftol;
+	__shared__ double ltol;
+	__shared__ double olambda;
+	__shared__ bool success;    
+	__shared__ bool end;    
+
+	__shared__ double shared[THREADS_X_BLOCK_FIT]; 
+	__shared__ double fs[NFIBRES];
+  	__shared__ double x[NFIBRES*3];	
+	__shared__ double _a;
+	__shared__ double _b;
+  	__shared__ double sumf;
+
+	if(idB==0){
+		if (m_include_f0)
+      			nparams = nfib*3 + 4; 
+    		else
+      			nparams = nfib*3 + 3;
+
+		int nparams_single_c = nparams-1;
+
+		myparams[0] = params_PVM_single_c[(idVOX*nparams_single_c)+0];			//pvm1.get_s0();
+  		myparams[1] = 1.0;								//start with d=d_std
+  		for(int i=0,ii=3;i<nfib;i++,ii+=3){
+    			myparams[ii] = f2x_gpu(params_PVM_single_c[(idVOX*nparams_single_c)+ii-1]);
+    			myparams[ii+1] = params_PVM_single_c[(idVOX*nparams_single_c)+ii];
+    			myparams[ii+2] = params_PVM_single_c[(idVOX*nparams_single_c)+ii+1];
+  		}
+		myparams[2] = params_PVM_single_c[(idVOX*nparams_single_c)+1] ; 		//pvm1.get_d();
+  		if (m_include_f0)
+			myparams[nparams-1]=f2x_gpu(params_PVM_single_c[(idVOX*nparams_single_c)+nparams_single_c-1]);
+	}
+
+	__syncthreads();
 
   	//do the fit
-	levenberg_marquardt_PVM_multi_gpu(mydata, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams, m_include_f0, myparams);
+	levenberg_marquardt_PVM_multi_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,x,_a,_b,sumf,myparams);
+
+	__syncthreads();
 
   	// finalise parameters
 	//m_s0-myparams[0] 	m_d-myparams[1] 	m_d_std-myparams[2]		m_f-m_th-m_ph-myparams[3,4,5,6 etc..]   	m_f0-myparams[nparams-1]
 
-	myparams_aux[1] = myparams[1];
+	if(idB==0){  	
+		double aux = myparams[1];
 
-  	myparams[1] = abs(myparams_aux[1]*myparams[2]);
-  	myparams[2] = sqrt(double(abs(myparams_aux[1]*myparams[2]*myparams[2])));
-  	for(int i=3,k=0;k<nfib;i+=3,k++){
-    		myparams[i]  = x2f_gpu(myparams[i]);
-  	}
-  	if (m_include_f0)
-    		myparams[nparams-1]=x2f_gpu(myparams[nparams-1]);
+  		myparams[1] = abs(aux*myparams[2]);
+		myparams[2] = sqrt(double(abs(aux*myparams[2]*myparams[2])));
+  		for(int i=3,k=0;k<nfib;i+=3,k++){
+    			myparams[i]  = x2f_gpu(myparams[i]);
+  		}
+  		if (m_include_f0)
+    			myparams[nparams-1]=x2f_gpu(myparams[nparams-1]);
 
-	sort_PVM_multi(nfib,nparams,myparams);
-  	fix_fsum_PVM_multi(m_include_f0,nfib,nparams,myparams);
+		sort_PVM_multi(nfib,myparams);
+  		fix_fsum_PVM_multi(m_include_f0,nfib,nparams,myparams);
 
-	for(int i=0;i<nparams;i++){
-		params[(id*nparams)+i] = myparams[i];
-		//printf("PARAM[%i]: %.20f\n",i,myparams[i]);
-   	}
+		for(int i=0;i<nparams;i++){
+			params[(idVOX*nparams)+i] = myparams[i];
+   		}
+	}
 }
 
 //in diffmodel.cc
@@ -451,34 +484,99 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel(	//INPUT
 								//OUTPUT
 								double*			residuals)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;	
-  	if (id >=nvox) { return; }	
+	int idB = threadIdx.x;
+	int idVOX = blockIdx.x;
+
+	__shared__ double myparams[NPARAMS];
+	__shared__ int nparams;
+	__shared__ bool my_include_f0;
+	__shared__ double val;
+  	__shared__ double _a;
+	__shared__ double _b;
+  	__shared__ double fs[NFIBRES];
+  	__shared__ double x[NFIBRES*3];	
+  	__shared__ double sumf;
+
+	double predicted_signal;
+	double mydata;
+
+	if(idB==0){
+		if (m_include_f0)
+      			nparams = nfib*3 + 4; 
+    		else
+      			nparams = nfib*3 + 3;
 	
-	int nparams;
-	if (m_include_f0)
-      		nparams = nfib*3 + 4; 
-    	else
-      		nparams = nfib*3 + 3;
+		my_include_f0 = includes_f0[idVOX];
+
+  		//m_s0-myparams[0]  m_d-myparams[1]  m_d_std-myparams[2]  m_f-m_th-m_ph-myparams[3,4,5,6 etc..]  m_f0-myparams[nparams-1]
+
+  		myparams[0] = params[(idVOX*nparams)+0];
+		double aux1 = params[(idVOX*nparams)+1];
+		double aux2 = params[(idVOX*nparams)+2];
+		//m_d*m_d/m_d_std/m_d_std;
+  		myparams[1] = aux1*aux1/aux2/aux2;		
+  		myparams[2] = aux2*aux2/aux1;			//m_d_std*m_d_std/m_d; // =1/beta
+  		
+  		if (my_include_f0)
+    			myparams[nparams-1]=f2x_gpu(params[(idVOX*nparams)+nparams-1]);
+	}
 
-	bool my_include_f0 = includes_f0[id];
+	if(idB<nfib){
+		int kk = 3+3*idB;
+		double sinth,costh,sinph,cosph;
+	
+		myparams[kk]   = f2x_gpu(params[(idVOX*nparams)+kk]);
+    		myparams[kk+1] = params[(idVOX*nparams)+kk+1];
+    		myparams[kk+2] = params[(idVOX*nparams)+kk+2];
+
+		sincos(myparams[kk+1],&sinth,&costh);
+		sincos(myparams[kk+2],&sinph,&cosph);		
+    		fs[idB] = x2f_gpu(myparams[kk]);
+    		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
+  	}
 
-   	double myparams[NPARAMS];
-   	double mydata[NDIRECTIONS];
+	__syncthreads(); 
 
-	for(int i=0;i<nparams;i++){
-		myparams[i]=params[(id*nparams)+i];
-   	}
+	if(idB==0){
+  		_a = abs(myparams[1]);
+  		_b = abs(myparams[2]);
+  		sumf=0;
+  		for(int k=0;k<nfib;k++){
+	    		sumf += fs[k];
+  	}
+  	
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	
+	double3 x2;
+	int dir_iter=idB; 
 
-   	for(int i=0;i<NDIRECTIONS;i++){
-		mydata[i]=data[(id*NDIRECTIONS)+i];
-   	}
+	__syncthreads();
 
-	double predicted_signal[NDIRECTIONS];
+  	for(int dir=0;dir<ndir;dir++){
+		mydata = data[(idVOX*NDIRECTIONS)+dir_iter];
+  		predicted_signal=0;	//pred = 0;
+    		val = 0.0;
+    		for(int k=0;k<nfib;k++){
+			x2.x=x[k*3];
+			x2.y=x[k*3+1];
+			x2.z=x[k*3+2];	 
+      			val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]);
+    		}	
+    		if (my_include_f0){
+      			double temp_f0=x2f_gpu(myparams[nparams-1]);
+      			predicted_signal = abs(myparams[0])*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val);
+    		} 
+    		else
+      			predicted_signal = abs(myparams[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val); 
+  		}   
 
-	get_prediction_PVM_multi(myparams, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nfib, nparams, my_include_f0, predicted_signal);
+		//residuals=m_data-predicted_signal;
+		residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal;
 
-	for(int i=0;i<NDIRECTIONS;i++){		//residuals=m_data-predicted_signal;
-		residuals[id*NDIRECTIONS+i]= mydata[i] - predicted_signal[i];
+		dir_iter+=THREADS_X_BLOCK_FIT;
 	}
 }
 
diff --git a/CUDA/PVM_single.cu b/CUDA/PVM_single.cu
index 0c1d513..8f65048 100644
--- a/CUDA/PVM_single.cu
+++ b/CUDA/PVM_single.cu
@@ -20,42 +20,46 @@
 
 __device__ 
 inline double isoterm_PVM_single(const int pt,const double _d,const double *bvals){
-  	return exp(double(-bvals[pt]*_d));
+  	return exp(-bvals[pt]*_d);
 }
 
 __device__ 
 inline double isoterm_d_PVM_single(const int pt,const double _d,const double *bvals){
-  	return (-bvals[pt]*exp(double(-bvals[pt]*_d)));
+  	return (-bvals[pt]*exp(-bvals[pt]*_d));
 }
 
 __device__ 
 inline double anisoterm_PVM_single(const int pt,const double _d,const double3 x, const double *bvecs, const double *bvals){
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-	return exp(double(-bvals[pt]*_d*dp*dp));
+	return exp(-bvals[pt]*_d*dp*dp);
 }
 
 __device__ 
 inline double anisoterm_d_PVM_single(const int pt,const double _d,const double3 x,const double *bvecs, const double *bvals){
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-  	return(-bvals[pt]*dp*dp*exp(double(-bvals[pt]*_d*dp*dp)));
+  	return(-bvals[pt]*dp*dp*exp(-bvals[pt]*_d*dp*dp));
 }
 
 __device__ 
 inline double anisoterm_th_PVM_single(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){
-
+	double sinth,costh,sinph,cosph;
+	sincos(_th,&sinth,&costh);
+	sincos(_ph,&sinph,&cosph);
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-	double dp1 = (cos(double(_th))*(bvecs[pt]*cos(double(_ph))+bvecs[NDIRECTIONS+pt]*sin(double(_ph)))-bvecs[(2*NDIRECTIONS)+pt]*sin(double(_th)));
-  	return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp)));
+	double dp1 = (costh*(bvecs[pt]*cosph+bvecs[NDIRECTIONS+pt]*sinph)-bvecs[(2*NDIRECTIONS)+pt]*sinth);
+  	return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp));
 }
 
 __device__ 
 inline double anisoterm_ph_PVM_single(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){
+	double sinth,sinph,cosph;
+	sinth=sin(_th);
+	sincos(_ph,&sinph,&cosph);
   	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-	double dp1 = sin(double(_th))*(-bvecs[pt]*sin(double(_ph))+bvecs[NDIRECTIONS+pt]*cos(double(_ph)));
-  	return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp)));
+	double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[NDIRECTIONS+pt]*cosph);
+  	return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp));
 }
 
-
 //in diffmodel.cc
 __device__ void fix_fsum_PVM_single(	//INPUT 
 					bool m_include_f0, 
@@ -77,150 +81,99 @@ __device__ void fix_fsum_PVM_single(	//INPUT
   	}
 }
 
-
-
 //in diffmodel.cc
-__device__  void sort_PVM_single(int nfib,int nparams,double* params)
+__device__  void sort_PVM_single(int nfib,double* params)
 {
 	double temp_f, temp_th, temp_ph;
 	// Order vector descending using f parameters as index
   	for(int i=1; i<(nfib); i++){ 
     		for(int j=0; j<(nfib-i); j++){ 
-      			if (params[2+j*3] < params[2+i*3]){ 
+      			if (params[2+j*3] < params[2+(j+1)*3]){ 
         			temp_f = params[2+j*3];
 				temp_th = params[2+j*3+1];
 				temp_ph = params[2+j*3+2];
-        			params[2+j*3] = params[2+i*3]; 
-				params[2+j*3+1] = params[2+i*3+1]; 
-				params[2+j*3+2] = params[2+i*3+2]; 
-        			params[2+i*3] = temp_f; 
-				params[2+i*3+1] = temp_th; 
-				params[2+i*3+2] = temp_ph; 
+        			params[2+j*3] = params[2+(j+1)*3]; 
+				params[2+j*3+1] = params[2+(j+1)*3+1]; 
+				params[2+j*3+2] = params[2+(j+1)*3+2]; 
+        			params[2+(j+1)*3] = temp_f; 
+				params[2+(j+1)*3+1] = temp_th; 
+				params[2+(j+1)*3+2] = temp_ph; 
       			} 
     		} 
   	} 
 }
 
-
-//in diffmodels.cc -- for calculate residuals
-__device__ void  forwardModel_PVM_single(	//INPUT
-						const double* 		p,
-						const double*		bvecs, 
-						const double*		bvals,
-						const int		nfib,
-						const int 		nparams,
-						const bool 		m_include_f0,
-						//OUTPUT
-						double*		 	predicted_signal)
-{
-  	for(int i=0;i<NDIRECTIONS;i++){
-		predicted_signal[i]=0;		//pred = 0;
-	}
-  	double val;
-  	double _d = abs(p[1]);
-  	////////////////////////////////////
-  	double fs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double sumf=0;
-	double3 x2;
-  	for(int k=0;k<nfib;k++){
-    		int kk = 2+3*k;
-	    	fs[k] = x2f_gpu(p[kk]);
-	    	sumf += fs[k];
-		x[k*3] = sin(p[kk+1])*cos(p[kk+2]);
-    		x[k*3+1] = sin(p[kk+1])*sin(p[kk+2]);
-    		x[k*3+2] = cos(p[kk+1]);
-  	}
-  	////////////////////////////////////
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		val = 0.0;
-    		for(int k=0;k<nfib;k++){
-			x2.x=x[k*3];
-			x2.y=x[k*3+1];
-			x2.z=x[k*3+2];	 
-      			val += fs[k]*anisoterm_PVM_single(i,_d,x2,bvecs,bvals);
-    		}	
-    		if (m_include_f0){
-      			double temp_f0=x2f_gpu(p[nparams-1]);
-      			predicted_signal[i] = p[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals)+val);
-    		} 
-    		else
-      			predicted_signal[i] = p[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+val); 
-  	}
-}
-
-
-//in diffmodels.cc -- for calculate residuals
-__device__ void get_prediction_PVM_single(	//INPUT
-						const double*	params,
-						const double*	bvecs, 
-						const double*	bvals,
-						const int 	nfib,
-						const int 	nparams,
-						const bool 	m_include_f0,
-						//OUTPUT
-						double* 	predicted_signal)
+//cost function PVM_single
+__device__ void cf_PVM_single(	//INPUT
+				const double*		params,
+				const double*		mdata,
+				const double*		bvecs, 
+				const double*		bvals,
+				const int 		nparams,
+				const bool 		m_include_f0,
+				const int		idB,
+				double*			shared,		//shared memory
+				double* 		fs,		//shared memory
+				double*			x,		//shared memory	
+				double 			&_d,		//shared memory
+				double 			&sumf,		//shared memory
+				//OUTPUT
+				double			&cfv)
 {
-	//m_s0-myparams[0] 	m_d-myparams[1] 	m_d_std-myparams[2]		m_f-m_th-m_ph-myparams[3,4,5,6 etc..]   	m_f0-myparams[nparams-1]
-  	double p[NPARAMS];
-  	p[0] = params[0];
-  	p[1] = params[1];		
-  	for(int k=0;k<nfib;k++){
-    		int kk = 2+3*k;
-    		p[kk]   = f2x_gpu(params[kk]);
-    		p[kk+1] = params[kk+1];
-    		p[kk+2] = params[kk+2];
+	if(idB<NFIBRES){
+		int kk = 2+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+    		fs[idB] = x2f_gpu(params[kk]);
+		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
   	}
-  	if (m_include_f0)
-    		p[nparams-1]=f2x_gpu(params[nparams-1]);
-  	forwardModel_PVM_single(p,bvecs,bvals,nfib,nparams,m_include_f0,predicted_signal);
-}
 
+	__syncthreads(); 	
 
-//cost function PVM_single
-__device__ double cf_PVM_single(	//INPUT
-					const double*		params,
-					const double*		mdata,
-					const double*		bvecs, 
-					const double*		bvals,
-					const int 		nparams,
-					const bool 		m_include_f0)
-{
-	double cfv = 0.0;
-  	double err;
-	double _d = abs(params[1]);
-	double fs[NFIBRES];    
-	double x[NFIBRES*3];	
-	double sumf=0;
+	if(idB==0){
+		_d = abs(params[1]);
+		cfv = 0.0;
+		sumf=0;
+		for(int k=0;k<NFIBRES;k++) sumf+= fs[k];
+	}
+	
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	
+	double err;
 	double3 x2;
+	int dir_iter=idB;
 
-	for(int k=0;k<NFIBRES;k++){
-    		int kk = 2+3*(k);
-    		fs[k] = x2f_gpu(params[kk]);
-    		sumf += fs[k];
-    		
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
-  	}
-	
-	for(int i=0;i<NDIRECTIONS;i++){
+	__syncthreads();
+
+	shared[idB]=0;
+	for(int dir=0;dir<ndir;dir++){
 		err = 0.0;
     		for(int k=0;k<NFIBRES;k++){
 			x2.x=x[k*3];
 			x2.y=x[k*3+1];
 			x2.z=x[k*3+2];	
-			err += fs[k]*anisoterm_PVM_single(i,_d,x2,bvecs,bvals); 
+			err += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,bvecs,bvals); 
     		}
 		if(m_include_f0){
 			double temp_f0=x2f_gpu(params[nparams-1]);
-			err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals))+err))-mdata[i];
+			err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+err))-mdata[dir_iter];
 		}else{
-			err =  (params[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+err))-mdata[i];
+			err =  (params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+err))-mdata[dir_iter];
 		}
-		cfv += err*err;  
+		shared[idB]+= err*err;  
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}  
-	return(cfv);
+	__syncthreads();
+
+	if(idB==0){
+		for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+			cfv+=shared[i];
+		}
+	}	
 }
 
 //gradient function PVM_single
@@ -231,58 +184,89 @@ __device__ void grad_PVM_single(	//INPUT
 					const double*		bvals,
 					const int 		nparams,
 					const bool 		m_include_f0,
+					const int		idB,		
+					double*			shared,		//shared memory
+					double* 		fs,		//shared memory
+					double*			x,		//shared memory
+					double 			&_d,		//shared memory
+					double 			&sumf,		//shared memory
 					//OUTPUT
 					double*			grad)
 {
-  	double _d = abs(params[1]);
-  	double fs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double3 xx;		
-  	double sumf=0;
-
-  	for(int k=0;k<NFIBRES;k++){
-    		int kk = 2+3*(k);
-    		fs[k] = x2f_gpu(params[kk]);
-    		sumf += fs[k];
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
+	if(idB<NFIBRES){
+		int kk = 2+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+    		fs[idB] = x2f_gpu(params[kk]);
+    		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
   	}
- 
-  	double J[NPARAMS];
-  	double diff;
-  	double sig;
 
-	for (int p=0;p<nparams;p++) grad[p]=0;
+	__syncthreads(); 
 
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		sig = 0;
-    		for(int a=0;a<nparams;a++) J[a]=0;
-    		for(int k=0;k<NFIBRES;k++){
-      			int kk = 2+3*(k);
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-      			xx.z=x[k*3+2];			
-			sig +=  fs[k]*anisoterm_PVM_single(i,_d,xx,bvecs,bvals);
-			J[1] +=  (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(i,_d,xx,bvecs,bvals);
-      			J[kk] = params[0]*(anisoterm_PVM_single(i,_d,xx,bvecs,bvals)-isoterm_PVM_single(i,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);
-      			J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-      			J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-    		}
+	if(idB==0){
+		sumf=0;
+		for(int i=0;i<NFIBRES;i++) sumf+=fs[i];
+		_d = abs(params[1]);
+		for (int p=0;p<nparams;p++) grad[p]=0;
+	}
 
-    		if(m_include_f0){
-			double temp_f0=x2f_gpu(params[nparams-1]);
-			J[nparams-1]= params[0]*(1-isoterm_PVM_single(i,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
-			sig= params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals))+sig);
-    			J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(i,_d,bvals);
-    		}else{
-			sig = params[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+sig);
-			J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(i,_d,bvals);
-    		}
-    		diff = sig - mdata[i];
-    		J[0] = sig/params[0];
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
+
+	double J[NPARAMS];
+	double diff;
+  	double sig;
+	double3 xx;
+	int dir_iter=idB;
+
+	__syncthreads();
+
+  	for(int dir=0;dir<max_dir;dir++){
+		for (int p=0; p<nparams; p++) J[p]=0;
+		if(dir<ndir){
+    			sig = 0;
+    			for(int k=0;k<NFIBRES;k++){
+      				int kk = 2+3*(k);
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+      				xx.z=x[k*3+2];			
+				sig +=  fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals);
+				J[1] +=  (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals);
+      				J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);
+      				J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+      				J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+    			}
+
+    			if(m_include_f0){
+				double temp_f0=x2f_gpu(params[nparams-1]);
+				J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
+				sig= params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig);
+    				J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals);
+    			}else{
+				sig = params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig);
+				J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals);
+    			}
+    			diff = sig - mdata[dir_iter];
+    			J[0] = sig/params[0];
+		}
+
+		for (int p=0;p<nparams;p++){ 
+			shared[idB]=2*J[p]*diff;
 
-		for (int p=0;p<nparams;p++) grad[p] += 2*J[p]*diff; 
+			__syncthreads();
+			if(idB==0){
+				for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+					grad[p] += shared[i];
+				}
+			}
+			__syncthreads(); 
+		} 
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}
 }
 
@@ -293,70 +277,102 @@ __device__ void hess_PVM_single(	//INPUT
 					const double*		bvals,
 					const int 		nparams,
 					const bool 		m_include_f0,
+					const int		idB,
+					double*			shared,		//shared memory					
+					double* 		fs,		//shared memory
+					double*			x,		//shared memory
+					double 			&_d,		//shared memory
+					double 			&sumf,		//shared memory
+					//OUTPUT
 					double*			hess)
 {
-  	double _d = abs(params[1]);
-  	double fs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double3 xx;
-  	double sumf=0;
-
-  	for(int k=0;k<NFIBRES;k++){
-    		int kk = 2+3*(k);
-    		fs[k] = x2f_gpu(params[kk]);
-    		sumf += fs[k];
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
+	if(idB<NFIBRES){
+		int kk = 2+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+    		fs[idB] = x2f_gpu(params[kk]);
+    		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
   	}
- 
-  	double J[NPARAMS];
-  	double sig;
 
-	for (int p=0;p<nparams;p++){
-		for (int p2=0;p2<nparams;p2++){ 
-			hess[p*nparams+p2] = 0;
+	__syncthreads(); 
+
+	if(idB==0){
+		sumf=0;
+		for(int i=0;i<NFIBRES;i++) sumf+=fs[i];
+		_d = abs(params[1]);
+		for (int p=0;p<nparams;p++){
+			for (int p2=0;p2<nparams;p2++){ 
+				hess[p*nparams+p2] = 0;
+			}
 		}
 	}
 
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		sig = 0;
-    		for(int a=0;a<nparams;a++) J[a]=0;
-    		for(int k=0;k<NFIBRES;k++){
-      			int kk = 2+3*(k);
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-      			xx.z=x[k*3+2];		
-			sig += fs[k]*anisoterm_PVM_single(i,_d,xx,bvecs,bvals);
-      			J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(i,_d,xx,bvecs,bvals);
-      			J[kk] = params[0]*(anisoterm_PVM_single(i,_d,xx,bvecs,bvals)-isoterm_PVM_single(i,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);
-		      	J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-		      	J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-    		}
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
 
-    		if(m_include_f0){
-			double temp_f0=x2f_gpu(params[nparams-1]);
-			J[nparams-1]= params[0]*(1-isoterm_PVM_single(i,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
-			sig=params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals))+sig);
-    			J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(i,_d,bvals);	
-    		}else{
-			sig = params[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+sig);
-	    		J[1] +=  (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(i,_d,bvals);
-    		}   
-    		J[0] = sig/params[0];
+	double J[NPARAMS];
+  	double sig;
+	double3 xx;
+	int dir_iter=idB; 
+
+	__syncthreads(); 
+	
+  	for(int dir=0;dir<max_dir;dir++){
+		for (int p=0; p<nparams; p++) J[p]=0;
+		if(dir<ndir){
+    			sig = 0;
+    			for(int k=0;k<NFIBRES;k++){
+      				int kk = 2+3*(k);
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+      				xx.z=x[k*3+2];		
+				sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals);
+      				J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals);
+      				J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);
+		      		J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+		      		J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+    			}	
+
+    			if(m_include_f0){
+				double temp_f0=x2f_gpu(params[nparams-1]);
+				J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
+				sig=params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig);
+    				J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals);	
+    			}else{
+				sig = params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig);
+	    			J[1] +=  (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals);
+    			}   
+    			J[0] = sig/params[0];
+		}
 
 		for (int p=0;p<nparams;p++){
 			for (int p2=p;p2<nparams;p2++){ 
-				hess[p*nparams+p2] += 2*(J[p]*J[p2]);
+
+				shared[idB]=2*(J[p]*J[p2]);
+				__syncthreads();
+				if(idB==0){
+					for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+						hess[p*nparams+p2] += shared[i];
+					}
+				}
+				__syncthreads(); 
 			}
 		}
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}
 
-  	for (int j=0; j<nparams; j++) {
-    		for (int i=j+1; i<nparams; i++) {
-     			hess[i*nparams+j]=hess[j*nparams+i];	
-    		}
-  	}
+	if(idB==0){
+	  	for (int j=0; j<nparams; j++) {
+	    		for (int i=j+1; i<nparams; i++) {
+	     			hess[i*nparams+j]=hess[j*nparams+i];	
+	    		}
+	  	}
+	}
 }
 
 //in diffmodel.cc
@@ -370,46 +386,67 @@ extern "C" __global__ void fit_PVM_single_kernel(	//INPUT
 							//INPUT-OUTPUT
 							double* 		params)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;	
-   	if (id >=nvox) { return; }	
-
-	int nparams;
-	if (m_include_f0)
-      		nparams = nfib*3 + 3; 
-    	else
-      		nparams = nfib*3 + 2;
-
-	double myparams[NPARAMS];
-   	double mydata[NDIRECTIONS];
+	int idB = threadIdx.x;
+	int idVOX = blockIdx.x;
+
+	__shared__ double myparams[NPARAMS];
+	__shared__ int nparams;
+
+	__shared__ double step[NPARAMS];
+	__shared__ double grad[NPARAMS];                          
+   	__shared__ double hess[NPARAMS*NPARAMS]; 	
+	__shared__ double inverse[NPARAMS];
+	__shared__ double pcf;
+	__shared__ double ncf;
+	__shared__ double lambda;
+	__shared__ double cftol;
+	__shared__ double ltol;
+	__shared__ double olambda;
+	__shared__ bool success;    
+	__shared__ bool end;    
+
+	__shared__ double shared[THREADS_X_BLOCK_FIT]; 
+	__shared__ double fs[NFIBRES];
+  	__shared__ double x[NFIBRES*3];	
+	__shared__ double _d;
+  	__shared__ double sumf;
+
+	if(idB==0){
+		if(m_include_f0)
+      			nparams = nfib*3 + 3; 
+    		else
+      			nparams = nfib*3 + 2;
+  	
+		for(int i=0;i<nparams;i++){
+			myparams[i]=params[(idVOX*nparams)+i];
+   		}
+	}
 
-	for(int i=0;i<nparams;i++){
-		myparams[i]=params[(id*nparams)+i];
-   	}
-	
-   	for(int i=0;i<NDIRECTIONS;i++){
-		mydata[i]=data[(id*NDIRECTIONS)+i];
-   	}
+	__syncthreads();
 
 	// do the fit
-	levenberg_marquardt_PVM_single_gpu(mydata, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams, m_include_f0,  myparams);
+	levenberg_marquardt_PVM_single_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,x,_d,sumf,myparams);
+
+	__syncthreads();
 	
   	// finalise parameters
 	//m_s0 in myparams[0] 	m_d in myparams[1] 	m_f-m_th-m_ph in myparams[2,3,4,5, etc..]   	m_f0 in myparams[nparams-1]
-  			
-  	myparams[1] = abs(myparams[1]); 
-  	for(int k=1;k<=nfib;k++){
-    		int kk = 2 + 3*(k-1);
-    		myparams[kk]  = x2f_gpu(myparams[kk]);
-  	}
-  	if (m_include_f0)
-    		myparams[nparams-1]=x2f_gpu(myparams[nparams-1]);
 
-  	sort_PVM_single(nfib,nparams,myparams);
-  	fix_fsum_PVM_single(m_include_f0,nfib,nparams,myparams);
+	if(idB==0){  	
+  		myparams[1] = abs(myparams[1]); 
+  		for(int k=1;k<=nfib;k++){
+    			int kk = 2 + 3*(k-1);
+    			myparams[kk] = x2f_gpu(myparams[kk]);
+  		}
+  		if(m_include_f0)
+    			myparams[nparams-1]=x2f_gpu(myparams[nparams-1]);
 
-	for(int i=0;i<nparams;i++){
-		params[id*nparams+i]=myparams[i];	
-		//printf("PARAM[%i]: %.20f\n",i,myparams[i]);
+  		sort_PVM_single(nfib,myparams);
+  		fix_fsum_PVM_single(m_include_f0,nfib,nparams,myparams);
+
+		for(int i=0;i<nparams;i++){
+			params[idVOX*nparams+i]=myparams[i];	
+		}
 	}
 }
 
@@ -426,34 +463,92 @@ extern "C" __global__ void get_residuals_PVM_single_kernel(	//INPUT
 								//OUTPUT
 								double*			residuals)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;	
-   	if (id >=nvox) { return; }	
+	int idB = threadIdx.x;
+	int idVOX = blockIdx.x;
+
+	__shared__ double myparams[NPARAMS];
+	__shared__ int nparams;
+	__shared__ bool my_include_f0;
+	__shared__ double val;
+  	__shared__ double _d;
+  	__shared__ double fs[NFIBRES];
+  	__shared__ double x[NFIBRES*3];	
+  	__shared__ double sumf;
+
+	double predicted_signal;
+	double mydata;
+
+	if(idB==0){
+		if (m_include_f0)
+      			nparams = nfib*3 + 3; 
+    		else
+      			nparams = nfib*3 + 2;
 
-	int nparams;
-	if (m_include_f0)
-      		nparams = nfib*3 + 3; 
-    	else
-      		nparams = nfib*3 + 2;
+		my_include_f0 = includes_f0[idVOX];
 
-	bool my_include_f0 = includes_f0[id];
+		//m_s0-myparams[0]  m_d-myparams[1]  m_d_std-myparams[2]  m_f-m_th-m_ph-myparams[3,4,5,6 etc..]  m_f0-myparams[nparams-1]
 
-	double myparams[NPARAMS];
-   	double mydata[NDIRECTIONS];
+		myparams[0]=params[(idVOX*nparams)+0];
+		myparams[1]=params[(idVOX*nparams)+1];
 
-	for(int i=0;i<nparams;i++){
-		myparams[i]=params[(id*nparams)+i];
-   	}
-	
-   	for(int i=0;i<NDIRECTIONS;i++){
-		mydata[i]=data[(id*NDIRECTIONS)+i];
-   	}
+  		if (my_include_f0)
+    			myparams[nparams-1]=f2x_gpu(params[(idVOX*nparams)+nparams-1]);
+	}
+
+	if(idB<nfib){
+		int kk = 2+3*idB;
+		double sinth,costh,sinph,cosph;
 
-	double predicted_signal[NDIRECTIONS];
+		myparams[kk]   = f2x_gpu(params[(idVOX*nparams)+kk]);
+    		myparams[kk+1] = params[(idVOX*nparams)+kk+1];
+    		myparams[kk+2] = params[(idVOX*nparams)+kk+2];
 
-	get_prediction_PVM_single(myparams, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nfib, nparams, my_include_f0, predicted_signal);
+		sincos(myparams[kk+1],&sinth,&costh);
+		sincos(myparams[kk+2],&sinph,&cosph);
 
-	for(int i=0;i<NDIRECTIONS;i++){		//residuals=m_data-predicted_signal;
-		residuals[id*NDIRECTIONS+i]= mydata[i] - predicted_signal[i];
+    		fs[idB] = x2f_gpu(myparams[kk]);
+    		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
+  	}
+
+	__syncthreads(); 
+
+	if(idB==0){
+		sumf=0;
+		for(int i=0;i<NFIBRES;i++) sumf+=fs[i];
+		_d = abs(myparams[1]);
 	}
+  	
+  	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	
+	double3 x2;
+	int dir_iter=idB; 
+
+	__syncthreads();
+
+	for(int dir=0;dir<ndir;dir++){
+		mydata = data[(idVOX*NDIRECTIONS)+dir_iter];
+  		predicted_signal=0;	//pred = 0;
+    		val = 0.0;
+    		for(int k=0;k<nfib;k++){
+			x2.x=x[k*3];
+			x2.y=x[k*3+1];
+			x2.z=x[k*3+2];	 
+      			val += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]);
+    		}	
+    		if (my_include_f0){
+      			double temp_f0=x2f_gpu(myparams[nparams-1]);
+      			predicted_signal = myparams[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val);
+    		} 
+    		else
+      			predicted_signal = myparams[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); 
+	
+		//residuals=m_data-predicted_signal;
+		residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal;
+
+		dir_iter+=THREADS_X_BLOCK_FIT;
+  	}
 }
 
diff --git a/CUDA/PVM_single_c.cu b/CUDA/PVM_single_c.cu
index 38e5db9..09c399f 100644
--- a/CUDA/PVM_single_c.cu
+++ b/CUDA/PVM_single_c.cu
@@ -20,43 +20,46 @@
 
 __device__ 
 inline double isoterm_PVM_single_c(const int pt,const double _d,const double *bvals){
-  	return exp(double(-bvals[pt]*_d));
+  	return exp(-bvals[pt]*_d);
 }
 
 __device__ 
 inline double isoterm_lambda_PVM_single_c(const int pt,const double lambda,const double *bvals){
-  	return(-2*bvals[pt]*lambda*exp(double(-bvals[pt]*lambda*lambda)));
+  	return(-2*bvals[pt]*lambda*exp(-bvals[pt]*lambda*lambda));
 }
 
 __device__ 
 inline double anisoterm_PVM_single_c(const int pt,const double _d,const double3 x, const double *bvecs, const double *bvals){
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-	return exp(double(-bvals[pt]*_d*dp*dp));
+	return exp(-bvals[pt]*_d*dp*dp);
 }
 
 __device__ 
 inline double anisoterm_lambda_PVM_single_c(const int pt,const double lambda,const double3 x, const double *bvecs, const double *bvals){
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-	return(-2*bvals[pt]*lambda*dp*dp*exp(double(-bvals[pt]*lambda*lambda*dp*dp)));
+	return(-2*bvals[pt]*lambda*dp*dp*exp(-bvals[pt]*lambda*lambda*dp*dp));
 }
 
 __device__ 
 inline double anisoterm_th_PVM_single_c(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){
-
+	double sinth,costh,sinph,cosph;
+	sincos(_th,&sinth,&costh);
+	sincos(_ph,&sinph,&cosph);
 	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-	double dp1 = cos(double(_th))*(bvecs[pt]*cos(double(_ph))+bvecs[NDIRECTIONS+pt]*sin(double(_ph)))-bvecs[(2*NDIRECTIONS)+pt]*sin(double(_th));
-  	return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp)));
+	double dp1 = costh*(bvecs[pt]*cosph+bvecs[NDIRECTIONS+pt]*sinph)-bvecs[(2*NDIRECTIONS)+pt]*sinth;
+  	return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp));
 }
 
 __device__ 
 inline double anisoterm_ph_PVM_single_c(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){
+	double sinth,sinph,cosph;
+	sinth=sin(_th);
+	sincos(_ph,&sinph,&cosph);
   	double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
-	double dp1 = sin(double(_th))*(-bvecs[pt]*sin(double(_ph))+bvecs[NDIRECTIONS+pt]*cos(double(_ph)));
-  	return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp)));
+	double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[NDIRECTIONS+pt]*cosph);
+  	return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp));
 }
 
-
-
 //If the sum of the fractions is >1, then zero as many fractions
 //as necessary, so that the sum becomes smaller than 1.
 //in diffmodel.cc
@@ -87,205 +90,148 @@ __device__ double partial_fsum_PVM_single_c(double* fs, int ii){
   	return fsum;
 }
 
-
-
 //in diffmodel.cc
-__device__ void sort_PVM_single_c(int nfib,int nparams,double* params)
+__device__ void sort_PVM_single_c(int nfib,double* params)
 {
 	double temp_f, temp_th, temp_ph;
 	// Order vector descending using f parameters as index
   	for(int i=1; i<(nfib); i++){ 
     		for(int j=0; j<(nfib-i); j++){ 
-      			if (params[2+j*3] < params[2+i*3]){ 
+      			if (params[2+j*3] < params[2+(j+1)*3]){ 
         			temp_f = params[2+j*3];
 				temp_th = params[2+j*3+1];
 				temp_ph = params[2+j*3+2];
-        			params[2+j*3] = params[2+i*3]; 
-				params[2+j*3+1] = params[2+i*3+1]; 
-				params[2+j*3+2] = params[2+i*3+2]; 
-        			params[2+i*3] = temp_f; 
-				params[2+i*3+1] = temp_th; 
-				params[2+i*3+2] = temp_ph; 
+        			params[2+j*3] = params[2+(j+1)*3]; 
+				params[2+j*3+1] = params[2+(j+1)*3+1]; 
+				params[2+j*3+2] = params[2+(j+1)*3+2]; 
+        			params[2+(j+1)*3] = temp_f; 
+				params[2+(j+1)*3+1] = temp_th; 
+				params[2+(j+1)*3+2] = temp_ph; 
       			} 
     		} 
   	} 
-
-	//if (m_return_fanning){
-     	 //	fantmp=m_fanning_angles;
-      	//	Hess_vec_tmp=m_invprHes_e1;
-      	//	Hess=m_Hessian;
-  	//}
-
-	//if (m_return_fanning){
-     	 	//m_fanning_angles(i)=fantmp(fvals[ii].second);
-      		//m_invprHes_e1[i-1]=Hess_vec_tmp[fvals[ii].second-1];
-      		//m_Hessian[i-1]=Hess[fvals[ii].second-1];
-    	//}
-}
-
-//in diffmodels.cc -- for calculate residuals
-__device__ void  forwardModel_PVM_single_c(	//INPUT
-						const double* 		p,
-						const double*		bvecs, 
-						const double*		bvals,
-						const int		nfib,
-						const int 		nparams,
-						const bool 		m_include_f0,
-						//OUTPUT
-						double*		 	predicted_signal)
-{
-  	for(int i=0;i<NDIRECTIONS;i++){
-		predicted_signal[i]=0;		//pred = 0;
-	}
-  	double val;
-  	double _d = lambda2d_gpu(p[1]);
-  	////////////////////////////////////
-  	double fs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double sumf=0;
-	double3 x2;
-	double partial_fsum;
-  	for(int k=0;k<nfib;k++){
-    		int kk = 2+3*k;
-		////// partial_fsum //////
-		partial_fsum=1.0;
-		for(int j=0;j<k;j++)
-			partial_fsum-=fs[j];
-    		//////////////////////////
-	    	fs[k] = beta2f_gpu(p[kk])*partial_fsum;
-	    	sumf += fs[k];
-		x[k*3] = sin(p[kk+1])*cos(p[kk+2]);
-    		x[k*3+1] = sin(p[kk+1])*sin(p[kk+2]);
-    		x[k*3+2] = cos(p[kk+1]);
-  	}
-  	////////////////////////////////////
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		val = 0.0;
-    		for(int k=0;k<nfib;k++){
-			x2.x=x[k*3];
-			x2.y=x[k*3+1];
-			x2.z=x[k*3+2];	 
-      			val += fs[k]*anisoterm_PVM_single_c(i,_d,x2,bvecs,bvals);
-    		}	
-    		if (m_include_f0){
-			//partial_fsum ///////////
-			partial_fsum=1.0;
-			for(int j=0;j<NFIBRES;j++)
-				partial_fsum-=fs[j];
-	     		//////////////////////////
-      			double temp_f0= beta2f_gpu(p[nparams-1])*partial_fsum;
-      			predicted_signal[i] = p[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(i,_d,bvals)+val);
-    		} 
-    		else
-      			predicted_signal[i] = p[0]*((1-sumf)*isoterm_PVM_single_c(i,_d,bvals)+val); 
-  	}
 }
 
-//in diffmodels.cc -- for calculate residuals
-__device__ void get_prediction_PVM_single_c(	//INPUT
+__device__  void fractions_deriv_PVM_single_c(	//INPUT
 						const double*	params,
-						const double*	bvecs, 
-						const double*	bvals,
-						const int 	nfib,
-						const int 	nparams,
-						const bool 	m_include_f0,
+						const double* 	fs, 
+						const int	nfib,
+						const int	idB,
 						//OUTPUT
-						double* 	predicted_signal)
+						double* 	Deriv) 
 {
-	//m_s0-myparams[0] 	m_d-myparams[1] 	m_d_std-myparams[2]		m_f-m_th-m_ph-myparams[3,4,5,6 etc..]   	m_f0-myparams[nparams-1]
-  	double p[NPARAMS];
-  	p[0] = params[0];
-	if(params[1]<0)  p[1] = 0;	//This can be due to numerical errors..sqrt
-  	else p[1] = d2lambda_gpu(params[1]);	
-	double partial_fsum;	
-	double fs[NFIBRES];
-  	for(int k=0;k<nfib;k++){
-    		int kk = 2+3*k;
-		//partial_fsum ///////////
-		partial_fsum=1.0;
-		for(int j=0;j<k;j++)
-			partial_fsum-=fs[j];
-	     	//////////////////////////
-		fs[k] = params[kk];
-		double tmpr=params[kk]/partial_fsum;
-    		if (tmpr>1.0) tmpr=1; //This can be due to numerical errors
-		if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt
-    		p[kk]   = f2beta_gpu(tmpr);
-    		p[kk+1] = params[kk+1];
-    		p[kk+2] = params[kk+2];
-  	}
-  	if (m_include_f0){
-		//partial_fsum ///////////
-		partial_fsum=1.0;
-		for(int j=0;j<NFIBRES;j++)
-			partial_fsum-=fs[j];
-	     	//////////////////////////	
-		double tmpr=params[nparams-1]/partial_fsum;
-    		if (tmpr>1.0) tmpr=1; //This can be due to numerical errors..asin
-		if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt
-    		p[nparams-1]= f2beta_gpu(tmpr);	
-	}
-  	forwardModel_PVM_single_c(p,bvecs,bvals,nfib,nparams,m_include_f0,predicted_signal);
+	int nparams_per_fibre=3;
+  	double fsum;
+	int k=idB%nfib;
+	for (int j=0; j<nfib; j++){
+		Deriv[j*nfib+k]=0;
+    	}
+
+  	int kk = 2+(k*nparams_per_fibre);
+	double sinparamkk = sin(2*params[kk]);
+
+	for (int j=0; j<nfib; j++){
+		int jj = 2+(j*nparams_per_fibre);
+      		if (j==k){
+			fsum=1; 
+			for (int n=0; n<=(j-1); n++){
+	  			fsum-=fs[n];
+			}
+			Deriv[j*nfib+k]=sinparamkk*fsum;
+      		}else if (j>k){
+			double sinparam = sin(params[jj]);
+			fsum=0;
+			for (int n=0; n<=(j-1); n++){
+	  			fsum+=Deriv[n*nfib+k];
+			}
+			Deriv[j*nfib+k]=  -(sinparam*sinparam)*fsum;
+      		}
+    	}
 }
 
 //cost function PVM_single_c
-__device__ double cf_PVM_single_c(	//INPUT
+__device__ void cf_PVM_single_c(	//INPUT
 					const double*		params,
 					const double*		mdata,
 					const double*		bvecs, 
 					const double*		bvals,
 					const int 		nparams,
-					const bool 		m_include_f0)
+					const bool 		m_include_f0,
+					const int		idB,
+					double*			shared,		//shared memory
+					double* 		fs,		//shared memory
+					double*			x,		//shared memory	
+					double 			&_d,		//shared memory
+					double 			&sumf,		//shared memory
+					//OUTPUT
+					double			&cfv)
 {
-	double cfv = 0.0;
-  	double err;
-	double _d = lambda2d_gpu(params[1]);
-	double fs[NFIBRES];    
-	double x[NFIBRES*3];	
-	double sumf=0;
-	double3 x2;
-        double partial_fsum;
-
-	for(int k=0;k<NFIBRES;k++){
-    		int kk = 2+3*(k);
-    		//partial_fsum ///////////
-		partial_fsum=1.0;
-		for(int j=0;j<k;j++)
-			partial_fsum-=fs[j];
-    		//////////////////////////
-		fs[k] = beta2f_gpu(params[kk])*partial_fsum;
-    		sumf += fs[k];
-    		
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
+	if(idB<NFIBRES){
+		int kk = 2+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
   	}
+	if(idB==0){
+		_d = lambda2d_gpu(params[1]);
+		cfv = 0.0;
+		sumf=0;
+		double partial_fsum;
+		for(int k=0;k<NFIBRES;k++){
+			int kk = 2+3*(k);
+    			//partial_fsum ///////////
+			partial_fsum=1.0;
+			for(int j=0;j<k;j++)
+				partial_fsum-=fs[j];
+    			//////////////////////////
+			fs[k] = beta2f_gpu(params[kk])*partial_fsum;
+    			sumf += fs[k];
+		}
+	}
+
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
 	
-	for(int i=0;i<NDIRECTIONS;i++){
-    		err = 0.0;
+	double err;
+	double3 x2;
+	int dir_iter=idB;
+
+	__syncthreads();
+	
+	shared[idB]=0;
+	for(int dir=0;dir<ndir;dir++){
+		err = 0.0;
     		for(int k=0;k<NFIBRES;k++){
 			x2.x=x[k*3];
 			x2.y=x[k*3+1];
 			x2.z=x[k*3+2];	
-			err += fs[k]*anisoterm_PVM_single_c(i,_d,x2,bvecs,bvals); 
+			err += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,bvecs,bvals); 
     		}
 		if(m_include_f0){
 			//partial_fsum ///////////
-			partial_fsum=1.0;
+			double partial_fsum=1.0;
 			for(int j=0;j<NFIBRES;j++)
 				partial_fsum-=fs[j];
 	     		//////////////////////////
-	      		double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum;
-			err=(params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(i,_d,bvals))+err))-mdata[i];
+			double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum;
+			err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,bvals))+err))-mdata[dir_iter];
 		}else{
-
-			err = params[0]*((1-sumf)*isoterm_PVM_single_c(i,_d,bvals)+err)-mdata[i];
-		
+			err = params[0]*((1-sumf)*isoterm_PVM_single_c(dir_iter,_d,bvals)+err)-mdata[dir_iter];
 		}
-		cfv += err*err;  	
+		shared[idB]+= err*err;  
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}  
 
-	return(cfv);
+	__syncthreads();
+
+	if(idB==0){
+		for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+			cfv+=shared[i];
+		}	
+	}	
 }
 
 
@@ -297,115 +243,122 @@ __device__ void grad_PVM_single_c(	//INPUT
 					const double*		bvals,
 					const int 		nparams,
 					const bool 		m_include_f0,
+					const int		idB,		
+					double*			shared,		//shared memory
+					double* 		fs,		//shared memory
+					double*			f_deriv,	//shared memory
+					double*			x,		//shared memory
+					double 			&_d,		//shared memory
+					double 			&sumf,		//shared memory
 					//OUTPUT
 					double*			grad)
 {
-  	double _d = lambda2d_gpu(params[1]);
-  	double fs[NFIBRES];
-  	double bs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double3 xx;		
-  	double sumf=0;
-  	double partial_fsum;
-
-  	for(int k=0;k<NFIBRES;k++){
-    		int kk = 2+3*(k);
-    		bs[k]=params[kk];
-    		//partial_fsum ///////////
-		partial_fsum=1.0;
-		for(int j=0;j<k;j++){
-			partial_fsum-=fs[j];
+	if(idB<NFIBRES){
+		int kk = 2+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+    		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
+  	}
+	if(idB==0){
+		_d = lambda2d_gpu(params[1]);
+		sumf=0;
+		double partial_fsum;
+		for(int k=0;k<NFIBRES;k++){
+			int kk = 2+3*(k);
+    			//partial_fsum ///////////
+			partial_fsum=1.0;
+			for(int j=0;j<k;j++)
+				partial_fsum-=fs[j];
+    			//////////////////////////
+			fs[k] = beta2f_gpu(params[kk])*partial_fsum;
+    			sumf += fs[k];
 		}
-   		 //////////////////////////
+		for (int p=0;p<nparams;p++) grad[p]=0;
+	}
 
-    		fs[k] = beta2f_gpu(params[kk])*partial_fsum;
-    		sumf += fs[k];
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
-  	}
+	__syncthreads();
 
-  	////////// fraction deriv //////////////
-  	// f_deriv=fractions_deriv(nfib, fs, bs);  
-  	//////////////////////////////////////
-  	double f_deriv[NFIBRES*NFIBRES];
-  	double fsum;
-  	for (int j=0; j<NFIBRES; j++){
-   		 for (int k=0; k<NFIBRES; k++){
-			f_deriv[j*NFIBRES+k]=0;
-    		}
-  	}  
+  	if(idB<NFIBRES){ 
+		fractions_deriv_PVM_single_c(params,fs,NFIBRES,idB,f_deriv); 
+	} 
 
-  	for (int j=0; j<NFIBRES; j++){
-    		for (int k=0; k<NFIBRES; k++){
-      			if (j==k){
-				fsum=1; 
-				for (int n=0; n<=(j-1); n++)
-	  				fsum-=fs[n];
-	  			f_deriv[j*NFIBRES+k] = sin(double(2*bs[k]))*fsum;
-      			}else if (j>k){
-				fsum=0;
-				for (int n=0; n<=(j-1); n++)
-	  				fsum += f_deriv[n*NFIBRES+k];
-				f_deriv[j*NFIBRES+k] += -sin(bs[j])*sin(bs[j])*fsum;
-
-      			}
-    		} 	   
-  	}
-  	///////////////////////////////
-  	double J[NPARAMS];
-  	double diff;
-  	double sig,Iso_term;
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
+
+	double J[NPARAMS];
+	double diff;
+  	double sig;
+	double Iso_term;
+	double3 xx;
+	int dir_iter=idB;
   	double Aniso_terms[NFIBRES];
 
-	for (int p=0;p<nparams;p++) grad[p]=0;
-  
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		Iso_term=isoterm_PVM_single_c(i,_d,bvals);  //Precompute some terms for this datapoint
-    		for(int k=0;k<NFIBRES;k++){
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-     			xx.z=x[k*3+2];	
-      			Aniso_terms[k]=anisoterm_PVM_single_c(i,_d,xx,bvecs,bvals);
-    		}
-    		sig = 0;
-    		for(int a=0;a<nparams;a++) J[a]=0;
-    		for(int k=0;k<NFIBRES;k++){
-     			int kk = 2+3*(k);
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-      			xx.z=x[k*3+2];		
-      			sig += fs[k]*Aniso_terms[k];
-     			J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(i,params[1],xx,bvecs,bvals);
-     			J[kk] = 0;
-      			for (int j=0;j<NFIBRES;j++){
-				if(f_deriv[j*NFIBRES+k]!=0){
-	  				J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; 
-				}
-      			}
-      			J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-      			J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-    		}
-    		if(m_include_f0){
-			//partial_fsum ///////////
-    			partial_fsum=1.0;
-    			for(int j=0;j<(NFIBRES);j++)
-				partial_fsum-=fs[j];
-			//////////////////////////
-			double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum;
+	__syncthreads();
+
+  	for(int dir=0;dir<max_dir;dir++){
+		for (int p=0; p<nparams; p++) J[p]=0;
+		if(dir<ndir){
+    			Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals);  //Precompute some terms for this datapoint
+    			for(int k=0;k<NFIBRES;k++){
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+     				xx.z=x[k*3+2];	
+      				Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals);
+    			}
+    			sig = 0;
+    			for(int k=0;k<NFIBRES;k++){
+     				int kk = 2+3*(k);
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+      				xx.z=x[k*3+2];		
+      				sig += fs[k]*Aniso_terms[k];
+     				J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals);
+     				J[kk] = 0;
+      				for (int j=0;j<NFIBRES;j++){
+					if(f_deriv[j*NFIBRES+k]!=0){
+	  					J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; 
+					}
+      				}
+      				J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+      				J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+    			}
+    			if(m_include_f0){
+				//partial_fsum ///////////
+    				double partial_fsum=1.0;
+    				for(int j=0;j<(NFIBRES);j++)
+					partial_fsum-=fs[j];
+				//////////////////////////
+				double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum;
+
+    				//derivative with respect to f0
+    				J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; 
+				sig=params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig);
+    				J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals);
+    			}else{
+				sig = params[0]*((1-sumf)*Iso_term+sig);
+	    			J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals);
+    			}
+    			diff = sig - mdata[dir_iter];
+    			J[0] = sig/params[0]; 
+		}
 
-    			//derivative with respect to f0
-    			J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; 
-			sig=params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig);
-    			J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(i,params[1],bvals);
-    		}else{
-			sig = params[0]*((1-sumf)*Iso_term+sig);
-	    		J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(i,params[1],bvals);
-    		}
-    		diff = sig - mdata[i];
-    		J[0] = sig/params[0]; 
+		for (int p=0;p<nparams;p++){ 
+			shared[idB]=2*J[p]*diff;
 
-		for (int p=0;p<nparams;p++) grad[p] += 2*J[p]*diff;
+			__syncthreads();
+			if(idB==0){
+				for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+					grad[p] += shared[i];
+				}
+			}
+			__syncthreads(); 
+		} 
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}
 }
 
@@ -417,126 +370,134 @@ __device__ void hess_PVM_single_c(	//INPUT
 					const double*		bvals,
 					const int 		nparams,
 					const bool 		m_include_f0,
+					const int		idB,		
+					double*			shared,		//shared memory
+					double* 		fs,		//shared memory
+					double*			f_deriv,	//shared memory
+					double*			x,		//shared memory
+					double 			&_d,		//shared memory
+					double 			&sumf,		//shared memory
 					//OUTPUT
 					double*			hess)
 {
-  	double _d = lambda2d_gpu(params[1]);
-  	double fs[NFIBRES];
-  	double bs[NFIBRES];
-  	double x[NFIBRES*3];	
-  	double3 xx;
-  	double sumf=0;
-  	double partial_fsum;
-
-  	for(int k=0;k<NFIBRES;k++){
-    		int kk = 2+3*(k);
-    		bs[k]=params[kk];
-    		//partial_fsum ///////////
-		partial_fsum=1.0;
-		for(int j=0;j<k;j++)
-			partial_fsum-=fs[j];
-    		//////////////////////////
-    		fs[k] = beta2f_gpu(params[kk])*partial_fsum;
-    		sumf += fs[k];
-    		x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
-    		x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
-    		x[k*3+2] = cos(params[kk+1]);
+	if(idB<NFIBRES){
+		int kk = 2+3*(idB);
+		double sinth,costh,sinph,cosph;
+		sincos(params[kk+1],&sinth,&costh);
+		sincos(params[kk+2],&sinph,&cosph);
+    		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
   	}
+	if(idB==0){
+		_d = lambda2d_gpu(params[1]);
+		sumf=0;
+		double partial_fsum;
+		for(int k=0;k<NFIBRES;k++){
+			int kk = 2+3*(k);
+    			//partial_fsum ///////////
+			partial_fsum=1.0;
+			for(int j=0;j<k;j++)
+				partial_fsum-=fs[j];
+    			//////////////////////////
+			fs[k] = beta2f_gpu(params[kk])*partial_fsum;
+    			sumf += fs[k];
+		}
+		for (int p=0;p<nparams;p++){
+			for (int p2=0;p2<nparams;p2++){ 
+				hess[p*nparams+p2] = 0;
+			}
+		}
+	}
 
-  	////////// fraction deriv //////////////
-  	// f_deriv=fractions_deriv(nfib, fs, bs);  
-  	//////////////////////////////////////
-  	double f_deriv[NFIBRES*NFIBRES];
-  	double fsum;
-  	for (int j=0; j<NFIBRES; j++){
-    		for (int k=0; k<NFIBRES; k++){
-			f_deriv[j*NFIBRES+k]=0;
-    		}
-  	}
+	__syncthreads();
 
-  	for (int j=0; j<NFIBRES; j++){
-    		for (int k=0; k<NFIBRES; k++){
-      			if (j==k){
-				fsum=1; 
-				for (int n=0; n<=(j-1); n++)
-	  			fsum-=fs[n];
-	  			f_deriv[j*NFIBRES+k] = sin(double(2*bs[k]))*fsum;
-      			}
-      			else if (j>k){
-				fsum=0;
-				for (int n=0; n<=(j-1); n++)
-	  				fsum += f_deriv[n*NFIBRES+k];
-				f_deriv[j*NFIBRES+k] += -sin(bs[j])*sin(bs[j])*fsum;
-      			}
-    		} 	   
-  	}
-  	///////////////////////////////
-  	double J[NPARAMS];
-  	double sig,Iso_term;
+  	if(idB<NFIBRES){ 
+		fractions_deriv_PVM_single_c(params,fs,NFIBRES,idB,f_deriv); 
+	} 
+
+  	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
+
+	double J[NPARAMS];
+  	double sig;
+	double Iso_term;
+	double3 xx;
+	int dir_iter=idB;
   	double Aniso_terms[NFIBRES];
 
-	for (int p=0;p<nparams;p++){
-		for (int p2=0;p2<nparams;p2++){ 
-			hess[p*nparams+p2] = 0;
+	__syncthreads();
+
+  	for(int dir=0;dir<max_dir;dir++){
+		for (int p=0; p<nparams; p++) J[p]=0;
+		if(dir<ndir){
+    			Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals);  //Precompute some terms for this datapoint
+    			for(int k=0;k<NFIBRES;k++){
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+      				xx.z=x[k*3+2];	
+      				Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals);
+    			}
+    			sig = 0;
+    			for(int k=0;k<NFIBRES;k++){
+      				int kk = 2+3*(k);
+      				xx.x=x[k*3];
+      				xx.y=x[k*3+1];
+      				xx.z=x[k*3+2];		 
+      				sig += fs[k]*Aniso_terms[k];
+      				J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals);	 
+      				J[kk] = 0;
+      				for (int j=0; j<NFIBRES; j++){
+					if (f_deriv[j*NFIBRES+k]!=0)
+	  				J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; 
+      				}
+      				J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+      				J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
+    			}
+    			if(m_include_f0){
+				//partial_fsum ///////////
+	    			double partial_fsum=1.0;
+	    			for(int j=0;j<(NFIBRES);j++)
+					partial_fsum-=fs[j];
+	    			//////////////////////////
+    				double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum;
+    				//derivative with respect to f0
+    				J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; 
+				sig= params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig);
+    				J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals);
+    			}else{
+	    			sig = params[0]*((1-sumf)*Iso_term+sig);
+	    			J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals);
+    			}
+    			J[0] = sig/params[0]; 
 		}
-	}
-
-  	for(int i=0;i<NDIRECTIONS;i++){
-    		Iso_term=isoterm_PVM_single_c(i,_d,bvals);  //Precompute some terms for this datapoint
-    		for(int k=0;k<NFIBRES;k++){
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-      			xx.z=x[k*3+2];	
-      			Aniso_terms[k]=anisoterm_PVM_single_c(i,_d,xx,bvecs,bvals);
-    		}
-    		sig = 0;
-    		for(int a=0;a<nparams;a++) J[a]=0;
-    		for(int k=0;k<NFIBRES;k++){
-      			int kk = 2+3*(k);
-      			xx.x=x[k*3];
-      			xx.y=x[k*3+1];
-      			xx.z=x[k*3+2];		 
-      			sig += fs[k]*Aniso_terms[k];
-      			J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(i,params[1],xx,bvecs,bvals);	 
-      			J[kk] = 0;
-      			for (int j=0; j<NFIBRES; j++){
-				if (f_deriv[j*NFIBRES+k]!=0)
-	  			J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; 
-      			}
-      			J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-      			J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals);
-    		}
-    		if(m_include_f0){
-			//partial_fsum ///////////
-	    		partial_fsum=1.0;
-	    		for(int j=0;j<(NFIBRES);j++)
-				partial_fsum-=fs[j];
-	    		//////////////////////////
-    			double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum;
-    			//derivative with respect to f0
-    			J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; 
-			sig= params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig);
-    			J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(i,params[1],bvals);
-    		}else{
-	    		sig = params[0]*((1-sumf)*Iso_term+sig);
-	    		J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(i,params[1],bvals);
-    		}
-    		J[0] = sig/params[0]; 
 
 		for (int p=0;p<nparams;p++){
 			for (int p2=p;p2<nparams;p2++){ 
-				hess[p*nparams+p2] += 2*(J[p]*J[p2]);
+
+				shared[idB]=2*(J[p]*J[p2]);
+				__syncthreads();
+				if(idB==0){
+					for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
+						hess[p*nparams+p2] += shared[i];
+					}
+				}
+				__syncthreads(); 
 			}
 		}
+		dir_iter+=THREADS_X_BLOCK_FIT;
   	}
 
-  	for (int j=0; j<nparams; j++) {
-    		for (int i=j+1; i<nparams; i++) {
-     			hess[i*nparams+j]=hess[j*nparams+i];	
-    		}
-  	}
+	if(idB==0){
+  		for (int j=0; j<nparams; j++) {
+    			for (int i=j+1; i<nparams; i++) {
+     				hess[i*nparams+j]=hess[j*nparams+i];	
+    			}
+  		}
+	}
 }
-
 //in diffmodel.cc
 extern "C" __global__ void fit_PVM_single_c_kernel(	//INPUT
 							const double* 		data, 
@@ -550,59 +511,72 @@ extern "C" __global__ void fit_PVM_single_c_kernel(	//INPUT
 							//INPUT - OUTPUT
 							double* 		params)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;	
-   	if (id >=nvox) { return; }	
-
-	int nparams;
-	if (m_include_f0)
-      		nparams = nfib*3 + 3; 
-    	else
-      		nparams = nfib*3 + 2;
-
-   	double myparams[NPARAMS];
-   	double mydata[NDIRECTIONS];
+	int idB = threadIdx.x;
+	int idVOX = blockIdx.x;
+
+	__shared__ double myparams[NPARAMS];
+	__shared__ int nparams;
+
+	__shared__ double shared[THREADS_X_BLOCK_FIT]; 
+	__shared__ double step[NPARAMS];
+	__shared__ double grad[NPARAMS];                          
+   	__shared__ double hess[NPARAMS*NPARAMS]; 	
+	__shared__ double inverse[NPARAMS];
+	__shared__ double pcf;
+	__shared__ double ncf;
+	__shared__ double lambda;
+	__shared__ double cftol;
+	__shared__ double ltol;
+	__shared__ double olambda;
+	__shared__ bool success;    
+	__shared__ bool end;    
+
+	__shared__ double fs[NFIBRES];
+	__shared__ double f_deriv[NFIBRES*NFIBRES];	
+  	__shared__ double x[NFIBRES*3];	
+	__shared__ double _d;
+  	__shared__ double sumf;
+
+	if(idB==0){
+		if (m_include_f0)
+      			nparams = nfib*3 + 3; 
+    		else
+      			nparams = nfib*3 + 2;
 
-	for(int i=0;i<nparams;i++){
-		myparams[i]=params[(id*nparams)+i];
-   	}
+		for(int i=0;i<nparams;i++){
+			myparams[i]=params[(idVOX*nparams)+i];
+   		}
+	}
 
-   	for(int i=0;i<NDIRECTIONS;i++){
-		mydata[i]=data[(id*NDIRECTIONS)+i];
-   	}
+	__syncthreads();
 
-	//if(id==1) for(int i=0;i<nparams;i++)printf("START[%i]: %.20f\n",i,myparams[i]);
 	//do the fit
-	levenberg_marquardt_PVM_single_c_gpu(mydata, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams, m_include_f0, myparams);
+	levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,f_deriv,x,_d,sumf,myparams);
 
-	//double m_BIC;
-	//if (m_eval_BIC){  
-    	//	double RSS= cf_PVM_single_c(myparams,mydata,&bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams,m_include_f0); // get the sum of squared residuals
-    	//	m_BIC=NDIRECTIONS*log(double(RSS/NDIRECTIONS))+log(double(NDIRECTIONS))*nparams; // evaluate BIC
-  	//}   NOT USED at the moment
+	__syncthreads();
 
 	// finalise parameters
 	// m_s0-myparams[0] 	m_d-myparams[1] 	m_f-m_th-m_ph-myparams[2,3,4,5, etc..]   	m_f0-myparams[nparams-1]
 	
-	double m_f[NFIBRES]; 					// for partial_fsum
-
-  	myparams[1] = lambda2d_gpu(myparams[1]); 
-  	for(int k=0;k<nfib;k++){
-    		int kk = 2 + 3*(k);
-    		myparams[kk]  = beta2f_gpu(myparams[kk])*partial_fsum_PVM_single_c(m_f,k);
-		m_f[k]=myparams[kk];
-  	}
+	if(idB==0){
+		double m_f[NFIBRES]; 					// for partial_fsum
+
+  		myparams[1] = lambda2d_gpu(myparams[1]); 
+  		for(int k=0;k<nfib;k++){
+    			int kk = 2 + 3*(k);
+    			myparams[kk]  = beta2f_gpu(myparams[kk])*partial_fsum_PVM_single_c(m_f,k);
+			m_f[k]=myparams[kk];
+  		}
   
-  	//if (m_return_fanning)
-    		//Fanning_angles_from_Hessian(); NOT USED at the moment
-  
-  	if (m_include_f0)
-    		myparams[nparams-1]= beta2f_gpu(myparams[nparams-1])*partial_fsum_PVM_single_c(m_f,nfib);
+  		if (m_include_f0)
+    			myparams[nparams-1]= beta2f_gpu(myparams[nparams-1])*partial_fsum_PVM_single_c(m_f,nfib);
 
-	sort_PVM_single_c(nfib,nparams,myparams);
+		sort_PVM_single_c(nfib,myparams);
 
-	for(int i=0;i<nparams;i++){
-		params[(id*nparams)+i] = myparams[i];
-   	}
+		for(int i=0;i<nparams;i++){
+			params[(idVOX*nparams)+i] = myparams[i];
+   		}
+	}
 }
 
 //in diffmodel.cc
@@ -618,34 +592,125 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel(	//INPUT
 								//OUTPUT
 								double*			residuals)
 {
-	int id = blockIdx.x * blockDim.x + threadIdx.x;	
-   	if (id >=nvox) { return; }	
+	int idB = threadIdx.x;
+	int idVOX = blockIdx.x;
+
+	__shared__ double myparams[NPARAMS];
+	__shared__ int nparams;
+	__shared__ bool my_include_f0;
+	__shared__ double val;
+  	__shared__ double _d;
+  	__shared__ double fs[NFIBRES];
+  	__shared__ double x[NFIBRES*3];	
+  	__shared__ double sumf;
+
+	double predicted_signal;
+	double mydata;
+
+	if(idB==0){
+		if (m_include_f0)
+      			nparams = nfib*3 + 3; 
+    		else
+      			nparams = nfib*3 + 2;
 
-	int nparams;
-	if (m_include_f0)
-      		nparams = nfib*3 + 3; 
-    	else
-      		nparams = nfib*3 + 2;
+		my_include_f0 = includes_f0[idVOX];
 
-	bool my_include_f0 = includes_f0[id];
+		//m_s0-myparams[0]  m_d-myparams[1]  m_d_std-myparams[2]  m_f-m_th-m_ph-myparams[3,4,5,6 etc..]  m_f0-myparams[nparams-1]
+  		
+  		myparams[0]=params[(idVOX*nparams)+0];
+		if(myparams[1]<0)  myparams[1] = 0;	//This can be due to numerical errors..sqrt
+  		else myparams[1] = d2lambda_gpu(params[(idVOX*nparams)+1]);
 
-   	double myparams[NPARAMS];
-   	double mydata[NDIRECTIONS];
+		double partial_fsum;	
+  		for(int k=0;k<nfib;k++){
+    			int kk = 2+3*k;
+			//partial_fsum ///////////
+			partial_fsum=1.0;
+			for(int j=0;j<k;j++)
+				partial_fsum-=fs[j];
+	     		//////////////////////////
+			fs[k] = params[(idVOX*nparams)+kk];
+			double tmpr=fs[k]/partial_fsum;
+    			if (tmpr>1.0) tmpr=1; //This can be due to numerical errors
+			if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt
+    			myparams[kk]   = f2beta_gpu(tmpr);
+    			myparams[kk+1] = params[(idVOX*nparams)+kk+1];
+    			myparams[kk+2] = params[(idVOX*nparams)+kk+2];
+  		}
+  		if (my_include_f0){
+			//partial_fsum ///////////
+			partial_fsum=1.0;
+			for(int j=0;j<NFIBRES;j++)
+				partial_fsum-=fs[j];
+	     		//////////////////////////	
+			double tmpr=params[(idVOX*nparams)+nparams-1]/partial_fsum;
+    			if (tmpr>1.0) tmpr=1; //This can be due to numerical errors..asin
+			if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt
+    			myparams[nparams-1]= f2beta_gpu(tmpr);	
+		}
+	}
 
-	for(int i=0;i<nparams;i++){
-		myparams[i]=params[(id*nparams)+i];
-   	}
+	__syncthreads();
 
-   	for(int i=0;i<NDIRECTIONS;i++){
-		mydata[i]=data[(id*NDIRECTIONS)+i];
-   	}
+	if(idB<nfib){
+		int kk = 2+3*idB;
+		double sinth,costh,sinph,cosph;
+		sincos(myparams[kk+1],&sinth,&costh);
+		sincos(myparams[kk+2],&sinph,&cosph);
+    		x[idB*3] = sinth*cosph;
+    		x[idB*3+1] = sinth*sinph;
+    		x[idB*3+2] = costh;
+  	}
 
-	double predicted_signal[NDIRECTIONS];
+	if(idB==0){
+		double partial_fsum;	
+		sumf=0;
+		for(int k=0;k<nfib;k++){
+    			int kk = 2+3*k;
+			////// partial_fsum //////
+			partial_fsum=1.0;
+			for(int j=0;j<k;j++)
+				partial_fsum-=fs[j];
+    			//////////////////////////
+	    		fs[k] = beta2f_gpu(myparams[kk])*partial_fsum;
+	    		sumf += fs[k];
+		}
+		_d = lambda2d_gpu(myparams[1]);
+	}
 
-	get_prediction_PVM_single_c(myparams, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nfib, nparams, my_include_f0, predicted_signal);
+	int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
+	if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
+	
+	double3 x2;
+	int dir_iter=idB; 
+
+	__syncthreads();
 
-	for(int i=0;i<NDIRECTIONS;i++){		//residuals=m_data-predicted_signal;
-		residuals[id*NDIRECTIONS+i]= mydata[i] - predicted_signal[i];
+	for(int dir=0;dir<ndir;dir++){
+		mydata = data[(idVOX*NDIRECTIONS)+dir_iter];
+		predicted_signal=0;	//pred = 0;
+		val = 0.0;
+    		for(int k=0;k<nfib;k++){
+			x2.x=x[k*3];
+			x2.y=x[k*3+1];
+			x2.z=x[k*3+2];	 
+      			val += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]);
+    		}	
+    		if (my_include_f0){
+			//partial_fsum ///////////
+			double partial_fsum=1.0;
+			for(int j=0;j<NFIBRES;j++)
+				partial_fsum-=fs[j];
+	     		//////////////////////////
+      			double temp_f0= beta2f_gpu(myparams[nparams-1])*partial_fsum;
+      			predicted_signal = myparams[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val);
+    		} 
+    		else
+      			predicted_signal = myparams[0]*((1-sumf)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); 
+
+		//residuals=m_data-predicted_signal;
+		residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal;
+
+		dir_iter+=THREADS_X_BLOCK_FIT;
 	}
 }
-
diff --git a/CUDA/levenberg_marquardt.cu b/CUDA/levenberg_marquardt.cu
index d13cca0..03f09da 100644
--- a/CUDA/levenberg_marquardt.cu
+++ b/CUDA/levenberg_marquardt.cu
@@ -11,6 +11,7 @@
 
 #include "solver_mult_inverse.cu"
 #include "diffmodels.cuh"
+#include "dim_blocks.h"
 #include "options.h"
 
 //CPU version in nonlin.h
@@ -27,72 +28,93 @@ __device__ void levenberg_marquardt_PVM_single_gpu(	//INPUT
 							const double*		bvals, 
 							const int 		nparams,
 							const bool 		m_include_f0,
+							const int		idB,
+							double* 		step,		//shared memory
+							double*			grad,           //shared memory     	          
+						   	double* 		hess,		//shared memory
+							double* 		inverse,	//shared memory
+							double 			&pcf,		//shared memory
+							double 			&ncf,		//shared memory
+							double 			&lambda,	//shared memory
+							double 			&cftol,		//shared memory
+							double 			&ltol,		//shared memory
+							double 			&olambda,	//shared memory
+							bool 			&success,    	//shared memory
+							bool 			&end,    	//shared memory
+							double*			shared,		//shared memory
+							double* 		fs,		//shared memory
+						  	double*			x,		//shared memory
+							double 			&_d,		//shared memory
+						  	double 			&sumf,		//shared memory
 							//INPUT-OUTPUT
-							double*			myparams)
+							double*			myparams)	//shared memory
 {
-   	double pcf;
-   	double lambda=0.1;
-   	double cftol=1.0e-8;
-   	double ltol=1.0e20;                  
-
-   	bool success = true;             
-   	double olambda = 0.0;              
-   	double grad[NPARAMS];                          
-   	double hess[NPARAMS*NPARAMS];   
-   	double inverse[NPARAMS];
-   	double step[NPARAMS];	
-
-   	double ncf=0;
-
-   	int maxiter=200;
-   	int niter=0;     
-
-   	pcf=cf_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0); 
-	
-   	while (!(success&&niter++ >= maxiter)){ 	//if success we not increase niter (first condition is true)
-							//function cost has decreise, we have advanced.
+	int niter=0; 
+	int maxiter=200;
+
+   	if(idB==0){
+		end=false;
+   		lambda=0.1;
+   		cftol=1.0e-8;
+   		ltol=1.0e20;                  
+   		success = true;               
+   		olambda = 0.0;              
+   		ncf=0;  
+	}
+
+   	cf_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf);  
+	__syncthreads();
+
+   	while (!(success&&niter++>=maxiter)){ 	//if success we not increase niter (first condition is true)
+						//function cost has decreise, we have advanced.
    		if(success){
-    			grad_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad);   
-    			hess_PVM_single(myparams,bvecs,bvals,nparams,m_include_f0,hess);  
-    		}
-
-    		for (int i=0; i<nparams; i++) {                         
-			hess[(i*nparams)+i]+=lambda-olambda;	
+    			grad_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,grad); 
+			__syncthreads(); 
+    			hess_PVM_single(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,hess);  
     		}
 
-    		solver(hess,grad,nparams,inverse);
-
-    		for (int i=0;i<nparams;i++){
-			step[i]=-inverse[i];		
-    		}
+		if(idB==0){
+    			for (int i=0; i<nparams; i++) {                         
+				hess[(i*nparams)+i]+=lambda-olambda;	//Levenberg LM_L
+    			}
 
-   		for(int i=0;i<nparams;i++){
-			step[i]=myparams[i]+step[i];
-   		}
+    			solver(hess,grad,nparams,inverse);
 
-   		ncf = cf_PVM_single(step,mydata,bvecs,bvals,nparams,m_include_f0);
+    			for (int i=0;i<nparams;i++){
+				step[i]=-inverse[i];		
+    			}
 
-   		if (success = (ncf < pcf)) {
-			olambda = 0.0;
-        		for(int i=0;i<nparams;i++){
-				myparams[i]=step[i];
+   			for(int i=0;i<nparams;i++){
+				step[i]=myparams[i]+step[i];
    			}
-        		lambda=lambda/10.0;
-
-			if (zero_cf_diff_conv(pcf,ncf,cftol)){
-				return;
-			}
-			pcf=ncf;
-    		}else{
-			olambda=lambda;
-			lambda=lambda*10.0;
-			if(lambda> ltol){ 
-				return;
+		}
+		
+		__syncthreads();
+   		cf_PVM_single(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf); 
+
+		if(idB==0){
+   			if (success = (ncf < pcf)){ 
+				olambda = 0.0;
+        			for(int i=0;i<nparams;i++){
+					myparams[i]=step[i];
+   				}
+        			lambda=lambda/10.0;
+
+				if (zero_cf_diff_conv(pcf,ncf,cftol)){
+					end=true;
+				}
+				pcf=ncf;
+    			}else{
+				olambda=lambda;
+				lambda=lambda*10.0;
+				if(lambda> ltol){ 
+					end=true;
+				}
 			}
     		}	
+		__syncthreads();
+		if(end) return;		
    	}
-	
-	return;
 }
 
 __device__ void levenberg_marquardt_PVM_single_c_gpu(	//INPUT
@@ -101,72 +123,94 @@ __device__ void levenberg_marquardt_PVM_single_c_gpu(	//INPUT
 							const double*		bvals, 
 							const int 		nparams,
 							const bool 		m_include_f0,
+							const int		idB,
+							double* 		step,		//shared memory
+							double*			grad,           //shared memory     	          
+						   	double* 		hess,		//shared memory
+							double* 		inverse,	//shared memory
+							double 			&pcf,		//shared memory
+							double 			&ncf,		//shared memory
+							double 			&lambda,	//shared memory
+							double 			&cftol,		//shared memory
+							double 			&ltol,		//shared memory
+							double 			&olambda,	//shared memory
+							bool 			&success,    	//shared memory
+							bool 			&end,    	//shared memory
+							double*			shared,		//shared memory
+							double* 		fs,		//shared memory
+							double*			f_deriv,	//shared memory
+						  	double*			x,		//shared memory
+							double 			&_d,		//shared memory
+						  	double 			&sumf,		//shared memory
 							//INPUT-OUTPUT
-							double*			myparams)
+							double*			myparams)	//shared memory
 {
-   	double pcf;
-   	double lambda=0.1;
-   	double cftol=1.0e-8;
-   	double ltol=1.0e20;                  
-
-   	bool success = true;             
-   	double olambda = 0.0;              
-   	double grad[NPARAMS];                          
-   	double hess[NPARAMS*NPARAMS];   
-   	double inverse[NPARAMS];
-   	double step[NPARAMS];	
-
-   	double ncf=0;
-
-   	int maxiter=200;
-   	int niter=0;     
-
-   	pcf=cf_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0);
+	int niter=0; 
+	int maxiter=200;
+
+   	if(idB==0){
+		end=false;
+   		lambda=0.1;
+   		cftol=1.0e-8;
+   		ltol=1.0e20;                  
+   		success = true;               
+   		olambda = 0.0;              
+   		ncf=0;  
+	}
+			
+	cf_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf);  
+	__syncthreads();
 	
    	while (!(success&&niter++ >= maxiter)){ 	//if success we not increase niter (first condition is true)
 							//function cost has decreise, we have advanced.
    		if(success){
-    			grad_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad);   
-    			hess_PVM_single_c(myparams,bvecs,bvals,nparams,m_include_f0,hess);  
+			grad_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,grad);  
+			__syncthreads();
+    			hess_PVM_single_c(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,hess);  
     		}
 
-    		for (int i=0; i<nparams; i++) {                         
-			hess[(i*nparams)+i]+=lambda-olambda;	
-    		}
+		if(idB==0){
+    			for (int i=0; i<nparams; i++) {                         
+				hess[(i*nparams)+i]+=lambda-olambda;	//Levenberg LM_L
+    			}
 
-    		solver(hess,grad,nparams,inverse);
+    			solver(hess,grad,nparams,inverse);
 
-    		for (int i=0;i<nparams;i++){
-			step[i]=-inverse[i];		
-    		}
+    			for (int i=0;i<nparams;i++){
+				step[i]=-inverse[i];		
+    			}
 
-   		for(int i=0;i<nparams;i++){
-			step[i]=myparams[i]+step[i];
-   		}
-
-   		ncf = cf_PVM_single_c(step,mydata,bvecs,bvals,nparams,m_include_f0);
-		
-   		if (success = (ncf < pcf)) {
-			olambda = 0.0;
-        		for(int i=0;i<nparams;i++){
-				myparams[i]=step[i];
+   			for(int i=0;i<nparams;i++){
+				step[i]=myparams[i]+step[i];
    			}
-        		lambda=lambda/10.0;
-
-			if (zero_cf_diff_conv(pcf,ncf,cftol)){
-				return;
-			}
-			pcf=ncf;
-    		}else{
-			olambda=lambda;
-			lambda=lambda*10.0;
-			if(lambda> ltol){ 
-				return;
-			}
-    		}	
+		}
+
+		__syncthreads();
+   		cf_PVM_single_c(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf); 
+
+		if(idB==0){
+   			if (success = (ncf < pcf)) {
+				olambda = 0.0;
+        			for(int i=0;i<nparams;i++){
+					myparams[i]=step[i];
+   				}
+        			lambda=lambda/10.0;
+
+				if (zero_cf_diff_conv(pcf,ncf,cftol)){
+					end=true;
+				}
+				pcf=ncf;
+    			}else{
+				olambda=lambda;
+				lambda=lambda*10.0;
+				if(lambda> ltol){ 
+					end=true;
+				}
+    			}
+		}
+		__syncthreads();
+		if(end) return;		
    	}
-	
-	return;
 }
 
 
@@ -176,71 +220,93 @@ __device__ void levenberg_marquardt_PVM_multi_gpu(	//INPUT
 							const double*		bvals, 
 							const int 		nparams,
 							const bool 		m_include_f0,
+							const int		idB,
+							double* 		step,		//shared memory
+							double*			grad,           //shared memory     	          
+						   	double* 		hess,		//shared memory
+							double* 		inverse,	//shared memory
+							double 			&pcf,		//shared memory
+							double 			&ncf,		//shared memory
+							double 			&lambda,	//shared memory
+							double 			&cftol,		//shared memory
+							double 			&ltol,		//shared memory
+							double 			&olambda,	//shared memory
+							bool 			&success,    	//shared memory
+							bool 			&end,    	//shared memory
+							double*			shared,		//shared memory
+							double* 		fs,		//shared memory
+						  	double*			x,		//shared memory
+							double 			&_a,		//shared memory
+							double 			&_b,		//shared memory
+						  	double 			&sumf,		//shared memory
 							//INPUT-OUTPUT
-							double*			myparams)
+							double*			myparams)	//shared memory
 {
-   	double pcf;
-   	double lambda=0.1;
-   	double cftol=1.0e-8;
-   	double ltol=1.0e20;                  
-
-   	bool success = true;             
-   	double olambda = 0.0;              
-   	double grad[NPARAMS];                          
-   	double hess[NPARAMS*NPARAMS];   
-   	double inverse[NPARAMS];
-   	double step[NPARAMS];	
-
-   	double ncf=0;
-
-   	int maxiter=200;
-   	int niter=0;     
-
-   	pcf=cf_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0);
+	int niter=0; 
+	int maxiter=200;
+
+   	if(idB==0){
+		end=false;
+   		lambda=0.1;
+   		cftol=1.0e-8;
+   		ltol=1.0e20;                  
+   		success = true;               
+   		olambda = 0.0;              
+   		ncf=0;  
+	}
+
+	cf_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,pcf);  
+	__syncthreads();
 	
    	while (!(success&&niter++ >= maxiter)){ 	//if success we not increase niter (first condition is true)
 							//function cost has decreise, we have advanced.
    		if(success){
-    			grad_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad);   
-    			hess_PVM_multi(myparams,bvecs,bvals,nparams,m_include_f0,hess);  
-    		}
-
-    		for (int i=0; i<nparams; i++) {                         
-			hess[(i*nparams)+i]+=lambda-olambda;	
+			grad_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,grad);  
+			__syncthreads(); 
+    			hess_PVM_multi(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,hess);  
     		}
 
-    		solver(hess,grad,nparams,inverse);
-
-    		for (int i=0;i<nparams;i++){
-			step[i]=-inverse[i];		
-    		}
+		if(idB==0){
+    			for (int i=0; i<nparams; i++) {                         
+				hess[(i*nparams)+i]+=lambda-olambda;	//Levenberg LM_L
+    			}
 
-   		for(int i=0;i<nparams;i++){
-			step[i]=myparams[i]+step[i];
-   		}
+    			solver(hess,grad,nparams,inverse);
 
-   		ncf = cf_PVM_multi(step,mydata,bvecs,bvals,nparams,m_include_f0);
+    			for (int i=0;i<nparams;i++){
+				step[i]=-inverse[i];		
+    			}
 
-   		if (success = (ncf < pcf)) {
-			olambda = 0.0;
-        		for(int i=0;i<nparams;i++){
-				myparams[i]=step[i];
+   			for(int i=0;i<nparams;i++){
+				step[i]=myparams[i]+step[i];
    			}
-        		lambda=lambda/10.0;
-
-			if (zero_cf_diff_conv(pcf,ncf,cftol)){
-				return;
-			}
-			pcf=ncf;
-    		}else{
-			olambda=lambda;
-			lambda=lambda*10.0;
-			if(lambda> ltol){ 
-				return;
-			}
-    		}	
+		}
+
+		__syncthreads();
+   		cf_PVM_multi(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,ncf); 
+
+		if(idB==0){
+   			if (success = (ncf < pcf)) {
+				olambda = 0.0;
+        			for(int i=0;i<nparams;i++){
+					myparams[i]=step[i];
+   				}
+        			lambda=lambda/10.0;
+
+				if (zero_cf_diff_conv(pcf,ncf,cftol)){
+					end=true;
+				}
+				pcf=ncf;
+    			}else{
+				olambda=lambda;
+				lambda=lambda*10.0;
+				if(lambda> ltol){ 
+					end=true;
+				}
+    			}
+		}
+		__syncthreads();
+		if(end) return;				
    	}
-	
-	return;
 }
 #endif
-- 
GitLab