From 98f57485e8412d06e08c28f59f96e19ec3687998 Mon Sep 17 00:00:00 2001
From: Moises Fernandez <moisesf@fmrib.ox.ac.uk>
Date: Fri, 19 Apr 2013 15:19:06 +0000
Subject: [PATCH] Avoid extra copies of bvals-bvecs if consider gradient
 nonlinearities is not activated

---
 CUDA/PVM_multi.cu       | 29 ++++++++++++++++----
 CUDA/PVM_single.cu      | 29 ++++++++++++++++----
 CUDA/PVM_single_c.cu    | 29 ++++++++++++++++----
 CUDA/diffmodels.cuh     |  4 +++
 CUDA/fit_gpu_kernels.h  | 12 ++++++---
 CUDA/runmcmc.cu         | 16 +++++++----
 CUDA/runmcmc_kernels.cu | 60 ++++++++++++++++++++++++++++++++++-------
 7 files changed, 147 insertions(+), 32 deletions(-)

diff --git a/CUDA/PVM_multi.cu b/CUDA/PVM_multi.cu
index e00c9d3..1e15913 100644
--- a/CUDA/PVM_multi.cu
+++ b/CUDA/PVM_multi.cu
@@ -400,6 +400,7 @@ extern "C" __global__ void fit_PVM_multi_kernel(	//INPUT
 							const int 		nfib, 	
 							const int		nparams,			
 							const bool 		m_include_f0,
+							const bool		gradnonlin,
 							//OUTPUT
 							double* 		params)
 {
@@ -454,8 +455,16 @@ extern "C" __global__ void fit_PVM_multi_kernel(	//INPUT
 
 	__syncthreads();
 
+	int pos_bvals, pos_bvecs;
+	if(gradnonlin){ 
+		pos_bvals=idVOX*ndirections;
+		pos_bvecs=idVOX*3*ndirections;
+	}else{ 
+		pos_bvals=0;
+		pos_bvecs=0;
+	}
   	//do the fit
-	levenberg_marquardt_PVM_multi_gpu(&data[idVOX*ndirections],&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_a,_b,sumf,C,el,indx,myparams);
+	levenberg_marquardt_PVM_multi_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_a,_b,sumf,C,el,indx,myparams);
 
 	__syncthreads();
 
@@ -494,7 +503,8 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel(	//INPUT
 								const int 		nfib, 
 								const int		nparams,
 								const bool 		m_include_f0,
-								const bool* 		includes_f0,
+								const bool		gradnonlin,
+								const bool* 		includes_f0,								
 								//OUTPUT
 								double*			residuals)
 {
@@ -567,6 +577,15 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel(	//INPUT
 
 	__syncthreads();
 
+	int pos_bvals, pos_bvecs;
+	if(gradnonlin){ 
+		pos_bvals=idVOX*ndirections;
+		pos_bvecs=idVOX*3*ndirections;
+	}else{ 
+		pos_bvals=0;
+		pos_bvecs=0;
+	}
+
   	for(int dir=0;dir<ndir;dir++){
 		mydata = data[(idVOX*ndirections)+dir_iter];
   		predicted_signal=0;	//pred = 0;
@@ -575,13 +594,13 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel(	//INPUT
 			x2.x=x[k*3];
 			x2.y=x[k*3+1];
 			x2.z=x[k*3+2];	 
-      			val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections);
+      			val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections);
     		}	
     		if (*my_include_f0){
       			double temp_f0=x2f_gpu(myparams[nparams-1]);
-      			predicted_signal = abs(myparams[0])*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*ndirections])+val);
+      			predicted_signal = abs(myparams[0])*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[pos_bvals])+val);
     		}else{
-      			predicted_signal = abs(myparams[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*ndirections])+val); 
+      			predicted_signal = abs(myparams[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[pos_bvals])+val); 
   		}   
 
 		//residuals=m_data-predicted_signal;
diff --git a/CUDA/PVM_single.cu b/CUDA/PVM_single.cu
index ecdf6c9..667e73a 100644
--- a/CUDA/PVM_single.cu
+++ b/CUDA/PVM_single.cu
@@ -390,7 +390,8 @@ extern "C" __global__ void fit_PVM_single_kernel(	//INPUT
 							const int		ndirections,
 							const int 		nfib, 
 							const int		nparams,
-							const bool 		m_include_f0, 
+							const bool 		m_include_f0,
+							const bool		gradnonlin,
 							//INPUT-OUTPUT
 							double* 		params)
 {
@@ -432,8 +433,16 @@ extern "C" __global__ void fit_PVM_single_kernel(	//INPUT
 
 	__syncthreads();
 
+	int pos_bvals, pos_bvecs;
+	if(gradnonlin){ 
+		pos_bvals=idVOX*ndirections;
+		pos_bvecs=idVOX*3*ndirections;
+	}else{ 
+		pos_bvals=0;
+		pos_bvecs=0;
+	}
 	// do the fit
-	levenberg_marquardt_PVM_single_gpu(&data[idVOX*ndirections],&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_d,sumf,C,el,indx,myparams);
+	levenberg_marquardt_PVM_single_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_d,sumf,C,el,indx,myparams);
 
 	__syncthreads();
 	
@@ -470,6 +479,7 @@ extern "C" __global__ void get_residuals_PVM_single_kernel(	//INPUT
 								const int 		nfib, 
 								const int		nparams,
 								const bool 		m_include_f0,
+								const bool		gradnonlin,
 								const bool* 		includes_f0,
 								//OUTPUT
 								double*			residuals)
@@ -537,6 +547,15 @@ extern "C" __global__ void get_residuals_PVM_single_kernel(	//INPUT
 
 	__syncthreads();
 
+	int pos_bvals, pos_bvecs;
+	if(gradnonlin){ 
+		pos_bvals=idVOX*ndirections;
+		pos_bvecs=idVOX*3*ndirections;
+	}else{ 
+		pos_bvals=0;
+		pos_bvecs=0;
+	}
+
 	for(int dir=0;dir<ndir;dir++){
 		mydata = data[(idVOX*ndirections)+dir_iter];
   		predicted_signal=0;	//pred = 0;
@@ -545,13 +564,13 @@ extern "C" __global__ void get_residuals_PVM_single_kernel(	//INPUT
 			x2.x=x[k*3];
 			x2.y=x[k*3+1];
 			x2.z=x[k*3+2];	 
-      			val += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections);
+      			val += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections);
     		}	
     		if (*my_include_f0){
       			double temp_f0=x2f_gpu(myparams[nparams-1]);
-      			predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*ndirections])+val);
+      			predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,&bvals[pos_bvals])+val);
     		}else{
-      			predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*ndirections])+val); 
+      			predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,&bvals[pos_bvals])+val); 
 		}
 	
 		//residuals=m_data-predicted_signal;
diff --git a/CUDA/PVM_single_c.cu b/CUDA/PVM_single_c.cu
index a23aa79..04864e8 100644
--- a/CUDA/PVM_single_c.cu
+++ b/CUDA/PVM_single_c.cu
@@ -505,6 +505,7 @@ extern "C" __global__ void fit_PVM_single_c_kernel(	//INPUT
 							const bool		m_eval_BIC,
 							const bool 		m_include_f0,
 							const bool	 	m_return_fanning,
+							const bool		gradnonlin,
 							//INPUT - OUTPUT
 							double* 		params)
 {
@@ -547,8 +548,16 @@ extern "C" __global__ void fit_PVM_single_c_kernel(	//INPUT
 
 	__syncthreads();
 
+	int pos_bvals, pos_bvecs;
+	if(gradnonlin){ 
+		pos_bvals=idVOX*ndirections;
+		pos_bvecs=idVOX*3*ndirections;
+	}else{ 
+		pos_bvals=0;
+		pos_bvecs=0;
+	}
 	//do the fit
-	levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*ndirections],&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,f_deriv,x,_d,sumf,C,el,indx,myparams);
+	levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,f_deriv,x,_d,sumf,C,el,indx,myparams);
 
 	__syncthreads();
 
@@ -596,7 +605,8 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel(	//INPUT
 								const int 		nfib, 
 								const int		nparams,
 								const bool 		m_include_f0,
-								const bool* 		includes_f0,
+								const bool		gradnonlin,
+								const bool* 		includes_f0,								
 								//OUTPUT
 								double*			residuals)
 {
@@ -693,6 +703,15 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel(	//INPUT
 
 	__syncthreads();
 
+	int pos_bvals, pos_bvecs;
+	if(gradnonlin){ 
+		pos_bvals=idVOX*ndirections;
+		pos_bvecs=idVOX*3*ndirections;
+	}else{ 
+		pos_bvals=0;
+		pos_bvecs=0;
+	}
+
 	for(int dir=0;dir<ndir;dir++){
 		mydata = data[(idVOX*ndirections)+dir_iter];
 		predicted_signal=0;	//pred = 0;
@@ -701,7 +720,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel(	//INPUT
 			x2.x=x[k*3];
 			x2.y=x[k*3+1];
 			x2.z=x[k*3+2];	 
-      			val += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections);
+      			val += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections);
     		}	
     		if (*my_include_f0){
 			//partial_fsum ///////////
@@ -710,9 +729,9 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel(	//INPUT
 				partial_fsum-=fs[j];
 	     		//////////////////////////
       			double temp_f0= beta2f_gpu(myparams[nparams-1])*partial_fsum;
-      			predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*ndirections])+val);
+      			predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,&bvals[pos_bvals])+val);
     		}else{
-      			predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*ndirections])+val); 
+      			predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single_c(dir_iter,_d,&bvals[pos_bvals])+val); 
 		}
 
 		//residuals=m_data-predicted_signal;
diff --git a/CUDA/diffmodels.cuh b/CUDA/diffmodels.cuh
index 777543b..7601885 100644
--- a/CUDA/diffmodels.cuh
+++ b/CUDA/diffmodels.cuh
@@ -18,6 +18,7 @@ void fit_PVM_single(	//INPUT
 			int				ndirections,
 			int 				nfib,	
 			bool 				m_include_f0,
+			bool				gradnonlin,
 			string 				output_file,		
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu);
@@ -32,6 +33,7 @@ void fit_PVM_single_c(	//INPUT
 			int				ndirections,
 			int 				nfib,		
 			bool 				m_include_f0,
+			bool				gradnonlin,
 			string 				output_file,		
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu);
@@ -44,6 +46,7 @@ void fit_PVM_multi(	//INPUT
 			int				ndirections,	
 			int				nfib,
 			bool 				m_include_f0,
+			bool				gradnonlin,
 			string 				output_file,
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu);
@@ -60,6 +63,7 @@ void calculate_tau(	//INPUT
 			int 				model,
 			bool 				m_include_f0,
 			bool 				nonlin,
+			bool				gradnonlin,
 			string 				output_file,				
 			//OUTPUT
 			thrust::host_vector<float>&	tau);
diff --git a/CUDA/fit_gpu_kernels.h b/CUDA/fit_gpu_kernels.h
index 62e99bc..8065231 100644
--- a/CUDA/fit_gpu_kernels.h
+++ b/CUDA/fit_gpu_kernels.h
@@ -15,6 +15,7 @@ extern "C" __global__ void fit_PVM_single_kernel(	//INPUT
 							const int 		nfib,
 							const int		nparams, 
 							const bool 		m_include_f0, 
+							const bool		gradnonlin,
 							//INPUT-OUTPUT
 							double* 		params);
 
@@ -29,6 +30,7 @@ extern "C" __global__ void fit_PVM_single_c_kernel(	//INPUT
 							const bool		m_eval_BIC,
 							const bool 		m_include_f0,
 							const bool	 	m_return_fanning,
+							const bool		gradnonlin,
 							//INPUT-OUTPUT
 							double* 		params);
 
@@ -42,6 +44,7 @@ extern "C" __global__ void fit_PVM_multi_kernel(	//INPUT
 							const int 		nfib, 	
 							const int		nparams,	
 							const bool 		m_include_f0,
+							const bool		gradnonlin,
 							//OUTPUT
 							double* 		params);
 
@@ -55,7 +58,8 @@ extern "C" __global__ void get_residuals_PVM_single_kernel(	//INPUT
 								const int 		nfib, 
 								const int		nparams,
 								const bool 		m_include_f0,
-								const bool* 		includes_f0,
+								const bool		gradnonlin,
+								const bool* 		includes_f0,								
 								//OUTPUT
 								double*			residuals);
 
@@ -69,7 +73,8 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel(	//INPUT
 								const int 		nfib, 
 								const int		nparams,
 								const bool 		m_include_f0,
-								const bool* 		includes_f0,
+								const bool		gradnonlin,
+								const bool* 		includes_f0,								
 								//OUTPUT
 								double*			residuals);
 
@@ -84,7 +89,8 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel(	//INPUT
 								const int 		nfib, 
 								const int		nparams,
 								const bool 		m_include_f0,
-								const bool* 		includes_f0,
+								const bool		gradnonlin,
+								const bool* 		includes_f0,								
 								//OUTPUT
 								double*			residuals);
 
diff --git a/CUDA/runmcmc.cu b/CUDA/runmcmc.cu
index 3fd3360..13af13c 100644
--- a/CUDA/runmcmc.cu
+++ b/CUDA/runmcmc.cu
@@ -55,6 +55,8 @@ void init_Fibres_Multifibres(	//INPUT
 	if(opts.modelnum.value()==2) nparams_fit++;
 	if(opts.f0.value()) nparams_fit++;
 
+	bool gradnonlin = opts.grad_file.set();
+
 	int blocks = nvox; 
   	dim3 Dim_Grid_MCMC(blocks, 1);
   	dim3 Dim_Block_MCMC(THREADS_BLOCK_MCMC ,1);	///dimensions for MCMC
@@ -74,7 +76,7 @@ void init_Fibres_Multifibres(	//INPUT
 
 	myfile << "Shared Memory Used in init_Fibres_Multifibres: " << amount_shared << "\n";
 
-	init_Fibres_Multifibres_kernel<<< Dim_Grid_MCMC, Dim_Block_MCMC, amount_shared>>>(datam_ptr, params_ptr, tau_ptr, bvals_ptr, alpha_ptr, beta_ptr, ndirections, nfib, nparams_fit, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.rician.value(), opts.ardf0.value(), opts.all_ard.value(), opts.no_ard.value(), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
+	init_Fibres_Multifibres_kernel<<< Dim_Grid_MCMC, Dim_Block_MCMC, amount_shared>>>(datam_ptr, params_ptr, tau_ptr, bvals_ptr, alpha_ptr, beta_ptr, ndirections, nfib, nparams_fit, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.rician.value(), opts.ardf0.value(), opts.all_ard.value(), opts.no_ard.value(), gradnonlin, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
 	sync_check("init_Fibres_Multifibres_kernel");
 
 	gettimeofday(&t2,NULL);
@@ -118,6 +120,8 @@ void runmcmc_burnin(	//INPUT
    	int nfib= opts.nfibres.value();
 	int nparams;
 
+	bool gradnonlin=opts.grad_file.set();
+
 	if(opts.f0.value()) nparams=3+nfib*3;
 	else nparams=2+nfib*3;	
 	if(opts.modelnum.value()==2) nparams++;
@@ -220,7 +224,7 @@ void runmcmc_burnin(	//INPUT
 
 	   	gettimeofday(&t1,NULL);
 
-	   	runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
+	   	runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), gradnonlin, opts.updateproposalevery.value(), iters_step, (i*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
 	   	sync_check("runmcmc_burnin_kernel");
 
  	   	gettimeofday(&t2,NULL);
@@ -253,7 +257,7 @@ void runmcmc_burnin(	//INPUT
    	gettimeofday(&t1,NULL);
 
    	if(nvox!=0){
-		runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
+		runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), gradnonlin, opts.updateproposalevery.value(), last_step, (steps*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
    		sync_check("runmcmc_burnin_kernel");
    	}
 
@@ -319,6 +323,8 @@ void runmcmc_record(	//INPUT
    	int nfib= opts.nfibres.value();
 	int nparams;
 
+	bool gradnonlin=opts.grad_file.set();
+
 	if(opts.f0.value()) nparams=3+nfib*3;
 	else nparams=2+nfib*3;	
 	if(opts.modelnum.value()==2) nparams++;
@@ -430,7 +436,7 @@ void runmcmc_record(	//INPUT
 
 	   	gettimeofday(&t1,NULL);
 
-	   	runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr);   
+	   	runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), gradnonlin, opts.updateproposalevery.value(), iters_step, (i*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr);   
 	   	sync_check("runmcmc_record_kernel");
 
  	   	gettimeofday(&t2,NULL);
@@ -460,7 +466,7 @@ void runmcmc_record(	//INPUT
    	gettimeofday(&t1,NULL);
 
    	if(nvox!=0){
-		runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords,rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr);   
+		runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), gradnonlin, opts.updateproposalevery.value(), last_step, (steps*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords,rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr);   
    		sync_check("runmcmc_record_kernel");
    	}
 
diff --git a/CUDA/runmcmc_kernels.cu b/CUDA/runmcmc_kernels.cu
index dcd4153..5b28537 100644
--- a/CUDA/runmcmc_kernels.cu
+++ b/CUDA/runmcmc_kernels.cu
@@ -146,6 +146,7 @@ extern "C" __global__ void init_Fibres_Multifibres_kernel(	//INPUT
 								const bool 			m_ardf0,	// opts.ardf0.value()
 								const bool 			ard_value,	// opts.all_ard.value()
 								const bool 			no_ard_value,	// opts.no_ard.value()
+								const bool			gradnonlin,
 								//OUTPUT
 								FibreGPU*			fibres,
 								MultifibreGPU*			multifibres,
@@ -297,9 +298,22 @@ extern "C" __global__ void init_Fibres_Multifibres_kernel(	//INPUT
 	for(int i=0; i<mydirs; i++){	
 		int pos = (idVOX*ndirections)+idSubVOX+i*threadsBlock;
 		mydata[i] = datam[pos];
-		mybvals[i] = bvals[pos];
-		myalpha[i] = alpha[pos];
-		mybeta[i] = beta[pos];
+	}
+	if(gradnonlin){
+		for(int i=0; i<mydirs; i++){	
+			int pos = (idVOX*ndirections)+idSubVOX+i*threadsBlock;
+			mybvals[i] = bvals[pos];
+			myalpha[i] = alpha[pos];
+			mybeta[i] = beta[pos];
+		}
+	}else{
+		for(int i=0; i<mydirs; i++){	
+			int pos = idSubVOX+i*threadsBlock;
+			mybvals[i] = bvals[pos];
+			myalpha[i] = alpha[pos];
+			mybeta[i] = beta[pos];
+		}
+
 	}
 
 	__syncthreads();
@@ -413,6 +427,7 @@ extern "C" __global__ void runmcmc_burnin_kernel(	//INPUT
 							const bool 			m_ardf0,
 							const bool 			can_use_ard, 
 							const bool 			rician,
+							const bool			gradnonlin,
 							const int 			updateproposalevery, 	//update every this number of iterations	
 							const int 			iterations,		//num of iterations to do this time (maybe is a part of the total)
 							const int 			current_iter,		//the number of the current iteration over the total iterations
@@ -608,11 +623,24 @@ extern "C" __global__ void runmcmc_burnin_kernel(	//INPUT
 		int pos = (*idVOX*ndirections)+idSubVOX+i*threadsBlock;
 		myisosig[i] = isosignals[pos];
 		mydata[i] = datam[pos];
-		mybvals[i] = bvals[pos];
-		myalpha[i] = alpha[pos];
-		mybeta[i] = beta[pos];
 	}
+	if(gradnonlin){
+		for(int i=0; i<mydirs; i++){	
+			int pos = (*idVOX*ndirections)+idSubVOX+i*threadsBlock;
+			mybvals[i] = bvals[pos];
+			myalpha[i] = alpha[pos];
+			mybeta[i] = beta[pos];
+		}
+	}else{
+		for(int i=0; i<mydirs; i++){	
+			int pos = idSubVOX+i*threadsBlock;
+			mybvals[i] = bvals[pos];
+			myalpha[i] = alpha[pos];
+			mybeta[i] = beta[pos];
+		}
 
+	}
+	
 	for(int f=0;f<nfib;f++){
 		for(int i=0; i<mydirs; i++){	
 			mysig[i*nfib+f]= signals[(*idVOX*ndirections*nfib)+(f*ndirections)+idSubVOX+i*threadsBlock];	
@@ -1560,6 +1588,7 @@ extern "C" __global__ void runmcmc_record_kernel(	//INPUT
 							const bool 			m_ardf0,
 							const bool 			can_use_ard, 
 							const bool 			rician,
+							const bool			gradnonlin,
 							const int 			updateproposalevery, 	//update every this number of iterations	
 							const int 			iterations,		//num of iterations to do this time (maybe is a part of the total)	
 							const int 			current_iter,		//the number of the current iteration over the total iterations
@@ -1768,9 +1797,22 @@ extern "C" __global__ void runmcmc_record_kernel(	//INPUT
 		int pos = (*idVOX*ndirections)+idSubVOX+i*threadsBlock;
 		myisosig[i] = isosignals[pos];
 		mydata[i] = datam[pos];
-		mybvals[i] = bvals[pos];
-		myalpha[i] = alpha[pos];
-		mybeta[i] = beta[pos];
+	}
+	if(gradnonlin){
+		for(int i=0; i<mydirs; i++){	
+			int pos = (*idVOX*ndirections)+idSubVOX+i*threadsBlock;
+			mybvals[i] = bvals[pos];
+			myalpha[i] = alpha[pos];
+			mybeta[i] = beta[pos];
+		}
+	}else{
+		for(int i=0; i<mydirs; i++){	
+			int pos = idSubVOX+i*threadsBlock;
+			mybvals[i] = bvals[pos];
+			myalpha[i] = alpha[pos];
+			mybeta[i] = beta[pos];
+		}
+
 	}
 
 	for(int f=0;f<nfib;f++){
-- 
GitLab