From 7f81c0cc4b8eb59d89248fcc1e23d7e09f60abc5 Mon Sep 17 00:00:00 2001
From: Moises Fernandez <moisesf@fmrib.ox.ac.uk>
Date: Thu, 11 Apr 2013 15:00:13 +0000
Subject: [PATCH] Changed to use Dynamic Shared Memory

---
 CUDA/diffmodels.cu  |  78 +++++++++++++++++++-----------
 CUDA/diffmodels.cuh | 115 ++++++++++++++++++++++++++------------------
 2 files changed, 117 insertions(+), 76 deletions(-)

diff --git a/CUDA/diffmodels.cu b/CUDA/diffmodels.cu
index 7ae2b61..72f63d0 100644
--- a/CUDA/diffmodels.cu
+++ b/CUDA/diffmodels.cu
@@ -15,7 +15,6 @@
 #include "PVM_single_c.cu"
 #include "PVM_multi.cu"
 #include "fit_gpu_kernels.h"
-#include "dim_blocks.h"	
 #include "sync_check.h"
 
 #include <time.h>
@@ -33,7 +32,8 @@ void fit_PVM_single(	//INPUT
 			const vector<Matrix> 		bvals_vec,
 			thrust::device_vector<double> 	datam_gpu, 
 			thrust::device_vector<double>	bvecs_gpu, 
-			thrust::device_vector<double>	bvals_gpu,	
+			thrust::device_vector<double>	bvals_gpu,
+			int				ndirections,	
 			bool 				m_include_f0,		
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu)
@@ -87,9 +87,13 @@ void fit_PVM_single(	//INPUT
 
 	int blocks = nvox;
    	dim3 Dim_Grid(blocks,1);
-  	dim3 Dim_Block(THREADS_X_BLOCK_FIT,1);
+  	dim3 Dim_Block(THREADS_BLOCK_FIT,1);
+
+	int amount_shared = (THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+8)*sizeof(double)+(2+nparams)*sizeof(int);
+
+	printf("Shared Memory Used in fit_PVM_single: %i\n",amount_shared);
 
-	fit_PVM_single_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, nfib, m_include_f0, thrust::raw_pointer_cast(params_gpu.data()));
+	fit_PVM_single_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, m_include_f0, thrust::raw_pointer_cast(params_gpu.data()));
 	sync_check("fit_PVM_single_kernel");
 }
 
@@ -99,7 +103,8 @@ void fit_PVM_single_c(	//INPUT
 			const vector<Matrix> 		bvals_vec,
 			thrust::device_vector<double> 	datam_gpu, 
 			thrust::device_vector<double>	bvecs_gpu, 
-			thrust::device_vector<double>	bvals_gpu,	
+			thrust::device_vector<double>	bvals_gpu,
+			int				ndirections,		
 			bool 				m_include_f0,		
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu)
@@ -151,9 +156,13 @@ void fit_PVM_single_c(	//INPUT
 
 	int blocks = nvox;
    	dim3 Dim_Grid(blocks,1);
-  	dim3 Dim_Block(THREADS_X_BLOCK_FIT,1);
+  	dim3 Dim_Block(THREADS_BLOCK_FIT,1);
+
+	int amount_shared = (THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+nfib*nfib+8)*sizeof(double)+(2+nparams)*sizeof(int);
+
+	printf("Shared Memory Used in fit_PVM_single_c: %i\n",amount_shared);
 
-	fit_PVM_single_c_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, nfib, false, m_include_f0, false, thrust::raw_pointer_cast(params_gpu.data()));
+	fit_PVM_single_c_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, false, m_include_f0, false, thrust::raw_pointer_cast(params_gpu.data()));
 	sync_check("fit_PVM_single_c_kernel");
 }
 
@@ -161,7 +170,8 @@ void fit_PVM_multi(	//INPUT
 			thrust::device_vector<double> 	datam_gpu, 
 			thrust::device_vector<double>	bvecs_gpu, 
 			thrust::device_vector<double>	bvals_gpu,	
-			int 				nvox,		
+			int 				nvox,	
+			int				ndirections,		
 			bool 				m_include_f0,
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu)
@@ -171,7 +181,7 @@ void fit_PVM_multi(	//INPUT
 
 	int blocks = nvox;
    	dim3 Dim_Grid(blocks,1);
-  	dim3 Dim_Block(THREADS_X_BLOCK_FIT,1);
+  	dim3 Dim_Block(THREADS_BLOCK_FIT,1);
 
 	int nparams;
 	if (m_include_f0)
@@ -184,7 +194,11 @@ void fit_PVM_multi(	//INPUT
 	thrust::copy(params_gpu.begin(), params_gpu.end(), params_PVM_single_c_gpu.begin());	
 								//maybe 1 block finish before other one read their params.
 
-	fit_PVM_multi_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_PVM_single_c_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, nfib, m_include_f0, thrust::raw_pointer_cast(params_gpu.data()));
+	int amount_shared = (THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+9)*sizeof(double)+(2+nparams)*sizeof(int);
+
+	printf("Shared Memory Used in fit_PVM_multi: %i\n",amount_shared);
+
+	fit_PVM_multi_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_PVM_single_c_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, m_include_f0, thrust::raw_pointer_cast(params_gpu.data()));
 	sync_check("fit_PVM_multi_kernel");
 }
 
@@ -194,16 +208,14 @@ void calculate_tau(	//INPUT
 			thrust::device_vector<double>	bvecs_gpu, 
 			thrust::device_vector<double>	bvals_gpu,
 			thrust::host_vector<int>	vox_repeat,
-			int				nrepeat,		
-			string 				output_file,	
+			int				nrepeat,
+			int				ndirections,		
 			//OUTPUT
 			thrust::host_vector<float>&	tau)
 {
-	std::ofstream myfile;
-	myfile.open (output_file.data(), ios::out | ios::app );
-	myfile << "----------------------------------------------------- " << "\n"; 
-   	myfile << "--------- CALCULATE TAU/RESIDULAS IN GPU ------------ " << "\n"; 
-   	myfile << "----------------------------------------------------- " << "\n"; 
+	cout << "----------------------------------------------------- " << "\n"; 
+   	cout << "--------- CALCULATE TAU/RESIDULAS IN GPU ------------ " << "\n"; 
+   	cout << "----------------------------------------------------- " << "\n"; 
 
 	struct timeval t1,t2;
    	double time;
@@ -212,6 +224,13 @@ void calculate_tau(	//INPUT
 	xfibresOptions& opts = xfibresOptions::getInstance();
 	int nvox = vox_repeat.size(); 
 	int nfib = opts.nfibres.value();
+
+	int nparams;
+	if (opts.f0.value())
+      		nparams = nfib*3 + 3; 
+    	else
+      		nparams = nfib*3 + 2;
+	if(opts.modelnum.value()==2) nparams++;
 	
 	thrust::device_vector<bool> includes_f0_gpu;
 	includes_f0_gpu.resize(nvox);
@@ -225,42 +244,43 @@ void calculate_tau(	//INPUT
 
 	int blocks = nvox;
    	dim3 Dim_Grid(blocks,1);
-  	dim3 Dim_Block(THREADS_X_BLOCK_FIT,1);
+  	dim3 Dim_Block(THREADS_BLOCK_FIT,1);
+
+	int amount_shared = (nparams+4*nfib+3)*sizeof(double) + sizeof(int);
 
 	thrust::device_vector<double> residuals_gpu;
-	residuals_gpu.resize(nvox*NDIRECTIONS);
+	residuals_gpu.resize(nvox*ndirections);
 
 	if(opts.modelnum.value()==1){
 		if(opts.nonlin.value()){ 
-			get_residuals_PVM_single_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, nfib, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data()));
+			get_residuals_PVM_single_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data()));
 			sync_check("get_residuals_PVM_single_kernel");
 
 		}else{
-			get_residuals_PVM_single_c_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, nfib, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data()));
+			get_residuals_PVM_single_c_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data()));
 			sync_check("get_residuals_PVM_single_c_kernel");
 		}
 	}else{
       		//model 2 : non-mono-exponential
-		get_residuals_PVM_multi_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, nfib, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data()));
+		get_residuals_PVM_multi_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data()));
 		sync_check("get_residuals_PVM_multi_kernel");
 	}
 
 	thrust::host_vector<double> residuals_host;
-	residuals_host.resize(nvox*NDIRECTIONS);
+	residuals_host.resize(nvox*ndirections);
 	thrust::copy(residuals_gpu.begin(), residuals_gpu.end(), residuals_host.begin());	
 
-	ColumnVector res(NDIRECTIONS);
+	ColumnVector res(ndirections);
 	for(int vox=0;vox<nvox;vox++){	
-		for(int i=0;i<NDIRECTIONS;i++) res(i+1)= residuals_host[vox*NDIRECTIONS+i];  
+		for(int i=0;i<ndirections;i++) res(i+1)= residuals_host[vox*ndirections+i];  
 
-		float variance=var(res).AsScalar();  
+		float variance=var(res).AsScalar();
 		tau[vox]=1.0/variance;
 	}
 
 	gettimeofday(&t2,NULL);
     	time=timeval_diff(&t2,&t1);
-   	myfile << "TIME TOTAL: " << time << " seconds\n"; 
-	myfile << "--------------------------------------------" << "\n\n" ; 
-	myfile.close();					
+   	cout << "TIME TOTAL: " << time << " seconds\n"; 
+	cout << "--------------------------------------------" << "\n\n" ; 				
 }
 
diff --git a/CUDA/diffmodels.cuh b/CUDA/diffmodels.cuh
index d97bfe9..4975789 100644
--- a/CUDA/diffmodels.cuh
+++ b/CUDA/diffmodels.cuh
@@ -14,7 +14,8 @@ void fit_PVM_single(	//INPUT
 			const vector<Matrix> 		bvals_vec,
 			thrust::device_vector<double> 	datam_gpu, 
 			thrust::device_vector<double>	bvecs_gpu, 
-			thrust::device_vector<double>	bvals_gpu,	
+			thrust::device_vector<double>	bvals_gpu,
+			int				ndirections,		
 			bool 				m_include_f0,		
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu);
@@ -25,7 +26,8 @@ void fit_PVM_single_c(	//INPUT
 			const vector<Matrix> 		bvals_vec,
 			thrust::device_vector<double> 	datam_gpu, 
 			thrust::device_vector<double>	bvecs_gpu, 
-			thrust::device_vector<double>	bvals_gpu,	
+			thrust::device_vector<double>	bvals_gpu,
+			int				ndirections,		
 			bool 				m_include_f0,		
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu);
@@ -35,6 +37,7 @@ void fit_PVM_multi(	//INPUT
 			thrust::device_vector<double>	bvecs_gpu, 
 			thrust::device_vector<double>	bvals_gpu,	
 			int 				nvox,		
+			int				ndirections,	
 			bool 				m_include_f0,
 			//OUTPUT
 			thrust::device_vector<double>&	params_gpu);
@@ -45,8 +48,8 @@ void calculate_tau(	//INPUT
 			thrust::device_vector<double>	bvecs_gpu, 
 			thrust::device_vector<double>	bvals_gpu,
 			thrust::host_vector<int>	vox_repeat,
-			int				nrepeat,		
-			string 				output_file,	
+			int				nrepeat,
+			int				ndirections,				
 			//OUTPUT
 			thrust::host_vector<float>&	tau);
 
@@ -55,31 +58,35 @@ __device__ void cf_PVM_single(		//INPUT
 					const double*			params,
 					const double*			data,
 					const double*			bvecs, 
-					const double*			bvals,
+					const double*			bvals,	
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams, 
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,					
+					const int			idSubVOX,
+					double*				reduction,					
 					double* 			fs,
 					double*				x,
-					double 				&_d,
-					double 				&sumf,
+					double*				_d,
+					double*				sumf,
 					//OUTPUT
-					double 				&cfv);
+					double* 			cfv);
 
 __device__ void grad_PVM_single(	//INPUT
 					const double*			params,
 					const double*			data,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams,
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,					
+					const int			idSubVOX,
+					double*				reduction,					
 					double* 			fs,
 					double*				x,
-					double 				&_d,
-					double 				&sumf,
+					double* 			_d,
+					double* 			sumf,
 					//OUTPUT
 					double*				grad);
 
@@ -87,14 +94,16 @@ __device__ void hess_PVM_single(	//INPUT
 					const double*			params,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams,
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,
+					const int			idSubVOX,
+					double*				reduction,
 					double* 			fs,
 					double*				x,
-					double 				&_d,
-					double 				&sumf,
+					double* 			_d,
+					double* 			sumf,
 					//OUTPUT
 					double*				hess);
 
@@ -103,16 +112,18 @@ __device__ void cf_PVM_single_c(	//INPUT
 					const double*			data,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams, 
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,
+					const int			idSubVOX,
+					double*				reduction,
 					double* 			fs,
 					double*				x,
-					double 				&_d,
-					double 				&sumf,
+					double* 			_d,
+					double* 			sumf,
 					//OUTPUT
-					double 				&cfv);
+					double* 			cfv);
 
 
 __device__ void grad_PVM_single_c(	//INPUT
@@ -120,15 +131,17 @@ __device__ void grad_PVM_single_c(	//INPUT
 					const double*			data,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams,
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,					
+					const int			idSubVOX,
+					double*				reduction,					
 					double* 			fs,
 					double* 			f_deriv,
 					double*				x,
-					double 				&_d,
-					double 				&sumf,
+					double* 			_d,
+					double* 			sumf,
 					//OUTPUT
 					double*				grad);
 
@@ -136,15 +149,17 @@ __device__ void hess_PVM_single_c(	//INPUT
 					const double*			params,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams,
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,					
+					const int			idSubVOX,
+					double*				reduction,					
 					double* 			fs,
 					double* 			f_deriv,
 					double*				x,
-					double 				&_d,
-					double 				&sumf,
+					double* 			_d,
+					double* 			sumf,
 					//OUTPUT
 					double*				hess);
 
@@ -153,32 +168,36 @@ __device__ void cf_PVM_multi(		//INPUT
 					const double*			data,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams, 
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,					
+					const int			idSubVOX,
+					double*				reduction,					
 					double* 			fs,
 					double*				x,
-					double 				&_a,
-					double 				&_b,
-					double 				&sumf,
+					double* 			_a,
+					double* 			_b,
+					double* 			sumf,
 					//OUTPUT
-					double 				&cfv);
+					double* 			cfv);
 
 __device__ void grad_PVM_multi(		//INPUT
 					const double*			params,
 					const double*			data,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams,
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,					
+					const int			idSubVOX,
+					double*				reduction,					
 					double* 			fs,
 					double*				x,
-					double 				&_a,
-					double 				&_b,
-					double 				&sumf,
+					double* 			_a,
+					double* 			_b,
+					double* 			sumf,
 					//OUTPUT
 					double*				grad);
 
@@ -186,14 +205,16 @@ __device__ void hess_PVM_multi(		//INPUT
 					const double*			params,
 					const double*			bvecs, 
 					const double*			bvals,
+					const int			ndirections,
+					const int			nfib,
 					const int 			nparams,
 					const bool 			m_include_f0,
-					const int			idB,
-					double*				shared,					
+					const int			idSubVOX,
+					double*				reduction,					
 					double* 			fs,
 					double*				x,
-					double 				&_a,
-					double 				&_b,
-					double 				&sumf,
+					double* 			_a,
+					double*				_b,
+					double* 			sumf,
 					//OUTPUT
 					double*				hess);
-- 
GitLab