From 17d8d4bc314195e8b2e833b93afe124fb32e6eb6 Mon Sep 17 00:00:00 2001
From: Moises Fernandez <moisesf@fmrib.ox.ac.uk>
Date: Thu, 11 Apr 2013 15:42:31 +0000
Subject: [PATCH] Changed to use Dynamic Shared Memory and fix curand with a
 number of randoms not multiplo of 2

---
 CUDA/runmcmc.cu | 210 +++++++++++++++++++++++++++++-------------------
 1 file changed, 127 insertions(+), 83 deletions(-)

diff --git a/CUDA/runmcmc.cu b/CUDA/runmcmc.cu
index f0d9354..a4c04a5 100644
--- a/CUDA/runmcmc.cu
+++ b/CUDA/runmcmc.cu
@@ -31,18 +31,16 @@ void init_Fibres_Multifibres(	//INPUT
 				thrust::device_vector<double> 			bvals_gpu,
 				thrust::device_vector<double> 			alpha_gpu,
 				thrust::device_vector<double> 			beta_gpu,
-				string 						output_file, 
+				const int 					ndirections,
 				//OUTPUT
 				thrust::device_vector<FibreGPU>& 		fibres_gpu,
 				thrust::device_vector<MultifibreGPU>& 		multifibres_gpu,
 				thrust::device_vector<double>&			signals_gpu,
 				thrust::device_vector<double>&			isosignals_gpu)
 {
-	std::ofstream myfile;
-	myfile.open (output_file.data(), ios::out | ios::app );
-	myfile << "----------------------------------------------------- " << "\n"; 
-   	myfile << "------MCMC ALGORITHM PART INITIALITATION IN GPU ----- " << "\n"; 
-   	myfile << "----------------------------------------------------- " << "\n"; 	
+	cout << "----------------------------------------------------- " << "\n"; 
+   	cout << "------MCMC ALGORITHM PART INITIALITATION IN GPU ----- " << "\n"; 
+   	cout << "----------------------------------------------------- " << "\n"; 	
 
    	struct timeval t1,t2;
    	double time;
@@ -58,7 +56,7 @@ void init_Fibres_Multifibres(	//INPUT
 
 	int blocks = nvox; 
   	dim3 Dim_Grid_MCMC(blocks, 1);
-  	dim3 Dim_Block_MCMC(THREADS_BLOCK ,1);	///dimensions for MCMC
+  	dim3 Dim_Block_MCMC(THREADS_BLOCK_MCMC ,1);	///dimensions for MCMC
 
 	double *datam_ptr = thrust::raw_pointer_cast(datam_gpu.data());
 	double *params_ptr = thrust::raw_pointer_cast(params_gpu.data());	
@@ -71,13 +69,17 @@ void init_Fibres_Multifibres(	//INPUT
 	double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
 	double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());
 
-	init_Fibres_Multifibres_kernel<<< Dim_Grid_MCMC, Dim_Block_MCMC>>>(datam_ptr, params_ptr, tau_ptr, bvals_ptr, alpha_ptr, beta_ptr, nvox, nfib, nparams_fit, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.rician.value(), opts.ardf0.value(), opts.all_ard.value(), opts.no_ard.value(), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
+	int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (3*nfib + 8)*sizeof(float);
+
+	printf("Shared Memory Used in init_Fibres_Multifibres: %i\n",amount_shared);
+
+	init_Fibres_Multifibres_kernel<<< Dim_Grid_MCMC, Dim_Block_MCMC, amount_shared>>>(datam_ptr, params_ptr, tau_ptr, bvals_ptr, alpha_ptr, beta_ptr, ndirections, nfib, nparams_fit, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.rician.value(), opts.ardf0.value(), opts.all_ard.value(), opts.no_ard.value(), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
 	sync_check("init_Fibres_Multifibres_kernel");
 
 	gettimeofday(&t2,NULL);
     	time=timeval_diff(&t2,&t1);
-   	myfile << "TIME TOTAL: " << time << " seconds\n"; 
-	myfile << "--------------------------------------------" << "\n\n" ; 
+   	cout << "TIME TOTAL: " << time << " seconds\n"; 
+	cout << "--------------------------------------------" << "\n\n" ; 
 }
 
 void runmcmc_burnin(	//INPUT
@@ -85,8 +87,8 @@ void runmcmc_burnin(	//INPUT
 			thrust::device_vector<double> 			bvals_gpu,
 			thrust::device_vector<double> 			alpha_gpu,
 			thrust::device_vector<double> 			beta_gpu,
+			const int 					ndirections,
 			double 						seed,
-			string 						output_file, 
 			//INPUT-OUTPUT
 			thrust::device_vector<FibreGPU>& 		fibres_gpu,
 			thrust::device_vector<MultifibreGPU>& 		multifibres_gpu,
@@ -95,11 +97,9 @@ void runmcmc_burnin(	//INPUT
 {
 	xfibresOptions& opts = xfibresOptions::getInstance();
 	
-	std::ofstream myfile;
-	myfile.open (output_file.data(), ios::out | ios::app );
-	myfile << "----------------------------------------------------- " << "\n"; 
-   	myfile << "--------- MCMC ALGORITHM PART BURNIN IN GPU --------- " << "\n"; 
-   	myfile << "----------------------------------------------------- " << "\n"; 	
+	cout << "----------------------------------------------------- " << "\n"; 
+   	cout << "--------- MCMC ALGORITHM PART BURNIN IN GPU --------- " << "\n"; 
+   	cout << "----------------------------------------------------- " << "\n"; 	
 
    	struct timeval t1,t2,t_tot1,t_tot2;
    	double time,timecurand,timemcmc;
@@ -123,12 +123,12 @@ void runmcmc_burnin(	//INPUT
    	unsigned int totalrandoms=(opts.nburn.value() * nvox * nparams);
 
    	cuMemGetInfo(&free,&total);
-   	myfile << "Free memory Before Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
-   	//4 bytes each float, 2 randoms arrays, and 80% of total memory at this moment 
-   	unsigned int maxrandoms=(free/(4*2))*0.8; 
+   	cout << "Free memory Before Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
+   	//4 bytes each float, 2 random arrays, and 80% of total memory at this moment 
+   	unsigned int maxrandoms=((free*0.8)/(4*2)); 
 
-   	myfile << "Total randoms: " << totalrandoms << "\n"; 
-   	myfile << "Max randoms: " << maxrandoms << "\n"; 
+   	cout << "Total randoms: " << totalrandoms << "\n"; 
+   	cout << "Max randoms: " << maxrandoms << "\n"; 
    
    	int steps; //num iter if not enough memory
    	int minrandoms; //min num of randoms ensamble
@@ -138,9 +138,9 @@ void runmcmc_burnin(	//INPUT
 	int nrandoms=0;	
 
    	if(totalrandoms>maxrandoms){ 
-		iters_step = maxrandoms / minrandoms; 
-		nrandoms = iters_step*minrandoms;		//nrandoms for each step
-		steps =  (opts.nburn.value()/iters_step);  	//repeat process iterations times, no enough memory for all randoms 			
+		iters_step = maxrandoms / minrandoms; 			//iterations in each step
+		nrandoms = iters_step*minrandoms;			//nrandoms for each step
+		steps =  (opts.nburn.value()/iters_step);  		//repeat process steps times, no enough memory for all randoms 			
    	}else{ 
 		nrandoms = totalrandoms;
 		iters_step= opts.nburn.value();
@@ -148,18 +148,17 @@ void runmcmc_burnin(	//INPUT
   	}
 	if(nrandoms%2){							//CURAND must generates multiples of 2 randoms
 		nrandoms++;
-	}   
-
-   	myfile << "Process " << opts.nburn.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";    
+	}
+	
+   	cout << "Process " << opts.nburn.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";    
 
    	int last_step = opts.nburn.value() - (iters_step*steps);
-   	int last_randoms= (last_step*minrandoms); 
-	if(last_randoms%2){					//CURAND must generates multiples of 2 randoms
+   	int last_randoms = (last_step*minrandoms);
+	if(last_randoms%2){						//CURAND must generates multiples of 2 randoms
 		last_randoms++;
 	}
 
-
-   	myfile << "Last step with " << last_step << " iterations" << "\n"; 
+   	cout << "Last step with " << last_step << " iterations" << "\n"; 
 	
 	thrust::device_vector<float> randomsN_gpu;
 	thrust::device_vector<float> randomsU_gpu;	
@@ -167,14 +166,14 @@ void runmcmc_burnin(	//INPUT
 	randomsU_gpu.resize(nrandoms);
 
    	cuMemGetInfo(&free,&total);
-   	myfile << "Free memory after Malloc Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
+   	cout << "Free memory after Malloc Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
    
   	int blocks = nvox;        
   	dim3 Dim_Grid(blocks, 1);
-  	dim3 Dim_Block(THREADS_BLOCK,1);	//dimensions for MCMC   
+  	dim3 Dim_Block(THREADS_BLOCK_MCMC,1);	//dimensions for MCMC   
 
-   	myfile << "\n" << "NUM BLOCKS: " << blocks << "\n"; 
-   	myfile << "THREADS PER BLOCK : " << THREADS_BLOCK << "\n\n"; 	
+   	cout << "\n" << "NUM BLOCKS: " << blocks << "\n"; 
+   	cout << "THREADS PER BLOCK : " << THREADS_BLOCK_MCMC << "\n\n"; 	
 
    	curandGenerator_t gen;
    	curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
@@ -191,31 +190,58 @@ void runmcmc_burnin(	//INPUT
 	MultifibreGPU *multifibres_ptr = thrust::raw_pointer_cast(multifibres_gpu.data());
 	double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
 	double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());
+
+	int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (10*nfib + 2*nparams + 24)*sizeof(float) + (7*nfib + 16)*sizeof(int);
+
+	printf("Shared Memory Used in runmcmc_burnin: %i\n",amount_shared);
 	
    	for(int i=0;i<steps;i++){
 
    		gettimeofday(&t1,NULL);
 
-	   	curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1);
-	   	curandGenerateUniform(gen,randomsU_ptr,nrandoms);	//generate randoms
+	   	curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1); 
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
+	   	status = curandGenerateUniform(gen,randomsU_ptr,nrandoms);	//generate randoms
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
 
  	   	gettimeofday(&t2,NULL);
     	   	timecurand+=timeval_diff(&t2,&t1);
 
 	   	gettimeofday(&t1,NULL);
 
-	   	runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, nvox, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
+	   	runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
 	   	sync_check("runmcmc_burnin_kernel");
 
  	   	gettimeofday(&t2,NULL);
     	   	timemcmc+=timeval_diff(&t2,&t1);
    	}
 
-   	gettimeofday(&t1,NULL);
+	MultifibreGPU mult = multifibres_gpu[0];
+	FibreGPU fib = fibres_gpu[0];
+
+   	gettimeofday(&t1,NULL); 
 
    	if(nvox!=0){
-   		curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
-   		curandGenerateUniform(gen,randomsU_ptr,last_randoms); 	//generate randoms
+   		curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
+   		status = curandGenerateUniform(gen,randomsU_ptr,last_randoms); 	//generate randoms
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
    	}
 	
    	gettimeofday(&t2,NULL);
@@ -224,24 +250,22 @@ void runmcmc_burnin(	//INPUT
    	gettimeofday(&t1,NULL);
 
    	if(nvox!=0){
-		runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, nvox, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
+		runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
    		sync_check("runmcmc_burnin_kernel");
    	}
 
    	gettimeofday(&t2,NULL);
    	timemcmc+=timeval_diff(&t2,&t1);
 
-
-    	myfile << "TIME CURAND: " << timecurand << " seconds\n"; 
-    	myfile << "TIME RUNMCMC: " << timemcmc << " seconds\n"; 
+    	cout << "TIME CURAND: " << timecurand << " seconds\n"; 
+    	cout << "TIME RUNMCMC: " << timemcmc << " seconds\n"; 
    
    	curandDestroyGenerator(gen);
 
 	gettimeofday(&t_tot2,NULL);
     	time=timeval_diff(&t_tot2,&t_tot1);
-   	myfile << "TIME TOTAL: " << time << " seconds\n"; 
-	myfile << "--------------------------------------------" << "\n\n" ; 
-	myfile.close();
+   	cout << "TIME TOTAL: " << time << " seconds\n"; 
+	cout << "--------------------------------------------" << "\n\n" ; 
 
    	sync_check("runmcmc_burnin");
 }
@@ -256,8 +280,8 @@ void runmcmc_record(	//INPUT
 			thrust::device_vector<MultifibreGPU> 		multifibres_gpu,
 			thrust::device_vector<double>			signals_gpu,
 			thrust::device_vector<double>			isosignals_gpu,
+			const int 					ndirections,
 			double 						seed,
-			string 						output_file, 
 			//OUTPUT
 			thrust::device_vector<int>&			multirecords_gpu,
 			thrust::device_vector<float>&			rf0_gpu,
@@ -272,11 +296,9 @@ void runmcmc_record(	//INPUT
 {
 	xfibresOptions& opts = xfibresOptions::getInstance();
 	
-	std::ofstream myfile;
-	myfile.open (output_file.data(), ios::out | ios::app );
-	myfile << "----------------------------------------------------- " << "\n"; 
-   	myfile << "--------- MCMC ALGORITHM PART RECORD IN GPU --------- " << "\n"; 
-   	myfile << "----------------------------------------------------- " << "\n"; 	
+	cout << "----------------------------------------------------- " << "\n"; 
+   	cout << "--------- MCMC ALGORITHM PART RECORD IN GPU --------- " << "\n"; 
+   	cout << "----------------------------------------------------- " << "\n"; 	
 
    	struct timeval t1,t2,t_tot1,t_tot2;
    	double time,timecurand,timemcmc;
@@ -302,12 +324,12 @@ void runmcmc_record(	//INPUT
    	unsigned int totalrandoms=(opts.njumps.value() * nvox * nparams);
 
    	cuMemGetInfo(&free,&total);
-   	myfile << "Free memory Before Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
-   	//4 bytes each float, 2 randoms arrays, and 80% of total memory at this moment 
-   	unsigned int maxrandoms=(free/(4*2))*0.8; 
+   	cout << "Free memory Before Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
+   	//4 bytes each float, 2 random arrays, and 80% of total memory at this moment 
+   	unsigned int maxrandoms=((free*0.8)/(4*2)); 
 
-   	myfile << "Total randoms: " << totalrandoms << "\n"; 
-   	myfile << "Max randoms: " << maxrandoms << "\n"; 
+   	cout << "Total randoms: " << totalrandoms << "\n"; 
+   	cout << "Max randoms: " << maxrandoms << "\n"; 
    
    	int steps; //num iter if not enough memory
    	int minrandoms; //min num of randoms ensamble
@@ -317,28 +339,27 @@ void runmcmc_record(	//INPUT
 	int nrandoms=0;	
 
    	if(totalrandoms>maxrandoms){ 
-		iters_step = maxrandoms / minrandoms; 
+		iters_step = maxrandoms / minrandoms; 		//iterations in each step
 		nrandoms = iters_step*minrandoms;		//nrandoms for each step
-		steps =  (opts.njumps.value()/iters_step);  	//repeat process iterations times, no enough memory for all randoms 			
+		steps =  (opts.njumps.value()/iters_step);  	//repeat process steps times, no enough memory for all randoms 			
    	}else{ 
 		nrandoms = totalrandoms;
 		iters_step= opts.njumps.value();
 		steps = 0;
-  	}
+  	}   
 	if(nrandoms%2){						//CURAND must generates multiples of 2 randoms
 		nrandoms++;
-	}   
+	}
 
-   	myfile << "Process " << opts.njumps.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";    
+   	cout << "Process " << opts.njumps.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";    
 
    	int last_step = opts.njumps.value() - (iters_step*steps);
-   	int last_randoms= (last_step*minrandoms); 
+   	int last_randoms = (last_step*minrandoms); 
 	if(last_randoms%2){					//CURAND must generates multiples of 2 randoms
 		last_randoms++;
 	}
 
-
-   	myfile << "Last step with " << last_step << " iterations" << "\n"; 
+   	cout << "Last step with " << last_step << " iterations" << "\n"; 
 	
 	thrust::device_vector<float> randomsN_gpu;
 	thrust::device_vector<float> randomsU_gpu;	
@@ -346,14 +367,14 @@ void runmcmc_record(	//INPUT
 	randomsU_gpu.resize(nrandoms);
 
    	cuMemGetInfo(&free,&total);
-   	myfile << "Free memory after Malloc Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
+   	cout << "Free memory after Malloc Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
    
   	int blocks = nvox;        
   	dim3 Dim_Grid(blocks, 1);
-  	dim3 Dim_Block(THREADS_BLOCK,1);	//dimensions for MCMC   
+  	dim3 Dim_Block(THREADS_BLOCK_MCMC,1);	//dimensions for MCMC   
 
-   	myfile << "\n" << "NUM BLOCKS: " << blocks << "\n"; 
-   	myfile << "THREADS PER BLOCK : " << THREADS_BLOCK << "\n\n"; 	
+   	cout << "\n" << "NUM BLOCKS: " << blocks << "\n"; 
+   	cout << "THREADS PER BLOCK : " << THREADS_BLOCK_MCMC << "\n\n"; 	
 
    	curandGenerator_t gen;
    	curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
@@ -382,19 +403,33 @@ void runmcmc_record(	//INPUT
 	float *rf_ptr = thrust::raw_pointer_cast(rf_gpu.data());
 	float *rlikelihood_energy_ptr = thrust::raw_pointer_cast(rlikelihood_energy_gpu.data());
 
+	int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (10*nfib + 2*nparams + 24)*sizeof(float) + (7*nfib + 18)*sizeof(int);
+
+	printf("Shared Memory Used in runmcmc_record: %i\n",amount_shared);
+
    	for(int i=0;i<steps;i++){
 
    		gettimeofday(&t1,NULL);
 
-	   	curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1);
-	   	curandGenerateUniform(gen,randomsU_ptr,nrandoms);	//generate randoms
+	   	curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1);
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
+	   	status = curandGenerateUniform(gen,randomsU_ptr,nrandoms);	//generate randoms
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
 
  	   	gettimeofday(&t2,NULL);
     	   	timecurand+=timeval_diff(&t2,&t1);
 
 	   	gettimeofday(&t1,NULL);
 
-	   	runmcmc_record_kernel<<< Dim_Grid, Dim_Block >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, nvox, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);   
+	   	runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);   
 	   	sync_check("runmcmc_record_kernel");
 
  	   	gettimeofday(&t2,NULL);
@@ -404,8 +439,18 @@ void runmcmc_record(	//INPUT
    	gettimeofday(&t1,NULL);
 
    	if(nvox!=0){
-   		curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
-   		curandGenerateUniform(gen,randomsU_ptr,last_randoms); 	//generate randoms
+   		curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
+   		status = curandGenerateUniform(gen,randomsU_ptr,last_randoms); 	//generate randoms
+		if (status != CURAND_STATUS_SUCCESS)
+		{
+			printf("Failure generating cuda random numbers: %d\n",status);
+			exit(1);
+		}
    	}
 	
    	gettimeofday(&t2,NULL);
@@ -414,7 +459,7 @@ void runmcmc_record(	//INPUT
    	gettimeofday(&t1,NULL);
 
    	if(nvox!=0){
-		runmcmc_record_kernel<<< Dim_Grid, Dim_Block >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, nvox, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);   
+		runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);   
    		sync_check("runmcmc_record_kernel");
    	}
 
@@ -422,16 +467,15 @@ void runmcmc_record(	//INPUT
    	timemcmc+=timeval_diff(&t2,&t1);
 
 
-    	myfile << "TIME CURAND: " << timecurand << " seconds\n"; 
-    	myfile << "TIME RUNMCMC: " << timemcmc << " seconds\n"; 
+    	cout << "TIME CURAND: " << timecurand << " seconds\n"; 
+    	cout << "TIME RUNMCMC: " << timemcmc << " seconds\n"; 
    
    	curandDestroyGenerator(gen);
 
 	gettimeofday(&t_tot2,NULL);
     	time=timeval_diff(&t_tot2,&t_tot1);
-   	myfile << "TIME TOTAL: " << time << " seconds\n"; 
-	myfile << "--------------------------------------------" << "\n\n" ; 
-	myfile.close();
-
+   	cout << "TIME TOTAL: " << time << " seconds\n"; 
+	cout << "--------------------------------------------" << "\n\n" ; 
+	
    	sync_check("runmcmc_record");
 }
-- 
GitLab