Skip to content
Snippets Groups Projects
runmcmc.cu 18.33 KiB
/*  runmcmc.cu

    Tim Behrens, Saad Jbabdi, Stam Sotiropoulos, Moises Hernandez  - FMRIB Image Analysis Group

    Copyright (C) 2005 University of Oxford  */

/*  CCOPYRIGHT  */

#include "xfibresoptions.h"
#include <curand.h>
#include "runmcmc_kernels.cu"
#include "sync_check.h"

#include <host_vector.h>
#include <device_vector.h> 

#include <time.h>
#include <sys/time.h>
#include "init_gpu.h"

using namespace Xfibres;

////////////////////////////////////////////////////// 
//   MCMC IN GPU
////////////////////////////////////////////////////// 

void init_Fibres_Multifibres(	//INPUT
				thrust::device_vector<double> 			datam_gpu,
				thrust::device_vector<double> 			params_gpu,
				thrust::device_vector<float> 			tau_gpu,
				thrust::device_vector<double> 			bvals_gpu,
				thrust::device_vector<double> 			alpha_gpu,
				thrust::device_vector<double> 			beta_gpu,
				const int 					ndirections,
				//OUTPUT
				thrust::device_vector<FibreGPU>& 		fibres_gpu,
				thrust::device_vector<MultifibreGPU>& 		multifibres_gpu,
				thrust::device_vector<double>&			signals_gpu,
				thrust::device_vector<double>&			isosignals_gpu)
{
	cout << "----------------------------------------------------- " << "\n"; 
   	cout << "------MCMC ALGORITHM PART INITIALITATION IN GPU ----- " << "\n"; 
   	cout << "----------------------------------------------------- " << "\n"; 	

   	struct timeval t1,t2;
   	double time;
   	gettimeofday(&t1,NULL);

	int nvox = multifibres_gpu.size();

	xfibresOptions& opts = xfibresOptions::getInstance();
	int nfib= opts.nfibres.value();
	int nparams_fit = 2+3*opts.nfibres.value();
	if(opts.modelnum.value()==2) nparams_fit++;
	if(opts.f0.value()) nparams_fit++;

	int blocks = nvox; 
  	dim3 Dim_Grid_MCMC(blocks, 1);
  	dim3 Dim_Block_MCMC(THREADS_BLOCK_MCMC ,1);	///dimensions for MCMC

	double *datam_ptr = thrust::raw_pointer_cast(datam_gpu.data());
	double *params_ptr = thrust::raw_pointer_cast(params_gpu.data());	
	float *tau_ptr = thrust::raw_pointer_cast(tau_gpu.data());	
	double *bvals_ptr = thrust::raw_pointer_cast(bvals_gpu.data());
	double *alpha_ptr = thrust::raw_pointer_cast(alpha_gpu.data());
	double *beta_ptr = thrust::raw_pointer_cast(beta_gpu.data());
	FibreGPU *fibres_ptr =  thrust::raw_pointer_cast(fibres_gpu.data());
	MultifibreGPU *multifibres_ptr = thrust::raw_pointer_cast(multifibres_gpu.data());
	double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
	double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());

	int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (3*nfib + 8)*sizeof(float);

	printf("Shared Memory Used in init_Fibres_Multifibres: %i\n",amount_shared);

	init_Fibres_Multifibres_kernel<<< Dim_Grid_MCMC, Dim_Block_MCMC, amount_shared>>>(datam_ptr, params_ptr, tau_ptr, bvals_ptr, alpha_ptr, beta_ptr, ndirections, nfib, nparams_fit, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.rician.value(), opts.ardf0.value(), opts.all_ard.value(), opts.no_ard.value(), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
	sync_check("init_Fibres_Multifibres_kernel");

	gettimeofday(&t2,NULL);
    	time=timeval_diff(&t2,&t1);
   	cout << "TIME TOTAL: " << time << " seconds\n"; 
	cout << "--------------------------------------------" << "\n\n" ; 
}

void runmcmc_burnin(	//INPUT
			thrust::device_vector<double> 			datam_gpu,
			thrust::device_vector<double> 			bvals_gpu,
			thrust::device_vector<double> 			alpha_gpu,
			thrust::device_vector<double> 			beta_gpu,
			const int 					ndirections,
			double 						seed,
			//INPUT-OUTPUT
			thrust::device_vector<FibreGPU>& 		fibres_gpu,
			thrust::device_vector<MultifibreGPU>& 		multifibres_gpu,
			thrust::device_vector<double>&			signals_gpu,
			thrust::device_vector<double>&			isosignals_gpu)
{
	xfibresOptions& opts = xfibresOptions::getInstance();
	
	cout << "----------------------------------------------------- " << "\n"; 
   	cout << "--------- MCMC ALGORITHM PART BURNIN IN GPU --------- " << "\n"; 
   	cout << "----------------------------------------------------- " << "\n"; 	

   	struct timeval t1,t2,t_tot1,t_tot2;
   	double time,timecurand,timemcmc;
   	time=0;
   	timecurand=0;
   	timemcmc=0;

   	gettimeofday(&t_tot1,NULL);

   	size_t free,total;
	
	int nvox = multifibres_gpu.size();
   	int nfib= opts.nfibres.value();
	int nparams;

	if(opts.f0.value()) nparams=3+nfib*3;
	else nparams=2+nfib*3;	
	if(opts.modelnum.value()==2) nparams++;
	if(opts.rician.value()) nparams++;
   
   	unsigned int totalrandoms=(opts.nburn.value() * nvox * nparams);

   	cuMemGetInfo(&free,&total);
   	cout << "Free memory Before Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
   	//4 bytes each float, 2 random arrays, and 80% of total memory at this moment 
   	unsigned int maxrandoms=((free*0.8)/(4*2)); 

   	cout << "Total randoms: " << totalrandoms << "\n"; 
   	cout << "Max randoms: " << maxrandoms << "\n"; 
   
   	int steps; //num iter if not enough memory
   	int minrandoms; //min num of randoms ensamble
   	minrandoms= nvox * nparams;

   	int iters_step=0;
	int nrandoms=0;	

   	if(totalrandoms>maxrandoms){ 
		iters_step = maxrandoms / minrandoms; 			//iterations in each step
		nrandoms = iters_step*minrandoms;			//nrandoms for each step
		steps =  (opts.nburn.value()/iters_step);  		//repeat process steps times, no enough memory for all randoms 			
   	}else{ 
		nrandoms = totalrandoms;
		iters_step= opts.nburn.value();
		steps = 0;
  	}
	if(nrandoms%2){							//CURAND must generates multiples of 2 randoms
		nrandoms++;
	}
	
   	cout << "Process " << opts.nburn.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";    

   	int last_step = opts.nburn.value() - (iters_step*steps);
   	int last_randoms = (last_step*minrandoms);
	if(last_randoms%2){						//CURAND must generates multiples of 2 randoms
		last_randoms++;
	}

   	cout << "Last step with " << last_step << " iterations" << "\n"; 
	
	thrust::device_vector<float> randomsN_gpu;
	thrust::device_vector<float> randomsU_gpu;	
	randomsN_gpu.resize(nrandoms);
	randomsU_gpu.resize(nrandoms);

   	cuMemGetInfo(&free,&total);
   	cout << "Free memory after Malloc Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
   
  	int blocks = nvox;        
  	dim3 Dim_Grid(blocks, 1);
  	dim3 Dim_Block(THREADS_BLOCK_MCMC,1);	//dimensions for MCMC   

   	cout << "\n" << "NUM BLOCKS: " << blocks << "\n"; 
   	cout << "THREADS PER BLOCK : " << THREADS_BLOCK_MCMC << "\n\n"; 	

   	curandGenerator_t gen;
   	curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
   	curandSetPseudoRandomGeneratorSeed(gen,seed);

	//get pointers
	double *datam_ptr = thrust::raw_pointer_cast(datam_gpu.data());
	double *bvals_ptr = thrust::raw_pointer_cast(bvals_gpu.data());
	double *alpha_ptr = thrust::raw_pointer_cast(alpha_gpu.data());
	double *beta_ptr = thrust::raw_pointer_cast(beta_gpu.data());
	float *randomsN_ptr = thrust::raw_pointer_cast(randomsN_gpu.data());
	float *randomsU_ptr = thrust::raw_pointer_cast(randomsU_gpu.data());
	FibreGPU *fibres_ptr =  thrust::raw_pointer_cast(fibres_gpu.data());
	MultifibreGPU *multifibres_ptr = thrust::raw_pointer_cast(multifibres_gpu.data());
	double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
	double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());

	int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (10*nfib + 2*nparams + 24)*sizeof(float) + (7*nfib + 16)*sizeof(int);

	printf("Shared Memory Used in runmcmc_burnin: %i\n",amount_shared);
	
   	for(int i=0;i<steps;i++){

   		gettimeofday(&t1,NULL);

	   	curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1); 
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}
	   	status = curandGenerateUniform(gen,randomsU_ptr,nrandoms);	//generate randoms
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}

 	   	gettimeofday(&t2,NULL);
    	   	timecurand+=timeval_diff(&t2,&t1);

	   	gettimeofday(&t1,NULL);

	   	runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
	   	sync_check("runmcmc_burnin_kernel");

 	   	gettimeofday(&t2,NULL);
    	   	timemcmc+=timeval_diff(&t2,&t1);
   	}

	MultifibreGPU mult = multifibres_gpu[0];
	FibreGPU fib = fibres_gpu[0];

   	gettimeofday(&t1,NULL); 

   	if(nvox!=0){
   		curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}
   		status = curandGenerateUniform(gen,randomsU_ptr,last_randoms); 	//generate randoms
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}
   	}
	
   	gettimeofday(&t2,NULL);
   	timecurand+=timeval_diff(&t2,&t1);

   	gettimeofday(&t1,NULL);

   	if(nvox!=0){
		runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);   
   		sync_check("runmcmc_burnin_kernel");
   	}

   	gettimeofday(&t2,NULL);
   	timemcmc+=timeval_diff(&t2,&t1);

    	cout << "TIME CURAND: " << timecurand << " seconds\n"; 
    	cout << "TIME RUNMCMC: " << timemcmc << " seconds\n"; 
   
   	curandDestroyGenerator(gen);

	gettimeofday(&t_tot2,NULL);
    	time=timeval_diff(&t_tot2,&t_tot1);
   	cout << "TIME TOTAL: " << time << " seconds\n"; 
	cout << "--------------------------------------------" << "\n\n" ; 

   	sync_check("runmcmc_burnin");
}


void runmcmc_record(	//INPUT
			thrust::device_vector<double> 			datam_gpu,
			thrust::device_vector<double> 			bvals_gpu,
			thrust::device_vector<double> 			alpha_gpu,
			thrust::device_vector<double> 			beta_gpu,
			thrust::device_vector<FibreGPU> 		fibres_gpu,
			thrust::device_vector<MultifibreGPU> 		multifibres_gpu,
			thrust::device_vector<double>			signals_gpu,
			thrust::device_vector<double>			isosignals_gpu,
			const int 					ndirections,
			double 						seed,
			//OUTPUT
			thrust::device_vector<int>&			multirecords_gpu,
			thrust::device_vector<float>&			rf0_gpu,
			thrust::device_vector<float>&			rtau_gpu,
			thrust::device_vector<float>&			rs0_gpu,
			thrust::device_vector<float>&			rd_gpu,
			thrust::device_vector<float>&			rdstd_gpu,
			thrust::device_vector<float>&			rth_gpu,
			thrust::device_vector<float>&			rph_gpu,
			thrust::device_vector<float>&			rf_gpu,
			thrust::device_vector<float>&			rlikelihood_energy_gpu)
{
	xfibresOptions& opts = xfibresOptions::getInstance();
	
	cout << "----------------------------------------------------- " << "\n"; 
   	cout << "--------- MCMC ALGORITHM PART RECORD IN GPU --------- " << "\n"; 
   	cout << "----------------------------------------------------- " << "\n"; 	

   	struct timeval t1,t2,t_tot1,t_tot2;
   	double time,timecurand,timemcmc;
   	time=0;
   	timecurand=0;
   	timemcmc=0;

   	gettimeofday(&t_tot1,NULL);

   	size_t free,total;

	int totalrecords = (opts.njumps.value()/opts.sampleevery.value()); 
	
	int nvox = multifibres_gpu.size();
   	int nfib= opts.nfibres.value();
	int nparams;

	if(opts.f0.value()) nparams=3+nfib*3;
	else nparams=2+nfib*3;	
	if(opts.modelnum.value()==2) nparams++;
	if(opts.rician.value()) nparams++;
   
   	unsigned int totalrandoms=(opts.njumps.value() * nvox * nparams);

   	cuMemGetInfo(&free,&total);
   	cout << "Free memory Before Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
   	//4 bytes each float, 2 random arrays, and 80% of total memory at this moment 
   	unsigned int maxrandoms=((free*0.8)/(4*2)); 

   	cout << "Total randoms: " << totalrandoms << "\n"; 
   	cout << "Max randoms: " << maxrandoms << "\n"; 
   
   	int steps; //num iter if not enough memory
   	int minrandoms; //min num of randoms ensamble
   	minrandoms= nvox * nparams;

   	int iters_step=0;
	int nrandoms=0;	

   	if(totalrandoms>maxrandoms){ 
		iters_step = maxrandoms / minrandoms; 		//iterations in each step
		nrandoms = iters_step*minrandoms;		//nrandoms for each step
		steps =  (opts.njumps.value()/iters_step);  	//repeat process steps times, no enough memory for all randoms 			
   	}else{ 
		nrandoms = totalrandoms;
		iters_step= opts.njumps.value();
		steps = 0;
  	}   
	if(nrandoms%2){						//CURAND must generates multiples of 2 randoms
		nrandoms++;
	}

   	cout << "Process " << opts.njumps.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";    

   	int last_step = opts.njumps.value() - (iters_step*steps);
   	int last_randoms = (last_step*minrandoms); 
	if(last_randoms%2){					//CURAND must generates multiples of 2 randoms
		last_randoms++;
	}

   	cout << "Last step with " << last_step << " iterations" << "\n"; 
	
	thrust::device_vector<float> randomsN_gpu;
	thrust::device_vector<float> randomsU_gpu;	
	randomsN_gpu.resize(nrandoms);
	randomsU_gpu.resize(nrandoms);

   	cuMemGetInfo(&free,&total);
   	cout << "Free memory after Malloc Randoms: "<< free <<  " ---- Total memory: " << total << "\n";
   
  	int blocks = nvox;        
  	dim3 Dim_Grid(blocks, 1);
  	dim3 Dim_Block(THREADS_BLOCK_MCMC,1);	//dimensions for MCMC   

   	cout << "\n" << "NUM BLOCKS: " << blocks << "\n"; 
   	cout << "THREADS PER BLOCK : " << THREADS_BLOCK_MCMC << "\n\n"; 	

   	curandGenerator_t gen;
   	curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
   	curandSetPseudoRandomGeneratorSeed(gen,seed);

	//get pointers
	double *datam_ptr = thrust::raw_pointer_cast(datam_gpu.data());
	double *bvals_ptr = thrust::raw_pointer_cast(bvals_gpu.data());
	double *alpha_ptr = thrust::raw_pointer_cast(alpha_gpu.data());
	double *beta_ptr = thrust::raw_pointer_cast(beta_gpu.data());
	float *randomsN_ptr = thrust::raw_pointer_cast(randomsN_gpu.data());
	float *randomsU_ptr = thrust::raw_pointer_cast(randomsU_gpu.data());
	FibreGPU *fibres_ptr =  thrust::raw_pointer_cast(fibres_gpu.data());
	MultifibreGPU *multifibres_ptr = thrust::raw_pointer_cast(multifibres_gpu.data());
	double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
	double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());
	
	int *multirecords_ptr = thrust::raw_pointer_cast(multirecords_gpu.data());
	float *rf0_ptr = thrust::raw_pointer_cast(rf0_gpu.data());
	float *rtau_ptr = thrust::raw_pointer_cast(rtau_gpu.data());
	float *rs0_ptr = thrust::raw_pointer_cast(rs0_gpu.data());
	float *rd_ptr = thrust::raw_pointer_cast(rd_gpu.data());
	float *rdstd_ptr = thrust::raw_pointer_cast(rdstd_gpu.data());
	float *rth_ptr = thrust::raw_pointer_cast(rth_gpu.data());
	float *rph_ptr = thrust::raw_pointer_cast(rph_gpu.data());
	float *rf_ptr = thrust::raw_pointer_cast(rf_gpu.data());
	float *rlikelihood_energy_ptr = thrust::raw_pointer_cast(rlikelihood_energy_gpu.data());

	int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (10*nfib + 2*nparams + 24)*sizeof(float) + (7*nfib + 18)*sizeof(int);

	printf("Shared Memory Used in runmcmc_record: %i\n",amount_shared);

   	for(int i=0;i<steps;i++){

   		gettimeofday(&t1,NULL);

	   	curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1);
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}
	   	status = curandGenerateUniform(gen,randomsU_ptr,nrandoms);	//generate randoms
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}

 	   	gettimeofday(&t2,NULL);
    	   	timecurand+=timeval_diff(&t2,&t1);

	   	gettimeofday(&t1,NULL);

	   	runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);   
	   	sync_check("runmcmc_record_kernel");

 	   	gettimeofday(&t2,NULL);
    	   	timemcmc+=timeval_diff(&t2,&t1);
   	}

   	gettimeofday(&t1,NULL);

   	if(nvox!=0){
   		curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}
   		status = curandGenerateUniform(gen,randomsU_ptr,last_randoms); 	//generate randoms
		if (status != CURAND_STATUS_SUCCESS)
		{
			printf("Failure generating cuda random numbers: %d\n",status);
			exit(1);
		}
   	}
	
   	gettimeofday(&t2,NULL);
   	timecurand+=timeval_diff(&t2,&t1);

   	gettimeofday(&t1,NULL);

   	if(nvox!=0){
		runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);   
   		sync_check("runmcmc_record_kernel");
   	}

   	gettimeofday(&t2,NULL);
   	timemcmc+=timeval_diff(&t2,&t1);


    	cout << "TIME CURAND: " << timecurand << " seconds\n"; 
    	cout << "TIME RUNMCMC: " << timemcmc << " seconds\n"; 
   
   	curandDestroyGenerator(gen);

	gettimeofday(&t_tot2,NULL);
    	time=timeval_diff(&t_tot2,&t_tot1);
   	cout << "TIME TOTAL: " << time << " seconds\n"; 
	cout << "--------------------------------------------" << "\n\n" ; 
	
   	sync_check("runmcmc_record");
}