-
Moises Fernandez authoredMoises Fernandez authored
runmcmc.cu 18.33 KiB
/* runmcmc.cu
Tim Behrens, Saad Jbabdi, Stam Sotiropoulos, Moises Hernandez - FMRIB Image Analysis Group
Copyright (C) 2005 University of Oxford */
/* CCOPYRIGHT */
#include "xfibresoptions.h"
#include <curand.h>
#include "runmcmc_kernels.cu"
#include "sync_check.h"
#include <host_vector.h>
#include <device_vector.h>
#include <time.h>
#include <sys/time.h>
#include "init_gpu.h"
using namespace Xfibres;
//////////////////////////////////////////////////////
// MCMC IN GPU
//////////////////////////////////////////////////////
void init_Fibres_Multifibres( //INPUT
thrust::device_vector<double> datam_gpu,
thrust::device_vector<double> params_gpu,
thrust::device_vector<float> tau_gpu,
thrust::device_vector<double> bvals_gpu,
thrust::device_vector<double> alpha_gpu,
thrust::device_vector<double> beta_gpu,
const int ndirections,
//OUTPUT
thrust::device_vector<FibreGPU>& fibres_gpu,
thrust::device_vector<MultifibreGPU>& multifibres_gpu,
thrust::device_vector<double>& signals_gpu,
thrust::device_vector<double>& isosignals_gpu)
{
cout << "----------------------------------------------------- " << "\n";
cout << "------MCMC ALGORITHM PART INITIALITATION IN GPU ----- " << "\n";
cout << "----------------------------------------------------- " << "\n";
struct timeval t1,t2;
double time;
gettimeofday(&t1,NULL);
int nvox = multifibres_gpu.size();
xfibresOptions& opts = xfibresOptions::getInstance();
int nfib= opts.nfibres.value();
int nparams_fit = 2+3*opts.nfibres.value();
if(opts.modelnum.value()==2) nparams_fit++;
if(opts.f0.value()) nparams_fit++;
int blocks = nvox;
dim3 Dim_Grid_MCMC(blocks, 1);
dim3 Dim_Block_MCMC(THREADS_BLOCK_MCMC ,1); ///dimensions for MCMC
double *datam_ptr = thrust::raw_pointer_cast(datam_gpu.data());
double *params_ptr = thrust::raw_pointer_cast(params_gpu.data());
float *tau_ptr = thrust::raw_pointer_cast(tau_gpu.data());
double *bvals_ptr = thrust::raw_pointer_cast(bvals_gpu.data());
double *alpha_ptr = thrust::raw_pointer_cast(alpha_gpu.data());
double *beta_ptr = thrust::raw_pointer_cast(beta_gpu.data());
FibreGPU *fibres_ptr = thrust::raw_pointer_cast(fibres_gpu.data());
MultifibreGPU *multifibres_ptr = thrust::raw_pointer_cast(multifibres_gpu.data());
double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());
int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (3*nfib + 8)*sizeof(float);
printf("Shared Memory Used in init_Fibres_Multifibres: %i\n",amount_shared);
init_Fibres_Multifibres_kernel<<< Dim_Grid_MCMC, Dim_Block_MCMC, amount_shared>>>(datam_ptr, params_ptr, tau_ptr, bvals_ptr, alpha_ptr, beta_ptr, ndirections, nfib, nparams_fit, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.rician.value(), opts.ardf0.value(), opts.all_ard.value(), opts.no_ard.value(), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
sync_check("init_Fibres_Multifibres_kernel");
gettimeofday(&t2,NULL);
time=timeval_diff(&t2,&t1);
cout << "TIME TOTAL: " << time << " seconds\n";
cout << "--------------------------------------------" << "\n\n" ;
}
void runmcmc_burnin( //INPUT
thrust::device_vector<double> datam_gpu,
thrust::device_vector<double> bvals_gpu,
thrust::device_vector<double> alpha_gpu,
thrust::device_vector<double> beta_gpu,
const int ndirections,
double seed,
//INPUT-OUTPUT
thrust::device_vector<FibreGPU>& fibres_gpu,
thrust::device_vector<MultifibreGPU>& multifibres_gpu,
thrust::device_vector<double>& signals_gpu,
thrust::device_vector<double>& isosignals_gpu)
{
xfibresOptions& opts = xfibresOptions::getInstance();
cout << "----------------------------------------------------- " << "\n";
cout << "--------- MCMC ALGORITHM PART BURNIN IN GPU --------- " << "\n";
cout << "----------------------------------------------------- " << "\n";
struct timeval t1,t2,t_tot1,t_tot2;
double time,timecurand,timemcmc;
time=0;
timecurand=0;
timemcmc=0;
gettimeofday(&t_tot1,NULL);
size_t free,total;
int nvox = multifibres_gpu.size();
int nfib= opts.nfibres.value();
int nparams;
if(opts.f0.value()) nparams=3+nfib*3;
else nparams=2+nfib*3;
if(opts.modelnum.value()==2) nparams++;
if(opts.rician.value()) nparams++;
unsigned int totalrandoms=(opts.nburn.value() * nvox * nparams);
cuMemGetInfo(&free,&total);
cout << "Free memory Before Randoms: "<< free << " ---- Total memory: " << total << "\n";
//4 bytes each float, 2 random arrays, and 80% of total memory at this moment
unsigned int maxrandoms=((free*0.8)/(4*2));
cout << "Total randoms: " << totalrandoms << "\n";
cout << "Max randoms: " << maxrandoms << "\n";
int steps; //num iter if not enough memory
int minrandoms; //min num of randoms ensamble
minrandoms= nvox * nparams;
int iters_step=0;
int nrandoms=0;
if(totalrandoms>maxrandoms){
iters_step = maxrandoms / minrandoms; //iterations in each step
nrandoms = iters_step*minrandoms; //nrandoms for each step
steps = (opts.nburn.value()/iters_step); //repeat process steps times, no enough memory for all randoms
}else{
nrandoms = totalrandoms;
iters_step= opts.nburn.value();
steps = 0;
}
if(nrandoms%2){ //CURAND must generates multiples of 2 randoms
nrandoms++;
}
cout << "Process " << opts.nburn.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";
int last_step = opts.nburn.value() - (iters_step*steps);
int last_randoms = (last_step*minrandoms);
if(last_randoms%2){ //CURAND must generates multiples of 2 randoms
last_randoms++;
}
cout << "Last step with " << last_step << " iterations" << "\n";
thrust::device_vector<float> randomsN_gpu;
thrust::device_vector<float> randomsU_gpu;
randomsN_gpu.resize(nrandoms);
randomsU_gpu.resize(nrandoms);
cuMemGetInfo(&free,&total);
cout << "Free memory after Malloc Randoms: "<< free << " ---- Total memory: " << total << "\n";
int blocks = nvox;
dim3 Dim_Grid(blocks, 1);
dim3 Dim_Block(THREADS_BLOCK_MCMC,1); //dimensions for MCMC
cout << "\n" << "NUM BLOCKS: " << blocks << "\n";
cout << "THREADS PER BLOCK : " << THREADS_BLOCK_MCMC << "\n\n";
curandGenerator_t gen;
curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen,seed);
//get pointers
double *datam_ptr = thrust::raw_pointer_cast(datam_gpu.data());
double *bvals_ptr = thrust::raw_pointer_cast(bvals_gpu.data());
double *alpha_ptr = thrust::raw_pointer_cast(alpha_gpu.data());
double *beta_ptr = thrust::raw_pointer_cast(beta_gpu.data());
float *randomsN_ptr = thrust::raw_pointer_cast(randomsN_gpu.data());
float *randomsU_ptr = thrust::raw_pointer_cast(randomsU_gpu.data());
FibreGPU *fibres_ptr = thrust::raw_pointer_cast(fibres_gpu.data());
MultifibreGPU *multifibres_ptr = thrust::raw_pointer_cast(multifibres_gpu.data());
double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());
int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (10*nfib + 2*nparams + 24)*sizeof(float) + (7*nfib + 16)*sizeof(int);
printf("Shared Memory Used in runmcmc_burnin: %i\n",amount_shared);
for(int i=0;i<steps;i++){
gettimeofday(&t1,NULL);
curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1);
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
status = curandGenerateUniform(gen,randomsU_ptr,nrandoms); //generate randoms
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
gettimeofday(&t2,NULL);
timecurand+=timeval_diff(&t2,&t1);
gettimeofday(&t1,NULL);
runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
sync_check("runmcmc_burnin_kernel");
gettimeofday(&t2,NULL);
timemcmc+=timeval_diff(&t2,&t1);
}
MultifibreGPU mult = multifibres_gpu[0];
FibreGPU fib = fibres_gpu[0];
gettimeofday(&t1,NULL);
if(nvox!=0){
curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
status = curandGenerateUniform(gen,randomsU_ptr,last_randoms); //generate randoms
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
}
gettimeofday(&t2,NULL);
timecurand+=timeval_diff(&t2,&t1);
gettimeofday(&t1,NULL);
if(nvox!=0){
runmcmc_burnin_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr);
sync_check("runmcmc_burnin_kernel");
}
gettimeofday(&t2,NULL);
timemcmc+=timeval_diff(&t2,&t1);
cout << "TIME CURAND: " << timecurand << " seconds\n";
cout << "TIME RUNMCMC: " << timemcmc << " seconds\n";
curandDestroyGenerator(gen);
gettimeofday(&t_tot2,NULL);
time=timeval_diff(&t_tot2,&t_tot1);
cout << "TIME TOTAL: " << time << " seconds\n";
cout << "--------------------------------------------" << "\n\n" ;
sync_check("runmcmc_burnin");
}
void runmcmc_record( //INPUT
thrust::device_vector<double> datam_gpu,
thrust::device_vector<double> bvals_gpu,
thrust::device_vector<double> alpha_gpu,
thrust::device_vector<double> beta_gpu,
thrust::device_vector<FibreGPU> fibres_gpu,
thrust::device_vector<MultifibreGPU> multifibres_gpu,
thrust::device_vector<double> signals_gpu,
thrust::device_vector<double> isosignals_gpu,
const int ndirections,
double seed,
//OUTPUT
thrust::device_vector<int>& multirecords_gpu,
thrust::device_vector<float>& rf0_gpu,
thrust::device_vector<float>& rtau_gpu,
thrust::device_vector<float>& rs0_gpu,
thrust::device_vector<float>& rd_gpu,
thrust::device_vector<float>& rdstd_gpu,
thrust::device_vector<float>& rth_gpu,
thrust::device_vector<float>& rph_gpu,
thrust::device_vector<float>& rf_gpu,
thrust::device_vector<float>& rlikelihood_energy_gpu)
{
xfibresOptions& opts = xfibresOptions::getInstance();
cout << "----------------------------------------------------- " << "\n";
cout << "--------- MCMC ALGORITHM PART RECORD IN GPU --------- " << "\n";
cout << "----------------------------------------------------- " << "\n";
struct timeval t1,t2,t_tot1,t_tot2;
double time,timecurand,timemcmc;
time=0;
timecurand=0;
timemcmc=0;
gettimeofday(&t_tot1,NULL);
size_t free,total;
int totalrecords = (opts.njumps.value()/opts.sampleevery.value());
int nvox = multifibres_gpu.size();
int nfib= opts.nfibres.value();
int nparams;
if(opts.f0.value()) nparams=3+nfib*3;
else nparams=2+nfib*3;
if(opts.modelnum.value()==2) nparams++;
if(opts.rician.value()) nparams++;
unsigned int totalrandoms=(opts.njumps.value() * nvox * nparams);
cuMemGetInfo(&free,&total);
cout << "Free memory Before Randoms: "<< free << " ---- Total memory: " << total << "\n";
//4 bytes each float, 2 random arrays, and 80% of total memory at this moment
unsigned int maxrandoms=((free*0.8)/(4*2));
cout << "Total randoms: " << totalrandoms << "\n";
cout << "Max randoms: " << maxrandoms << "\n";
int steps; //num iter if not enough memory
int minrandoms; //min num of randoms ensamble
minrandoms= nvox * nparams;
int iters_step=0;
int nrandoms=0;
if(totalrandoms>maxrandoms){
iters_step = maxrandoms / minrandoms; //iterations in each step
nrandoms = iters_step*minrandoms; //nrandoms for each step
steps = (opts.njumps.value()/iters_step); //repeat process steps times, no enough memory for all randoms
}else{
nrandoms = totalrandoms;
iters_step= opts.njumps.value();
steps = 0;
}
if(nrandoms%2){ //CURAND must generates multiples of 2 randoms
nrandoms++;
}
cout << "Process " << opts.njumps.value() << " iterations divided in "<< steps << " steps with "<< iters_step << " iterations in each one" << "\n";
int last_step = opts.njumps.value() - (iters_step*steps);
int last_randoms = (last_step*minrandoms);
if(last_randoms%2){ //CURAND must generates multiples of 2 randoms
last_randoms++;
}
cout << "Last step with " << last_step << " iterations" << "\n";
thrust::device_vector<float> randomsN_gpu;
thrust::device_vector<float> randomsU_gpu;
randomsN_gpu.resize(nrandoms);
randomsU_gpu.resize(nrandoms);
cuMemGetInfo(&free,&total);
cout << "Free memory after Malloc Randoms: "<< free << " ---- Total memory: " << total << "\n";
int blocks = nvox;
dim3 Dim_Grid(blocks, 1);
dim3 Dim_Block(THREADS_BLOCK_MCMC,1); //dimensions for MCMC
cout << "\n" << "NUM BLOCKS: " << blocks << "\n";
cout << "THREADS PER BLOCK : " << THREADS_BLOCK_MCMC << "\n\n";
curandGenerator_t gen;
curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen,seed);
//get pointers
double *datam_ptr = thrust::raw_pointer_cast(datam_gpu.data());
double *bvals_ptr = thrust::raw_pointer_cast(bvals_gpu.data());
double *alpha_ptr = thrust::raw_pointer_cast(alpha_gpu.data());
double *beta_ptr = thrust::raw_pointer_cast(beta_gpu.data());
float *randomsN_ptr = thrust::raw_pointer_cast(randomsN_gpu.data());
float *randomsU_ptr = thrust::raw_pointer_cast(randomsU_gpu.data());
FibreGPU *fibres_ptr = thrust::raw_pointer_cast(fibres_gpu.data());
MultifibreGPU *multifibres_ptr = thrust::raw_pointer_cast(multifibres_gpu.data());
double *signals_ptr = thrust::raw_pointer_cast(signals_gpu.data());
double *isosignals_ptr = thrust::raw_pointer_cast(isosignals_gpu.data());
int *multirecords_ptr = thrust::raw_pointer_cast(multirecords_gpu.data());
float *rf0_ptr = thrust::raw_pointer_cast(rf0_gpu.data());
float *rtau_ptr = thrust::raw_pointer_cast(rtau_gpu.data());
float *rs0_ptr = thrust::raw_pointer_cast(rs0_gpu.data());
float *rd_ptr = thrust::raw_pointer_cast(rd_gpu.data());
float *rdstd_ptr = thrust::raw_pointer_cast(rdstd_gpu.data());
float *rth_ptr = thrust::raw_pointer_cast(rth_gpu.data());
float *rph_ptr = thrust::raw_pointer_cast(rph_gpu.data());
float *rf_ptr = thrust::raw_pointer_cast(rf_gpu.data());
float *rlikelihood_energy_ptr = thrust::raw_pointer_cast(rlikelihood_energy_gpu.data());
int amount_shared = (THREADS_BLOCK_MCMC+1)*sizeof(double) + (10*nfib + 2*nparams + 24)*sizeof(float) + (7*nfib + 18)*sizeof(int);
printf("Shared Memory Used in runmcmc_record: %i\n",amount_shared);
for(int i=0;i<steps;i++){
gettimeofday(&t1,NULL);
curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,nrandoms,0,1);
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
status = curandGenerateUniform(gen,randomsU_ptr,nrandoms); //generate randoms
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
gettimeofday(&t2,NULL);
timecurand+=timeval_diff(&t2,&t1);
gettimeofday(&t1,NULL);
runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), iters_step, (i*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);
sync_check("runmcmc_record_kernel");
gettimeofday(&t2,NULL);
timemcmc+=timeval_diff(&t2,&t1);
}
gettimeofday(&t1,NULL);
if(nvox!=0){
curandStatus_t status = curandGenerateNormal(gen,randomsN_ptr,last_randoms,0,1);
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
status = curandGenerateUniform(gen,randomsU_ptr,last_randoms); //generate randoms
if (status != CURAND_STATUS_SUCCESS)
{
printf("Failure generating cuda random numbers: %d\n",status);
exit(1);
}
}
gettimeofday(&t2,NULL);
timecurand+=timeval_diff(&t2,&t1);
gettimeofday(&t1,NULL);
if(nvox!=0){
runmcmc_record_kernel<<< Dim_Grid, Dim_Block, amount_shared >>>(datam_ptr, bvals_ptr, alpha_ptr, beta_ptr, fibres_ptr, multifibres_ptr, signals_ptr, isosignals_ptr, randomsN_ptr, randomsU_ptr, ndirections, nfib, nparams, opts.modelnum.value(), opts.fudge.value(), opts.f0.value(), opts.ardf0.value(), !opts.no_ard.value(), opts.rician.value(), opts.updateproposalevery.value(), last_step, (steps*iters_step), opts.nburn.value(), opts.sampleevery.value(), totalrecords, multirecords_ptr, rf0_ptr, rtau_ptr, rs0_ptr, rd_ptr, rdstd_ptr, rth_ptr, rph_ptr, rf_ptr, rlikelihood_energy_ptr);
sync_check("runmcmc_record_kernel");
}
gettimeofday(&t2,NULL);
timemcmc+=timeval_diff(&t2,&t1);
cout << "TIME CURAND: " << timecurand << " seconds\n";
cout << "TIME RUNMCMC: " << timemcmc << " seconds\n";
curandDestroyGenerator(gen);
gettimeofday(&t_tot2,NULL);
time=timeval_diff(&t_tot2,&t_tot1);
cout << "TIME TOTAL: " << time << " seconds\n";
cout << "--------------------------------------------" << "\n\n" ;
sync_check("runmcmc_record");
}