diff --git a/CUDA/diffmodels.cu b/CUDA/diffmodels.cu new file mode 100644 index 0000000000000000000000000000000000000000..12288e805375747bbef1a58a400a777263c201ec --- /dev/null +++ b/CUDA/diffmodels.cu @@ -0,0 +1,248 @@ +#include "newmat.h" +#include "newimage/newimageall.h" +#include "xfibresoptions.h" +#include "diffmodels.h" + +#include "PVM_single.cu" +#include "PVM_single_c.cu" +#include "PVM_multi.cu" +#include "fit_gpu_kernels.h" +#include "dim_blocks.h" +#include "sync_check.h" + +#include <time.h> +#include <sys/time.h> +#include "init_gpu.h" + +using namespace Xfibres; + +////////////////////////////////////////////////////// +// FIT IN GPU +////////////////////////////////////////////////////// +void fit_PVM_single( //INPUT + const vector<ColumnVector> datam_vec, + const vector<Matrix> bvecs_vec, + const vector<Matrix> bvals_vec, + thrust::device_vector<double> datam_gpu, + thrust::device_vector<double> bvecs_gpu, + thrust::device_vector<double> bvals_gpu, + bool m_include_f0, + //OUTPUT + thrust::device_vector<double>& params_gpu) +{ + xfibresOptions& opts = xfibresOptions::getInstance(); + int nvox = datam_vec.size(); + int nfib = opts.nfibres.value(); + int nparams; + if (m_include_f0) + nparams = nfib*3 + 3; + else + nparams = nfib*3 + 2; + + for(int vox=0;vox<nvox;vox++){ + // initialise with a tensor + DTI dti(datam_vec[vox],bvecs_vec[vox],bvals_vec[vox]); + dti.linfit(); + + // set starting parameters for nonlinear fitting + float _th,_ph; + cart2sph(dti.get_v1(),_th,_ph); + + params_gpu[vox*nparams] = dti.get_s0(); + //start(2) = dti.get_md()>0?dti.get_md()*2:0.001; // empirically found that d~2*MD + params_gpu[vox*nparams+1] = dti.get_l1()>0?dti.get_l1():0.002; // empirically found that d~L1 + params_gpu[vox*nparams+2] = dti.get_fa()<1?f2x(dti.get_fa()):f2x(0.95); // first pvf = FA + params_gpu[vox*nparams+3] = _th; + params_gpu[vox*nparams+4] = _ph; + float sumf=x2f(params_gpu[vox*nparams+2]); + float tmpsumf=sumf; + for(int ii=2,i=5;ii<=nfib;ii++,i+=3){ + float denom=2; + do{ + params_gpu[vox*nparams+i] = f2x(x2f(params_gpu[vox*nparams+i-3])/denom); + denom *= 2; + tmpsumf = sumf + x2f(params_gpu[vox*nparams+i]); + }while(tmpsumf>=1); + sumf += x2f(params_gpu[vox*nparams+i]); + cart2sph(dti.get_v(ii),_th,_ph); + params_gpu[vox*nparams+i+1] = _th; + params_gpu[vox*nparams+i+2] = _ph; + } + if (m_include_f0) + params_gpu[vox*nparams+nparams-1]=f2x(FSMALL); + } + + int blocks = nvox/THREADS_X_BLOCK_FIT; + if (nvox % THREADS_X_BLOCK_FIT) blocks++; + dim3 Dim_Grid(blocks,1); + dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); + + fit_PVM_single_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, nfib, m_include_f0, thrust::raw_pointer_cast(params_gpu.data())); + sync_check("fit_PVM_single_kernel"); +} + +void fit_PVM_single_c( //INPUT + const vector<ColumnVector> datam_vec, + const vector<Matrix> bvecs_vec, + const vector<Matrix> bvals_vec, + thrust::device_vector<double> datam_gpu, + thrust::device_vector<double> bvecs_gpu, + thrust::device_vector<double> bvals_gpu, + bool m_include_f0, + //OUTPUT + thrust::device_vector<double>& params_gpu) +{ + xfibresOptions& opts = xfibresOptions::getInstance(); + int nvox = datam_vec.size(); + int nfib = opts.nfibres.value(); + int nparams; + if (m_include_f0) + nparams = nfib*3 + 3; + else + nparams = nfib*3 + 2; + + for(int vox=0;vox<nvox;vox++){ + // initialise with a tensor + DTI dti(datam_vec[vox],bvecs_vec[vox],bvals_vec[vox]); + dti.linfit(); + + // set starting parameters for nonlinear fitting + float _th,_ph; + cart2sph(dti.get_v1(),_th,_ph); + + ColumnVector start(nparams); + //Initialize the non-linear fitter. Use the DTI estimates for most parameters, apart from the volume fractions + start(1) = dti.get_s0(); + //start(2) = d2lambda(dti.get_md()>0?dti.get_md()*2:0.001); // empirically found that d~2*MD + start(2) = d2lambda(dti.get_l1()>0?dti.get_l1():0.002); // empirically found that d~L1 + start(4) = _th; + start(5) = _ph; + for(int ii=2,i=6;ii<=nfib;ii++,i+=3){ + cart2sph(dti.get_v(ii),_th,_ph); + start(i+1) = _th; + start(i+2) = _ph; + } + + // do a better job for initializing the volume fractions + PVM_single_c pvm(datam_vec[vox],bvecs_vec[vox],bvals_vec[vox],opts.nfibres.value(),false,m_include_f0,false); + pvm.fit_pvf(start); + + for(int i=0;i<nparams;i++){ + params_gpu[vox*nparams+i]=start(i+1); + } + } + + int blocks = nvox/THREADS_X_BLOCK_FIT; + if (nvox % THREADS_X_BLOCK_FIT) blocks++; + dim3 Dim_Grid(blocks,1); + dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); + + fit_PVM_single_c_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, nfib, false, m_include_f0, false, thrust::raw_pointer_cast(params_gpu.data())); + sync_check("fit_PVM_single_c_kernel"); +} + +void fit_PVM_multi( //INPUT + thrust::device_vector<double> datam_gpu, + thrust::device_vector<double> bvecs_gpu, + thrust::device_vector<double> bvals_gpu, + int nvox, + bool m_include_f0, + //OUTPUT + thrust::device_vector<double>& params_gpu) +{ + xfibresOptions& opts = xfibresOptions::getInstance(); + int nfib = opts.nfibres.value(); + + int blocks = nvox/THREADS_X_BLOCK_FIT; + if (nvox % THREADS_X_BLOCK_FIT) blocks++; + dim3 Dim_Grid(blocks,1); + dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); + + int nparams; + if (m_include_f0) + nparams = nfib*3 + 4; + else + nparams = nfib*3 + 3; + + thrust::device_vector<double> params_PVM_single_c_gpu; //copy params to an auxiliar structure because there are different number of nparams + params_PVM_single_c_gpu.resize(nvox*nparams); //between single_c and multi. We must read and write in different structures, + thrust::copy(params_gpu.begin(), params_gpu.end(), params_PVM_single_c_gpu.begin()); + //maybe 1 block finish before other one read their params. + + fit_PVM_multi_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_PVM_single_c_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, nfib, m_include_f0, thrust::raw_pointer_cast(params_gpu.data())); + sync_check("fit_PVM_multi_kernel"); +} + +void calculate_tau( //INPUT + thrust::device_vector<double> datam_gpu, + thrust::device_vector<double>& params_gpu, + thrust::device_vector<double> bvecs_gpu, + thrust::device_vector<double> bvals_gpu, + thrust::host_vector<int>& vox_repeat, + int nrepeat, + string output_file, + //OUTPUT + thrust::host_vector<float>& tau) +{ + std::ofstream myfile; + myfile.open (output_file.data(), ios::out | ios::app ); + myfile << "----------------------------------------------------- " << "\n"; + myfile << "--------- CALCULATE TAU/RESIDULAS IN GPU ------------ " << "\n"; + myfile << "----------------------------------------------------- " << "\n"; + + struct timeval t1,t2; + double time; + gettimeofday(&t1,NULL); + + xfibresOptions& opts = xfibresOptions::getInstance(); + int nvox = vox_repeat.size(); + int nfib = opts.nfibres.value(); + + thrust::device_vector<bool> includes_f0_gpu; + includes_f0_gpu.resize(nvox); + thrust::fill(includes_f0_gpu.begin(), includes_f0_gpu.end(), opts.f0.value()); + + if(opts.f0.value()){ + for(int i=0;i<nrepeat;i++){ + includes_f0_gpu[vox_repeat[i]]=false; //if has been reprocessed f0 will be 0. + } + } + + int blocks = nvox/THREADS_X_BLOCK_FIT; + if (nvox % THREADS_X_BLOCK_FIT) blocks++; + dim3 Dim_Grid(blocks,1); + dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); + + thrust::device_vector<double> residuals_gpu; + residuals_gpu.resize(nvox*NDIRECTIONS); + + if(opts.modelnum.value()==1){ + if(opts.nonlin.value()){ + get_residuals_PVM_single_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, nfib, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); + sync_check("get_residuals_PVM_single_kernel"); + + }else{ + get_residuals_PVM_single_c_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, nfib, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); + sync_check("get_residuals_PVM_single_c_kernel"); + } + }else{ + //model 2 : non-mono-exponential + get_residuals_PVM_multi_kernel<<<Dim_Grid, Dim_Block>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, nfib, opts.f0.value(), thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); + sync_check("get_residuals_PVM_multi_kernel"); + } + + ColumnVector res(NDIRECTIONS); + for(int vox=0;vox<nvox;vox++){ + for(int i=0;i<NDIRECTIONS;i++) res(i+1)= residuals_gpu[vox*NDIRECTIONS+i]; + + float variance=var(res).AsScalar(); + tau[vox]=1.0/variance; + } + + gettimeofday(&t2,NULL); + time=timeval_diff(&t2,&t1); + myfile << "TIME TOTAL: " << time << " seconds\n"; + myfile << "--------------------------------------------" << "\n\n" ; + myfile.close(); +} +