From 87efb1bfe984d37b17580b138789a06a9c91b9c6 Mon Sep 17 00:00:00 2001 From: Moises Fernandez <moisesf@fmrib.ox.ac.uk> Date: Fri, 19 Apr 2013 14:24:23 +0000 Subject: [PATCH] Avoid extra copies of bvals-bvecs if consider gradient nonlinearities is not activated --- CUDA/diffmodels.cu | 30 +++++++---- CUDA/xfibres_gpu.cu | 121 +++++++++++++++++++++++++++++--------------- 2 files changed, 99 insertions(+), 52 deletions(-) diff --git a/CUDA/diffmodels.cu b/CUDA/diffmodels.cu index 83870f7..5ef7276 100644 --- a/CUDA/diffmodels.cu +++ b/CUDA/diffmodels.cu @@ -35,6 +35,7 @@ void fit_PVM_single( //INPUT int ndirections, int nfib, bool m_include_f0, + bool gradnonlin, string output_file, //OUTPUT thrust::device_vector<double>& params_gpu) @@ -54,7 +55,10 @@ void fit_PVM_single( //INPUT for(int vox=0;vox<nvox;vox++){ // initialise with a tensor - DTI dti(datam_vec[vox],bvecs_vec[vox],bvals_vec[vox]); + int pos_bv; + if (gradnonlin) pos_bv=vox; + else pos_bv=0; + DTI dti(datam_vec[vox],bvecs_vec[pos_bv],bvals_vec[pos_bv]); dti.linfit(); // set starting parameters for nonlinear fitting @@ -95,7 +99,7 @@ void fit_PVM_single( //INPUT myfile << "Shared Memory Used in fit_PVM_single: " << amount_shared << "\n"; - fit_PVM_single_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, m_include_f0, thrust::raw_pointer_cast(params_gpu.data())); + fit_PVM_single_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, m_include_f0, gradnonlin,thrust::raw_pointer_cast(params_gpu.data())); sync_check("fit_PVM_single_kernel"); myfile.close(); @@ -110,7 +114,8 @@ void fit_PVM_single_c( //INPUT thrust::device_vector<double> bvals_gpu, int ndirections, int nfib, - bool m_include_f0, + bool m_include_f0, + bool gradnonlin, string output_file, //OUTPUT thrust::device_vector<double>& params_gpu) @@ -130,7 +135,10 @@ void fit_PVM_single_c( //INPUT for(int vox=0;vox<nvox;vox++){ // initialise with a tensor - DTI dti(datam_vec[vox],bvecs_vec[vox],bvals_vec[vox]); + int pos_bv; + if (gradnonlin) pos_bv=vox; + else pos_bv=0; + DTI dti(datam_vec[vox],bvecs_vec[pos_bv],bvals_vec[pos_bv]); dti.linfit(); // set starting parameters for nonlinear fitting @@ -151,7 +159,7 @@ void fit_PVM_single_c( //INPUT } // do a better job for initializing the volume fractions - PVM_single_c pvm(datam_vec[vox],bvecs_vec[vox],bvals_vec[vox],nfib,false,m_include_f0,false); + PVM_single_c pvm(datam_vec[vox],bvecs_vec[pos_bv],bvals_vec[pos_bv],nfib,false,m_include_f0,false); pvm.fit_pvf(start); for(int i=0;i<nparams;i++){ @@ -169,7 +177,7 @@ void fit_PVM_single_c( //INPUT myfile << "Shared Memory Used in fit_PVM_single_c: " << amount_shared << "\n"; - fit_PVM_single_c_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, false, m_include_f0, false, thrust::raw_pointer_cast(params_gpu.data())); + fit_PVM_single_c_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, false, m_include_f0, false, gradnonlin,thrust::raw_pointer_cast(params_gpu.data())); sync_check("fit_PVM_single_c_kernel"); myfile.close(); @@ -183,6 +191,7 @@ void fit_PVM_multi( //INPUT int ndirections, int nfib, bool m_include_f0, + bool gradnonlin, string output_file, //OUTPUT thrust::device_vector<double>& params_gpu) @@ -209,7 +218,7 @@ void fit_PVM_multi( //INPUT myfile << "Shared Memory Used in fit_PVM_multi: " << amount_shared << "\n"; - fit_PVM_multi_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_PVM_single_c_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, m_include_f0, thrust::raw_pointer_cast(params_gpu.data())); + fit_PVM_multi_kernel<<<Dim_Grid, Dim_Block, amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_PVM_single_c_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()) ,nvox, ndirections, nfib, nparams, m_include_f0, gradnonlin,thrust::raw_pointer_cast(params_gpu.data())); sync_check("fit_PVM_multi_kernel"); myfile.close(); @@ -227,6 +236,7 @@ void calculate_tau( //INPUT int model, bool m_include_f0, bool nonlin, + bool gradnonlin, string output_file, //OUTPUT thrust::host_vector<float>& tau) @@ -269,16 +279,16 @@ void calculate_tau( //INPUT if(model==1){ if(nonlin){ - get_residuals_PVM_single_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, m_include_f0, thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); + get_residuals_PVM_single_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, m_include_f0,gradnonlin, thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); sync_check("get_residuals_PVM_single_kernel"); }else{ - get_residuals_PVM_single_c_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, m_include_f0, thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); + get_residuals_PVM_single_c_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, m_include_f0, gradnonlin,thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); sync_check("get_residuals_PVM_single_c_kernel"); } }else{ //model 2 : non-mono-exponential - get_residuals_PVM_multi_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, m_include_f0, thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); + get_residuals_PVM_multi_kernel<<<Dim_Grid, Dim_Block,amount_shared>>>(thrust::raw_pointer_cast(datam_gpu.data()), thrust::raw_pointer_cast(params_gpu.data()), thrust::raw_pointer_cast(bvecs_gpu.data()), thrust::raw_pointer_cast(bvals_gpu.data()), nvox, ndirections, nfib, nparams, m_include_f0, gradnonlin,thrust::raw_pointer_cast(includes_f0_gpu.data()), thrust::raw_pointer_cast(residuals_gpu.data())); sync_check("get_residuals_PVM_multi_kernel"); } diff --git a/CUDA/xfibres_gpu.cu b/CUDA/xfibres_gpu.cu index 53430bf..dbb7232 100644 --- a/CUDA/xfibres_gpu.cu +++ b/CUDA/xfibres_gpu.cu @@ -68,6 +68,7 @@ void xfibres_gpu( //INPUT int nvox = datam.Ncols(); int ndirections = datam.Nrows(); int nfib= opts.nfibres.value(); + bool gradnonlin=opts.grad_file.set(); if(nvox>0){ thrust::host_vector<double> datam_host, bvecs_host, bvals_host, alpha_host, beta_host, params_host; @@ -89,7 +90,7 @@ void xfibres_gpu( //INPUT fit(datam_vec,bvecs_vec,bvals_vec,datam_host,bvecs_host,bvals_host,datam_gpu,bvecs_gpu,bvals_gpu,ndirections,gpu_log,params_gpu,vox_repeat,nrepeat); if(opts.rician.value()){ - calculate_tau(datam_gpu,params_gpu,bvecs_gpu,bvals_gpu,vox_repeat,nrepeat,ndirections,nfib,opts.modelnum.value(),opts.f0.value(),opts.nonlin.value(),gpu_log,tau_host); + calculate_tau(datam_gpu,params_gpu,bvecs_gpu,bvals_gpu,vox_repeat,nrepeat,ndirections,nfib,opts.modelnum.value(),opts.f0.value(),opts.nonlin.value(),gradnonlin,gpu_log,tau_host); } bvecs_gpu.clear(); //free bvecs_gpu @@ -187,10 +188,11 @@ void fit( //INPUT int nparams_fit = 2+3*opts.nfibres.value(); if(opts.modelnum.value()==2) nparams_fit++; if(opts.f0.value()) nparams_fit++; + bool gradnonlin=opts.grad_file.set(); if(opts.modelnum.value()==1){ if(opts.nonlin.value()){ - fit_PVM_single(datam_vec,bvecs_vec,bvals_vec,datam_gpu,bvecs_gpu,bvals_gpu,ndirections,nfib,opts.f0.value(),output_file,params_gpu); + fit_PVM_single(datam_vec,bvecs_vec,bvals_vec,datam_gpu,bvecs_gpu,bvals_gpu,ndirections,nfib,opts.f0.value(),gradnonlin,output_file,params_gpu); if (opts.f0.value()){ float md,mf,f0; @@ -223,14 +225,14 @@ void fit( //INPUT thrust::device_vector<double> bvals_repeat_gpu=bvals_repeat_host; thrust::device_vector<double> params_repeat_gpu=params_repeat_host; - fit_PVM_single(datam_repeat_vec,bvecs_repeat_vec,bvals_repeat_vec,datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,ndirections,nfib,false,output_file,params_repeat_gpu); + fit_PVM_single(datam_repeat_vec,bvecs_repeat_vec,bvals_repeat_vec,datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,ndirections,nfib,false,gradnonlin,output_file,params_repeat_gpu); thrust::copy(params_repeat_gpu.begin(), params_repeat_gpu.end(), params_repeat_host.begin()); //mix all the parameteres: repeated and not repeated mix_params(params_repeat_host,vox_repeat, nrepeat, nvox, params_gpu); } } }else{ - fit_PVM_single_c(datam_vec,bvecs_vec,bvals_vec,datam_gpu,bvecs_gpu,bvals_gpu,ndirections,nfib,opts.f0.value(),output_file,params_gpu); + fit_PVM_single_c(datam_vec,bvecs_vec,bvals_vec,datam_gpu,bvecs_gpu,bvals_gpu,ndirections,nfib,opts.f0.value(),gradnonlin,output_file,params_gpu); if (opts.f0.value()){ float md,mf,f0; @@ -263,7 +265,7 @@ void fit( //INPUT thrust::device_vector<double> bvals_repeat_gpu=bvals_repeat_host; thrust::device_vector<double> params_repeat_gpu=params_repeat_host; - fit_PVM_single_c(datam_repeat_vec,bvecs_repeat_vec,bvals_repeat_vec,datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,ndirections,nfib,false,output_file,params_repeat_gpu); + fit_PVM_single_c(datam_repeat_vec,bvecs_repeat_vec,bvals_repeat_vec,datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,ndirections,nfib,false,gradnonlin,output_file,params_repeat_gpu); thrust::copy(params_repeat_gpu.begin(), params_repeat_gpu.end(), params_repeat_host.begin()); //mix all the parameteres: repeated and not repeated @@ -273,9 +275,9 @@ void fit( //INPUT } }else{ //model 2 : non-mono-exponential - fit_PVM_single_c(datam_vec,bvecs_vec,bvals_vec,datam_gpu,bvecs_gpu,bvals_gpu,ndirections,nfib,opts.f0.value(),output_file,params_gpu); + fit_PVM_single_c(datam_vec,bvecs_vec,bvals_vec,datam_gpu,bvecs_gpu,bvals_gpu,ndirections,nfib,opts.f0.value(),gradnonlin,output_file,params_gpu); - fit_PVM_multi(datam_gpu,bvecs_gpu,bvals_gpu,nvox,ndirections,nfib,opts.f0.value(),output_file,params_gpu); + fit_PVM_multi(datam_gpu,bvecs_gpu,bvals_gpu,nvox,ndirections,nfib,opts.f0.value(),gradnonlin,output_file,params_gpu); if (opts.f0.value()){ float md,mf,f0; @@ -308,9 +310,9 @@ void fit( //INPUT thrust::device_vector<double> bvals_repeat_gpu=bvals_repeat_host; thrust::device_vector<double> params_repeat_gpu=params_repeat_host; - fit_PVM_single_c(datam_repeat_vec,bvecs_repeat_vec,bvals_repeat_vec,datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,ndirections,nfib,false,output_file,params_repeat_gpu); + fit_PVM_single_c(datam_repeat_vec,bvecs_repeat_vec,bvals_repeat_vec,datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,ndirections,nfib,false,gradnonlin,output_file,params_repeat_gpu); - fit_PVM_multi(datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,nrepeat,ndirections,nfib,false,output_file,params_repeat_gpu); + fit_PVM_multi(datam_repeat_gpu,bvecs_repeat_gpu,bvals_repeat_gpu,nrepeat,ndirections,nfib,false,gradnonlin,output_file,params_repeat_gpu); thrust::copy(params_repeat_gpu.begin(), params_repeat_gpu.end(), params_repeat_host.begin()); //mix all the parameteres: repeated and not repeated @@ -357,14 +359,22 @@ void prepare_data_gpu_FIT( //INPUT } } - bvecs_vec.resize(nvox); - bvals_vec.resize(nvox); - bvecs_host.resize(nvox*bvecs.Nrows()*bvecs.Ncols()); - bvals_host.resize(nvox*bvals.Ncols()); + if (opts.grad_file.set()){ + bvecs_vec.resize(nvox); + bvals_vec.resize(nvox); + bvecs_host.resize(nvox*bvecs.Nrows()*bvecs.Ncols()); + bvals_host.resize(nvox*bvals.Ncols()); + alpha_host.resize(nvox*bvecs.Ncols()); + beta_host.resize(nvox*bvecs.Ncols()); + }else{ + bvecs_vec.resize(1); + bvals_vec.resize(1); + bvecs_host.resize(1*bvecs.Nrows()*bvecs.Ncols()); + bvals_host.resize(1*bvals.Ncols()); + alpha_host.resize(1*bvecs.Ncols()); + beta_host.resize(1*bvecs.Ncols()); + } - alpha_host.resize(nvox*bvecs.Ncols()); - beta_host.resize(nvox*bvecs.Ncols()); - ColumnVector alpha,beta; if (opts.grad_file.set()){ @@ -386,18 +396,16 @@ void prepare_data_gpu_FIT( //INPUT }else{ MISCMATHS::cart2sph(bvecs,alpha,beta); - for(int vox=0;vox<nvox;vox++){ - bvecs_vec[vox]=bvecs; - bvals_vec[vox]=bvals; - for(int dir=0;dir<ndirections;dir++){ - bvecs_host[vox*ndirections*3+dir] = bvecs(1,dir+1); - bvecs_host[vox*ndirections*3+ndirections+dir] = bvecs(2,dir+1); - bvecs_host[vox*ndirections*3+ndirections*2+dir] = bvecs(3,dir+1); - bvals_host[vox*ndirections+dir] = bvals(1,dir+1); + bvecs_vec[0]=bvecs; + bvals_vec[0]=bvals; + for(int dir=0;dir<ndirections;dir++){ + bvecs_host[ndirections*3+dir] = bvecs(1,dir+1); + bvecs_host[ndirections*3+ndirections+dir] = bvecs(2,dir+1); + bvecs_host[ndirections*3+ndirections*2+dir] = bvecs(3,dir+1); + bvals_host[ndirections+dir] = bvals(1,dir+1); - alpha_host[vox*ndirections+dir] = alpha(dir+1); - beta_host[vox*ndirections+dir] = beta(dir+1); - } + alpha_host[ndirections+dir] = alpha(dir+1); + beta_host[ndirections+dir] = beta(dir+1); } } @@ -437,29 +445,58 @@ void prepare_data_gpu_FIT_repeat( //INPUT datam_repeat_vec.resize(nrepeat); datam_repeat_host.resize(nrepeat*ndirections); - bvecs_repeat_vec.resize(nrepeat); - bvals_repeat_vec.resize(nrepeat); - bvecs_repeat_host.resize(nrepeat*3*ndirections); - bvals_repeat_host.resize(nrepeat*ndirections); + + if (opts.grad_file.set()){ + bvecs_repeat_vec.resize(nrepeat); + bvals_repeat_vec.resize(nrepeat); + bvecs_repeat_host.resize(nrepeat*3*ndirections); + bvals_repeat_host.resize(nrepeat*ndirections); + }else{ + bvecs_repeat_vec.resize(1); + bvals_repeat_vec.resize(1); + bvecs_repeat_host.resize(1*3*ndirections); + bvals_repeat_host.resize(1*ndirections); + } + for(int vox=0;vox<nrepeat;vox++){ for(int dir=0;dir<ndirections;dir++){ datam(dir+1)= datam_host[vox_repeat[vox]*ndirections+dir]; datam_repeat_host[vox*ndirections+dir]=datam_host[vox_repeat[vox]*ndirections+dir]; + } + datam_repeat_vec[vox]=datam; + } - bvecs_repeat_host[vox*ndirections*3+dir] = bvecs_host[vox_repeat[vox]*ndirections*3+dir]; - bvecs_repeat_host[vox*ndirections*3+ndirections+dir] = bvecs_host[vox_repeat[vox]*ndirections*3+ndirections+dir]; - bvecs_repeat_host[vox*ndirections*3+ndirections*2+dir] = bvecs_host[vox_repeat[vox]*ndirections*3+ndirections*2+dir]; - bvals_repeat_host[vox*ndirections+dir] = bvals_host[vox_repeat[vox]*ndirections+dir]; + if (opts.grad_file.set()){ + for(int vox=0;vox<nrepeat;vox++){ + for(int dir=0;dir<ndirections;dir++){ + bvecs_repeat_host[vox*ndirections*3+dir] = bvecs_host[vox_repeat[vox]*ndirections*3+dir]; + bvecs_repeat_host[vox*ndirections*3+ndirections+dir] = bvecs_host[vox_repeat[vox]*ndirections*3+ndirections+dir]; + bvecs_repeat_host[vox*ndirections*3+ndirections*2+dir] = bvecs_host[vox_repeat[vox]*ndirections*3+ndirections*2+dir]; + bvals_repeat_host[vox*ndirections+dir] = bvals_host[vox_repeat[vox]*ndirections+dir]; + + bvecs(1,dir+1)= bvecs_host[vox_repeat[vox]*ndirections*3+dir]; + bvecs(2,dir+1)= bvecs_host[vox_repeat[vox]*ndirections*3+ndirections+dir]; + bvecs(3,dir+1)= bvecs_host[vox_repeat[vox]*ndirections*3+ndirections*2+dir]; + bvals(1,dir+1)= bvals_host[vox_repeat[vox]*ndirections+dir]; + } + bvecs_repeat_vec[vox]=bvecs; + bvals_repeat_vec[vox]=bvals; + } + }else{ + for(int dir=0;dir<ndirections;dir++){ + bvecs_repeat_host[ndirections*3+dir] = bvecs_host[ndirections*3+dir]; + bvecs_repeat_host[ndirections*3+ndirections+dir] = bvecs_host[ndirections*3+ndirections+dir]; + bvecs_repeat_host[ndirections*3+ndirections*2+dir] = bvecs_host[ndirections*3+ndirections*2+dir]; + bvals_repeat_host[ndirections+dir] = bvals_host[ndirections+dir]; - bvecs(1,dir+1)= bvecs_host[vox_repeat[vox]*ndirections*3+dir]; - bvecs(2,dir+1)= bvecs_host[vox_repeat[vox]*ndirections*3+ndirections+dir]; - bvecs(3,dir+1)= bvecs_host[vox_repeat[vox]*ndirections*3+ndirections*2+dir]; - bvals(1,dir+1)= bvals_host[vox_repeat[vox]*ndirections+dir]; + bvecs(1,dir+1)= bvecs_host[ndirections*3+dir]; + bvecs(2,dir+1)= bvecs_host[ndirections*3+ndirections+dir]; + bvecs(3,dir+1)= bvecs_host[ndirections*3+ndirections*2+dir]; + bvals(1,dir+1)= bvals_host[ndirections+dir]; } - datam_repeat_vec[vox]=datam; - bvecs_repeat_vec[vox]=bvecs; - bvals_repeat_vec[vox]=bvals; + bvecs_repeat_vec[0]=bvecs; + bvals_repeat_vec[0]=bvals; } int nfib= opts.nfibres.value(); -- GitLab