diff --git a/CUDA/diffmodels.cu b/CUDA/diffmodels.cu index e4a027ef932e4d0335c1e33157e82bdaaf1bcd55..7ae2b61e3b7d631af924b5d3fc3c0caefd2dcf7a 100644 --- a/CUDA/diffmodels.cu +++ b/CUDA/diffmodels.cu @@ -46,6 +46,9 @@ void fit_PVM_single( //INPUT nparams = nfib*3 + 3; else nparams = nfib*3 + 2; + + thrust::host_vector<double> params_host; + params_host.resize(nvox*nparams); for(int vox=0;vox<nvox;vox++){ // initialise with a tensor @@ -56,32 +59,33 @@ void fit_PVM_single( //INPUT float _th,_ph; cart2sph(dti.get_v1(),_th,_ph); - params_gpu[vox*nparams] = dti.get_s0(); + params_host[vox*nparams] = dti.get_s0(); //start(2) = dti.get_md()>0?dti.get_md()*2:0.001; // empirically found that d~2*MD - params_gpu[vox*nparams+1] = dti.get_l1()>0?dti.get_l1():0.002; // empirically found that d~L1 - params_gpu[vox*nparams+2] = dti.get_fa()<1?f2x(dti.get_fa()):f2x(0.95); // first pvf = FA - params_gpu[vox*nparams+3] = _th; - params_gpu[vox*nparams+4] = _ph; - float sumf=x2f(params_gpu[vox*nparams+2]); + params_host[vox*nparams+1] = dti.get_l1()>0?dti.get_l1():0.002; // empirically found that d~L1 + params_host[vox*nparams+2] = dti.get_fa()<1?f2x(dti.get_fa()):f2x(0.95); // first pvf = FA + params_host[vox*nparams+3] = _th; + params_host[vox*nparams+4] = _ph; + float sumf=x2f(params_host[vox*nparams+2]); float tmpsumf=sumf; for(int ii=2,i=5;ii<=nfib;ii++,i+=3){ float denom=2; do{ - params_gpu[vox*nparams+i] = f2x(x2f(params_gpu[vox*nparams+i-3])/denom); + params_host[vox*nparams+i] = f2x(x2f(params_host[vox*nparams+i-3])/denom); denom *= 2; - tmpsumf = sumf + x2f(params_gpu[vox*nparams+i]); + tmpsumf = sumf + x2f(params_host[vox*nparams+i]); }while(tmpsumf>=1); - sumf += x2f(params_gpu[vox*nparams+i]); + sumf += x2f(params_host[vox*nparams+i]); cart2sph(dti.get_v(ii),_th,_ph); - params_gpu[vox*nparams+i+1] = _th; - params_gpu[vox*nparams+i+2] = _ph; + params_host[vox*nparams+i+1] = _th; + params_host[vox*nparams+i+2] = _ph; } if (m_include_f0) - params_gpu[vox*nparams+nparams-1]=f2x(FSMALL); + params_host[vox*nparams+nparams-1]=f2x(FSMALL); } - int blocks = nvox/THREADS_X_BLOCK_FIT; - if (nvox % THREADS_X_BLOCK_FIT) blocks++; + thrust::copy(params_host.begin(), params_host.end(), params_gpu.begin()); + + int blocks = nvox; dim3 Dim_Grid(blocks,1); dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); @@ -109,6 +113,9 @@ void fit_PVM_single_c( //INPUT else nparams = nfib*3 + 2; + thrust::host_vector<double> params_host; + params_host.resize(nvox*nparams); + for(int vox=0;vox<nvox;vox++){ // initialise with a tensor DTI dti(datam_vec[vox],bvecs_vec[vox],bvals_vec[vox]); @@ -136,12 +143,13 @@ void fit_PVM_single_c( //INPUT pvm.fit_pvf(start); for(int i=0;i<nparams;i++){ - params_gpu[vox*nparams+i]=start(i+1); + params_host[vox*nparams+i]=start(i+1); } } - int blocks = nvox/THREADS_X_BLOCK_FIT; - if (nvox % THREADS_X_BLOCK_FIT) blocks++; + thrust::copy(params_host.begin(), params_host.end(), params_gpu.begin()); + + int blocks = nvox; dim3 Dim_Grid(blocks,1); dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); @@ -161,8 +169,7 @@ void fit_PVM_multi( //INPUT xfibresOptions& opts = xfibresOptions::getInstance(); int nfib = opts.nfibres.value(); - int blocks = nvox/THREADS_X_BLOCK_FIT; - if (nvox % THREADS_X_BLOCK_FIT) blocks++; + int blocks = nvox; dim3 Dim_Grid(blocks,1); dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); @@ -183,10 +190,10 @@ void fit_PVM_multi( //INPUT void calculate_tau( //INPUT thrust::device_vector<double> datam_gpu, - thrust::device_vector<double>& params_gpu, + thrust::device_vector<double> params_gpu, thrust::device_vector<double> bvecs_gpu, thrust::device_vector<double> bvals_gpu, - thrust::host_vector<int>& vox_repeat, + thrust::host_vector<int> vox_repeat, int nrepeat, string output_file, //OUTPUT @@ -216,8 +223,7 @@ void calculate_tau( //INPUT } } - int blocks = nvox/THREADS_X_BLOCK_FIT; - if (nvox % THREADS_X_BLOCK_FIT) blocks++; + int blocks = nvox; dim3 Dim_Grid(blocks,1); dim3 Dim_Block(THREADS_X_BLOCK_FIT,1); @@ -239,11 +245,15 @@ void calculate_tau( //INPUT sync_check("get_residuals_PVM_multi_kernel"); } + thrust::host_vector<double> residuals_host; + residuals_host.resize(nvox*NDIRECTIONS); + thrust::copy(residuals_gpu.begin(), residuals_gpu.end(), residuals_host.begin()); + ColumnVector res(NDIRECTIONS); for(int vox=0;vox<nvox;vox++){ - for(int i=0;i<NDIRECTIONS;i++) res(i+1)= residuals_gpu[vox*NDIRECTIONS+i]; + for(int i=0;i<NDIRECTIONS;i++) res(i+1)= residuals_host[vox*NDIRECTIONS+i]; - float variance=var(res).AsScalar(); + float variance=var(res).AsScalar(); tau[vox]=1.0/variance; } diff --git a/CUDA/diffmodels.cuh b/CUDA/diffmodels.cuh index a9d136171aba86462d3184aab408ff77d5a61c73..d97bfe9da39c0f2f442e71b4efc82d2115881870 100644 --- a/CUDA/diffmodels.cuh +++ b/CUDA/diffmodels.cuh @@ -41,23 +41,31 @@ void fit_PVM_multi( //INPUT void calculate_tau( //INPUT thrust::device_vector<double> datam_gpu, - thrust::device_vector<double>& params_gpu, + thrust::device_vector<double> params_gpu, thrust::device_vector<double> bvecs_gpu, thrust::device_vector<double> bvals_gpu, - thrust::host_vector<int>& vox_repeat, + thrust::host_vector<int> vox_repeat, int nrepeat, string output_file, //OUTPUT thrust::host_vector<float>& tau); -__device__ double cf_PVM_single( //INPUT +__device__ void cf_PVM_single( //INPUT const double* params, const double* data, const double* bvecs, const double* bvals, const int nparams, - const bool m_include_f0); + const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* x, + double &_d, + double &sumf, + //OUTPUT + double &cfv); __device__ void grad_PVM_single( //INPUT const double* params, @@ -66,7 +74,13 @@ __device__ void grad_PVM_single( //INPUT const double* bvals, const int nparams, const bool m_include_f0, - // OUTPUT + const int idB, + double* shared, + double* fs, + double* x, + double &_d, + double &sumf, + //OUTPUT double* grad); __device__ void hess_PVM_single( //INPUT @@ -75,16 +89,31 @@ __device__ void hess_PVM_single( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* x, + double &_d, + double &sumf, //OUTPUT double* hess); -__device__ double cf_PVM_single_c( //INPUT +__device__ void cf_PVM_single_c( //INPUT const double* params, const double* data, const double* bvecs, const double* bvals, const int nparams, - const bool m_include_f0); + const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* x, + double &_d, + double &sumf, + //OUTPUT + double &cfv); + __device__ void grad_PVM_single_c( //INPUT const double* params, @@ -93,6 +122,13 @@ __device__ void grad_PVM_single_c( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* f_deriv, + double* x, + double &_d, + double &sumf, //OUTPUT double* grad); @@ -102,16 +138,32 @@ __device__ void hess_PVM_single_c( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* f_deriv, + double* x, + double &_d, + double &sumf, //OUTPUT double* hess); -__device__ double cf_PVM_multi( //INPUT +__device__ void cf_PVM_multi( //INPUT const double* params, const double* data, const double* bvecs, const double* bvals, const int nparams, - const bool m_include_f0); + const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* x, + double &_a, + double &_b, + double &sumf, + //OUTPUT + double &cfv); __device__ void grad_PVM_multi( //INPUT const double* params, @@ -120,6 +172,13 @@ __device__ void grad_PVM_multi( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* x, + double &_a, + double &_b, + double &sumf, //OUTPUT double* grad); @@ -129,6 +188,12 @@ __device__ void hess_PVM_multi( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, + double* fs, + double* x, + double &_a, + double &_b, + double &sumf, //OUTPUT double* hess); -