From d4a9f9f7c42a1903f063d9abbb21a3eda2bdbdfc Mon Sep 17 00:00:00 2001 From: Moises Fernandez <moisesf@fmrib.ox.ac.uk> Date: Thu, 28 Feb 2013 15:02:34 +0000 Subject: [PATCH] new version Levenberg: several threads per voxel --- CUDA/PVM_multi.cu | 678 +++++++++++++++----------- CUDA/PVM_single.cu | 633 +++++++++++++----------- CUDA/PVM_single_c.cu | 927 +++++++++++++++++++----------------- CUDA/levenberg_marquardt.cu | 386 ++++++++------- 4 files changed, 1474 insertions(+), 1150 deletions(-) diff --git a/CUDA/PVM_multi.cu b/CUDA/PVM_multi.cu index 839b19d..31dd264 100644 --- a/CUDA/PVM_multi.cu +++ b/CUDA/PVM_multi.cu @@ -17,42 +17,48 @@ ///////////////////////////////////// __device__ inline double isoterm_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ - return exp(double(-_a*log(double(1+bvals[pt]*_b)))); + return exp(-_a*log(1+bvals[pt]*_b)); } __device__ inline double isoterm_a_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ - return -log(double(1+bvals[pt]*_b))*exp(double(-_a*log(double(1+bvals[pt]*_b)))); + return -log(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b)); } __device__ inline double isoterm_b_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ - return -_a*bvals[pt]/(1+bvals[pt]*_b)*exp(double(-_a*log(double(1+bvals[pt]*_b)))); + return -_a*bvals[pt]/(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b)); } __device__ inline double anisoterm_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return exp(double(-_a*log(double(1+bvals[pt]*_b*(dp*dp))))); + return exp(-_a*log(1+bvals[pt]*_b*(dp*dp))); } __device__ inline double anisoterm_a_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return -log(double(1+bvals[pt]*(dp*dp)*_b))* exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b)))); + return -log(1+bvals[pt]*(dp*dp)*_b)* exp(-_a*log(1+bvals[pt]*(dp*dp)*_b)); } __device__ inline double anisoterm_b_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return (-_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))); + return (-_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))); } __device__ inline double anisoterm_th_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){ + double sinth,costh,sinph,cosph; + sincos(_th,&sinth,&costh); + sincos(_ph,&sinph,&cosph); double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = cos(double(_th))* (bvecs[pt]*cos(double(_ph)) + bvecs[NDIRECTIONS+pt]*sin(double(_ph))) - bvecs[(2*NDIRECTIONS)+pt]*sin(double(_th)); - return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))*2*dp*dp1); + double dp1 = costh* (bvecs[pt]*cosph + bvecs[NDIRECTIONS+pt]*sinph) - bvecs[(2*NDIRECTIONS)+pt]*sinth; + return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1); } __device__ inline double anisoterm_ph_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){ + double sinth,sinph,cosph; + sinth=sin(_th); + sincos(_ph,&sinph,&cosph); double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = sin(double(_th))* (-bvecs[pt]*sin(double(_ph)) + bvecs[NDIRECTIONS+pt]*cos(double(_ph))); - return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))*2*dp*dp1); + double dp1 = sinth* (-bvecs[pt]*sinph + bvecs[NDIRECTIONS+pt]*cosph); + return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1); } //in diffmodel.cc @@ -79,147 +85,99 @@ __device__ void fix_fsum_PVM_multi( //INPUT } //in diffmodel.cc -__device__ void sort_PVM_multi(int nfib,int nparams,double* params) +__device__ void sort_PVM_multi(int nfib,double* params) { double temp_f, temp_th, temp_ph; // Order vector descending using f parameters as index for(int i=1; i<(nfib); i++){ for(int j=0; j<(nfib-i); j++){ - if (params[3+j*3] < params[3+i*3]){ + if (params[3+j*3] < params[3+(j+1)*3]){ temp_f = params[3+j*3]; temp_th = params[3+j*3+1]; temp_ph = params[3+j*3+2]; - params[3+j*3] = params[3+i*3]; - params[3+j*3+1] = params[3+i*3+1]; - params[3+j*3+2] = params[3+i*3+2]; - params[3+i*3] = temp_f; - params[3+i*3+1] = temp_th; - params[3+i*3+2] = temp_ph; + params[3+j*3] = params[3+(j+1)*3]; + params[3+j*3+1] = params[3+(j+1)*3+1]; + params[3+j*3+2] = params[3+(j+1)*3+2]; + params[3+(j+1)*3] = temp_f; + params[3+(j+1)*3+1] = temp_th; + params[3+(j+1)*3+2] = temp_ph; } } } } -//in diffmodels.cc -- for calculate residuals -__device__ void forwardModel_PVM_multi( //INPUT - const double* p, - const double* bvecs, - const double* bvals, - const int nfib, - const int nparams, - const bool m_include_f0, - //OUTPUT - double* predicted_signal) -{ - for(int i=0;i<NDIRECTIONS;i++){ - predicted_signal[i]=0; //pred = 0; - } - double val; - double _a = abs(p[1]); - double _b = abs(p[2]); - //////////////////////////////////// - double fs[NFIBRES]; - double x[NFIBRES*3]; - double sumf=0; - double3 x2; - for(int k=0;k<nfib;k++){ - int kk = 3+3*k; - fs[k] = x2f_gpu(p[kk]); - sumf += fs[k]; - x[k*3] = sin(p[kk+1])*cos(p[kk+2]); - x[k*3+1] = sin(p[kk+1])*sin(p[kk+2]); - x[k*3+2] = cos(p[kk+1]); - } - //////////////////////////////////// - for(int i=0;i<NDIRECTIONS;i++){ - val = 0.0; - for(int k=0;k<nfib;k++){ - x2.x=x[k*3]; - x2.y=x[k*3+1]; - x2.z=x[k*3+2]; - val += fs[k]*anisoterm_PVM_multi(i,_a,_b,x2,bvecs,bvals); - } - if (m_include_f0){ - double temp_f0=x2f_gpu(p[nparams-1]); - predicted_signal[i] = abs(p[0])*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+val); - } - else - predicted_signal[i] = abs(p[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+val); - } -} - -//in diffmodels.cc -- for calculate residuals -__device__ void get_prediction_PVM_multi( //INPUT - const double* params, - const double* bvecs, - const double* bvals, - const int nfib, - const int nparams, - const bool m_include_f0, - //OUTPUT - double* predicted_signal) -{ - //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] - double p[NPARAMS]; - p[0] = params[0]; - p[1] = params[1]*params[1]/params[2]/params[2]; //m_d*m_d/m_d_std/m_d_std; - p[2] = params[2]*params[2]/params[1]; //m_d_std*m_d_std/m_d; // =1/beta - for(int k=0;k<nfib;k++){ - int kk = 3+3*k; - p[kk] = f2x_gpu(params[kk]); - p[kk+1] = params[kk+1]; - p[kk+2] = params[kk+2]; - } - if (m_include_f0) - p[nparams-1]=f2x_gpu(params[nparams-1]); - forwardModel_PVM_multi(p,bvecs,bvals,nfib,nparams,m_include_f0,predicted_signal); -} - //cost function PVM_multi -__device__ double cf_PVM_multi( //INPUT +__device__ void cf_PVM_multi( //INPUT const double* params, const double* mdata, const double* bvecs, const double* bvals, const int nparams, - const bool m_include_f0) + const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_a, //shared memory + double &_b, //shared memory + double &sumf, //shared memory + //OUTPUT + double &cfv) { - double cfv = 0.0; - double err; - double _a= abs(params[1]); - double _b= abs(params[2]); - double fs[NFIBRES]; - double x[NFIBRES*3]; - double sumf=0; + if(idB<NFIBRES){ + int kk = 3+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + fs[idB] = x2f_gpu(params[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; + } + if(idB==0){ + _a= abs(params[1]); + _b= abs(params[2]); + cfv = 0.0; + sumf=0; + for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + } + + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + + double err; double3 x2; + int dir_iter=idB; - for(int k=0;k<NFIBRES;k++){ - int kk = 3+3*(k); - fs[k] = x2f_gpu(params[kk]); - sumf += fs[k]; - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); - } - for(int i=0;i<NDIRECTIONS;i++){ + __syncthreads(); + + shared[idB]=0; + for(int dir=0;dir<ndir;dir++){ err = 0.0; for(int k=0;k<NFIBRES;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - err += fs[k]*anisoterm_PVM_multi(i,_a,_b,x2,bvecs,bvals); + err += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,bvecs,bvals); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); - err = (abs(params[0])*(temp_f0+((1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+err)))-mdata[i]; + err = (abs(params[0])*(temp_f0+((1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)))-mdata[dir_iter]; }else{ - err = abs(params[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+err)-mdata[i]; + err = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)-mdata[dir_iter]; } - cfv += err*err; + shared[idB]+= err*err; + dir_iter+=THREADS_X_BLOCK_FIT; } - return(cfv); -} + __syncthreads(); + + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + cfv+=shared[i]; + } + } +} //gradient function PVM_multi __device__ void grad_PVM_multi( //INPUT @@ -229,66 +187,94 @@ __device__ void grad_PVM_multi( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_a, //shared memory + double &_b, //shared memory + double &sumf, //shared memory //OUTPUT double* grad) -{ - double _a= abs(params[1]); - double _b= abs(params[2]); - double fs[NFIBRES]; - double x[NFIBRES*3]; - double3 xx; - double sumf=0; - - for(int k=0;k<NFIBRES;k++){ - int kk = 3+3*(k); - fs[k] = x2f_gpu(params[kk]); - sumf += fs[k]; - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); +{ + if(idB<NFIBRES){ + int kk = 3+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + fs[idB] = x2f_gpu(params[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; } + if(idB==0){ + _a= abs(params[1]); + _b= abs(params[2]); + sumf=0; + for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + for (int p=0;p<nparams;p++) grad[p]=0; + } - double J[NPARAMS]; - double diff; - double sig; - - for (int p=0;p<nparams;p++) grad[p]=0; + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; - for(int i=0;i<NDIRECTIONS;i++){ - sig = 0; - for(int a=0;a<nparams;a++) J[a]=0; - for(int k=0;k<NFIBRES;k++){ - int kk = 3+3*(k); - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(i,_a,_b,xx,bvecs,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(i,_a,_b,xx,bvecs,bvals); - J[kk] = abs(params[0])*(anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); - } - if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig=abs(params[0])*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(i,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(i,_a,_b,bvals); - }else{ - sig = abs(params[0]) * ((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(i,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(i,_a,_b,bvals); - } + double J[NPARAMS]; + double diff; + double sig; + double3 xx; + int dir_iter=idB; + + __syncthreads(); + + for(int dir=0;dir<max_dir;dir++){ + for (int p=0; p<nparams; p++) J[p]=0; + if(dir<ndir){ + sig = 0; + for(int k=0;k<NFIBRES;k++){ + int kk = 3+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); + J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); + } + if(m_include_f0){ + double temp_f0=x2f_gpu(params[nparams-1]); + J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + sig=abs(params[0])*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals))+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + }else{ + sig = abs(params[0]) * ((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + } - diff = sig - mdata[i]; - J[0] = (params[0]>0?1.0:-1.0)*sig/params[0]; + diff = sig - mdata[dir_iter]; + J[0] = (params[0]>0?1.0:-1.0)*sig/params[0]; + } - for (int p=0;p<nparams;p++) grad[p] += 2*J[p]*diff; + for (int p=0;p<nparams;p++){ + shared[idB]=2*J[p]*diff; + + __syncthreads(); + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + grad[p] += shared[i]; + } + } + __syncthreads(); + } + dir_iter+=THREADS_X_BLOCK_FIT; } } - //hessian function PVM_multi __device__ void hess_PVM_multi( //INPUT const double* params, @@ -296,75 +282,105 @@ __device__ void hess_PVM_multi( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_a, //shared memory + double &_b, //shared memory + double &sumf, //shared memory + //OUTPUT double* hess) { - double _a= abs(params[1]); - double _b= abs(params[2]); - double fs[NFIBRES]; - double x[NFIBRES*3]; - double3 xx; - double sumf=0; - - for(int k=0;k<NFIBRES;k++){ - int kk = 3+3*(k); - fs[k] = x2f_gpu(params[kk]); - sumf += fs[k]; - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); + if(idB<NFIBRES){ + int kk = 3+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + fs[idB] = x2f_gpu(params[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; } + if(idB==0){ + _a= abs(params[1]); + _b= abs(params[2]); + sumf=0; + for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + for (int p=0;p<nparams;p++){ + for (int p2=0;p2<nparams;p2++){ + hess[p*nparams+p2] = 0; + } + } + } + + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; - double J[NPARAMS]; + double J[NPARAMS]; double sig; + double3 xx; + int dir_iter=idB; - for (int p=0;p<nparams;p++){ - for (int p2=0;p2<nparams;p2++){ - hess[p*nparams+p2] = 0; - } - } - - for(int i=0;i<NDIRECTIONS;i++){ - sig = 0; - for(int a=0;a<nparams;a++) J[a]=0; - for(int k=0;k<NFIBRES;k++){ - int kk = 3+3*(k); - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals); - double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(i,_a,_b,xx,bvecs,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(i,_a,_b,xx,bvecs,bvals); - J[kk] = abs(params[0])*(anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(i,_a,_b,bvals))*cov; - J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); - } - if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig = abs(params[0])* (temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(i,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(i,_a,_b,bvals); - }else{ - sig = abs(params[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(i,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(i,_a,_b,bvals); - } + __syncthreads(); - J[0] = sig/params[0]; + for(int dir=0;dir<max_dir;dir++){ + for (int p=0; p<nparams; p++) J[p]=0; + if(dir<ndir){ + sig = 0; + for(int k=0;k<NFIBRES;k++){ + int kk = 3+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); + double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); + J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*cov; + J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); + } + if(m_include_f0){ + double temp_f0=x2f_gpu(params[nparams-1]); + J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + sig = abs(params[0])* (temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + }else{ + sig = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + } + + J[0] = sig/params[0]; + } for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - hess[p*nparams+p2] += 2*(J[p]*J[p2]); + + shared[idB]=2*(J[p]*J[p2]); + __syncthreads(); + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + hess[p*nparams+p2] += shared[i]; + } + } + __syncthreads(); } } + dir_iter+=THREADS_X_BLOCK_FIT; } - for (int j=0; j<nparams; j++) { - for (int i=j+1; i<nparams; i++) { - hess[i*nparams+j]=hess[j*nparams+i]; - } - } + if(idB==0){ + for (int j=0; j<nparams; j++) { + for (int i=j+1; i<nparams; i++) { + hess[i*nparams+j]=hess[j*nparams+i]; + } + } + } } //in diffmodel.cc @@ -379,63 +395,80 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT //OUTPUT double* params) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id >=nvox) { return; } - - int nparams; - if (m_include_f0) - nparams = nfib*3 + 4; - else - nparams = nfib*3 + 3; - - int nparams_single_c = nparams-1; - - double myparams[NPARAMS]; - double myparams_aux[NPARAMS]; - double mydata[NDIRECTIONS]; - - for(int i=0;i<NDIRECTIONS;i++){ - mydata[i]=data[(id*NDIRECTIONS)+i]; - } - - for(int i=0;i<nparams_single_c;i++){ - myparams_aux[i]=params_PVM_single_c[(id*nparams_single_c)+i]; - } - - myparams[0] = myparams_aux[0]; //pvm1.get_s0(); - myparams[1] = 1.0; //start with d=d_std - for(int i=0,ii=3;i<nfib;i++,ii+=3){ - myparams[ii] = f2x_gpu(myparams_aux[ii-1]); - myparams[ii+1] = myparams_aux[ii]; - myparams[ii+2] = myparams_aux[ii+1]; - } - myparams[2] = myparams_aux[1]; //pvm1.get_d(); - if (m_include_f0) - myparams[nparams-1]=f2x_gpu(myparams_aux[nparams_single_c-1]); + int idB = threadIdx.x; + int idVOX = blockIdx.x; + + __shared__ double myparams[NPARAMS]; + __shared__ int nparams; + + __shared__ double step[NPARAMS]; + __shared__ double grad[NPARAMS]; + __shared__ double hess[NPARAMS*NPARAMS]; + __shared__ double inverse[NPARAMS]; + __shared__ double pcf; + __shared__ double ncf; + __shared__ double lambda; + __shared__ double cftol; + __shared__ double ltol; + __shared__ double olambda; + __shared__ bool success; + __shared__ bool end; + + __shared__ double shared[THREADS_X_BLOCK_FIT]; + __shared__ double fs[NFIBRES]; + __shared__ double x[NFIBRES*3]; + __shared__ double _a; + __shared__ double _b; + __shared__ double sumf; + + if(idB==0){ + if (m_include_f0) + nparams = nfib*3 + 4; + else + nparams = nfib*3 + 3; + + int nparams_single_c = nparams-1; + + myparams[0] = params_PVM_single_c[(idVOX*nparams_single_c)+0]; //pvm1.get_s0(); + myparams[1] = 1.0; //start with d=d_std + for(int i=0,ii=3;i<nfib;i++,ii+=3){ + myparams[ii] = f2x_gpu(params_PVM_single_c[(idVOX*nparams_single_c)+ii-1]); + myparams[ii+1] = params_PVM_single_c[(idVOX*nparams_single_c)+ii]; + myparams[ii+2] = params_PVM_single_c[(idVOX*nparams_single_c)+ii+1]; + } + myparams[2] = params_PVM_single_c[(idVOX*nparams_single_c)+1] ; //pvm1.get_d(); + if (m_include_f0) + myparams[nparams-1]=f2x_gpu(params_PVM_single_c[(idVOX*nparams_single_c)+nparams_single_c-1]); + } + + __syncthreads(); //do the fit - levenberg_marquardt_PVM_multi_gpu(mydata, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams, m_include_f0, myparams); + levenberg_marquardt_PVM_multi_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,x,_a,_b,sumf,myparams); + + __syncthreads(); // finalise parameters //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] - myparams_aux[1] = myparams[1]; + if(idB==0){ + double aux = myparams[1]; - myparams[1] = abs(myparams_aux[1]*myparams[2]); - myparams[2] = sqrt(double(abs(myparams_aux[1]*myparams[2]*myparams[2]))); - for(int i=3,k=0;k<nfib;i+=3,k++){ - myparams[i] = x2f_gpu(myparams[i]); - } - if (m_include_f0) - myparams[nparams-1]=x2f_gpu(myparams[nparams-1]); + myparams[1] = abs(aux*myparams[2]); + myparams[2] = sqrt(double(abs(aux*myparams[2]*myparams[2]))); + for(int i=3,k=0;k<nfib;i+=3,k++){ + myparams[i] = x2f_gpu(myparams[i]); + } + if (m_include_f0) + myparams[nparams-1]=x2f_gpu(myparams[nparams-1]); - sort_PVM_multi(nfib,nparams,myparams); - fix_fsum_PVM_multi(m_include_f0,nfib,nparams,myparams); + sort_PVM_multi(nfib,myparams); + fix_fsum_PVM_multi(m_include_f0,nfib,nparams,myparams); - for(int i=0;i<nparams;i++){ - params[(id*nparams)+i] = myparams[i]; - //printf("PARAM[%i]: %.20f\n",i,myparams[i]); - } + for(int i=0;i<nparams;i++){ + params[(idVOX*nparams)+i] = myparams[i]; + } + } } //in diffmodel.cc @@ -451,34 +484,99 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT //OUTPUT double* residuals) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id >=nvox) { return; } + int idB = threadIdx.x; + int idVOX = blockIdx.x; + + __shared__ double myparams[NPARAMS]; + __shared__ int nparams; + __shared__ bool my_include_f0; + __shared__ double val; + __shared__ double _a; + __shared__ double _b; + __shared__ double fs[NFIBRES]; + __shared__ double x[NFIBRES*3]; + __shared__ double sumf; + + double predicted_signal; + double mydata; + + if(idB==0){ + if (m_include_f0) + nparams = nfib*3 + 4; + else + nparams = nfib*3 + 3; - int nparams; - if (m_include_f0) - nparams = nfib*3 + 4; - else - nparams = nfib*3 + 3; + my_include_f0 = includes_f0[idVOX]; + + //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] + + myparams[0] = params[(idVOX*nparams)+0]; + double aux1 = params[(idVOX*nparams)+1]; + double aux2 = params[(idVOX*nparams)+2]; + //m_d*m_d/m_d_std/m_d_std; + myparams[1] = aux1*aux1/aux2/aux2; + myparams[2] = aux2*aux2/aux1; //m_d_std*m_d_std/m_d; // =1/beta + + if (my_include_f0) + myparams[nparams-1]=f2x_gpu(params[(idVOX*nparams)+nparams-1]); + } - bool my_include_f0 = includes_f0[id]; + if(idB<nfib){ + int kk = 3+3*idB; + double sinth,costh,sinph,cosph; + + myparams[kk] = f2x_gpu(params[(idVOX*nparams)+kk]); + myparams[kk+1] = params[(idVOX*nparams)+kk+1]; + myparams[kk+2] = params[(idVOX*nparams)+kk+2]; + + sincos(myparams[kk+1],&sinth,&costh); + sincos(myparams[kk+2],&sinph,&cosph); + fs[idB] = x2f_gpu(myparams[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; + } - double myparams[NPARAMS]; - double mydata[NDIRECTIONS]; + __syncthreads(); - for(int i=0;i<nparams;i++){ - myparams[i]=params[(id*nparams)+i]; - } + if(idB==0){ + _a = abs(myparams[1]); + _b = abs(myparams[2]); + sumf=0; + for(int k=0;k<nfib;k++){ + sumf += fs[k]; + } + + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + + double3 x2; + int dir_iter=idB; - for(int i=0;i<NDIRECTIONS;i++){ - mydata[i]=data[(id*NDIRECTIONS)+i]; - } + __syncthreads(); - double predicted_signal[NDIRECTIONS]; + for(int dir=0;dir<ndir;dir++){ + mydata = data[(idVOX*NDIRECTIONS)+dir_iter]; + predicted_signal=0; //pred = 0; + val = 0.0; + for(int k=0;k<nfib;k++){ + x2.x=x[k*3]; + x2.y=x[k*3+1]; + x2.z=x[k*3+2]; + val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]); + } + if (my_include_f0){ + double temp_f0=x2f_gpu(myparams[nparams-1]); + predicted_signal = abs(myparams[0])*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val); + } + else + predicted_signal = abs(myparams[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val); + } - get_prediction_PVM_multi(myparams, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nfib, nparams, my_include_f0, predicted_signal); + //residuals=m_data-predicted_signal; + residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal; - for(int i=0;i<NDIRECTIONS;i++){ //residuals=m_data-predicted_signal; - residuals[id*NDIRECTIONS+i]= mydata[i] - predicted_signal[i]; + dir_iter+=THREADS_X_BLOCK_FIT; } } diff --git a/CUDA/PVM_single.cu b/CUDA/PVM_single.cu index 0c1d513..8f65048 100644 --- a/CUDA/PVM_single.cu +++ b/CUDA/PVM_single.cu @@ -20,42 +20,46 @@ __device__ inline double isoterm_PVM_single(const int pt,const double _d,const double *bvals){ - return exp(double(-bvals[pt]*_d)); + return exp(-bvals[pt]*_d); } __device__ inline double isoterm_d_PVM_single(const int pt,const double _d,const double *bvals){ - return (-bvals[pt]*exp(double(-bvals[pt]*_d))); + return (-bvals[pt]*exp(-bvals[pt]*_d)); } __device__ inline double anisoterm_PVM_single(const int pt,const double _d,const double3 x, const double *bvecs, const double *bvals){ double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return exp(double(-bvals[pt]*_d*dp*dp)); + return exp(-bvals[pt]*_d*dp*dp); } __device__ inline double anisoterm_d_PVM_single(const int pt,const double _d,const double3 x,const double *bvecs, const double *bvals){ double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return(-bvals[pt]*dp*dp*exp(double(-bvals[pt]*_d*dp*dp))); + return(-bvals[pt]*dp*dp*exp(-bvals[pt]*_d*dp*dp)); } __device__ inline double anisoterm_th_PVM_single(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ - + double sinth,costh,sinph,cosph; + sincos(_th,&sinth,&costh); + sincos(_ph,&sinph,&cosph); double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = (cos(double(_th))*(bvecs[pt]*cos(double(_ph))+bvecs[NDIRECTIONS+pt]*sin(double(_ph)))-bvecs[(2*NDIRECTIONS)+pt]*sin(double(_th))); - return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp))); + double dp1 = (costh*(bvecs[pt]*cosph+bvecs[NDIRECTIONS+pt]*sinph)-bvecs[(2*NDIRECTIONS)+pt]*sinth); + return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); } __device__ inline double anisoterm_ph_PVM_single(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ + double sinth,sinph,cosph; + sinth=sin(_th); + sincos(_ph,&sinph,&cosph); double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = sin(double(_th))*(-bvecs[pt]*sin(double(_ph))+bvecs[NDIRECTIONS+pt]*cos(double(_ph))); - return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp))); + double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[NDIRECTIONS+pt]*cosph); + return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); } - //in diffmodel.cc __device__ void fix_fsum_PVM_single( //INPUT bool m_include_f0, @@ -77,150 +81,99 @@ __device__ void fix_fsum_PVM_single( //INPUT } } - - //in diffmodel.cc -__device__ void sort_PVM_single(int nfib,int nparams,double* params) +__device__ void sort_PVM_single(int nfib,double* params) { double temp_f, temp_th, temp_ph; // Order vector descending using f parameters as index for(int i=1; i<(nfib); i++){ for(int j=0; j<(nfib-i); j++){ - if (params[2+j*3] < params[2+i*3]){ + if (params[2+j*3] < params[2+(j+1)*3]){ temp_f = params[2+j*3]; temp_th = params[2+j*3+1]; temp_ph = params[2+j*3+2]; - params[2+j*3] = params[2+i*3]; - params[2+j*3+1] = params[2+i*3+1]; - params[2+j*3+2] = params[2+i*3+2]; - params[2+i*3] = temp_f; - params[2+i*3+1] = temp_th; - params[2+i*3+2] = temp_ph; + params[2+j*3] = params[2+(j+1)*3]; + params[2+j*3+1] = params[2+(j+1)*3+1]; + params[2+j*3+2] = params[2+(j+1)*3+2]; + params[2+(j+1)*3] = temp_f; + params[2+(j+1)*3+1] = temp_th; + params[2+(j+1)*3+2] = temp_ph; } } } } - -//in diffmodels.cc -- for calculate residuals -__device__ void forwardModel_PVM_single( //INPUT - const double* p, - const double* bvecs, - const double* bvals, - const int nfib, - const int nparams, - const bool m_include_f0, - //OUTPUT - double* predicted_signal) -{ - for(int i=0;i<NDIRECTIONS;i++){ - predicted_signal[i]=0; //pred = 0; - } - double val; - double _d = abs(p[1]); - //////////////////////////////////// - double fs[NFIBRES]; - double x[NFIBRES*3]; - double sumf=0; - double3 x2; - for(int k=0;k<nfib;k++){ - int kk = 2+3*k; - fs[k] = x2f_gpu(p[kk]); - sumf += fs[k]; - x[k*3] = sin(p[kk+1])*cos(p[kk+2]); - x[k*3+1] = sin(p[kk+1])*sin(p[kk+2]); - x[k*3+2] = cos(p[kk+1]); - } - //////////////////////////////////// - for(int i=0;i<NDIRECTIONS;i++){ - val = 0.0; - for(int k=0;k<nfib;k++){ - x2.x=x[k*3]; - x2.y=x[k*3+1]; - x2.z=x[k*3+2]; - val += fs[k]*anisoterm_PVM_single(i,_d,x2,bvecs,bvals); - } - if (m_include_f0){ - double temp_f0=x2f_gpu(p[nparams-1]); - predicted_signal[i] = p[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals)+val); - } - else - predicted_signal[i] = p[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+val); - } -} - - -//in diffmodels.cc -- for calculate residuals -__device__ void get_prediction_PVM_single( //INPUT - const double* params, - const double* bvecs, - const double* bvals, - const int nfib, - const int nparams, - const bool m_include_f0, - //OUTPUT - double* predicted_signal) +//cost function PVM_single +__device__ void cf_PVM_single( //INPUT + const double* params, + const double* mdata, + const double* bvecs, + const double* bvals, + const int nparams, + const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory + //OUTPUT + double &cfv) { - //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] - double p[NPARAMS]; - p[0] = params[0]; - p[1] = params[1]; - for(int k=0;k<nfib;k++){ - int kk = 2+3*k; - p[kk] = f2x_gpu(params[kk]); - p[kk+1] = params[kk+1]; - p[kk+2] = params[kk+2]; + if(idB<NFIBRES){ + int kk = 2+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + fs[idB] = x2f_gpu(params[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; } - if (m_include_f0) - p[nparams-1]=f2x_gpu(params[nparams-1]); - forwardModel_PVM_single(p,bvecs,bvals,nfib,nparams,m_include_f0,predicted_signal); -} + __syncthreads(); -//cost function PVM_single -__device__ double cf_PVM_single( //INPUT - const double* params, - const double* mdata, - const double* bvecs, - const double* bvals, - const int nparams, - const bool m_include_f0) -{ - double cfv = 0.0; - double err; - double _d = abs(params[1]); - double fs[NFIBRES]; - double x[NFIBRES*3]; - double sumf=0; + if(idB==0){ + _d = abs(params[1]); + cfv = 0.0; + sumf=0; + for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + } + + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + + double err; double3 x2; + int dir_iter=idB; - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - fs[k] = x2f_gpu(params[kk]); - sumf += fs[k]; - - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); - } - - for(int i=0;i<NDIRECTIONS;i++){ + __syncthreads(); + + shared[idB]=0; + for(int dir=0;dir<ndir;dir++){ err = 0.0; for(int k=0;k<NFIBRES;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - err += fs[k]*anisoterm_PVM_single(i,_d,x2,bvecs,bvals); + err += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,bvecs,bvals); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); - err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals))+err))-mdata[i]; + err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+err))-mdata[dir_iter]; }else{ - err = (params[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+err))-mdata[i]; + err = (params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+err))-mdata[dir_iter]; } - cfv += err*err; + shared[idB]+= err*err; + dir_iter+=THREADS_X_BLOCK_FIT; } - return(cfv); + __syncthreads(); + + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + cfv+=shared[i]; + } + } } //gradient function PVM_single @@ -231,58 +184,89 @@ __device__ void grad_PVM_single( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory //OUTPUT double* grad) { - double _d = abs(params[1]); - double fs[NFIBRES]; - double x[NFIBRES*3]; - double3 xx; - double sumf=0; - - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - fs[k] = x2f_gpu(params[kk]); - sumf += fs[k]; - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); + if(idB<NFIBRES){ + int kk = 2+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + fs[idB] = x2f_gpu(params[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; } - - double J[NPARAMS]; - double diff; - double sig; - for (int p=0;p<nparams;p++) grad[p]=0; + __syncthreads(); - for(int i=0;i<NDIRECTIONS;i++){ - sig = 0; - for(int a=0;a<nparams;a++) J[a]=0; - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_single(i,_d,xx,bvecs,bvals); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(i,_d,xx,bvecs,bvals); - J[kk] = params[0]*(anisoterm_PVM_single(i,_d,xx,bvecs,bvals)-isoterm_PVM_single(i,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - } + if(idB==0){ + sumf=0; + for(int i=0;i<NFIBRES;i++) sumf+=fs[i]; + _d = abs(params[1]); + for (int p=0;p<nparams;p++) grad[p]=0; + } - if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= params[0]*(1-isoterm_PVM_single(i,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig= params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(i,_d,bvals); - }else{ - sig = params[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(i,_d,bvals); - } - diff = sig - mdata[i]; - J[0] = sig/params[0]; + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + + double J[NPARAMS]; + double diff; + double sig; + double3 xx; + int dir_iter=idB; + + __syncthreads(); + + for(int dir=0;dir<max_dir;dir++){ + for (int p=0; p<nparams; p++) J[p]=0; + if(dir<ndir){ + sig = 0; + for(int k=0;k<NFIBRES;k++){ + int kk = 2+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals); + J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + } + + if(m_include_f0){ + double temp_f0=x2f_gpu(params[nparams-1]); + J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + sig= params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); + }else{ + sig = params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); + } + diff = sig - mdata[dir_iter]; + J[0] = sig/params[0]; + } + + for (int p=0;p<nparams;p++){ + shared[idB]=2*J[p]*diff; - for (int p=0;p<nparams;p++) grad[p] += 2*J[p]*diff; + __syncthreads(); + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + grad[p] += shared[i]; + } + } + __syncthreads(); + } + dir_iter+=THREADS_X_BLOCK_FIT; } } @@ -293,70 +277,102 @@ __device__ void hess_PVM_single( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory + //OUTPUT double* hess) { - double _d = abs(params[1]); - double fs[NFIBRES]; - double x[NFIBRES*3]; - double3 xx; - double sumf=0; - - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - fs[k] = x2f_gpu(params[kk]); - sumf += fs[k]; - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); + if(idB<NFIBRES){ + int kk = 2+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + fs[idB] = x2f_gpu(params[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; } - - double J[NPARAMS]; - double sig; - for (int p=0;p<nparams;p++){ - for (int p2=0;p2<nparams;p2++){ - hess[p*nparams+p2] = 0; + __syncthreads(); + + if(idB==0){ + sumf=0; + for(int i=0;i<NFIBRES;i++) sumf+=fs[i]; + _d = abs(params[1]); + for (int p=0;p<nparams;p++){ + for (int p2=0;p2<nparams;p2++){ + hess[p*nparams+p2] = 0; + } } } - for(int i=0;i<NDIRECTIONS;i++){ - sig = 0; - for(int a=0;a<nparams;a++) J[a]=0; - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_single(i,_d,xx,bvecs,bvals); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(i,_d,xx,bvecs,bvals); - J[kk] = params[0]*(anisoterm_PVM_single(i,_d,xx,bvecs,bvals)-isoterm_PVM_single(i,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - } + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; - if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= params[0]*(1-isoterm_PVM_single(i,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig=params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(i,_d,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(i,_d,bvals); - }else{ - sig = params[0]*((1-sumf)*isoterm_PVM_single(i,_d,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(i,_d,bvals); - } - J[0] = sig/params[0]; + double J[NPARAMS]; + double sig; + double3 xx; + int dir_iter=idB; + + __syncthreads(); + + for(int dir=0;dir<max_dir;dir++){ + for (int p=0; p<nparams; p++) J[p]=0; + if(dir<ndir){ + sig = 0; + for(int k=0;k<NFIBRES;k++){ + int kk = 2+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals); + J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + } + + if(m_include_f0){ + double temp_f0=x2f_gpu(params[nparams-1]); + J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + sig=params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); + }else{ + sig = params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); + } + J[0] = sig/params[0]; + } for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - hess[p*nparams+p2] += 2*(J[p]*J[p2]); + + shared[idB]=2*(J[p]*J[p2]); + __syncthreads(); + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + hess[p*nparams+p2] += shared[i]; + } + } + __syncthreads(); } } + dir_iter+=THREADS_X_BLOCK_FIT; } - for (int j=0; j<nparams; j++) { - for (int i=j+1; i<nparams; i++) { - hess[i*nparams+j]=hess[j*nparams+i]; - } - } + if(idB==0){ + for (int j=0; j<nparams; j++) { + for (int i=j+1; i<nparams; i++) { + hess[i*nparams+j]=hess[j*nparams+i]; + } + } + } } //in diffmodel.cc @@ -370,46 +386,67 @@ extern "C" __global__ void fit_PVM_single_kernel( //INPUT //INPUT-OUTPUT double* params) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id >=nvox) { return; } - - int nparams; - if (m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; - - double myparams[NPARAMS]; - double mydata[NDIRECTIONS]; + int idB = threadIdx.x; + int idVOX = blockIdx.x; + + __shared__ double myparams[NPARAMS]; + __shared__ int nparams; + + __shared__ double step[NPARAMS]; + __shared__ double grad[NPARAMS]; + __shared__ double hess[NPARAMS*NPARAMS]; + __shared__ double inverse[NPARAMS]; + __shared__ double pcf; + __shared__ double ncf; + __shared__ double lambda; + __shared__ double cftol; + __shared__ double ltol; + __shared__ double olambda; + __shared__ bool success; + __shared__ bool end; + + __shared__ double shared[THREADS_X_BLOCK_FIT]; + __shared__ double fs[NFIBRES]; + __shared__ double x[NFIBRES*3]; + __shared__ double _d; + __shared__ double sumf; + + if(idB==0){ + if(m_include_f0) + nparams = nfib*3 + 3; + else + nparams = nfib*3 + 2; + + for(int i=0;i<nparams;i++){ + myparams[i]=params[(idVOX*nparams)+i]; + } + } - for(int i=0;i<nparams;i++){ - myparams[i]=params[(id*nparams)+i]; - } - - for(int i=0;i<NDIRECTIONS;i++){ - mydata[i]=data[(id*NDIRECTIONS)+i]; - } + __syncthreads(); // do the fit - levenberg_marquardt_PVM_single_gpu(mydata, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams, m_include_f0, myparams); + levenberg_marquardt_PVM_single_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,x,_d,sumf,myparams); + + __syncthreads(); // finalise parameters //m_s0 in myparams[0] m_d in myparams[1] m_f-m_th-m_ph in myparams[2,3,4,5, etc..] m_f0 in myparams[nparams-1] - - myparams[1] = abs(myparams[1]); - for(int k=1;k<=nfib;k++){ - int kk = 2 + 3*(k-1); - myparams[kk] = x2f_gpu(myparams[kk]); - } - if (m_include_f0) - myparams[nparams-1]=x2f_gpu(myparams[nparams-1]); - sort_PVM_single(nfib,nparams,myparams); - fix_fsum_PVM_single(m_include_f0,nfib,nparams,myparams); + if(idB==0){ + myparams[1] = abs(myparams[1]); + for(int k=1;k<=nfib;k++){ + int kk = 2 + 3*(k-1); + myparams[kk] = x2f_gpu(myparams[kk]); + } + if(m_include_f0) + myparams[nparams-1]=x2f_gpu(myparams[nparams-1]); - for(int i=0;i<nparams;i++){ - params[id*nparams+i]=myparams[i]; - //printf("PARAM[%i]: %.20f\n",i,myparams[i]); + sort_PVM_single(nfib,myparams); + fix_fsum_PVM_single(m_include_f0,nfib,nparams,myparams); + + for(int i=0;i<nparams;i++){ + params[idVOX*nparams+i]=myparams[i]; + } } } @@ -426,34 +463,92 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT //OUTPUT double* residuals) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id >=nvox) { return; } + int idB = threadIdx.x; + int idVOX = blockIdx.x; + + __shared__ double myparams[NPARAMS]; + __shared__ int nparams; + __shared__ bool my_include_f0; + __shared__ double val; + __shared__ double _d; + __shared__ double fs[NFIBRES]; + __shared__ double x[NFIBRES*3]; + __shared__ double sumf; + + double predicted_signal; + double mydata; + + if(idB==0){ + if (m_include_f0) + nparams = nfib*3 + 3; + else + nparams = nfib*3 + 2; - int nparams; - if (m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; + my_include_f0 = includes_f0[idVOX]; - bool my_include_f0 = includes_f0[id]; + //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] - double myparams[NPARAMS]; - double mydata[NDIRECTIONS]; + myparams[0]=params[(idVOX*nparams)+0]; + myparams[1]=params[(idVOX*nparams)+1]; - for(int i=0;i<nparams;i++){ - myparams[i]=params[(id*nparams)+i]; - } - - for(int i=0;i<NDIRECTIONS;i++){ - mydata[i]=data[(id*NDIRECTIONS)+i]; - } + if (my_include_f0) + myparams[nparams-1]=f2x_gpu(params[(idVOX*nparams)+nparams-1]); + } + + if(idB<nfib){ + int kk = 2+3*idB; + double sinth,costh,sinph,cosph; - double predicted_signal[NDIRECTIONS]; + myparams[kk] = f2x_gpu(params[(idVOX*nparams)+kk]); + myparams[kk+1] = params[(idVOX*nparams)+kk+1]; + myparams[kk+2] = params[(idVOX*nparams)+kk+2]; - get_prediction_PVM_single(myparams, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nfib, nparams, my_include_f0, predicted_signal); + sincos(myparams[kk+1],&sinth,&costh); + sincos(myparams[kk+2],&sinph,&cosph); - for(int i=0;i<NDIRECTIONS;i++){ //residuals=m_data-predicted_signal; - residuals[id*NDIRECTIONS+i]= mydata[i] - predicted_signal[i]; + fs[idB] = x2f_gpu(myparams[kk]); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; + } + + __syncthreads(); + + if(idB==0){ + sumf=0; + for(int i=0;i<NFIBRES;i++) sumf+=fs[i]; + _d = abs(myparams[1]); } + + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + + double3 x2; + int dir_iter=idB; + + __syncthreads(); + + for(int dir=0;dir<ndir;dir++){ + mydata = data[(idVOX*NDIRECTIONS)+dir_iter]; + predicted_signal=0; //pred = 0; + val = 0.0; + for(int k=0;k<nfib;k++){ + x2.x=x[k*3]; + x2.y=x[k*3+1]; + x2.z=x[k*3+2]; + val += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]); + } + if (my_include_f0){ + double temp_f0=x2f_gpu(myparams[nparams-1]); + predicted_signal = myparams[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); + } + else + predicted_signal = myparams[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); + + //residuals=m_data-predicted_signal; + residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal; + + dir_iter+=THREADS_X_BLOCK_FIT; + } } diff --git a/CUDA/PVM_single_c.cu b/CUDA/PVM_single_c.cu index 38e5db9..09c399f 100644 --- a/CUDA/PVM_single_c.cu +++ b/CUDA/PVM_single_c.cu @@ -20,43 +20,46 @@ __device__ inline double isoterm_PVM_single_c(const int pt,const double _d,const double *bvals){ - return exp(double(-bvals[pt]*_d)); + return exp(-bvals[pt]*_d); } __device__ inline double isoterm_lambda_PVM_single_c(const int pt,const double lambda,const double *bvals){ - return(-2*bvals[pt]*lambda*exp(double(-bvals[pt]*lambda*lambda))); + return(-2*bvals[pt]*lambda*exp(-bvals[pt]*lambda*lambda)); } __device__ inline double anisoterm_PVM_single_c(const int pt,const double _d,const double3 x, const double *bvecs, const double *bvals){ double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return exp(double(-bvals[pt]*_d*dp*dp)); + return exp(-bvals[pt]*_d*dp*dp); } __device__ inline double anisoterm_lambda_PVM_single_c(const int pt,const double lambda,const double3 x, const double *bvecs, const double *bvals){ double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return(-2*bvals[pt]*lambda*dp*dp*exp(double(-bvals[pt]*lambda*lambda*dp*dp))); + return(-2*bvals[pt]*lambda*dp*dp*exp(-bvals[pt]*lambda*lambda*dp*dp)); } __device__ inline double anisoterm_th_PVM_single_c(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ - + double sinth,costh,sinph,cosph; + sincos(_th,&sinth,&costh); + sincos(_ph,&sinph,&cosph); double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = cos(double(_th))*(bvecs[pt]*cos(double(_ph))+bvecs[NDIRECTIONS+pt]*sin(double(_ph)))-bvecs[(2*NDIRECTIONS)+pt]*sin(double(_th)); - return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp))); + double dp1 = costh*(bvecs[pt]*cosph+bvecs[NDIRECTIONS+pt]*sinph)-bvecs[(2*NDIRECTIONS)+pt]*sinth; + return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); } __device__ inline double anisoterm_ph_PVM_single_c(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ + double sinth,sinph,cosph; + sinth=sin(_th); + sincos(_ph,&sinph,&cosph); double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = sin(double(_th))*(-bvecs[pt]*sin(double(_ph))+bvecs[NDIRECTIONS+pt]*cos(double(_ph))); - return(-2*bvals[pt]*_d*dp*dp1*exp(double(-bvals[pt]*_d*dp*dp))); + double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[NDIRECTIONS+pt]*cosph); + return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); } - - //If the sum of the fractions is >1, then zero as many fractions //as necessary, so that the sum becomes smaller than 1. //in diffmodel.cc @@ -87,205 +90,148 @@ __device__ double partial_fsum_PVM_single_c(double* fs, int ii){ return fsum; } - - //in diffmodel.cc -__device__ void sort_PVM_single_c(int nfib,int nparams,double* params) +__device__ void sort_PVM_single_c(int nfib,double* params) { double temp_f, temp_th, temp_ph; // Order vector descending using f parameters as index for(int i=1; i<(nfib); i++){ for(int j=0; j<(nfib-i); j++){ - if (params[2+j*3] < params[2+i*3]){ + if (params[2+j*3] < params[2+(j+1)*3]){ temp_f = params[2+j*3]; temp_th = params[2+j*3+1]; temp_ph = params[2+j*3+2]; - params[2+j*3] = params[2+i*3]; - params[2+j*3+1] = params[2+i*3+1]; - params[2+j*3+2] = params[2+i*3+2]; - params[2+i*3] = temp_f; - params[2+i*3+1] = temp_th; - params[2+i*3+2] = temp_ph; + params[2+j*3] = params[2+(j+1)*3]; + params[2+j*3+1] = params[2+(j+1)*3+1]; + params[2+j*3+2] = params[2+(j+1)*3+2]; + params[2+(j+1)*3] = temp_f; + params[2+(j+1)*3+1] = temp_th; + params[2+(j+1)*3+2] = temp_ph; } } } - - //if (m_return_fanning){ - // fantmp=m_fanning_angles; - // Hess_vec_tmp=m_invprHes_e1; - // Hess=m_Hessian; - //} - - //if (m_return_fanning){ - //m_fanning_angles(i)=fantmp(fvals[ii].second); - //m_invprHes_e1[i-1]=Hess_vec_tmp[fvals[ii].second-1]; - //m_Hessian[i-1]=Hess[fvals[ii].second-1]; - //} -} - -//in diffmodels.cc -- for calculate residuals -__device__ void forwardModel_PVM_single_c( //INPUT - const double* p, - const double* bvecs, - const double* bvals, - const int nfib, - const int nparams, - const bool m_include_f0, - //OUTPUT - double* predicted_signal) -{ - for(int i=0;i<NDIRECTIONS;i++){ - predicted_signal[i]=0; //pred = 0; - } - double val; - double _d = lambda2d_gpu(p[1]); - //////////////////////////////////// - double fs[NFIBRES]; - double x[NFIBRES*3]; - double sumf=0; - double3 x2; - double partial_fsum; - for(int k=0;k<nfib;k++){ - int kk = 2+3*k; - ////// partial_fsum ////// - partial_fsum=1.0; - for(int j=0;j<k;j++) - partial_fsum-=fs[j]; - ////////////////////////// - fs[k] = beta2f_gpu(p[kk])*partial_fsum; - sumf += fs[k]; - x[k*3] = sin(p[kk+1])*cos(p[kk+2]); - x[k*3+1] = sin(p[kk+1])*sin(p[kk+2]); - x[k*3+2] = cos(p[kk+1]); - } - //////////////////////////////////// - for(int i=0;i<NDIRECTIONS;i++){ - val = 0.0; - for(int k=0;k<nfib;k++){ - x2.x=x[k*3]; - x2.y=x[k*3+1]; - x2.z=x[k*3+2]; - val += fs[k]*anisoterm_PVM_single_c(i,_d,x2,bvecs,bvals); - } - if (m_include_f0){ - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<NFIBRES;j++) - partial_fsum-=fs[j]; - ////////////////////////// - double temp_f0= beta2f_gpu(p[nparams-1])*partial_fsum; - predicted_signal[i] = p[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(i,_d,bvals)+val); - } - else - predicted_signal[i] = p[0]*((1-sumf)*isoterm_PVM_single_c(i,_d,bvals)+val); - } } -//in diffmodels.cc -- for calculate residuals -__device__ void get_prediction_PVM_single_c( //INPUT +__device__ void fractions_deriv_PVM_single_c( //INPUT const double* params, - const double* bvecs, - const double* bvals, - const int nfib, - const int nparams, - const bool m_include_f0, + const double* fs, + const int nfib, + const int idB, //OUTPUT - double* predicted_signal) + double* Deriv) { - //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] - double p[NPARAMS]; - p[0] = params[0]; - if(params[1]<0) p[1] = 0; //This can be due to numerical errors..sqrt - else p[1] = d2lambda_gpu(params[1]); - double partial_fsum; - double fs[NFIBRES]; - for(int k=0;k<nfib;k++){ - int kk = 2+3*k; - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<k;j++) - partial_fsum-=fs[j]; - ////////////////////////// - fs[k] = params[kk]; - double tmpr=params[kk]/partial_fsum; - if (tmpr>1.0) tmpr=1; //This can be due to numerical errors - if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt - p[kk] = f2beta_gpu(tmpr); - p[kk+1] = params[kk+1]; - p[kk+2] = params[kk+2]; - } - if (m_include_f0){ - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<NFIBRES;j++) - partial_fsum-=fs[j]; - ////////////////////////// - double tmpr=params[nparams-1]/partial_fsum; - if (tmpr>1.0) tmpr=1; //This can be due to numerical errors..asin - if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt - p[nparams-1]= f2beta_gpu(tmpr); - } - forwardModel_PVM_single_c(p,bvecs,bvals,nfib,nparams,m_include_f0,predicted_signal); + int nparams_per_fibre=3; + double fsum; + int k=idB%nfib; + for (int j=0; j<nfib; j++){ + Deriv[j*nfib+k]=0; + } + + int kk = 2+(k*nparams_per_fibre); + double sinparamkk = sin(2*params[kk]); + + for (int j=0; j<nfib; j++){ + int jj = 2+(j*nparams_per_fibre); + if (j==k){ + fsum=1; + for (int n=0; n<=(j-1); n++){ + fsum-=fs[n]; + } + Deriv[j*nfib+k]=sinparamkk*fsum; + }else if (j>k){ + double sinparam = sin(params[jj]); + fsum=0; + for (int n=0; n<=(j-1); n++){ + fsum+=Deriv[n*nfib+k]; + } + Deriv[j*nfib+k]= -(sinparam*sinparam)*fsum; + } + } } //cost function PVM_single_c -__device__ double cf_PVM_single_c( //INPUT +__device__ void cf_PVM_single_c( //INPUT const double* params, const double* mdata, const double* bvecs, const double* bvals, const int nparams, - const bool m_include_f0) + const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory + //OUTPUT + double &cfv) { - double cfv = 0.0; - double err; - double _d = lambda2d_gpu(params[1]); - double fs[NFIBRES]; - double x[NFIBRES*3]; - double sumf=0; - double3 x2; - double partial_fsum; - - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<k;j++) - partial_fsum-=fs[j]; - ////////////////////////// - fs[k] = beta2f_gpu(params[kk])*partial_fsum; - sumf += fs[k]; - - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); + if(idB<NFIBRES){ + int kk = 2+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; } + if(idB==0){ + _d = lambda2d_gpu(params[1]); + cfv = 0.0; + sumf=0; + double partial_fsum; + for(int k=0;k<NFIBRES;k++){ + int kk = 2+3*(k); + //partial_fsum /////////// + partial_fsum=1.0; + for(int j=0;j<k;j++) + partial_fsum-=fs[j]; + ////////////////////////// + fs[k] = beta2f_gpu(params[kk])*partial_fsum; + sumf += fs[k]; + } + } + + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; - for(int i=0;i<NDIRECTIONS;i++){ - err = 0.0; + double err; + double3 x2; + int dir_iter=idB; + + __syncthreads(); + + shared[idB]=0; + for(int dir=0;dir<ndir;dir++){ + err = 0.0; for(int k=0;k<NFIBRES;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - err += fs[k]*anisoterm_PVM_single_c(i,_d,x2,bvecs,bvals); + err += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,bvecs,bvals); } if(m_include_f0){ //partial_fsum /////////// - partial_fsum=1.0; + double partial_fsum=1.0; for(int j=0;j<NFIBRES;j++) partial_fsum-=fs[j]; ////////////////////////// - double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; - err=(params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(i,_d,bvals))+err))-mdata[i]; + double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; + err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,bvals))+err))-mdata[dir_iter]; }else{ - - err = params[0]*((1-sumf)*isoterm_PVM_single_c(i,_d,bvals)+err)-mdata[i]; - + err = params[0]*((1-sumf)*isoterm_PVM_single_c(dir_iter,_d,bvals)+err)-mdata[dir_iter]; } - cfv += err*err; + shared[idB]+= err*err; + dir_iter+=THREADS_X_BLOCK_FIT; } - return(cfv); + __syncthreads(); + + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + cfv+=shared[i]; + } + } } @@ -297,115 +243,122 @@ __device__ void grad_PVM_single_c( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* f_deriv, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory //OUTPUT double* grad) { - double _d = lambda2d_gpu(params[1]); - double fs[NFIBRES]; - double bs[NFIBRES]; - double x[NFIBRES*3]; - double3 xx; - double sumf=0; - double partial_fsum; - - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - bs[k]=params[kk]; - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<k;j++){ - partial_fsum-=fs[j]; + if(idB<NFIBRES){ + int kk = 2+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; + } + if(idB==0){ + _d = lambda2d_gpu(params[1]); + sumf=0; + double partial_fsum; + for(int k=0;k<NFIBRES;k++){ + int kk = 2+3*(k); + //partial_fsum /////////// + partial_fsum=1.0; + for(int j=0;j<k;j++) + partial_fsum-=fs[j]; + ////////////////////////// + fs[k] = beta2f_gpu(params[kk])*partial_fsum; + sumf += fs[k]; } - ////////////////////////// + for (int p=0;p<nparams;p++) grad[p]=0; + } - fs[k] = beta2f_gpu(params[kk])*partial_fsum; - sumf += fs[k]; - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); - } + __syncthreads(); - ////////// fraction deriv ////////////// - // f_deriv=fractions_deriv(nfib, fs, bs); - ////////////////////////////////////// - double f_deriv[NFIBRES*NFIBRES]; - double fsum; - for (int j=0; j<NFIBRES; j++){ - for (int k=0; k<NFIBRES; k++){ - f_deriv[j*NFIBRES+k]=0; - } - } + if(idB<NFIBRES){ + fractions_deriv_PVM_single_c(params,fs,NFIBRES,idB,f_deriv); + } - for (int j=0; j<NFIBRES; j++){ - for (int k=0; k<NFIBRES; k++){ - if (j==k){ - fsum=1; - for (int n=0; n<=(j-1); n++) - fsum-=fs[n]; - f_deriv[j*NFIBRES+k] = sin(double(2*bs[k]))*fsum; - }else if (j>k){ - fsum=0; - for (int n=0; n<=(j-1); n++) - fsum += f_deriv[n*NFIBRES+k]; - f_deriv[j*NFIBRES+k] += -sin(bs[j])*sin(bs[j])*fsum; - - } - } - } - /////////////////////////////// - double J[NPARAMS]; - double diff; - double sig,Iso_term; + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + + double J[NPARAMS]; + double diff; + double sig; + double Iso_term; + double3 xx; + int dir_iter=idB; double Aniso_terms[NFIBRES]; - for (int p=0;p<nparams;p++) grad[p]=0; - - for(int i=0;i<NDIRECTIONS;i++){ - Iso_term=isoterm_PVM_single_c(i,_d,bvals); //Precompute some terms for this datapoint - for(int k=0;k<NFIBRES;k++){ - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - Aniso_terms[k]=anisoterm_PVM_single_c(i,_d,xx,bvecs,bvals); - } - sig = 0; - for(int a=0;a<nparams;a++) J[a]=0; - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - sig += fs[k]*Aniso_terms[k]; - J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(i,params[1],xx,bvecs,bvals); - J[kk] = 0; - for (int j=0;j<NFIBRES;j++){ - if(f_deriv[j*NFIBRES+k]!=0){ - J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; - } - } - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - } - if(m_include_f0){ - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<(NFIBRES);j++) - partial_fsum-=fs[j]; - ////////////////////////// - double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; + __syncthreads(); + + for(int dir=0;dir<max_dir;dir++){ + for (int p=0; p<nparams; p++) J[p]=0; + if(dir<ndir){ + Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals); //Precompute some terms for this datapoint + for(int k=0;k<NFIBRES;k++){ + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals); + } + sig = 0; + for(int k=0;k<NFIBRES;k++){ + int kk = 2+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + sig += fs[k]*Aniso_terms[k]; + J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals); + J[kk] = 0; + for (int j=0;j<NFIBRES;j++){ + if(f_deriv[j*NFIBRES+k]!=0){ + J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; + } + } + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + } + if(m_include_f0){ + //partial_fsum /////////// + double partial_fsum=1.0; + for(int j=0;j<(NFIBRES);j++) + partial_fsum-=fs[j]; + ////////////////////////// + double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; + + //derivative with respect to f0 + J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; + sig=params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig); + J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + }else{ + sig = params[0]*((1-sumf)*Iso_term+sig); + J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + } + diff = sig - mdata[dir_iter]; + J[0] = sig/params[0]; + } - //derivative with respect to f0 - J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; - sig=params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig); - J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(i,params[1],bvals); - }else{ - sig = params[0]*((1-sumf)*Iso_term+sig); - J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(i,params[1],bvals); - } - diff = sig - mdata[i]; - J[0] = sig/params[0]; + for (int p=0;p<nparams;p++){ + shared[idB]=2*J[p]*diff; - for (int p=0;p<nparams;p++) grad[p] += 2*J[p]*diff; + __syncthreads(); + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + grad[p] += shared[i]; + } + } + __syncthreads(); + } + dir_iter+=THREADS_X_BLOCK_FIT; } } @@ -417,126 +370,134 @@ __device__ void hess_PVM_single_c( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* shared, //shared memory + double* fs, //shared memory + double* f_deriv, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory //OUTPUT double* hess) { - double _d = lambda2d_gpu(params[1]); - double fs[NFIBRES]; - double bs[NFIBRES]; - double x[NFIBRES*3]; - double3 xx; - double sumf=0; - double partial_fsum; - - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - bs[k]=params[kk]; - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<k;j++) - partial_fsum-=fs[j]; - ////////////////////////// - fs[k] = beta2f_gpu(params[kk])*partial_fsum; - sumf += fs[k]; - x[k*3] = sin(params[kk+1])*cos(params[kk+2]); - x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]); - x[k*3+2] = cos(params[kk+1]); + if(idB<NFIBRES){ + int kk = 2+3*(idB); + double sinth,costh,sinph,cosph; + sincos(params[kk+1],&sinth,&costh); + sincos(params[kk+2],&sinph,&cosph); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; } + if(idB==0){ + _d = lambda2d_gpu(params[1]); + sumf=0; + double partial_fsum; + for(int k=0;k<NFIBRES;k++){ + int kk = 2+3*(k); + //partial_fsum /////////// + partial_fsum=1.0; + for(int j=0;j<k;j++) + partial_fsum-=fs[j]; + ////////////////////////// + fs[k] = beta2f_gpu(params[kk])*partial_fsum; + sumf += fs[k]; + } + for (int p=0;p<nparams;p++){ + for (int p2=0;p2<nparams;p2++){ + hess[p*nparams+p2] = 0; + } + } + } - ////////// fraction deriv ////////////// - // f_deriv=fractions_deriv(nfib, fs, bs); - ////////////////////////////////////// - double f_deriv[NFIBRES*NFIBRES]; - double fsum; - for (int j=0; j<NFIBRES; j++){ - for (int k=0; k<NFIBRES; k++){ - f_deriv[j*NFIBRES+k]=0; - } - } + __syncthreads(); - for (int j=0; j<NFIBRES; j++){ - for (int k=0; k<NFIBRES; k++){ - if (j==k){ - fsum=1; - for (int n=0; n<=(j-1); n++) - fsum-=fs[n]; - f_deriv[j*NFIBRES+k] = sin(double(2*bs[k]))*fsum; - } - else if (j>k){ - fsum=0; - for (int n=0; n<=(j-1); n++) - fsum += f_deriv[n*NFIBRES+k]; - f_deriv[j*NFIBRES+k] += -sin(bs[j])*sin(bs[j])*fsum; - } - } - } - /////////////////////////////// - double J[NPARAMS]; - double sig,Iso_term; + if(idB<NFIBRES){ + fractions_deriv_PVM_single_c(params,fs,NFIBRES,idB,f_deriv); + } + + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + + double J[NPARAMS]; + double sig; + double Iso_term; + double3 xx; + int dir_iter=idB; double Aniso_terms[NFIBRES]; - for (int p=0;p<nparams;p++){ - for (int p2=0;p2<nparams;p2++){ - hess[p*nparams+p2] = 0; + __syncthreads(); + + for(int dir=0;dir<max_dir;dir++){ + for (int p=0; p<nparams; p++) J[p]=0; + if(dir<ndir){ + Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals); //Precompute some terms for this datapoint + for(int k=0;k<NFIBRES;k++){ + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals); + } + sig = 0; + for(int k=0;k<NFIBRES;k++){ + int kk = 2+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + sig += fs[k]*Aniso_terms[k]; + J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals); + J[kk] = 0; + for (int j=0; j<NFIBRES; j++){ + if (f_deriv[j*NFIBRES+k]!=0) + J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; + } + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + } + if(m_include_f0){ + //partial_fsum /////////// + double partial_fsum=1.0; + for(int j=0;j<(NFIBRES);j++) + partial_fsum-=fs[j]; + ////////////////////////// + double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; + //derivative with respect to f0 + J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; + sig= params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig); + J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + }else{ + sig = params[0]*((1-sumf)*Iso_term+sig); + J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + } + J[0] = sig/params[0]; } - } - - for(int i=0;i<NDIRECTIONS;i++){ - Iso_term=isoterm_PVM_single_c(i,_d,bvals); //Precompute some terms for this datapoint - for(int k=0;k<NFIBRES;k++){ - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - Aniso_terms[k]=anisoterm_PVM_single_c(i,_d,xx,bvecs,bvals); - } - sig = 0; - for(int a=0;a<nparams;a++) J[a]=0; - for(int k=0;k<NFIBRES;k++){ - int kk = 2+3*(k); - xx.x=x[k*3]; - xx.y=x[k*3+1]; - xx.z=x[k*3+2]; - sig += fs[k]*Aniso_terms[k]; - J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(i,params[1],xx,bvecs,bvals); - J[kk] = 0; - for (int j=0; j<NFIBRES; j++){ - if (f_deriv[j*NFIBRES+k]!=0) - J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; - } - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(i,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - } - if(m_include_f0){ - //partial_fsum /////////// - partial_fsum=1.0; - for(int j=0;j<(NFIBRES);j++) - partial_fsum-=fs[j]; - ////////////////////////// - double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; - //derivative with respect to f0 - J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; - sig= params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig); - J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(i,params[1],bvals); - }else{ - sig = params[0]*((1-sumf)*Iso_term+sig); - J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(i,params[1],bvals); - } - J[0] = sig/params[0]; for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - hess[p*nparams+p2] += 2*(J[p]*J[p2]); + + shared[idB]=2*(J[p]*J[p2]); + __syncthreads(); + if(idB==0){ + for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ + hess[p*nparams+p2] += shared[i]; + } + } + __syncthreads(); } } + dir_iter+=THREADS_X_BLOCK_FIT; } - for (int j=0; j<nparams; j++) { - for (int i=j+1; i<nparams; i++) { - hess[i*nparams+j]=hess[j*nparams+i]; - } - } + if(idB==0){ + for (int j=0; j<nparams; j++) { + for (int i=j+1; i<nparams; i++) { + hess[i*nparams+j]=hess[j*nparams+i]; + } + } + } } - //in diffmodel.cc extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT const double* data, @@ -550,59 +511,72 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT //INPUT - OUTPUT double* params) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id >=nvox) { return; } - - int nparams; - if (m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; - - double myparams[NPARAMS]; - double mydata[NDIRECTIONS]; + int idB = threadIdx.x; + int idVOX = blockIdx.x; + + __shared__ double myparams[NPARAMS]; + __shared__ int nparams; + + __shared__ double shared[THREADS_X_BLOCK_FIT]; + __shared__ double step[NPARAMS]; + __shared__ double grad[NPARAMS]; + __shared__ double hess[NPARAMS*NPARAMS]; + __shared__ double inverse[NPARAMS]; + __shared__ double pcf; + __shared__ double ncf; + __shared__ double lambda; + __shared__ double cftol; + __shared__ double ltol; + __shared__ double olambda; + __shared__ bool success; + __shared__ bool end; + + __shared__ double fs[NFIBRES]; + __shared__ double f_deriv[NFIBRES*NFIBRES]; + __shared__ double x[NFIBRES*3]; + __shared__ double _d; + __shared__ double sumf; + + if(idB==0){ + if (m_include_f0) + nparams = nfib*3 + 3; + else + nparams = nfib*3 + 2; - for(int i=0;i<nparams;i++){ - myparams[i]=params[(id*nparams)+i]; - } + for(int i=0;i<nparams;i++){ + myparams[i]=params[(idVOX*nparams)+i]; + } + } - for(int i=0;i<NDIRECTIONS;i++){ - mydata[i]=data[(id*NDIRECTIONS)+i]; - } + __syncthreads(); - //if(id==1) for(int i=0;i<nparams;i++)printf("START[%i]: %.20f\n",i,myparams[i]); //do the fit - levenberg_marquardt_PVM_single_c_gpu(mydata, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams, m_include_f0, myparams); + levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,f_deriv,x,_d,sumf,myparams); - //double m_BIC; - //if (m_eval_BIC){ - // double RSS= cf_PVM_single_c(myparams,mydata,&bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams,m_include_f0); // get the sum of squared residuals - // m_BIC=NDIRECTIONS*log(double(RSS/NDIRECTIONS))+log(double(NDIRECTIONS))*nparams; // evaluate BIC - //} NOT USED at the moment + __syncthreads(); // finalise parameters // m_s0-myparams[0] m_d-myparams[1] m_f-m_th-m_ph-myparams[2,3,4,5, etc..] m_f0-myparams[nparams-1] - double m_f[NFIBRES]; // for partial_fsum - - myparams[1] = lambda2d_gpu(myparams[1]); - for(int k=0;k<nfib;k++){ - int kk = 2 + 3*(k); - myparams[kk] = beta2f_gpu(myparams[kk])*partial_fsum_PVM_single_c(m_f,k); - m_f[k]=myparams[kk]; - } + if(idB==0){ + double m_f[NFIBRES]; // for partial_fsum + + myparams[1] = lambda2d_gpu(myparams[1]); + for(int k=0;k<nfib;k++){ + int kk = 2 + 3*(k); + myparams[kk] = beta2f_gpu(myparams[kk])*partial_fsum_PVM_single_c(m_f,k); + m_f[k]=myparams[kk]; + } - //if (m_return_fanning) - //Fanning_angles_from_Hessian(); NOT USED at the moment - - if (m_include_f0) - myparams[nparams-1]= beta2f_gpu(myparams[nparams-1])*partial_fsum_PVM_single_c(m_f,nfib); + if (m_include_f0) + myparams[nparams-1]= beta2f_gpu(myparams[nparams-1])*partial_fsum_PVM_single_c(m_f,nfib); - sort_PVM_single_c(nfib,nparams,myparams); + sort_PVM_single_c(nfib,myparams); - for(int i=0;i<nparams;i++){ - params[(id*nparams)+i] = myparams[i]; - } + for(int i=0;i<nparams;i++){ + params[(idVOX*nparams)+i] = myparams[i]; + } + } } //in diffmodel.cc @@ -618,34 +592,125 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT //OUTPUT double* residuals) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id >=nvox) { return; } + int idB = threadIdx.x; + int idVOX = blockIdx.x; + + __shared__ double myparams[NPARAMS]; + __shared__ int nparams; + __shared__ bool my_include_f0; + __shared__ double val; + __shared__ double _d; + __shared__ double fs[NFIBRES]; + __shared__ double x[NFIBRES*3]; + __shared__ double sumf; + + double predicted_signal; + double mydata; + + if(idB==0){ + if (m_include_f0) + nparams = nfib*3 + 3; + else + nparams = nfib*3 + 2; - int nparams; - if (m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; + my_include_f0 = includes_f0[idVOX]; - bool my_include_f0 = includes_f0[id]; + //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] + + myparams[0]=params[(idVOX*nparams)+0]; + if(myparams[1]<0) myparams[1] = 0; //This can be due to numerical errors..sqrt + else myparams[1] = d2lambda_gpu(params[(idVOX*nparams)+1]); - double myparams[NPARAMS]; - double mydata[NDIRECTIONS]; + double partial_fsum; + for(int k=0;k<nfib;k++){ + int kk = 2+3*k; + //partial_fsum /////////// + partial_fsum=1.0; + for(int j=0;j<k;j++) + partial_fsum-=fs[j]; + ////////////////////////// + fs[k] = params[(idVOX*nparams)+kk]; + double tmpr=fs[k]/partial_fsum; + if (tmpr>1.0) tmpr=1; //This can be due to numerical errors + if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt + myparams[kk] = f2beta_gpu(tmpr); + myparams[kk+1] = params[(idVOX*nparams)+kk+1]; + myparams[kk+2] = params[(idVOX*nparams)+kk+2]; + } + if (my_include_f0){ + //partial_fsum /////////// + partial_fsum=1.0; + for(int j=0;j<NFIBRES;j++) + partial_fsum-=fs[j]; + ////////////////////////// + double tmpr=params[(idVOX*nparams)+nparams-1]/partial_fsum; + if (tmpr>1.0) tmpr=1; //This can be due to numerical errors..asin + if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt + myparams[nparams-1]= f2beta_gpu(tmpr); + } + } - for(int i=0;i<nparams;i++){ - myparams[i]=params[(id*nparams)+i]; - } + __syncthreads(); - for(int i=0;i<NDIRECTIONS;i++){ - mydata[i]=data[(id*NDIRECTIONS)+i]; - } + if(idB<nfib){ + int kk = 2+3*idB; + double sinth,costh,sinph,cosph; + sincos(myparams[kk+1],&sinth,&costh); + sincos(myparams[kk+2],&sinph,&cosph); + x[idB*3] = sinth*cosph; + x[idB*3+1] = sinth*sinph; + x[idB*3+2] = costh; + } - double predicted_signal[NDIRECTIONS]; + if(idB==0){ + double partial_fsum; + sumf=0; + for(int k=0;k<nfib;k++){ + int kk = 2+3*k; + ////// partial_fsum ////// + partial_fsum=1.0; + for(int j=0;j<k;j++) + partial_fsum-=fs[j]; + ////////////////////////// + fs[k] = beta2f_gpu(myparams[kk])*partial_fsum; + sumf += fs[k]; + } + _d = lambda2d_gpu(myparams[1]); + } - get_prediction_PVM_single_c(myparams, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nfib, nparams, my_include_f0, predicted_signal); + int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; + if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + + double3 x2; + int dir_iter=idB; + + __syncthreads(); - for(int i=0;i<NDIRECTIONS;i++){ //residuals=m_data-predicted_signal; - residuals[id*NDIRECTIONS+i]= mydata[i] - predicted_signal[i]; + for(int dir=0;dir<ndir;dir++){ + mydata = data[(idVOX*NDIRECTIONS)+dir_iter]; + predicted_signal=0; //pred = 0; + val = 0.0; + for(int k=0;k<nfib;k++){ + x2.x=x[k*3]; + x2.y=x[k*3+1]; + x2.z=x[k*3+2]; + val += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]); + } + if (my_include_f0){ + //partial_fsum /////////// + double partial_fsum=1.0; + for(int j=0;j<NFIBRES;j++) + partial_fsum-=fs[j]; + ////////////////////////// + double temp_f0= beta2f_gpu(myparams[nparams-1])*partial_fsum; + predicted_signal = myparams[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); + } + else + predicted_signal = myparams[0]*((1-sumf)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); + + //residuals=m_data-predicted_signal; + residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal; + + dir_iter+=THREADS_X_BLOCK_FIT; } } - diff --git a/CUDA/levenberg_marquardt.cu b/CUDA/levenberg_marquardt.cu index d13cca0..03f09da 100644 --- a/CUDA/levenberg_marquardt.cu +++ b/CUDA/levenberg_marquardt.cu @@ -11,6 +11,7 @@ #include "solver_mult_inverse.cu" #include "diffmodels.cuh" +#include "dim_blocks.h" #include "options.h" //CPU version in nonlin.h @@ -27,72 +28,93 @@ __device__ void levenberg_marquardt_PVM_single_gpu( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* step, //shared memory + double* grad, //shared memory + double* hess, //shared memory + double* inverse, //shared memory + double &pcf, //shared memory + double &ncf, //shared memory + double &lambda, //shared memory + double &cftol, //shared memory + double <ol, //shared memory + double &olambda, //shared memory + bool &success, //shared memory + bool &end, //shared memory + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory //INPUT-OUTPUT - double* myparams) + double* myparams) //shared memory { - double pcf; - double lambda=0.1; - double cftol=1.0e-8; - double ltol=1.0e20; - - bool success = true; - double olambda = 0.0; - double grad[NPARAMS]; - double hess[NPARAMS*NPARAMS]; - double inverse[NPARAMS]; - double step[NPARAMS]; - - double ncf=0; - - int maxiter=200; - int niter=0; - - pcf=cf_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0); - - while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true) - //function cost has decreise, we have advanced. + int niter=0; + int maxiter=200; + + if(idB==0){ + end=false; + lambda=0.1; + cftol=1.0e-8; + ltol=1.0e20; + success = true; + olambda = 0.0; + ncf=0; + } + + cf_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf); + __syncthreads(); + + while (!(success&&niter++>=maxiter)){ //if success we not increase niter (first condition is true) + //function cost has decreise, we have advanced. if(success){ - grad_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad); - hess_PVM_single(myparams,bvecs,bvals,nparams,m_include_f0,hess); - } - - for (int i=0; i<nparams; i++) { - hess[(i*nparams)+i]+=lambda-olambda; + grad_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,grad); + __syncthreads(); + hess_PVM_single(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,hess); } - solver(hess,grad,nparams,inverse); - - for (int i=0;i<nparams;i++){ - step[i]=-inverse[i]; - } + if(idB==0){ + for (int i=0; i<nparams; i++) { + hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L + } - for(int i=0;i<nparams;i++){ - step[i]=myparams[i]+step[i]; - } + solver(hess,grad,nparams,inverse); - ncf = cf_PVM_single(step,mydata,bvecs,bvals,nparams,m_include_f0); + for (int i=0;i<nparams;i++){ + step[i]=-inverse[i]; + } - if (success = (ncf < pcf)) { - olambda = 0.0; - for(int i=0;i<nparams;i++){ - myparams[i]=step[i]; + for(int i=0;i<nparams;i++){ + step[i]=myparams[i]+step[i]; } - lambda=lambda/10.0; - - if (zero_cf_diff_conv(pcf,ncf,cftol)){ - return; - } - pcf=ncf; - }else{ - olambda=lambda; - lambda=lambda*10.0; - if(lambda> ltol){ - return; + } + + __syncthreads(); + cf_PVM_single(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf); + + if(idB==0){ + if (success = (ncf < pcf)){ + olambda = 0.0; + for(int i=0;i<nparams;i++){ + myparams[i]=step[i]; + } + lambda=lambda/10.0; + + if (zero_cf_diff_conv(pcf,ncf,cftol)){ + end=true; + } + pcf=ncf; + }else{ + olambda=lambda; + lambda=lambda*10.0; + if(lambda> ltol){ + end=true; + } } } + __syncthreads(); + if(end) return; } - - return; } __device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT @@ -101,72 +123,94 @@ __device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* step, //shared memory + double* grad, //shared memory + double* hess, //shared memory + double* inverse, //shared memory + double &pcf, //shared memory + double &ncf, //shared memory + double &lambda, //shared memory + double &cftol, //shared memory + double <ol, //shared memory + double &olambda, //shared memory + bool &success, //shared memory + bool &end, //shared memory + double* shared, //shared memory + double* fs, //shared memory + double* f_deriv, //shared memory + double* x, //shared memory + double &_d, //shared memory + double &sumf, //shared memory //INPUT-OUTPUT - double* myparams) + double* myparams) //shared memory { - double pcf; - double lambda=0.1; - double cftol=1.0e-8; - double ltol=1.0e20; - - bool success = true; - double olambda = 0.0; - double grad[NPARAMS]; - double hess[NPARAMS*NPARAMS]; - double inverse[NPARAMS]; - double step[NPARAMS]; - - double ncf=0; - - int maxiter=200; - int niter=0; - - pcf=cf_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0); + int niter=0; + int maxiter=200; + + if(idB==0){ + end=false; + lambda=0.1; + cftol=1.0e-8; + ltol=1.0e20; + success = true; + olambda = 0.0; + ncf=0; + } + + cf_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf); + __syncthreads(); while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true) //function cost has decreise, we have advanced. if(success){ - grad_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad); - hess_PVM_single_c(myparams,bvecs,bvals,nparams,m_include_f0,hess); + grad_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,grad); + __syncthreads(); + hess_PVM_single_c(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,hess); } - for (int i=0; i<nparams; i++) { - hess[(i*nparams)+i]+=lambda-olambda; - } + if(idB==0){ + for (int i=0; i<nparams; i++) { + hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L + } - solver(hess,grad,nparams,inverse); + solver(hess,grad,nparams,inverse); - for (int i=0;i<nparams;i++){ - step[i]=-inverse[i]; - } + for (int i=0;i<nparams;i++){ + step[i]=-inverse[i]; + } - for(int i=0;i<nparams;i++){ - step[i]=myparams[i]+step[i]; - } - - ncf = cf_PVM_single_c(step,mydata,bvecs,bvals,nparams,m_include_f0); - - if (success = (ncf < pcf)) { - olambda = 0.0; - for(int i=0;i<nparams;i++){ - myparams[i]=step[i]; + for(int i=0;i<nparams;i++){ + step[i]=myparams[i]+step[i]; } - lambda=lambda/10.0; - - if (zero_cf_diff_conv(pcf,ncf,cftol)){ - return; - } - pcf=ncf; - }else{ - olambda=lambda; - lambda=lambda*10.0; - if(lambda> ltol){ - return; - } - } + } + + __syncthreads(); + cf_PVM_single_c(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf); + + if(idB==0){ + if (success = (ncf < pcf)) { + olambda = 0.0; + for(int i=0;i<nparams;i++){ + myparams[i]=step[i]; + } + lambda=lambda/10.0; + + if (zero_cf_diff_conv(pcf,ncf,cftol)){ + end=true; + } + pcf=ncf; + }else{ + olambda=lambda; + lambda=lambda*10.0; + if(lambda> ltol){ + end=true; + } + } + } + __syncthreads(); + if(end) return; } - - return; } @@ -176,71 +220,93 @@ __device__ void levenberg_marquardt_PVM_multi_gpu( //INPUT const double* bvals, const int nparams, const bool m_include_f0, + const int idB, + double* step, //shared memory + double* grad, //shared memory + double* hess, //shared memory + double* inverse, //shared memory + double &pcf, //shared memory + double &ncf, //shared memory + double &lambda, //shared memory + double &cftol, //shared memory + double <ol, //shared memory + double &olambda, //shared memory + bool &success, //shared memory + bool &end, //shared memory + double* shared, //shared memory + double* fs, //shared memory + double* x, //shared memory + double &_a, //shared memory + double &_b, //shared memory + double &sumf, //shared memory //INPUT-OUTPUT - double* myparams) + double* myparams) //shared memory { - double pcf; - double lambda=0.1; - double cftol=1.0e-8; - double ltol=1.0e20; - - bool success = true; - double olambda = 0.0; - double grad[NPARAMS]; - double hess[NPARAMS*NPARAMS]; - double inverse[NPARAMS]; - double step[NPARAMS]; - - double ncf=0; - - int maxiter=200; - int niter=0; - - pcf=cf_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0); + int niter=0; + int maxiter=200; + + if(idB==0){ + end=false; + lambda=0.1; + cftol=1.0e-8; + ltol=1.0e20; + success = true; + olambda = 0.0; + ncf=0; + } + + cf_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,pcf); + __syncthreads(); while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true) //function cost has decreise, we have advanced. if(success){ - grad_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad); - hess_PVM_multi(myparams,bvecs,bvals,nparams,m_include_f0,hess); - } - - for (int i=0; i<nparams; i++) { - hess[(i*nparams)+i]+=lambda-olambda; + grad_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,grad); + __syncthreads(); + hess_PVM_multi(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,hess); } - solver(hess,grad,nparams,inverse); - - for (int i=0;i<nparams;i++){ - step[i]=-inverse[i]; - } + if(idB==0){ + for (int i=0; i<nparams; i++) { + hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L + } - for(int i=0;i<nparams;i++){ - step[i]=myparams[i]+step[i]; - } + solver(hess,grad,nparams,inverse); - ncf = cf_PVM_multi(step,mydata,bvecs,bvals,nparams,m_include_f0); + for (int i=0;i<nparams;i++){ + step[i]=-inverse[i]; + } - if (success = (ncf < pcf)) { - olambda = 0.0; - for(int i=0;i<nparams;i++){ - myparams[i]=step[i]; + for(int i=0;i<nparams;i++){ + step[i]=myparams[i]+step[i]; } - lambda=lambda/10.0; - - if (zero_cf_diff_conv(pcf,ncf,cftol)){ - return; - } - pcf=ncf; - }else{ - olambda=lambda; - lambda=lambda*10.0; - if(lambda> ltol){ - return; - } - } + } + + __syncthreads(); + cf_PVM_multi(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,ncf); + + if(idB==0){ + if (success = (ncf < pcf)) { + olambda = 0.0; + for(int i=0;i<nparams;i++){ + myparams[i]=step[i]; + } + lambda=lambda/10.0; + + if (zero_cf_diff_conv(pcf,ncf,cftol)){ + end=true; + } + pcf=ncf; + }else{ + olambda=lambda; + lambda=lambda*10.0; + if(lambda> ltol){ + end=true; + } + } + } + __syncthreads(); + if(end) return; } - - return; } #endif -- GitLab