diff --git a/CUDA/PVM_multi.cu b/CUDA/PVM_multi.cu index 31dd2643a4aea2fac97d810effd0bf72f723b26f..f595c9ff39fcac813322dbaa74ca2c8dd97e0653 100644 --- a/CUDA/PVM_multi.cu +++ b/CUDA/PVM_multi.cu @@ -16,49 +16,49 @@ ///////////////////////////////////// ///////////////////////////////////// -__device__ inline double isoterm_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ - return exp(-_a*log(1+bvals[pt]*_b)); +__device__ inline double isoterm_PVM_multi(const int pt,const double* _a,const double* _b, const double *bvals){ + return exp(-*_a*log(1+bvals[pt]**_b)); } -__device__ inline double isoterm_a_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ - return -log(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b)); +__device__ inline double isoterm_a_PVM_multi(const int pt,const double* _a,const double* _b, const double *bvals){ + return -log(1+bvals[pt]**_b)*exp(-*_a*log(1+bvals[pt]**_b)); } -__device__ inline double isoterm_b_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ - return -_a*bvals[pt]/(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b)); +__device__ inline double isoterm_b_PVM_multi(const int pt,const double* _a,const double* _b, const double *bvals){ + return -*_a*bvals[pt]/(1+bvals[pt]**_b)*exp(-*_a*log(1+bvals[pt]**_b)); } -__device__ inline double anisoterm_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return exp(-_a*log(1+bvals[pt]*_b*(dp*dp))); +__device__ inline double anisoterm_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + return exp(-*_a*log(1+bvals[pt]**_b*(dp*dp))); } -__device__ inline double anisoterm_a_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return -log(1+bvals[pt]*(dp*dp)*_b)* exp(-_a*log(1+bvals[pt]*(dp*dp)*_b)); +__device__ inline double anisoterm_a_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + return -log(1+bvals[pt]*(dp*dp)**_b)* exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b)); } -__device__ inline double anisoterm_b_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return (-_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))); +__device__ inline double anisoterm_b_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + return (-*_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)**_b)*exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b))); } -__device__ inline double anisoterm_th_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){ +__device__ inline double anisoterm_th_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ double sinth,costh,sinph,cosph; sincos(_th,&sinth,&costh); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = costh* (bvecs[pt]*cosph + bvecs[NDIRECTIONS+pt]*sinph) - bvecs[(2*NDIRECTIONS)+pt]*sinth; - return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1); + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + double dp1 = costh* (bvecs[pt]*cosph + bvecs[ndirections+pt]*sinph) - bvecs[(2*ndirections)+pt]*sinth; + return (-*_a**_b*bvals[pt]/(1+bvals[pt]*(dp*dp)**_b)*exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b))*2*dp*dp1); } -__device__ inline double anisoterm_ph_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){ +__device__ inline double anisoterm_ph_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ double sinth,sinph,cosph; sinth=sin(_th); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = sinth* (-bvecs[pt]*sinph + bvecs[NDIRECTIONS+pt]*cosph); - return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1); + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + double dp1 = sinth* (-bvecs[pt]*sinph + bvecs[ndirections+pt]*cosph); + return (-*_a**_b*bvals[pt]/(1+bvals[pt]*(dp*dp)**_b)*exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b))*2*dp*dp1); } //in diffmodel.cc @@ -112,69 +112,71 @@ __device__ void cf_PVM_multi( //INPUT const double* mdata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_a, //shared memory - double &_b, //shared memory - double &sumf, //shared memory + double* _a, //shared memory + double* _b, //shared memory + double* sumf, //shared memory //OUTPUT - double &cfv) + double* cfv) { - if(idB<NFIBRES){ - int kk = 3+3*(idB); + if(idSubVOX<nfib){ + int kk = 3+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(params[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(params[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } - if(idB==0){ - _a= abs(params[1]); - _b= abs(params[2]); - cfv = 0.0; - sumf=0; - for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + if(idSubVOX==0){ + *_a= abs(params[1]); + *_b= abs(params[2]); + *cfv = 0.0; + *sumf=0; + for(int k=0;k<nfib;k++) *sumf+= fs[k]; } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; double err; double3 x2; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); - shared[idB]=0; + reduction[idSubVOX]=0; for(int dir=0;dir<ndir;dir++){ err = 0.0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - err += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,bvecs,bvals); + err += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,bvecs,bvals,ndirections); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); - err = (abs(params[0])*(temp_f0+((1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)))-mdata[dir_iter]; + err = (abs(params[0])*(temp_f0+((1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)))-mdata[dir_iter]; }else{ - err = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)-mdata[dir_iter]; + err = abs(params[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)-mdata[dir_iter]; } - shared[idB]+= err*err; - dir_iter+=THREADS_X_BLOCK_FIT; + reduction[idSubVOX]+= err*err; + dir_iter+=THREADS_BLOCK_FIT; } __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - cfv+=shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + *cfv+=reduction[i]; } } } @@ -185,46 +187,48 @@ __device__ void grad_PVM_multi( //INPUT const double* mdata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_a, //shared memory - double &_b, //shared memory - double &sumf, //shared memory + double* _a, //shared memory + double* _b, //shared memory + double* sumf, //shared memory //OUTPUT double* grad) { - if(idB<NFIBRES){ - int kk = 3+3*(idB); + if(idSubVOX<nfib){ + int kk = 3+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(params[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(params[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } - if(idB==0){ - _a= abs(params[1]); - _b= abs(params[2]); - sumf=0; - for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + if(idSubVOX==0){ + *_a= abs(params[1]); + *_b= abs(params[2]); + *sumf=0; + for(int k=0;k<nfib;k++) *sumf+= fs[k]; for (int p=0;p<nparams;p++) grad[p]=0; } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; - int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; + int max_dir = ndirections/THREADS_BLOCK_FIT; + if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[NPARAMS]; + double J[MAXNPARAMS]; double diff; double sig; double3 xx; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); @@ -232,28 +236,28 @@ __device__ void grad_PVM_multi( //INPUT for (int p=0; p<nparams; p++) J[p]=0; if(dir<ndir){ sig = 0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 3+3*(k); xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); - J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); + sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig=abs(params[0])*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + sig=abs(params[0])*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals))+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); }else{ - sig = abs(params[0]) * ((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + sig = abs(params[0]) * ((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); } diff = sig - mdata[dir_iter]; @@ -261,17 +265,17 @@ __device__ void grad_PVM_multi( //INPUT } for (int p=0;p<nparams;p++){ - shared[idB]=2*J[p]*diff; + reduction[idSubVOX]=2*J[p]*diff; __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - grad[p] += shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + grad[p] += reduction[i]; } } __syncthreads(); } - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=THREADS_BLOCK_FIT; } } @@ -280,33 +284,35 @@ __device__ void hess_PVM_multi( //INPUT const double* params, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_a, //shared memory - double &_b, //shared memory - double &sumf, //shared memory + double* _a, //shared memory + double* _b, //shared memory + double* sumf, //shared memory //OUTPUT double* hess) { - if(idB<NFIBRES){ - int kk = 3+3*(idB); + if(idSubVOX<nfib){ + int kk = 3+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(params[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(params[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } - if(idB==0){ - _a= abs(params[1]); - _b= abs(params[2]); - sumf=0; - for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + if(idSubVOX==0){ + *_a= abs(params[1]); + *_b= abs(params[2]); + *sumf=0; + for(int k=0;k<nfib;k++) *sumf+= fs[k]; for (int p=0;p<nparams;p++){ for (int p2=0;p2<nparams;p2++){ hess[p*nparams+p2] = 0; @@ -314,15 +320,15 @@ __device__ void hess_PVM_multi( //INPUT } } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; - int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; + int max_dir = ndirections/THREADS_BLOCK_FIT; + if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[NPARAMS]; + double J[MAXNPARAMS]; double sig; double3 xx; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); @@ -330,29 +336,29 @@ __device__ void hess_PVM_multi( //INPUT for (int p=0; p<nparams; p++) J[p]=0; if(dir<ndir){ sig = 0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 3+3*(k); xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); + sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals); - J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*cov; - J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*cov; + J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig = abs(params[0])* (temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + sig = abs(params[0])* (temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); }else{ - sig = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + sig = abs(params[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); } J[0] = sig/params[0]; @@ -361,20 +367,20 @@ __device__ void hess_PVM_multi( //INPUT for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - shared[idB]=2*(J[p]*J[p2]); + reduction[idSubVOX]=2*(J[p]*J[p2]); __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - hess[p*nparams+p2] += shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + hess[p*nparams+p2] += reduction[i]; } } __syncthreads(); } } - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=THREADS_BLOCK_FIT; } - if(idB==0){ + if(idSubVOX==0){ for (int j=0; j<nparams; j++) { for (int i=j+1; i<nparams; i++) { hess[i*nparams+j]=hess[j*nparams+i]; @@ -390,43 +396,48 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT const double* bvecs, const double* bvals, const int nvox, - const int nfib, + const int ndirections, + const int nfib, + const int nparams, const bool m_include_f0, //OUTPUT double* params) { - int idB = threadIdx.x; + int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; - - __shared__ double myparams[NPARAMS]; - __shared__ int nparams; - - __shared__ double step[NPARAMS]; - __shared__ double grad[NPARAMS]; - __shared__ double hess[NPARAMS*NPARAMS]; - __shared__ double inverse[NPARAMS]; - __shared__ double pcf; - __shared__ double ncf; - __shared__ double lambda; - __shared__ double cftol; - __shared__ double ltol; - __shared__ double olambda; - __shared__ bool success; - __shared__ bool end; - - __shared__ double shared[THREADS_X_BLOCK_FIT]; - __shared__ double fs[NFIBRES]; - __shared__ double x[NFIBRES*3]; - __shared__ double _a; - __shared__ double _b; - __shared__ double sumf; - - if(idB==0){ - if (m_include_f0) - nparams = nfib*3 + 4; - else - nparams = nfib*3 + 3; - + int threadsBlock = blockDim.x; + + ////////// DYNAMIC SHARED MEMORY /////////// + extern __shared__ double shared[]; + double* reduction = (double*)shared; //threadsBlock + double* myparams = (double*) &reduction[threadsBlock]; //nparams + double* grad = (double*) &myparams[nparams]; //nparams + double* hess = (double*) &grad[nparams]; //nparams*nparams + double* step = (double*) &hess[nparams*nparams]; //nparams + double* inverse = (double*) &step[nparams]; //nparams + double* pcf = (double*) &inverse[nparams]; //1 + double* ncf = (double*) &pcf[1]; //1 + double* lambda = (double*) &ncf[1]; //1 + double* cftol = (double*) &lambda[1]; //1 + double* ltol = (double*) &cftol[1]; //1 + double* olambda = (double*) <ol[1]; //1 + + double* fs = (double*) &olambda[1]; //nfib + double* x = (double*) &fs[nfib]; //nfib*3 + double* _a = (double*) &x[nfib*3]; //1 + double* _b = (double*) &_a[1]; //1 + double* sumf = (double*) &_b[1]; //1 + + double* C = (double*)&sumf[1]; //nparams*nparams; + double* el = (double*)&C[nparams*nparams]; //nparams + + int* indx = (int*)&el[nparams]; //nparams + int* success = (int*) &indx[nparams]; //1 + int* end = (int*) &success[1]; //1 + ////////// DYNAMIC SHARED MEMORY /////////// + + if(idSubVOX==0){ + int nparams_single_c = nparams-1; myparams[0] = params_PVM_single_c[(idVOX*nparams_single_c)+0]; //pvm1.get_s0(); @@ -444,14 +455,14 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT __syncthreads(); //do the fit - levenberg_marquardt_PVM_multi_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,x,_a,_b,sumf,myparams); + levenberg_marquardt_PVM_multi_gpu(&data[idVOX*ndirections],&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_a,_b,sumf,C,el,indx,myparams); __syncthreads(); // finalise parameters //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] - if(idB==0){ + if(idSubVOX==0){ double aux = myparams[1]; myparams[1] = abs(aux*myparams[2]); @@ -464,10 +475,11 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT sort_PVM_multi(nfib,myparams); fix_fsum_PVM_multi(m_include_f0,nfib,nparams,myparams); + } + __syncthreads(); - for(int i=0;i<nparams;i++){ - params[(idVOX*nparams)+i] = myparams[i]; - } + if(idSubVOX<nparams){ + params[(idVOX*nparams)+idSubVOX] = myparams[idSubVOX]; } } @@ -478,51 +490,50 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT const double* bvecs, const double* bvals, const int nvox, + const int ndirections, const int nfib, + const int nparams, const bool m_include_f0, const bool* includes_f0, //OUTPUT double* residuals) { - int idB = threadIdx.x; + int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; - __shared__ double myparams[NPARAMS]; - __shared__ int nparams; - __shared__ bool my_include_f0; - __shared__ double val; - __shared__ double _a; - __shared__ double _b; - __shared__ double fs[NFIBRES]; - __shared__ double x[NFIBRES*3]; - __shared__ double sumf; - + ////////// DYNAMIC SHARED MEMORY /////////// + extern __shared__ double shared[]; + double* myparams = (double*) shared; //nparams + double* fs = (double*) &myparams[nparams]; //nfib + double* x = (double*) &fs[nfib]; //nfib*3 + double* _a = (double*) &x[nfib*3]; //1 + double* _b = (double*) &_a[1]; //1 + double* sumf = (double*) &_b[1]; //1 + int* my_include_f0 = (int*) &sumf[1]; //1 + ////////// DYNAMIC SHARED MEMORY /////////// + + double val; double predicted_signal; double mydata; - if(idB==0){ - if (m_include_f0) - nparams = nfib*3 + 4; - else - nparams = nfib*3 + 3; - - my_include_f0 = includes_f0[idVOX]; + if(idSubVOX==0){ + *my_include_f0 = includes_f0[idVOX]; //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] myparams[0] = params[(idVOX*nparams)+0]; double aux1 = params[(idVOX*nparams)+1]; double aux2 = params[(idVOX*nparams)+2]; - //m_d*m_d/m_d_std/m_d_std; - myparams[1] = aux1*aux1/aux2/aux2; + + myparams[1] = aux1*aux1/aux2/aux2; //m_d*m_d/m_d_std/m_d_std; myparams[2] = aux2*aux2/aux1; //m_d_std*m_d_std/m_d; // =1/beta - if (my_include_f0) + if (*my_include_f0) myparams[nparams-1]=f2x_gpu(params[(idVOX*nparams)+nparams-1]); } - if(idB<nfib){ - int kk = 3+3*idB; + if(idSubVOX<nfib){ + int kk = 3+3*idSubVOX; double sinth,costh,sinph,cosph; myparams[kk] = f2x_gpu(params[(idVOX*nparams)+kk]); @@ -531,52 +542,50 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT sincos(myparams[kk+1],&sinth,&costh); sincos(myparams[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(myparams[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(myparams[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } __syncthreads(); - if(idB==0){ - _a = abs(myparams[1]); - _b = abs(myparams[2]); - sumf=0; + if(idSubVOX==0){ + *_a = abs(myparams[1]); + *_b = abs(myparams[2]); + *sumf=0; for(int k=0;k<nfib;k++){ - sumf += fs[k]; + *sumf += fs[k]; + } } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; double3 x2; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); for(int dir=0;dir<ndir;dir++){ - mydata = data[(idVOX*NDIRECTIONS)+dir_iter]; + mydata = data[(idVOX*ndirections)+dir_iter]; predicted_signal=0; //pred = 0; val = 0.0; for(int k=0;k<nfib;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]); + val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections); } - if (my_include_f0){ + if (*my_include_f0){ double temp_f0=x2f_gpu(myparams[nparams-1]); - predicted_signal = abs(myparams[0])*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val); - } - else - predicted_signal = abs(myparams[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val); + predicted_signal = abs(myparams[0])*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*ndirections])+val); + }else{ + predicted_signal = abs(myparams[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*ndirections])+val); } //residuals=m_data-predicted_signal; - residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal; + residuals[idVOX*ndirections+dir_iter]= mydata - predicted_signal; - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=THREADS_BLOCK_FIT; } -} - diff --git a/CUDA/PVM_single.cu b/CUDA/PVM_single.cu index 8f65048259e3ecfca3ac15e4b3bc664819ba68cc..ecdf6c9afb0731e3d3fba4e31a8038ee49471768 100644 --- a/CUDA/PVM_single.cu +++ b/CUDA/PVM_single.cu @@ -19,45 +19,45 @@ ///////////////////////////////////// __device__ -inline double isoterm_PVM_single(const int pt,const double _d,const double *bvals){ - return exp(-bvals[pt]*_d); +inline double isoterm_PVM_single(const int pt,const double* _d,const double *bvals){ + return exp(-bvals[pt]**_d); } __device__ -inline double isoterm_d_PVM_single(const int pt,const double _d,const double *bvals){ - return (-bvals[pt]*exp(-bvals[pt]*_d)); +inline double isoterm_d_PVM_single(const int pt,const double* _d,const double *bvals){ + return (-bvals[pt]*exp(-bvals[pt]**_d)); } __device__ -inline double anisoterm_PVM_single(const int pt,const double _d,const double3 x, const double *bvecs, const double *bvals){ - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return exp(-bvals[pt]*_d*dp*dp); +inline double anisoterm_PVM_single(const int pt,const double* _d,const double3 x, const double *bvecs, const double *bvals, const int ndirections){ + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + return exp(-bvals[pt]**_d*dp*dp); } __device__ -inline double anisoterm_d_PVM_single(const int pt,const double _d,const double3 x,const double *bvecs, const double *bvals){ - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return(-bvals[pt]*dp*dp*exp(-bvals[pt]*_d*dp*dp)); +inline double anisoterm_d_PVM_single(const int pt,const double* _d,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + return(-bvals[pt]*dp*dp*exp(-bvals[pt]**_d*dp*dp)); } __device__ -inline double anisoterm_th_PVM_single(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ +inline double anisoterm_th_PVM_single(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ double sinth,costh,sinph,cosph; sincos(_th,&sinth,&costh); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = (costh*(bvecs[pt]*cosph+bvecs[NDIRECTIONS+pt]*sinph)-bvecs[(2*NDIRECTIONS)+pt]*sinth); - return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + double dp1 = (costh*(bvecs[pt]*cosph+bvecs[ndirections+pt]*sinph)-bvecs[(2*ndirections)+pt]*sinth); + return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } __device__ -inline double anisoterm_ph_PVM_single(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ +inline double anisoterm_ph_PVM_single(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ double sinth,sinph,cosph; sinth=sin(_th); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[NDIRECTIONS+pt]*cosph); - return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[ndirections+pt]*cosph); + return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } //in diffmodel.cc @@ -109,69 +109,71 @@ __device__ void cf_PVM_single( //INPUT const double* mdata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory //OUTPUT - double &cfv) + double* cfv) { - if(idB<NFIBRES){ - int kk = 2+3*(idB); + if(idSubVOX<nfib){ + int kk = 2+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(params[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(params[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } __syncthreads(); - if(idB==0){ - _d = abs(params[1]); - cfv = 0.0; - sumf=0; - for(int k=0;k<NFIBRES;k++) sumf+= fs[k]; + if(idSubVOX==0){ + *_d = abs(params[1]); + *cfv = 0.0; + *sumf=0; + for(int k=0;k<nfib;k++) *sumf+= fs[k]; } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; double err; double3 x2; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); - shared[idB]=0; + reduction[idSubVOX]=0; for(int dir=0;dir<ndir;dir++){ err = 0.0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - err += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,bvecs,bvals); + err += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,bvecs,bvals,ndirections); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); - err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+err))-mdata[dir_iter]; + err= (params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+err))-mdata[dir_iter]; }else{ - err = (params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+err))-mdata[dir_iter]; + err = (params[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+err))-mdata[dir_iter]; } - shared[idB]+= err*err; - dir_iter+=THREADS_X_BLOCK_FIT; + reduction[idSubVOX]+= err*err; + dir_iter+=THREADS_BLOCK_FIT; } __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - cfv+=shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + *cfv+=reduction[i]; } } } @@ -182,47 +184,49 @@ __device__ void grad_PVM_single( //INPUT const double* mdata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory //OUTPUT double* grad) { - if(idB<NFIBRES){ - int kk = 2+3*(idB); + if(idSubVOX<nfib){ + int kk = 2+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(params[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(params[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } __syncthreads(); - if(idB==0){ - sumf=0; - for(int i=0;i<NFIBRES;i++) sumf+=fs[i]; - _d = abs(params[1]); + if(idSubVOX==0){ + *sumf=0; + for(int i=0;i<nfib;i++) *sumf+=fs[i]; + *_d = abs(params[1]); for (int p=0;p<nparams;p++) grad[p]=0; } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; - int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; + int max_dir = ndirections/THREADS_BLOCK_FIT; + if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[NPARAMS]; + double J[MAXNPARAMS]; double diff; double sig; double3 xx; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); @@ -230,43 +234,43 @@ __device__ void grad_PVM_single( //INPUT for (int p=0; p<nparams; p++) J[p]=0; if(dir<ndir){ sig = 0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 2+3*(k); xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals); - J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); + J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig= params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); + sig= params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); }else{ - sig = params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); + sig = params[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); } diff = sig - mdata[dir_iter]; J[0] = sig/params[0]; } for (int p=0;p<nparams;p++){ - shared[idB]=2*J[p]*diff; + reduction[idSubVOX]=2*J[p]*diff; __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - grad[p] += shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + grad[p] += reduction[i]; } } __syncthreads(); } - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=THREADS_BLOCK_FIT; } } @@ -275,34 +279,36 @@ __device__ void hess_PVM_single( //INPUT const double* params, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory //OUTPUT double* hess) { - if(idB<NFIBRES){ - int kk = 2+3*(idB); + if(idSubVOX<nfib){ + int kk = 2+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(params[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(params[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } __syncthreads(); - if(idB==0){ - sumf=0; - for(int i=0;i<NFIBRES;i++) sumf+=fs[i]; - _d = abs(params[1]); + if(idSubVOX==0){ + *sumf=0; + for(int i=0;i<nfib;i++) *sumf+=fs[i]; + *_d = abs(params[1]); for (int p=0;p<nparams;p++){ for (int p2=0;p2<nparams;p2++){ hess[p*nparams+p2] = 0; @@ -310,15 +316,15 @@ __device__ void hess_PVM_single( //INPUT } } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; - int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; + int max_dir = ndirections/THREADS_BLOCK_FIT; + if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[NPARAMS]; + double J[MAXNPARAMS]; double sig; double3 xx; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); @@ -326,26 +332,26 @@ __device__ void hess_PVM_single( //INPUT for (int p=0; p<nparams; p++) J[p]=0; if(dir<ndir){ sig = 0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 2+3*(k); xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals); - J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); + J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ double temp_f0=x2f_gpu(params[nparams-1]); J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); - sig=params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); + sig=params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); }else{ - sig = params[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); + sig = params[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); + J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); } J[0] = sig/params[0]; } @@ -353,20 +359,20 @@ __device__ void hess_PVM_single( //INPUT for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - shared[idB]=2*(J[p]*J[p2]); + reduction[idSubVOX]=2*(J[p]*J[p2]); __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - hess[p*nparams+p2] += shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + hess[p*nparams+p2] += reduction[i]; } } __syncthreads(); } } - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=THREADS_BLOCK_FIT; } - if(idB==0){ + if(idSubVOX==0){ for (int j=0; j<nparams; j++) { for (int i=j+1; i<nparams; i++) { hess[i*nparams+j]=hess[j*nparams+i]; @@ -381,58 +387,60 @@ extern "C" __global__ void fit_PVM_single_kernel( //INPUT const double* bvecs, const double* bvals, const int nvox, + const int ndirections, const int nfib, + const int nparams, const bool m_include_f0, //INPUT-OUTPUT double* params) { - int idB = threadIdx.x; + int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; - - __shared__ double myparams[NPARAMS]; - __shared__ int nparams; - - __shared__ double step[NPARAMS]; - __shared__ double grad[NPARAMS]; - __shared__ double hess[NPARAMS*NPARAMS]; - __shared__ double inverse[NPARAMS]; - __shared__ double pcf; - __shared__ double ncf; - __shared__ double lambda; - __shared__ double cftol; - __shared__ double ltol; - __shared__ double olambda; - __shared__ bool success; - __shared__ bool end; - - __shared__ double shared[THREADS_X_BLOCK_FIT]; - __shared__ double fs[NFIBRES]; - __shared__ double x[NFIBRES*3]; - __shared__ double _d; - __shared__ double sumf; - - if(idB==0){ - if(m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; - - for(int i=0;i<nparams;i++){ - myparams[i]=params[(idVOX*nparams)+i]; - } + int threadsBlock = blockDim.x; + + ////////// DYNAMIC SHARED MEMORY /////////// + extern __shared__ double shared[]; + double* reduction = (double*)shared; //threadsBlock + double* myparams = (double*) &reduction[threadsBlock]; //nparams + double* grad = (double*) &myparams[nparams]; //nparams + double* hess = (double*) &grad[nparams]; //nparams*nparams + double* step = (double*) &hess[nparams*nparams]; //nparams + double* inverse = (double*) &step[nparams]; //nparams + double* pcf = (double*) &inverse[nparams]; //1 + double* ncf = (double*) &pcf[1]; //1 + double* lambda = (double*) &ncf[1]; //1 + double* cftol = (double*) &lambda[1]; //1 + double* ltol = (double*) &cftol[1]; //1 + double* olambda = (double*) <ol[1]; //1 + + double* fs = (double*) &olambda[1]; //nfib + double* x = (double*) &fs[nfib]; //nfib*3 + double* _d = (double*) &x[nfib*3]; //1 + double* sumf = (double*) &_d[1]; //1 + + double* C = (double*)&sumf[1]; //nparams*nparams; + double* el = (double*)&C[nparams*nparams]; //nparams + + int* indx = (int*)&el[nparams]; //nparams + int* success = (int*) &indx[nparams]; //1 + int* end = (int*) &success[1]; //1 + ////////// DYNAMIC SHARED MEMORY /////////// + + if(idSubVOX<nparams){ + myparams[idSubVOX]=params[(idVOX*nparams)+idSubVOX]; } __syncthreads(); // do the fit - levenberg_marquardt_PVM_single_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,x,_d,sumf,myparams); + levenberg_marquardt_PVM_single_gpu(&data[idVOX*ndirections],&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_d,sumf,C,el,indx,myparams); __syncthreads(); // finalise parameters //m_s0 in myparams[0] m_d in myparams[1] m_f-m_th-m_ph in myparams[2,3,4,5, etc..] m_f0 in myparams[nparams-1] - if(idB==0){ + if(idSubVOX==0){ myparams[1] = abs(myparams[1]); for(int k=1;k<=nfib;k++){ int kk = 2 + 3*(k-1); @@ -443,10 +451,11 @@ extern "C" __global__ void fit_PVM_single_kernel( //INPUT sort_PVM_single(nfib,myparams); fix_fsum_PVM_single(m_include_f0,nfib,nparams,myparams); + } + __syncthreads(); - for(int i=0;i<nparams;i++){ - params[idVOX*nparams+i]=myparams[i]; - } + if(idSubVOX<nparams){ + params[idVOX*nparams+idSubVOX]=myparams[idSubVOX]; } } @@ -457,46 +466,46 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT const double* bvecs, const double* bvals, const int nvox, + const int ndirections, const int nfib, + const int nparams, const bool m_include_f0, const bool* includes_f0, //OUTPUT double* residuals) { - int idB = threadIdx.x; + int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; - - __shared__ double myparams[NPARAMS]; - __shared__ int nparams; - __shared__ bool my_include_f0; - __shared__ double val; - __shared__ double _d; - __shared__ double fs[NFIBRES]; - __shared__ double x[NFIBRES*3]; - __shared__ double sumf; - + int threadsBlock = blockDim.x; + + ////////// DYNAMIC SHARED MEMORY /////////// + extern __shared__ double shared[]; + double* myparams = (double*) shared; //nparams + double* fs = (double*) &myparams[nparams]; //nfib + double* x = (double*) &fs[nfib]; //nfib*3 + double* _d = (double*) &x[nfib*3]; //1 + double* sumf = (double*) &_d[1]; //1 + int* my_include_f0 = (int*) &sumf[1]; //1 + ////////// DYNAMIC SHARED MEMORY /////////// + + double val; double predicted_signal; double mydata; - if(idB==0){ - if (m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; - - my_include_f0 = includes_f0[idVOX]; + if(idSubVOX==0){ + *my_include_f0 = includes_f0[idVOX]; - //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] + //m_s0-myparams[0] m_d-myparams[1] m_f-m_th-m_ph-myparams[2,3,4,5 etc..] m_f0-myparams[nparams-1] myparams[0]=params[(idVOX*nparams)+0]; myparams[1]=params[(idVOX*nparams)+1]; - if (my_include_f0) + if (*my_include_f0) myparams[nparams-1]=f2x_gpu(params[(idVOX*nparams)+nparams-1]); } - if(idB<nfib){ - int kk = 2+3*idB; + if(idSubVOX<nfib){ + int kk = 2+3*idSubVOX; double sinth,costh,sinph,cosph; myparams[kk] = f2x_gpu(params[(idVOX*nparams)+kk]); @@ -506,49 +515,49 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT sincos(myparams[kk+1],&sinth,&costh); sincos(myparams[kk+2],&sinph,&cosph); - fs[idB] = x2f_gpu(myparams[kk]); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + fs[idSubVOX] = x2f_gpu(myparams[kk]); + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } __syncthreads(); - if(idB==0){ - sumf=0; - for(int i=0;i<NFIBRES;i++) sumf+=fs[i]; - _d = abs(myparams[1]); + if(idSubVOX==0){ + *sumf=0; + for(int i=0;i<nfib;i++) *sumf+=fs[i]; + *_d = abs(myparams[1]); } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int ndir = ndirections/threadsBlock; + if(idSubVOX<(ndirections%threadsBlock)) ndir++; double3 x2; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); for(int dir=0;dir<ndir;dir++){ - mydata = data[(idVOX*NDIRECTIONS)+dir_iter]; + mydata = data[(idVOX*ndirections)+dir_iter]; predicted_signal=0; //pred = 0; val = 0.0; for(int k=0;k<nfib;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - val += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]); + val += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections); } - if (my_include_f0){ + if (*my_include_f0){ double temp_f0=x2f_gpu(myparams[nparams-1]); - predicted_signal = myparams[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); - } - else - predicted_signal = myparams[0]*((1-sumf)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); + predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*ndirections])+val); + }else{ + predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,&bvals[idVOX*ndirections])+val); + } //residuals=m_data-predicted_signal; - residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal; + residuals[idVOX*ndirections+dir_iter]= mydata - predicted_signal; - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=threadsBlock; } } diff --git a/CUDA/PVM_single_c.cu b/CUDA/PVM_single_c.cu index 09c399f82bcf3a196afc2dd73b0fd551c9ef9d73..a23aa790558b523dd40bbcb0818abdf516e7a6e2 100644 --- a/CUDA/PVM_single_c.cu +++ b/CUDA/PVM_single_c.cu @@ -19,8 +19,8 @@ ///////////////////////////////////// __device__ -inline double isoterm_PVM_single_c(const int pt,const double _d,const double *bvals){ - return exp(-bvals[pt]*_d); +inline double isoterm_PVM_single_c(const int pt,const double* _d,const double *bvals){ + return exp(-bvals[pt]**_d); } __device__ @@ -29,35 +29,35 @@ inline double isoterm_lambda_PVM_single_c(const int pt,const double lambda,const } __device__ -inline double anisoterm_PVM_single_c(const int pt,const double _d,const double3 x, const double *bvecs, const double *bvals){ - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - return exp(-bvals[pt]*_d*dp*dp); +inline double anisoterm_PVM_single_c(const int pt,const double* _d,const double3 x, const double *bvecs, const double *bvals, const int ndirections){ + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + return exp(-bvals[pt]**_d*dp*dp); } __device__ -inline double anisoterm_lambda_PVM_single_c(const int pt,const double lambda,const double3 x, const double *bvecs, const double *bvals){ - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; +inline double anisoterm_lambda_PVM_single_c(const int pt,const double lambda,const double3 x, const double *bvecs, const double *bvals, const int ndirections){ + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return(-2*bvals[pt]*lambda*dp*dp*exp(-bvals[pt]*lambda*lambda*dp*dp)); } __device__ -inline double anisoterm_th_PVM_single_c(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ +inline double anisoterm_th_PVM_single_c(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ double sinth,costh,sinph,cosph; sincos(_th,&sinth,&costh); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = costh*(bvecs[pt]*cosph+bvecs[NDIRECTIONS+pt]*sinph)-bvecs[(2*NDIRECTIONS)+pt]*sinth; - return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + double dp1 = costh*(bvecs[pt]*cosph+bvecs[ndirections+pt]*sinph)-bvecs[(2*ndirections)+pt]*sinth; + return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } __device__ -inline double anisoterm_ph_PVM_single_c(const int pt,const double _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals){ +inline double anisoterm_ph_PVM_single_c(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ double sinth,sinph,cosph; sinth=sin(_th); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; - double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[NDIRECTIONS+pt]*cosph); - return(-2*bvals[pt]*_d*dp*dp1*exp(-bvals[pt]*_d*dp*dp)); + double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[ndirections+pt]*cosph); + return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } //If the sum of the fractions is >1, then zero as many fractions @@ -79,17 +79,6 @@ __device__ void fix_fsum_PVM_single_c( //INPUT } } -//Returns 1-Sum(f_j), 1<=j<=ii. (ii<=nfib) -//Used for transforming beta to f and vice versa -//in diffmodel.cc -__device__ double partial_fsum_PVM_single_c(double* fs, int ii){ - double fsum=1.0; - for(int j=0;j<ii;j++){ - fsum-=fs[j]; - } - return fsum; -} - //in diffmodel.cc __device__ void sort_PVM_single_c(int nfib,double* params) { @@ -116,13 +105,13 @@ __device__ void fractions_deriv_PVM_single_c( //INPUT const double* params, const double* fs, const int nfib, - const int idB, + const int idSubVOX, //OUTPUT double* Deriv) { int nparams_per_fibre=3; double fsum; - int k=idB%nfib; + int k=idSubVOX%nfib; for (int j=0; j<nfib; j++){ Deriv[j*nfib+k]=0; } @@ -155,32 +144,34 @@ __device__ void cf_PVM_single_c( //INPUT const double* mdata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory //OUTPUT - double &cfv) + double* cfv) { - if(idB<NFIBRES){ - int kk = 2+3*(idB); + if(idSubVOX<nfib){ + int kk = 2+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } - if(idB==0){ - _d = lambda2d_gpu(params[1]); - cfv = 0.0; - sumf=0; + if(idSubVOX==0){ + *_d = lambda2d_gpu(params[1]); + *cfv = 0.0; + *sumf=0; double partial_fsum; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 2+3*(k); //partial_fsum /////////// partial_fsum=1.0; @@ -188,48 +179,48 @@ __device__ void cf_PVM_single_c( //INPUT partial_fsum-=fs[j]; ////////////////////////// fs[k] = beta2f_gpu(params[kk])*partial_fsum; - sumf += fs[k]; + *sumf += fs[k]; } } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; double err; double3 x2; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); - shared[idB]=0; + reduction[idSubVOX]=0; for(int dir=0;dir<ndir;dir++){ err = 0.0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - err += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,bvecs,bvals); + err += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,bvecs,bvals,ndirections); } if(m_include_f0){ //partial_fsum /////////// double partial_fsum=1.0; - for(int j=0;j<NFIBRES;j++) + for(int j=0;j<nfib;j++) partial_fsum-=fs[j]; ////////////////////////// double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; - err= (params[0]*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,bvals))+err))-mdata[dir_iter]; + err= (params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,bvals))+err))-mdata[dir_iter]; }else{ - err = params[0]*((1-sumf)*isoterm_PVM_single_c(dir_iter,_d,bvals)+err)-mdata[dir_iter]; + err = params[0]*((1-*sumf)*isoterm_PVM_single_c(dir_iter,_d,bvals)+err)-mdata[dir_iter]; } - shared[idB]+= err*err; - dir_iter+=THREADS_X_BLOCK_FIT; + reduction[idSubVOX]+= err*err; + dir_iter+=THREADS_BLOCK_FIT; } __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - cfv+=shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + *cfv+=reduction[i]; } } } @@ -241,32 +232,34 @@ __device__ void grad_PVM_single_c( //INPUT const double* mdata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* f_deriv, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory //OUTPUT double* grad) { - if(idB<NFIBRES){ - int kk = 2+3*(idB); + if(idSubVOX<nfib){ + int kk = 2+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } - if(idB==0){ - _d = lambda2d_gpu(params[1]); - sumf=0; + if(idSubVOX==0){ + *_d = lambda2d_gpu(params[1]); + *sumf=0; double partial_fsum; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 2+3*(k); //partial_fsum /////////// partial_fsum=1.0; @@ -274,29 +267,29 @@ __device__ void grad_PVM_single_c( //INPUT partial_fsum-=fs[j]; ////////////////////////// fs[k] = beta2f_gpu(params[kk])*partial_fsum; - sumf += fs[k]; + *sumf += fs[k]; } for (int p=0;p<nparams;p++) grad[p]=0; } __syncthreads(); - if(idB<NFIBRES){ - fractions_deriv_PVM_single_c(params,fs,NFIBRES,idB,f_deriv); + if(idSubVOX<nfib){ + fractions_deriv_PVM_single_c(params,fs,nfib,idSubVOX,f_deriv); } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; - int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; + int max_dir = ndirections/THREADS_BLOCK_FIT; + if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[NPARAMS]; + double J[MAXNPARAMS]; double diff; double sig; double Iso_term; double3 xx; - int dir_iter=idB; - double Aniso_terms[NFIBRES]; + int dir_iter=idSubVOX; + double Aniso_terms[MAXNFIBRES]; __syncthreads(); @@ -304,61 +297,61 @@ __device__ void grad_PVM_single_c( //INPUT for (int p=0; p<nparams; p++) J[p]=0; if(dir<ndir){ Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals); //Precompute some terms for this datapoint - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals); + Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); } sig = 0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 2+3*(k); xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; sig += fs[k]*Aniso_terms[k]; - J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals); + J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals,ndirections); J[kk] = 0; - for (int j=0;j<NFIBRES;j++){ - if(f_deriv[j*NFIBRES+k]!=0){ - J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; + for (int j=0;j<nfib;j++){ + if(f_deriv[j*nfib+k]!=0){ + J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*nfib+k]; } } - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ //partial_fsum /////////// double partial_fsum=1.0; - for(int j=0;j<(NFIBRES);j++) + for(int j=0;j<(nfib);j++) partial_fsum-=fs[j]; ////////////////////////// double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; //derivative with respect to f0 J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; - sig=params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig); - J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + sig=params[0]*((temp_f0+(1-*sumf-temp_f0)*Iso_term)+sig); + J[1] += params[0]*(1-*sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); }else{ - sig = params[0]*((1-sumf)*Iso_term+sig); - J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + sig = params[0]*((1-*sumf)*Iso_term+sig); + J[1] += params[0]*(1-*sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); } diff = sig - mdata[dir_iter]; J[0] = sig/params[0]; } for (int p=0;p<nparams;p++){ - shared[idB]=2*J[p]*diff; + reduction[idSubVOX]=2*J[p]*diff; __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - grad[p] += shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + grad[p] += reduction[i]; } } __syncthreads(); } - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=THREADS_BLOCK_FIT; } } @@ -368,32 +361,34 @@ __device__ void hess_PVM_single_c( //INPUT const double* params, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, - double* shared, //shared memory + const int idSubVOX, + double* reduction, //shared memory double* fs, //shared memory double* f_deriv, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory //OUTPUT double* hess) { - if(idB<NFIBRES){ - int kk = 2+3*(idB); + if(idSubVOX<nfib){ + int kk = 2+3*(idSubVOX); double sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } - if(idB==0){ - _d = lambda2d_gpu(params[1]); - sumf=0; + if(idSubVOX==0){ + *_d = lambda2d_gpu(params[1]); + *sumf=0; double partial_fsum; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 2+3*(k); //partial_fsum /////////// partial_fsum=1.0; @@ -401,7 +396,7 @@ __device__ void hess_PVM_single_c( //INPUT partial_fsum-=fs[j]; ////////////////////////// fs[k] = beta2f_gpu(params[kk])*partial_fsum; - sumf += fs[k]; + *sumf += fs[k]; } for (int p=0;p<nparams;p++){ for (int p2=0;p2<nparams;p2++){ @@ -412,21 +407,21 @@ __device__ void hess_PVM_single_c( //INPUT __syncthreads(); - if(idB<NFIBRES){ - fractions_deriv_PVM_single_c(params,fs,NFIBRES,idB,f_deriv); + if(idSubVOX<nfib){ + fractions_deriv_PVM_single_c(params,fs,nfib,idSubVOX,f_deriv); } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; - int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++; + int ndir = ndirections/THREADS_BLOCK_FIT; + if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; + int max_dir = ndirections/THREADS_BLOCK_FIT; + if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[NPARAMS]; + double J[MAXNPARAMS]; double sig; double Iso_term; double3 xx; - int dir_iter=idB; - double Aniso_terms[NFIBRES]; + int dir_iter=idSubVOX; + double Aniso_terms[MAXNFIBRES]; __syncthreads(); @@ -434,42 +429,42 @@ __device__ void hess_PVM_single_c( //INPUT for (int p=0; p<nparams; p++) J[p]=0; if(dir<ndir){ Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals); //Precompute some terms for this datapoint - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals); + Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); } sig = 0; - for(int k=0;k<NFIBRES;k++){ + for(int k=0;k<nfib;k++){ int kk = 2+3*(k); xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; sig += fs[k]*Aniso_terms[k]; - J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals); + J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals,ndirections); J[kk] = 0; - for (int j=0; j<NFIBRES; j++){ - if (f_deriv[j*NFIBRES+k]!=0) - J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*NFIBRES+k]; + for (int j=0; j<nfib; j++){ + if (f_deriv[j*nfib+k]!=0) + J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*nfib+k]; } - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals); + J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ //partial_fsum /////////// double partial_fsum=1.0; - for(int j=0;j<(NFIBRES);j++) + for(int j=0;j<(nfib);j++) partial_fsum-=fs[j]; ////////////////////////// double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; //derivative with respect to f0 J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; - sig= params[0]*((temp_f0+(1-sumf-temp_f0)*Iso_term)+sig); - J[1] += params[0]*(1-sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + sig= params[0]*((temp_f0+(1-*sumf-temp_f0)*Iso_term)+sig); + J[1] += params[0]*(1-*sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); }else{ - sig = params[0]*((1-sumf)*Iso_term+sig); - J[1] += params[0]*(1-sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + sig = params[0]*((1-*sumf)*Iso_term+sig); + J[1] += params[0]*(1-*sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); } J[0] = sig/params[0]; } @@ -477,20 +472,20 @@ __device__ void hess_PVM_single_c( //INPUT for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - shared[idB]=2*(J[p]*J[p2]); + reduction[idSubVOX]=2*(J[p]*J[p2]); __syncthreads(); - if(idB==0){ - for(int i=0;i<THREADS_X_BLOCK_FIT;i++){ - hess[p*nparams+p2] += shared[i]; + if(idSubVOX==0){ + for(int i=0;i<THREADS_BLOCK_FIT;i++){ + hess[p*nparams+p2] += reduction[i]; } } __syncthreads(); } } - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=THREADS_BLOCK_FIT; } - if(idB==0){ + if(idSubVOX==0){ for (int j=0; j<nparams; j++) { for (int i=j+1; i<nparams; i++) { hess[i*nparams+j]=hess[j*nparams+i]; @@ -503,79 +498,90 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT const double* data, const double* bvecs, const double* bvals, - const int nvox, - const int nfib, + const int nvox, + const int ndirections, + const int nfib, + const int nparams, const bool m_eval_BIC, const bool m_include_f0, const bool m_return_fanning, //INPUT - OUTPUT double* params) { - int idB = threadIdx.x; + int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; - - __shared__ double myparams[NPARAMS]; - __shared__ int nparams; - - __shared__ double shared[THREADS_X_BLOCK_FIT]; - __shared__ double step[NPARAMS]; - __shared__ double grad[NPARAMS]; - __shared__ double hess[NPARAMS*NPARAMS]; - __shared__ double inverse[NPARAMS]; - __shared__ double pcf; - __shared__ double ncf; - __shared__ double lambda; - __shared__ double cftol; - __shared__ double ltol; - __shared__ double olambda; - __shared__ bool success; - __shared__ bool end; - - __shared__ double fs[NFIBRES]; - __shared__ double f_deriv[NFIBRES*NFIBRES]; - __shared__ double x[NFIBRES*3]; - __shared__ double _d; - __shared__ double sumf; - - if(idB==0){ - if (m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; - - for(int i=0;i<nparams;i++){ - myparams[i]=params[(idVOX*nparams)+i]; - } + int threadsBlock = blockDim.x; + + ////////// DYNAMIC SHARED MEMORY /////////// + extern __shared__ double shared[]; + double* reduction = (double*)shared; //threadsBlock + double* myparams = (double*) &reduction[threadsBlock]; //nparams + double* grad = (double*) &myparams[nparams]; //nparams + double* hess = (double*) &grad[nparams]; //nparams*nparams + double* step = (double*) &hess[nparams*nparams]; //nparams + double* inverse = (double*) &step[nparams]; //nparams + double* pcf = (double*) &inverse[nparams]; //1 + double* ncf = (double*) &pcf[1]; //1 + double* lambda = (double*) &ncf[1]; //1 + double* cftol = (double*) &lambda[1]; //1 + double* ltol = (double*) &cftol[1]; //1 + double* olambda = (double*) <ol[1]; //1 + + double* fs = (double*) &olambda[1]; //nfib + double* f_deriv = (double*) &fs[nfib]; //nfib*nfib + double* x = (double*) &f_deriv[nfib*nfib]; //nfib*3 + double* _d = (double*) &x[nfib*3]; //1 + double* sumf = (double*) &_d[1]; //1 + + double* C = (double*)&sumf[1]; //nparams*nparams; + double* el = (double*)&C[nparams*nparams]; //nparams + + int* indx = (int*)&el[nparams]; //nparams + int* success = (int*) &indx[nparams]; //1 + int* end = (int*) &success[1]; //1 + ////////// DYNAMIC SHARED MEMORY /////////// + + if(idSubVOX<nparams){ + myparams[idSubVOX]=params[(idVOX*nparams)+idSubVOX]; } __syncthreads(); //do the fit - levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,f_deriv,x,_d,sumf,myparams); + levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*ndirections],&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,f_deriv,x,_d,sumf,C,el,indx,myparams); __syncthreads(); // finalise parameters // m_s0-myparams[0] m_d-myparams[1] m_f-m_th-m_ph-myparams[2,3,4,5, etc..] m_f0-myparams[nparams-1] - if(idB==0){ - double m_f[NFIBRES]; // for partial_fsum - + if(idSubVOX==0){ myparams[1] = lambda2d_gpu(myparams[1]); for(int k=0;k<nfib;k++){ int kk = 2 + 3*(k); - myparams[kk] = beta2f_gpu(myparams[kk])*partial_fsum_PVM_single_c(m_f,k); - m_f[k]=myparams[kk]; + //partial_fsum /////////// + double partial_fsum=1.0; + for(int j=0;j<k;j++) + partial_fsum-=myparams[2 + 3*j]; + ////////////////////////// + myparams[kk] = beta2f_gpu(myparams[kk])*partial_fsum; } - if (m_include_f0) - myparams[nparams-1]= beta2f_gpu(myparams[nparams-1])*partial_fsum_PVM_single_c(m_f,nfib); - + if (m_include_f0){ + //partial_fsum /////////// + double partial_fsum=1.0; + for(int j=0;j<(nfib);j++){ + partial_fsum-=myparams[2 + 3*j]; + } + ////////////////////////// + myparams[nparams-1]= beta2f_gpu(myparams[nparams-1])*partial_fsum; + } sort_PVM_single_c(nfib,myparams); + } + __syncthreads(); - for(int i=0;i<nparams;i++){ - params[(idVOX*nparams)+i] = myparams[i]; - } + if(idSubVOX<nparams){ + params[(idVOX*nparams)+idSubVOX] = myparams[idSubVOX]; } } @@ -586,36 +592,37 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT const double* bvecs, const double* bvals, const int nvox, + const int ndirections, const int nfib, + const int nparams, const bool m_include_f0, const bool* includes_f0, //OUTPUT double* residuals) { - int idB = threadIdx.x; + int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; - - __shared__ double myparams[NPARAMS]; - __shared__ int nparams; - __shared__ bool my_include_f0; - __shared__ double val; - __shared__ double _d; - __shared__ double fs[NFIBRES]; - __shared__ double x[NFIBRES*3]; - __shared__ double sumf; - + int threadsBlock = blockDim.x; + + ////////// DYNAMIC SHARED MEMORY /////////// + extern __shared__ double shared[]; + double* myparams = (double*) shared; //nparams + double* fs = (double*) &myparams[nparams]; //nfib + double* x = (double*) &fs[nfib]; //nfib*3 + double* _d = (double*) &x[nfib*3]; //1 + double* sumf = (double*) &_d[1]; //1 + int* my_include_f0 = (int*) &sumf[1]; //1 + ////////// DYNAMIC SHARED MEMORY /////////// + + double val; double predicted_signal; double mydata; + - if(idB==0){ - if (m_include_f0) - nparams = nfib*3 + 3; - else - nparams = nfib*3 + 2; - - my_include_f0 = includes_f0[idVOX]; + if(idSubVOX==0){ + *my_include_f0 = includes_f0[idVOX]; - //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] + //m_s0-myparams[0] m_d-myparams[1] m_f-m_th-m_ph-myparams[2,3,4,5 etc..] m_f0-myparams[nparams-1] myparams[0]=params[(idVOX*nparams)+0]; if(myparams[1]<0) myparams[1] = 0; //This can be due to numerical errors..sqrt @@ -637,10 +644,10 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT myparams[kk+1] = params[(idVOX*nparams)+kk+1]; myparams[kk+2] = params[(idVOX*nparams)+kk+2]; } - if (my_include_f0){ + if (*my_include_f0){ //partial_fsum /////////// partial_fsum=1.0; - for(int j=0;j<NFIBRES;j++) + for(int j=0;j<nfib;j++) partial_fsum-=fs[j]; ////////////////////////// double tmpr=params[(idVOX*nparams)+nparams-1]/partial_fsum; @@ -652,19 +659,19 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT __syncthreads(); - if(idB<nfib){ - int kk = 2+3*idB; + if(idSubVOX<nfib){ + int kk = 2+3*idSubVOX; double sinth,costh,sinph,cosph; sincos(myparams[kk+1],&sinth,&costh); sincos(myparams[kk+2],&sinph,&cosph); - x[idB*3] = sinth*cosph; - x[idB*3+1] = sinth*sinph; - x[idB*3+2] = costh; + x[idSubVOX*3] = sinth*cosph; + x[idSubVOX*3+1] = sinth*sinph; + x[idSubVOX*3+2] = costh; } - if(idB==0){ + if(idSubVOX==0){ double partial_fsum; - sumf=0; + *sumf=0; for(int k=0;k<nfib;k++){ int kk = 2+3*k; ////// partial_fsum ////// @@ -673,44 +680,44 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT partial_fsum-=fs[j]; ////////////////////////// fs[k] = beta2f_gpu(myparams[kk])*partial_fsum; - sumf += fs[k]; + *sumf += fs[k]; } - _d = lambda2d_gpu(myparams[1]); + *_d = lambda2d_gpu(myparams[1]); } - int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT; - if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++; + int ndir = ndirections/threadsBlock; + if(idSubVOX<(ndirections%threadsBlock)) ndir++; double3 x2; - int dir_iter=idB; + int dir_iter=idSubVOX; __syncthreads(); for(int dir=0;dir<ndir;dir++){ - mydata = data[(idVOX*NDIRECTIONS)+dir_iter]; + mydata = data[(idVOX*ndirections)+dir_iter]; predicted_signal=0; //pred = 0; val = 0.0; for(int k=0;k<nfib;k++){ x2.x=x[k*3]; x2.y=x[k*3+1]; x2.z=x[k*3+2]; - val += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]); + val += fs[k]*anisoterm_PVM_single_c(dir_iter,_d,x2,&bvecs[idVOX*3*ndirections],&bvals[idVOX*ndirections],ndirections); } - if (my_include_f0){ + if (*my_include_f0){ //partial_fsum /////////// double partial_fsum=1.0; - for(int j=0;j<NFIBRES;j++) + for(int j=0;j<nfib;j++) partial_fsum-=fs[j]; ////////////////////////// double temp_f0= beta2f_gpu(myparams[nparams-1])*partial_fsum; - predicted_signal = myparams[0]*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); - } - else - predicted_signal = myparams[0]*((1-sumf)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*NDIRECTIONS])+val); + predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*ndirections])+val); + }else{ + predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single_c(dir_iter,_d,&bvals[idVOX*ndirections])+val); + } //residuals=m_data-predicted_signal; - residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal; + residuals[idVOX*ndirections+dir_iter]= mydata - predicted_signal; - dir_iter+=THREADS_X_BLOCK_FIT; + dir_iter+=threadsBlock; } }