diff --git a/CUDA/PVM_multi.cu b/CUDA/PVM_multi.cu index 1e15913253c46074b62b430389d7ab4d5701de5d..8584d0d10f059efa1e47b79b2daa3d90ae545a9b 100644 --- a/CUDA/PVM_multi.cu +++ b/CUDA/PVM_multi.cu @@ -16,48 +16,48 @@ ///////////////////////////////////// ///////////////////////////////////// -__device__ inline double isoterm_PVM_multi(const int pt,const double* _a,const double* _b, const double *bvals){ +__device__ inline float isoterm_PVM_multi(const int pt,const float* _a,const float* _b, const float *bvals){ return exp(-*_a*log(1+bvals[pt]**_b)); } -__device__ inline double isoterm_a_PVM_multi(const int pt,const double* _a,const double* _b, const double *bvals){ +__device__ inline float isoterm_a_PVM_multi(const int pt,const float* _a,const float* _b, const float *bvals){ return -log(1+bvals[pt]**_b)*exp(-*_a*log(1+bvals[pt]**_b)); } -__device__ inline double isoterm_b_PVM_multi(const int pt,const double* _a,const double* _b, const double *bvals){ +__device__ inline float isoterm_b_PVM_multi(const int pt,const float* _a,const float* _b, const float *bvals){ return -*_a*bvals[pt]/(1+bvals[pt]**_b)*exp(-*_a*log(1+bvals[pt]**_b)); } -__device__ inline double anisoterm_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; +__device__ inline float anisoterm_PVM_multi(const int pt,const float* _a,const float* _b,const float3 x,const float *bvecs, const float *bvals, const int ndirections){ + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return exp(-*_a*log(1+bvals[pt]**_b*(dp*dp))); } -__device__ inline double anisoterm_a_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; +__device__ inline float anisoterm_a_PVM_multi(const int pt,const float* _a,const float* _b,const float3 x,const float *bvecs, const float *bvals, const int ndirections){ + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return -log(1+bvals[pt]*(dp*dp)**_b)* exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b)); } -__device__ inline double anisoterm_b_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; +__device__ inline float anisoterm_b_PVM_multi(const int pt,const float* _a,const float* _b,const float3 x,const float *bvecs, const float *bvals, const int ndirections){ + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return (-*_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)**_b)*exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b))); } -__device__ inline double anisoterm_th_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ - double sinth,costh,sinph,cosph; +__device__ inline float anisoterm_th_PVM_multi(const int pt,const float* _a,const float* _b,const float3 x,const float _th,const float _ph,const float *bvecs, const float *bvals, const int ndirections){ + float sinth,costh,sinph,cosph; sincos(_th,&sinth,&costh); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; - double dp1 = costh* (bvecs[pt]*cosph + bvecs[ndirections+pt]*sinph) - bvecs[(2*ndirections)+pt]*sinth; + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + float dp1 = costh* (bvecs[pt]*cosph + bvecs[ndirections+pt]*sinph) - bvecs[(2*ndirections)+pt]*sinth; return (-*_a**_b*bvals[pt]/(1+bvals[pt]*(dp*dp)**_b)*exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b))*2*dp*dp1); } -__device__ inline double anisoterm_ph_PVM_multi(const int pt,const double* _a,const double* _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ - double sinth,sinph,cosph; +__device__ inline float anisoterm_ph_PVM_multi(const int pt,const float* _a,const float* _b,const float3 x,const float _th,const float _ph,const float *bvecs, const float *bvals, const int ndirections){ + float sinth,sinph,cosph; sinth=sin(_th); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; - double dp1 = sinth* (-bvecs[pt]*sinph + bvecs[ndirections+pt]*cosph); + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + float dp1 = sinth* (-bvecs[pt]*sinph + bvecs[ndirections+pt]*cosph); return (-*_a**_b*bvals[pt]/(1+bvals[pt]*(dp*dp)**_b)*exp(-*_a*log(1+bvals[pt]*(dp*dp)**_b))*2*dp*dp1); } @@ -67,9 +67,9 @@ __device__ void fix_fsum_PVM_multi( //INPUT int nfib, int nparams, //INPUT - OUTPUT){ - double *params) + float *params) { - double sumf=0; + float sumf=0; if (m_include_f0) sumf=params[nparams-1]; for(int i=0;i<nfib;i++){ @@ -85,9 +85,9 @@ __device__ void fix_fsum_PVM_multi( //INPUT } //in diffmodel.cc -__device__ void sort_PVM_multi(int nfib,double* params) +__device__ void sort_PVM_multi(int nfib,float* params) { - double temp_f, temp_th, temp_ph; + float temp_f, temp_th, temp_ph; // Order vector descending using f parameters as index for(int i=1; i<(nfib); i++){ for(int j=0; j<(nfib-i); j++){ @@ -108,27 +108,27 @@ __device__ void sort_PVM_multi(int nfib,double* params) //cost function PVM_multi __device__ void cf_PVM_multi( //INPUT - const double* params, - const double* mdata, - const double* bvecs, - const double* bvals, + const float* params, + const float* mdata, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* x, //shared memory - double* _a, //shared memory - double* _b, //shared memory - double* sumf, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* x, //shared memory + float* _a, //shared memory + float* _b, //shared memory + float* sumf, //shared memory //OUTPUT double* cfv) { if(idSubVOX<nfib){ int kk = 3+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); fs[idSubVOX] = x2f_gpu(params[kk]); @@ -147,8 +147,8 @@ __device__ void cf_PVM_multi( //INPUT int ndir = ndirections/THREADS_BLOCK_FIT; if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; - double err; - double3 x2; + float err; + float3 x2; int dir_iter=idSubVOX; __syncthreads(); @@ -163,7 +163,7 @@ __device__ void cf_PVM_multi( //INPUT err += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,bvecs,bvals,ndirections); } if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); + float temp_f0=x2f_gpu(params[nparams-1]); err = (abs(params[0])*(temp_f0+((1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)))-mdata[dir_iter]; }else{ err = abs(params[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)-mdata[dir_iter]; @@ -183,27 +183,28 @@ __device__ void cf_PVM_multi( //INPUT //gradient function PVM_multi __device__ void grad_PVM_multi( //INPUT - const double* params, - const double* mdata, - const double* bvecs, - const double* bvals, + const float* params, + const float* mdata, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* x, //shared memory - double* _a, //shared memory - double* _b, //shared memory - double* sumf, //shared memory + float* J, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* x, //shared memory + float* _a, //shared memory + float* _b, //shared memory + float* sumf, //shared memory //OUTPUT - double* grad) + float* grad) { if(idSubVOX<nfib){ int kk = 3+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); fs[idSubVOX] = x2f_gpu(params[kk]); @@ -224,16 +225,16 @@ __device__ void grad_PVM_multi( //INPUT int max_dir = ndirections/THREADS_BLOCK_FIT; if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[MAXNPARAMS]; - double diff; - double sig; - double3 xx; + float* myJ = &J[idSubVOX*nparams]; + float diff; + float sig; + float3 xx; int dir_iter=idSubVOX; __syncthreads(); for(int dir=0;dir<max_dir;dir++){ - for (int p=0; p<nparams; p++) J[p]=0; + for (int p=0; p<nparams; p++) myJ[p]=0; if(dir<ndir){ sig = 0; for(int k=0;k<nfib;k++){ @@ -242,30 +243,30 @@ __device__ void grad_PVM_multi( //INPUT xx.y=x[k*3+1]; xx.z=x[k*3+2]; sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); - J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); - J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + myJ[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + myJ[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + myJ[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + float temp_f0=x2f_gpu(params[nparams-1]); + myJ[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); sig=abs(params[0])*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + myJ[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); }else{ sig = abs(params[0]) * ((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + myJ[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); } diff = sig - mdata[dir_iter]; - J[0] = (params[0]>0?1.0:-1.0)*sig/params[0]; + myJ[0] = (params[0]>0?1.0:-1.0)*sig/params[0]; } for (int p=0;p<nparams;p++){ - reduction[idSubVOX]=2*J[p]*diff; + reduction[idSubVOX]=2*myJ[p]*diff; __syncthreads(); if(idSubVOX==0){ @@ -281,26 +282,27 @@ __device__ void grad_PVM_multi( //INPUT //hessian function PVM_multi __device__ void hess_PVM_multi( //INPUT - const double* params, - const double* bvecs, - const double* bvals, + const float* params, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* x, //shared memory - double* _a, //shared memory - double* _b, //shared memory - double* sumf, //shared memory + float* J, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* x, //shared memory + float* _a, //shared memory + float* _b, //shared memory + float* sumf, //shared memory //OUTPUT - double* hess) + float* hess) { if(idSubVOX<nfib){ int kk = 3+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); fs[idSubVOX] = x2f_gpu(params[kk]); @@ -325,15 +327,15 @@ __device__ void hess_PVM_multi( //INPUT int max_dir = ndirections/THREADS_BLOCK_FIT; if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[MAXNPARAMS]; - double sig; - double3 xx; + float* myJ = &J[idSubVOX*nparams]; + float sig; + float3 xx; int dir_iter=idSubVOX; __syncthreads(); for(int dir=0;dir<max_dir;dir++){ - for (int p=0; p<nparams; p++) J[p]=0; + for (int p=0; p<nparams; p++) myJ[p]=0; if(dir<ndir){ sig = 0; for(int k=0;k<nfib;k++){ @@ -342,32 +344,32 @@ __device__ void hess_PVM_multi( //INPUT xx.y=x[k*3+1]; xx.z=x[k*3+2]; sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); - double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); - J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*cov; - J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); - J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + float cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + myJ[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + myJ[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections); + myJ[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals,ndirections)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*cov; + myJ[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + float temp_f0=x2f_gpu(params[nparams-1]); + myJ[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); sig = abs(params[0])* (temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + myJ[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); }else{ sig = abs(params[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); - J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals); + myJ[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-*sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals); } - J[0] = sig/params[0]; + myJ[0] = sig/params[0]; } for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - reduction[idSubVOX]=2*(J[p]*J[p2]); + reduction[idSubVOX]=2*(myJ[p]*myJ[p2]); __syncthreads(); if(idSubVOX==0){ for(int i=0;i<THREADS_BLOCK_FIT;i++){ @@ -391,10 +393,10 @@ __device__ void hess_PVM_multi( //INPUT //in diffmodel.cc extern "C" __global__ void fit_PVM_multi_kernel( //INPUT - const double* data, - const double* params_PVM_single_c, - const double* bvecs, - const double* bvals, + const float* data, + const float* params_PVM_single_c, + const float* bvecs, + const float* bvals, const int nvox, const int ndirections, const int nfib, @@ -402,7 +404,7 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT const bool m_include_f0, const bool gradnonlin, //OUTPUT - double* params) + float* params) { int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; @@ -410,27 +412,29 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT ////////// DYNAMIC SHARED MEMORY /////////// extern __shared__ double shared[]; - double* reduction = (double*)shared; //threadsBlock - double* myparams = (double*) &reduction[threadsBlock]; //nparams - double* grad = (double*) &myparams[nparams]; //nparams - double* hess = (double*) &grad[nparams]; //nparams*nparams - double* step = (double*) &hess[nparams*nparams]; //nparams - double* inverse = (double*) &step[nparams]; //nparams - double* pcf = (double*) &inverse[nparams]; //1 + double* pcf = (double*) shared; //1 double* ncf = (double*) &pcf[1]; //1 double* lambda = (double*) &ncf[1]; //1 double* cftol = (double*) &lambda[1]; //1 double* ltol = (double*) &cftol[1]; //1 double* olambda = (double*) <ol[1]; //1 - double* fs = (double*) &olambda[1]; //nfib - double* x = (double*) &fs[nfib]; //nfib*3 - double* _a = (double*) &x[nfib*3]; //1 - double* _b = (double*) &_a[1]; //1 - double* sumf = (double*) &_b[1]; //1 + float* J = (float*)&olambda[1]; //threadsBlock*nparams + float* reduction = (float*)&J[threadsBlock*nparams]; //threadsBlock + float* myparams = (float*) &reduction[threadsBlock]; //nparams + float* grad = (float*) &myparams[nparams]; //nparams + float* hess = (float*) &grad[nparams]; //nparams*nparams + float* step = (float*) &hess[nparams*nparams]; //nparams + float* inverse = (float*) &step[nparams]; //nparams - double* C = (double*)&sumf[1]; //nparams*nparams; - double* el = (double*)&C[nparams*nparams]; //nparams + float* fs = (float*) &inverse[nparams]; //nfib + float* x = (float*) &fs[nfib]; //nfib*3 + float* _a = (float*) &x[nfib*3]; //1 + float* _b = (float*) &_a[1]; //1 + float* sumf = (float*) &_b[1]; //1 + + float* C = (float*)&sumf[1]; //nparams*nparams; + float* el = (float*)&C[nparams*nparams]; //nparams int* indx = (int*)&el[nparams]; //nparams int* success = (int*) &indx[nparams]; //1 @@ -464,7 +468,7 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT pos_bvecs=0; } //do the fit - levenberg_marquardt_PVM_multi_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_a,_b,sumf,C,el,indx,myparams); + levenberg_marquardt_PVM_multi_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,J,reduction,fs,x,_a,_b,sumf,C,el,indx,myparams); __syncthreads(); @@ -472,10 +476,10 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] if(idSubVOX==0){ - double aux = myparams[1]; + float aux = myparams[1]; myparams[1] = abs(aux*myparams[2]); - myparams[2] = sqrt(double(abs(aux*myparams[2]*myparams[2]))); + myparams[2] = sqrt(float(abs(aux*myparams[2]*myparams[2]))); for(int i=3,k=0;k<nfib;i+=3,k++){ myparams[i] = x2f_gpu(myparams[i]); } @@ -494,10 +498,10 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT //in diffmodel.cc extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT - const double* data, - const double* params, - const double* bvecs, - const double* bvals, + const float* data, + const float* params, + const float* bvecs, + const float* bvals, const int nvox, const int ndirections, const int nfib, @@ -506,25 +510,25 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT const bool gradnonlin, const bool* includes_f0, //OUTPUT - double* residuals) + float* residuals) { int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; ////////// DYNAMIC SHARED MEMORY /////////// extern __shared__ double shared[]; - double* myparams = (double*) shared; //nparams - double* fs = (double*) &myparams[nparams]; //nfib - double* x = (double*) &fs[nfib]; //nfib*3 - double* _a = (double*) &x[nfib*3]; //1 - double* _b = (double*) &_a[1]; //1 - double* sumf = (double*) &_b[1]; //1 + float* myparams = (float*) shared; //nparams + float* fs = (float*) &myparams[nparams]; //nfib + float* x = (float*) &fs[nfib]; //nfib*3 + float* _a = (float*) &x[nfib*3]; //1 + float* _b = (float*) &_a[1]; //1 + float* sumf = (float*) &_b[1]; //1 int* my_include_f0 = (int*) &sumf[1]; //1 ////////// DYNAMIC SHARED MEMORY /////////// - double val; - double predicted_signal; - double mydata; + float val; + float predicted_signal; + float mydata; if(idSubVOX==0){ *my_include_f0 = includes_f0[idVOX]; @@ -532,8 +536,8 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] myparams[0] = params[(idVOX*nparams)+0]; - double aux1 = params[(idVOX*nparams)+1]; - double aux2 = params[(idVOX*nparams)+2]; + float aux1 = params[(idVOX*nparams)+1]; + float aux2 = params[(idVOX*nparams)+2]; myparams[1] = aux1*aux1/aux2/aux2; //m_d*m_d/m_d_std/m_d_std; myparams[2] = aux2*aux2/aux1; //m_d_std*m_d_std/m_d; // =1/beta @@ -544,7 +548,7 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT if(idSubVOX<nfib){ int kk = 3+3*idSubVOX; - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; myparams[kk] = f2x_gpu(params[(idVOX*nparams)+kk]); myparams[kk+1] = params[(idVOX*nparams)+kk+1]; @@ -572,7 +576,7 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT int ndir = ndirections/THREADS_BLOCK_FIT; if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; - double3 x2; + float3 x2; int dir_iter=idSubVOX; __syncthreads(); @@ -597,7 +601,7 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections); } if (*my_include_f0){ - double temp_f0=x2f_gpu(myparams[nparams-1]); + float temp_f0=x2f_gpu(myparams[nparams-1]); predicted_signal = abs(myparams[0])*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[pos_bvals])+val); }else{ predicted_signal = abs(myparams[0])*((1-*sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[pos_bvals])+val); diff --git a/CUDA/PVM_single.cu b/CUDA/PVM_single.cu index 667e73a018979d5fe1bb8b9f8585d281843a82d6..edcef6fee1aef26279f9c11e415b30305d76e2b7 100644 --- a/CUDA/PVM_single.cu +++ b/CUDA/PVM_single.cu @@ -19,44 +19,44 @@ ///////////////////////////////////// __device__ -inline double isoterm_PVM_single(const int pt,const double* _d,const double *bvals){ +inline float isoterm_PVM_single(const int pt,const float* _d,const float *bvals){ return exp(-bvals[pt]**_d); } __device__ -inline double isoterm_d_PVM_single(const int pt,const double* _d,const double *bvals){ +inline float isoterm_d_PVM_single(const int pt,const float* _d,const float *bvals){ return (-bvals[pt]*exp(-bvals[pt]**_d)); } __device__ -inline double anisoterm_PVM_single(const int pt,const double* _d,const double3 x, const double *bvecs, const double *bvals, const int ndirections){ - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; +inline float anisoterm_PVM_single(const int pt,const float* _d,const float3 x, const float *bvecs, const float *bvals, const int ndirections){ + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return exp(-bvals[pt]**_d*dp*dp); } __device__ -inline double anisoterm_d_PVM_single(const int pt,const double* _d,const double3 x,const double *bvecs, const double *bvals, const int ndirections){ - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; +inline float anisoterm_d_PVM_single(const int pt,const float* _d,const float3 x,const float *bvecs, const float *bvals, const int ndirections){ + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return(-bvals[pt]*dp*dp*exp(-bvals[pt]**_d*dp*dp)); } __device__ -inline double anisoterm_th_PVM_single(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ - double sinth,costh,sinph,cosph; +inline float anisoterm_th_PVM_single(const int pt,const float* _d,const float3 x, const float _th,const float _ph,const float *bvecs, const float *bvals, const int ndirections){ + float sinth,costh,sinph,cosph; sincos(_th,&sinth,&costh); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; - double dp1 = (costh*(bvecs[pt]*cosph+bvecs[ndirections+pt]*sinph)-bvecs[(2*ndirections)+pt]*sinth); + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + float dp1 = (costh*(bvecs[pt]*cosph+bvecs[ndirections+pt]*sinph)-bvecs[(2*ndirections)+pt]*sinth); return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } __device__ -inline double anisoterm_ph_PVM_single(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ - double sinth,sinph,cosph; +inline float anisoterm_ph_PVM_single(const int pt,const float* _d,const float3 x, const float _th,const float _ph,const float *bvecs, const float *bvals, const int ndirections){ + float sinth,sinph,cosph; sinth=sin(_th); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; - double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[ndirections+pt]*cosph); + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + float dp1 = sinth*(-bvecs[pt]*sinph+bvecs[ndirections+pt]*cosph); return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } @@ -66,9 +66,9 @@ __device__ void fix_fsum_PVM_single( //INPUT int nfib, int nparams, //INPUT - OUTPUT){ - double *params) + float *params) { - double sum=0; + float sum=0; if (m_include_f0) sum=params[nparams-1]; for(int i=0;i<nfib;i++){ @@ -82,9 +82,9 @@ __device__ void fix_fsum_PVM_single( //INPUT } //in diffmodel.cc -__device__ void sort_PVM_single(int nfib,double* params) +__device__ void sort_PVM_single(int nfib,float* params) { - double temp_f, temp_th, temp_ph; + float temp_f, temp_th, temp_ph; // Order vector descending using f parameters as index for(int i=1; i<(nfib); i++){ for(int j=0; j<(nfib-i); j++){ @@ -105,26 +105,26 @@ __device__ void sort_PVM_single(int nfib,double* params) //cost function PVM_single __device__ void cf_PVM_single( //INPUT - const double* params, - const double* mdata, - const double* bvecs, - const double* bvals, + const float* params, + const float* mdata, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* x, //shared memory - double* _d, //shared memory - double* sumf, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* x, //shared memory + float* _d, //shared memory + float* sumf, //shared memory //OUTPUT double* cfv) { if(idSubVOX<nfib){ int kk = 2+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); fs[idSubVOX] = x2f_gpu(params[kk]); @@ -145,8 +145,8 @@ __device__ void cf_PVM_single( //INPUT int ndir = ndirections/THREADS_BLOCK_FIT; if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; - double err; - double3 x2; + float err; + float3 x2; int dir_iter=idSubVOX; __syncthreads(); @@ -161,7 +161,7 @@ __device__ void cf_PVM_single( //INPUT err += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,bvecs,bvals,ndirections); } if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); + float temp_f0=x2f_gpu(params[nparams-1]); err= (params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+err))-mdata[dir_iter]; }else{ err = (params[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+err))-mdata[dir_iter]; @@ -180,26 +180,27 @@ __device__ void cf_PVM_single( //INPUT //gradient function PVM_single __device__ void grad_PVM_single( //INPUT - const double* params, - const double* mdata, - const double* bvecs, - const double* bvals, + const float* params, + const float* mdata, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, - const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* x, //shared memory - double* _d, //shared memory - double* sumf, //shared memory + const int idSubVOX, + float* J, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* x, //shared memory + float* _d, //shared memory + float* sumf, //shared memory //OUTPUT - double* grad) + float* grad) { if(idSubVOX<nfib){ int kk = 2+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); fs[idSubVOX] = x2f_gpu(params[kk]); @@ -222,16 +223,16 @@ __device__ void grad_PVM_single( //INPUT int max_dir = ndirections/THREADS_BLOCK_FIT; if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[MAXNPARAMS]; - double diff; - double sig; - double3 xx; + float* myJ = &J[idSubVOX*nparams]; + float diff; + float sig; + float3 xx; int dir_iter=idSubVOX; __syncthreads(); for(int dir=0;dir<max_dir;dir++){ - for (int p=0; p<nparams; p++) J[p]=0; + for (int p=0; p<nparams; p++) myJ[p]=0; if(dir<ndir){ sig = 0; for(int k=0;k<nfib;k++){ @@ -240,27 +241,27 @@ __device__ void grad_PVM_single( //INPUT xx.y=x[k*3+1]; xx.z=x[k*3+2]; sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); - J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); + myJ[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + myJ[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + float temp_f0=x2f_gpu(params[nparams-1]); + myJ[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); sig= params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); }else{ sig = params[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); } diff = sig - mdata[dir_iter]; - J[0] = sig/params[0]; + myJ[0] = sig/params[0]; } for (int p=0;p<nparams;p++){ - reduction[idSubVOX]=2*J[p]*diff; + reduction[idSubVOX]=2*myJ[p]*diff; __syncthreads(); if(idSubVOX==0){ @@ -276,25 +277,26 @@ __device__ void grad_PVM_single( //INPUT //hessian function PVM_single __device__ void hess_PVM_single( //INPUT - const double* params, - const double* bvecs, - const double* bvals, + const float* params, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* x, //shared memory - double* _d, //shared memory - double* sumf, //shared memory + float* J, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* x, //shared memory + float* _d, //shared memory + float* sumf, //shared memory //OUTPUT - double* hess) + float* hess) { if(idSubVOX<nfib){ int kk = 2+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); fs[idSubVOX] = x2f_gpu(params[kk]); @@ -321,15 +323,15 @@ __device__ void hess_PVM_single( //INPUT int max_dir = ndirections/THREADS_BLOCK_FIT; if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[MAXNPARAMS]; - double sig; - double3 xx; + float* myJ = &J[idSubVOX*nparams]; + float sig; + float3 xx; int dir_iter=idSubVOX; __syncthreads(); for(int dir=0;dir<max_dir;dir++){ - for (int p=0; p<nparams; p++) J[p]=0; + for (int p=0; p<nparams; p++) myJ[p]=0; if(dir<ndir){ sig = 0; for(int k=0;k<nfib;k++){ @@ -338,28 +340,28 @@ __device__ void hess_PVM_single( //INPUT xx.y=x[k*3+1]; xx.z=x[k*3+2]; sig += fs[k]*anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); - J[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[1] += (params[1]>0?1.0:-1.0)*params[0]*fs[k]*anisoterm_d_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections); + myJ[kk] = params[0]*(anisoterm_PVM_single(dir_iter,_d,xx,bvecs,bvals,ndirections)-isoterm_PVM_single(dir_iter,_d,bvals)) * two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); + myJ[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ - double temp_f0=x2f_gpu(params[nparams-1]); - J[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); + float temp_f0=x2f_gpu(params[nparams-1]); + myJ[nparams-1]= params[0]*(1-isoterm_PVM_single(dir_iter,_d,bvals))* two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); sig=params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,bvals))+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf-temp_f0)*isoterm_d_PVM_single(dir_iter,_d,bvals); }else{ sig = params[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,bvals)+sig); - J[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); + myJ[1] += (params[1]>0?1.0:-1.0)*params[0]*(1-*sumf)*isoterm_d_PVM_single(dir_iter,_d,bvals); } - J[0] = sig/params[0]; + myJ[0] = sig/params[0]; } for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - reduction[idSubVOX]=2*(J[p]*J[p2]); + reduction[idSubVOX]=2*(myJ[p]*myJ[p2]); __syncthreads(); if(idSubVOX==0){ for(int i=0;i<THREADS_BLOCK_FIT;i++){ @@ -383,9 +385,9 @@ __device__ void hess_PVM_single( //INPUT //in diffmodel.cc extern "C" __global__ void fit_PVM_single_kernel( //INPUT - const double* data, - const double* bvecs, - const double* bvals, + const float* data, + const float* bvecs, + const float* bvals, const int nvox, const int ndirections, const int nfib, @@ -393,7 +395,7 @@ extern "C" __global__ void fit_PVM_single_kernel( //INPUT const bool m_include_f0, const bool gradnonlin, //INPUT-OUTPUT - double* params) + float* params) { int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; @@ -401,26 +403,28 @@ extern "C" __global__ void fit_PVM_single_kernel( //INPUT ////////// DYNAMIC SHARED MEMORY /////////// extern __shared__ double shared[]; - double* reduction = (double*)shared; //threadsBlock - double* myparams = (double*) &reduction[threadsBlock]; //nparams - double* grad = (double*) &myparams[nparams]; //nparams - double* hess = (double*) &grad[nparams]; //nparams*nparams - double* step = (double*) &hess[nparams*nparams]; //nparams - double* inverse = (double*) &step[nparams]; //nparams - double* pcf = (double*) &inverse[nparams]; //1 + double* pcf = (double*) shared; //1 double* ncf = (double*) &pcf[1]; //1 double* lambda = (double*) &ncf[1]; //1 double* cftol = (double*) &lambda[1]; //1 double* ltol = (double*) &cftol[1]; //1 double* olambda = (double*) <ol[1]; //1 - double* fs = (double*) &olambda[1]; //nfib - double* x = (double*) &fs[nfib]; //nfib*3 - double* _d = (double*) &x[nfib*3]; //1 - double* sumf = (double*) &_d[1]; //1 + float* J = (float*)&olambda[1]; //threadsBlock*nparams + float* reduction = (float*)&J[threadsBlock*nparams]; //threadsBlock + float* myparams = (float*) &reduction[threadsBlock]; //nparams + float* grad = (float*) &myparams[nparams]; //nparams + float* hess = (float*) &grad[nparams]; //nparams*nparams + float* step = (float*) &hess[nparams*nparams]; //nparams + float* inverse = (float*) &step[nparams]; //nparams - double* C = (double*)&sumf[1]; //nparams*nparams; - double* el = (double*)&C[nparams*nparams]; //nparams + float* fs = (float*) &inverse[nparams]; //nfib + float* x = (float*) &fs[nfib]; //nfib*3 + float* _d = (float*) &x[nfib*3]; //1 + float* sumf = (float*) &_d[1]; //1 + + float* C = (float*)&sumf[1]; //nparams*nparams; + float* el = (float*)&C[nparams*nparams]; //nparams int* indx = (int*)&el[nparams]; //nparams int* success = (int*) &indx[nparams]; //1 @@ -442,7 +446,7 @@ extern "C" __global__ void fit_PVM_single_kernel( //INPUT pos_bvecs=0; } // do the fit - levenberg_marquardt_PVM_single_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,x,_d,sumf,C,el,indx,myparams); + levenberg_marquardt_PVM_single_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,J,reduction,fs,x,_d,sumf,C,el,indx,myparams); __syncthreads(); @@ -470,10 +474,10 @@ extern "C" __global__ void fit_PVM_single_kernel( //INPUT //in diffmodel.cc extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT - const double* data, - const double* params, - const double* bvecs, - const double* bvals, + const float* data, + const float* params, + const float* bvecs, + const float* bvals, const int nvox, const int ndirections, const int nfib, @@ -482,7 +486,7 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT const bool gradnonlin, const bool* includes_f0, //OUTPUT - double* residuals) + float* residuals) { int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; @@ -490,17 +494,17 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT ////////// DYNAMIC SHARED MEMORY /////////// extern __shared__ double shared[]; - double* myparams = (double*) shared; //nparams - double* fs = (double*) &myparams[nparams]; //nfib - double* x = (double*) &fs[nfib]; //nfib*3 - double* _d = (double*) &x[nfib*3]; //1 - double* sumf = (double*) &_d[1]; //1 + float* myparams = (float*) shared; //nparams + float* fs = (float*) &myparams[nparams]; //nfib + float* x = (float*) &fs[nfib]; //nfib*3 + float* _d = (float*) &x[nfib*3]; //1 + float* sumf = (float*) &_d[1]; //1 int* my_include_f0 = (int*) &sumf[1]; //1 ////////// DYNAMIC SHARED MEMORY /////////// - double val; - double predicted_signal; - double mydata; + float val; + float predicted_signal; + float mydata; if(idSubVOX==0){ *my_include_f0 = includes_f0[idVOX]; @@ -516,7 +520,7 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT if(idSubVOX<nfib){ int kk = 2+3*idSubVOX; - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; myparams[kk] = f2x_gpu(params[(idVOX*nparams)+kk]); myparams[kk+1] = params[(idVOX*nparams)+kk+1]; @@ -542,7 +546,7 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT int ndir = ndirections/threadsBlock; if(idSubVOX<(ndirections%threadsBlock)) ndir++; - double3 x2; + float3 x2; int dir_iter=idSubVOX; __syncthreads(); @@ -567,7 +571,7 @@ extern "C" __global__ void get_residuals_PVM_single_kernel( //INPUT val += fs[k]*anisoterm_PVM_single(dir_iter,_d,x2,&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections); } if (*my_include_f0){ - double temp_f0=x2f_gpu(myparams[nparams-1]); + float temp_f0=x2f_gpu(myparams[nparams-1]); predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single(dir_iter,_d,&bvals[pos_bvals])+val); }else{ predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single(dir_iter,_d,&bvals[pos_bvals])+val); diff --git a/CUDA/PVM_single_c.cu b/CUDA/PVM_single_c.cu index 04864e8d3ac58d4bfa19f57a20f6c62c4fff121e..2d18caa88cb4cb34c05492d1fcd1a8431b8a3dbe 100644 --- a/CUDA/PVM_single_c.cu +++ b/CUDA/PVM_single_c.cu @@ -19,44 +19,44 @@ ///////////////////////////////////// __device__ -inline double isoterm_PVM_single_c(const int pt,const double* _d,const double *bvals){ +inline float isoterm_PVM_single_c(const int pt,const float* _d,const float *bvals){ return exp(-bvals[pt]**_d); } __device__ -inline double isoterm_lambda_PVM_single_c(const int pt,const double lambda,const double *bvals){ +inline float isoterm_lambda_PVM_single_c(const int pt,const float lambda,const float *bvals){ return(-2*bvals[pt]*lambda*exp(-bvals[pt]*lambda*lambda)); } __device__ -inline double anisoterm_PVM_single_c(const int pt,const double* _d,const double3 x, const double *bvecs, const double *bvals, const int ndirections){ - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; +inline float anisoterm_PVM_single_c(const int pt,const float* _d,const float3 x, const float *bvecs, const float *bvals, const int ndirections){ + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return exp(-bvals[pt]**_d*dp*dp); } __device__ -inline double anisoterm_lambda_PVM_single_c(const int pt,const double lambda,const double3 x, const double *bvecs, const double *bvals, const int ndirections){ - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; +inline float anisoterm_lambda_PVM_single_c(const int pt,const float lambda,const float3 x, const float *bvecs, const float *bvals, const int ndirections){ + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; return(-2*bvals[pt]*lambda*dp*dp*exp(-bvals[pt]*lambda*lambda*dp*dp)); } __device__ -inline double anisoterm_th_PVM_single_c(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ - double sinth,costh,sinph,cosph; +inline float anisoterm_th_PVM_single_c(const int pt,const float* _d,const float3 x, const float _th,const float _ph,const float *bvecs, const float *bvals, const int ndirections){ + float sinth,costh,sinph,cosph; sincos(_th,&sinth,&costh); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; - double dp1 = costh*(bvecs[pt]*cosph+bvecs[ndirections+pt]*sinph)-bvecs[(2*ndirections)+pt]*sinth; + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + float dp1 = costh*(bvecs[pt]*cosph+bvecs[ndirections+pt]*sinph)-bvecs[(2*ndirections)+pt]*sinth; return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } __device__ -inline double anisoterm_ph_PVM_single_c(const int pt,const double* _d,const double3 x, const double _th,const double _ph,const double *bvecs, const double *bvals, const int ndirections){ - double sinth,sinph,cosph; +inline float anisoterm_ph_PVM_single_c(const int pt,const float* _d,const float3 x, const float _th,const float _ph,const float *bvecs, const float *bvals, const int ndirections){ + float sinth,sinph,cosph; sinth=sin(_th); sincos(_ph,&sinph,&cosph); - double dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; - double dp1 = sinth*(-bvecs[pt]*sinph+bvecs[ndirections+pt]*cosph); + float dp = bvecs[pt]*x.x+bvecs[ndirections+pt]*x.y+bvecs[(2*ndirections)+pt]*x.z; + float dp1 = sinth*(-bvecs[pt]*sinph+bvecs[ndirections+pt]*cosph); return(-2*bvals[pt]**_d*dp*dp1*exp(-bvals[pt]**_d*dp*dp)); } @@ -66,9 +66,9 @@ inline double anisoterm_ph_PVM_single_c(const int pt,const double* _d,const doub __device__ void fix_fsum_PVM_single_c( //INPUT int nfib, //INPUT - OUTPUT){ - double *fs) + float *fs) { - double sumf=0.0; + float sumf=0.0; for(int i=0;i<nfib;i++){ sumf+=fs[i]; if(sumf>=1){ @@ -80,9 +80,9 @@ __device__ void fix_fsum_PVM_single_c( //INPUT } //in diffmodel.cc -__device__ void sort_PVM_single_c(int nfib,double* params) +__device__ void sort_PVM_single_c(int nfib,float* params) { - double temp_f, temp_th, temp_ph; + float temp_f, temp_th, temp_ph; // Order vector descending using f parameters as index for(int i=1; i<(nfib); i++){ for(int j=0; j<(nfib-i); j++){ @@ -102,22 +102,22 @@ __device__ void sort_PVM_single_c(int nfib,double* params) } __device__ void fractions_deriv_PVM_single_c( //INPUT - const double* params, - const double* fs, + const float* params, + const float* fs, const int nfib, const int idSubVOX, //OUTPUT - double* Deriv) + float* Deriv) { int nparams_per_fibre=3; - double fsum; + float fsum; int k=idSubVOX%nfib; for (int j=0; j<nfib; j++){ Deriv[j*nfib+k]=0; } int kk = 2+(k*nparams_per_fibre); - double sinparamkk = sin(2*params[kk]); + float sinparamkk = sin(2*params[kk]); for (int j=0; j<nfib; j++){ int jj = 2+(j*nparams_per_fibre); @@ -128,7 +128,7 @@ __device__ void fractions_deriv_PVM_single_c( //INPUT } Deriv[j*nfib+k]=sinparamkk*fsum; }else if (j>k){ - double sinparam = sin(params[jj]); + float sinparam = sin(params[jj]); fsum=0; for (int n=0; n<=(j-1); n++){ fsum+=Deriv[n*nfib+k]; @@ -140,26 +140,26 @@ __device__ void fractions_deriv_PVM_single_c( //INPUT //cost function PVM_single_c __device__ void cf_PVM_single_c( //INPUT - const double* params, - const double* mdata, - const double* bvecs, - const double* bvals, + const float* params, + const float* mdata, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* x, //shared memory - double* _d, //shared memory - double* sumf, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* x, //shared memory + float* _d, //shared memory + float* sumf, //shared memory //OUTPUT double* cfv) { if(idSubVOX<nfib){ int kk = 2+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); x[idSubVOX*3] = sinth*cosph; @@ -170,7 +170,7 @@ __device__ void cf_PVM_single_c( //INPUT *_d = lambda2d_gpu(params[1]); *cfv = 0.0; *sumf=0; - double partial_fsum; + float partial_fsum; for(int k=0;k<nfib;k++){ int kk = 2+3*(k); //partial_fsum /////////// @@ -186,8 +186,8 @@ __device__ void cf_PVM_single_c( //INPUT int ndir = ndirections/THREADS_BLOCK_FIT; if(idSubVOX<(ndirections%THREADS_BLOCK_FIT)) ndir++; - double err; - double3 x2; + float err; + float3 x2; int dir_iter=idSubVOX; __syncthreads(); @@ -203,11 +203,11 @@ __device__ void cf_PVM_single_c( //INPUT } if(m_include_f0){ //partial_fsum /////////// - double partial_fsum=1.0; + float partial_fsum=1.0; for(int j=0;j<nfib;j++) partial_fsum-=fs[j]; ////////////////////////// - double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; + float temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; err= (params[0]*((temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,bvals))+err))-mdata[dir_iter]; }else{ err = params[0]*((1-*sumf)*isoterm_PVM_single_c(dir_iter,_d,bvals)+err)-mdata[dir_iter]; @@ -228,27 +228,28 @@ __device__ void cf_PVM_single_c( //INPUT //gradient function PVM_single_c __device__ void grad_PVM_single_c( //INPUT - const double* params, - const double* mdata, - const double* bvecs, - const double* bvals, + const float* params, + const float* mdata, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, - const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* f_deriv, //shared memory - double* x, //shared memory - double* _d, //shared memory - double* sumf, //shared memory + const int idSubVOX, + float* J, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* f_deriv, //shared memory + float* x, //shared memory + float* _d, //shared memory + float* sumf, //shared memory //OUTPUT - double* grad) + float* grad) { if(idSubVOX<nfib){ int kk = 2+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); x[idSubVOX*3] = sinth*cosph; @@ -258,7 +259,7 @@ __device__ void grad_PVM_single_c( //INPUT if(idSubVOX==0){ *_d = lambda2d_gpu(params[1]); *sumf=0; - double partial_fsum; + float partial_fsum; for(int k=0;k<nfib;k++){ int kk = 2+3*(k); //partial_fsum /////////// @@ -283,25 +284,27 @@ __device__ void grad_PVM_single_c( //INPUT int max_dir = ndirections/THREADS_BLOCK_FIT; if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[MAXNPARAMS]; - double diff; - double sig; - double Iso_term; - double3 xx; + float* myJ = &J[idSubVOX*nparams]; + float diff; + float sig; + float Iso_term; + float3 xx; int dir_iter=idSubVOX; - double Aniso_terms[MAXNFIBRES]; + //float Aniso_terms[MAXNFIBRES]; //reuse Shared J --- myJ[kk+1] __syncthreads(); for(int dir=0;dir<max_dir;dir++){ - for (int p=0; p<nparams; p++) J[p]=0; + for (int p=0; p<nparams; p++) myJ[p]=0; if(dir<ndir){ Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals); //Precompute some terms for this datapoint for(int k=0;k<nfib;k++){ + int kk = 2+3*(k) +1; xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); + //Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); + myJ[kk] = anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); } sig = 0; for(int k=0;k<nfib;k++){ @@ -309,39 +312,46 @@ __device__ void grad_PVM_single_c( //INPUT xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - sig += fs[k]*Aniso_terms[k]; - J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals,ndirections); - J[kk] = 0; + sig += fs[k]*myJ[kk+1];//Aniso_terms[k]; + myJ[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals,ndirections); + myJ[kk] = 0; for (int j=0;j<nfib;j++){ if(f_deriv[j*nfib+k]!=0){ - J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*nfib+k]; + //myJ[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*nfib+k]; + myJ[kk] += params[0]*(myJ[2+j*3+1]-Iso_term)*f_deriv[j*nfib+k]; } } - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + } + for(int k=0;k<nfib;k++){ + int kk = 2+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + myJ[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ //partial_fsum /////////// - double partial_fsum=1.0; + float partial_fsum=1.0; for(int j=0;j<(nfib);j++) partial_fsum-=fs[j]; ////////////////////////// - double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; + float temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; //derivative with respect to f0 - J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; + myJ[nparams-1]= params[0]*(1-Iso_term)*sin(float(2*params[nparams-1]))*partial_fsum; sig=params[0]*((temp_f0+(1-*sumf-temp_f0)*Iso_term)+sig); - J[1] += params[0]*(1-*sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + myJ[1] += params[0]*(1-*sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); }else{ sig = params[0]*((1-*sumf)*Iso_term+sig); - J[1] += params[0]*(1-*sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + myJ[1] += params[0]*(1-*sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); } diff = sig - mdata[dir_iter]; - J[0] = sig/params[0]; + myJ[0] = sig/params[0]; } for (int p=0;p<nparams;p++){ - reduction[idSubVOX]=2*J[p]*diff; + reduction[idSubVOX]=2*myJ[p]*diff; __syncthreads(); if(idSubVOX==0){ @@ -358,26 +368,27 @@ __device__ void grad_PVM_single_c( //INPUT //hessian function PVM_single_c __device__ void hess_PVM_single_c( //INPUT - const double* params, - const double* bvecs, - const double* bvals, + const float* params, + const float* bvecs, + const float* bvals, const int ndirections, const int nfib, const int nparams, const bool m_include_f0, const int idSubVOX, - double* reduction, //shared memory - double* fs, //shared memory - double* f_deriv, //shared memory - double* x, //shared memory - double* _d, //shared memory - double* sumf, //shared memory + float* J, //shared memory + float* reduction, //shared memory + float* fs, //shared memory + float* f_deriv, //shared memory + float* x, //shared memory + float* _d, //shared memory + float* sumf, //shared memory //OUTPUT - double* hess) + float* hess) { if(idSubVOX<nfib){ int kk = 2+3*(idSubVOX); - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(params[kk+1],&sinth,&costh); sincos(params[kk+2],&sinph,&cosph); x[idSubVOX*3] = sinth*cosph; @@ -387,7 +398,7 @@ __device__ void hess_PVM_single_c( //INPUT if(idSubVOX==0){ *_d = lambda2d_gpu(params[1]); *sumf=0; - double partial_fsum; + float partial_fsum; for(int k=0;k<nfib;k++){ int kk = 2+3*(k); //partial_fsum /////////// @@ -416,24 +427,26 @@ __device__ void hess_PVM_single_c( //INPUT int max_dir = ndirections/THREADS_BLOCK_FIT; if(ndirections%THREADS_BLOCK_FIT) max_dir++; - double J[MAXNPARAMS]; - double sig; - double Iso_term; - double3 xx; + float* myJ = &J[idSubVOX*nparams]; + float sig; + float Iso_term; + float3 xx; int dir_iter=idSubVOX; - double Aniso_terms[MAXNFIBRES]; + //float Aniso_terms[MAXNFIBRES]; //reuse Shared J --- myJ[kk+1] __syncthreads(); for(int dir=0;dir<max_dir;dir++){ - for (int p=0; p<nparams; p++) J[p]=0; + for (int p=0; p<nparams; p++) myJ[p]=0; if(dir<ndir){ Iso_term=isoterm_PVM_single_c(dir_iter,_d,bvals); //Precompute some terms for this datapoint for(int k=0;k<nfib;k++){ + int kk = 2+3*(k) +1; xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); + //Aniso_terms[k]=anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); + myJ[kk] = anisoterm_PVM_single_c(dir_iter,_d,xx,bvecs,bvals,ndirections); } sig = 0; for(int k=0;k<nfib;k++){ @@ -441,38 +454,44 @@ __device__ void hess_PVM_single_c( //INPUT xx.x=x[k*3]; xx.y=x[k*3+1]; xx.z=x[k*3+2]; - sig += fs[k]*Aniso_terms[k]; - J[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals,ndirections); - J[kk] = 0; + sig += fs[k]*myJ[kk+1];//Aniso_terms[k]; + myJ[1] += params[0]*fs[k]*anisoterm_lambda_PVM_single_c(dir_iter,params[1],xx,bvecs,bvals,ndirections); for (int j=0; j<nfib; j++){ if (f_deriv[j*nfib+k]!=0) - J[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*nfib+k]; + //myJ[kk] += params[0]*(Aniso_terms[j]-Iso_term)*f_deriv[j*nfib+k]; + myJ[kk] += params[0]*(myJ[2+3*j+1]-Iso_term)*f_deriv[j*nfib+k]; } - J[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); - J[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + } + for(int k=0;k<nfib;k++){ + int kk = 2+3*(k); + xx.x=x[k*3]; + xx.y=x[k*3+1]; + xx.z=x[k*3+2]; + myJ[kk+1] = params[0]*fs[k]*anisoterm_th_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); + myJ[kk+2] = params[0]*fs[k]*anisoterm_ph_PVM_single_c(dir_iter,_d,xx,params[kk+1],params[kk+2],bvecs,bvals,ndirections); } if(m_include_f0){ //partial_fsum /////////// - double partial_fsum=1.0; + float partial_fsum=1.0; for(int j=0;j<(nfib);j++) partial_fsum-=fs[j]; ////////////////////////// - double temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; + float temp_f0=beta2f_gpu(params[nparams-1])*partial_fsum; //derivative with respect to f0 - J[nparams-1]= params[0]*(1-Iso_term)*sin(double(2*params[nparams-1]))*partial_fsum; + myJ[nparams-1]= params[0]*(1-Iso_term)*sin(float(2*params[nparams-1]))*partial_fsum; sig= params[0]*((temp_f0+(1-*sumf-temp_f0)*Iso_term)+sig); - J[1] += params[0]*(1-*sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + myJ[1] += params[0]*(1-*sumf-temp_f0)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); }else{ sig = params[0]*((1-*sumf)*Iso_term+sig); - J[1] += params[0]*(1-*sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); + myJ[1] += params[0]*(1-*sumf)*isoterm_lambda_PVM_single_c(dir_iter,params[1],bvals); } - J[0] = sig/params[0]; + myJ[0] = sig/params[0]; } for (int p=0;p<nparams;p++){ for (int p2=p;p2<nparams;p2++){ - reduction[idSubVOX]=2*(J[p]*J[p2]); + reduction[idSubVOX]=2*(myJ[p]*myJ[p2]); __syncthreads(); if(idSubVOX==0){ for(int i=0;i<THREADS_BLOCK_FIT;i++){ @@ -495,9 +514,9 @@ __device__ void hess_PVM_single_c( //INPUT } //in diffmodel.cc extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT - const double* data, - const double* bvecs, - const double* bvals, + const float* data, + const float* bvecs, + const float* bvals, const int nvox, const int ndirections, const int nfib, @@ -507,7 +526,7 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT const bool m_return_fanning, const bool gradnonlin, //INPUT - OUTPUT - double* params) + float* params) { int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; @@ -515,27 +534,29 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT ////////// DYNAMIC SHARED MEMORY /////////// extern __shared__ double shared[]; - double* reduction = (double*)shared; //threadsBlock - double* myparams = (double*) &reduction[threadsBlock]; //nparams - double* grad = (double*) &myparams[nparams]; //nparams - double* hess = (double*) &grad[nparams]; //nparams*nparams - double* step = (double*) &hess[nparams*nparams]; //nparams - double* inverse = (double*) &step[nparams]; //nparams - double* pcf = (double*) &inverse[nparams]; //1 + double* pcf = (double*) shared; //1 double* ncf = (double*) &pcf[1]; //1 double* lambda = (double*) &ncf[1]; //1 double* cftol = (double*) &lambda[1]; //1 double* ltol = (double*) &cftol[1]; //1 double* olambda = (double*) <ol[1]; //1 - double* fs = (double*) &olambda[1]; //nfib - double* f_deriv = (double*) &fs[nfib]; //nfib*nfib - double* x = (double*) &f_deriv[nfib*nfib]; //nfib*3 - double* _d = (double*) &x[nfib*3]; //1 - double* sumf = (double*) &_d[1]; //1 + float* J = (float*)&olambda[1]; //threadsBlock*nparams + float* reduction = (float*)&J[threadsBlock*nparams]; //threadsBlock + float* myparams = (float*) &reduction[threadsBlock]; //nparams + float* grad = (float*) &myparams[nparams]; //nparams + float* hess = (float*) &grad[nparams]; //nparams*nparams + float* step = (float*) &hess[nparams*nparams]; //nparams + float* inverse = (float*) &step[nparams]; //nparams + + float* fs = (float*) &inverse[nparams]; //nfib + float* f_deriv = (float*) &fs[nfib]; //nfib*nfib + float* x = (float*) &f_deriv[nfib*nfib]; //nfib*3 + float* _d = (float*) &x[nfib*3]; //1 + float* sumf = (float*) &_d[1]; //1 - double* C = (double*)&sumf[1]; //nparams*nparams; - double* el = (double*)&C[nparams*nparams]; //nparams + float* C = (float*)&sumf[1]; //nparams*nparams; + float* el = (float*)&C[nparams*nparams]; //nparams int* indx = (int*)&el[nparams]; //nparams int* success = (int*) &indx[nparams]; //1 @@ -557,7 +578,7 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT pos_bvecs=0; } //do the fit - levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,reduction,fs,f_deriv,x,_d,sumf,C,el,indx,myparams); + levenberg_marquardt_PVM_single_c_gpu(&data[idVOX*ndirections],&bvecs[pos_bvecs],&bvals[pos_bvals],ndirections,nfib,nparams,m_include_f0,idSubVOX,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,J,reduction,fs,f_deriv,x,_d,sumf,C,el,indx,myparams); __syncthreads(); @@ -569,7 +590,7 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT for(int k=0;k<nfib;k++){ int kk = 2 + 3*(k); //partial_fsum /////////// - double partial_fsum=1.0; + float partial_fsum=1.0; for(int j=0;j<k;j++) partial_fsum-=myparams[2 + 3*j]; ////////////////////////// @@ -578,7 +599,7 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT if (m_include_f0){ //partial_fsum /////////// - double partial_fsum=1.0; + float partial_fsum=1.0; for(int j=0;j<(nfib);j++){ partial_fsum-=myparams[2 + 3*j]; } @@ -596,10 +617,10 @@ extern "C" __global__ void fit_PVM_single_c_kernel( //INPUT //in diffmodel.cc extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT - const double* data, - const double* params, - const double* bvecs, - const double* bvals, + const float* data, + const float* params, + const float* bvecs, + const float* bvals, const int nvox, const int ndirections, const int nfib, @@ -608,7 +629,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT const bool gradnonlin, const bool* includes_f0, //OUTPUT - double* residuals) + float* residuals) { int idSubVOX = threadIdx.x; int idVOX = blockIdx.x; @@ -616,17 +637,17 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT ////////// DYNAMIC SHARED MEMORY /////////// extern __shared__ double shared[]; - double* myparams = (double*) shared; //nparams - double* fs = (double*) &myparams[nparams]; //nfib - double* x = (double*) &fs[nfib]; //nfib*3 - double* _d = (double*) &x[nfib*3]; //1 - double* sumf = (double*) &_d[1]; //1 + float* myparams = (float*) shared; //nparams + float* fs = (float*) &myparams[nparams]; //nfib + float* x = (float*) &fs[nfib]; //nfib*3 + float* _d = (float*) &x[nfib*3]; //1 + float* sumf = (float*) &_d[1]; //1 int* my_include_f0 = (int*) &sumf[1]; //1 ////////// DYNAMIC SHARED MEMORY /////////// - double val; - double predicted_signal; - double mydata; + float val; + float predicted_signal; + float mydata; if(idSubVOX==0){ @@ -638,7 +659,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT if(myparams[1]<0) myparams[1] = 0; //This can be due to numerical errors..sqrt else myparams[1] = d2lambda_gpu(params[(idVOX*nparams)+1]); - double partial_fsum; + float partial_fsum; for(int k=0;k<nfib;k++){ int kk = 2+3*k; //partial_fsum /////////// @@ -647,7 +668,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT partial_fsum-=fs[j]; ////////////////////////// fs[k] = params[(idVOX*nparams)+kk]; - double tmpr=fs[k]/partial_fsum; + float tmpr=fs[k]/partial_fsum; if (tmpr>1.0) tmpr=1; //This can be due to numerical errors if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt myparams[kk] = f2beta_gpu(tmpr); @@ -660,7 +681,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT for(int j=0;j<nfib;j++) partial_fsum-=fs[j]; ////////////////////////// - double tmpr=params[(idVOX*nparams)+nparams-1]/partial_fsum; + float tmpr=params[(idVOX*nparams)+nparams-1]/partial_fsum; if (tmpr>1.0) tmpr=1; //This can be due to numerical errors..asin if (tmpr<0.0) tmpr=0; //This can be due to numerical errors..sqrt myparams[nparams-1]= f2beta_gpu(tmpr); @@ -671,7 +692,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT if(idSubVOX<nfib){ int kk = 2+3*idSubVOX; - double sinth,costh,sinph,cosph; + float sinth,costh,sinph,cosph; sincos(myparams[kk+1],&sinth,&costh); sincos(myparams[kk+2],&sinph,&cosph); x[idSubVOX*3] = sinth*cosph; @@ -680,7 +701,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT } if(idSubVOX==0){ - double partial_fsum; + float partial_fsum; *sumf=0; for(int k=0;k<nfib;k++){ int kk = 2+3*k; @@ -698,7 +719,7 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT int ndir = ndirections/threadsBlock; if(idSubVOX<(ndirections%threadsBlock)) ndir++; - double3 x2; + float3 x2; int dir_iter=idSubVOX; __syncthreads(); @@ -724,11 +745,11 @@ extern "C" __global__ void get_residuals_PVM_single_c_kernel( //INPUT } if (*my_include_f0){ //partial_fsum /////////// - double partial_fsum=1.0; + float partial_fsum=1.0; for(int j=0;j<nfib;j++) partial_fsum-=fs[j]; ////////////////////////// - double temp_f0= beta2f_gpu(myparams[nparams-1])*partial_fsum; + float temp_f0= beta2f_gpu(myparams[nparams-1])*partial_fsum; predicted_signal = myparams[0]*(temp_f0+(1-*sumf-temp_f0)*isoterm_PVM_single_c(dir_iter,_d,&bvals[pos_bvals])+val); }else{ predicted_signal = myparams[0]*((1-*sumf)*isoterm_PVM_single_c(dir_iter,_d,&bvals[pos_bvals])+val); diff --git a/CUDA/diffmodels.cu b/CUDA/diffmodels.cu index 5ef72763023913b6ec321e0c0a072fcb4716effa..9ca6100d246c1db8b8b72c74b87f09e806b7b207 100644 --- a/CUDA/diffmodels.cu +++ b/CUDA/diffmodels.cu @@ -29,16 +29,16 @@ void fit_PVM_single( //INPUT const vector<ColumnVector> datam_vec, const vector<Matrix> bvecs_vec, const vector<Matrix> bvals_vec, - thrust::device_vector<double> datam_gpu, - thrust::device_vector<double> bvecs_gpu, - thrust::device_vector<double> bvals_gpu, + thrust::device_vector<float> datam_gpu, + thrust::device_vector<float> bvecs_gpu, + thrust::device_vector<float> bvals_gpu, int ndirections, int nfib, bool m_include_f0, bool gradnonlin, string output_file, //OUTPUT - thrust::device_vector<double>& params_gpu) + thrust::device_vector<float>& params_gpu) { std::ofstream myfile; myfile.open (output_file.data(), ios::out | ios::app ); @@ -50,7 +50,7 @@ void fit_PVM_single( //INPUT else nparams = nfib*3 + 2; - thrust::host_vector<double> params_host; + thrust::host_vector<float> params_host; params_host.resize(nvox*nparams); for(int vox=0;vox<nvox;vox++){ @@ -95,7 +95,7 @@ void fit_PVM_single( //INPUT dim3 Dim_Grid(blocks,1); dim3 Dim_Block(THREADS_BLOCK_FIT,1); - int amount_shared = (THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+8)*sizeof(double)+(2+nparams)*sizeof(int); + int amount_shared = 6*sizeof(double)+(THREADS_BLOCK_FIT*nparams+THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+2)*sizeof(float)+(2+nparams)*sizeof(int); myfile << "Shared Memory Used in fit_PVM_single: " << amount_shared << "\n"; @@ -109,16 +109,16 @@ void fit_PVM_single_c( //INPUT const vector<ColumnVector> datam_vec, const vector<Matrix> bvecs_vec, const vector<Matrix> bvals_vec, - thrust::device_vector<double> datam_gpu, - thrust::device_vector<double> bvecs_gpu, - thrust::device_vector<double> bvals_gpu, + thrust::device_vector<float> datam_gpu, + thrust::device_vector<float> bvecs_gpu, + thrust::device_vector<float> bvals_gpu, int ndirections, int nfib, bool m_include_f0, bool gradnonlin, string output_file, //OUTPUT - thrust::device_vector<double>& params_gpu) + thrust::device_vector<float>& params_gpu) { std::ofstream myfile; myfile.open (output_file.data(), ios::out | ios::app ); @@ -130,7 +130,7 @@ void fit_PVM_single_c( //INPUT else nparams = nfib*3 + 2; - thrust::host_vector<double> params_host; + thrust::host_vector<float> params_host; params_host.resize(nvox*nparams); for(int vox=0;vox<nvox;vox++){ @@ -173,7 +173,7 @@ void fit_PVM_single_c( //INPUT dim3 Dim_Grid(blocks,1); dim3 Dim_Block(THREADS_BLOCK_FIT,1); - int amount_shared = (THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+nfib*nfib+8)*sizeof(double)+(2+nparams)*sizeof(int); + int amount_shared = 6*sizeof(double)+(THREADS_BLOCK_FIT*nparams+THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+nfib*nfib+6)*sizeof(float)+(2+nparams)*sizeof(int); myfile << "Shared Memory Used in fit_PVM_single_c: " << amount_shared << "\n"; @@ -184,9 +184,9 @@ void fit_PVM_single_c( //INPUT } void fit_PVM_multi( //INPUT - thrust::device_vector<double> datam_gpu, - thrust::device_vector<double> bvecs_gpu, - thrust::device_vector<double> bvals_gpu, + thrust::device_vector<float> datam_gpu, + thrust::device_vector<float> bvecs_gpu, + thrust::device_vector<float> bvals_gpu, int nvox, int ndirections, int nfib, @@ -194,7 +194,7 @@ void fit_PVM_multi( //INPUT bool gradnonlin, string output_file, //OUTPUT - thrust::device_vector<double>& params_gpu) + thrust::device_vector<float>& params_gpu) { std::ofstream myfile; myfile.open (output_file.data(), ios::out | ios::app ); @@ -209,12 +209,12 @@ void fit_PVM_multi( //INPUT else nparams = nfib*3 + 3; - thrust::device_vector<double> params_PVM_single_c_gpu; //copy params to an auxiliar structure because there are different number of nparams + thrust::device_vector<float> params_PVM_single_c_gpu; //copy params to an auxiliar structure because there are different number of nparams params_PVM_single_c_gpu.resize(nvox*nparams); //between single_c and multi. We must read and write in different structures, thrust::copy(params_gpu.begin(), params_gpu.end(), params_PVM_single_c_gpu.begin()); //maybe 1 block finish before other one read their params. - int amount_shared = (THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+9)*sizeof(double)+(2+nparams)*sizeof(int); + int amount_shared = 6*sizeof(double)+(THREADS_BLOCK_FIT*nparams+THREADS_BLOCK_FIT+5*nparams+2*nparams*nparams+4*nfib+3)*sizeof(float)+(2+nparams)*sizeof(int); myfile << "Shared Memory Used in fit_PVM_multi: " << amount_shared << "\n"; @@ -225,10 +225,10 @@ void fit_PVM_multi( //INPUT } void calculate_tau( //INPUT - thrust::device_vector<double> datam_gpu, - thrust::device_vector<double> params_gpu, - thrust::device_vector<double> bvecs_gpu, - thrust::device_vector<double> bvals_gpu, + thrust::device_vector<float> datam_gpu, + thrust::device_vector<float> params_gpu, + thrust::device_vector<float> bvecs_gpu, + thrust::device_vector<float> bvals_gpu, thrust::host_vector<int> vox_repeat, int nrepeat, int ndirections, @@ -272,9 +272,9 @@ void calculate_tau( //INPUT dim3 Dim_Grid(blocks,1); dim3 Dim_Block(THREADS_BLOCK_FIT,1); - int amount_shared = (nparams+4*nfib+3)*sizeof(double) + sizeof(int); + int amount_shared = (nparams+4*nfib+3)*sizeof(float) + sizeof(int); - thrust::device_vector<double> residuals_gpu; + thrust::device_vector<float> residuals_gpu; residuals_gpu.resize(nvox*ndirections); if(model==1){ @@ -292,7 +292,7 @@ void calculate_tau( //INPUT sync_check("get_residuals_PVM_multi_kernel"); } - thrust::host_vector<double> residuals_host; + thrust::host_vector<float> residuals_host; residuals_host.resize(nvox*ndirections); thrust::copy(residuals_gpu.begin(), residuals_gpu.end(), residuals_host.begin());