From 14fb5977f2bce7ad850bbed5903d3130a04b5ea4 Mon Sep 17 00:00:00 2001 From: Moises Fernandez <moisesf@fmrib.ox.ac.uk> Date: Thu, 11 Apr 2013 15:10:53 +0000 Subject: [PATCH] Changed to use Dynamic Shared Memory --- CUDA/levenberg_marquardt.cu | 270 +++++++++++++++++++----------------- 1 file changed, 142 insertions(+), 128 deletions(-) diff --git a/CUDA/levenberg_marquardt.cu b/CUDA/levenberg_marquardt.cu index 03f09da..4d254fc 100644 --- a/CUDA/levenberg_marquardt.cu +++ b/CUDA/levenberg_marquardt.cu @@ -11,74 +11,78 @@ #include "solver_mult_inverse.cu" #include "diffmodels.cuh" -#include "dim_blocks.h" #include "options.h" //CPU version in nonlin.h __device__ const double EPS_gpu = 2.0e-16; //Losely based on NRinC 20.1 //CPU version in nonlin.cpp -__device__ inline bool zero_cf_diff_conv(double cfo,double cfn,double cftol){ - return(2.0*abs(cfo-cfn) <= cftol*(abs(cfo)+abs(cfn)+EPS_gpu)); +__device__ inline bool zero_cf_diff_conv(double* cfo,double* cfn,double* cftol){ + return(2.0*abs(*cfo-*cfn) <= *cftol*(abs(*cfo)+abs(*cfn)+EPS_gpu)); } __device__ void levenberg_marquardt_PVM_single_gpu( //INPUT const double* mydata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, + const int idSubVOX, double* step, //shared memory double* grad, //shared memory double* hess, //shared memory double* inverse, //shared memory - double &pcf, //shared memory - double &ncf, //shared memory - double &lambda, //shared memory - double &cftol, //shared memory - double <ol, //shared memory - double &olambda, //shared memory - bool &success, //shared memory - bool &end, //shared memory - double* shared, //shared memory + double* pcf, //shared memory + double* ncf, //shared memory + double* lambda, //shared memory + double* cftol, //shared memory + double* ltol, //shared memory + double* olambda, //shared memory + int* success, //shared memory + int* end, //shared memory + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory + double* C, //shared memory + double* el, //shared memory + int* indx, //shared memory //INPUT-OUTPUT double* myparams) //shared memory { int niter=0; int maxiter=200; - if(idB==0){ - end=false; - lambda=0.1; - cftol=1.0e-8; - ltol=1.0e20; - success = true; - olambda = 0.0; - ncf=0; + if(idSubVOX==0){ + *end=false; + *lambda=0.1; + *cftol=1.0e-8; + *ltol=1.0e20; + *success = true; + *olambda = 0.0; + *ncf=0; } - cf_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf); + cf_PVM_single(myparams,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_d,sumf,pcf); __syncthreads(); - while (!(success&&niter++>=maxiter)){ //if success we not increase niter (first condition is true) - //function cost has decreise, we have advanced. - if(success){ - grad_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,grad); + while (!(*success&&niter++>=maxiter)){ //if success we don't increase niter (first condition is true) + //function cost has been decreased, we have advanced. 
+ if(*success){ + grad_PVM_single(myparams,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_d,sumf,grad); __syncthreads(); - hess_PVM_single(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,hess); + hess_PVM_single(myparams,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_d,sumf,hess); } - if(idB==0){ + if(idSubVOX==0){ for (int i=0; i<nparams; i++) { - hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L + hess[(i*nparams)+i]+=*lambda-*olambda; //Levenberg LM_L } - solver(hess,grad,nparams,inverse); + solver(hess,grad,nparams,C,el,indx,inverse); for (int i=0;i<nparams;i++){ step[i]=-inverse[i]; @@ -90,91 +94,96 @@ __device__ void levenberg_marquardt_PVM_single_gpu( //INPUT } __syncthreads(); - cf_PVM_single(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf); + cf_PVM_single(step,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_d,sumf,ncf); - if(idB==0){ - if (success = (ncf < pcf)){ - olambda = 0.0; + if(idSubVOX==0){ + if (*success = (*ncf < *pcf)){ + *olambda = 0.0; for(int i=0;i<nparams;i++){ myparams[i]=step[i]; } - lambda=lambda/10.0; + *lambda=*lambda/10.0; if (zero_cf_diff_conv(pcf,ncf,cftol)){ - end=true; + *end=true; } - pcf=ncf; + *pcf=*ncf; }else{ - olambda=lambda; - lambda=lambda*10.0; - if(lambda> ltol){ - end=true; + *olambda=*lambda; + *lambda=*lambda*10.0; + if(*lambda> *ltol){ + *end=true; } } } __syncthreads(); - if(end) return; + if(*end) return; } } __device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT const double* mydata, const double* bvecs, - const double* bvals, + const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, + const int idSubVOX, double* step, //shared memory double* grad, //shared memory double* hess, //shared memory double* inverse, //shared memory - double &pcf, //shared memory - double &ncf, //shared memory - double &lambda, //shared memory - double &cftol, //shared memory - double <ol, //shared memory - double &olambda, //shared memory - bool &success, //shared memory - bool &end, //shared memory - double* shared, //shared memory + double* pcf, //shared memory + double* ncf, //shared memory + double* lambda, //shared memory + double* cftol, //shared memory + double* ltol, //shared memory + double* olambda, //shared memory + int* success, //shared memory + int* end, //shared memory + double* reduction, //shared memory double* fs, //shared memory double* f_deriv, //shared memory double* x, //shared memory - double &_d, //shared memory - double &sumf, //shared memory + double* _d, //shared memory + double* sumf, //shared memory + double* C, //shared memory + double* el, //shared memory + int* indx, //shared memory //INPUT-OUTPUT double* myparams) //shared memory { int niter=0; int maxiter=200; - if(idB==0){ - end=false; - lambda=0.1; - cftol=1.0e-8; - ltol=1.0e20; - success = true; - olambda = 0.0; - ncf=0; + if(idSubVOX==0){ + *end=false; + *lambda=0.1; + *cftol=1.0e-8; + *ltol=1.0e20; + *success = true; + *olambda = 0.0; + *ncf=0; } - cf_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf); + cf_PVM_single_c(myparams,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_d,sumf,pcf); __syncthreads(); - while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true) - //function cost has decreise, we have advanced. 
- if(success){ - grad_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,grad); + while (!(*success&&niter++ >= maxiter)){ //if success we don't increase niter (first condition is true) + //function cost has been decreased, we have advanced. + if(*success){ + grad_PVM_single_c(myparams,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,f_deriv,x,_d,sumf,grad); __syncthreads(); - hess_PVM_single_c(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,hess); + hess_PVM_single_c(myparams,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,f_deriv,x,_d,sumf,hess); } - if(idB==0){ + if(idSubVOX==0){ for (int i=0; i<nparams; i++) { - hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L + hess[(i*nparams)+i]+=*lambda-*olambda; //Levenberg LM_L } - solver(hess,grad,nparams,inverse); + solver(hess,grad,nparams,C,el,indx,inverse); for (int i=0;i<nparams;i++){ step[i]=-inverse[i]; @@ -186,30 +195,30 @@ __device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT } __syncthreads(); - cf_PVM_single_c(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf); + cf_PVM_single_c(step,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_d,sumf,ncf); - if(idB==0){ - if (success = (ncf < pcf)) { - olambda = 0.0; + if(idSubVOX==0){ + if (*success = (*ncf < *pcf)) { + *olambda = 0.0; for(int i=0;i<nparams;i++){ myparams[i]=step[i]; } - lambda=lambda/10.0; + *lambda=*lambda/10.0; if (zero_cf_diff_conv(pcf,ncf,cftol)){ - end=true; + *end=true; } - pcf=ncf; + *pcf=*ncf; }else{ - olambda=lambda; - lambda=lambda*10.0; - if(lambda> ltol){ - end=true; + *olambda=*lambda; + *lambda=*lambda*10.0; + if(*lambda> *ltol){ + *end=true; } } } __syncthreads(); - if(end) return; + if(*end) return; } } @@ -218,60 +227,65 @@ __device__ void levenberg_marquardt_PVM_multi_gpu( //INPUT const double* mydata, const double* bvecs, const double* bvals, + const int ndirections, + const int nfib, const int nparams, const bool m_include_f0, - const int idB, + const int idSubVOX, double* step, //shared memory double* grad, //shared memory double* hess, //shared memory double* inverse, //shared memory - double &pcf, //shared memory - double &ncf, //shared memory - double &lambda, //shared memory - double &cftol, //shared memory - double <ol, //shared memory - double &olambda, //shared memory - bool &success, //shared memory - bool &end, //shared memory - double* shared, //shared memory + double* pcf, //shared memory + double* ncf, //shared memory + double* lambda, //shared memory + double* cftol, //shared memory + double* ltol, //shared memory + double* olambda, //shared memory + int* success, //shared memory + int* end, //shared memory + double* reduction, //shared memory double* fs, //shared memory double* x, //shared memory - double &_a, //shared memory - double &_b, //shared memory - double &sumf, //shared memory + double* _a, //shared memory + double* _b, //shared memory + double* sumf, //shared memory + double* C, //shared memory + double* el, //shared memory + int* indx, //shared memory //INPUT-OUTPUT double* myparams) //shared memory { int niter=0; int maxiter=200; - if(idB==0){ - end=false; - lambda=0.1; - cftol=1.0e-8; - ltol=1.0e20; - success = true; - olambda = 0.0; - ncf=0; + if(idSubVOX==0){ + *end=false; + *lambda=0.1; + *cftol=1.0e-8; + *ltol=1.0e20; + *success = true; + *olambda = 0.0; + *ncf=0; } - 
cf_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,pcf); + cf_PVM_multi(myparams,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_a,_b,sumf,pcf); __syncthreads(); - while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true) - //function cost has decreise, we have advanced. - if(success){ - grad_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,grad); + while (!(*success&&niter++ >= maxiter)){ //if success we don't increase niter (first condition is true) + //function cost has been decreased, we have advanced. + if(*success){ + grad_PVM_multi(myparams,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_a,_b,sumf,grad); __syncthreads(); - hess_PVM_multi(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,hess); + hess_PVM_multi(myparams,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_a,_b,sumf,hess); } - if(idB==0){ + if(idSubVOX==0){ for (int i=0; i<nparams; i++) { - hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L + hess[(i*nparams)+i]+=*lambda-*olambda; //Levenberg LM_L } - solver(hess,grad,nparams,inverse); + solver(hess,grad,nparams,C,el,indx,inverse); for (int i=0;i<nparams;i++){ step[i]=-inverse[i]; @@ -283,30 +297,30 @@ __device__ void levenberg_marquardt_PVM_multi_gpu( //INPUT } __syncthreads(); - cf_PVM_multi(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,ncf); + cf_PVM_multi(step,mydata,bvecs,bvals,ndirections,nfib,nparams,m_include_f0,idSubVOX,reduction,fs,x,_a,_b,sumf,ncf); - if(idB==0){ - if (success = (ncf < pcf)) { - olambda = 0.0; + if(idSubVOX==0){ + if (*success = (*ncf < *pcf)) { + *olambda = 0.0; for(int i=0;i<nparams;i++){ myparams[i]=step[i]; } - lambda=lambda/10.0; + *lambda=*lambda/10.0; if (zero_cf_diff_conv(pcf,ncf,cftol)){ - end=true; + *end=true; } - pcf=ncf; + *pcf=*ncf; }else{ - olambda=lambda; - lambda=lambda*10.0; - if(lambda> ltol){ - end=true; + *olambda=*lambda; + *lambda=*lambda*10.0; + if(*lambda> *ltol){ + *end=true; } } } __syncthreads(); - if(end) return; + if(*end) return; } } #endif -- GitLab
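Note on the change: every per-voxel scalar that these device functions used to take by reference (double &pcf, bool &success, ...) is now taken as a pointer (double* pcf, int* success), ndirections and nfib become run-time arguments (the compile-time constants from dim_blocks.h are no longer included), and the LU scratch buffers C, el and indx are passed through to solver(). This is the interface shape needed once the calling kernel allocates a single dynamically sized shared-memory block at launch time and carves it into the individual arrays and scalars handed to these functions. That calling kernel is not part of this patch, so the sketch below is only an illustrative guess at such a partition: the kernel name, the buffer name shared_block, the slice sizes and their ordering are assumptions, not code taken from the repository.

// Hypothetical sketch (not from this patch): one way a caller could carve a
// dynamically sized shared-memory block into the pointers that
// levenberg_marquardt_PVM_single_gpu() now expects. Names, sizes and layout
// are assumptions made for illustration only.
__global__ void fit_PVM_single_kernel(const double* data,
                                      const double* bvecs,
                                      const double* bvals,
                                      const int ndirections,
                                      const int nfib,
                                      const int nparams,
                                      const bool m_include_f0,
                                      double* params)   // one parameter set per voxel (block)
{
  extern __shared__ double shared_block[];   // size supplied at kernel launch
  double* ptr = shared_block;

  // Array slices, laid out one after another inside the dynamic block.
  double* myparams  = ptr;  ptr += nparams;
  double* step      = ptr;  ptr += nparams;
  double* grad      = ptr;  ptr += nparams;
  double* hess      = ptr;  ptr += nparams * nparams;
  double* inverse   = ptr;  ptr += nparams;
  double* reduction = ptr;  ptr += blockDim.x;        // per-thread reduction buffer (assumed size)
  double* fs        = ptr;  ptr += nfib;
  double* x         = ptr;  ptr += nfib * 3;          // assumed: one 3-vector per fibre
  double* C         = ptr;  ptr += nparams * nparams; // LU scratch for solver()
  double* el        = ptr;  ptr += nparams;

  // Scalars that used to be "double &" now live in the same block,
  // so they are passed as pointers to single elements.
  double* pcf     = ptr++;
  double* ncf     = ptr++;
  double* lambda  = ptr++;
  double* cftol   = ptr++;
  double* ltol    = ptr++;
  double* olambda = ptr++;
  double* _d      = ptr++;
  double* sumf    = ptr++;

  // Integer scratch placed after all the doubles (keeps the doubles aligned).
  int* int_ptr = (int*)ptr;
  int* indx    = int_ptr;  int_ptr += nparams;
  int* success = int_ptr++;
  int* end     = int_ptr++;

  const int idSubVOX = threadIdx.x;   // thread index within this voxel's block
  const int voxel    = blockIdx.x;

  // Cooperative copy of the starting parameters into shared memory.
  for (int i = idSubVOX; i < nparams; i += blockDim.x)
    myparams[i] = params[voxel * nparams + i];
  __syncthreads();

  levenberg_marquardt_PVM_single_gpu(&data[voxel * ndirections], bvecs, bvals,
                                     ndirections, nfib, nparams, m_include_f0,
                                     idSubVOX, step, grad, hess, inverse,
                                     pcf, ncf, lambda, cftol, ltol, olambda,
                                     success, end, reduction, fs, x,
                                     _d, sumf, C, el, indx, myparams);

  // Write the fitted parameters back to global memory.
  for (int i = idSubVOX; i < nparams; i += blockDim.x)
    params[voxel * nparams + i] = myparams[i];
}

With a layout like this, the launch supplies the total byte count as the third execution-configuration argument, e.g. fit_PVM_single_kernel<<<nvoxels, threads_per_voxel, shmem_bytes>>>(...); that run-time sizing is what the patch title's "Dynamic Shared Memory" refers to, and sizing the block at launch rather than at compile time is also why ndirections and nfib now travel as ordinary parameters instead of coming from dim_blocks.h.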