Skip to content
Snippets Groups Projects
Commit d4a9f9f7 authored by Moises Fernandez's avatar Moises Fernandez
Browse files

new version Levenberg: several threads per voxel

parent 6b4a6a7d
No related branches found
No related tags found
No related merge requests found
...@@ -17,42 +17,48 @@ ...@@ -17,42 +17,48 @@
///////////////////////////////////// /////////////////////////////////////
__device__ inline double isoterm_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ __device__ inline double isoterm_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){
return exp(double(-_a*log(double(1+bvals[pt]*_b)))); return exp(-_a*log(1+bvals[pt]*_b));
} }
__device__ inline double isoterm_a_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ __device__ inline double isoterm_a_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){
return -log(double(1+bvals[pt]*_b))*exp(double(-_a*log(double(1+bvals[pt]*_b)))); return -log(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b));
} }
__device__ inline double isoterm_b_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){ __device__ inline double isoterm_b_PVM_multi(const int pt,const double _a,const double _b, const double *bvals){
return -_a*bvals[pt]/(1+bvals[pt]*_b)*exp(double(-_a*log(double(1+bvals[pt]*_b)))); return -_a*bvals[pt]/(1+bvals[pt]*_b)*exp(-_a*log(1+bvals[pt]*_b));
} }
__device__ inline double anisoterm_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ __device__ inline double anisoterm_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){
double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
return exp(double(-_a*log(double(1+bvals[pt]*_b*(dp*dp))))); return exp(-_a*log(1+bvals[pt]*_b*(dp*dp)));
} }
__device__ inline double anisoterm_a_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ __device__ inline double anisoterm_a_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){
double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
return -log(double(1+bvals[pt]*(dp*dp)*_b))* exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b)))); return -log(1+bvals[pt]*(dp*dp)*_b)* exp(-_a*log(1+bvals[pt]*(dp*dp)*_b));
} }
__device__ inline double anisoterm_b_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){ __device__ inline double anisoterm_b_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double *bvecs, const double *bvals){
double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
return (-_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))); return (-_a*bvals[pt]*(dp*dp)/ (1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b)));
} }
__device__ inline double anisoterm_th_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){ __device__ inline double anisoterm_th_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){
double sinth,costh,sinph,cosph;
sincos(_th,&sinth,&costh);
sincos(_ph,&sinph,&cosph);
double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
double dp1 = cos(double(_th))* (bvecs[pt]*cos(double(_ph)) + bvecs[NDIRECTIONS+pt]*sin(double(_ph))) - bvecs[(2*NDIRECTIONS)+pt]*sin(double(_th)); double dp1 = costh* (bvecs[pt]*cosph + bvecs[NDIRECTIONS+pt]*sinph) - bvecs[(2*NDIRECTIONS)+pt]*sinth;
return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))*2*dp*dp1); return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1);
} }
__device__ inline double anisoterm_ph_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){ __device__ inline double anisoterm_ph_PVM_multi(const int pt,const double _a,const double _b,const double3 x,const double _th,const double _ph,const double *bvecs, const double *bvals){
double sinth,sinph,cosph;
sinth=sin(_th);
sincos(_ph,&sinph,&cosph);
double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z; double dp = bvecs[pt]*x.x+bvecs[NDIRECTIONS+pt]*x.y+bvecs[(2*NDIRECTIONS)+pt]*x.z;
double dp1 = sin(double(_th))* (-bvecs[pt]*sin(double(_ph)) + bvecs[NDIRECTIONS+pt]*cos(double(_ph))); double dp1 = sinth* (-bvecs[pt]*sinph + bvecs[NDIRECTIONS+pt]*cosph);
return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(double(-_a*log(double(1+bvals[pt]*(dp*dp)*_b))))*2*dp*dp1); return (-_a*_b*bvals[pt]/(1+bvals[pt]*(dp*dp)*_b)*exp(-_a*log(1+bvals[pt]*(dp*dp)*_b))*2*dp*dp1);
} }
//in diffmodel.cc //in diffmodel.cc
...@@ -79,147 +85,99 @@ __device__ void fix_fsum_PVM_multi( //INPUT ...@@ -79,147 +85,99 @@ __device__ void fix_fsum_PVM_multi( //INPUT
} }
//in diffmodel.cc //in diffmodel.cc
__device__ void sort_PVM_multi(int nfib,int nparams,double* params) __device__ void sort_PVM_multi(int nfib,double* params)
{ {
double temp_f, temp_th, temp_ph; double temp_f, temp_th, temp_ph;
// Order vector descending using f parameters as index // Order vector descending using f parameters as index
for(int i=1; i<(nfib); i++){ for(int i=1; i<(nfib); i++){
for(int j=0; j<(nfib-i); j++){ for(int j=0; j<(nfib-i); j++){
if (params[3+j*3] < params[3+i*3]){ if (params[3+j*3] < params[3+(j+1)*3]){
temp_f = params[3+j*3]; temp_f = params[3+j*3];
temp_th = params[3+j*3+1]; temp_th = params[3+j*3+1];
temp_ph = params[3+j*3+2]; temp_ph = params[3+j*3+2];
params[3+j*3] = params[3+i*3]; params[3+j*3] = params[3+(j+1)*3];
params[3+j*3+1] = params[3+i*3+1]; params[3+j*3+1] = params[3+(j+1)*3+1];
params[3+j*3+2] = params[3+i*3+2]; params[3+j*3+2] = params[3+(j+1)*3+2];
params[3+i*3] = temp_f; params[3+(j+1)*3] = temp_f;
params[3+i*3+1] = temp_th; params[3+(j+1)*3+1] = temp_th;
params[3+i*3+2] = temp_ph; params[3+(j+1)*3+2] = temp_ph;
} }
} }
} }
} }
//in diffmodels.cc -- for calculate residuals
__device__ void forwardModel_PVM_multi( //INPUT
const double* p,
const double* bvecs,
const double* bvals,
const int nfib,
const int nparams,
const bool m_include_f0,
//OUTPUT
double* predicted_signal)
{
for(int i=0;i<NDIRECTIONS;i++){
predicted_signal[i]=0; //pred = 0;
}
double val;
double _a = abs(p[1]);
double _b = abs(p[2]);
////////////////////////////////////
double fs[NFIBRES];
double x[NFIBRES*3];
double sumf=0;
double3 x2;
for(int k=0;k<nfib;k++){
int kk = 3+3*k;
fs[k] = x2f_gpu(p[kk]);
sumf += fs[k];
x[k*3] = sin(p[kk+1])*cos(p[kk+2]);
x[k*3+1] = sin(p[kk+1])*sin(p[kk+2]);
x[k*3+2] = cos(p[kk+1]);
}
////////////////////////////////////
for(int i=0;i<NDIRECTIONS;i++){
val = 0.0;
for(int k=0;k<nfib;k++){
x2.x=x[k*3];
x2.y=x[k*3+1];
x2.z=x[k*3+2];
val += fs[k]*anisoterm_PVM_multi(i,_a,_b,x2,bvecs,bvals);
}
if (m_include_f0){
double temp_f0=x2f_gpu(p[nparams-1]);
predicted_signal[i] = abs(p[0])*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+val);
}
else
predicted_signal[i] = abs(p[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+val);
}
}
//in diffmodels.cc -- for calculate residuals
__device__ void get_prediction_PVM_multi( //INPUT
const double* params,
const double* bvecs,
const double* bvals,
const int nfib,
const int nparams,
const bool m_include_f0,
//OUTPUT
double* predicted_signal)
{
//m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1]
double p[NPARAMS];
p[0] = params[0];
p[1] = params[1]*params[1]/params[2]/params[2]; //m_d*m_d/m_d_std/m_d_std;
p[2] = params[2]*params[2]/params[1]; //m_d_std*m_d_std/m_d; // =1/beta
for(int k=0;k<nfib;k++){
int kk = 3+3*k;
p[kk] = f2x_gpu(params[kk]);
p[kk+1] = params[kk+1];
p[kk+2] = params[kk+2];
}
if (m_include_f0)
p[nparams-1]=f2x_gpu(params[nparams-1]);
forwardModel_PVM_multi(p,bvecs,bvals,nfib,nparams,m_include_f0,predicted_signal);
}
//cost function PVM_multi //cost function PVM_multi
__device__ double cf_PVM_multi( //INPUT __device__ void cf_PVM_multi( //INPUT
const double* params, const double* params,
const double* mdata, const double* mdata,
const double* bvecs, const double* bvecs,
const double* bvals, const double* bvals,
const int nparams, const int nparams,
const bool m_include_f0) const bool m_include_f0,
const int idB,
double* shared, //shared memory
double* fs, //shared memory
double* x, //shared memory
double &_a, //shared memory
double &_b, //shared memory
double &sumf, //shared memory
//OUTPUT
double &cfv)
{ {
double cfv = 0.0; if(idB<NFIBRES){
double err; int kk = 3+3*(idB);
double _a= abs(params[1]); double sinth,costh,sinph,cosph;
double _b= abs(params[2]); sincos(params[kk+1],&sinth,&costh);
double fs[NFIBRES]; sincos(params[kk+2],&sinph,&cosph);
double x[NFIBRES*3]; fs[idB] = x2f_gpu(params[kk]);
double sumf=0; x[idB*3] = sinth*cosph;
x[idB*3+1] = sinth*sinph;
x[idB*3+2] = costh;
}
if(idB==0){
_a= abs(params[1]);
_b= abs(params[2]);
cfv = 0.0;
sumf=0;
for(int k=0;k<NFIBRES;k++) sumf+= fs[k];
}
int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
double err;
double3 x2; double3 x2;
int dir_iter=idB;
for(int k=0;k<NFIBRES;k++){ __syncthreads();
int kk = 3+3*(k);
fs[k] = x2f_gpu(params[kk]); shared[idB]=0;
sumf += fs[k]; for(int dir=0;dir<ndir;dir++){
x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
x[k*3+2] = cos(params[kk+1]);
}
for(int i=0;i<NDIRECTIONS;i++){
err = 0.0; err = 0.0;
for(int k=0;k<NFIBRES;k++){ for(int k=0;k<NFIBRES;k++){
x2.x=x[k*3]; x2.x=x[k*3];
x2.y=x[k*3+1]; x2.y=x[k*3+1];
x2.z=x[k*3+2]; x2.z=x[k*3+2];
err += fs[k]*anisoterm_PVM_multi(i,_a,_b,x2,bvecs,bvals); err += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,bvecs,bvals);
} }
if(m_include_f0){ if(m_include_f0){
double temp_f0=x2f_gpu(params[nparams-1]); double temp_f0=x2f_gpu(params[nparams-1]);
err = (abs(params[0])*(temp_f0+((1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+err)))-mdata[i]; err = (abs(params[0])*(temp_f0+((1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)))-mdata[dir_iter];
}else{ }else{
err = abs(params[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+err)-mdata[i]; err = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+err)-mdata[dir_iter];
} }
cfv += err*err; shared[idB]+= err*err;
dir_iter+=THREADS_X_BLOCK_FIT;
} }
return(cfv);
}
__syncthreads();
if(idB==0){
for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
cfv+=shared[i];
}
}
}
//gradient function PVM_multi //gradient function PVM_multi
__device__ void grad_PVM_multi( //INPUT __device__ void grad_PVM_multi( //INPUT
...@@ -229,66 +187,94 @@ __device__ void grad_PVM_multi( //INPUT ...@@ -229,66 +187,94 @@ __device__ void grad_PVM_multi( //INPUT
const double* bvals, const double* bvals,
const int nparams, const int nparams,
const bool m_include_f0, const bool m_include_f0,
const int idB,
double* shared, //shared memory
double* fs, //shared memory
double* x, //shared memory
double &_a, //shared memory
double &_b, //shared memory
double &sumf, //shared memory
//OUTPUT //OUTPUT
double* grad) double* grad)
{ {
double _a= abs(params[1]); if(idB<NFIBRES){
double _b= abs(params[2]); int kk = 3+3*(idB);
double fs[NFIBRES]; double sinth,costh,sinph,cosph;
double x[NFIBRES*3]; sincos(params[kk+1],&sinth,&costh);
double3 xx; sincos(params[kk+2],&sinph,&cosph);
double sumf=0; fs[idB] = x2f_gpu(params[kk]);
x[idB*3] = sinth*cosph;
for(int k=0;k<NFIBRES;k++){ x[idB*3+1] = sinth*sinph;
int kk = 3+3*(k); x[idB*3+2] = costh;
fs[k] = x2f_gpu(params[kk]);
sumf += fs[k];
x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
x[k*3+2] = cos(params[kk+1]);
} }
if(idB==0){
_a= abs(params[1]);
_b= abs(params[2]);
sumf=0;
for(int k=0;k<NFIBRES;k++) sumf+= fs[k];
for (int p=0;p<nparams;p++) grad[p]=0;
}
double J[NPARAMS]; int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
double diff; if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
double sig; int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
for (int p=0;p<nparams;p++) grad[p]=0;
for(int i=0;i<NDIRECTIONS;i++){ double J[NPARAMS];
sig = 0; double diff;
for(int a=0;a<nparams;a++) J[a]=0; double sig;
for(int k=0;k<NFIBRES;k++){ double3 xx;
int kk = 3+3*(k); int dir_iter=idB;
xx.x=x[k*3];
xx.y=x[k*3+1]; __syncthreads();
xx.z=x[k*3+2];
sig += fs[k]*anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals); for(int dir=0;dir<max_dir;dir++){
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(i,_a,_b,xx,bvecs,bvals); for (int p=0; p<nparams; p++) J[p]=0;
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(i,_a,_b,xx,bvecs,bvals); if(dir<ndir){
J[kk] = abs(params[0])*(anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]); sig = 0;
J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); for(int k=0;k<NFIBRES;k++){
J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals); int kk = 3+3*(k);
} xx.x=x[k*3];
if(m_include_f0){ xx.y=x[k*3+1];
double temp_f0=x2f_gpu(params[nparams-1]); xx.z=x[k*3+2];
J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]); sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
sig=abs(params[0])*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals))+sig); J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(i,_a,_b,bvals); J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(i,_a,_b,bvals); J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);
}else{ J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
sig = abs(params[0]) * ((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+sig); J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(i,_a,_b,bvals); }
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(i,_a,_b,bvals); if(m_include_f0){
} double temp_f0=x2f_gpu(params[nparams-1]);
J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
sig=abs(params[0])*((temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals))+sig);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);
}else{
sig = abs(params[0]) * ((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);
}
diff = sig - mdata[i]; diff = sig - mdata[dir_iter];
J[0] = (params[0]>0?1.0:-1.0)*sig/params[0]; J[0] = (params[0]>0?1.0:-1.0)*sig/params[0];
}
for (int p=0;p<nparams;p++) grad[p] += 2*J[p]*diff; for (int p=0;p<nparams;p++){
shared[idB]=2*J[p]*diff;
__syncthreads();
if(idB==0){
for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
grad[p] += shared[i];
}
}
__syncthreads();
}
dir_iter+=THREADS_X_BLOCK_FIT;
} }
} }
//hessian function PVM_multi //hessian function PVM_multi
__device__ void hess_PVM_multi( //INPUT __device__ void hess_PVM_multi( //INPUT
const double* params, const double* params,
...@@ -296,75 +282,105 @@ __device__ void hess_PVM_multi( //INPUT ...@@ -296,75 +282,105 @@ __device__ void hess_PVM_multi( //INPUT
const double* bvals, const double* bvals,
const int nparams, const int nparams,
const bool m_include_f0, const bool m_include_f0,
const int idB,
double* shared, //shared memory
double* fs, //shared memory
double* x, //shared memory
double &_a, //shared memory
double &_b, //shared memory
double &sumf, //shared memory
//OUTPUT
double* hess) double* hess)
{ {
double _a= abs(params[1]); if(idB<NFIBRES){
double _b= abs(params[2]); int kk = 3+3*(idB);
double fs[NFIBRES]; double sinth,costh,sinph,cosph;
double x[NFIBRES*3]; sincos(params[kk+1],&sinth,&costh);
double3 xx; sincos(params[kk+2],&sinph,&cosph);
double sumf=0; fs[idB] = x2f_gpu(params[kk]);
x[idB*3] = sinth*cosph;
for(int k=0;k<NFIBRES;k++){ x[idB*3+1] = sinth*sinph;
int kk = 3+3*(k); x[idB*3+2] = costh;
fs[k] = x2f_gpu(params[kk]);
sumf += fs[k];
x[k*3] = sin(params[kk+1])*cos(params[kk+2]);
x[k*3+1] = sin(params[kk+1])*sin(params[kk+2]);
x[k*3+2] = cos(params[kk+1]);
} }
if(idB==0){
_a= abs(params[1]);
_b= abs(params[2]);
sumf=0;
for(int k=0;k<NFIBRES;k++) sumf+= fs[k];
for (int p=0;p<nparams;p++){
for (int p2=0;p2<nparams;p2++){
hess[p*nparams+p2] = 0;
}
}
}
int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
int max_dir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
if(NDIRECTIONS%THREADS_X_BLOCK_FIT) max_dir++;
double J[NPARAMS]; double J[NPARAMS];
double sig; double sig;
double3 xx;
int dir_iter=idB;
for (int p=0;p<nparams;p++){ __syncthreads();
for (int p2=0;p2<nparams;p2++){
hess[p*nparams+p2] = 0;
}
}
for(int i=0;i<NDIRECTIONS;i++){
sig = 0;
for(int a=0;a<nparams;a++) J[a]=0;
for(int k=0;k<NFIBRES;k++){
int kk = 3+3*(k);
xx.x=x[k*3];
xx.y=x[k*3+1];
xx.z=x[k*3+2];
sig += fs[k]*anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals);
double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(i,_a,_b,xx,bvecs,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(i,_a,_b,xx,bvecs,bvals);
J[kk] = abs(params[0])*(anisoterm_PVM_multi(i,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(i,_a,_b,bvals))*cov;
J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(i,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
}
if(m_include_f0){
double temp_f0=x2f_gpu(params[nparams-1]);
J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(i,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
sig = abs(params[0])* (temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(i,_a,_b,bvals)+sig);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(i,_a,_b,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(i,_a,_b,bvals);
}else{
sig = abs(params[0])*((1-sumf)*isoterm_PVM_multi(i,_a,_b,bvals)+sig);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(i,_a,_b,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(i,_a,_b,bvals);
}
J[0] = sig/params[0]; for(int dir=0;dir<max_dir;dir++){
for (int p=0; p<nparams; p++) J[p]=0;
if(dir<ndir){
sig = 0;
for(int k=0;k<NFIBRES;k++){
int kk = 3+3*(k);
xx.x=x[k*3];
xx.y=x[k*3+1];
xx.z=x[k*3+2];
sig += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
double cov = two_pi_gpu*sign_gpu(params[kk])*1/(1+params[kk]*params[kk]);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_a_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*fs[k]*anisoterm_b_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals);
J[kk] = abs(params[0])*(anisoterm_PVM_multi(dir_iter,_a,_b,xx,bvecs,bvals)-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*cov;
J[kk+1] = abs(params[0])*fs[k]*anisoterm_th_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
J[kk+2] = abs(params[0])*fs[k]*anisoterm_ph_PVM_multi(dir_iter,_a,_b,xx,params[kk+1],params[kk+2],bvecs,bvals);
}
if(m_include_f0){
double temp_f0=x2f_gpu(params[nparams-1]);
J[nparams-1]= abs(params[0])*(1-isoterm_PVM_multi(dir_iter,_a,_b,bvals))*two_pi_gpu*sign_gpu(params[nparams-1])*1/(1+params[nparams-1]*params[nparams-1]);
sig = abs(params[0])* (temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf-temp_f0)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);
}else{
sig = abs(params[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,bvals)+sig);
J[1] += (params[1]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_a_PVM_multi(dir_iter,_a,_b,bvals);
J[2] += (params[2]>0?1.0:-1.0)*abs(params[0])*(1-sumf)*isoterm_b_PVM_multi(dir_iter,_a,_b,bvals);
}
J[0] = sig/params[0];
}
for (int p=0;p<nparams;p++){ for (int p=0;p<nparams;p++){
for (int p2=p;p2<nparams;p2++){ for (int p2=p;p2<nparams;p2++){
hess[p*nparams+p2] += 2*(J[p]*J[p2]);
shared[idB]=2*(J[p]*J[p2]);
__syncthreads();
if(idB==0){
for(int i=0;i<THREADS_X_BLOCK_FIT;i++){
hess[p*nparams+p2] += shared[i];
}
}
__syncthreads();
} }
} }
dir_iter+=THREADS_X_BLOCK_FIT;
} }
for (int j=0; j<nparams; j++) { if(idB==0){
for (int i=j+1; i<nparams; i++) { for (int j=0; j<nparams; j++) {
hess[i*nparams+j]=hess[j*nparams+i]; for (int i=j+1; i<nparams; i++) {
} hess[i*nparams+j]=hess[j*nparams+i];
} }
}
}
} }
//in diffmodel.cc //in diffmodel.cc
...@@ -379,63 +395,80 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT ...@@ -379,63 +395,80 @@ extern "C" __global__ void fit_PVM_multi_kernel( //INPUT
//OUTPUT //OUTPUT
double* params) double* params)
{ {
int id = blockIdx.x * blockDim.x + threadIdx.x; int idB = threadIdx.x;
if (id >=nvox) { return; } int idVOX = blockIdx.x;
int nparams; __shared__ double myparams[NPARAMS];
if (m_include_f0) __shared__ int nparams;
nparams = nfib*3 + 4;
else __shared__ double step[NPARAMS];
nparams = nfib*3 + 3; __shared__ double grad[NPARAMS];
__shared__ double hess[NPARAMS*NPARAMS];
int nparams_single_c = nparams-1; __shared__ double inverse[NPARAMS];
__shared__ double pcf;
double myparams[NPARAMS]; __shared__ double ncf;
double myparams_aux[NPARAMS]; __shared__ double lambda;
double mydata[NDIRECTIONS]; __shared__ double cftol;
__shared__ double ltol;
for(int i=0;i<NDIRECTIONS;i++){ __shared__ double olambda;
mydata[i]=data[(id*NDIRECTIONS)+i]; __shared__ bool success;
} __shared__ bool end;
for(int i=0;i<nparams_single_c;i++){ __shared__ double shared[THREADS_X_BLOCK_FIT];
myparams_aux[i]=params_PVM_single_c[(id*nparams_single_c)+i]; __shared__ double fs[NFIBRES];
} __shared__ double x[NFIBRES*3];
__shared__ double _a;
myparams[0] = myparams_aux[0]; //pvm1.get_s0(); __shared__ double _b;
myparams[1] = 1.0; //start with d=d_std __shared__ double sumf;
for(int i=0,ii=3;i<nfib;i++,ii+=3){
myparams[ii] = f2x_gpu(myparams_aux[ii-1]); if(idB==0){
myparams[ii+1] = myparams_aux[ii]; if (m_include_f0)
myparams[ii+2] = myparams_aux[ii+1]; nparams = nfib*3 + 4;
} else
myparams[2] = myparams_aux[1]; //pvm1.get_d(); nparams = nfib*3 + 3;
if (m_include_f0)
myparams[nparams-1]=f2x_gpu(myparams_aux[nparams_single_c-1]); int nparams_single_c = nparams-1;
myparams[0] = params_PVM_single_c[(idVOX*nparams_single_c)+0]; //pvm1.get_s0();
myparams[1] = 1.0; //start with d=d_std
for(int i=0,ii=3;i<nfib;i++,ii+=3){
myparams[ii] = f2x_gpu(params_PVM_single_c[(idVOX*nparams_single_c)+ii-1]);
myparams[ii+1] = params_PVM_single_c[(idVOX*nparams_single_c)+ii];
myparams[ii+2] = params_PVM_single_c[(idVOX*nparams_single_c)+ii+1];
}
myparams[2] = params_PVM_single_c[(idVOX*nparams_single_c)+1] ; //pvm1.get_d();
if (m_include_f0)
myparams[nparams-1]=f2x_gpu(params_PVM_single_c[(idVOX*nparams_single_c)+nparams_single_c-1]);
}
__syncthreads();
//do the fit //do the fit
levenberg_marquardt_PVM_multi_gpu(mydata, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nparams, m_include_f0, myparams); levenberg_marquardt_PVM_multi_gpu(&data[idVOX*NDIRECTIONS],&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS],nparams,m_include_f0,idB,step,grad,hess,inverse, pcf,ncf,lambda,cftol,ltol,olambda,success,end,shared,fs,x,_a,_b,sumf,myparams);
__syncthreads();
// finalise parameters // finalise parameters
//m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1] //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1]
myparams_aux[1] = myparams[1]; if(idB==0){
double aux = myparams[1];
myparams[1] = abs(myparams_aux[1]*myparams[2]); myparams[1] = abs(aux*myparams[2]);
myparams[2] = sqrt(double(abs(myparams_aux[1]*myparams[2]*myparams[2]))); myparams[2] = sqrt(double(abs(aux*myparams[2]*myparams[2])));
for(int i=3,k=0;k<nfib;i+=3,k++){ for(int i=3,k=0;k<nfib;i+=3,k++){
myparams[i] = x2f_gpu(myparams[i]); myparams[i] = x2f_gpu(myparams[i]);
} }
if (m_include_f0) if (m_include_f0)
myparams[nparams-1]=x2f_gpu(myparams[nparams-1]); myparams[nparams-1]=x2f_gpu(myparams[nparams-1]);
sort_PVM_multi(nfib,nparams,myparams); sort_PVM_multi(nfib,myparams);
fix_fsum_PVM_multi(m_include_f0,nfib,nparams,myparams); fix_fsum_PVM_multi(m_include_f0,nfib,nparams,myparams);
for(int i=0;i<nparams;i++){ for(int i=0;i<nparams;i++){
params[(id*nparams)+i] = myparams[i]; params[(idVOX*nparams)+i] = myparams[i];
//printf("PARAM[%i]: %.20f\n",i,myparams[i]); }
} }
} }
//in diffmodel.cc //in diffmodel.cc
...@@ -451,34 +484,99 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT ...@@ -451,34 +484,99 @@ extern "C" __global__ void get_residuals_PVM_multi_kernel( //INPUT
//OUTPUT //OUTPUT
double* residuals) double* residuals)
{ {
int id = blockIdx.x * blockDim.x + threadIdx.x; int idB = threadIdx.x;
if (id >=nvox) { return; } int idVOX = blockIdx.x;
__shared__ double myparams[NPARAMS];
__shared__ int nparams;
__shared__ bool my_include_f0;
__shared__ double val;
__shared__ double _a;
__shared__ double _b;
__shared__ double fs[NFIBRES];
__shared__ double x[NFIBRES*3];
__shared__ double sumf;
double predicted_signal;
double mydata;
if(idB==0){
if (m_include_f0)
nparams = nfib*3 + 4;
else
nparams = nfib*3 + 3;
int nparams; my_include_f0 = includes_f0[idVOX];
if (m_include_f0)
nparams = nfib*3 + 4; //m_s0-myparams[0] m_d-myparams[1] m_d_std-myparams[2] m_f-m_th-m_ph-myparams[3,4,5,6 etc..] m_f0-myparams[nparams-1]
else
nparams = nfib*3 + 3; myparams[0] = params[(idVOX*nparams)+0];
double aux1 = params[(idVOX*nparams)+1];
double aux2 = params[(idVOX*nparams)+2];
//m_d*m_d/m_d_std/m_d_std;
myparams[1] = aux1*aux1/aux2/aux2;
myparams[2] = aux2*aux2/aux1; //m_d_std*m_d_std/m_d; // =1/beta
if (my_include_f0)
myparams[nparams-1]=f2x_gpu(params[(idVOX*nparams)+nparams-1]);
}
bool my_include_f0 = includes_f0[id]; if(idB<nfib){
int kk = 3+3*idB;
double sinth,costh,sinph,cosph;
myparams[kk] = f2x_gpu(params[(idVOX*nparams)+kk]);
myparams[kk+1] = params[(idVOX*nparams)+kk+1];
myparams[kk+2] = params[(idVOX*nparams)+kk+2];
sincos(myparams[kk+1],&sinth,&costh);
sincos(myparams[kk+2],&sinph,&cosph);
fs[idB] = x2f_gpu(myparams[kk]);
x[idB*3] = sinth*cosph;
x[idB*3+1] = sinth*sinph;
x[idB*3+2] = costh;
}
double myparams[NPARAMS]; __syncthreads();
double mydata[NDIRECTIONS];
for(int i=0;i<nparams;i++){ if(idB==0){
myparams[i]=params[(id*nparams)+i]; _a = abs(myparams[1]);
} _b = abs(myparams[2]);
sumf=0;
for(int k=0;k<nfib;k++){
sumf += fs[k];
}
int ndir = NDIRECTIONS/THREADS_X_BLOCK_FIT;
if(idB<(NDIRECTIONS%THREADS_X_BLOCK_FIT)) ndir++;
double3 x2;
int dir_iter=idB;
for(int i=0;i<NDIRECTIONS;i++){ __syncthreads();
mydata[i]=data[(id*NDIRECTIONS)+i];
}
double predicted_signal[NDIRECTIONS]; for(int dir=0;dir<ndir;dir++){
mydata = data[(idVOX*NDIRECTIONS)+dir_iter];
predicted_signal=0; //pred = 0;
val = 0.0;
for(int k=0;k<nfib;k++){
x2.x=x[k*3];
x2.y=x[k*3+1];
x2.z=x[k*3+2];
val += fs[k]*anisoterm_PVM_multi(dir_iter,_a,_b,x2,&bvecs[idVOX*3*NDIRECTIONS],&bvals[idVOX*NDIRECTIONS]);
}
if (my_include_f0){
double temp_f0=x2f_gpu(myparams[nparams-1]);
predicted_signal = abs(myparams[0])*(temp_f0+(1-sumf-temp_f0)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val);
}
else
predicted_signal = abs(myparams[0])*((1-sumf)*isoterm_PVM_multi(dir_iter,_a,_b,&bvals[idVOX*NDIRECTIONS])+val);
}
get_prediction_PVM_multi(myparams, &bvecs[id*3*NDIRECTIONS], &bvals[id*NDIRECTIONS], nfib, nparams, my_include_f0, predicted_signal); //residuals=m_data-predicted_signal;
residuals[idVOX*NDIRECTIONS+dir_iter]= mydata - predicted_signal;
for(int i=0;i<NDIRECTIONS;i++){ //residuals=m_data-predicted_signal; dir_iter+=THREADS_X_BLOCK_FIT;
residuals[id*NDIRECTIONS+i]= mydata[i] - predicted_signal[i];
} }
} }
This diff is collapsed.
This diff is collapsed.
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "solver_mult_inverse.cu" #include "solver_mult_inverse.cu"
#include "diffmodels.cuh" #include "diffmodels.cuh"
#include "dim_blocks.h"
#include "options.h" #include "options.h"
//CPU version in nonlin.h //CPU version in nonlin.h
...@@ -27,72 +28,93 @@ __device__ void levenberg_marquardt_PVM_single_gpu( //INPUT ...@@ -27,72 +28,93 @@ __device__ void levenberg_marquardt_PVM_single_gpu( //INPUT
const double* bvals, const double* bvals,
const int nparams, const int nparams,
const bool m_include_f0, const bool m_include_f0,
const int idB,
double* step, //shared memory
double* grad, //shared memory
double* hess, //shared memory
double* inverse, //shared memory
double &pcf, //shared memory
double &ncf, //shared memory
double &lambda, //shared memory
double &cftol, //shared memory
double &ltol, //shared memory
double &olambda, //shared memory
bool &success, //shared memory
bool &end, //shared memory
double* shared, //shared memory
double* fs, //shared memory
double* x, //shared memory
double &_d, //shared memory
double &sumf, //shared memory
//INPUT-OUTPUT //INPUT-OUTPUT
double* myparams) double* myparams) //shared memory
{ {
double pcf; int niter=0;
double lambda=0.1; int maxiter=200;
double cftol=1.0e-8;
double ltol=1.0e20; if(idB==0){
end=false;
bool success = true; lambda=0.1;
double olambda = 0.0; cftol=1.0e-8;
double grad[NPARAMS]; ltol=1.0e20;
double hess[NPARAMS*NPARAMS]; success = true;
double inverse[NPARAMS]; olambda = 0.0;
double step[NPARAMS]; ncf=0;
}
double ncf=0;
cf_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf);
int maxiter=200; __syncthreads();
int niter=0;
while (!(success&&niter++>=maxiter)){ //if success we not increase niter (first condition is true)
pcf=cf_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0); //function cost has decreise, we have advanced.
while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true)
//function cost has decreise, we have advanced.
if(success){ if(success){
grad_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad); grad_PVM_single(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,grad);
hess_PVM_single(myparams,bvecs,bvals,nparams,m_include_f0,hess); __syncthreads();
} hess_PVM_single(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,hess);
for (int i=0; i<nparams; i++) {
hess[(i*nparams)+i]+=lambda-olambda;
} }
solver(hess,grad,nparams,inverse); if(idB==0){
for (int i=0; i<nparams; i++) {
for (int i=0;i<nparams;i++){ hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L
step[i]=-inverse[i]; }
}
for(int i=0;i<nparams;i++){ solver(hess,grad,nparams,inverse);
step[i]=myparams[i]+step[i];
}
ncf = cf_PVM_single(step,mydata,bvecs,bvals,nparams,m_include_f0); for (int i=0;i<nparams;i++){
step[i]=-inverse[i];
}
if (success = (ncf < pcf)) { for(int i=0;i<nparams;i++){
olambda = 0.0; step[i]=myparams[i]+step[i];
for(int i=0;i<nparams;i++){
myparams[i]=step[i];
} }
lambda=lambda/10.0; }
if (zero_cf_diff_conv(pcf,ncf,cftol)){ __syncthreads();
return; cf_PVM_single(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf);
}
pcf=ncf; if(idB==0){
}else{ if (success = (ncf < pcf)){
olambda=lambda; olambda = 0.0;
lambda=lambda*10.0; for(int i=0;i<nparams;i++){
if(lambda> ltol){ myparams[i]=step[i];
return; }
lambda=lambda/10.0;
if (zero_cf_diff_conv(pcf,ncf,cftol)){
end=true;
}
pcf=ncf;
}else{
olambda=lambda;
lambda=lambda*10.0;
if(lambda> ltol){
end=true;
}
} }
} }
__syncthreads();
if(end) return;
} }
return;
} }
__device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT __device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT
...@@ -101,72 +123,94 @@ __device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT ...@@ -101,72 +123,94 @@ __device__ void levenberg_marquardt_PVM_single_c_gpu( //INPUT
const double* bvals, const double* bvals,
const int nparams, const int nparams,
const bool m_include_f0, const bool m_include_f0,
const int idB,
double* step, //shared memory
double* grad, //shared memory
double* hess, //shared memory
double* inverse, //shared memory
double &pcf, //shared memory
double &ncf, //shared memory
double &lambda, //shared memory
double &cftol, //shared memory
double &ltol, //shared memory
double &olambda, //shared memory
bool &success, //shared memory
bool &end, //shared memory
double* shared, //shared memory
double* fs, //shared memory
double* f_deriv, //shared memory
double* x, //shared memory
double &_d, //shared memory
double &sumf, //shared memory
//INPUT-OUTPUT //INPUT-OUTPUT
double* myparams) double* myparams) //shared memory
{ {
double pcf; int niter=0;
double lambda=0.1; int maxiter=200;
double cftol=1.0e-8;
double ltol=1.0e20; if(idB==0){
end=false;
bool success = true; lambda=0.1;
double olambda = 0.0; cftol=1.0e-8;
double grad[NPARAMS]; ltol=1.0e20;
double hess[NPARAMS*NPARAMS]; success = true;
double inverse[NPARAMS]; olambda = 0.0;
double step[NPARAMS]; ncf=0;
}
double ncf=0;
cf_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,pcf);
int maxiter=200; __syncthreads();
int niter=0;
pcf=cf_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0);
while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true) while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true)
//function cost has decreise, we have advanced. //function cost has decreise, we have advanced.
if(success){ if(success){
grad_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad); grad_PVM_single_c(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,grad);
hess_PVM_single_c(myparams,bvecs,bvals,nparams,m_include_f0,hess); __syncthreads();
hess_PVM_single_c(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,f_deriv,x,_d,sumf,hess);
} }
for (int i=0; i<nparams; i++) { if(idB==0){
hess[(i*nparams)+i]+=lambda-olambda; for (int i=0; i<nparams; i++) {
} hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L
}
solver(hess,grad,nparams,inverse); solver(hess,grad,nparams,inverse);
for (int i=0;i<nparams;i++){ for (int i=0;i<nparams;i++){
step[i]=-inverse[i]; step[i]=-inverse[i];
} }
for(int i=0;i<nparams;i++){ for(int i=0;i<nparams;i++){
step[i]=myparams[i]+step[i]; step[i]=myparams[i]+step[i];
}
ncf = cf_PVM_single_c(step,mydata,bvecs,bvals,nparams,m_include_f0);
if (success = (ncf < pcf)) {
olambda = 0.0;
for(int i=0;i<nparams;i++){
myparams[i]=step[i];
} }
lambda=lambda/10.0; }
if (zero_cf_diff_conv(pcf,ncf,cftol)){ __syncthreads();
return; cf_PVM_single_c(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_d,sumf,ncf);
}
pcf=ncf; if(idB==0){
}else{ if (success = (ncf < pcf)) {
olambda=lambda; olambda = 0.0;
lambda=lambda*10.0; for(int i=0;i<nparams;i++){
if(lambda> ltol){ myparams[i]=step[i];
return; }
} lambda=lambda/10.0;
}
if (zero_cf_diff_conv(pcf,ncf,cftol)){
end=true;
}
pcf=ncf;
}else{
olambda=lambda;
lambda=lambda*10.0;
if(lambda> ltol){
end=true;
}
}
}
__syncthreads();
if(end) return;
} }
return;
} }
...@@ -176,71 +220,93 @@ __device__ void levenberg_marquardt_PVM_multi_gpu( //INPUT ...@@ -176,71 +220,93 @@ __device__ void levenberg_marquardt_PVM_multi_gpu( //INPUT
const double* bvals, const double* bvals,
const int nparams, const int nparams,
const bool m_include_f0, const bool m_include_f0,
const int idB,
double* step, //shared memory
double* grad, //shared memory
double* hess, //shared memory
double* inverse, //shared memory
double &pcf, //shared memory
double &ncf, //shared memory
double &lambda, //shared memory
double &cftol, //shared memory
double &ltol, //shared memory
double &olambda, //shared memory
bool &success, //shared memory
bool &end, //shared memory
double* shared, //shared memory
double* fs, //shared memory
double* x, //shared memory
double &_a, //shared memory
double &_b, //shared memory
double &sumf, //shared memory
//INPUT-OUTPUT //INPUT-OUTPUT
double* myparams) double* myparams) //shared memory
{ {
double pcf; int niter=0;
double lambda=0.1; int maxiter=200;
double cftol=1.0e-8;
double ltol=1.0e20; if(idB==0){
end=false;
bool success = true; lambda=0.1;
double olambda = 0.0; cftol=1.0e-8;
double grad[NPARAMS]; ltol=1.0e20;
double hess[NPARAMS*NPARAMS]; success = true;
double inverse[NPARAMS]; olambda = 0.0;
double step[NPARAMS]; ncf=0;
}
double ncf=0;
cf_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,pcf);
int maxiter=200; __syncthreads();
int niter=0;
pcf=cf_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0);
while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true) while (!(success&&niter++ >= maxiter)){ //if success we not increase niter (first condition is true)
//function cost has decreise, we have advanced. //function cost has decreise, we have advanced.
if(success){ if(success){
grad_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,grad); grad_PVM_multi(myparams,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,grad);
hess_PVM_multi(myparams,bvecs,bvals,nparams,m_include_f0,hess); __syncthreads();
} hess_PVM_multi(myparams,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,hess);
for (int i=0; i<nparams; i++) {
hess[(i*nparams)+i]+=lambda-olambda;
} }
solver(hess,grad,nparams,inverse); if(idB==0){
for (int i=0; i<nparams; i++) {
for (int i=0;i<nparams;i++){ hess[(i*nparams)+i]+=lambda-olambda; //Levenberg LM_L
step[i]=-inverse[i]; }
}
for(int i=0;i<nparams;i++){ solver(hess,grad,nparams,inverse);
step[i]=myparams[i]+step[i];
}
ncf = cf_PVM_multi(step,mydata,bvecs,bvals,nparams,m_include_f0); for (int i=0;i<nparams;i++){
step[i]=-inverse[i];
}
if (success = (ncf < pcf)) { for(int i=0;i<nparams;i++){
olambda = 0.0; step[i]=myparams[i]+step[i];
for(int i=0;i<nparams;i++){
myparams[i]=step[i];
} }
lambda=lambda/10.0; }
if (zero_cf_diff_conv(pcf,ncf,cftol)){ __syncthreads();
return; cf_PVM_multi(step,mydata,bvecs,bvals,nparams,m_include_f0,idB,shared,fs,x,_a,_b,sumf,ncf);
}
pcf=ncf; if(idB==0){
}else{ if (success = (ncf < pcf)) {
olambda=lambda; olambda = 0.0;
lambda=lambda*10.0; for(int i=0;i<nparams;i++){
if(lambda> ltol){ myparams[i]=step[i];
return; }
} lambda=lambda/10.0;
}
if (zero_cf_diff_conv(pcf,ncf,cftol)){
end=true;
}
pcf=ncf;
}else{
olambda=lambda;
lambda=lambda*10.0;
if(lambda> ltol){
end=true;
}
}
}
__syncthreads();
if(end) return;
} }
return;
} }
#endif #endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment