Commit e3e53e07 authored by Saad Jbabdi

Add GPs

parent a77b77a8
%
% This script is adapted from that which I used to generate figures for the
% Gaussian Processes talk I gave on Feb-11 2021. The idea is to read the
% comments for a bit of a reminder of the theory, and then test the code.
% I tend to run 'matlab -nodesktop' and then cut and paste from an editor
% into the Matlab terminal window. But I hope it works well in the Matlab
% GUI too for those who prefer that.
%
%
% This is how I generated the 1D test data.
% Uncomment if you want to generate new data, otherwise load data_exp_cos.mat
% to get the exact data that I used.
%
% x = 3*pi*rand(100,1);
% f = exp(-0.2*x).*cos(x) + 0.2*randn(100,1);
load ./data/data_exp_cos
% Take a look at the data
figure
plot(x,f,'d','MarkerSize',5,'LineWidth',2);
set(gca,'FontSize',16);
%
% As you can see, this is quite noisy data that has been sampled quite densely.
% So our task here is not so much interpolation as trying to predict what the
% "true" function values should be (i.e. smoothing).
%
% Let us start with looking at the traditional way of doing it
[xm,indx] = sort(x);
fm = f(indx);
X = [ones(100,1) xm xm.^2 xm.^3 xm.^4]; % Fourth order polynomial
% Below we plot the full model versus the data. If you, for example, want to
% see what a 2nd order polynomial would look like, you just change
% X*pinv(X)*fm to X(:,1:3)*pinv(X(:,1:3))*fm. Or add it to the existing plot
% in a new colour, as in the example after the plot below.
figure
plot(xm,fm,'d','MarkerSize',5,'LineWidth',2);
set(gca,'FontSize',16);
hold on;
plot(xm,X*pinv(X)*fm,'r','LineWidth',2);
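% For example, adding the 2nd order polynomial (the first three columns
% of X) to the same plot in a different colour:
plot(xm,X(:,1:3)*pinv(X(:,1:3))*fm,'g','LineWidth',2);
legend('data','4th order','2nd order');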
%
% So, as you can see above, you will get very different results for
% different order polynomials. If you don't have a theoretical reason
% for choosing a particular model, it can be very hard to decide between them.
% This is where Gaussian Processes come in
%
% As you may remember, the first thing we want to do is to decide on a
% distance metric that can then be combined with some function to give
% us a covariance function.
% In this case it is quite clear what the distance metric should be,
% it is simply the distance between the points along the x-axis.
% But it is not always so clear, and this is the first point at which
% one might need to think a little.
% Remember the GP model for diffusion data, where I tried two different
% distance metrics: a) the inner product of the diffusion gradients and
% b) the smallest angle between the extended diffusion gradients.
%
% So, let's start with creating a matrix of distances between each point
% and every other point.
%
D = zeros(length(x));
for i=1:length(x)
    D(:,i) = abs(x - x(i));
end
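% (In newer versions of Matlab, R2016b onwards, the loop above can be
% replaced by a single line using implicit expansion: D = abs(x - x');
% which gives exactly the same matrix.)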
figure
imagesc(D)
axis square
%
% Once we have a distance matrix we can easily convert it to a covariance
% matrix by applying a function to it, for example the squared exponential.
% Take a look at slide 48 if you need to remind yourselves.
%
ell = 1;
sigmaf = 1;
K = sigmaf^2 * exp(-0.5*(D.^2)/ell^2);
figure
imagesc(K)
axis square
%
% But that doesn't look at all like the covariance matrices we are used
% to seeing associated with GPs. I did this on purpose to stress that
% the familiar appearance only arises because they tend to be sorted
% w.r.t. the independent variable (x). But this is not at all needed
% for GPs to work, and it is typically not even possible when the
% independent variable has more than one dimension.
% But if you want to see the K-matrix the way it typically looks, you
% just need to sort the x-values.
%
[x,indx] = sort(x);
f = f(indx);
D = zeros(length(x));
for i=1:length(x)
    D(:,i) = abs(x - x(i));
end
K = sigmaf^2 * exp(-0.5*(D.^2)/ell^2);
figure
subplot(1,2,1)
imagesc(D)
title('D')
axis square
subplot(1,2,2)
imagesc(K)
title('K')
axis square
%
% Feel free to play around a little with ell and sigmaf, to see what
% effect they have on K.
%
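% For example, a shorter and a longer length scale side by side
% (these particular values are just suggestions):
figure
subplot(1,2,1)
imagesc(sigmaf^2 * exp(-0.5*(D.^2)/0.5^2))
title('ell = 0.5')
axis square
subplot(1,2,2)
imagesc(sigmaf^2 * exp(-0.5*(D.^2)/2^2))
title('ell = 2')
axis square
%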
%
% The squared exponential covariance function is _very_ dominant,
% and it imposes a "nice" smoothness on the GP solution, one that is
% easily and intuitively manipulated through ell. So it is probably
% reasonable to have it as one's first choice for testing.
%
% But it is far from the only possible choice, and you may, for example,
% want to play around with covariance functions from the Matern
% family (named after the Swedish statistician Bertil Matern).
% As indicated in the slides (slide 59), the general form is only
% defined in the limit as d->0 (d for distance), so it is typically
% only used for the special cases nu=i+1/2, where i is a non-negative
% integer. When nu->Inf the Matern kernel becomes identical to the
% squared exponential. So let's look at nu=1/2, since that is the one
% that is most distinct from the squared exponential.
%
Km = sigmaf^2 * exp(-D/ell); % Note that we use the same parameters
figure
subplot(1,2,1)
imagesc(K)
title('K - Squared exponential')
axis square
subplot(1,2,2)
imagesc(Km)
title('K - Matern, nu=1/2')
axis square
%
% We can now play around with drawing sample-functions from
% covariance-matrices such as these. You can think of them as
% priors on the functions.
%
% We draw samples in exactly the same way as we would draw samples
% from any multivariate Gaussian given a mean-vector mu and a
% covariance-matrix Sigma. Remember that we assume a zero-mean for
% our GP functions.
%
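% (As a quick reminder of the mechanics: if Sigma = L*L' is a Cholesky
% factorisation, then mu + L*randn(n,1) has mean mu and covariance Sigma.
% That is exactly what the chol/randn lines further down do, with mu = 0.)
%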
% We could use the covariance matrices that we have already
% calculated for this. But these are just realisations of the
% underlying covariance functions, sampled on the particular
% x-values in our x-vector. Our samples will look nicer if
% we create covariance matrices more akin to the underlying
% smooth covariance functions sampled on a dense regular grid.
% Let us call them cK and cKm respectively (c for "continuous").
%
cx = 0:0.01:3*pi;
cK = toeplitz(sigmaf^2*exp(-0.5*(cx.^2)/ell^2)); % Do you understand this trick?
cKm = toeplitz(sigmaf^2*exp(-cx/ell));
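% (The trick: cx is a regular grid starting at 0, and the kernel only
% depends on the distance |x_i - x_j|. So the vector above is exactly the
% first row of the covariance matrix, and toeplitz replicates it along
% the diagonals to fill in the rest.)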
figure
subplot(1,2,1)
imagesc(cK)
title('K - Squared exponential')
axis square
subplot(1,2,2)
imagesc(cKm)
title('K - Matern, nu=1/2')
axis square
% Finally, let us draw some sample functions
L = chol(cK+1e-6*eye(size(cK)),'lower'); % Stabilised with tiny nudge to diagonal
Lm = chol(cKm+1e-6*eye(size(cKm)),'lower');
cf = L * randn(length(L),4); % 4 different realisations
cfm = Lm * randn(length(Lm),4);
figure
subplot(1,2,1)
plot(cx,cf);
title('Samples from squared exp');
axis([0 10 -3 3])
subplot(1,2,2)
plot(cx,cfm);
title('Samples from Matern, nu=1/2');
axis([0 10 -3 3])
%
% Play around a little with changing the values for sigmaf and ell. Maybe
% even have a go with a different Matern.
%
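% For example, the nu=3/2 member of the family has the standard closed
% form sigmaf^2*(1+sqrt(3)*d/ell)*exp(-sqrt(3)*d/ell). Sampled on the
% same dense grid, so that it can be compared directly with cK and cKm:
cKm32 = toeplitz(sigmaf^2 * (1 + sqrt(3)*cx/ell) .* exp(-sqrt(3)*cx/ell));
Lm32 = chol(cKm32 + 1e-6*eye(size(cKm32)),'lower'); % same tiny diagonal nudge
figure
plot(cx, Lm32*randn(length(Lm32),4)); % 4 realisations, as before
title('Samples from Matern, nu=3/2');
axis([0 10 -3 3])
%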
%
% So, we have drawn some samples from the prior. Now let us introduce some
% data.
%
% The general "form" for calculating a value of f given an x, when we have
% a set of {x,f} observations and a covariance matrix K is
%
% ff = k * inv(K) * f;
%
% where K is the covariance matrix for the data and where k is a
% "covariance vector", with the covariances between the point
% we want to predict and the data points. k is calculated in
% exactly the same way as we calculated K, but now with the distances
% between the x for ff and all the x's for the data.
% So, let us say we want to know the "best" value of f for x=5,
% and let us say that we want to use the Matern K with nu=0.5,
% sigmaf = 1 and ell = 1.
%
k = sigmaf^2 * exp(-(abs(x' - 5))/ell);
ff = k * inv(Km) * f;
figure
plot(x,f,'*b');
hold on
plot(5,ff,'dr','MarkerSize',15,'MarkerFaceColor','r')
axis([0 10 -1 1.5])
% So we have a prediction for a single value of x, but we want to predict
% the function. So we use the same trick again with dense sampling on a
% regular grid. Note that k now becomes a matrix, but I tend to think of
% it as a "set of vectors" that are usefully represented as a matrix when
% working in matlab.
cx = 0:0.01:10;
k = zeros(length(cx),length(Km));
for i=1:length(cx)
    k(i,:) = sigmaf^2 * exp(-(abs(x' - cx(i)))/ell);
end
cf = k * inv(Km) * f;
figure
plot(x,f,'*b');
hold on
plot(cx,cf,'-r')
axis([0 10 -1 1.5])
%
% Right, not the most plausible function, is it? This is because we haven't
% allowed for the possibility of any error in our data points, so the
% function pretty much has to tie itself up into a knot to go through
% all of them. In fact, if you had tried this with the squared exponential
% it would have "blown up", because it cannot reconcile the smoothness of
% the prior with the data.
%
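% (If you want to convince yourself of this, compare the condition numbers
% of the two covariance matrices, e.g. cond(K) versus cond(Km). The
% squared exponential one is vastly more ill-conditioned, which is why
% inverting it without any added noise is numerically hopeless.)
%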
% So, we need an additional hyper-parameter (in addition to sigmaf and ell)
% that specifies what we think the uncertainty in the data might be.
% Let us call it sigman.
%
% Now, what would happen if we thought that sigman = 1?
%
sigman = 1;
cf = k * inv(Km + sigman^2*eye(size(Km))) * f;
figure
plot(x,f,'*b');
hold on
plot(cx,cf,'-r')
axis([0 10 -1 1.5])
%
% I think that looks a bit more sensible. It still feels a little rough
% on the short length scales though, which is what you get with Matern
% with nu=0.5. Maybe we should try with the squared exponential and
% compare.
%
km = k;
cfm = km * inv(Km + sigman^2*eye(size(Km))) * f;
k = zeros(length(cx),length(Km));
for i=1:length(cx)
    k(i,:) = sigmaf^2 * exp(-0.5*(abs(x' - cx(i))).^2/ell^2);
end
cf = k * inv(K + sigman^2*eye(size(K))) * f;
figure
plot(x,f,'*b');
hold on
plot(cx,cf,'-r')
axis([0 10 -1 1.5])
plot(cx,cfm,'-g')
legend('data','Sqr-exp','Matern')
%
% I think that the squared exponential looks better. But we are still
% just working with guesses about what the hyper-parameters (sigmaf,
% ell and sigman) should be. We will soon look at that, but before
% that I suggest you play around a little with changing the hyper-
% parameters to see the impact it has on our predictions.
%
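% For example, a smaller noise level for the squared exponential (0.2
% rather than 1; an arbitrary choice), overlaid on the previous figure:
plot(cx, k * inv(K + 0.2^2*eye(size(K))) * f, '-k')
legend('data','Sqr-exp','Matern','Sqr-exp, sigman=0.2')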
%
% Ok, hopefully you have done some playing around.
% Let us now look at how we can find more objective guesstimates of
% what the hyper-parameters should be. We will use Marginal Likelihood
% as a cost-function, finding the hyper-parameters that maximise it
% using non-linear optimisation (in fact we minimise the neg-log
% marginal likelihood).
%
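% For reference, the negative log marginal likelihood for a zero-mean
% GP with noisy observations is
%
%    0.5*f'*inv(Kn)*f + 0.5*log(det(Kn)) + 0.5*n*log(2*pi)
%
% where Kn = K + sigman^2*I is the covariance of the data including the
% noise. The anonymous function below drops the constant term and the
% factors of 0.5, which makes no difference to where the minimum is.
%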
% I will use the Matlab concepts of function handles and anonymous
% functions for this. If you don't know about these things, don't
% worry. But they are very useful, so it might be worth considering
% learning about them.
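% (A tiny standalone example, unrelated to GPs: sq = @(z) z.^2; creates
% a function handle, and sq(3) then returns 9.)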
% This is a handle to an anonymous function that calculates the negative
% log marginal likelihood (up to constants, as noted above) given the data
% as represented by f (the values of the dependent variable) and D (the
% distance matrix, which has the information about the independent
% variable x) and p, which is a vector with [sigmaf^2 ell^2 sigman^2].
mml_f = @(p,f,D) f'*inv(p(1)*exp(-0.5*(D.^2)/p(2)) + p(3)*eye(size(D)))*f + ...
                 log(det(p(1)*exp(-0.5*(D.^2)/p(2)) + p(3)*eye(size(D))));
% This "captures" f and D, such that the resulting function handle
% is a function only of p, which is required by fminsearch.
mml_fun = @(p)mml_f(p,f,D);
%
% You can test-run mml_fun by putting in different values
% for the vector p.
%
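% For example (these particular values are arbitrary guesses):
mml_fun([1 1 0.1])
mml_fun([0.5 2 0.05])
%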
% And here we use fminsearch to find the minimum.
p_mml = fminsearch(mml_fun,[.1 1 .1]);
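% (One optional refinement, not used in the rest of this script: fminsearch
% does not know that the hyper-parameters must stay positive. A common
% trick is to search over their logs instead, e.g.
%   q_mml = fminsearch(@(q)mml_fun(exp(q)),log([.1 1 .1]));
%   p_mml_pos = exp(q_mml);
% which guarantees positive values.)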
%
% Let us see what predictions we get from this.
%
K_mml = p_mml(1) * exp(-0.5*(D.^2)/p_mml(2));
k_mml = zeros(length(cx),length(K_mml));
for i=1:length(cx)
    k_mml(i,:) = p_mml(1) * exp(-0.5*(abs(x' - cx(i))).^2/p_mml(2));
end
cf_mml = k_mml * inv(K_mml + p_mml(3)*eye(size(K_mml))) * f;
figure
plot(x,f,'*b');
hold on
plot(cx,cf_mml,'-r')
%
% We haven't yet talked about the estimates of the uncertainty.
% We will now estimate the covariance-matrix of the estimates
% of the mean. The diagonal of that gives the uncertainty (variance)
% at every given point.
%
% The uncertainty before we have seen the data
pCOV_mml = p_mml(1) * toeplitz(exp(-0.5*cx.^2/p_mml(2)));
% And then the reduction in uncertainty due to the data
red_mml = k_mml * inv(K_mml + p_mml(3)*eye(size(K_mml))) * k_mml';
% And the uncertainty of the estimates given the data
COV_mml = pCOV_mml - red_mml;
figure
plot(x,f,'*b');
hold on
plot(cx,cf_mml,'-r')
axis([0 10 -1 1.5])
plot(cx,cf_mml+1.96*sqrt(diag(COV_mml)),'g--');
plot(cx,cf_mml-1.96*sqrt(diag(COV_mml)),'g--');
title('Mean function +- 1.96 std')
% And since we know the true function we can see how we did.
tf = exp(-0.2*cx).*cos(cx);
figure
plot(x,f,'*b');
hold on
plot(cx,tf,'-r')
axis([0 10 -1 1.5])
plot(cx,cf_mml+1.96*sqrt(diag(COV_mml)),'g--');
plot(cx,cf_mml-1.96*sqrt(diag(COV_mml)),'g--');
title('True function inside bounds')
%
% Right, that was all for now. Hopefully you have enough in here
% to keep playing and testing if you are interested in Gaussian
% Processes.
%