Demonstration of the covariance matrix and basis function implementations of Gaussian process models in Stan.
The basics of the covariance matrix approach are based on Chapter 10 of the Stan User’s Guide, Version 2.26, by the Stan Development Team (2021). https://mc-stan.org/docs/stan-users-guide/
The basics of the Hilbert space basis function approximation are based on Riutort-Mayol, Bürkner, Andersen, Solin, and Vehtari (2020), Practical Hilbert space approximate Bayesian Gaussian processes for probabilistic programming. https://arxiv.org/abs/2004.11408
Data are measurements of head acceleration in a simulated motorcycle accident, used to test crash helmets.
Data are modelled first with a normal distribution having a Gaussian process prior on the mean: \[ y \sim \mbox{normal}(f(x), \sigma)\\ f \sim GP(0, K_1)\\ \sigma \sim \mbox{normal}^{+}(0, 1), \] and then with a normal distribution having Gaussian process priors on both the mean and the log standard deviation: \[ y \sim \mbox{normal}(f(x), \exp(g(x)))\\ f \sim GP(0, K_1)\\ g \sim GP(0, K_2). \]
library(cmdstanr)
library(posterior)
library(loo)
library(tidybayes)
options(pillar.neg = FALSE, pillar.subtle=FALSE, pillar.sigfig=2)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(latex2exp)
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans", base_size=16))
set1 <- RColorBrewer::brewer.pal(7, "Set1")
SEED <- 48927 # set random seed for reproducibility
Load data
data(mcycle, package="MASS")
head(mcycle)
times accel
1 2.4 0.0
2 2.6 -1.3
3 3.2 -2.7
4 3.6 0.0
5 4.0 -2.7
6 6.2 -2.7
Plot data
mcycle %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")
We start with a simpler homoskedastic residual Gaussian process model \[ y \sim \mbox{normal}(f(x), \sigma)\\ f \sim GP(0, K_1)\\ \sigma \sim \mbox{normal}^{+}(0, 1), \] which has an analytic marginal likelihood for the covariance function and residual scale parameters.
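For reference, this marginal likelihood is the standard Gaussian process regression result \[ \log p(y \mid \theta, \sigma) = -\frac{1}{2} y^\top (K_1 + \sigma^2 I)^{-1} y - \frac{1}{2} \log \det (K_1 + \sigma^2 I) - \frac{N}{2} \log 2\pi, \] where K_1 is the covariance matrix obtained by evaluating the covariance function at the observed inputs and \theta denotes its parameters. The Stan program below evaluates this (for the normalized data) via multi_normal_cholesky, without representing the latent values of f explicitly.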
file_gpcovf <- "gpcovf.stan"
writeLines(readLines(file_gpcovf))
functions {
vector gp_pred_rng(array[] real x2,
vector y1,
array[] real x1,
real sigma_f,
real lengthscale_f,
real sigma,
real jitter) {
int N1 = rows(y1);
int N2 = size(x2);
vector[N2] f2;
{
matrix[N1, N1] L_K;
vector[N1] K_div_y1;
matrix[N1, N2] k_x1_x2;
matrix[N1, N2] v_pred;
vector[N2] f2_mu;
matrix[N2, N2] cov_f2;
matrix[N1, N1] K;
K = gp_exp_quad_cov(x1, sigma_f, lengthscale_f);
for (n in 1:N1)
K[n, n] = K[n,n] + square(sigma);
L_K = cholesky_decompose(K);
K_div_y1 = mdivide_left_tri_low(L_K, y1);
K_div_y1 = mdivide_right_tri_low(K_div_y1', L_K)';
k_x1_x2 = gp_exp_quad_cov(x1, x2, sigma_f, lengthscale_f);
f2_mu = (k_x1_x2' * K_div_y1);
v_pred = mdivide_left_tri_low(L_K, k_x1_x2);
cov_f2 = gp_exp_quad_cov(x2, sigma_f, lengthscale_f) - v_pred' * v_pred;
f2 = multi_normal_rng(f2_mu, add_diag(cov_f2, rep_vector(jitter, N2)));
}
return f2;
}
}
data {
int<lower=1> N; // number of observations
vector[N] x; // univariate covariate
vector[N] y; // target variable
int<lower=1> N2; // number of test points
vector[N2] x2; // univariate test points
}
transformed data {
// Normalize data
real xmean = mean(x);
real ymean = mean(y);
real xsd = sd(x);
real ysd = sd(y);
array[N] real xn = to_array_1d((x - xmean)/xsd);
array[N2] real x2n = to_array_1d((x2 - xmean)/xsd);
vector[N] yn = (y - ymean)/ysd;
real sigma_intercept = 1;
vector[N] zeros = rep_vector(0, N);
}
parameters {
real<lower=0> lengthscale_f; // lengthscale of f
real<lower=0> sigma_f; // scale of f
real<lower=0> sigman; // noise sigma
}
model {
// covariances and Cholesky decompositions
matrix[N, N] K_f = gp_exp_quad_cov(xn, sigma_f, lengthscale_f)+
sigma_intercept^2;
matrix[N, N] L_f = cholesky_decompose(add_diag(K_f, sigman^2));
// priors
lengthscale_f ~ normal(0, 1);
sigma_f ~ normal(0, 1);
sigman ~ normal(0, 1);
// model
yn ~ multi_normal_cholesky(zeros, L_f);
}
generated quantities {
// function scaled back to the original scale
vector[N2] f = gp_pred_rng(x2n, yn, xn, sigma_f, lengthscale_f, sigman, 1e-9)*ysd + ymean;
real sigma = sigman*ysd;
}
Compile Stan model
model_gpcovf <- cmdstan_model(stan_file = file_gpcovf)
Data to be passed to Stan
standata_gpcovf <- list(x=mcycle$times,
x2=mcycle$times,
y=mcycle$accel,
N=length(mcycle$times),
N2=length(mcycle$times))
opt_gpcovf <- model_gpcovf$optimize(data=standata_gpcovf,
init=0.1, algorithm='bfgs')
Check whether parameters have reasonable values
odraws_gpcovf <- as_draws_rvars(opt_gpcovf$draws())
subset(odraws_gpcovf, variable=c('sigma_','lengthscale_','sigma'), regex=TRUE)
# A draws_rvars: 1 iterations, 1 chains, and 4 variables
$sigma_f: rvar<1>[1] mean ± sd:
[1] 0.91 ± NA
$lengthscale_f: rvar<1>[1] mean ± sd:
[1] 0.39 ± NA
$sigman: rvar<1>[1] mean ± sd:
[1] 0.47 ± NA
$sigma: rvar<1>[1] mean ± sd:
[1] 23 ± NA
Compare the model to the data
mcycle %>%
mutate(Ef=mean(odraws_gpcovf$f),
sigma=mean(odraws_gpcovf$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The model fit given the optimized parameters looks reasonable, considering the use of a homoskedastic residual model.
fit_gpcovf <- model_gpcovf$sample(data=standata_gpcovf,
iter_warmup=500, iter_sampling=500,
chains=4, parallel_chains=4, refresh=100)
Check whether parameters have reasonable values
draws_gpcovf <- as_draws_rvars(fit_gpcovf$draws())
summarise_draws(subset(draws_gpcovf,
variable=c('sigma_','lengthscale_','sigma'),
regex=TRUE))
# A tibble: 4 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1 sigma_f 1.1 1.0 0.31 0.27 0.69 1.7 1.0 1263. 1184.
2 lengthscale_f 0.40 0.41 0.063 0.060 0.30 0.51 1.0 1151. 1172.
3 sigman 0.47 0.47 0.030 0.029 0.42 0.52 1.0 1545. 1411.
4 sigma 23. 23. 1.5 1.4 20. 25. 1.0 1545. 1411.
Compare the model to the data
mcycle %>%
mutate(Ef=mean(draws_gpcovf$f),
sigma=mean(draws_gpcovf$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The model fit given integrated parameters looks similar to the optimized one.
Compare the posterior draws to the optimized parameters
odraws_gpcovf <- as_draws_df(opt_gpcovf$draws())
draws_gpcovf %>%
as_draws_df() %>%
ggplot(aes(x=lengthscale_f,y=sigma_f))+
geom_point(color=set1[2])+
geom_point(data=odraws_gpcovf,color=set1[1],size=4)+
annotate("text",x=median(draws_gpcovf$lengthscale_f),
y=max(draws_gpcovf$sigma_f)+0.1,
label='Posterior draws',hjust=0.5,color=set1[2],size=6)+
annotate("text",x=odraws_gpcovf$lengthscale_f+0.01,
y=odraws_gpcovf$sigma_f,
label='Optimized',hjust=0,color=set1[1],size=6)
The optimization result is in the middle of the posterior and quite representative of this low-dimensional posterior (in higher dimensions the mean or the mode of the posterior is not likely to be representative).
Compare optimized and posterior predictive distributions
odraws_gpcovf <- as_draws_rvars(opt_gpcovf$draws())
mcycle %>%
mutate(Ef=mean(draws_gpcovf$f),
sigma=mean(draws_gpcovf$sigma),
Efo=mean(odraws_gpcovf$f),
sigmao=mean(odraws_gpcovf$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Efo), color=set1[2])+
geom_line(aes(y=Efo-2*sigmao), color=set1[2],linetype="dashed")+
geom_line(aes(y=Efo+2*sigmao), color=set1[2],linetype="dashed")
The model predictions are very similar. In general, GP covariance function and observation model parameters can be optimized quite safely when there are only a few of them, so that the marginal posterior is low dimensional, and the number of observations is relatively high.
To demonstrate that the optimization is not always safe, we use only 10% of the data for model fitting.
Data to be passed to Stan
mcycle_10p <- mcycle[seq(1,133,by=10),]
standata_10p <- list(x=mcycle_10p$times,
x2=mcycle$times,
y=mcycle_10p$accel,
N=length(mcycle_10p$times),
N2=length(mcycle$times))
Optimize and find MAP estimate
opt_10p <- model_gpcovf$optimize(data=standata_10p, init=0.1,
algorithm='lbfgs')
Check whether parameters have reasonable values
odraws_10p <- as_draws_rvars(opt_10p$draws())
subset(odraws_10p, variable=c('sigma_','lengthscale_','sigma'), regex=TRUE)
# A draws_rvars: 1 iterations, 1 chains, and 4 variables
$sigma_f: rvar<1>[1] mean ± sd:
[1] 0.93 ± NA
$lengthscale_f: rvar<1>[1] mean ± sd:
[1] 0.25 ± NA
$sigman: rvar<1>[1] mean ± sd:
[1] 0.18 ± NA
$sigma: rvar<1>[1] mean ± sd:
[1] 8 ± NA
Compare the model to the data
mcycle_10p %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(data=mcycle,aes(x=times,y=mean(odraws_10p$f)), color=set1[1])+
geom_line(data=mcycle,aes(x=times,y=mean(odraws_10p$f-2*odraws_10p$sigma)), color=set1[1],
linetype="dashed")+
geom_line(data=mcycle,aes(x=times,y=mean(odraws_10p$f+2*odraws_10p$sigma)), color=set1[1],
linetype="dashed")
The model fit is clearly over-fitted and over-confident.
Sample using dynamic HMC
fit_10p <- model_gpcovf$sample(data=standata_10p,
iter_warmup=1000, iter_sampling=1000,
adapt_delta=0.95,
chains=4, parallel_chains=4, refresh=100)
Check whether parameters have reasonable values
draws_10p <- as_draws_rvars(fit_10p$draws())
summarise_draws(subset(draws_10p, variable=c('sigma_','lengthscale_','sigma'),
regex=TRUE))
# A tibble: 4 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1 sigma_f 0.99 0.96 0.34 0.29 0.45 1.6 1.0 1173. 572.
2 lengthscale_f 0.35 0.29 0.25 0.092 0.15 0.78 1.0 1099. 840.
3 sigman 0.42 0.34 0.27 0.21 0.13 0.98 1.0 920. 1316.
4 sigma 19. 15. 12. 9.4 5.6 44. 1.0 920. 1316.
Compare the model to the data
mcycle_10p %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(data=mcycle,aes(x=times,y=mean(draws_10p$f)), color=set1[1])+
geom_line(data=mcycle,aes(x=times,y=mean(draws_10p$f-2*draws_10p$sigma)), color=set1[1],
linetype="dashed")+
geom_line(data=mcycle,aes(x=times,y=mean(draws_10p$f+2*draws_10p$sigma)), color=set1[1],
linetype="dashed")
The posterior predictive distribution is much more conservative and shows the uncertainty due to having only a small number of observations.
Compare the posterior draws to the optimized parameters
odraws_10p <- as_draws_df(opt_10p$draws())
draws_10p %>%
thin_draws(thin=5) %>%
as_draws_df() %>%
ggplot(aes(x=sigma,y=sigma_f))+
geom_point(color=set1[2])+
geom_point(data=odraws_10p,color=set1[1],size=4)+
annotate("text",x=median(draws_10p$sigma),
y=max(draws_10p$sigma_f)+0.1,
label='Posterior draws',hjust=0.5,color=set1[2],size=6)+
annotate("text",x=odraws_10p$sigma+0.01,
y=odraws_10p$sigma_f,
label='Optimized',hjust=0,color=set1[1],size=6)
The optimization result is at the edge of the posterior, close to zero residual scale. While there are posterior draws close to zero, integrating over the wide posterior takes the uncertainty into account, and the predictions are thus more uncertain, too.
Compare optimized and posterior predictive distributions
odraws_10p <- as_draws_rvars(opt_10p$draws())
mcycle %>%
mutate(Ef=mean(draws_10p$f),
sigma=mean(draws_10p$sigma),
Efo=mean(odraws_10p$f),
sigmao=mean(odraws_10p$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Efo), color=set1[2])+
geom_line(aes(y=Efo-2*sigmao), color=set1[2],linetype="dashed")+
geom_line(aes(y=Efo+2*sigmao), color=set1[2],linetype="dashed")
The figure shows the model prediction given 10% of the data, together with the full data as test data. The optimized model is overfitted and overconfident. Even if the homoskedastic residual model is wrong here, the posterior predictive interval covers most of the observations (and with good calibration it should not cover them all).
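As a quick numeric check of this (a sketch, not part of the original code), the fraction of observations inside the plotted approximate 95% interval can be computed as
# fraction of all observations inside Ef +- 2*sigma from the posterior
mean(abs(mcycle$accel - mean(draws_10p$f)) <= 2*mean(draws_10p$sigma))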
We next make a model with a heteroskedastic residual, using a Gaussian process prior also for the logarithm of the residual scale: \[ y \sim \mbox{normal}(f(x), \exp(g(x)))\\ f \sim GP(0, K_1)\\ g \sim GP(0, K_2). \]
Now there is no analytical solution, as the GP prior pushed through the exponential function is not a conjugate prior. In this case we represent the latent values of f and g explicitly and sample from the joint posterior of the covariance function parameters and the latent values. It would also be possible to use the Laplace method, variational inference, or expectation propagation to integrate over the latent values, but that is another story.
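In the Stan code below the latent values are handled with the non-centered parameterization: with the Cholesky factorization K_1 = L_f L_f^T and a vector of independent standard normal variables z_f, \[ z_f \sim \mbox{normal}(0, I), \quad f = L_f z_f \quad \Longrightarrow \quad f \sim \mbox{normal}(0, K_1), \] and similarly for g with K_2. Sampling in the space of z_f and z_g typically has better posterior geometry than sampling f and g directly.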
file_gpcovfg <- "gpcovfg.stan"
writeLines(readLines(file_gpcovfg))
data {
int<lower=1> N; // number of observations
vector[N] x; // univariate covariate
vector[N] y; // target variable
}
transformed data {
// Normalize data
real xmean = mean(x);
real ymean = mean(y);
real xsd = sd(x);
real ysd = sd(y);
array[N] real xn = to_array_1d((x - xmean)/xsd);
vector[N] yn = (y - ymean)/ysd;
real sigma_intercept = 0.1;
vector[N] jitter = rep_vector(1e-9, N);
}
parameters {
real<lower=0> lengthscale_f; // lengthscale of f
real<lower=0> sigma_f; // scale of f
real<lower=0> lengthscale_g; // lengthscale of g
real<lower=0> sigma_g; // scale of g
vector[N] z_f;
vector[N] z_g;
}
model {
// covariances and Cholesky decompositions
matrix[N, N] K_f = gp_exp_quad_cov(xn, sigma_f, lengthscale_f)+
sigma_intercept^2;
matrix[N, N] L_f = cholesky_decompose(add_diag(K_f, jitter));
matrix[N, N] K_g = gp_exp_quad_cov(xn, sigma_g, lengthscale_g)+
sigma_intercept^2;
matrix[N, N] L_g = cholesky_decompose(add_diag(K_g, jitter));
// priors
z_f ~ std_normal();
z_g ~ std_normal();
lengthscale_f ~ lognormal(log(.3), .2);
lengthscale_g ~ lognormal(log(.5), .2);
sigma_f ~ normal(0, .5);
sigma_g ~ normal(0, .5);
// model
yn ~ normal(L_f * z_f, exp(L_g * z_g));
}
generated quantities {
vector[N] f;
vector[N] sigma;
{
// covariances and Cholesky decompositions
matrix[N, N] K_f = gp_exp_quad_cov(xn, sigma_f, lengthscale_f)+
sigma_intercept^2;
matrix[N, N] L_f = cholesky_decompose(add_diag(K_f, jitter));
matrix[N, N] K_g = gp_exp_quad_cov(xn, sigma_g, lengthscale_g)+
sigma_intercept^2;
matrix[N, N] L_g = cholesky_decompose(add_diag(K_g, jitter));
// function scaled back to the original scale
f = (L_f * z_f)*ysd + ymean;
sigma = exp(L_g * z_g)*ysd;
}
}
Compile Stan model
model_gpcovfg <- cmdstan_model(stan_file = file_gpcovfg)
Data to be passed to Stan
standata_gpcovfg <- list(x=mcycle$times,
y=mcycle$accel,
N=length(mcycle$times))
opt_gpcovfg <- model_gpcovfg$optimize(data=standata_gpcovfg,
init=0.1, algorithm='bfgs')
Check whether parameters have reasonable values
odraws_gpcovfg <- as_draws_rvars(opt_gpcovfg$draws())
subset(odraws_gpcovfg, variable=c('sigma_','lengthscale_'), regex=TRUE)
# A draws_rvars: 1 iterations, 1 chains, and 4 variables
$sigma_f: rvar<1>[1] mean ± sd:
[1] 1.7 ± NA
$sigma_g: rvar<1>[1] mean ± sd:
[1] 2 ± NA
$lengthscale_f: rvar<1>[1] mean ± sd:
[1] 0.11 ± NA
$lengthscale_g: rvar<1>[1] mean ± sd:
[1] 0.29 ± NA
Compare the model to the data
mcycle %>%
mutate(Ef = mean(odraws_gpcovfg$f),
sigma = mean(odraws_gpcovfg$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The optimization overfits, as we are now optimizing the joint posterior of 4 covariance function parameters and 2 x 133 latent values.
fit_gpcovfg <- model_gpcovfg$sample(data=standata_gpcovfg,
iter_warmup=100, iter_sampling=200,
chains=4, parallel_chains=4, refresh=20)
Check whether parameters have reasonable values
draws_gpcovfg <- as_draws_rvars(fit_gpcovfg$draws())
summarise_draws(subset(draws_gpcovfg, variable=c('sigma_','lengthscale_'),
regex=TRUE))
# A tibble: 4 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1 sigma_f 0.77 0.74 0.15 0.14 0.55 1.1 1.0 353. 479.
2 sigma_g 1.3 1.2 0.22 0.21 0.95 1.6 1.0 1014. 674.
3 lengthscale_f 0.32 0.32 0.041 0.045 0.26 0.39 1.0 210. 372.
4 lengthscale_g 0.51 0.51 0.072 0.068 0.40 0.63 1.0 315. 382.
Compare the model to the data
mcycle %>%
mutate(Ef = mean(draws_gpcovfg$f),
sigma = mean(draws_gpcovfg$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The MCMC integration works well and the model fit looks good.
Plot posterior draws and posterior mean of the mean function
draws_gpcovfg %>%
spread_rvars(f[i]) %>%
unnest_rvars() %>%
mutate(time=mcycle$times[i]) %>%
ggplot(aes(x=time, y=f, group = .draw)) +
geom_line(color=set1[2], alpha = 0.1) +
geom_point(data=mcycle, mapping=aes(x=times,y=accel), inherit.aes=FALSE) +
geom_line(data=mcycle, mapping=aes(x=times,y=mean(draws_gpcovfg$f)),
inherit.aes=FALSE, color=set1[1], size=1)+
labs(x="Time (ms)", y="Acceleration (g)")
We can also plot the posterior draws of the latent function, which is a good reminder that individual draws are more wiggly than their average and thus better show the uncertainty, for example, at the edges of the data.
Compare the posterior draws to the optimized parameters
odraws_gpcovfg <- as_draws_df(opt_gpcovfg$draws())
draws_gpcovfg %>%
as_draws_df() %>%
ggplot(aes(x=lengthscale_f,y=sigma_f))+
geom_point(color=set1[2])+
geom_point(data=odraws_gpcovfg,color=set1[1],size=4)+
annotate("text",x=median(draws_gpcovfg$lengthscale_f),
y=max(draws_gpcovfg$sigma_f)+0.1,
label='Posterior draws',hjust=0.5,color=set1[2],size=6)+
annotate("text",x=odraws_gpcovfg$lengthscale_f+0.01,
y=odraws_gpcovfg$sigma_f,
label='Optimized',hjust=0,color=set1[1],size=6)
The optimization result is far from representative of the posterior.
The covariance matrix approach requires computing the Cholesky decomposition of the covariance matrix, which has time cost O(n^3), and this needs to be done every time the parameters change, which in MCMC can be many times; the computation time can thus be significant when n grows. One way to speed up the computation in the low dimensional covariate case is to use a basis function approximation which turns the GP into a linear model. Here we use Hilbert space basis functions. With an increasing number of basis functions and factor c, the approximation error can be made arbitrarily small. Sufficient accuracy and a significant saving in computation time are often achieved with a relatively small number of basis functions.
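Following Riutort-Mayol et al. (2020), the approximation writes the GP prior as a linear model in M basis functions \[ f(x) \approx \sum_{m=1}^{M} \sqrt{S_\theta\!\left(\frac{\pi m}{2L}\right)}\, \beta_m\, \phi_m(x), \quad \phi_m(x) = \frac{1}{\sqrt{L}} \sin\!\left(\frac{\pi m (x + L)}{2L}\right), \quad \beta_m \sim \mbox{normal}(0, 1), \] where S_\theta is the spectral density of the covariance function and [-L, L] is the approximation interval, with L chosen in the code below as the factor c times the maximum of the normalized covariate. The helper functions PHI and diagSPD_EQ below compute the \phi_m(x) and the square roots of the spectral density values, respectively.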
Code
filebf0 <- "gpbf0.stan"
writeLines(readLines(filebf0))
functions {
#include gpbasisfun_functions.stan
}
data {
int<lower=1> N; // number of observations
vector[N] x; // univariate covariate
real<lower=0> c_f; // factor c to determine the boundary value L
int<lower=1> M_f; // number of basis functions
real<lower=0> lengthscale_f; // lengthscale of f
real<lower=0> sigma_f; // scale of f
}
transformed data {
// Normalize data
real xmean = mean(x);
real xsd = sd(x);
vector[N] xn = (x - xmean)/xsd;
// Basis functions for f
real L_f = c_f*max(xn);
}
generated quantities {
// Basis functions for f
matrix[N,M_f] PHI_f = PHI(N, M_f, L_f, xn);
// spectral densities
vector[M_f] diagSPD_EQ_f = diagSPD_EQ(sigma_f, lengthscale_f, L_f, M_f);
vector[M_f] diagSPD_Matern32_f = diagSPD_Matern32(sigma_f, lengthscale_f, L_f, M_f);
}
The model code includes Hilbert space basis function helpers
writeLines(readLines("gpbasisfun_functions.stan"))
vector diagSPD_EQ(real alpha, real rho, real L, int M) {
return alpha * sqrt(sqrt(2*pi()) * rho) * exp(-0.25*(rho*pi()/2/L)^2 * linspaced_vector(M, 1, M)^2);
}
vector diagSPD_Matern32(real alpha, real rho, real L, int M) {
return 2*alpha * (sqrt(3)/rho)^1.5 * inv((sqrt(3)/rho)^2 + ((pi()/2/L) * linspaced_vector(M, 1, M))^2);
}
vector diagSPD_periodic(real alpha, real rho, int M) {
real a = 1/rho^2;
vector[M] q = exp(log(alpha) + 0.5 * (log(2) - a + to_vector(log_modified_bessel_first_kind(linspaced_int_array(M, 1, M), a))));
return append_row(q,q);
}
matrix PHI(int N, int M, real L, vector x) {
return sin(diag_post_multiply(rep_matrix(pi()/(2*L) * (x+L), M), linspaced_vector(M, 1, M)))/sqrt(L);
}
matrix PHI_periodic(int N, int M, real w0, vector x) {
matrix[N,M] mw0x = diag_post_multiply(rep_matrix(w0*x, M), linspaced_vector(M, 1, M));
return append_col(cos(mw0x), sin(mw0x));
}
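For intuition, the same quantities can be computed directly in R. This is a minimal sketch mirroring PHI and diagSPD_EQ above (the names phi_bf and diag_spd_eq are made up for this illustration, and the normalization of x done in the Stan program is skipped):
phi_bf <- function(x, M, L) {
  # N x M matrix of basis functions sin(pi*m*(x+L)/(2L))/sqrt(L)
  outer(x, 1:M, function(x, m) sin(pi * m * (x + L) / (2 * L)) / sqrt(L))
}
diag_spd_eq <- function(alpha, rho, L, M) {
  # square roots of the exponentiated quadratic spectral density
  # evaluated at the basis function frequencies pi*m/(2L)
  alpha * sqrt(sqrt(2 * pi) * rho) * exp(-0.25 * (rho * pi / (2 * L))^2 * (1:M)^2)
}
# one prior draw of the approximate GP: a linear model in the basis functions
x <- seq(0, 1, length.out = 100)
L <- 3 * max(x)
f_draw <- as.vector(phi_bf(x, 160, L) %*% (diag_spd_eq(1, 1, L, 160) * rnorm(160)))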
Compile basis function generation code
modelbf0 <- cmdstan_model(stan_file = filebf0, include_paths = ".")
Data to be passed to Stan
standatabf0 <- list(x=seq(0,1,length.out=100),
N=100,
c_f=3, # factor c of basis functions for GP for f1
M_f=160, # number of basis functions for GP for f1
sigma_f=1,
lengthscale_f=1)
Generate basis functions
fixbf0 <- modelbf0$sample(data=standatabf0, fixed_param=TRUE,
chains=1, iter=1, iter_sampling=1)
Running MCMC with 1 chain...
Chain 1 Iteration: 1 / 1 [100%] (Sampling)
Chain 1 finished in 0.0 seconds.
There is certainly an easier way to do this, but this is what I came up with quickly.
q<-subset(fixbf0$draws(), variable="PHI_f") %>%
as_draws_matrix() %>%
as.numeric()%>%
matrix(nrow=100,ncol=160)%>%
as.data.frame()
id <- rownames(q)
q <- cbind(x=as.numeric(id), q)
q <- q %>%
pivot_longer(!x,
names_to="ind",
names_transform = list(ind = readr::parse_number),
values_to="f")%>%
mutate(x=x/100)
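An arguably simpler alternative (a sketch relying on the rvar support in the posterior package, not part of the original code) would be to extract PHI_f directly as a matrix:
# 100 x 160 matrix of basis function values (single fixed-param draw)
PHI_f <- drop(draws_of(as_draws_rvars(fixbf0$draws())$PHI_f))
q <- as.data.frame(unname(PHI_f)) %>%   # columns V1..V160
  mutate(x=(1:100)/100) %>%
  pivot_longer(!x,
               names_to="ind",
               names_transform = list(ind = readr::parse_number),
               values_to="f")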
Plot the first 6 basis functions. These are sine functions with increasing frequencies, defined on the pre-defined interval [-L, L].
q %>%
filter(ind<=6) %>%
ggplot(aes(x=x, y=f, group=ind, color=factor(ind))) +
geom_line()+
geom_text_repel(data=filter(q, ind<=6 & x==0.01),aes(x=-0.01,y=f,label=ind),
direction="y")+
geom_text_repel(data=filter(q, ind<=6 & x==1),aes(x=1.02,y=f,label=ind),
direction="y")+
theme(legend.position="none")
The first 12 spectral density values for the exponentiated quadratic covariance function with sigma_f=1 and lengthscale_f=1. These spectral densities give the prior weight for each basis function. Bigger weights on the smoother basis functions thus imply a prior on the function space favoring smoother functions.
spd_EQ <- as.matrix(fixbf0$draws(variable='diagSPD_EQ_f'))
round(spd_EQ[1:12],2)
[1] 1.55 1.44 1.28 1.09 0.88 0.68 0.50 0.35 0.24 0.15 0.09 0.05
The first 12 spectral density values for the Matern-3/2 covariance function with sigma_f=1 and lengthscale_f=1. The spectral density values decrease much more slowly than for the exponentiated quadratic covariance function, which is natural as Matern-3/2 is less smooth.
spd_Matern32 <- as.matrix(fixbf0$draws(variable='diagSPD_Matern32_f'))
round(spd_Matern32[1:12],2)
[1] 1.47 1.35 1.18 1.01 0.85 0.71 0.60 0.51 0.43 0.37 0.32 0.28
Plot 4 random draws from the prior on the function space with the exponentiated quadratic covariance function and sigma_f=1 and lengthscale_f=1. The basis function approximation is just a linear model, with the basis functions weighted by spectral densities that depend on sigma_f and lengthscale_f, and with a simple independent normal(0,1) prior on the linear model coefficients.
set.seed(365)
qr <- bind_rows(lapply(1:4, function(i) {
q %>%
mutate(r=rep(rnorm(160),times=100),fr=f*r*spd_EQ[ind]) %>%
group_by(x) %>%
summarise(f=sum(fr)) %>%
mutate(ind=i) }))
qr %>%
ggplot(aes(x=x, y=f, group=ind, color=factor(ind))) +
geom_line()+
geom_text_repel(data=filter(qr, x==0.01),aes(x=-0.01,y=f,label=ind),
direction="y")+
geom_text_repel(data=filter(qr, x==1),aes(x=1.02,y=f,label=ind),
direction="y")+
theme(legend.position="none")
Plot 4 random draws from the prior on the function space with the Matern-3/2 covariance function and sigma_f=1 and lengthscale_f=1. The same random number generator seed was used so that you can compare this plot to the one above. Matern-3/2 has more prior mass on higher frequencies, and the prior draws are more wiggly.
set.seed(365)
qr <- bind_rows(lapply(1:4, function(i) {
q %>%
mutate(r=rep(rnorm(160),times=100),fr=f*r*spd_Matern32[ind]) %>%
group_by(x) %>%
summarise(f=sum(fr)) %>%
mutate(ind=i) }))
qr %>%
ggplot(aes(x=x, y=f, group=ind, color=factor(ind))) +
geom_line()+
geom_text_repel(data=filter(qr, x==0.01),aes(x=-0.01,y=f,label=ind),
direction="y")+
geom_text_repel(data=filter(qr, x==1),aes(x=1.02,y=f,label=ind),
direction="y")+
theme(legend.position="none")
Let’s do the same with lengthscale_f=0.3
standatabf0 <- list(x=seq(0,1,length.out=100),
N=100,
c_f=1.5, # factor c of basis functions for GP for f1
M_f=160, # number of basis functions for GP for f1
sigma_f=1,
lengthscale_f=0.3)
fixbf0 <- modelbf0$sample(data=standatabf0, fixed_param=TRUE,
chains=1, iter=1, iter_sampling=1)
Running MCMC with 1 chain...
Chain 1 Iteration: 1 / 1 [100%] (Sampling)
Chain 1 finished in 0.0 seconds.
The basis functions are exactly the same, and only the spectral densities have changed. Now the weight doesn’t drop as fast for the more wiggly basis functions.
spd_EQ <- as.matrix(fixbf0$draws(variable='diagSPD_EQ_f'))
round(spd_EQ[1:15],2)
[1] 0.86 0.84 0.80 0.76 0.70 0.64 0.57 0.50 0.44 0.37 0.31 0.26 0.21 0.16 0.13
spd_Matern32 <- as.matrix(fixbf0$draws(variable='diagSPD_Matern32_f'))
round(spd_Matern32[1:15],2)
[1] 0.82 0.80 0.76 0.70 0.65 0.59 0.54 0.48 0.43 0.39 0.35 0.32 0.29 0.26 0.23
Plot 4 random draws from the prior on the function space with the exponentiated quadratic covariance function and sigma_f=1 and lengthscale_f=0.3. The random functions from the prior are now more wiggly. The same random number generator seed was used so that you can compare this plot to the one above. Above, prior draw number 2 looks like a decreasing slope. Here, prior draw number 2 still has a downward trend but is more wiggly. The same random draw from the coefficient space produces a wigglier function because the spectral densities go down more slowly for the more wiggly basis functions.
set.seed(365)
qr <- bind_rows(lapply(1:4, function(i) {
q %>%
mutate(r=rep(rnorm(160),times=100),fr=f*r*spd_EQ[ind]) %>%
group_by(x) %>%
summarise(f=sum(fr)) %>%
mutate(ind=i) }))
qr %>%
ggplot(aes(x=x, y=f, group=ind, color=factor(ind))) +
geom_line()+
geom_text_repel(data=filter(qr, x==0.01),aes(x=-0.01,y=f,label=ind),
direction="y")+
geom_text_repel(data=filter(qr, x==1),aes(x=1.02,y=f,label=ind),
direction="y")+
theme(legend.position="none")
Plot 4 random draws from the prior on function space with Matern-3/2 covariance function and sigma_f=1 and lengthscale_f=0.3. The prior draws are more wiggly than with exponentiated quadratic.
set.seed(365)
qr <- bind_rows(lapply(1:4, function(i) {
q %>%
mutate(r=rep(rnorm(160),times=100),fr=f*r*spd_Matern32[ind]) %>%
group_by(x) %>%
summarise(f=sum(fr)) %>%
mutate(ind=i) }))
qr %>%
ggplot(aes(x=x, y=f, group=ind, color=factor(ind))) +
geom_line()+
geom_text_repel(data=filter(qr, x==0.01),aes(x=-0.01,y=f,label=ind),
direction="y")+
geom_text_repel(data=filter(qr, x==1),aes(x=1.02,y=f,label=ind),
direction="y")+
theme(legend.position="none")
Model code
file_gpbff <- "gpbff.stan"
writeLines(readLines(file_gpbff))
functions {
#include gpbasisfun_functions.stan
}
data {
int<lower=1> N; // number of observations
vector[N] x; // univariate covariate
vector[N] y; // target variable
real<lower=0> c_f; // factor c to determine the boundary value L
int<lower=1> M_f; // number of basis functions
}
transformed data {
// Normalize data
real xmean = mean(x);
real ymean = mean(y);
real xsd = sd(x);
real ysd = sd(y);
vector[N] xn = (x - xmean)/xsd;
vector[N] yn = (y - ymean)/ysd;
// Basis functions for f
real L_f = c_f*max(xn);
matrix[N,M_f] PHI_f = PHI(N, M_f, L_f, xn);
}
parameters {
real intercept_f;
vector[M_f] beta_f; // the basis functions coefficients
real<lower=0> lengthscale_f; // lengthscale of f
real<lower=0> sigma_f; // scale of f
real<lower=0> sigman; // noise sigma
}
model {
// spectral densities for f and g
vector[M_f] diagSPD_f = diagSPD_EQ(sigma_f, lengthscale_f, L_f, M_f);
// priors
intercept_f ~ normal(0, 0.1);
beta_f ~ normal(0, 1);
lengthscale_f ~ normal(0, 1);
sigma_f ~ normal(0, 1);
sigman ~ normal(0, 1);
// model
yn ~ normal(intercept_f + PHI_f * (diagSPD_f .* beta_f), sigman);
}
generated quantities {
vector[N] f;
vector[N] log_lik;
real sigma = sigman*ysd;
{
// spectral densities
vector[M_f] diagSPD_f = diagSPD_EQ(sigma_f, lengthscale_f, L_f, M_f);
// function scaled back to the original scale
f = (intercept_f + PHI_f * (diagSPD_f .* beta_f))*ysd + ymean;
for (n in 1:N) {
log_lik[n] = normal_lpdf(yn[n] | intercept_f + PHI_f[n] * (diagSPD_f .* beta_f),
sigman);
}
}
}
Compile Stan model
model_gpbff <- cmdstan_model(stan_file = file_gpbff, include_paths = ".", stanc_options = list("O1"))
Data to be passed to Stan
standata_gpbff <- list(x=mcycle$times,
y=mcycle$accel,
N=length(mcycle$times),
c_f=1.5, # factor c of basis functions for GP for f1
M_f=40, # number of basis functions for GP for f1
c_g=1.5, # factor c of basis functions for GP for g3
M_g=40) # number of basis functions for GP for g3
opt_gpbff <- model_gpbff$optimize(data=standata_gpbff,
init=0.1, algorithm='bfgs')
Check whether parameters have reasonable values
odraws_gpbff <- as_draws_rvars(opt_gpbff$draws())
subset(odraws_gpbff, variable=c('intercept','sigma_','lengthscale_'),
regex=TRUE)
# A draws_rvars: 1 iterations, 1 chains, and 3 variables
$intercept_f: rvar<1>[1] mean ± sd:
[1] 0.0088 ± NA
$sigma_f: rvar<1>[1] mean ± sd:
[1] 1.8 ± NA
$lengthscale_f: rvar<1>[1] mean ± sd:
[1] 0.15 ± NA
Compare the model to the data
mcycle %>%
mutate(Ef=mean(odraws_gpbff$f),
sigma=mean(odraws_gpbff$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The optimization is not that bad. We are now optimizing the joint posterior of 2 covariance function parameters, the noise scale, an intercept, and 40 basis function coefficients.
fit_gpbff <- model_gpbff$sample(data=standata_gpbff,
iter_warmup=500, iter_sampling=500, refresh=100,
chains=4, parallel_chains=4, adapt_delta=0.9)
Check whether parameters have reasonable values
draws_gpbff <- as_draws_rvars(fit_gpbff$draws())
summarise_draws(subset(draws_gpbff,
variable=c('intercept','sigma_','lengthscale_'),
regex=TRUE))
# A tibble: 3 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1 intercept_f 0.016 0.017 0.10 0.11 -0.15 0.17 1.0 2764. 1431.
2 sigma_f 1.0 0.97 0.28 0.25 0.66 1.6 1.0 908. 1261.
3 lengthscale_f 0.40 0.40 0.061 0.062 0.30 0.50 1.0 1228. 1186.
Compare the model to the data
mcycle %>%
mutate(Ef=mean(draws_gpbff$f),
sigma=mean(draws_gpbff$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The MCMC integration works well and the model fit looks good. The model fit is clearly more smooth than with optimization.
Plot posterior draws and posterior mean of the mean function
draws_gpbff %>%
thin_draws(thin=5) %>%
spread_rvars(f[i]) %>%
unnest_rvars() %>%
mutate(time=mcycle$times[i]) %>%
ggplot(aes(x=time, y=f, group = .draw)) +
geom_line(color=set1[2], alpha = 0.1) +
geom_point(data=mcycle, mapping=aes(x=times,y=accel), inherit.aes=FALSE)+
geom_line(data=mcycle, mapping=aes(x=times,y=mean(draws_gpbff$f)),
inherit.aes=FALSE, color=set1[1], size=1)+
labs(x="Time (ms)", y="Acceleration (g)")
We can also plot the posterior draws of the latent function, which is a good reminder that individual draws are more wiggly than their average and thus better show the uncertainty, for example, at the edges of the data.
Compare the posterior draws to the optimized parameters
odraws_gpbff <- as_draws_df(opt_gpbff$draws())
draws_gpbff %>%
thin_draws(thin=5) %>%
as_draws_df() %>%
ggplot(aes(x=lengthscale_f,y=sigma_f))+
geom_point(color=set1[2])+
geom_point(data=odraws_gpbff,color=set1[1],size=4)+
annotate("text",x=median(draws_gpbff$lengthscale_f),
y=max(draws_gpbff$sigma_f)+0.1,
label='Posterior draws',hjust=0.5,color=set1[2],size=6)+
annotate("text",x=odraws_gpbff$lengthscale_f+0.01,
y=odraws_gpbff$sigma_f,
label='Optimized',hjust=0,color=set1[1],size=6)
The optimization result is far from representative of the posterior.
Model code
file_gpbffg <- "gpbffg.stan"
writeLines(readLines(file_gpbffg))
functions {
#include gpbasisfun_functions.stan
}
data {
int<lower=1> N; // number of observations
vector[N] x; // univariate covariate
vector[N] y; // target variable
real<lower=0> c_f; // factor c to determine the boundary value L
int<lower=1> M_f; // number of basis functions
real<lower=0> c_g; // factor c to determine the boundary value L
int<lower=1> M_g; // number of basis functions
}
transformed data {
// Normalize data
real xmean = mean(x);
real ymean = mean(y);
real xsd = sd(x);
real ysd = sd(y);
vector[N] xn = (x - xmean)/xsd;
vector[N] yn = (y - ymean)/ysd;
// Basis functions for f
real L_f = c_f*max(xn);
matrix[N,M_f] PHI_f = PHI(N, M_f, L_f, xn);
// Basis functions for g
real L_g= c_g*max(xn);
matrix[N,M_g] PHI_g = PHI(N, M_g, L_g, xn);
}
parameters {
real intercept_f;
real intercept_g;
vector[M_f] beta_f; // the basis functions coefficients
vector[M_g] beta_g; // the basis functions coefficients
real<lower=0> lengthscale_f; // lengthscale of f
real<lower=0> sigma_f; // scale of f
real<lower=0> lengthscale_g; // lengthscale of g
real<lower=0> sigma_g; // scale of g
}
model {
// spectral densities for f and g
vector[M_f] diagSPD_f = diagSPD_EQ(sigma_f, lengthscale_f, L_f, M_f);
vector[M_g] diagSPD_g = diagSPD_EQ(sigma_g, lengthscale_g, L_g, M_g);
// priors
intercept_f ~ normal(0, .1);
intercept_g ~ normal(0, .1);
beta_f ~ normal(0, 1);
beta_g ~ normal(0, 1);
lengthscale_f ~ normal(0, 1);
lengthscale_g ~ normal(0, 1);
sigma_f ~ normal(0, .5);
sigma_g ~ normal(0, .5);
// model
yn ~ normal(intercept_f + PHI_f * (diagSPD_f .* beta_f),
exp(intercept_g + PHI_g * (diagSPD_g .* beta_g)));
}
generated quantities {
vector[N] f;
vector[N] sigma;
vector[N] log_lik;
{
// spectral densities
vector[M_f] diagSPD_f = diagSPD_EQ(sigma_f, lengthscale_f, L_f, M_f);
vector[M_g] diagSPD_g = diagSPD_EQ(sigma_g, lengthscale_g, L_g, M_g);
// function scaled back to the original scale
f = (intercept_f + PHI_f * (diagSPD_f .* beta_f))*ysd + ymean;
sigma = exp(intercept_g + PHI_g * (diagSPD_g .* beta_g))*ysd;
for (n in 1:N) {
log_lik[n] = normal_lpdf(yn[n] | intercept_f + PHI_f[n] * (diagSPD_f .* beta_f),
exp(intercept_g + PHI_g[n] * (diagSPD_g .* beta_g)));
}
}
}
Compile Stan model
model_gpbffg <- cmdstan_model(stan_file = file_gpbffg, include_paths = ".", stanc_options = list("O1"))
Data to be passed to Stan
standata_gpbffg <- list(x=mcycle$times,
y=mcycle$accel,
N=length(mcycle$times),
c_f=1.5, # factor c of basis functions for GP for f1
M_f=40, # number of basis functions for GP for f1
c_g=1.5, # factor c of basis functions for GP for g3
M_g=40) # number of basis functions for GP for g3
opt_gpbffg <- model_gpbffg$optimize(data=standata_gpbffg,
init=0.1, algorithm='bfgs')
Check whether parameters have reasonable values
odraws_gpbffg <- as_draws_rvars(opt_gpbffg$draws())
subset(odraws_gpbffg, variable=c('intercept','sigma_','lengthscale_'),
regex=TRUE)
# A draws_rvars: 1 iterations, 1 chains, and 6 variables
$intercept_f: rvar<1>[1] mean ± sd:
[1] 0.021 ± NA
$intercept_g: rvar<1>[1] mean ± sd:
[1] -0.065 ± NA
$sigma_f: rvar<1>[1] mean ± sd:
[1] 1.5 ± NA
$sigma_g: rvar<1>[1] mean ± sd:
[1] 4 ± NA
$lengthscale_f: rvar<1>[1] mean ± sd:
[1] 0.11 ± NA
$lengthscale_g: rvar<1>[1] mean ± sd:
[1] 0.1 ± NA
Compare the model to the data
mcycle %>%
mutate(Ef=mean(odraws_gpbffg$f),
sigma=mean(odraws_gpbffg$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The optimization overfits, as we are now optimizing the joint posterior of 4 covariance function parameters, 2 intercepts, and 2 x 40 basis function coefficients.
fit_gpbffg <- model_gpbffg$sample(data=standata_gpbffg,
iter_warmup=500, iter_sampling=500, refresh=100,
chains=4, parallel_chains=4, adapt_delta=0.9)
Check whether parameters have reasonable values
draws_gpbffg <- as_draws_rvars(fit_gpbffg$draws())
summarise_draws(subset(draws_gpbffg,
variable=c('intercept','sigma_','lengthscale_'),
regex=TRUE))
# A tibble: 6 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1 intercept_f 0.029 0.031 0.097 0.096 -0.13 0.19 1.0 1617. 1269.
2 intercept_g -0.044 -0.046 0.10 0.097 -0.21 0.13 1.0 2845. 1323.
3 sigma_f 0.80 0.78 0.16 0.15 0.58 1.1 1.0 992. 1134.
4 sigma_g 1.3 1.2 0.23 0.22 0.93 1.7 1.0 1169. 1233.
5 lengthscale_f 0.34 0.33 0.050 0.052 0.26 0.42 1.0 634. 1133.
6 lengthscale_g 0.51 0.51 0.10 0.10 0.34 0.68 1.0 756. 1119.
Compare the model to the data
mcycle %>%
mutate(Ef=mean(draws_gpbffg$f),
sigma=mean(draws_gpbffg$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The MCMC integration works well and the model fit looks good.
Plot posterior draws and posterior mean of the mean function
draws_gpbffg %>%
thin_draws(thin=5) %>%
spread_rvars(f[i]) %>%
unnest_rvars() %>%
mutate(time=mcycle$times[i]) %>%
ggplot(aes(x=time, y=f, group = .draw)) +
geom_line(color=set1[2], alpha = 0.1) +
geom_point(data=mcycle, mapping=aes(x=times,y=accel), inherit.aes=FALSE)+
geom_line(data=mcycle, mapping=aes(x=times,y=mean(draws_gpbffg$f)),
inherit.aes=FALSE, color=set1[1], size=1)+
labs(x="Time (ms)", y="Acceleration (g)")
We can also plot the posterior draws of the latent function, which is a good reminder that individual draws are more wiggly than their average and thus better show the uncertainty, for example, at the edges of the data.
Compare the posterior draws to the optimized parameters
odraws_gpbffg <- as_draws_df(opt_gpbffg$draws())
draws_gpbffg %>%
thin_draws(thin=5) %>%
as_draws_df() %>%
ggplot(aes(x=lengthscale_f,y=sigma_f))+
geom_point(color=set1[2])+
geom_point(data=odraws_gpbffg,color=set1[1],size=4)+
annotate("text",x=median(draws_gpbffg$lengthscale_f),
y=max(draws_gpbffg$sigma_f)+0.1,
label='Posterior draws',hjust=0.5,color=set1[2],size=6)+
annotate("text",x=odraws_gpbffg$lengthscale_f+0.01,
y=odraws_gpbffg$sigma_f,
label='Optimized',hjust=0,color=set1[1],size=6)
The optimization result is far from representative of the posterior.
Looking at the plots comparing model predictions and data, it is quite obvious in this case that the heteroskedastic model is better for these data. In cases where it is not as clear, we can use leave-one-out cross-validation for comparison. Here we compare the homoskedastic and heteroskedastic models.
loobff <- fit_gpbff$loo()
loobffg <- fit_gpbffg$loo()
loo_compare(list(homoskedastic=loobff,heteroskedastic=loobffg))
elpd_diff se_diff
heteroskedastic 0.0 0.0
homoskedastic -49.5 10.0
The heteroskedastic model has a much higher elpd estimate.
We can also plot the difference in the pointwise elpd values (log scores) to see in which parts the heteroskedastic model is better.
data.frame(time=mcycle$times,
elpd_diff=loobffg$pointwise[,'elpd_loo']-loobff$pointwise[,'elpd_loo']) |>
ggplot(aes(x=time,y=elpd_diff))+
geom_point(color=set1[2])+
geom_hline(yintercept=0, linetype="dashed")+
labs(x="Time (ms)", y=TeX("elpd of heteroskedastic GP is higher $\\rightarrow$"))
Variational inference is popular in machine learning and also with Gaussian processes. When used carefully for selected models, parameters, parameterizations, and approximating distributions, variational inference can be useful and fast. The following example illustrates why it can also fail when applied in a black-box style. Run automatic differentiation variational inference (ADVI) with a mean-field normal approximation, and in the end sample from the approximation.
vi_gpbffg <- model_gpbffg$variational(data=standata_gpbffg,
init=0.01, tol_rel_obj=1e-4, iter=1e5, refresh=1000, seed=75)
Check whether parameters have reasonable values
vidraws_gpbffg <- as_draws_rvars(vi_gpbffg$draws())
summarise_draws(subset(vidraws_gpbffg,
variable=c('intercept','sigma_','lengthscale_'),
regex=TRUE))
# A tibble: 6 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1 intercept_f 0.031 0.032 0.029 0.028 -0.017 0.081 1.0 931. 1031.
2 intercept_g -0.039 -0.039 0.057 0.055 -0.13 0.054 1.0 975. 1057.
3 sigma_f 0.66 0.65 0.032 0.032 0.61 0.71 1.0 935. 951.
4 sigma_g 0.93 0.93 0.070 0.072 0.82 1.0 1.0 861. 919.
5 lengthscale_f 0.39 0.39 0.020 0.020 0.36 0.42 1.0 986. 946.
6 lengthscale_g 1.4 1.4 0.088 0.085 1.2 1.5 1.0 982. 983.
Compare the model to the data
mcycle %>%
mutate(Ef=mean(vidraws_gpbffg$f),
sigma=mean(vidraws_gpbffg$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
ADVI captures the mean function well and some of the varying noise variance, but it clearly overestimates the noise variance in the early part.
Plot posterior draws and posterior mean of the mean function
vidraws_gpbffg %>%
thin_draws(thin=5) %>%
spread_rvars(f[i]) %>%
unnest_rvars() %>%
mutate(time=mcycle$times[i]) %>%
ggplot(aes(x=time, y=f, group = .draw)) +
geom_line(color=set1[2], alpha = 0.1) +
geom_point(data=mcycle, mapping=aes(x=times,y=accel), inherit.aes=FALSE)+
geom_line(data=mcycle, mapping=aes(x=times,y=mean(vidraws_gpbffg$f)),
inherit.aes=FALSE, color=set1[1], size=1)+
labs(x="Time (ms)", y="Acceleration (g)")
We can also plot the posterior draws of the latent function, which is a good reminder that individual draws are more wiggly than their average and thus better show the uncertainty, for example, at the edges of the data.
Compare the draws from the variational approximation to the MCMC draws and the optimized parameters. This time show f[1] and g[1] to illustrate the challenging funnel shape. Although the inference happens in the space of beta_f and beta_g, f[1] and g[1] are linear projections of beta_f and beta_g, and thus the funnel causes problems for ADVI. A full-rank normal approximation would not help here.
odraws_gpbffg <- as_draws_df(opt_gpbffg$draws())
draws_gpbffg %>%
thin_draws(thin=5) %>%
as_draws_df() %>%
ggplot(aes(x=`f[1]`,y=log(`sigma[1]`)))+
geom_point(color=set1[2])+
geom_point(data=as_draws_df(vidraws_gpbffg),color=set1[3])+
geom_point(data=odraws_gpbffg,color=set1[1],size=4)+
annotate("text",x=median(vidraws_gpbffg$f[1])+1.3,
y=max(log(vidraws_gpbffg$sigma[1]))+0.1,
label='Variational inference',hjust=0,color=set1[3],size=6)+
annotate("text",x=median(draws_gpbffg$f[1])+1,
y=min(log(draws_gpbffg$sigma[1]))-0.1,
label='MCMC draws',hjust=0,color=set1[2],size=6)+
annotate("text",x=odraws_gpbffg$`f[1]`+1,
y=log(odraws_gpbffg$`sigma[1]`),
label='Optimized',hjust=0,color=set1[1],size=6)+
labs(y="g[1]")
The exponentiated quadratic is sometimes considered too smooth, as all its derivatives are continuous. For comparison we use the Matern-3/2 covariance. The Hilbert space basis functions are the same and only the spectral density values change (that is, the basis functions get different weights).
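For comparison, the spectral densities whose square roots are computed by diagSPD_EQ and diagSPD_Matern32 are \[ S_{\mbox{EQ}}(\omega) = \sigma_f^2\, \ell \sqrt{2\pi}\, \exp\!\left(-\tfrac{1}{2} \ell^2 \omega^2\right), \qquad S_{3/2}(\omega) = \sigma_f^2\, \frac{4 (\sqrt{3}/\ell)^3}{\left((\sqrt{3}/\ell)^2 + \omega^2\right)^2}, \] where \ell is the lengthscale. The Matern-3/2 spectral density decays only polynomially in the frequency \omega while the exponentiated quadratic decays exponentially, so the Matern-3/2 weights on the wiggly basis functions go down more slowly and the resulting functions are less smooth.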
file_gpbffg2 <- "gpbffg_matern.stan"
writeLines(readLines(file_gpbffg2))
functions {
#include gpbasisfun_functions.stan
}
data {
int<lower=1> N; // number of observations
vector[N] x; // univariate covariate
vector[N] y; // target variable
real<lower=0> c_f; // factor c to determine the boundary value L
int<lower=1> M_f; // number of basis functions
real<lower=0> c_g; // factor c to determine the boundary value L
int<lower=1> M_g; // number of basis functions
}
transformed data {
// Normalize data
real xmean = mean(x);
real ymean = mean(y);
real xsd = sd(x);
real ysd = sd(y);
vector[N] xn = (x - xmean)/xsd;
vector[N] yn = (y - ymean)/ysd;
// Basis functions for f
real L_f = c_f*max(xn);
matrix[N,M_f] PHI_f = PHI(N, M_f, L_f, xn);
// Basis functions for g
real L_g= c_g*max(xn);
matrix[N,M_g] PHI_g = PHI(N, M_g, L_g, xn);
}
parameters {
real intercept_f;
real intercept_g;
vector[M_f] beta_f; // the basis functions coefficients
vector[M_g] beta_g; // the basis functions coefficients
real<lower=0> lengthscale_f; // lengthscale of f
real<lower=0> sigma_f; // scale of f
real<lower=0> lengthscale_g; // lengthscale of g
real<lower=0> sigma_g; // scale of g
}
model {
// spectral densities for f and g
vector[M_f] diagSPD_f = diagSPD_Matern32(sigma_f, lengthscale_f, L_f, M_f);
vector[M_g] diagSPD_g = diagSPD_Matern32(sigma_g, lengthscale_g, L_g, M_g);
// priors
intercept_f ~ normal(0, 1);
intercept_g ~ normal(0, 1);
beta_f ~ normal(0, 1);
beta_g ~ normal(0, 1);
lengthscale_f ~ normal(0, 1);
lengthscale_g ~ normal(0, 1);
sigma_f ~ normal(0, .5);
sigma_g ~ normal(0, .5);
// model
yn ~ normal(intercept_f + PHI_f * (diagSPD_f .* beta_f),
exp(intercept_g + PHI_g * (diagSPD_g .* beta_g)));
}
generated quantities {
vector[N] f;
vector[N] sigma;
vector[N] log_lik;
{
// spectral densities
vector[M_f] diagSPD_f = diagSPD_Matern32(sigma_f, lengthscale_f, L_f, M_f);
vector[M_g] diagSPD_g = diagSPD_Matern32(sigma_g, lengthscale_g, L_g, M_g);
// function scaled back to the original scale
f = (intercept_f + PHI_f * (diagSPD_f .* beta_f))*ysd + ymean;
sigma = exp(intercept_g + PHI_g * (diagSPD_g .* beta_g))*ysd;
for (n in 1:N) {
log_lik[n] = normal_lpdf(yn[n] | intercept_f + PHI_f[n] * (diagSPD_f .* beta_f),
exp(intercept_g + PHI_g[n] * (diagSPD_g .* beta_g)));
}
}
}
Compile Stan model
model_gpbffg2 <- cmdstan_model(stan_file = file_gpbffg2, include_paths = ".")
Data to be passed to Stan
standata_gpbffg2 <- list(x=mcycle$times,
y=mcycle$accel,
N=length(mcycle$times),
c_f=1.5, # factor c of basis functions for GP for f1
M_f=160, # number of basis functions for GP for f1
c_g=1.5, # factor c of basis functions for GP for g3
M_g=160) # number of basis functions for GP for g3
fit_gpbffg2 <- model_gpbffg2$sample(data=standata_gpbffg2,
iter_warmup=500, iter_sampling=500,
chains=4, parallel_chains=4, adapt_delta=0.9)
Check whether parameters have reasonable values
draws_gpbffg2 <- as_draws_rvars(fit_gpbffg2$draws())
summarise_draws(subset(draws_gpbffg2,
variable=c('intercept','sigma_','lengthscale_'),
regex=TRUE))
# A tibble: 6 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1 intercept_f 0.27 0.28 0.46 0.42 -0.48 1.0 1.0 843. 1044.
2 intercept_g -1.2 -1.3 0.52 0.48 -2.0 -0.35 1.0 1248. 1278.
3 sigma_f 0.93 0.89 0.21 0.21 0.62 1.3 1.0 955. 748.
4 sigma_g 1.1 1.0 0.23 0.22 0.73 1.5 1.0 1032. 1403.
5 lengthscale_f 0.66 0.64 0.15 0.14 0.43 0.93 1.0 930. 1025.
6 lengthscale_g 0.69 0.66 0.23 0.22 0.37 1.1 1.0 690. 577.
Compare the model to the data
mcycle %>%
mutate(Ef=mean(draws_gpbffg2$f),
sigma=mean(draws_gpbffg2$sigma)) %>%
ggplot(aes(x=times,y=accel))+
geom_point()+
labs(x="Time (ms)", y="Acceleration (g)")+
geom_line(aes(y=Ef), color=set1[1])+
geom_line(aes(y=Ef-2*sigma), color=set1[1],linetype="dashed")+
geom_line(aes(y=Ef+2*sigma), color=set1[1],linetype="dashed")
The MCMC integration works well and the model fit looks good.
Plot posterior draws and posterior mean of the mean function
draws_gpbffg2 %>%
thin_draws(thin=5) %>%
spread_rvars(f[i]) %>%
unnest_rvars() %>%
mutate(time=mcycle$times[i]) %>%
ggplot(aes(x=time, y=f, group = .draw)) +
geom_line(color=set1[2], alpha = 0.1) +
geom_point(data=mcycle, mapping=aes(x=times,y=accel), inherit.aes=FALSE)+
geom_line(data=mcycle, mapping=aes(x=times,y=mean(draws_gpbffg2$f)),
inherit.aes=FALSE, color=set1[1], size=1)+
labs(x="Time (ms)", y="Acceleration (g)")
We see that when using the Matern-3/2 covariance instead of the exponentiated quadratic, the model fit is more wiggly.