import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
sns.set_context("paper")
sns.set_style("ticks")

Gaussian Process Regression Without Noise#

We perform Gaussian process regression on a synthetic 1D dataset without noise.

Example: Gaussian process regression in 1D with fixed hyper-parameters#

Let’s generate some synthetic 1D data to work with:

np.random.seed(1234)

# Number of observations
n = 10
# The inputs are in [0, 1]
X = np.random.rand(n) 
# The true function that we will try to identify
f_true = lambda x: -np.cos(np.pi * x) + np.sin(4. * np.pi * x)
# Some data to train on
Y = f_true(X)
# Let's visualize the data
fig, ax = plt.subplots()
ax.plot(X, Y, 'kx', markersize=10, markeredgewidth=2)
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
sns.despine(trim=True);
[Figure: the synthetic training data plotted as black crosses.]

Don’t forget to install GPyTorch first if you are running this notebook on Colab:

!pip install gpytorch

A GPyTorch GP model requires a mean function, a covariance function (kernel), and a likelihood function. For simplicity, we will use a constant mean function set to \(m(x) = 0\), the squared exponential kernel, and a Gaussian likelihood.

import torch
import gpytorch
from gpytorch.kernels import RBFKernel, ScaleKernel

class ExactGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = ScaleKernel(RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

likelihood = gpytorch.likelihoods.GaussianLikelihood(
    noise_constraint=gpytorch.constraints.GreaterThan(0.0)
)
train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y, likelihood)

Here is what the model looks like if you print it:

print(model)
ExactGP(
  (likelihood): GaussianLikelihood(
    (noise_covar): HomoskedasticNoise(
      (raw_noise_constraint): GreaterThan(0.000E+00)
    )
  )
  (mean_module): ConstantMean()
  (covar_module): ScaleKernel(
    (base_kernel): RBFKernel(
      (raw_lengthscale_constraint): Positive()
    )
    (raw_outputscale_constraint): Positive()
  )
)

You can see all the parameters. In this example, we will pick their values by hand.

model.mean_module.constant = torch.tensor(0.0)
model.covar_module.outputscale = torch.tensor(1.0)
model.covar_module.base_kernel.lengthscale = torch.tensor(0.1)
model.likelihood.noise = torch.tensor(0.0) # There is no noise in the data

That’s it. We have now specified the model completely, and the posterior GP is fully defined. But what are the posterior mean \(m_n(x)\) and variance \(\sigma_n^2(x)\)?
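Recall the Gaussian process regression formulas from the last lecture. With a zero mean and no observation noise, they reduce to:

\[ m_n(x) = \mathbf{k}(x)^T\mathbf{K}^{-1}\mathbf{y}, \qquad \sigma_n^2(x) = k(x, x) - \mathbf{k}(x)^T\mathbf{K}^{-1}\mathbf{k}(x), \]

where \(\mathbf{K} = k(X, X)\) is the covariance matrix of the training inputs and \(\mathbf{k}(x) = \left(k(x, x_1), \dots, k(x, x_n)\right)\) is the vector of cross-covariances.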

Now we can make predictions.

model.eval()
likelihood.eval()
# The test points on which we will make predictions
x_star = torch.linspace(0, 1, 100)[:, None]
# The predictive mean and variance for the test points
f_star = model(x_star)
m_star = f_star.mean
fig, ax = plt.subplots()
ax.plot(X, Y, 'kx', markersize=10, markeredgewidth=2, label='data')
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.plot(x_star, m_star.detach(), lw=2, label='$m_n(x)$')
plt.legend(loc='best', frameon=False)
sns.despine(trim=True);
[Figure: the posterior mean \(m_n(x)\) together with the training data.]

The variance on the same test points is:

# Now the variance on the same test points
fig, ax = plt.subplots()
v_star = f_star.variance
ax.plot(x_star, v_star.detach(), lw=2, label=r'$\sigma_n^2(x)$')
ax.set_xlabel('$x$')
ax.set_ylabel(r'$\sigma_n^2(x)$')
sns.despine(trim=True);
/opt/homebrew/lib/python3.11/site-packages/gpytorch/distributions/multivariate_normal.py:319: NumericalWarning: Negative variance values detected. This is likely due to numerical instabilities. Rounding negative variances up to 1e-06.
  warnings.warn(
[Figure: the posterior variance \(\sigma_n^2(x)\) on the test points.]

Notice that the variance is zero wherever we have an observation.
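Here is a quick sanity check of this claim: evaluate the posterior at the training inputs and confirm that the variance is numerically zero. (GPyTorch normally warns when you evaluate at the training inputs, so we temporarily disable its debug checks.)

with torch.no_grad(), gpytorch.settings.debug(False):
    f_train = model(train_x)
# With no observation noise, the posterior variance at the training
# inputs should be zero up to the jitter level (~1e-6).
print(f_train.variance)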

Having the posterior mean and variance, we can derive 95% predictive intervals for \(f(\mathbf{x}^*)\) and \(y^*\). For \(f(\mathbf{x}^*)\) these are:

\[ m_n(\mathbf{x}^*) - 2\sigma_n(\mathbf{x}^*) \le f(\mathbf{x}^*) \le m_n(\mathbf{x}^*) + 2\sigma_n(\mathbf{x}^*). \]

Since there is no observation noise here, the interval for \(y^*\) coincides with the interval for \(f(\mathbf{x}^*)\).

Let’s plot this:

fig, ax = plt.subplots()
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
f_lower = m_star - 2.0 * torch.sqrt(v_star)
f_upper = m_star + 2.0 * torch.sqrt(v_star)
ax.fill_between(
    x_star.detach().flatten(),
    f_lower.detach().flatten(),
    f_upper.detach().flatten(),
    alpha=0.5
)
ax.plot(x_star, m_star.detach(), lw=2, label='$m_n(x)$')
ax.plot(X, Y, 'kx', markersize=10, markeredgewidth=2, label='data')
plt.legend(loc='best', frameon=False)
sns.despine(trim=True);
[Figure: the posterior mean with the 95% predictive interval.]

Let’s also put the actual function there for comparison:

fig, ax = plt.subplots()
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
f_lower = m_star - 2.0 * torch.sqrt(v_star)
f_upper = m_star + 2.0 * torch.sqrt(v_star)
ax.fill_between(
    x_star.detach().flatten(),
    f_lower.detach().flatten(),
    f_upper.detach().flatten(),
    alpha=0.5
)
ax.plot(x_star, m_star.detach(), lw=2, label='$m_n(x)$')
ax.plot(x_star, f_true(x_star), 'm-.', label='True function')
ax.plot(
    X,
    Y,
    'kx',
    markersize=10,
    markeredgewidth=2,
    label='Observations'
)
plt.legend(loc='best', frameon=False)
sns.despine(trim=True);
[Figure: the posterior mean, 95% predictive interval, true function, and observations.]

You see that the true function lies almost entirely within the shaded bounds. It is fine for it to stray outside occasionally: these are 95% prediction intervals, so the function can fall outside them about 5% of the time.
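We can make this quantitative by computing the empirical coverage, i.e., the fraction of test points at which the true function falls inside the interval. Expect a number close to (or above) 0.95:

# The true function values on the test points
f_true_star = torch.from_numpy(f_true(x_star.numpy())).float().flatten()
# The fraction of test points inside the 95% predictive interval
inside = (f_true_star >= f_lower) & (f_true_star <= f_upper)
print(f"Empirical coverage: {inside.float().mean().item():.2f}")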

That’s good. However, the posterior GP encodes much more information. It is a probability measure over the space of functions. How do we sample functions? Strictly speaking, you can’t: functions are infinite-dimensional objects. But you can sample the function values at a finite collection of test points, because their joint distribution is a multivariate Gaussian. We did this manually in the last lecture. In this lecture, we will use the capabilities of GPyTorch. Here it is:

# Here is how you take the samples
f_post_samples = f_star.sample(sample_shape=torch.Size([10]))
# Here is the size of f_post_samples
print(f_post_samples.shape)
torch.Size([10, 100])
/opt/homebrew/lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-06 to the diagonal
  warnings.warn(
/opt/homebrew/lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-05 to the diagonal
  warnings.warn(
/opt/homebrew/lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-04 to the diagonal
  warnings.warn(
/opt/homebrew/lib/python3.11/site-packages/linear_operator/operators/_linear_operator.py:2051: NumericalWarning: Runtime Error when computing Cholesky decomposition: Matrix not positive definite after repeatedly adding jitter up to 1.0e-04.. Using symeig method.
  warnings.warn(

The shape is the number of samples (10) by the number of test points (100). Let’s plot the samples along with the data and the truth:

fig, ax = plt.subplots(dpi=100)
ax.plot(x_star, f_post_samples.detach().T, 'r', lw=0.5)
ax.plot([], [], 'r', lw=0.5, label="Posterior samples")
ax.plot(X, Y, 'kx', markersize=10, markeredgewidth=2, label='Observations')
ax.plot(x_star, f_true(x_star), 'm-.', label='True function')
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
plt.legend(loc="best", frameon=False)
sns.despine(trim=True);
[Figure: ten posterior samples along with the observations and the true function.]

Okay, the length scale we assumed does not perfectly match the length scale of the true function. But that’s how it is: in real problems, you won’t know the true function anyway.
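By the way, nothing stops you from drawing such samples manually, the way we did in the last lecture: factor the posterior covariance matrix and transform standard normal draws. Here is a minimal sketch (the variable names are ours):

# Manual posterior sampling: f = m + L z, where L is a Cholesky factor
# of the posterior covariance and z is a matrix of standard normal draws.
mean = f_star.mean.detach()
cov = f_star.covariance_matrix.detach()
# Add jitter to the diagonal for numerical stability -- the same trick
# GPyTorch applies internally. Increase it if the factorization fails.
jitter = 1e-5 * torch.eye(cov.shape[0])
L = torch.linalg.cholesky(cov + jitter)
z = torch.randn(cov.shape[0], 10)
manual_samples = mean[:, None] + L @ z  # shape: (100, 10)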

As always, it is a good idea to arrange your code in functions for reusability:

def plot_1d_regression(
    x_star,
    model,
    ax=None,
    f_true=None,
    num_samples=10
):
    """Plot the posterior predictive.
    
    Arguments
    x_start  --  The test points on which to evaluate.
    model    --  The trained model.
    
    Keyword Arguments
    ax          --  An axes object to write on.
    f_true      --  The true function.
    num_samples --  The number of samples.
    """
    f_star = model(x_star)
    m_star = f_star.mean
    v_star = f_star.variance

    f_lower = (
        m_star - 2.0 * torch.sqrt(v_star)
    )
    f_upper = (
        m_star + 2.0 * torch.sqrt(v_star)
    )
    
    if ax is None:
        fig, ax = plt.subplots()
    
    ax.plot(x_star, m_star.detach(), lw=2, label='$m_n(x)$')
   
    ax.fill_between(
        x_star.flatten().detach(),
        f_lower.flatten().detach(),
        f_upper.flatten().detach(),
        alpha=0.5,
        label=r'$f(\mathbf{x}^*)$ 95% pred.'
    )
    
    if f_true is not None:
        ax.plot(
            x_star,
            f_true(x_star),
            'm-.',
            label='True function'
        )
        
    ax.plot(model.train_inputs[0].flatten().detach(),
            model.train_targets.detach(),
            'kx',
            markersize=10,
            markeredgewidth=2,
            label='Observations'
    )

    if num_samples > 0:
        f_post_samples = f_star.sample(
            sample_shape=torch.Size([num_samples])
        )
        ax.plot(x_star, f_post_samples.T.detach(), 'r', lw=0.5)
        ax.plot([], [], 'r', lw=0.5, label="Posterior samples")
        
    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')

    plt.legend(loc='best', frameon=False)
    sns.despine(trim=True);

Let’s test it:

plot_1d_regression(x_star, model)
[Figure: the output of plot_1d_regression.]
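We can also overlay the true function by passing the optional f_true argument:

plot_1d_regression(x_star, model, f_true=f_true)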

Diagnostics: How do you know if the fit is good?#

To objectively test the resulting model, we need a validation dataset consisting of the inputs:

\[ \mathbf{x}^v_{1:n^v} = \left(\mathbf{x}^v_1,\dots,\mathbf{x}^v_{n^v}\right), \]

and corresponding observed outputs:

\[ \mathbf{y}^v_{1:n^v} = \left(y^v_1,\dots,y^v_{n^v}\right). \]

We will use this validation dataset to define some diagnostics. Let’s do it directly through the 1D example above. First, we generate some validation data:

n_v = 100
# Note: we keep X_v one-dimensional, like the training inputs, so that
# Y_v and the predictive mean have matching shapes below.
X_v = np.random.rand(n_v)
Y_v = f_true(X_v)

Point-predictions#

Point-predictions use only \(m_n\left(\mathbf{x}^v_i\right)\). When the observations are very noisy, they are not particularly useful; our data are noiseless, so let’s see what we get.

The simplest thing we can do is to compare \(y^v_i\) to \(m_n\left(\mathbf{x}^v_i\right)\). We start with the mean square error:

\[ \operatorname{MSE} := \frac{1}{n^v}\sum_{i=1}^{n^v}\left[y^v_i-m_n\left(\mathbf{x}^v_i\right)\right]^2. \]
f_v = model(torch.from_numpy(X_v).float())
m_v = f_v.mean.detach().numpy()
v_v = f_v.variance.detach().numpy()
mse = np.mean((Y_v - m_v) ** 2)
print(f'MSE = {mse:1.2f}')
MSE = 1.25
/opt/homebrew/lib/python3.11/site-packages/gpytorch/distributions/multivariate_normal.py:319: NumericalWarning: Negative variance values detected. This is likely due to numerical instabilities. Rounding negative variances up to 1e-06.
  warnings.warn(

The MSE is not particularly intuitive, though. A more interpretable measure is the coefficient of determination, \(R^2\) (R squared). It is defined as:

\[ R^2 = 1 - \frac{\sum_{i=1}^{n^v}\left[y_i^v - m_n(\mathbf{x}_i^v)\right]^2}{\sum_{i=1}^{n^v}\left[y_i^v-\bar{y}^v\right]^2}, \]

where \(\bar{y}^v\) is the mean of the observed data:

\[ \bar{y}^v = \frac{1}{n^v}\sum_{i=1}^{n^v}y_i^v. \]

The interpretation of \(R^2\) (take it with a grain of salt) is that it gives the fraction of the variance of the data that is explained by the model. A score of \(R^2 = 1\) corresponds to a perfect fit. For our data, we get:

R2 = 1.0 - np.sum((Y_v - m_v) ** 2) / np.sum((Y_v - np.mean(Y_v)) ** 2)
print(f'R2 = {R2:1.2f}')
R2 = -188.47

Finally, for point-predictions, we can simply plot the predictions against the observations:

fig, ax = plt.subplots()
y_range = np.linspace(Y_v.min(), Y_v.max(), 50)
ax.plot(y_range, y_range, 'r', lw=2)
ax.plot(Y_v, m_v, 'bo')
ax.set_xlabel('Observation')
ax.set_ylabel('Prediction')
sns.despine(trim=True);
[Figure: observations vs. predictions; the red line marks perfect agreement.]

Questions#

  • Experiment with different length scales for the kernel. What happens to the posterior mean and the 95% predictive error bar as the length scale increases (decreases)? A sketch to get you started follows this list.

  • Experiment with different kernel variances (the outputscale). The kernel variance specifies our prior variance of the function values. What is its effect?

  • Try some other kernels.
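For the first question, here is a minimal sketch of how you might start (the value 0.3 is just an example):

# Change the length scale and redraw the posterior predictive.
model.covar_module.base_kernel.lengthscale = torch.tensor(0.3)
plot_1d_regression(x_star, model, f_true=f_true)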