import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
sns.set_context("paper")
sns.set_style("ticks")

import torch
import gpytorch
from gpytorch.kernels import RBFKernel, ScaleKernel

class ExactGP(gpytorch.models.ExactGP):
    def __init__(
        self,
        train_x,
        train_y,
        likelihood=gpytorch.likelihoods.GaussianLikelihood(
            noise_constraint=gpytorch.constraints.GreaterThan(0.0)
        ),
        mean_module=gpytorch.means.ConstantMean(),
        covar_module=ScaleKernel(RBFKernel())
    ):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = mean_module
        self.covar_module = covar_module

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


def plot_1d_regression(
    x_star,
    model,
    ax=None,
    f_true=None,
    num_samples=10
):
    """Plot the posterior predictive.
    
    Arguments
    x_start  --  The test points on which to evaluate.
    model    --  The trained model.
    
    Keyword Arguments
    ax          --  An axes object to write on.
    f_true      --  The true function.
    num_samples --  The number of samples.
    """
    f_star = model(x_star)
    m_star = f_star.mean
    v_star = f_star.variance
    y_star = model.likelihood(f_star)
    yv_star = y_star.variance

    f_lower = (
        m_star - 2.0 * torch.sqrt(v_star)
    )
    f_upper = (
        m_star + 2.0 * torch.sqrt(v_star)
    )
    
    y_lower = m_star - 2.0 * torch.sqrt(yv_star)
    y_upper = m_star + 2.0 * torch.sqrt(yv_star)

    if ax is None:
        fig, ax = plt.subplots()
    
    ax.plot(model.train_inputs[0].flatten().detach(),
            model.train_targets.detach(),
            'kx',
            markersize=10,
            markeredgewidth=2,
            label='Observations'
    )

    ax.plot(
        x_star,
        m_star.detach(),
        lw=2,
        label='$m_n(x)$',
        color=sns.color_palette()[0]
    )
    
    ax.fill_between(
        x_star.flatten().detach(),
        f_lower.flatten().detach(),
        f_upper.flatten().detach(),
        alpha=0.5,
        label=r'$f(\mathbf{x}^*)$ 95% pred.',
        color=sns.color_palette()[0]
    )

    ax.fill_between(
        x_star.detach().flatten(),
        y_lower.detach().flatten(),
        f_lower.detach().flatten(),
        color=sns.color_palette()[1],
        alpha=0.5,
        label='$y^*$ 95% pred.'
    )
    ax.fill_between(
        x_star.detach().flatten(),
        f_upper.detach().flatten(),
        y_upper.detach().flatten(),
        color=sns.color_palette()[1],
        alpha=0.5,
        label=None
    )

    
    if f_true is not None:
        ax.plot(
            x_star,
            f_true(x_star),
            'm-.',
            label='True function'
        )
        
    if num_samples > 0:
        f_post_samples = f_star.sample(
            sample_shape=torch.Size([num_samples])
        )
        ax.plot(
            x_star.numpy(),
            f_post_samples.T.detach().numpy(),
            color="red",
            lw=0.5
        )
        # This is just to add the legend entry
        ax.plot(
            [],
            [],
            color="red",
            lw=0.5,
            label="Posterior samples"
        )
        
    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')

    plt.legend(loc='best', frameon=False)
    sns.despine(trim=True);

def train(model, train_x, train_y, n_iter=10, lr=0.1):
    """Train the model.

    Arguments
    model   --  The model to train.
    train_x --  The training inputs.
    train_y --  The training labels.
    n_iter  --  The number of iterations.
    """
    model.train()
    optimizer = torch.optim.LBFGS(model.parameters(), line_search_fn='strong_wolfe')
    likelihood = model.likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    def closure():
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        return loss
    for i in range(n_iter):
        loss = optimizer.step(closure)
        print(f'Iter {i + 1:3d}/{n_iter} - Loss: {loss.item():.3f}')
    model.eval()
!pip install gpytorch

Maximum Mean - A Bad Information Acquisition Function

A naive information acquisition function is to select the arm with the highest posterior mean. This is a bad idea, and you should never do it. We will show why.

Working 1D Example

It is easier to introduce the ideas using an example. Let's work with a synthetic 1D function defined on $[0,1]$:

def f(x):
    """A function to optimize."""
    return -4 * (1. - np.sin(6 * x + 8 * np.exp(6 * x - 7.))) 

x = np.linspace(0, 1)
plt.plot(x, f(x), linewidth=2)
plt.xlabel('$x$')
plt.ylabel('$y$')
sns.despine(trim=True);
[Figure: the synthetic function plotted on [0, 1].]

We wish to maximize this function. Let us generate some starting data:

np.random.seed(12345)

n_init = 3

X = np.random.rand(n_init)
Y = f(X)

plt.plot(X, Y, 'kx', markersize=10, markeredgewidth=2)
plt.xlabel('$x$')
plt.ylabel('$y$')
sns.despine(trim=True);
[Figure: the initial observations.]

Assume we do some Bayesian regression using our data so far. Here, we will do GPR, but any Bayesian regression would work. We will not work right now with the full predictive $p(f(\cdot)|\mathcal{D}_{n})$, but with the point-predictive distribution:

$$
p(y|\mathbf{x},\mathcal{D}_{n}) = \mathcal{N}\left(y|m_{n}(\mathbf{x}), \sigma^2_{n}(\mathbf{x})\right),
$$

where $m_{n}(\mathbf{x})$ and $\sigma^2_{n}(\mathbf{x})$ are the predictive mean and variance, respectively.

Here is an example with GPR:

train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y)

# It is not a good idea to train the model when we don't have enough data
# So we fix the hyperparameters to something reasonable
model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 1e-4
model.eval()

plot_1d_regression(
    torch.from_numpy(x).float(),
    model,
    f_true=f
);
[Figure: the GP posterior predictive conditioned on the three initial observations, with the true function overlaid.]
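For reference, here is a minimal sketch of how to read off the point-predictive mean $m_{n}(\mathbf{x})$ and variance $\sigma^2_{n}(\mathbf{x})$ from the fitted model above. The test grid is just the plotting grid x, and the variable names simply mirror the notation.

# Point-predictive mean and variance at the test points.
with torch.no_grad():
    f_pred = model(torch.from_numpy(x).float())  # posterior over the latent f
    y_pred = model.likelihood(f_pred)            # point-predictive distribution of y
    m_n = y_pred.mean                            # m_n(x)
    sigma2_n = y_pred.variance                   # sigma_n^2(x): latent variance plus noise
print(m_n[:5])
print(sigma2_n[:5])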
help(model.set_train_data)
Help on method set_train_data in module gpytorch.models.exact_gp:

set_train_data(inputs=None, targets=None, strict=True) method of __main__.ExactGP instance
    Set training data (does not re-fit model hyper-parameters).
    
    :param torch.Tensor inputs: The new training inputs.
    :param torch.Tensor targets: The new training targets.
    :param bool strict: (default True) If `True`, the new inputs and
        targets must have the same shape, dtype, and device
        as the current inputs and targets. Otherwise, any shape/dtype/device are allowed.
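Here is a minimal usage sketch of set_train_data. The new point x = 0.5 and the names old_x, old_y, new_x, new_y are only for illustration; the original training set is restored at the end so the rest of the notebook is unaffected.

# Remember the current training set so we can restore it afterwards.
old_x = model.train_inputs[0]
old_y = model.train_targets
new_x = torch.tensor([0.5])  # hypothetical new input, just for illustration
new_y = torch.from_numpy(f(new_x.numpy())).float()
model.set_train_data(
    torch.cat([old_x, new_x[:, None]]),
    torch.cat([old_y, new_y]),
    strict=False
)
print(model.train_targets)
# Restore the original training data.
model.set_train_data(old_x, old_y, strict=False)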

Now, the question is this: "Where should we evaluate the function next if our goal is to maximize it?" Let's start with the naive choice of evaluating the function at the point that maximizes the Gaussian process posterior mean, i.e., wherever the thick blue line attains its maximum. In other words, the information acquisition function is the posterior mean of the Gaussian process. Let's see what happens.
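Before wrapping everything in a loop, note that the acquisition rule itself is a single line: pick the candidate with the largest posterior mean. A minimal sketch, using the fitted model and the grid x defined above:

# The "maximum mean" acquisition on a grid of candidate points.
with torch.no_grad():
    m = model(torch.from_numpy(x).float()).mean
i_next = int(torch.argmax(m))
print(f'Naive choice of next evaluation: x = {x[i_next]:.3f}')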

def maximize_naive(f, model, X_design, max_it=6):
    """
    Optimize f using a limited number of evaluations.
    
    Arguments
    f        --  The function to optimize.
    model    --  A Gaussian process model to use for representing our state of knowledge.
    X_design --  The set of candidate points for identifying the maximum.
    max_it   --  The maximum number of iterations.
    """
    for count in range(max_it):
        # Toggling train()/eval() clears GPyTorch's cached prediction strategy,
        # so the posterior below reflects the current training data.
        model.train()
        model.eval()
        # The naive acquisition: pick the candidate with the largest posterior mean.
        m = model(X_design).mean.detach()
        i = np.argmax(m)
        
        new_x = X_design[i:(i+1)].float()
        new_y = f(new_x)
        # Append the new observation (set_train_data does not re-fit hyperparameters).
        train_x = torch.cat([model.train_inputs[0], new_x[:, None]])
        train_y = torch.cat([model.train_targets, new_y])
        model.set_train_data(train_x, train_y, strict=False)
        model.train()
        model.eval()
        
        fig, ax = plt.subplots()
        plot_1d_regression(
            X_design,
            model,
            ax=ax,
            f_true=f,
            num_samples=0
        )
        ax.plot(x[i], f(x[i]), 'go', label='Next observation')
        ax.set_title('BGO iteration #{0:d}'.format(count+1))
        plt.legend(loc='best', frameon=False)
        sns.despine(trim=True);
X_design = torch.from_numpy(x).float()
model.train()
maximize_naive(f, model, X_design, max_it=3)
[Figures: the GP posterior and the chosen next observation at BGO iterations 1-3.]

Observe that the algorithm misses the actual maximum. It gets trapped. This is because using the posterior mean as an acquisition function focuses too much on exploiting the currently available information but fails to explore regions of the input space that we have yet to visit.
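One way to see this numerically is to check the predictive variance at the point that currently maximizes the posterior mean: it sits at (or right next to) an already observed input, where there is nothing left to learn. A quick sketch, assuming the model state left by the cell above:

# At the mean-maximizing candidate the predictive variance is tiny:
# the naive criterion keeps exploiting what it already knows and never explores.
with torch.no_grad():
    pred = model(X_design)
i_best = int(torch.argmax(pred.mean))
print(f'argmax of posterior mean: x = {X_design[i_best].item():.3f}, '
      f'predictive variance there: {pred.variance[i_best].item():.2e}')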

Questions

  • Experiment with different numbers of initial observations. How many must you use for the algorithm to converge to the global maximum? (A starting scaffold is sketched below.)
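One possible scaffold for this experiment is a sketch that simply restarts the procedure above with a different number of initial observations (n_init = 6 is just a starting guess):

# Restart with more initial observations and re-run the naive maximization.
n_init = 6  # try different values here
np.random.seed(12345)
X = np.random.rand(n_init)
Y = f(X)
model = ExactGP(torch.from_numpy(X).float(), torch.from_numpy(Y).float())
# As before, fix the hyperparameters instead of training on very little data.
model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 1e-4
model.eval()
maximize_naive(f, model, X_design, max_it=3)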