import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
sns.set_context("paper")
sns.set_style("ticks")

import torch
import gpytorch
from gpytorch.kernels import RBFKernel, ScaleKernel


class ExactGP(gpytorch.models.ExactGP):
    def __init__(self,
                 train_x,
                 train_y,
                 likelihood=None,
                 mean_module=None,
                 covar_module=None
        ):
        # Build fresh modules per instance instead of relying on mutable
        # default arguments (which would be shared across models).
        if likelihood is None:
            likelihood = gpytorch.likelihoods.GaussianLikelihood(
                noise_constraint=gpytorch.constraints.GreaterThan(0.0)
            )
        if mean_module is None:
            mean_module = gpytorch.means.ConstantMean()
        if covar_module is None:
            covar_module = ScaleKernel(RBFKernel())
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = mean_module
        self.covar_module = covar_module

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


def plot_1d_regression(
    x_star,
    model,
    ax=None,
    f_true=None,
    num_samples=10
):
    """Plot the posterior predictive.
    
    Arguments
    x_start  --  The test points on which to evaluate.
    model    --  The trained model.
    
    Keyword Arguments
    ax          --  An axes object to write on.
    f_true      --  The true function.
    num_samples --  The number of samples.
    """
    f_star = model(x_star)
    m_star = f_star.mean
    v_star = f_star.variance
    y_star = model.likelihood(f_star)
    yv_star = y_star.variance

    f_lower = (
        m_star - 2.0 * torch.sqrt(v_star)
    )
    f_upper = (
        m_star + 2.0 * torch.sqrt(v_star)
    )
    
    y_lower = m_star - 2.0 * torch.sqrt(yv_star)
    y_upper = m_star + 2.0 * torch.sqrt(yv_star)

    if ax is None:
        fig, ax = plt.subplots()
    
    ax.plot(model.train_inputs[0].flatten().detach(),
            model.train_targets.detach(),
            'kx',
            markersize=10,
            markeredgewidth=2,
            label='Observations'
    )

    ax.plot(
        x_star,
        m_star.detach(),
        lw=2,
        label='$m_n(x)$',
        color=sns.color_palette()[0]
    )
    
    ax.fill_between(
        x_star.flatten().detach(),
        f_lower.flatten().detach(),
        f_upper.flatten().detach(),
        alpha=0.5,
        label=r'$f(\mathbf{x}^*)$ 95% pred.',
        color=sns.color_palette()[0]
    )

    ax.fill_between(
        x_star.detach().flatten(),
        y_lower.detach().flatten(),
        f_lower.detach().flatten(),
        color=sns.color_palette()[1],
        alpha=0.5,
        label='$y^*$ 95% pred.'
    )
    ax.fill_between(
        x_star.detach().flatten(),
        f_upper.detach().flatten(),
        y_upper.detach().flatten(),
        color=sns.color_palette()[1],
        alpha=0.5,
        label=None
    )

    
    if f_true is not None:
        ax.plot(
            x_star,
            f_true(x_star),
            'm-.',
            label='True function'
        )
        
    if num_samples > 0:
        f_post_samples = f_star.sample(
            sample_shape=torch.Size([num_samples])
        )
        ax.plot(
            x_star.numpy(),
            f_post_samples.T.detach().numpy(),
            color="red",
            lw=0.5
        )
        # This is just to add the legend entry
        ax.plot(
            [],
            [],
            color="red",
            lw=0.5,
            label="Posterior samples"
        )
        
    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')

    plt.legend(loc='best', frameon=False)
    sns.despine(trim=True)
    
    return m_star, v_star


def train(model, train_x, train_y, n_iter=10, lr=0.1):
    """Train the model.

    Arguments
    model   --  The model to train.
    train_x --  The training inputs.
    train_y --  The training labels.
    n_iter  --  The number of iterations.
    lr      --  The learning rate.
    """
    model.train()
    optimizer = torch.optim.LBFGS(
        model.parameters(),
        lr=lr,
        line_search_fn='strong_wolfe'
    )
    likelihood = model.likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
    def closure():
        optimizer.zero_grad()
        output = model(train_x)
        loss = -mll(output, train_y)
        loss.backward()
        return loss
    for i in range(n_iter):
        loss = optimizer.step(closure)
        if (i + 1) % 1 == 0:
            print(f'Iter {i + 1:3d}/{n_iter} - Loss: {loss.item():.3f}')
    model.eval()


def plot_iaf(
    x_star,
    gpr,
    alpha,
    alpha_params={},
    ax=None,
    f_true=None,
    iaf_label="Information Acquisition Function"
):
    """Plot the information acquisition function.
    
    Arguments
    x_star       -- A set of points to plot on.
    gpr          -- A rained Gaussian process regression
                    object.
    alpha        -- The information acquisition function.
                    This assumed to be a function of the
                    posterior mean and standard deviation.
              
    Keyword Arguments
    ax           -- An axes object to plot on.
    f_true       -- The true function - if available.
    alpha_params -- Extra parameters to the information
                    acquisition function.
    ax           -- An axes object to plot on.
    f_true       -- The true function - if available.
    iaf_label    -- The label for the information acquisition
                    function. Default is "Information Acquisition".
    
    The evaluation of the information acquisition function
    is as follows:
    
        af_values = alpha(mu, sigma, y_max, **alpha_params)

    """
    if ax is None:
        fig, ax = plt.subplots()
    
    ax.set_title(
        ", ".join(
            f"{n}={k:.2f}"
            for n, k in alpha_params.items()
            )
    )
    
    m, v = plot_1d_regression(
        x_star,
        gpr,
        ax=ax,
        f_true=f_true,
        num_samples=0
    )
    
    sigma = torch.sqrt(v)
    af_values = alpha(m, sigma, gpr.train_targets.numpy().max(), **alpha_params)
    next_id = torch.argmax(af_values)
    next_x = x_star[next_id]
    af_max = af_values[next_id]
    
    ax2 = ax.twinx()
    ax2.plot(x_star, af_values.detach(), color=sns.color_palette()[1])
    ax2.set_ylabel(
        iaf_label,
        color=sns.color_palette()[1]
    )
    plt.setp(
        ax2.get_yticklabels(),
        color=sns.color_palette()[1]
    )
    ax2.plot(
        next_x * np.ones(100),
        torch.linspace(0, af_max.item(), 100),
        color=sns.color_palette()[1],
        linewidth=1
    )


def maximize(
    f,
    model,
    X_design,
    alpha,
    alpha_params={},
    max_it=10,
    optimize=False,
    plot=False,
    **kwargs
):
    """Optimize a function using a limited number of evaluations.
    
    Arguments
    f            -- The function to optimize.
    gpr          -- A Gaussian process model to use for representing
                    our state of knowledge.
    X_design     -- The set of candidate points for identifying the
                    maximum.
    alpha        -- The information acquisition function.
                    This assumed to be a function of the
                    posterior mean and standard deviation.
    
    Keyword Arguments
    alpha_params -- Extra parameters to the information
                    acquisition function.
    max_it       -- The maximum number of iterations.
    optimize     -- Whether or not to optimize the hyper-parameters.
    plot         -- Determines how often to plot. Make it one
                    to plot at each iteration. Make it max_it
                    to plot at the last iteration.
                    
    The rest of the keyword arguments are passed to plot_iaf().
    """
    af_all = []
    for count in range(max_it):
        # Predict
        f_design = model(X_design)
        m = f_design.mean
        sigma2 = f_design.variance
        sigma = torch.sqrt(sigma2)
        
        # Evaluate information acquisition function
        y_train = model.train_targets.numpy()
        af_values = alpha(
            m,
            sigma,
            y_train.max(),
            **alpha_params
        )
        
        # Find best point to include
        i = torch.argmax(af_values)
        af_all.append(af_values[i])
        
        new_x = X_design[i:(i+1)].float()
        new_y = f(new_x)
        train_x = torch.cat([model.train_inputs[0], new_x[:, None]])
        train_y = torch.cat([model.train_targets, new_y])
        model.set_train_data(train_x, train_y, strict=False)
        
        if optimize:
            train(model, train_x, train_y, n_iter=100, lr=0.1)
        else:
            model.train()
            model.eval()
        
        # Plot if required
        if plot and count % plot == 0:
            if "ax" in kwargs:
                ax = kwargs["ax"]
            else:
                fig, ax = plt.subplots()
            plot_iaf(
                X_design,
                model,
                alpha,
                alpha_params=alpha_params,
                f_true=f,
                ax=ax,
                **kwargs
            )
            ax.set_title(
                f"N={count}, " + ax.get_title()
            )
    return af_all


def mui(m, sigma, ymax, psi=1.96):
    """The maximum upper interval acquisition function."""
    return m + psi * sigma


def poi(m, sigma, ymax, psi=0.):
    """Return the probability of improvement.
    
    Arguments
    m     -- The predictive mean at the test points.
    sigma -- The predictive standard deviation at
             the test points.
    ymax  -- The maximum observed value (so far).
    psi   -- A parameter that controls exploration.
    """
    return torch.distributions.Normal(0, 1).cdf((m - ymax - psi) / sigma)


def ei(m, sigma, ymax):
    """Return the expected improvement.
    
    Arguments
    m     -- The predictive mean at the test points.
    sigma -- The predictive standard deviation at
             the test points.
    ymax  -- The maximum observed value (so far).
    """
    diff = m - ymax
    u = diff / sigma
    ei = ( diff * torch.distributions.Normal(0, 1).cdf(u) + 
          sigma * torch.distributions.Normal(0, 1).log_prob(u).exp()
    )
    ei[sigma <= 0.] = 0.
    return ei
!pip install gpytorch

Expected Improvement - With Observation Noise#

We develop intuition about the expected improvement in the presence of observation noise.

Optimizing Noisy Functions#

The optimization of noisy functions is relevant when you are dealing with experimentally measured objectives. In such a scenario, you do not observe \(f(\mathbf{x})\), but a noisy version. Here is a prototypical scenario of an experimentally measured objective. Let \(\xi\) be all the variables that affect the objective and assume that they are distributed in a way not necessarily known to you:

\[ \xi \sim p(\xi). \]

Then you set up your experiment using a design \(\mathbf{x}\) and measure:

\[ y = g(\mathbf{x}, \xi). \]

Let’s assume now that you would like to maximize the expectation of this function, i.e., you want to maximize

\[ f(\mathbf{x}) = \mathbb{E}_\xi[g(\mathbf{x},\xi)]. \]

The expectation here is over the experimental noise.

A naïve way of solving this problem is to approximate the expectation using sample averaging. At each \(\mathbf{x}\), you do many experiments instead of just one. If your experiments are not very expensive, you may do that, and then any of the algorithms above will work for your problem.
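If the experiments are cheap, this averaging is a one-liner. Here is a minimal sketch, with a made-up noisy simulator g_toy and a hypothetical helper f_hat standing in for the real experiment:

import numpy as np

rng = np.random.default_rng(0)

def g_toy(x, sigma=0.4):
    """A made-up noisy objective: a latent function plus Gaussian noise."""
    return np.sin(6 * x) + sigma * rng.standard_normal(np.shape(x))

def f_hat(x, g, num_replicates=100):
    """Approximate f(x) = E_xi[g(x, xi)] by averaging repeated noisy evaluations.

    The Monte Carlo error of the average decays like sigma / sqrt(num_replicates),
    so this only pays off if many replicates per design are affordable.
    """
    samples = np.array([g(x) for _ in range(num_replicates)])
    return samples.mean(axis=0)

print(f_hat(np.array([0.1, 0.5]), g_toy))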

So, what do you do when you cannot eliminate the noise? Well, this is an open problem. But here is a quick and dirty solution that may work in many cases. First, use GPR to approximate \(f(\mathbf{x})\) using noisy measurements:

\[ y_i = g(\mathbf{x}_i,\xi_i), \]

for \(i=1,\dots,n\). You do not necessarily have to observe the \(\xi\)’s; you can assume that they are hidden. (If you do observe them, you can exploit this fact, but for now let’s assume that you don’t.) Since you don’t observe them, you need to model their effect. The most straightforward thing to do is to assume that their effect is additive, zero-mean, and Gaussian. That is, we assume that:

\[ y_i = f(\mathbf{x}_i) + \epsilon_i, \]

where \(\epsilon_i \sim \mathcal{N}(0,\sigma^2)\) and \(\sigma^2\) is to be determined. When we do this, we can use GPR to find our posterior state of knowledge about \(f(\cdot)\) (and any hyperparameters). We will just be using a MAP estimate for the hyperparameters.

The posterior GP conditioned on the observed data is given by the usual formulas:

\[ f(\cdot)|\mathcal{D}_n \sim \operatorname{GP}(m_n(\cdot), k_n(\cdot,\cdot)). \]

Since we will be doing sequential experiment design, we will also need the point-predictive distribution for the measurement \(y\) at a hypothetical \(\mathbf{x}\). It is:

\[ p(y|\mathbf{x},\mathcal{D}_n) = \mathcal{N}(y|m_n(\mathbf{x}), \sigma_n^2(\mathbf{x}) + \sigma^2), \]

where \(\sigma_n^2(\mathbf{x}) = k_n(\mathbf{x},\mathbf{x})\), i.e., the posterior variance for \(f(\mathbf{x})\). We now have all the ingredients to modify the information acquisition functions above for the noisy cases.

np.random.seed(123456)


def f(x):
    """A function to optimize."""
    return -4 * (1. - np.sin(6 * x + 8 * np.exp(6 * x - 7.))) 

# noisy version of the above function
sigma_noise = 0.4
g = lambda x: (
    f(x)
    + sigma_noise * np.random.randn(x.shape[0])
)

# Generate some data:
n_init = 4
X = np.random.rand(n_init)
Y = g(X)

# Do GPR
train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y)

model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 0.4
model.eval()

x = torch.linspace(0, 1, 100)
plot_1d_regression(
    x,
    model,
    f_true=f
);
[Figure: the posterior GP fit to the initial noisy observations, showing the posterior mean, 95% intervals for \(f(\mathbf{x}^*)\) and \(y^*\), posterior samples, and the true function.]
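In GPyTorch, the two variances in the point-predictive formula above correspond to the distribution returned by the model (the latent \(f\)) and the distribution returned by the likelihood (the measured \(y\)). A quick sanity check on the model we just built (a sketch):

# A sketch: compare the latent variance sigma_n^2(x) with the predictive
# variance sigma_n^2(x) + sigma^2 for the model fitted above.
with torch.no_grad():
    f_pred = model(x)                   # p(f(x) | D_n) = N(m_n(x), sigma_n^2(x))
    y_pred = model.likelihood(f_pred)   # p(y | x, D_n) = N(m_n(x), sigma_n^2(x) + sigma^2)

print((y_pred.variance - f_pred.variance)[:5])  # roughly the noise variance sigma^2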

Maximum upper interval with noise#

The maximum upper interval remains the same:

\[ a_n(\mathbf{x}) = \mu_n(\mathbf{x}) + \psi \sigma_n(\mathbf{x}), \]

for some \(\psi \ge 0\). Just make sure you do not add the \(\sigma^2\) term to the variance; use the posterior standard deviation of \(f(\mathbf{x})\), not of \(y\). Here it is:

plot_iaf(
    x,
    model,
    mui,
    alpha_params=dict(psi=1.96)
)
[Figure: the posterior GP together with the maximum upper interval acquisition function.]
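If you want the acquisition values themselves rather than the plot, you can evaluate mui directly on the latent posterior. This is just a sketch of what plot_iaf does internally; note that the variance comes from the model, not the likelihood:

# A sketch: evaluate the MUI acquisition function on the grid by hand,
# using the latent (noise-free) standard deviation sigma_n(x).
with torch.no_grad():
    f_pred = model(x)
    mui_values = mui(
        f_pred.mean,
        f_pred.variance.sqrt(),
        model.train_targets.max(),   # unused by mui, kept for a uniform signature
        psi=1.96
    )

print(x[torch.argmax(mui_values)])  # the candidate with the largest upper bound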

Probability of improvement with noise#

We need to make some modifications to the probability of improvement. First, because the observations are noisy, we cannot use \(y_n^*\), the maximum of the \(y_i\)’s. Instead, we look at the predictive mean of the GP, \(m_n(\mathbf{x}_i)\), at the corresponding inputs \(\mathbf{x}_i\), and we find their maximum. So, define:

\[ m_n^* = \max_{1\le i\le n}m_n(\mathbf{x}_i). \]

Instead of finding the maximum of the noisy observations, we smooth them with the predictive mean of the GP and take the maximum of the smoothed values. The rest is similar. The acquisition function is defined by:

\[ a_n(\mathbf{x}) = \mathbb{P}[f(\mathbf{x}) > m_n^* + \psi | \mathbf{x}, \mathcal{D}_n]. \]

We read this as follows: \(a_n(\mathbf{x})\) is the probability that the latent function value at \(\mathbf{x}\) exceeds the smoothed maximum \(m_n^*\) by at least \(\psi \ge 0\). The good thing is that it is possible to get an analytical answer because our posterior over \(f(\mathbf{x})\) is Gaussian. In particular, we get:

\[\begin{split} \begin{align} a_n(\mathbf{x}) &=& \mathbb{P}[f(\mathbf{x}) > m_n^* + \psi | \mathbf{x}, \mathcal{D}_n]\\ &=& \mathbb{P}\left[\frac{f(\mathbf{x}) - \mu_n(\mathbf{x})}{\sigma_n(\mathbf{x})} > \frac{m_n^* + \psi - \mu_n(\mathbf{x})}{\sigma_n(\mathbf{x})} \Big| \mathbf{x}, \mathcal{D}_n\right]\\ &=& \dots\\ &=& \Phi\left(\frac{\mu_n(\mathbf{x}) - m_n^* - \psi}{\sigma_n(\mathbf{x})} \right). \end{align} \end{split}\]
plot_iaf(
    x,
    model,
    poi,
    alpha_params=dict(psi=0.1)
)
[Figure: the posterior GP together with the probability of improvement.]
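Note that plot_iaf above passes the maximum of the noisy observations to poi. To use \(m_n^*\) as defined in this section, you can compute it yourself by evaluating the posterior mean at the training inputs. A sketch, with a hypothetical helper smoothed_max:

# A sketch (hypothetical helper): m_n^* = max_i m_n(x_i), the maximum of the
# posterior mean evaluated at the training inputs. GPyTorch may warn that the
# test inputs match the training data; that is expected here.
def smoothed_max(model):
    with torch.no_grad():
        return model(model.train_inputs[0]).mean.max()

with torch.no_grad():
    f_pred = model(x)
    poi_values = poi(
        f_pred.mean,
        f_pred.variance.sqrt(),
        smoothed_max(model),
        psi=0.1
    )

print(x[torch.argmax(poi_values)])  # candidate suggested by the noisy-case POI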

Expected improvement with noise#

The argument here is as in the previous section about the probability of improvement. Consider a hypothetical experiment at \(\mathbf{x}\). How much improvement does the latent value \(f(\mathbf{x})\) offer over the current smoothed maximum \(m_n^*\)? It is:

\[\begin{split} I_n(\mathbf{x}, f(\mathbf{x})) = \begin{cases} 0,&\;\text{if}\;f(\mathbf{x}) \le m_n^*,\\ f(\mathbf{x}) - m_n^*,&\;\text{otherwise}, \end{cases} \end{split}\]

Of course, \(f(\mathbf{x})\) is a random variable conditioned on the data \(\mathcal{D}_n\). Taking the expectation of the improvement over it gives:

\[ \operatorname{EI}_n(\mathbf{x}) = \left(m_n(\mathbf{x}) - m_n^*\right)\Phi\left(\frac{m_n(\mathbf{x}) - m_n^*}{\sigma_n(\mathbf{x})}\right) + \sigma_n(\mathbf{x})\phi\left(\frac{m_n(\mathbf{x}) - m_n^*}{\sigma_n(\mathbf{x})}\right). \]
plot_iaf(
    x,
    model,
    ei
)
[Figure: the posterior GP together with the expected improvement.]
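The same substitution works for the expected improvement: replace the maximum of the noisy observations with \(m_n^*\). A sketch, reusing the hypothetical smoothed_max helper from above:

# A sketch: expected improvement with m_n^* instead of the noisy maximum.
with torch.no_grad():
    f_pred = model(x)
    ei_values = ei(
        f_pred.mean,
        f_pred.variance.sqrt(),
        smoothed_max(model)
    )

print(x[torch.argmax(ei_values)])  # candidate suggested by the noisy-case EI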

And here is the optimization with noise:

# Run the algorithm
af_all = maximize(
    f,
    model,
    x,
    ei,
    max_it=20,
    plot=1
)
[Figures: the posterior GP and the expected improvement at each of the 20 iterations.]

Questions#

  • Rerun the algorithm above starting with 3 observations. Does the algorithm find the same local maximum? If not, why?

  • Experiment with smaller noise.

  • Experiment with larger noise.