Show code cell source
# Toggle read further down: True selects the book figure style, False the notebook style.
MAKE_BOOK_FIGURES=False
import numpy as np
import scipy.stats as st
import matplotlib as mpl
import matplotlib.pyplot as plt
# IPython magic (not plain Python): draw figures inline in the notebook.
%matplotlib inline
import matplotlib_inline
# Request SVG output for inline figures.
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
# Initial seaborn defaults; the style function chosen below sets the "ticks" style again.
sns.set_context("paper")
sns.set_style("ticks")
def set_book_style():
    """Configure matplotlib and seaborn for book (print) figures.

    Applies the 'seaborn-v0_8-white' style sheet, the seaborn "ticks"
    style and the "deep" palette, then overrides a group of rcParams:
    small serif fonts, thin lines, 600 dpi, no top/right spines,
    frameless legends and a tight bounding box when saving.
    """
    plt.style.use('seaborn-v0_8-white')
    sns.set_style("ticks")
    sns.set_palette("deep")

    # Fonts: serif family, 8 pt base size, 7 pt ticks and legend.
    font_rc = {
        'font.family': 'serif',
        'font.size': 8,
        'axes.labelsize': 8,
        'axes.titlesize': 8,
        'xtick.labelsize': 7,
        'ytick.labelsize': 7,
        'legend.fontsize': 7,
    }
    # Lines and markers, kept identical across figures.
    line_rc = {
        'axes.linewidth': 0.5,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.0,
        'lines.markersize': 4,
    }
    # Layout, resolution, spines and legend frame.
    frame_rc = {
        'figure.constrained_layout.use': True,
        'figure.dpi': 600,
        'savefig.dpi': 600,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'legend.frameon': False,
    }
    # NOTE(review): 'figure.autolayout' is switched on together with
    # 'figure.constrained_layout.use' above; confirm which one matplotlib
    # honours when both are set.
    trim_rc = {
        'figure.autolayout': True,
        'savefig.bbox': 'tight',
        'savefig.pad_inches': 0.1,
    }
    mpl.rcParams.update({**font_rc, **line_rc, **frame_rc, **trim_rc})
def set_notebook_style():
    """Configure matplotlib and seaborn for on-screen notebook figures.

    Applies the 'seaborn-v0_8-white' style sheet, the seaborn "ticks"
    style and the "deep" palette, then overrides a group of rcParams:
    serif fonts with 10 pt labels/titles and 9 pt ticks/legend, thin
    lines, no top/right spines, frameless legends and a tight bounding
    box when saving. Figure DPI is left untouched.
    """
    plt.style.use('seaborn-v0_8-white')
    sns.set_style("ticks")
    sns.set_palette("deep")

    # Fonts.
    font_rc = {
        'font.family': 'serif',
        'axes.labelsize': 10,
        'axes.titlesize': 10,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
    }
    # Lines and markers.
    line_rc = {
        'axes.linewidth': 0.5,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.0,
        'lines.markersize': 4,
    }
    # Layout, spines and legend frame.
    frame_rc = {
        'figure.constrained_layout.use': True,
        'axes.spines.top': False,
        'axes.spines.right': False,
        'legend.frameon': False,
    }
    # NOTE(review): 'figure.autolayout' is switched on together with
    # 'figure.constrained_layout.use' above; confirm which one matplotlib
    # honours when both are set.
    trim_rc = {
        'figure.autolayout': True,
        'savefig.bbox': 'tight',
        'savefig.pad_inches': 0.1,
    }
    mpl.rcParams.update({**font_rc, **line_rc, **frame_rc, **trim_rc})
def save_for_book(fig, filename, is_vector=True, **kwargs):
    """
    Save a figure with book-optimized settings.
    Parameters:
    -----------
    fig : matplotlib figure
        The figure to save (anything exposing ``savefig``).
    filename : str
        Filename without extension; '.pdf' or '.tif' is appended.
    is_vector : bool
        If True, saves as '.pdf' with a default of 1000 dpi.
        If False, saves as '.tif' with a default of 600 dpi.
    **kwargs : dict
        Additional kwargs to pass to savefig. An explicit ``dpi`` here
        overrides the default chosen from ``is_vector``.
    """
    # Pick the default resolution and file extension for the figure type.
    if is_vector:
        default_dpi, ext = 1000, '.pdf'
    else:
        default_dpi, ext = 600, '.tif'
    # Let a caller-supplied dpi win. Passing dpi both explicitly and via
    # **kwargs raised "got multiple values for keyword argument 'dpi'".
    kwargs.setdefault('dpi', default_dpi)
    fig.savefig(f"{filename}{ext}", **kwargs)
def make_full_width_fig():
    """Create a 4.7 x 2.9 inch figure with constrained layout.

    Returns the result of ``plt.subplots`` unchanged.
    """
    full_width_size = (4.7, 2.9)
    return plt.subplots(figsize=full_width_size, constrained_layout=True)
def make_half_width_fig():
    """Create a 2.35 x 1.45 inch figure with constrained layout.

    Returns the result of ``plt.subplots`` unchanged.
    """
    half_width_size = (2.35, 1.45)
    return plt.subplots(figsize=half_width_size, constrained_layout=True)
# Apply the style that matches the target medium.
if MAKE_BOOK_FIGURES:
    set_book_style()
else:
    set_notebook_style()
    # Outside book mode the figure factories fall back to a plain
    # plt.subplots() call; in book mode they keep their fixed sizes.
    make_full_width_fig = lambda: plt.subplots()
    make_half_width_fig = lambda: plt.subplots()
Show code cell source
!pip install gpytorch
import torch
import gpytorch
from gpytorch.kernels import ScaleKernel, RBFKernel
class ExactGP(gpytorch.models.ExactGP):
    """Exact Gaussian Process model.

    Arguments
    train_x      -- The training inputs.
    train_y      -- The training labels.
    mean_module  -- The mean module. Defaults to a new constant mean.
    covar_module -- The covariance module. Defaults to a new scaled
                    RBF kernel.
    likelihood   -- The likelihood function. Defaults to a new Gaussian
                    likelihood built with a GreaterThan(0.0) noise
                    constraint.

    The default modules are created inside ``__init__`` so that every
    model owns its own mean, kernel and likelihood. Building them in the
    signature made all default-constructed models share one set of
    modules, so changing a hyperparameter on one model changed it on
    every other.
    """

    def __init__(
        self,
        train_x,
        train_y,
        mean_module=None,
        covar_module=None,
        likelihood=None
    ):
        if likelihood is None:
            likelihood = gpytorch.likelihoods.GaussianLikelihood(
                noise_constraint=gpytorch.constraints.GreaterThan(0.0)
            )
        super().__init__(train_x, train_y, likelihood)
        if mean_module is None:
            mean_module = gpytorch.means.ConstantMean()
        if covar_module is None:
            covar_module = ScaleKernel(RBFKernel())
        self.mean_module = mean_module
        self.covar_module = covar_module

    def forward(self, x):
        """Return the multivariate normal given by the mean and kernel at x."""
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
def plot_1d_regression(
    x_star,
    model,
    ax=None,
    f_true=None,
    num_samples=10
):
    """Plot the posterior predictive.

    Arguments
    x_star      -- The test points on which to evaluate.
    model       -- The trained model.

    Keyword Arguments
    ax          -- An axes object to write on. A new figure is created
                   when it is None.
    f_true      -- The true function. Plotted when given.
    num_samples -- The number of posterior samples to draw and plot.
                   Nothing is sampled when it is not positive.

    Returns the posterior mean and variance of the latent function at
    x_star.
    """
    # Latent-function posterior and the noisy-observation predictive.
    f_star = model(x_star)
    m_star = f_star.mean
    v_star = f_star.variance
    y_star = model.likelihood(f_star)
    yv_star = y_star.variance

    # Two-standard-deviation bands around the posterior mean.
    f_std = torch.sqrt(v_star)
    y_std = torch.sqrt(yv_star)
    f_lower = m_star - 2.0 * f_std
    f_upper = m_star + 2.0 * f_std
    y_lower = m_star - 2.0 * y_std
    y_upper = m_star + 2.0 * y_std

    if ax is None:
        _, ax = plt.subplots()

    ax.plot(
        model.train_inputs[0].flatten().detach(),
        model.train_targets.detach(),
        'kx',
        markersize=10,
        markeredgewidth=2,
        label='Observations'
    )
    ax.plot(
        x_star,
        m_star.detach(),
        lw=2,
        label='$m_n(x)$',
        color=sns.color_palette()[0]
    )
    ax.fill_between(
        x_star.flatten().detach(),
        f_lower.flatten().detach(),
        f_upper.flatten().detach(),
        alpha=0.5,
        label=r'$f(\mathbf{x}^*)$ 95% pred.',
        color=sns.color_palette()[0]
    )
    # The observation band is drawn as two strips outside the latent band.
    ax.fill_between(
        x_star.detach().flatten(),
        y_lower.detach().flatten(),
        f_lower.detach().flatten(),
        color=sns.color_palette()[1],
        alpha=0.5,
        label='$y^*$ 95% pred.'
    )
    ax.fill_between(
        x_star.detach().flatten(),
        f_upper.detach().flatten(),
        y_upper.detach().flatten(),
        color=sns.color_palette()[1],
        alpha=0.5,
        label=None
    )

    if f_true is not None:
        ax.plot(
            x_star,
            f_true(x_star),
            'm-.',
            label='True function'
        )

    if num_samples > 0:
        # Honour num_samples; the sample count used to be fixed at 10.
        f_post_samples = f_star.sample(
            sample_shape=torch.Size([num_samples])
        )
        ax.plot(
            x_star,
            f_post_samples.T.detach(),
            color="red",
            lw=0.5
        )
        # This is just to add the legend entry
        ax.plot(
            [],
            [],
            color="red",
            lw=0.5,
            label="Posterior samples"
        )

    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')
    # Attach the legend to the axes we drew on, not to whichever axes
    # pyplot currently considers active.
    ax.legend(loc='best', frameon=False)
    sns.despine(trim=True)
    return m_star, v_star
def plot_iaf(
    x_star,
    gpr,
    alpha,
    alpha_params=None,
    ax=None,
    f_true=None
):
    """Plot the information acquisition function.

    Arguments
    x_star -- A set of points to plot on.
    gpr    -- A trained Gaussian process regression
              object.
    alpha  -- The information acquisition function.
              This assumed to be a function of the
              posterior mean and standard deviation.

    Keyword Arguments
    alpha_params -- Extra keyword parameters for alpha. They are also
                    shown in the axes title.
    ax           -- An axes object to plot on.
    f_true       -- The true function - if available.

    The evaluation of the information acquisition function
    is as follows:

        af_values = alpha(mu, sigma, y_max, **alpha_params)

    where y_max is the largest training target currently stored in gpr.
    """
    if alpha_params is None:
        alpha_params = {}
    if ax is None:
        _, ax = plt.subplots()

    ax.set_title(
        ", ".join(
            f"{n}={k:.2f}"
            for n, k in alpha_params.items()
        )
    )
    m, v = plot_1d_regression(
        x_star,
        gpr,
        ax=ax,
        f_true=f_true,
        num_samples=0
    )
    sigma = torch.sqrt(v)

    # Take the incumbent from the model's own data. The notebook-level
    # array Y only holds the initial design and goes stale once new
    # observations are added to the model.
    y_max = gpr.train_targets.max()
    af_values = alpha(m, sigma, y_max, **alpha_params)
    next_id = torch.argmax(af_values)
    next_x = x_star[next_id]
    af_max = af_values[next_id]

    # The acquisition function goes on a secondary y-axis.
    ax2 = ax.twinx()
    ax2.plot(x_star, af_values.detach(), color=sns.color_palette()[1])
    # NOTE(review): this label is fixed regardless of which alpha is passed.
    ax2.set_ylabel(
        'Maximum Upper Interval',
        color=sns.color_palette()[1]
    )
    plt.setp(
        ax2.get_yticklabels(),
        color=sns.color_palette()[1]
    )
    # Vertical marker at the maximiser of the acquisition function.
    ax2.plot(
        next_x * np.ones(100),
        torch.linspace(0, af_max.item(), 100),
        color=sns.color_palette()[1],
        linewidth=1
    )
def maximize(
    f,
    model,
    X_design,
    alpha,
    alpha_params=None,
    max_it=10,
    optimize=False,
    plot=False,
    **kwargs
):
    """Optimize a function using a limited number of evaluations.

    Arguments
    f        -- The function to optimize.
    model    -- A Gaussian process model to use for representing
                our state of knowledge. Its training data are
                extended in place.
    X_design -- The set of candidate points for identifying the
                maximum.
    alpha    -- The information acquisition function.
                This assumed to be a function of the
                posterior mean and standard deviation.

    Keyword Arguments
    alpha_params -- Extra parameters to the information
                    acquisition function.
    max_it       -- The maximum number of iterations.
    optimize     -- Whether or not to optimize the hyper-parameters.
    plot         -- Determines how often to plot. Make it one
                    to plot at each iteration. Make it max_it
                    to plot at the last iteration. A falsy value
                    (the default) disables plotting.
    ax           -- Optional axes to draw on (taken from **kwargs).
                    A new figure is created per plot when it is absent.

    Returns the list of the maximum acquisition values, one per iteration.
    """
    if alpha_params is None:
        alpha_params = {}

    af_all = []
    for count in range(max_it):
        # Predict
        f_design = model(X_design)
        m = f_design.mean
        sigma = torch.sqrt(f_design.variance)

        # Evaluate information acquisition function
        af_values = alpha(
            m,
            sigma,
            model.train_targets.max(),
            **alpha_params
        )

        # Find best point to include
        i = torch.argmax(af_values)
        af_all.append(af_values[i])
        new_x = X_design[i:(i+1)].float()
        new_y = f(new_x)
        train_x = torch.cat([model.train_inputs[0], new_x[:, None]])
        train_y = torch.cat([model.train_targets, new_y])
        model.set_train_data(train_x, train_y, strict=False)

        if optimize:
            # NOTE(review): `train` is not defined in the visible part of
            # this file; it must be available before optimize=True is used.
            train(model, train_x, train_y, n_iter=100, lr=0.1)
        else:
            model.train()
        model.eval()

        # Plot if required. Guarding on `plot` first avoids the modulo
        # by zero that the default plot=False used to trigger.
        if plot and count % plot == 0:
            # The key is the string "ax"; the old code indexed kwargs
            # with the not-yet-bound local name ax.
            ax = kwargs.get("ax")
            if ax is None:
                _, ax = plt.subplots()
            plot_iaf(
                X_design,
                model,
                alpha,
                alpha_params=alpha_params,
                f_true=f,
                ax=ax
            )
            ax.set_title(
                f"N={count}, " + ax.get_title()
            )
    return af_all
def poi(m, sigma, ymax, psi=0.):
    """Return the probability of improvement.

    Arguments
    m     -- The predictive mean at the test points.
    sigma -- The predictive standard deviation at
             the test points.
    ymax  -- The maximum observed value (so far).
    psi   -- A parameter that controls exploration; it is subtracted
             from the improvement before standardising.
    """
    standard_normal = torch.distributions.Normal(0, 1)
    z = (m - ymax - psi) / sigma
    return standard_normal.cdf(z)
def mui(m, sigma, ymax, psi=1.96):
    """The maximum upper interval acquisition function.

    Returns the mean plus psi standard deviations. ymax is accepted so
    the signature matches the other acquisition functions; it is not used.
    """
    exploration_bonus = psi * sigma
    return m + exploration_bonus
def ei(m, sigma, ymax):
    """Return the expected improvement.

    Arguments
    m     -- The predictive mean at the test points.
    sigma -- The predictive standard deviation at
             the test points.
    ymax  -- The maximum observed value (so far).

    Entries where sigma is not positive are set to zero.
    """
    standard_normal = torch.distributions.Normal(0, 1)
    improvement = m - ymax
    z = improvement / sigma
    cdf_term = improvement * standard_normal.cdf(z)
    pdf_term = sigma * standard_normal.log_prob(z).exp()
    expected_improvement = cdf_term + pdf_term
    # Zero out points without predictive spread.
    expected_improvement[sigma <= 0.] = 0.
    return expected_improvement
Show code cell output
Requirement already satisfied: gpytorch in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (1.14)
Requirement already satisfied: jaxtyping in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from gpytorch) (0.2.33)
Requirement already satisfied: mpmath<=1.3,>=0.19 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from gpytorch) (1.3.0)
Requirement already satisfied: scikit-learn in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from gpytorch) (1.6.1)
Requirement already satisfied: scipy>=1.6.0 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from gpytorch) (1.12.0)
Requirement already satisfied: linear-operator>=0.6 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from gpytorch) (0.6)
Requirement already satisfied: torch>=2.0 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from linear-operator>=0.6->gpytorch) (2.7.1)
Requirement already satisfied: numpy<1.29.0,>=1.22.4 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from scipy>=1.6.0->gpytorch) (1.26.4)
Requirement already satisfied: filelock in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from torch>=2.0->linear-operator>=0.6->gpytorch) (3.18.0)
Requirement already satisfied: typing-extensions>=4.10.0 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from torch>=2.0->linear-operator>=0.6->gpytorch) (4.12.2)
Requirement already satisfied: setuptools in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from torch>=2.0->linear-operator>=0.6->gpytorch) (73.0.1)
Requirement already satisfied: sympy>=1.13.3 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from torch>=2.0->linear-operator>=0.6->gpytorch) (1.13.3)
Requirement already satisfied: networkx in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from torch>=2.0->linear-operator>=0.6->gpytorch) (3.4.2)
Requirement already satisfied: jinja2 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from torch>=2.0->linear-operator>=0.6->gpytorch) (3.1.4)
Requirement already satisfied: fsspec in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from torch>=2.0->linear-operator>=0.6->gpytorch) (2024.12.0)
Requirement already satisfied: typeguard==2.13.3 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from jaxtyping->gpytorch) (2.13.3)
Requirement already satisfied: MarkupSafe>=2.0 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from jinja2->torch>=2.0->linear-operator>=0.6->gpytorch) (2.1.5)
Requirement already satisfied: joblib>=1.2.0 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from scikit-learn->gpytorch) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages (from scikit-learn->gpytorch) (3.5.0)
Quantifying Epistemic Uncertainty about the Solution of the Optimization problem#
We wish to quantify the epistemic uncertainty in the solution of an optimization problem.
Let’s start by recreating our working example:
def f(x):
    """A function to optimize.

    Evaluates -4 * (1 - sin(6 x + 8 exp(6 x - 7))) with NumPy.
    """
    phase = 6 * x + 8 * np.exp(6 * x - 7.)
    return -4 * (1. - np.sin(phase))
# Fix the NumPy seed so the initial design is reproducible.
np.random.seed(12345)
# Number of initial observations.
n_init = 3
# Draw the initial inputs with np.random.rand and evaluate the objective there.
X = np.random.rand(n_init)
Y = f(X)
# Show the initial observations as black crosses.
plt.plot(X, Y, 'kx', markersize=10, markeredgewidth=2)
plt.xlabel('$x$')
plt.ylabel('$y$');
Let’s fit the usual GP:
# Convert the NumPy observations to float tensors for the GP model.
train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y)
# Hyperparameters are set by hand; no training is run in this cell.
model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 1e-2
# Switch the model to evaluation mode before querying it.
model.eval()
# Grid of 100 test points from 0 to 1.
x = torch.linspace(0, 1, 100)
plot_1d_regression(
x,
model,
f_true=f
);
/Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-06 to the diagonal
warnings.warn(
/Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-05 to the diagonal
warnings.warn(
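Imagine that you have observed data \(\mathcal{D}_n\). How certain are you about the location of the maximum? If \(n\) is small, you cannot be very confident. How do you quantify this epistemic uncertainty? Notice that the maximum and the location of the maximum are operators acting on \(f\):
\[
f^* := \max[f] := \max_{\mathbf{x}} f(\mathbf{x}),
\]
and
\[
\mathbf{x}^* := \arg\max[f] := \arg\max_{\mathbf{x}} f(\mathbf{x}),
\]
respectively. So since we are uncertain about \(f\), we will be unsure about \(f^*\) and \(\mathbf{x}^*\). In particular, we want to quantify the joint probability density \(p(\mathbf{x}^*, f^*|\mathcal{D}_n)\). Here is the formal answer:
\[
p(\mathbf{x}^*, f^*|\mathcal{D}_n) = \int \delta(\mathbf{x}^* - \arg\max[f])\,\delta(f^* - \max[f])\,p(f(\cdot)|\mathcal{D}_n)\,df(\cdot).
\]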
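Of course, this is not technically correct because you cannot integrate over a function this way. The correct way to write this mathematically is to use conditional expectations:
\[
p(\mathbf{x}^*, f^*|\mathcal{D}_n) = \mathbb{E}\left[\delta(\mathbf{x}^* - \arg\max[f])\,\delta(f^* - \max[f])\,\middle|\,\mathcal{D}_n\right],
\]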
where the expectation is taken over \(f(\cdot)\) conditional on \(\mathcal{D}_n\). In any case, there are two questions:
What does this mean?
How do you compute it?
First, what does it mean? To understand this, you need to pay attention to the delta function. Take for example \(\delta(f^* - \max[f])\). What does it do? It just hits a counter whenever \(\max[f]\) matches \(f^*\) precisely as you take the expectation over \(f(\cdot)\).
Second, how do you compute it? The simplest way to do this is through sampling. You just sample functions from \(p(f(\cdot)|\mathcal{D}_n)\), and you find their maximum and the location of the maximum. Of course, you cannot sample a function. You sample the function values at a finite but dense number of input points and find the maximum amongst these points. Once you get these samples, you look at their histogram.
Okay, let’s do it for our working example:
def plot_max_and_argmax(gpr, X_design, n_samples=1000):
    """Plot histograms of the max and argmax of the function represented by the model gpr.

    Arguments
    gpr      -- A trained Gaussian process object.
    X_design -- A set of points to evaluate the response on.

    Keyword Arguments
    n_samples -- The number of samples to take to make the histograms.

    Returns the figure and the pair of axes (max on the left, argmax
    on the right).
    """
    # One row per sampled function, one column per design point.
    posterior = gpr(X_design)
    draws = posterior.sample(sample_shape=torch.Size([n_samples])).numpy()

    # Per-sample maximum and the design point where it occurs.
    max_f_samples = np.max(draws, axis=1)
    best_idx = np.argmax(draws, axis=1)
    x_star_samples = X_design.numpy()[best_idx]

    fig, ax = plt.subplots(1, 2)
    max_axis, argmax_axis = ax[0], ax[1]

    max_axis.hist(max_f_samples, density=True, alpha=0.25)
    max_axis.set_xlabel('$f^*$')
    max_axis.set_ylabel(r'$p(f^*|\mathcal{D}_n)$')

    argmax_axis.hist(x_star_samples, density=True, alpha=0.25)
    argmax_axis.set_xlabel('$x^*$')
    argmax_axis.set_ylabel(r'$p(x^*|\mathcal{D}_n)$')

    plt.tight_layout()
    sns.despine(trim=True)
    return fig, ax
# Histogram the sampled maximum and its location for the current model on the grid x.
plot_max_and_argmax(model, x);
Let’s do a few iterations of our optimization algorithm and repeat that plot.
# Rebuild the model from the initial observations.
train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y)
# Hyperparameters are set by hand; the noise is 0.4 here instead of the 1e-2 used earlier.
model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 0.4
model.eval()
# Candidate points: a grid of 100 values from 0 to 1.
x = torch.linspace(0, 1, 100)
# Run three iterations with expected improvement, plotting at every iteration.
af_all = maximize(
f,
model,
x,
ei,
max_it=3,
plot=1
)
Show code cell output
Here it is again:
# Repeat the max/argmax histograms with the model updated by maximize().
plot_max_and_argmax(model, x);
/Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-06 to the diagonal
warnings.warn(
/Users/ibilion/.pyenv/versions/3.12.5/lib/python3.12/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-05 to the diagonal
warnings.warn(
Questions#
How does the epistemic uncertainty about the optimization problem change when you decrease the number of initial samples?
Try changing the number of initial samples to a very small number. Does the algorithm work?