MAKE_BOOK_FIGURES = True

import numpy as np
import scipy.stats as st
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
sns.set_context("paper")
sns.set_style("ticks")

def set_book_style():
    plt.style.use('seaborn-v0_8-white')
    sns.set_style("ticks")
    sns.set_palette("deep")
    mpl.rcParams.update({
        # Font settings
        'font.family': 'serif',  # For academic publishing
        'font.size': 8,
        'axes.labelsize': 8,
        'axes.titlesize': 8,
        'xtick.labelsize': 7,  # Slightly smaller for better readability
        'ytick.labelsize': 7,
        'legend.fontsize': 7,
        # Line and marker settings for consistency
        'axes.linewidth': 0.5,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.0,
        'lines.markersize': 4,
        # Layout to prevent clipped labels
        'figure.constrained_layout.use': True,
        # Default DPI (will override when saving)
        'figure.dpi': 600,
        'savefig.dpi': 600,
        # Despine - remove top and right spines
        'axes.spines.top': False,
        'axes.spines.right': False,
        # Remove legend frame
        'legend.frameon': False,
        # Additional trim settings
        'figure.autolayout': True,  # Alternative to constrained_layout
        'savefig.bbox': 'tight',  # Trim when saving
        'savefig.pad_inches': 0.1  # Small padding to ensure nothing gets cut off
    })

def set_notebook_style():
    plt.style.use('seaborn-v0_8-white')
    sns.set_style("ticks")
    sns.set_palette("deep")
    mpl.rcParams.update({
        # Font settings - using default sizes
        'font.family': 'serif',
        'axes.labelsize': 10,
        'axes.titlesize': 10,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
        # Line and marker settings
        'axes.linewidth': 0.5,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.0,
        'lines.markersize': 4,
        # Layout settings
        'figure.constrained_layout.use': True,
        # Remove only top and right spines
        'axes.spines.top': False,
        'axes.spines.right': False,
        # Remove legend frame
        'legend.frameon': False,
        # Additional settings
        'figure.autolayout': True,
        'savefig.bbox': 'tight',
        'savefig.pad_inches': 0.1
    })

def save_for_book(fig, filename, is_vector=True, **kwargs):
    """
    Save a figure with book-optimized settings.

    Parameters:
    -----------
    fig : matplotlib figure
        The figure to save
    filename : str
        Filename without extension
    is_vector : bool
        If True, saves as vector at 1000 dpi. If False, saves as raster at 600 dpi.
    **kwargs : dict
        Additional kwargs to pass to savefig
    """
    # Set appropriate DPI and format based on figure type
    if is_vector:
        dpi = 1000
        ext = '.pdf'
    else:
        dpi = 600
        ext = '.tif'
    # Save the figure with book settings
    fig.savefig(f"{filename}{ext}", dpi=dpi, **kwargs)

def make_full_width_fig():
    return plt.subplots(figsize=(4.7, 2.9), constrained_layout=True)

def make_half_width_fig():
    return plt.subplots(figsize=(2.35, 1.45), constrained_layout=True)

if MAKE_BOOK_FIGURES:
    set_book_style()
else:
    set_notebook_style()

make_full_width_fig = make_full_width_fig if MAKE_BOOK_FIGURES else lambda: plt.subplots()
make_half_width_fig = make_half_width_fig if MAKE_BOOK_FIGURES else lambda: plt.subplots()
!pip install gpytorch
Maximum Mean - A Bad Information Acquisition Function#
A naive information acquisition function is to select the arm with the highest posterior mean. This is a bad idea, and you should never do it. We will show why.
Working 1D Example#
It is easier to introduce the ideas using an example. Let’s work with a synthetic 1D function defined on \([0,1]\):
def f(x):
"""A function to optimize."""
return -4 * (1. - np.sin(6 * x + 8 * np.exp(6 * x - 7.)))
x = np.linspace(0, 1)
plt.plot(x, f(x), linewidth=2)
plt.xlabel('$x$')
plt.ylabel('$y$')
sns.despine(trim=True);
We wish to maximize this function. Let us generate some starting data:
np.random.seed(12345)
n_init = 3
X = np.random.rand(n_init)
Y = f(X)
plt.plot(X, Y, 'kx', markersize=10, markeredgewidth=2)
plt.xlabel('$x$')
plt.ylabel('$y$')
sns.despine(trim=True);
Assume we do some Bayesian regression using our data so far. Here, we will do GPR, but any Bayesian regression would work. We will not work right now with the full predictive \(p(f(\cdot)|\mathcal{D}_{n})\), but with the point-predictive distribution:
\[
p(y|\mathbf{x},\mathcal{D}_{n}) = \mathcal{N}\left(y|m_{n}(\mathbf{x}), \sigma^2_{n}(\mathbf{x})\right),
\]
where \(m_{n}(\mathbf{x})\) and \(\sigma^2_{n}(\mathbf{x})\) are the predictive mean and variance, respectively.
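The code below uses an ExactGP model class and a plot_1d_regression helper that are defined in earlier hands-on activities of this book. If you are running this notebook on its own, here is a minimal sketch of what they could look like (the constant mean, the scaled RBF kernel, the relaxed noise constraint, and the plotting details are our assumptions, not necessarily identical to the originals):

import torch
import gpytorch

class ExactGP(gpytorch.models.ExactGP):
    """A minimal exact GP with a constant mean and a scaled RBF kernel (assumed)."""
    def __init__(self, train_x, train_y, likelihood=None):
        if likelihood is None:
            # Relax the default noise constraint so that very small noise values are allowed
            likelihood = gpytorch.likelihoods.GaussianLikelihood(
                noise_constraint=gpytorch.constraints.GreaterThan(1e-6)
            )
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean = self.mean_module(x)
        covar = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean, covar)

def plot_1d_regression(x_star, model, ax=None, f_true=None, num_samples=0):
    """Plot the posterior mean, a 95% credible interval, and (optionally) posterior samples."""
    with torch.no_grad():
        pred = model.likelihood(model(x_star))
        mean = pred.mean
        lower, upper = pred.confidence_region()
    if ax is None:
        fig, ax = plt.subplots()
    ax.plot(x_star, mean, lw=2, label='Posterior mean')
    ax.fill_between(x_star, lower, upper, alpha=0.25, label='95% credible interval')
    for _ in range(num_samples):
        with torch.no_grad():
            ax.plot(x_star, model(x_star).sample(), 'r', lw=0.5)
    if f_true is not None:
        ax.plot(x_star, f_true(x_star), 'm-.', label='True function')
    ax.plot(model.train_inputs[0].flatten(), model.train_targets, 'kx',
            markersize=10, markeredgewidth=2, label='Observations')
    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')
    ax.legend(loc='best', frameon=False)
    sns.despine(trim=True)
    return ax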
Here is an example with GPR:
train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y)
# It is not a good idea to train the model when we don't have enough data
# So we fix the hyperparameters to something reasonable
model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 1e-4
model.eval()
plot_1d_regression(
torch.from_numpy(x).float(),
model,
f_true=f
);
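To connect this with the point-predictive distribution above, here is how one could query the predictive mean \(m_{n}(\mathbf{x})\) and variance \(\sigma^2_{n}(\mathbf{x})\) at a few test inputs (a minimal sketch; the test points are arbitrary):

# Query the point-predictive distribution at a few test inputs
test_x = torch.linspace(0, 1, 5)
with torch.no_grad():
    pred = model.likelihood(model(test_x))  # includes the (tiny) measurement noise
print("m_n(x):      ", pred.mean)
print("sigma_n^2(x):", pred.variance)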
As we collect new observations, we will need to update the model’s training data. GPyTorch’s set_train_data method does exactly that, and we will use it inside the optimization loop below:

help(model.set_train_data)
Help on method set_train_data in module gpytorch.models.exact_gp:

set_train_data(inputs=None, targets=None, strict=True) method of __main__.ExactGP instance
    Set training data (does not re-fit model hyper-parameters).

    :param torch.Tensor inputs: The new training inputs.
    :param torch.Tensor targets: The new training targets.
    :param bool strict: (default True) If `True`, the new inputs and
        targets must have the same shape, dtype, and device
        as the current inputs and targets. Otherwise, any shape/dtype/device are allowed.
Now, the question is this: “Where should we evaluate the function next if our goal is to maximize the function?” Let’s start with the naive idea of evaluating the function at the point that maximizes the Gaussian process posterior mean, i.e., at the maximum of the thick blue line. In other words, the information acquisition function is the posterior mean of the Gaussian process. Let’s see what happens.
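In symbols, the naive acquisition function is
\[
a_{n}(\mathbf{x}) = m_{n}(\mathbf{x}),
\]
and the next evaluation point is \(\mathbf{x}_{n+1} = \arg\max_{\mathbf{x}} a_{n}(\mathbf{x})\), with the maximization carried out over a finite set of candidate (design) points. The function below implements this strategy.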
def maximize_naive(f, model, X_design, max_it=6):
    """
    Optimize f using a limited number of evaluations.

    Arguments
    f        -- The function to optimize.
    model    -- A Gaussian process model to use for representing our state of knowledge.
    X_design -- The set of candidate points for identifying the maximum.
    max_it   -- The maximum number of iterations.
    """
    for count in range(max_it):
        # Toggling train()/eval() clears the cached predictions so that
        # they reflect the current training data
        model.train()
        model.eval()
        # The naive acquisition function: the posterior mean on the candidate set
        m = model(X_design).mean.detach()
        i = np.argmax(m)
        # Evaluate the function at the point of maximum posterior mean
        new_x = X_design[i:(i+1)].float()
        new_y = f(new_x)
        # Append the new observation to the training data
        train_x = torch.cat([model.train_inputs[0], new_x[:, None]])
        train_y = torch.cat([model.train_targets, new_y])
        model.set_train_data(train_x, train_y, strict=False)
        model.train()
        model.eval()
        # Plot the current state of our knowledge
        fig, ax = plt.subplots()
        plot_1d_regression(
            X_design,
            model,
            ax=ax,
            f_true=f,
            num_samples=0
        )
        # The global grid x coincides with X_design
        ax.plot(x[i], f(x[i]), 'go', label='Next observation')
        ax.set_title('BGO iteration #{0:d}'.format(count + 1))
        plt.legend(loc='best', frameon=False)
        sns.despine(trim=True);
X_design = torch.from_numpy(x).float()
model.train()
maximize_naive(f, model, X_design, max_it=3)
Observe that the algorithm misses the actual maximum. It gets trapped. This is because using the posterior mean as an acquisition function focuses too much on exploiting the currently available information but fails to explore regions of the input space that we have yet to visit.
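One way to see the failure to explore is to look at the predictive standard deviation \(\sigma_{n}(\mathbf{x})\) left over after the naive run: it stays large in the parts of the input space the algorithm never visited. A minimal sketch:

# Plot the leftover predictive uncertainty after the naive run
with torch.no_grad():
    pred = model.likelihood(model(X_design))
fig, ax = plt.subplots()
ax.plot(X_design, pred.variance.sqrt(), lw=2)
ax.set_xlabel('$x$')
ax.set_ylabel(r'$\sigma_n(x)$')
sns.despine(trim=True);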
Questions#
Experiment with different numbers of initial observations. How many must you use for the algorithm to converge to the global maximum?
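Here is a sketch of how you could rerun the experiment with a different number of initial observations (reusing the definitions above):

# Rerun the naive strategy with more initial observations
np.random.seed(12345)
n_init = 5  # try different values here
X = np.random.rand(n_init)
Y = f(X)
model = ExactGP(torch.from_numpy(X).float(), torch.from_numpy(Y).float())
model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 1e-4
model.eval()
maximize_naive(f, model, torch.from_numpy(x).float(), max_it=3)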