Quantifying Epistemic Uncertainty about the Solution of the Optimization problem

Contents

Hide code cell source
MAKE_BOOK_FIGURES=Trueimport numpy as npimport scipy.stats as stimport matplotlib as mplimport matplotlib.pyplot as plt%matplotlib inlineimport matplotlib_inlinematplotlib_inline.backend_inline.set_matplotlib_formats('svg')import seaborn as snssns.set_context("paper")sns.set_style("ticks")def set_book_style():    plt.style.use('seaborn-v0_8-white')     sns.set_style("ticks")    sns.set_palette("deep")    mpl.rcParams.update({        # Font settings        'font.family': 'serif',  # For academic publishing        'font.size': 8,  # As requested, 10pt font        'axes.labelsize': 8,        'axes.titlesize': 8,        'xtick.labelsize': 7,  # Slightly smaller for better readability        'ytick.labelsize': 7,        'legend.fontsize': 7,                # Line and marker settings for consistency        'axes.linewidth': 0.5,        'grid.linewidth': 0.5,        'lines.linewidth': 1.0,        'lines.markersize': 4,                # Layout to prevent clipped labels        'figure.constrained_layout.use': True,                # Default DPI (will override when saving)        'figure.dpi': 600,        'savefig.dpi': 600,                # Despine - remove top and right spines        'axes.spines.top': False,        'axes.spines.right': False,                # Remove legend frame        'legend.frameon': False,                # Additional trim settings        'figure.autolayout': True,  # Alternative to constrained_layout        'savefig.bbox': 'tight',    # Trim when saving        'savefig.pad_inches': 0.1   # Small padding to ensure nothing gets cut off    })def set_notebook_style():    plt.style.use('seaborn-v0_8-white')    sns.set_style("ticks")    sns.set_palette("deep")    mpl.rcParams.update({        # Font settings - using default sizes        'font.family': 'serif',        'axes.labelsize': 10,        'axes.titlesize': 10,        'xtick.labelsize': 9,        'ytick.labelsize': 9,        'legend.fontsize': 9,                # Line and marker settings        'axes.linewidth': 0.5,        'grid.linewidth': 0.5,        'lines.linewidth': 1.0,        'lines.markersize': 4,                # Layout settings        'figure.constrained_layout.use': True,                # Remove only top and right spines        'axes.spines.top': False,        'axes.spines.right': False,                # Remove legend frame        'legend.frameon': False,                # Additional settings        'figure.autolayout': True,        'savefig.bbox': 'tight',        'savefig.pad_inches': 0.1    })def save_for_book(fig, filename, is_vector=True, **kwargs):    """    Save a figure with book-optimized settings.        Parameters:    -----------    fig : matplotlib figure        The figure to save    filename : str        Filename without extension    is_vector : bool        If True, saves as vector at 1000 dpi. If False, saves as raster at 600 dpi.    **kwargs : dict        Additional kwargs to pass to savefig    """        # Set appropriate DPI and format based on figure type    if is_vector:        dpi = 1000        ext = '.pdf'    else:        dpi = 600        ext = '.tif'        # Save the figure with book settings    fig.savefig(f"{filename}{ext}", dpi=dpi, **kwargs)def make_full_width_fig():    return plt.subplots(figsize=(4.7, 2.9), constrained_layout=True)def make_half_width_fig():    return plt.subplots(figsize=(2.35, 1.45), constrained_layout=True)if MAKE_BOOK_FIGURES:    set_book_style()else:    set_notebook_style()make_full_width_fig = make_full_width_fig if MAKE_BOOK_FIGURES else lambda: plt.subplots()make_half_width_fig = make_half_width_fig if MAKE_BOOK_FIGURES else lambda: plt.subplots()
Hide code cell source
!pip install gpytorch

Quantifying Epistemic Uncertainty about the Solution of the Optimization problem#

We wish to quantify the epistemic uncertainty in the solution of an optimization problem.

Let’s start by recreating our working example:

def f(x):
    """A function to optimize."""
    return -4 * (1. - np.sin(6 * x + 8 * np.exp(6 * x - 7.))) 

np.random.seed(12345)

n_init = 3

X = np.random.rand(n_init)
Y = f(X)

plt.plot(X, Y, 'kx', markersize=10, markeredgewidth=2)
plt.xlabel('$x$')
plt.ylabel('$y$');
../_images/f3c2f8d474c56aeb264ea23278a3ac581541ca64fe85190a6069d329f678dc11.svg

Let’s fit the usual GP:

train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y)

model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 0.4
model.eval()

x = torch.linspace(0, 1, 100)
plot_1d_regression(
    x,
    model,
    f_true=f
);
/Users/ibilion/.pyenv/versions/3.11.6/lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-06 to the diagonal
  warnings.warn(
/Users/ibilion/.pyenv/versions/3.11.6/lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-05 to the diagonal
  warnings.warn(
../_images/c64bc5ea40ccec79ca8e7b83af94509c2e82e5660a7a90ae1764befb87e2b0d1.svg

Imagine that you have observed data \(\mathcal{D}_n\). How certain are you about the location of the maximum? If \(n\) is small, you can be more confident. How do you quantify this epistemic uncertainty? Notice that the maximum and the location of the maximum are operators acting on \(f\):

\[ f^* := \max[f] := \max_{\mathbf{x}} f(\mathbf{x}), \]

and

\[ \mathbf{x}^* := \mathbf{X}^*[f] := \arg\max_{\mathbf{x}} f(\mathbf{x}), \]

respectively. So since we are uncertain about \(f\), we will be unsure about \(f^*\) and \(\mathbf{x}^*\). In particular, we want to quantify the joint probability density \(p(\mathbf{x}^*, f^*|\mathcal{D}_n)\). Here is what is the formal answer:

\[ p(\mathbf{x}^*, f^*|\mathcal{D}_n) = \int \delta(\mathbf{x}^* - \mathbf{X}^*[f])\delta(f^*-\max[f])p(f(\cdot)|\mathcal{D}_n)df(\cdot). \]

Of course, this is not technically correct because you cannot integrate over a function this way. The correct way to write this mathematically is to use conditional expectations:

\[ p(\mathbf{x}^*, f^*|\mathcal{D}_n) = \mathbb{E}\left[\delta(\mathbf{x}^* - \mathbf{X}^*[f])\delta(f^*-\max[f])|\mathcal{D}_n\right], \]

where the expectation is taken over \(f(\cdot)\) conditional on \(\mathcal{D}_n\). In any case, there are two questions:

  • What does this mean?

  • How do you compute it?

First, what does it mean? To understand this, you need to pay attention to the delta function. Take for example \(\delta(f^* - \max[f])\). What does it do? It just hits a counter whenever \(\max[f]\) matches \(f^*\) precisely as you take the expectation over \(f(\cdot)\).

Second, how do you compute it? The simplest way to do this is through sampling. You just sample functions from \(p(f(\cdot)|\mathcal{D}_n)\), and you find their maximum location of the maximum. Of course, you cannot sample a function. You sample the function values at a finite but dense number of input points and find the maximum amongst these points. Once you get these samples, you look at their histogram.

Okay, let’s do it for our working example:

def plot_max_and_argmax(gpr, X_design, n_samples=1000):
    """Plot histograms of the max and argmax of the function represented by the model gpr.
    
    Arguments
    gpr      -- A trained Gaussian process object.
    X_design -- A set of points to evaluate the response on.
    
    Keyword Arguments
    n_samples -- The number of samples to take to make the histograms.
    """
    f_star = gpr(X_design)
    f_samples = f_star.sample(sample_shape=torch.Size([n_samples])).numpy()
    max_f_samples = np.max(f_samples, axis=1)
    x_star_samples = X_design.numpy()[np.argmax(f_samples, axis=1)]
    
    fig, ax = plt.subplots(1,2)
    ax[0].hist(max_f_samples, density=True, alpha=0.25)
    ax[0].set_xlabel('$f^*$')
    ax[0].set_ylabel('$p(f^*|\mathcal{D}_n)$')
    
    ax[1].hist(x_star_samples, density=True, alpha=0.25)
    ax[1].set_xlabel('$x^*$')
    ax[1].set_ylabel('$p(x^*|\mathcal{D}_n)$')
    
    plt.tight_layout()
    sns.despine(trim=True)
    
    return fig, ax

plot_max_and_argmax(model, x);
/Users/ibilion/.pyenv/versions/3.11.6/lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-06 to the diagonal
  warnings.warn(
/Users/ibilion/.pyenv/versions/3.11.6/lib/python3.11/site-packages/linear_operator/utils/cholesky.py:40: NumericalWarning: A not p.d., added jitter of 1.0e-05 to the diagonal
  warnings.warn(
../_images/56c6e062ace9b2cfd16dc697ed6dc27db17cee55135c4d36c4693669c2fb0f1c.svg

Let’s do a few iterations ofour optimization algorithm and repeat that plot.

train_x = torch.from_numpy(X).float()
train_y = torch.from_numpy(Y).float()
model = ExactGP(train_x, train_y)

model.covar_module.base_kernel.lengthscale = 0.15
model.covar_module.outputscale = 4.0
model.likelihood.noise = 0.4
model.eval()

x = torch.linspace(0, 1, 100)
af_all = maximize(
    f,
    model,
    x,
    ei,
    max_it=3,
    plot=1
)
Hide code cell output
../_images/d18e93675f0638b318ef6185af7058ed9dea4a1293f42d72cd55389fbcffea25.svg ../_images/622cfe07aca5ac3c25cb45e2aa0e88723362074091122184dca0ce57629b2705.svg ../_images/a0eb5777b87cbc839e7d4ebd9ad05f058f45d33d7beb3ee537475f33ae7502f5.svg

Here it is again:

plot_max_and_argmax(gpr, x_star);
../_images/bdf49d441c15d02cd5bab3d6266f6f03242378385a2d2b61f13c282ad49f61ab.png

Questions#

  • How does the epistemic uncertainty about the optimization problem change when you decrease the number of initial samples?

  • Try changing the number of initial samples to a very small number. Does the algorithm work?