import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
sns.set_context("paper")
sns.set_style("ticks")

import scipy
import scipy.stats as st
import urllib.request
import os

def download(
    url : str,
    local_filename : str = None
):
    """Download a file from a url.
    
    Arguments
    url            -- The url we want to download.
    local_filename -- The filemame to write on. If not
                      specified 
    """
    if local_filename is None:
        local_filename = os.path.basename(url)
    urllib.request.urlretrieve(url, local_filename)

Decision making#

We demonstrate how to use the results of a binary classifier to make decisions.

High melting explosives sensitivity#

Let’s repeat the HMX example, this time splitting the dataset into training and validation subsets. We will train on the training subset and make predictions on the validation subset.

url = "https://github.com/PredictiveScienceLab/data-analytics-se/raw/master/lecturebook/data/hmx_data.csv"
download(url)

import pandas as pd
data = pd.read_csv('hmx_data.csv')
x = data['Height'].values
label_coding = {'E': 1, 'N': 0}
y = np.array([label_coding[r] for r in data['Result']])
data['y'] = y
data.head()
Height Result y
0 40.5 E 1
1 40.5 E 1
2 40.5 E 1
3 40.5 E 1
4 40.5 E 1

Separate data into training and validation:

from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.33)

num_obs = x.shape[0]

fig, ax = plt.subplots()
ax.plot(x_train, y_train, 'x', label='Training data')
ax.plot(x_valid, y_valid, 'o', label='Validation data')
plt.legend(loc='best', frameon=True)
sns.despine(trim=True);

Train the model:

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

# Design matrix for the training data
poly = PolynomialFeatures(2)
Phi = poly.fit_transform(x_train[:, None])

# Fit the logistic regression model on the training data only
model = LogisticRegression(
    penalty=None,
    fit_intercept=False
).fit(Phi, y_train)

Make probabilistic predictions on the validation data:

Phi_valid = poly.transform(x_valid[:, None])
predictions = model.predict_proba(Phi_valid)
print('x\tp(y=0|x)\tp(y=1|x)\tTrue label')
print('-' * 80)
for i in range(x_valid.shape[0]):
    print(
        f"{x_valid[i]:1.2f}\t"
        + f"{predictions[i, 0]:1.2f}\t\t"
        + f"{predictions[i, 1]:1.2f}\t\t"
        + f"{y_valid[i]:d}"
    )
x	p(y=0|x)	p(y=1|x)	True label
--------------------------------------------------------------------------------
22.50	0.93		0.07		0
28.50	0.75		0.25		0
40.50	0.01		0.99		1
36.00	0.10		0.90		0
25.50	0.88		0.12		0
28.50	0.75		0.25		1
22.50	0.93		0.07		0
36.00	0.10		0.90		1
22.50	0.93		0.07		0
40.50	0.01		0.99		1
40.50	0.01		0.99		1
25.50	0.88		0.12		0
36.00	0.10		0.90		0
28.50	0.75		0.25		0
25.50	0.88		0.12		1
22.50	0.93		0.07		0
22.50	0.93		0.07		0
22.50	0.93		0.07		0
40.50	0.01		0.99		1
40.50	0.01		0.99		1

And here is a nice way to visualize these probabilities:

for i in range(x_valid.shape[0]):
    fig, ax = plt.subplots()
    ax.bar(np.arange(2), predictions[i])
    ax.set_title(f'$x={x_valid[i]:1.2f}$ cm, True result = {y_valid[i]:d}')
    ax.set_ylim([0, 1.0])
    ax.set_xticks([0, 1])
    ax.set_xticklabels(model.classes_)
    ax.set_ylabel('Probability')
    sns.despine(trim=True)

Now we are ready to pose and solve the decision-making problem. We just need to define a cost matrix:

# cost_matrix[i, j] is the cost of picking i when j turns out to be true:
# c_00 = cost of correctly picking 0 when 0 is true
# c_01 = cost of wrongly picking 0 when 1 is true
# c_10 = cost of wrongly picking 1 when 0 is true
# c_11 = cost of correctly picking 1 when 1 is true
cost_matrix = np.array(
    [
        [0.0, 1.0],
        [1.0, 0.0]
    ]
)
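In symbols, if we pick choice $i$ while the true outcome is $j$, which happens with probability $p(y=j|x)$, the expected cost of choice $i$ is

$$
C(i|x) = \sum_j c_{ij}\, p(y=j|x),
$$

and the optimal decision is the choice with the smallest expected cost.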

Here is some code that computes the expected cost of each choice given the predicted probabilities:

def expected_cost(cost_matrix, prediction_prob):
    """Calculate the expected cost of each decision.
    
    Arguments
    cost_matrix     --  A D x D matrix. `cost_matrix[i, j]`
                        is the cost of picking `i` and then
                        `j` happens.
    prediction_prob --  An array with D elements containing
                        the probability that each event
                        happens.
    """
    assert cost_matrix.ndim == 2
    D = cost_matrix.shape[0]
    assert cost_matrix.shape[1] == D
    assert prediction_prob.ndim == 1
    assert prediction_prob.shape[0] == D
    # Expected cost of choice i: sum_j cost_matrix[i, j] * prediction_prob[j]
    res = np.zeros((D,))
    for i in range(D):
        res[i] = np.sum(cost_matrix[i] * prediction_prob)
    return res

As a demonstration, here is the expected cost of each decision for every validation point. We put a star (*) next to the choice with the minimum expected cost.

print('x\tCost of 0\tCost of 1\tTrue label\tChoice')
print('-' * 80)
for i in range(x_valid.shape[0]):
    exp_c = expected_cost(cost_matrix, predictions[i])
    line = f'{x_valid[i]:1.2f}\t{exp_c[0]:1.2f}'
    tmp = f'\t\t{exp_c[1]:1.2f}'
    correct_choice = True
    if exp_c[0] < exp_c[1]:
        line += '*'
        if y_valid[i] == 1:
            correct_choice = False
    else:
        tmp += '*'
        if y_valid[i] == 0:
            correct_choice = False
    line += tmp + f'\t\t{y_valid[i]}'
    if correct_choice:
        line += '\t\tCORRECT'
    else:
        line += '\t\tWRONG'
    print(line)
x	Cost of 0	Cost of 1	True label	Choice
--------------------------------------------------------------------------------
22.50	0.07*		0.93		0		CORRECT
28.50	0.25*		0.75		0		CORRECT
40.50	0.99		0.01*		1		CORRECT
36.00	0.90		0.10*		0		WRONG
25.50	0.12*		0.88		0		CORRECT
28.50	0.25*		0.75		1		WRONG
22.50	0.07*		0.93		0		CORRECT
36.00	0.90		0.10*		1		CORRECT
22.50	0.07*		0.93		0		CORRECT
40.50	0.99		0.01*		1		CORRECT
40.50	0.99		0.01*		1		CORRECT
25.50	0.12*		0.88		0		CORRECT
36.00	0.90		0.10*		0		WRONG
28.50	0.25*		0.75		0		CORRECT
25.50	0.12*		0.88		1		WRONG
22.50	0.07*		0.93		0		CORRECT
22.50	0.07*		0.93		0		CORRECT
22.50	0.07*		0.93		0		CORRECT
40.50	0.99		0.01*		1		CORRECT
40.50	0.99		0.01*		1		CORRECT

Notice that most of the choices are correct, but some are wrong. The particularly bad mistakes are the ones where we pick 0 (no explosion) while there actually is an explosion. Are there any such cases?
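One way to check is to flag the validation points where the minimum-expected-cost choice is 0 but the true label is 1. Here is a minimal sketch using the expected_cost function and the arrays defined above (the name missed is just for illustration):

# Flag validation points where we would pick 0 (no explosion)
# even though the true label is 1 (explosion).
missed = [
    i
    for i in range(x_valid.shape[0])
    if np.argmin(expected_cost(cost_matrix, predictions[i])) == 0 and y_valid[i] == 1
]
print('Heights with missed explosions:', x_valid[missed])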

Let me show you another very nice way to compute the expected cost for all the validation points in one line, using the einsum function (Einstein summation convention). It takes a while to understand what it does, but once you do, you can shorten your linear algebra code by a lot. The idea is that repeated indices are summed over.
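Concretely, the subscripts 'ij,kj->ki' below say that the repeated index j is summed over: exp_cost[k, i] = sum over j of cost_matrix[i, j] * predictions[k, j], i.e., the expected cost of choice i for validation point k.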

exp_cost = np.einsum('ij,kj->ki', cost_matrix, predictions)
print(exp_cost)
[[0.06527731 0.93472269]
 [0.25337129 0.74662871]
 [0.99307148 0.00692852]
 [0.90441891 0.09558109]
 [0.12011635 0.87988365]
 [0.25337129 0.74662871]
 [0.06527731 0.93472269]
 [0.90441891 0.09558109]
 [0.06527731 0.93472269]
 [0.99307148 0.00692852]
 [0.99307148 0.00692852]
 [0.12011635 0.87988365]
 [0.90441891 0.09558109]
 [0.25337129 0.74662871]
 [0.12011635 0.87988365]
 [0.06527731 0.93472269]
 [0.06527731 0.93472269]
 [0.06527731 0.93472269]
 [0.99307148 0.00692852]
 [0.99307148 0.00692852]]
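Incidentally, for a cost matrix this small the same result can be obtained with an ordinary matrix product, so a quick sanity check of the einsum call could look like this (a sketch using the arrays defined above):

# exp_cost[k, i] = sum_j cost_matrix[i, j] * predictions[k, j],
# which is exactly the matrix product predictions @ cost_matrix.T.
print(np.allclose(exp_cost, predictions @ cost_matrix.T))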

Here is yet another way to visualize the decisions of the binary classifier:

for i in range(x_valid.shape[0]):
    # Make decision
    decision = model.classes_[np.argmin(exp_cost[i])]
    fig, ax = plt.subplots()
    ax.bar(np.arange(2), predictions[i])
    ax.set_title(
        f'$x={x_valid[i]:1.2f}$ cm, '
        + f'True result = {y_valid[i]:d}, '
        + f'Decision = {decision:d}'
    )
    ax.set_ylim([0, 1.0])
    ax.set_xticks([0, 1])
    ax.set_xticklabels(model.classes_)
    ax.set_ylabel('Probability')
    sns.despine(trim=True)

Now let’s plot the decision rule of our model as a function of the predicted probability of an explosion:

fig, ax = plt.subplots()
pE = np.linspace(0, 1, 100)
pN = 1.0 - pE
probs = np.hstack([pN[:, None], pE[:, None]])
exp_cost = np.einsum('ij,kj->ki', cost_matrix, probs)
decision_idx = np.argmin(exp_cost, axis=1)
ax.plot(pE, decision_idx)
ax.set_yticks([0, 1])
ax.set_yticklabels(['N', 'E'])
ax.set_ylabel('Decision')
ax.set_xlabel('Predictive probability of E')
sns.despine(trim=True)
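The step in the plot sits exactly where the two expected costs are equal. In general, we pick E whenever

$$
c_{10}\,p(N|x) + c_{11}\,p(E|x) < c_{00}\,p(N|x) + c_{01}\,p(E|x),
$$

which, using $p(N|x) = 1 - p(E|x)$, is equivalent to

$$
p(E|x) > \frac{c_{10} - c_{00}}{(c_{10} - c_{00}) + (c_{01} - c_{11})}.
$$

For the symmetric 0-1 cost matrix used above, this threshold is $0.5$.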

Questions#

  • Repeat the analysis above with a different cost matrix that penalizes more heavily calling a non-explosion when there actually is an explosion; one possible starting point is sketched below.
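A minimal sketch of one such cost matrix (the numbers are only an illustration, not part of the original activity):

# Hypothetical asymmetric costs: wrongly picking 0 (no explosion) when 1
# (explosion) is true costs 10, while a false alarm costs only 1.
cost_matrix_asym = np.array(
    [
        [0.0, 10.0],
        [1.0, 0.0]
    ]
)
# Then redo the decisions, e.g.:
# exp_cost_asym = np.einsum('ij,kj->ki', cost_matrix_asym, predictions)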