MAKE_BOOK_FIGURES = True

import numpy as np
import scipy.stats as st
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
import seaborn as sns
sns.set_context("paper")
sns.set_style("ticks")

def set_book_style():
    plt.style.use('seaborn-v0_8-white')
    sns.set_style("ticks")
    sns.set_palette("deep")
    mpl.rcParams.update({
        # Font settings
        'font.family': 'serif',  # For academic publishing
        'font.size': 8,
        'axes.labelsize': 8,
        'axes.titlesize': 8,
        'xtick.labelsize': 7,  # Slightly smaller for better readability
        'ytick.labelsize': 7,
        'legend.fontsize': 7,
        # Line and marker settings for consistency
        'axes.linewidth': 0.5,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.0,
        'lines.markersize': 4,
        # Layout to prevent clipped labels
        'figure.constrained_layout.use': True,
        # Default DPI (will override when saving)
        'figure.dpi': 600,
        'savefig.dpi': 600,
        # Despine - remove top and right spines
        'axes.spines.top': False,
        'axes.spines.right': False,
        # Remove legend frame
        'legend.frameon': False,
        # Additional trim settings
        'figure.autolayout': True,  # Alternative to constrained_layout
        'savefig.bbox': 'tight',    # Trim when saving
        'savefig.pad_inches': 0.1   # Small padding to ensure nothing gets cut off
    })

def set_notebook_style():
    plt.style.use('seaborn-v0_8-white')
    sns.set_style("ticks")
    sns.set_palette("deep")
    mpl.rcParams.update({
        # Font settings - using default sizes
        'font.family': 'serif',
        'axes.labelsize': 10,
        'axes.titlesize': 10,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
        # Line and marker settings
        'axes.linewidth': 0.5,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.0,
        'lines.markersize': 4,
        # Layout settings
        'figure.constrained_layout.use': True,
        # Remove only top and right spines
        'axes.spines.top': False,
        'axes.spines.right': False,
        # Remove legend frame
        'legend.frameon': False,
        # Additional settings
        'figure.autolayout': True,
        'savefig.bbox': 'tight',
        'savefig.pad_inches': 0.1
    })

def save_for_book(fig, filename, is_vector=True, **kwargs):
    """
    Save a figure with book-optimized settings.

    Parameters:
    -----------
    fig : matplotlib figure
        The figure to save
    filename : str
        Filename without extension
    is_vector : bool
        If True, saves as vector at 1000 dpi.
        If False, saves as raster at 600 dpi.
    **kwargs : dict
        Additional kwargs to pass to savefig
    """
    # Set appropriate DPI and format based on figure type
    if is_vector:
        dpi = 1000
        ext = '.pdf'
    else:
        dpi = 600
        ext = '.tif'
    # Save the figure with book settings
    fig.savefig(f"{filename}{ext}", dpi=dpi, **kwargs)

def make_full_width_fig():
    return plt.subplots(figsize=(4.7, 2.9), constrained_layout=True)

def make_half_width_fig():
    return plt.subplots(figsize=(2.35, 1.45), constrained_layout=True)

if MAKE_BOOK_FIGURES:
    set_book_style()
else:
    set_notebook_style()

make_full_width_fig = make_full_width_fig if MAKE_BOOK_FIGURES else lambda: plt.subplots()
make_half_width_fig = make_half_width_fig if MAKE_BOOK_FIGURES else lambda: plt.subplots()
Sampling the uniform distribution#
If we have a PRNG that samples between zero and a big integer, say \(m\), we can create a generator that samples from the uniform distribution. If \(d\) is the sample from the PRNG, then

\[
x = \frac{d}{m}
\]

is approximately uniformly distributed. Let's experiment with this idea.
We are going to need some of the code we developed in the previous section:
MAX_INT = 6012119
def lcg(
x : int,
a : int = 123456,
b : int = 978564,
m : int = MAX_INT
):
"""Sample random numbers using a linear congruential generator.
Arguments:
x - The previous number in the sequence.
a - The multiplier (a big integer).
b - The increment (another big integer).
m - The modulus (another big integer).
"""
return (a * x + b) % m
First, let's make a uniform random number generator based on the LCG:
LCG_STATE = 123456
def unif_lcg():
"""Sample from the uniform using LCG."""
global LCG_STATE
LCG_STATE = lcg(LCG_STATE)
return LCG_STATE / MAX_INT
Let’s try it:
print("LCG Uniform Samples:")
for i in range(5):
u = unif_lcg()
print(f"sample {i} = {u:.2f}")
LCG Uniform Samples:
sample 0 = 0.27
sample 1 = 0.93
sample 2 = 0.33
sample 3 = 0.10
sample 4 = 0.59
And let's also do it with the Mersenne Twister from NumPy:
import numpy as np
np.random.seed(123456)
def unif_mt():
"""Sample from the uniform using the MT."""
return np.random.randint(0, MAX_INT) / MAX_INT
Let’s test this as well:
print("MT Uniform Samples:")
for i in range(5):
u = unif_mt()
print(f"sample {i} = {u:.2f}")
MT Uniform Samples:
sample 0 = 0.01
sample 1 = 0.89
sample 2 = 0.51
sample 3 = 0.53
sample 4 = 0.54
Which one of the two is better? There are many statistical tests that we would like our uniform random number generator to pass. First (and most importantly), the empirical histogram of the generated numbers should be approximately flat, i.e., uniform. Let's test this.
N = 100
# Some LCG-based uniform samples:
lcg_X = np.array(
[unif_lcg() for _ in range(N)]
)
# Some MT-based uniform samples:
mt_X = np.array(
[unif_mt() for _ in range(N)]
)
fig, ax = plt.subplots()
ax.hist(
lcg_X,
density=True,
alpha=0.5,
label="LGC_unif"
)
ax.hist(
mt_X,
density=True,
alpha=0.5,
label="MT_unif"
)
ax.set_xlabel(r"$x$")
ax.set_ylabel(r"$p(x)$")
plt.legend(loc="best", frameon=False)
sns.despine(trim=True);
This was a rough visual test. We can do better. We can compare the empirical CDF of each one of these algorithms with the ideal CDF, i.e., that of an actual uniform. But what is the empirical CDF of a bunch of samples \(x_{1:N}\)? It is defined as follows:

\[
\hat{F}_N(x) = \frac{1}{N}\sum_{i=1}^N \mathbf{1}_{[x_i \le x]},
\]

i.e., \(\hat{F}_N(x)\) is just the fraction of the samples that do not exceed \(x\). We will explain in what sense the empirical CDF approximates the actual CDF in Lecture 9. For now, let's use it:
def ecdf(x):
"""The empirical distribution function of scalar samples.
From: https://stackoverflow.com/questions/15792552/numpy-scipy-equivalent-of-r-ecdfxx-function
"""
xs = np.sort(x)
ys = np.arange(1, len(xs)+1)/float(len(xs))
return xs, ys
Now, let’s plot the empirical CDF of each of the samples and plot it against \(F(x) = x\) (the true CDF of the uniform).
fig, ax = plt.subplots()
ax.plot(*ecdf(lcg_X), label="LCG")
ax.plot(*ecdf(mt_X), label="MT")
ax.plot(np.linspace(0, 1), np.linspace(0, 1), label='Uniform')
ax.set_xlabel("$x$")
ax.set_ylabel("$\\bar{F}_N(x)$")
plt.legend(loc="best", frameon=False)
sns.despine(trim=True);
This is still visual. The Kolmogorov-Smirnov test summarizes the comparison by calculating a distance between the empirical distribution and the ideal one. It is defined as follows:

\[
D_N = \sup_x \left|F(x) - \hat{F}_N(x)\right|,
\]

where (if you don't know what it is) you can think of the supremum (\(\sup\)) as just the maximum. In other words, \(D_N\) is the maximum absolute difference between \(F(x)\) and \(\hat{F}_N(x)\). Let's see what we get for LCG and MT compared to the uniform:
import scipy.stats as st
D_lcg, p_val_lcg = st.kstest(lcg_X, "uniform")
D_mt, p_val_mt = st.kstest(mt_X, "uniform")
print(f"KS statistic for LCG vs uniform: {D_lcg:.2f}")
print(f"KS statistic for MT vs uniform: {D_mt:1.2f}")
KS statistic for LCG vs uniform: 0.13
KS statistic for MT vs uniform: 0.06
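By the way, st.kstest also returns a p-value, which we stored above as p_val_lcg and p_val_mt but did not print. Loosely speaking, a very small p-value would make us suspicious that the samples do not come from the uniform distribution. Here is how to inspect it:

print(f"KS p-value for LCG vs uniform: {p_val_lcg:.2f}")
print(f"KS p-value for MT vs uniform: {p_val_mt:.2f}")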
Questions#
Hmm, we need to increase the number of samples to observe this statistic better. Increase \(N\) from 100 to \(1,000\) and then to \(10,000\). What do the distributions look like now?
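If you want a starting point for this experiment, here is a minimal sketch that recomputes the Kolmogorov-Smirnov statistic for increasing sample sizes. It reuses the unif_lcg and unif_mt generators defined above (note that both keep advancing their internal state, so the samples differ from the earlier ones):

# Sketch: recompute the KS statistic for increasing sample sizes
for N_big in [100, 1_000, 10_000]:
    lcg_samples = np.array([unif_lcg() for _ in range(N_big)])
    mt_samples = np.array([unif_mt() for _ in range(N_big)])
    D_lcg_big, _ = st.kstest(lcg_samples, "uniform")
    D_mt_big, _ = st.kstest(mt_samples, "uniform")
    print(f"N = {N_big:6d}: D (LCG) = {D_lcg_big:.4f}, D (MT) = {D_mt_big:.4f}")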
A second thing we want to test is whether or not consecutive numbers are independent (more precisely, independent and identically distributed). Unfortunately, we need more theory than we currently know to do this properly.
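Still, a rough sanity check we can already do is to look at the correlation between consecutive samples: if the numbers were truly independent, the lag-1 sample correlation should be close to zero. This is only a sketch of a necessary (and by no means sufficient) condition, reusing the lcg_X and mt_X arrays from above:

# Rough check: lag-1 sample correlation between consecutive samples.
# Near-zero correlation is necessary, but not sufficient, for independence.
lcg_lag1_corr = np.corrcoef(lcg_X[:-1], lcg_X[1:])[0, 1]
mt_lag1_corr = np.corrcoef(mt_X[:-1], mt_X[1:])[0, 1]
print(f"Lag-1 correlation (LCG): {lcg_lag1_corr:.3f}")
print(f"Lag-1 correlation (MT): {mt_lag1_corr:.3f}")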
For future reference, note that you should not use unif_mt to generate uniform random numbers. NumPy already implements this in numpy.random.rand. We provide an example right below.
# Generate some uniform random numbers with numpy's rand:
X = np.random.rand(10)
print(X)
[0.15584706 0.86260898 0.6830463 0.0857089 0.56610253 0.14982485
0.47745445 0.84893827 0.14014442 0.32955081]
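For completeness: in modern NumPy code you would typically use the Generator API instead of the legacy np.random functions. A minimal sketch:

# Sketch: the modern NumPy Generator API (available since numpy 1.17)
rng = np.random.default_rng(123456)
print(rng.random(10))  # ten uniform samples in [0, 1)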