The Old Faithful eruption durations are modelled as a mixture of two normals: \[P(X) = \frac{1}{2}\bigl[\phi(X - \mu_1) + \phi(X - \mu_2)\bigr]\] where \(\phi\) is the standard normal density. The log-likelihood is: \[\ell(\mu_1, \mu_2) = \sum_{i=1}^n \log\!\left[\frac{1}{2}\bigl(\phi(x_i-\mu_1)+\phi(x_i-\mu_2)\bigr)\right]\]
The Levenberg-Marquardt (LM) update maximises \(\ell\) by modifying Newton-Raphson with a damping term \(\lambda\): \[\boldsymbol{\mu}^{(k+1)} = \boldsymbol{\mu}^{(k)} + \bigl(-\mathbf{H}^{(k)} + \lambda_k \mathbf{I}\bigr)^{-1} \nabla\ell^{(k)}\] The step is accepted when \(\ell\) increases (and \(\lambda\) is reduced); otherwise \(\lambda\) is increased.
# Log-likelihood
loglik_faithful <- function(mu, x) {
sum(log(0.5 * (dnorm(x, mu[1], 1) + dnorm(x, mu[2], 1))))
}
# Analytical gradient
grad_faithful <- function(mu, x) {
a <- dnorm(x, mu[1], 1); b <- dnorm(x, mu[2], 1); w <- a + b
c(sum((x - mu[1]) * a / w), sum((x - mu[2]) * b / w))
}
# Numerical Hessian (central finite differences)
num_hessian <- function(f, th, h = 1e-5) {
n <- length(th); H <- matrix(0, n, n)
for (i in seq_len(n)) for (j in seq_len(i)) {
ei <- ej <- numeric(n); ei[i] <- 1; ej[j] <- 1
if (i == j) {
H[i,i] <- (f(th + h*ei) - 2*f(th) + f(th - h*ei)) / h^2
} else {
H[i,j] <- (f(th+h*ei+h*ej) - f(th+h*ei-h*ej) -
f(th-h*ei+h*ej) + f(th-h*ei-h*ej)) / (4*h^2)
H[j,i] <- H[i,j]
}
}
H
}
# Levenberg-Marquardt algorithm
LevenbergMarquardt <- function(x, mu0, tol = 1e-6, maxiter = 200, lambda0 = 1) {
mu <- mu0; lambda <- lambda0
path <- matrix(mu0, nrow = 1, ncol = 2,
dimnames = list(NULL, c("mu1","mu2")))
ll <- function(m) loglik_faithful(m, x)
for (iter in seq_len(maxiter)) {
g <- grad_faithful(mu, x)
if (sqrt(sum(g^2)) < tol) break
H <- num_hessian(ll, mu)
A <- -H + lambda * diag(2)
step <- tryCatch(solve(A, g), error = function(e) NULL)
if (is.null(step)) { lambda <- min(lambda*10, 1e12); next }
mu_new <- mu + step
if (ll(mu_new) >= ll(mu)) {
mu <- mu_new; lambda <- max(lambda/10, 1e-12)
path <- rbind(path, mu)
} else {
lambda <- min(lambda*10, 1e12)
}
}
list(mu = mu, path = path, iter = iter, loglik = ll(mu))
}
x_faith <- faithful$eruptions
res_lm <- LevenbergMarquardt(x_faith, mu0 = c(2, 3))
cat(sprintf("MLE: mu1 = %.4f, mu2 = %.4f (converged in %d iterations)\n",
res_lm$mu[1], res_lm$mu[2], res_lm$iter))
## MLE: mu1 = 2.7071, mu2 = 4.1731 (converged in 6 iterations)
# Build grid
grid <- expand.grid(mu1 = seq(1.5, 5.5, length.out = 100),
mu2 = seq(1.5, 5.5, length.out = 100))
grid$ll <- apply(grid, 1, function(r) loglik_faithful(c(r[1], r[2]), x_faith))
path_df <- as.data.frame(res_lm$path)
contourplot(
ll ~ mu1 * mu2, data = grid, cuts = 25,
xlab = expression(mu[1]),
ylab = expression(mu[2]),
main = "Old Faithful Log-likelihood – LM Path",
panel = function(...) {
panel.contourplot(...)
panel.lines(path_df$mu1, path_df$mu2, col = "red", lwd = 2)
panel.points(path_df$mu1, path_df$mu2, col = "red", pch = 19, cex = 0.7)
panel.points(res_lm$mu[1], res_lm$mu[2], col = "blue", pch = 8, cex = 1.5)
}
)
Contour plot of the Old Faithful log-likelihood with the LM optimisation path (red). Starting point (2, 3); blue asterisk marks the MLE.
The data on nitrogen application and crop yield:
df_N <- data.frame(
nitrogen = c(rep(0,4), rep(30,4), rep(60,4), rep(90,4), rep(120,4)),
yield = c(1.41,1.75,2.02,2.13, 1.93,2.24,2.29,2.35,
2.12,2.38,2.49,2.57, 2.16,2.20,2.28,2.49,
2.34,2.45,2.59,2.62)
)
lm_fit <- lm(yield ~ nitrogen, data = df_N)
summary(lm_fit)
##
## Call:
## lm(formula = yield ~ nitrogen, data = df_N)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5455 -0.1718 0.0645 0.1501 0.3295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.95550 0.08595 22.752 1.03e-14 ***
## nitrogen 0.00475 0.00117 4.061 0.000733 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2219 on 18 degrees of freedom
## Multiple R-squared: 0.4782, Adjusted R-squared: 0.4492
## F-statistic: 16.49 on 1 and 18 DF, p-value: 0.0007328
plot(yield ~ nitrogen, data = df_N, pch = 16,
xlab = "Nitrogen (lbs/acre)", ylab = "Yield",
main = "Yield vs Nitrogen – Simple Linear Regression")
abline(lm_fit, col = "steelblue", lwd = 2)
legend("bottomright", "SLR fit", col = "steelblue",
lty = 1, lwd = 2, bty = "n")
Scatterplot with simple linear regression fit.
For \(f(x, \boldsymbol{\theta}) = \beta_0 + \beta_1 \min(x, N_{\max})\) with \(\boldsymbol{\theta} = (\beta_0, \beta_1, N_{\max})^\top\):
\[\frac{\partial f}{\partial \beta_0} = 1\]
\[\frac{\partial f}{\partial \beta_1} = \min(x,\, N_{\max})\]
\[\frac{\partial f}{\partial N_{\max}} = \begin{cases} \beta_1 & \text{if } x > N_{\max} \\ 0 & \text{if } x < N_{\max} \end{cases}\]
Because \(\min(x, N_{\max})\) is not differentiable at \(x = N_{\max}\) exactly, we treat those (measure-zero) points as either case. The full gradient vector is:
\[\nabla_{\boldsymbol{\theta}} f(x, \boldsymbol{\theta}) = \begin{pmatrix} 1 \\ \min(x, N_{\max}) \\ \beta_1 \,\mathbf{1}(x > N_{\max}) \end{pmatrix}\]
LPmodel)LPmodel <- function(Y, X, tol = 1e-5, maxiter = 100) {
# Starting values from SLR + max(X) - 5 for Nmax
lm0 <- lm(Y ~ X)
theta <- c(unname(coef(lm0)[1]), # b0
unname(coef(lm0)[2]), # b1
max(X) - 5) # Nmax
f_model <- function(X, th) th[1] + th[2] * pmin(X, th[3])
SS_old <- sum((Y - f_model(X, theta))^2)
del <- Inf; iter <- 0
for (i in seq_len(maxiter)) {
iter <- i
J <- cbind(
rep(1, length(X)), # df/d(beta0)
pmin(X, theta[3]), # df/d(beta1)
theta[2] * as.numeric(X > theta[3]) # df/d(Nmax)
)
r <- Y - f_model(X, theta)
step <- tryCatch(solve(crossprod(J), crossprod(J, r)),
error = function(e) rep(0, 3))
theta <- theta + step
theta[3] <- max(theta[3], 1e-6) # keep Nmax > 0
SS_new <- sum((Y - f_model(X, theta))^2)
del <- abs(SS_old - SS_new)
SS_old <- SS_new
if (del < tol) break
}
list(b0 = theta[1], b1 = theta[2], Nmax = theta[3],
iter = iter, del = del)
}
lp_res <- LPmodel(df_N$yield, df_N$nitrogen)
cat(sprintf("b0 = %.4f, b1 = %.6f, Nmax = %.2f (iterations = %d)\n",
lp_res$b0, lp_res$b1, lp_res$Nmax, lp_res$iter))
## b0 = 1.9427, b1 = 0.005175, Nmax = 107.68 (iterations = 3)
# Verification with nls
nls_fit <- nls(
yield ~ b0 + b1 * pmin(nitrogen, Nmax), data = df_N,
start = list(b0 = coef(lm_fit)[1], b1 = coef(lm_fit)[2],
Nmax = max(df_N$nitrogen) - 5)
)
cat("\nnls verification:\n"); print(coef(nls_fit))
##
## nls verification:
## b0 b1 Nmax
## 1.942750 0.005175 107.681159
The LPmodel estimates agree with nls.
x_seq <- seq(0, 130, length.out = 300)
lm_pred <- predict(lm_fit, newdata = data.frame(nitrogen = x_seq))
lp_pred <- lp_res$b0 + lp_res$b1 * pmin(x_seq, lp_res$Nmax)
plot(yield ~ nitrogen, data = df_N, pch = 16,
xlab = "Nitrogen (lbs/acre)", ylab = "Yield",
main = "Yield vs Nitrogen – Model Comparison")
lines(x_seq, lm_pred, col = "steelblue", lwd = 2)
lines(x_seq, lp_pred, col = "firebrick", lwd = 2, lty = 2)
legend("bottomright",
legend = c("Simple Linear Regression", "Linear Plateau Model"),
col = c("steelblue", "firebrick"),
lty = c(1, 2), lwd = 2, bty = "n")
Scatterplot with SLR (blue) and linear plateau model (red dashed) fits.
Setting up the log-likelihood.
Throughout, \(\log(\cdot)\) denotes the natural logarithm.
Each \(y_i\) is Bernoulli with success probability \(\pi_i\), so the likelihood is: \[L(\boldsymbol{\beta}) = \prod_{i=1}^n \pi_i^{y_i}(1-\pi_i)^{1-y_i}\]
Taking logs and using \(\log\!\left(\dfrac{\pi_i}{1-\pi_i}\right) = \eta_i\) where \(\eta_i = \beta_0 + \beta_1 x_i\): \[\ell(\boldsymbol{\beta}) = \sum_{i=1}^n \bigl[y_i \log\pi_i + (1-y_i)\log(1-\pi_i)\bigr] = \sum_{i=1}^n \bigl[y_i\eta_i + \log(1-\pi_i)\bigr]\]
Since \(1 - \pi_i = \dfrac{1}{1+e^{\eta_i}}\), we have \(\log(1-\pi_i) = -\log(1+e^{\eta_i})\), giving: \[\boxed{\ell(\beta_0,\beta_1) = \sum_{i=1}^n \bigl[y_i\eta_i - \log(1+e^{\eta_i})\bigr]}\]
Gradient.
Note that \(\dfrac{\partial \eta_i}{\partial \beta_0} = 1\) and \(\dfrac{\partial \eta_i}{\partial \beta_1} = x_i\), and: \[\frac{\partial}{\partial \eta_i}\log(1+e^{\eta_i}) = \frac{e^{\eta_i}}{1+e^{\eta_i}} = \pi_i\]
Therefore: \[\frac{\partial \ell}{\partial \beta_0} = \sum_{i=1}^n \left(y_i - \pi_i\right), \qquad \frac{\partial \ell}{\partial \beta_1} = \sum_{i=1}^n x_i\left(y_i - \pi_i\right)\]
In vector form, \(\nabla\ell = \mathbf{X}^\top(\mathbf{y} - \boldsymbol{\pi})\) where \(\mathbf{X} = [\mathbf{1},\, \mathbf{x}]\).
Hessian.
The sigmoid satisfies \(\dfrac{\partial \pi_i}{\partial \eta_i} = \pi_i(1-\pi_i)\); let \(w_i = \pi_i(1-\pi_i)\). Differentiating each gradient component via the chain rule:
\[\frac{\partial^2 \ell}{\partial \beta_0^2} = \frac{\partial}{\partial \beta_0}\sum_i(y_i - \pi_i) = -\sum_i \frac{\partial \pi_i}{\partial \beta_0} = -\sum_i \frac{\partial \pi_i}{\partial \eta_i}\cdot\frac{\partial \eta_i}{\partial \beta_0} = -\sum_i w_i \cdot 1 = -\sum_{i=1}^n w_i\]
\[\frac{\partial^2 \ell}{\partial \beta_1^2} = -\sum_i \frac{\partial \pi_i}{\partial \eta_i}\cdot\frac{\partial \eta_i}{\partial \beta_1}\cdot x_i = -\sum_{i=1}^n x_i^2 w_i\]
\[\frac{\partial^2 \ell}{\partial \beta_0 \partial \beta_1} = -\sum_i \frac{\partial \pi_i}{\partial \eta_i}\cdot\frac{\partial \eta_i}{\partial \beta_1} = -\sum_{i=1}^n x_i w_i\]
Thus the Hessian is: \[\mathbf{H} = -\begin{pmatrix} \displaystyle\sum_i w_i & \displaystyle\sum_i x_i w_i \\[6pt] \displaystyle\sum_i x_i w_i & \displaystyle\sum_i x_i^2 w_i \end{pmatrix} = -\mathbf{X}^\top \mathbf{W} \mathbf{X}\]
where \(\mathbf{W} = \operatorname{diag}(w_1,\ldots,w_n)\). Since \(w_i > 0\), \(\mathbf{H}\) is negative definite, confirming that the log-likelihood is strictly concave.
oringNR)The Newton-Raphson update is: \[\boldsymbol{\beta}^{(k+1)} = \boldsymbol{\beta}^{(k)} - \mathbf{H}^{-1}\nabla\ell = \boldsymbol{\beta}^{(k)} + (-\mathbf{H})^{-1}\nabla\ell\]
oringNR <- function(filename) {
data <- read.csv(filename)
y <- as.integer(data$ndo > 0) # binary damage indicator
x <- data$temp
# Starting values
ybar <- mean(y)
beta <- c(log(ybar / (1 - ybar)), 0)
histOR <- matrix(c(0, beta), nrow = 1,
dimnames = list(NULL, c("iter","beta0","beta1")))
for (iter in seq_len(100)) {
eta <- beta[1] + beta[2] * x
pi <- 1 / (1 + exp(-eta))
w <- pi * (1 - pi)
g <- c(sum(y - pi), sum(x * (y - pi)))
H <- -matrix(c(sum(w), sum(x*w), sum(x*w), sum(x^2*w)), 2)
step <- tryCatch(solve(-H, g), error = function(e) c(0, 0))
beta <- beta + step
histOR <- rbind(histOR, c(iter, beta[1], beta[2]))
if (sqrt(sum(step^2)) < 1e-8) break
}
return(histOR)
}
histOR <- oringNR("shuttle.csv")
beta_mle <- histOR[nrow(histOR), 2:3]
cat(sprintf("MLE: beta0 = %.4f, beta1 = %.4f\n", beta_mle[1], beta_mle[2]))
## MLE: beta0 = 15.0429, beta1 = -0.2322
cat(sprintf("Converged in %d iterations.\n", nrow(histOR) - 1))
## Converged in 6 iterations.
set.seed(42)
data_or <- read.csv("shuttle.csv")
y_or <- as.integer(data_or$ndo > 0)
x_or <- data_or$temp
boot_logistic <- function(y, x, B = 1000) {
n <- length(y); ybar <- mean(y)
b0_s <- log(ybar / (1 - ybar))
betas <- matrix(NA, B, 2)
for (b in seq_len(B)) {
idx <- sample(n, n, replace = TRUE)
yb <- y[idx]; xb <- x[idx]
beta <- c(b0_s, 0)
for (iter in seq_len(100)) {
eta <- beta[1] + beta[2] * xb
pi_b <- 1 / (1 + exp(-eta)); w <- pi_b * (1 - pi_b)
g <- c(sum(yb - pi_b), sum(xb * (yb - pi_b)))
H <- -matrix(c(sum(w), sum(xb*w), sum(xb*w), sum(xb^2*w)), 2)
step <- tryCatch(solve(-H, g), error = function(e) c(0,0))
beta <- beta + step
if (sqrt(sum(step^2)) < 1e-8) break
}
betas[b, ] <- beta
}
apply(betas, 2, sd)
}
se_boot <- boot_logistic(y_or, x_or, B = 1000)
cat(sprintf("Bootstrap SE: SE(beta0) = %.4f, SE(beta1) = %.4f\n",
se_boot[1], se_boot[2]))
## Bootstrap SE: SE(beta0) = 220.9317, SE(beta1) = 3.4386
temp_seq <- seq(25, 85, length.out = 300)
prob_seq <- 1 / (1 + exp(-(beta_mle[1] + beta_mle[2] * temp_seq)))
prob_31 <- 1 / (1 + exp(-(beta_mle[1] + beta_mle[2] * 31)))
cat(sprintf("Predicted P(o-ring damage | temp = 31 F) = %.4f\n", prob_31))
## Predicted P(o-ring damage | temp = 31 F) = 0.9996
plot(temp_seq, prob_seq, type = "l", lwd = 2, col = "steelblue",
ylim = c(0, 1),
xlab = "Temperature (°F)", ylab = "P(O-ring Damage)",
main = "Probability of O-ring Damage vs Temperature")
points(x_or, y_or, pch = 16, cex = 0.9)
abline(v = 31, lty = 2, col = "firebrick")
text(33, 0.55,
sprintf("T = 31°F\nP = %.4f", prob_31),
col = "firebrick", adj = 0, cex = 0.85)
Estimated probability of o-ring damage as a function of launch temperature. The dashed red line marks 31°F (Challenger launch day).
The model predicts a very high probability of o-ring damage at 31°F, which was the recorded temperature on the day of the Challenger launch.
Setup. \(S \sim U\{-1,1\}\), \(Y \sim \text{Exp}(1)\), and \(Z \mid Y \sim U\!\left(0,\, e^{-Y}\!\sqrt{2e/\pi}\right)\). We want to show that \(SY \mid Z < \phi(Y) \sim N(0,1)\).
Proof.
Note that \(\phi(y) = \frac{1}{\sqrt{2\pi}}e^{-y^2/2}\) and the upper bound of \(Z \mid Y\) is \(e^{-y}\!\sqrt{2e/\pi}\). We verify that \(\phi(y) \le e^{-y}\sqrt{2e/\pi}\) for all \(y>0\) by computing the ratio: \[\frac{\phi(y)}{e^{-y}\sqrt{2e/\pi}} = \frac{\frac{1}{\sqrt{2\pi}}e^{-y^2/2}}{e^{-y}\sqrt{2e/\pi}} = \frac{1}{2\sqrt{e}}\,e^{-y^2/2+y} = \frac{1}{2\sqrt{e}}\,e^{1/2}\,e^{-(y-1)^2/2} = \frac{1}{2}\,e^{-(y-1)^2/2} \le \frac{1}{2} < 1,\] where we completed the square \(-y^2/2 + y = -(y-1)^2/2 + 1/2\). Hence the event \(Z < \phi(Y)\) has positive probability for all \(y > 0\).
By construction, \(S\) is independent of \((Y, Z)\) (they are generated separately). The joint density of \((S, Y, Z)\) is therefore: \[f(s, y, z) = \tfrac{1}{2} \cdot e^{-y} \cdot \frac{1}{e^{-y}\sqrt{2e/\pi}} = \frac{1}{2\sqrt{2e/\pi}}, \quad s \in \{-1,1\},\; y>0,\; 0 < z < e^{-y}\sqrt{2e/\pi}.\]
The conditional density of \(Y\) given \(Z < \phi(Y)\) follows from Bayes’ rule. First, compute the marginal acceptance probability: \[P(Z < \phi(Y)) = \int_0^\infty e^{-y} \cdot \frac{\phi(y)}{e^{-y}\sqrt{2e/\pi}}\,dy = \frac{1}{\sqrt{2e/\pi}}\int_0^\infty \phi(y)\,dy = \frac{1}{2\sqrt{2e/\pi}}.\]
Then the conditional density of \(Y \mid Z < \phi(Y)\) is: \[f_{Y \mid Z<\phi(Y)}(y) = \frac{f_Y(y)\,P(Z<\phi(Y)\mid Y=y)}{P(Z<\phi(Y))} = \frac{e^{-y} \cdot \dfrac{\phi(y)}{e^{-y}\sqrt{2e/\pi}}}{\dfrac{1}{2\sqrt{2e/\pi}}} = 2\phi(y), \quad y > 0.\]
This is the half-normal density. Now let \(X = SY\). Since \(S \in \{-1,1\}\) with equal probability and \(S \perp (Y,Z)\) by construction, we use the law of total probability to find the density of \(X\):
For \(x > 0\): only \(S = +1\) contributes (since \(Y > 0\)), so \[f_X(x) = \tfrac{1}{2}\cdot f_{Y \mid Z<\phi(Y)}(x) = \tfrac{1}{2}\cdot 2\phi(x) = \phi(x).\]
For \(x < 0\): only \(S = -1\) contributes (so \(Y = -x > 0\)), so \[f_X(x) = \tfrac{1}{2}\cdot f_{Y \mid Z<\phi(Y)}(-x) = \tfrac{1}{2}\cdot 2\phi(-x) = \phi(x),\] where the last equality uses the symmetry of \(\phi\).
Therefore \(f_X(x) = \phi(x)\) for all \(x \in \mathbb{R}\), which is exactly the \(N(0,1)\) density. Hence \(SY \mid Z < \phi(Y) \sim N(0,1)\). \(\blacksquare\)
Setup. \((A,B) \sim \text{Uniform}(\text{unit disk})\), so \(f_{A,B}(a,b) = \tfrac{1}{\pi}\) for \(a^2+b^2 \le 1\). Let \(A = S\cos\Psi\), \(B = S\sin\Psi\) with \(S \in (0,1]\), \(\Psi \in [0,2\pi)\). (The point \(s=0\) is a single point of measure zero and is excluded so that the Jacobian is non-singular.)
Proof.
The transformation \((s,\psi) \mapsto (a,b) = (s\cos\psi,\, s\sin\psi)\) has Jacobian matrix: \[\frac{\partial(a,b)}{\partial(s,\psi)} = \begin{pmatrix} \cos\psi & -s\sin\psi \\ \sin\psi & \phantom{-}s\cos\psi \end{pmatrix},\] with determinant \(s\cos^2\!\psi + s\sin^2\!\psi = s > 0\) for \(s \in (0,1]\). By the change-of-variables formula: \[f_{S,\Psi}(s,\psi) = f_{A,B}(s\cos\psi, s\sin\psi)\cdot s = \frac{1}{\pi}\cdot s = \frac{s}{\pi}, \quad s \in (0,1],\; \psi \in [0,2\pi).\]
This factors as: \[f_{S,\Psi}(s,\psi) = \underbrace{2s}_{f_S(s)} \cdot \underbrace{\frac{1}{2\pi}}_{f_\Psi(\psi)}.\]
The support of \((S,\Psi)\) is \((0,1]\times[0,2\pi)\), which is a Cartesian product. Since the joint density factors into a function of \(s\) alone times a function of \(\psi\) alone and the support is a Cartesian product, \(S\) and \(\Psi\) are independent, with marginals:
\[f_S(s) = 2s,\quad 0 \le s \le 1 \qquad \text{and} \qquad f_\Psi(\psi) = \frac{1}{2\pi},\quad 0 \le \psi < 2\pi.\]
Hence \(\Psi \sim U(0, 2\pi)\).
Showing \(S^2 \sim U(0,1)\). Apply the change-of-variables formula directly to \(S^2\). For \(u \in [0,1]\), the inverse map is \(s = \sqrt{u}\) with \(\dfrac{ds}{du} = \dfrac{1}{2\sqrt{u}}\), so: \[f_{S^2}(u) = f_S\!\left(\sqrt{u}\right)\cdot\frac{1}{2\sqrt{u}} = 2\sqrt{u}\cdot\frac{1}{2\sqrt{u}} = 1, \quad 0 \le u \le 1.\]
Thus \(S^2 \sim U(0,1)\). Since \(S^2 = g(S)\) is a deterministic function of \(S\), independence of \(S\) and \(\Psi\) implies independence of \(S^2\) and \(\Psi\). \(\blacksquare\)