knitr::opts_chunk$set(
echo = TRUE, message = FALSE, warning = FALSE,
fig.width = 7, fig.height = 4.5, dpi = 160
)
set.seed(2025)
After completion of the course, students will be able to:
For a two-way table with cell probabilities \(p_{ij}\) and counts \(n_{ij}\) (\(i=1,\dots,r\), \(j=1,\dots,c\)), the null of independence is \[ H_0: p_{ij} = p_{i\cdot} p_{\cdot j}. \] The Pearson chi-square test statistic is \[ X^2 = \sum_{i=1}^r\sum_{j=1}^c \frac{(n_{ij}-e_{ij})^2}{e_{ij}}, \quad e_{ij}=\frac{n_{i\cdot}n_{\cdot j}}{n_{\cdot\cdot}}, \] with df \(=(r-1)(c-1)\) (asymptotically). For \(2\times2\), the sample odds ratio \[ \widehat{\text{OR}}=\frac{n_{11}n_{22}}{n_{12}n_{21}} \] measures association; \(\log(\widehat{\text{OR}})\) has approximate normality for large counts.
# Titanic is a 4-way table; we create a 2-way table: Survived x Sex
tab <- apply(Titanic, c("Survived","Sex"), sum)
tab
## Sex
## Survived Male Female
## No 1364 126
## Yes 367 344
chisq.test(tab)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: tab
## X-squared = 454.5, df = 1, p-value < 2.2e-16
# Odds ratio (manual) using the 2x2 for "Survived" (Yes/No) vs "Sex" (Male/Female)
OR <- (tab["Yes","Female"] * tab["No","Male"]) /
(tab["No","Female"] * tab["Yes","Male"])
OR
## [1] 10.14697
# Mosaic plot (base)
mosaicplot(tab, main = "Titanic: Survived vs Sex", color = TRUE)
A GLM assumes \(Y_i \mid \mathbf{x}_i \sim \text{EF}(\theta_i, \phi)\) with mean \(\mu_i = b'(\theta_i)\) and variance \(\Var(Y_i)=V(\mu_i)a(\phi)\). A link \(g(\mu_i)=\eta_i=\mathbf{x}_i^\top\beta\) connects the mean to the linear predictor.
Binary response \(Y\in\{0,1\}\) with \(\Pr(Y=1\mid \mathbf{x})=\pi(\mathbf{x})\): \[ \log \frac{\pi(\mathbf{x})}{1-\pi(\mathbf{x})} = \mathbf{x}^\top\beta. \] Wald, score, and LR tests provide inference; AIC/BIC for model selection.
n <- 600
x1 <- rnorm(n)
x2 <- rnorm(n)
lin <- -0.5 + 1.2*x1 - 0.8*x2
p <- 1/(1+exp(-lin))
y <- rbinom(n, 1, p)
sim_logit <- data.frame(y, x1, x2)
fit_sim <- glm(y ~ x1 + x2, data = sim_logit, family = binomial())
summary(fit_sim)
##
## Call:
## glm(formula = y ~ x1 + x2, family = binomial(), data = sim_logit)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.38580 0.09642 -4.001 6.3e-05 ***
## x1 1.11985 0.11745 9.534 < 2e-16 ***
## x2 -0.74295 0.10472 -7.095 1.3e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 821.08 on 599 degrees of freedom
## Residual deviance: 658.47 on 597 degrees of freedom
## AIC: 664.47
##
## Number of Fisher Scoring iterations: 4
# ROC-like curve (quick)
phat <- predict(fit_sim, type = "response")
th <- seq(0.05, 0.95, by = 0.01)
acc <- sapply(th, function(t) mean( (phat>t)==y ))
plot(th, acc, type="l", xlab="Threshold", ylab="Accuracy", main="Accuracy vs Threshold")
mt <- within(mtcars, am <- factor(am, labels = c("auto","manual")))
fit_mtcars <- glm(am ~ mpg + wt + hp, data = mt, family = binomial())
summary(fit_mtcars)
##
## Call:
## glm(formula = am ~ mpg + wt + hp, family = binomial(), data = mt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -15.72137 40.00281 -0.393 0.6943
## mpg 1.22930 1.58109 0.778 0.4369
## wt -6.95492 3.35297 -2.074 0.0381 *
## hp 0.08389 0.08228 1.020 0.3079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43.2297 on 31 degrees of freedom
## Residual deviance: 8.7661 on 28 degrees of freedom
## AIC: 16.766
##
## Number of Fisher Scoring iterations: 10
# LR test vs null
fit_null <- glm(am ~ 1, data = mt, family = binomial())
anova(fit_null, fit_mtcars, test = "Chisq")
# Pseudo R^2
ll_full <- as.numeric(logLik(fit_mtcars))
ll_null <- as.numeric(logLik(fit_null))
pseudoR2 <- 1 - (ll_full/ll_null); pseudoR2
## [1] 0.7972204
# Partial effects plot for mpg (hold others at median)
grid <- with(mt, data.frame(
mpg = seq(min(mpg), max(mpg), length.out = 100),
wt = median(wt), hp = median(hp)
))
grid$phat <- predict(fit_mtcars, newdata = grid, type = "response")
plot(grid$mpg, grid$phat, type="l", xlab="mpg", ylab="Pr(manual)",
main="Effect of mpg (wt,hp at median)")
Counts \(Y \in \{0,1,2,\ldots\}\) with mean \(\mu\) satisfy \[ \log \mu = \mathbf{x}^\top\beta, \qquad \Var(Y)=\mu. \] Use an offset \(\log E\) for exposure: \(\log \mu = \log E + \mathbf{x}^\top\beta\).
Check overdispersion via \(\text{RD/df}>1\); consider quasipoisson or Negative Binomial (MASS).
n <- 500
x <- runif(n, 0, 3)
eta <- 0.2 + 0.7*x
mu <- exp(eta)
y <- rpois(n, mu)
sim_pois <- data.frame(y, x)
fit_p <- glm(y ~ x, data = sim_pois, family = poisson(link="log"))
summary(fit_p)
##
## Call:
## glm(formula = y ~ x, family = poisson(link = "log"), data = sim_pois)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.14960 0.05977 2.503 0.0123 *
## x 0.70387 0.02828 24.894 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1176.4 on 499 degrees of freedom
## Residual deviance: 497.6 on 498 degrees of freedom
## AIC: 1924.4
##
## Number of Fisher Scoring iterations: 5
plot(x, y, pch=16, cex=0.6, main="Poisson Simulated Data")
ord <- order(x)
lines(x[ord], mu[ord], lwd=2)
lines(x[ord], fitted(fit_p)[ord], lwd=2, lty=2)
legend("topleft", c("true mean","fitted"), lwd=2, lty=c(1,2), bty="n")
data(InsectSprays)
fit_is <- glm(count ~ spray, data = InsectSprays, family = poisson())
summary(fit_is)
##
## Call:
## glm(formula = count ~ spray, family = poisson(), data = InsectSprays)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.67415 0.07581 35.274 < 2e-16 ***
## sprayB 0.05588 0.10574 0.528 0.597
## sprayC -1.94018 0.21389 -9.071 < 2e-16 ***
## sprayD -1.08152 0.15065 -7.179 7.03e-13 ***
## sprayE -1.42139 0.17192 -8.268 < 2e-16 ***
## sprayF 0.13926 0.10367 1.343 0.179
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 409.041 on 71 degrees of freedom
## Residual deviance: 98.329 on 66 degrees of freedom
## AIC: 376.59
##
## Number of Fisher Scoring iterations: 5
# Overdispersion check
overdisp <- deviance(fit_is) / df.residual(fit_is); overdisp
## [1] 1.489828
# If overdispersed, use quasipoisson
fit_is_q <- glm(count ~ spray, data = InsectSprays, family = quasipoisson())
summary(fit_is_q)
##
## Call:
## glm(formula = count ~ spray, family = quasipoisson(), data = InsectSprays)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.67415 0.09309 28.728 < 2e-16 ***
## sprayB 0.05588 0.12984 0.430 0.668
## sprayC -1.94018 0.26263 -7.388 3.30e-10 ***
## sprayD -1.08152 0.18499 -5.847 1.70e-07 ***
## sprayE -1.42139 0.21110 -6.733 4.82e-09 ***
## sprayF 0.13926 0.12729 1.094 0.278
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 1.507713)
##
## Null deviance: 409.041 on 71 degrees of freedom
## Residual deviance: 98.329 on 66 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
boxplot(count ~ spray, data = InsectSprays, main="Counts by Spray")
To ensure the model parameters are uniquely identifiable, constraints (such as summing to zero across each index) are typically imposed.
The principle states that if a certain interaction term is included in the model, then all lower-order interaction terms that are a part of it must also be included. For example, if the three-way interaction \(\lambda^{ABC}_{ijk}\) is in the model, then all two-way interactions (\(\lambda^{AB}_{ij}\), \(\lambda^{AC}_{ik}\), \(\lambda^{BC}_{jk}\)) and all main effects (\(\lambda^A_i\), \(\lambda^B_j\), \(\lambda^C_k\)) must also be in the model. This principle ensures that the model is interpretable and respects the underlying structure of the data. For instance, a model with only the interaction \(\lambda^{AB}_{ij}\) but without the main effects \(\lambda^A_i\) and \(\lambda^B_j\) would not make sense from a statistical perspective.
tab3 <- HairEyeColor # 4 (Hair) x 4 (Eye) x 2 (Sex)
ftable(tab3)
## Sex Male Female
## Hair Eye
## Black Brown 32 36
## Blue 11 9
## Hazel 10 5
## Green 3 2
## Brown Brown 53 66
## Blue 50 34
## Hazel 25 29
## Green 15 14
## Red Brown 10 16
## Blue 10 7
## Hazel 7 7
## Green 7 7
## Blond Brown 3 4
## Blue 30 64
## Hazel 5 5
## Green 8 8
# Fit models by glm (Poisson) using data.frame of counts:
df3 <- as.data.frame(tab3)
head(df3)
# Independence of all factors: [H][E][S]
m_ind <- glm(Freq ~ Hair + Eye + Sex, data = df3, family = poisson())
# Add all 2-way interactions but no 3-way: [HE][HS][ES]
m_2way <- glm(Freq ~ Hair*Eye + Hair*Sex + Eye*Sex, data = df3, family = poisson())
# Saturated (includes 3-way)
m_sat <- glm(Freq ~ Hair*Eye*Sex, data = df3, family = poisson())
anova(m_ind, m_2way, m_sat, test = "Chisq")
AIC(m_ind, m_2way, m_sat)
# Residual plot (Pearson residuals vs fitted)
plot(fitted(m_2way), residuals(m_2way, type="pearson"),
pch=16, xlab="Fitted", ylab="Pearson resid", main="[HE][HS][ES] model")
abline(h=0, col="gray")
The Expectation-Maximization (EM) algorithm is a powerful iterative method for finding maximum likelihood estimates of parameters in statistical models, particularly when the data is incomplete or when the model can be thought of as having unobserved latent variables.
For the simple case of a normal distribution, let’s assume we have a set of independent and identically distributed (iid) observations \(y_1, \dots, y_n \sim \mathcal{N}(\mu, \sigma^2)\), but some of these values are missing completely at random (MCAR). This means the probability of a value being missing doesn’t depend on the value itself or on any other observed values.
The EM algorithm proceeds in two steps, which are repeated until convergence:For this specific simple case under MCAR, the EM algorithm converges in just one step because the E-step effectively ignores the missing data and the M-step directly computes the maximum likelihood estimates from the observed data. The true power of the EM algorithm becomes apparent in more complex situations, such as multivariate normal distributions with missing data or mixture models, where the E-step involves more complex imputation of missing values.
# Univariate example with missing values
y_full <- rnorm(200, mean = 2, sd = 1.5)
miss_idx <- sample(seq_along(y_full), 60)
y_obs <- y_full
y_obs[miss_idx] <- NA
# "EM" under MCAR just uses observed sufficient statistics
mu_hat <- mean(y_obs, na.rm = TRUE)
sig2hat <- mean( (y_obs - mu_hat)^2, na.rm = TRUE )
c(mu_hat = mu_hat, sigma_hat = sqrt(sig2hat))
## mu_hat sigma_hat
## 2.077042 1.563466
The Expectation-Maximization (EM) algorithm is particularly useful for fitting mixture models, where the data are assumed to come from a combination of several underlying distributions. In this case, we consider a two-component Gaussian mixture model. We assume our observed data \(Y\) comes from a mixture of two normal distributions, \(\mathcal{N}(\mu_1, \sigma_1^2)\) and \(\mathcal{N}(\mu_2, \sigma_2^2)\), with mixing probabilities \(\pi\) and \((1-\pi)\), respectively.
The model for an observation \(Y_i\) can be written as: \[ Y_i \sim \pi \cdot \mathcal{N}(\mu_1, \sigma_1^2) + (1-\pi) \cdot \mathcal{N}(\mu_2, \sigma_2^2). \] We introduce a latent (unobserved) variable \(Z_i \in \{1, 2\}\) for each observation \(Y_i\). This variable indicates which of the two Gaussian components the observation \(Y_i\) was generated from. This allows us to frame the problem as one with “missing data” (the component memberships \(Z_i\)), making it a perfect fit for the EM algorithm.
The algorithm proceeds by iteratively applying the following two steps:
This step is about “expectation.” We use the current parameter estimates (\(\pi^{(t)}, \mu_1^{(t)}, \sigma_1^{2(t)}, \mu_2^{(t)}, \sigma_2^{2(t)}\)) to compute the responsibilities for each data point. A responsibility, denoted \(\gamma_{ij}\), is the probability that observation \(Y_i\) belongs to component \(j\), given the current model parameters and the data. It’s essentially our “best guess” for the missing \(Z_i\). For the two-component Gaussian mixture, the responsibilities are: \[ \gamma_{i1} = \frac{\pi^{(t)}\phi(y_i;\mu_1^{(t)},\sigma_1^{2(t)})}{ \pi^{(t)}\phi(y_i;\mu_1^{(t)},\sigma_1^{2(t)}) + (1-\pi^{(t)})\phi(y_i;\mu_2^{(t)},\sigma_2^{2(t)})},\quad \gamma_{i2}=1-\gamma_{i1}. \] Here, \(\phi(\cdot;\mu,\sigma^2)\) is the probability density function (PDF) of a normal distribution with mean \(\mu\) and variance \(\sigma^2\). The numerator is the probability of \(Y_i\) coming from component 1, while the denominator is the total probability of \(Y_i\) coming from either component.
This step is about “maximization.” Using the responsibilities (\(\gamma_{i1}, \gamma_{i2}\)) computed in the E-step, we update our parameter estimates by maximizing the complete-data likelihood. This is equivalent to performing a weighted maximum likelihood estimation (MLE), where each data point’s contribution to the update is weighted by its responsibility. The updates are: \[\begin{align*} \pi^{(t+1)} &= \frac{1}{n}\sum_i \gamma_{i1} \\ \mu_1^{(t+1)} &= \frac{\sum_i \gamma_{i1} y_i}{\sum_i \gamma_{i1}} \\ (\sigma_1^2)^{(t+1)} &= \frac{\sum_i \gamma_{i1}(y_i-\mu_1^{(t+1)})^2}{\sum_i \gamma_{i1}} \end{align*}\] The updates for the second component (\(\mu_2^{(t+1)}\) and \(\sigma_2^{2(t+1)}\)) are analogous, simply replacing \(\gamma_{i1}\) with \(\gamma_{i2}\) in the formulas. These new parameter estimates are then used for the next iteration’s E-step, and the process continues until the parameters converge.
em_gmm1d <- function(y, maxit = 200, tol = 1e-6,
pi0 = 0.5, mu10 = quantile(y, .25), mu20 = quantile(y, .75),
sd10 = sd(y)/2, sd20 = sd(y)/2, verbose = FALSE) {
n <- length(y)
pi <- pi0; mu1 <- mu10; mu2 <- mu20; sd1 <- sd10; sd2 <- sd20
loglik <- function() sum(log(pi*dnorm(y, mu1, sd1) + (1-pi)*dnorm(y, mu2, sd2)))
ll_old <- -Inf
for (it in 1:maxit) {
# E-step
num1 <- pi * dnorm(y, mu1, sd1)
num2 <- (1 - pi) * dnorm(y, mu2, sd2)
gamma1 <- num1 / (num1 + num2)
gamma2 <- 1 - gamma1
# M-step
N1 <- sum(gamma1); N2 <- n - N1
pi <- N1 / n
mu1 <- sum(gamma1 * y) / N1
mu2 <- sum(gamma2 * y) / N2
sd1 <- sqrt(sum(gamma1 * (y - mu1)^2) / N1)
sd2 <- sqrt(sum(gamma2 * (y - mu2)^2) / N2)
ll_new <- loglik()
if (verbose) cat(sprintf("iter=%03d ll=%.4f\n", it, ll_new))
if (abs(ll_new - ll_old) < tol) break
ll_old <- ll_new
}
list(pi = pi, mu1 = mu1, sd1 = sd1, mu2 = mu2, sd2 = sd2,
iters = it, loglik = ll_old, gamma1 = gamma1)
}
# Simulate a two-component mixture
n <- 600
y <- c(rnorm(n/2, -1, 0.6), rnorm(n/2, 2.2, 0.9))
fit_em <- em_gmm1d(y, verbose = TRUE)
## iter=001 ll=-1052.1292
## iter=002 ll=-1047.6755
## iter=003 ll=-1046.8813
## iter=004 ll=-1046.6732
## iter=005 ll=-1046.6156
## iter=006 ll=-1046.5994
## iter=007 ll=-1046.5949
## iter=008 ll=-1046.5935
## iter=009 ll=-1046.5932
## iter=010 ll=-1046.5931
## iter=011 ll=-1046.5930
## iter=012 ll=-1046.5930
## iter=013 ll=-1046.5930
## iter=014 ll=-1046.5930
fit_em
## $pi
## [1] 0.5027359
##
## $mu1
## [1] -1.033378
##
## $sd1
## [1] 0.5630339
##
## $mu2
## [1] 2.328641
##
## $sd2
## [1] 0.9064508
##
## $iters
## [1] 14
##
## $loglik
## [1] -1046.593
##
## $gamma1
## [1] 9.997622e-01 9.915173e-01 9.994583e-01 9.987280e-01 9.996270e-01
## [6] 9.991162e-01 5.438552e-01 9.999023e-01 9.995050e-01 9.999519e-01
## [11] 9.992418e-01 9.999467e-01 9.981650e-01 9.983874e-01 9.996814e-01
## [16] 9.126180e-01 9.999510e-01 9.996506e-01 9.999759e-01 9.999367e-01
## [21] 9.995877e-01 9.947454e-01 9.999456e-01 9.999304e-01 9.997870e-01
## [26] 9.995982e-01 9.982864e-01 9.999079e-01 9.989537e-01 7.745471e-01
## [31] 9.929477e-01 9.998318e-01 9.999625e-01 9.877138e-01 1.504336e-01
## [36] 9.998274e-01 9.915184e-01 9.974909e-01 9.676281e-01 9.990748e-01
## [41] 9.996794e-01 9.994736e-01 9.999399e-01 9.993115e-01 9.941170e-01
## [46] 9.998936e-01 8.426259e-01 9.999344e-01 9.999788e-01 9.945737e-01
## [51] 9.924649e-01 9.994651e-01 9.996540e-01 9.985189e-01 9.970515e-01
## [56] 9.999604e-01 9.992574e-01 9.992467e-01 9.998631e-01 9.995068e-01
## [61] 9.991630e-01 9.998823e-01 9.999804e-01 9.998677e-01 9.991313e-01
## [66] 9.999799e-01 9.991250e-01 9.999421e-01 9.998762e-01 9.868345e-01
## [71] 9.514255e-01 9.879950e-01 9.993292e-01 9.731315e-01 9.988518e-01
## [76] 8.957875e-01 9.825844e-01 9.943350e-01 9.994581e-01 9.987045e-01
## [81] 9.984909e-01 9.998287e-01 9.999477e-01 9.937106e-01 9.998309e-01
## [86] 9.983995e-01 9.995740e-01 9.964173e-01 9.994048e-01 9.998966e-01
## [91] 9.969617e-01 9.358616e-01 9.989480e-01 9.999717e-01 9.976424e-01
## [96] 9.996415e-01 9.999542e-01 9.994147e-01 9.911468e-01 9.958441e-01
## [101] 9.995114e-01 9.817077e-01 9.998748e-01 9.927781e-01 9.990868e-01
## [106] 9.998042e-01 9.964309e-01 9.993374e-01 9.991699e-01 9.921894e-01
## [111] 9.968797e-01 9.997976e-01 9.723165e-01 9.998802e-01 9.932122e-01
## [116] 3.959150e-01 9.994522e-01 8.058205e-01 9.999289e-01 9.950852e-01
## [121] 9.982373e-01 9.994424e-01 9.975827e-01 9.918369e-01 9.998850e-01
## [126] 9.999396e-01 9.998700e-01 9.997935e-01 9.999005e-01 9.999688e-01
## [131] 9.981044e-01 9.990066e-01 9.998338e-01 9.479012e-01 9.999203e-01
## [136] 9.979898e-01 9.998203e-01 9.999049e-01 9.993024e-01 9.959227e-01
## [141] 9.521942e-01 9.904796e-01 9.996892e-01 9.995287e-01 9.981010e-01
## [146] 9.996536e-01 9.868852e-01 9.916431e-01 9.985867e-01 9.941639e-01
## [151] 9.991168e-01 9.797562e-01 9.997283e-01 5.773741e-01 9.998225e-01
## [156] 9.998430e-01 9.975853e-01 9.817656e-01 9.990274e-01 9.998949e-01
## [161] 9.998581e-01 9.994772e-01 9.999869e-01 9.999228e-01 9.986002e-01
## [166] 9.819279e-01 9.996082e-01 9.985332e-01 9.999742e-01 9.999667e-01
## [171] 9.996619e-01 9.953187e-01 9.998987e-01 9.957093e-01 9.996730e-01
## [176] 9.996851e-01 9.999841e-01 9.999016e-01 9.980226e-01 9.999262e-01
## [181] 9.991946e-01 9.993063e-01 9.997264e-01 9.995844e-01 9.997725e-01
## [186] 9.378490e-01 9.997777e-01 9.978115e-01 9.995653e-01 7.960418e-01
## [191] 9.995591e-01 9.999411e-01 9.985202e-01 9.999603e-01 9.923755e-01
## [196] 9.988442e-01 9.997411e-01 9.972933e-01 9.998230e-01 9.998398e-01
## [201] 9.998700e-01 9.975333e-01 9.999808e-01 9.894445e-01 9.999697e-01
## [206] 9.998226e-01 9.327044e-01 9.997022e-01 9.981365e-01 9.966337e-01
## [211] 9.998623e-01 9.917364e-01 9.995507e-01 9.993164e-01 9.999612e-01
## [216] 9.976645e-01 9.920044e-01 9.998978e-01 9.816791e-01 9.985668e-01
## [221] 9.994539e-01 9.999418e-01 9.999665e-01 9.999439e-01 9.963746e-01
## [226] 9.988263e-01 9.792033e-01 9.996187e-01 9.971869e-01 9.990498e-01
## [231] 9.959164e-01 9.998332e-01 9.998615e-01 9.998222e-01 9.999321e-01
## [236] 9.995378e-01 9.994092e-01 9.992619e-01 9.689129e-01 9.999895e-01
## [241] 9.999247e-01 9.940573e-01 9.990538e-01 9.988721e-01 9.994636e-01
## [246] 9.980347e-01 9.913722e-01 9.999580e-01 9.999481e-01 9.998101e-01
## [251] 9.999093e-01 9.977054e-01 9.999839e-01 9.990340e-01 9.999017e-01
## [256] 9.999610e-01 9.993865e-01 9.929570e-01 9.999154e-01 9.937318e-01
## [261] 9.730549e-01 9.962113e-01 9.987095e-01 9.970133e-01 9.999514e-01
## [266] 9.999632e-01 9.992342e-01 9.989552e-01 9.999835e-01 9.928587e-01
## [271] 9.695553e-01 9.918068e-01 9.933219e-01 9.992556e-01 9.977135e-01
## [276] 9.971961e-01 9.999443e-01 9.996886e-01 9.987580e-01 9.872968e-01
## [281] 9.998077e-01 9.870236e-01 9.985666e-01 9.998371e-01 9.986385e-01
## [286] 9.953668e-01 9.275924e-01 9.999568e-01 9.999538e-01 9.997382e-01
## [291] 9.998922e-01 9.991005e-01 9.991345e-01 9.973744e-01 9.982259e-01
## [296] 9.995968e-01 9.931470e-01 9.992400e-01 9.992301e-01 9.998717e-01
## [301] 4.091306e-11 5.947534e-13 1.919961e-05 5.431581e-08 9.776848e-08
## [306] 4.397515e-08 1.190430e-09 5.933055e-10 1.412021e-09 6.532991e-10
## [311] 7.603014e-08 1.073887e-08 3.150676e-09 6.574532e-13 2.279389e-13
## [316] 2.246542e-06 1.541412e-04 9.033801e-13 4.021435e-07 1.211402e-06
## [321] 3.862589e-09 2.649834e-09 7.363669e-06 8.725457e-07 3.559127e-08
## [326] 2.506126e-04 2.249938e-04 1.162866e-13 1.472255e-05 3.438769e-05
## [331] 1.191023e-09 2.175089e-05 3.019822e-10 4.109078e-03 4.149221e-05
## [336] 1.583513e-01 2.490125e-08 1.200899e-11 2.391246e-07 5.864112e-08
## [341] 8.051897e-08 8.317991e-08 7.914372e-02 9.290891e-12 8.743673e-03
## [346] 1.217924e-08 4.927200e-11 1.094807e-07 2.326396e-11 1.312598e-10
## [351] 9.401432e-21 1.120749e-09 1.176079e-05 5.074362e-13 4.037751e-10
## [356] 5.792419e-03 1.420329e-01 3.719292e-05 8.821597e-08 2.181628e-06
## [361] 7.200194e-05 9.896251e-10 2.693421e-08 1.005403e-11 7.052896e-18
## [366] 1.215231e-06 2.860390e-20 7.478828e-08 2.401115e-08 5.603484e-03
## [371] 2.375492e-05 2.261389e-10 7.732381e-05 2.099542e-06 5.472501e-17
## [376] 1.528970e-13 2.607147e-07 5.681304e-10 1.160435e-04 8.761773e-12
## [381] 1.594656e-02 3.425616e-07 6.324206e-05 4.906365e-10 2.045481e-12
## [386] 9.528564e-06 6.286274e-01 1.905478e-09 8.513599e-07 2.720408e-16
## [391] 1.524225e-14 2.727477e-08 6.640237e-10 3.255016e-06 8.083584e-09
## [396] 2.805478e-10 5.616630e-05 1.623867e-08 3.852458e-11 7.784193e-10
## [401] 2.163996e-06 9.131497e-06 1.346001e-08 4.861731e-07 3.515424e-07
## [406] 5.539891e-12 5.747765e-07 6.737232e-11 8.680016e-09 7.760426e-08
## [411] 7.605296e-09 1.344408e-12 1.068355e-01 3.843902e-06 1.691182e-06
## [416] 2.215796e-03 6.433168e-09 4.675013e-23 1.092805e-08 1.635846e-01
## [421] 1.311303e-10 9.707553e-01 1.378365e-11 2.421642e-14 6.349572e-05
## [426] 7.653071e-04 1.077720e-06 8.131702e-04 1.221877e-09 1.061702e-10
## [431] 5.754293e-05 5.998207e-07 1.070649e-14 6.252148e-16 5.192906e-05
## [436] 5.497076e-13 1.685614e-10 1.255871e-06 2.708715e-07 9.437774e-03
## [441] 4.609063e-04 1.539325e-14 3.064865e-06 5.279830e-09 3.772746e-16
## [446] 9.821896e-11 1.489144e-10 2.683785e-17 6.183675e-06 2.052255e-10
## [451] 7.686413e-07 5.124245e-17 1.127071e-04 1.144678e-19 1.041929e-08
## [456] 6.920915e-09 3.093892e-02 5.944300e-13 3.230309e-07 1.290713e-10
## [461] 8.771448e-10 5.151404e-09 1.168152e-11 1.497757e-05 3.242327e-08
## [466] 2.433546e-08 2.177466e-09 5.988094e-07 1.289370e-08 4.254451e-07
## [471] 4.931151e-07 6.158569e-08 2.832945e-14 8.874407e-11 6.985010e-10
## [476] 4.011466e-04 3.515585e-04 6.310922e-08 3.111511e-01 1.969579e-01
## [481] 8.917796e-09 3.429752e-01 7.289875e-14 2.517541e-03 1.472804e-06
## [486] 2.168619e-06 8.207777e-07 1.622745e-18 4.225800e-11 1.696987e-12
## [491] 3.005953e-15 1.290965e-03 1.043569e-08 1.669515e-08 4.982227e-11
## [496] 7.792060e-05 3.309859e-14 4.599490e-03 5.024556e-12 3.787791e-16
## [501] 1.270237e-08 6.473158e-05 1.157294e-06 6.650501e-14 2.019010e-05
## [506] 5.049416e-01 6.138239e-05 1.537783e-23 3.263420e-03 4.080360e-10
## [511] 8.221825e-08 5.410191e-05 7.533267e-17 2.906664e-08 9.943516e-01
## [516] 7.591630e-09 1.387002e-07 2.730757e-05 1.419141e-04 8.877266e-24
## [521] 3.742273e-11 5.749408e-19 2.422834e-11 1.877489e-10 2.285293e-04
## [526] 2.682085e-05 5.869686e-15 4.403700e-08 6.690392e-08 7.104295e-06
## [531] 7.383523e-11 9.132816e-09 8.379204e-06 3.408087e-25 4.037814e-05
## [536] 2.572127e-06 4.425989e-06 1.718455e-18 6.041757e-17 2.152426e-10
## [541] 1.388032e-06 4.907576e-08 6.579489e-10 1.285402e-10 9.597057e-07
## [546] 5.407242e-05 2.982417e-06 5.336019e-08 2.534320e-07 3.878754e-12
## [551] 2.197275e-09 2.778713e-09 2.947129e-15 1.427802e-12 1.670296e-06
## [556] 7.992469e-13 5.448417e-03 1.168453e-11 7.413299e-09 1.992836e-06
## [561] 4.424027e-05 9.850402e-03 2.280590e-05 3.285807e-04 5.061588e-01
## [566] 1.836060e-11 7.380491e-04 1.330538e-11 2.821458e-05 9.882139e-05
## [571] 1.439961e-08 2.606697e-07 2.445154e-09 9.937116e-10 5.673007e-07
## [576] 9.199272e-01 2.834756e-06 1.772850e-11 1.258038e-10 3.647183e-08
## [581] 1.136122e-03 8.588075e-04 3.131820e-07 1.349835e-09 1.190421e-09
## [586] 1.015447e-11 1.981754e-03 5.526994e-03 1.772727e-09 7.995092e-05
## [591] 2.375468e-09 4.941524e-14 1.391371e-06 2.731799e-04 7.645585e-12
## [596] 1.559183e-11 5.237846e-06 1.062288e-12 7.573436e-08 1.750847e-09
# Visualize fit
hist(y, breaks = 40, freq = FALSE, main = "Gaussian Mixture: EM fit", xlab = "y")
curve(fit_em$pi*dnorm(x, fit_em$mu1, fit_em$sd1) +
(1-fit_em$pi)*dnorm(x, fit_em$mu2, fit_em$sd2),
add = TRUE, lwd = 2)
data(faithful) # eruptions & waiting
y <- scale(faithful$eruptions) # 1D for illustration
fit_faithful <- em_gmm1d(as.numeric(y))
fit_faithful
## $pi
## [1] 0.3484092
##
## $mu1
## [1] -1.287193
##
## $sd1
## [1] 0.2064523
##
## $mu2
## [1] 0.688269
##
## $sd2
## [1] 0.3829148
##
## $iters
## [1] 18
##
## $loglik
## [1] -240.3934
##
## $gamma1
## [1] 5.407820e-10 9.999998e-01 1.762881e-06 9.999407e-01 2.245765e-25
## [6] 1.577371e-01 1.226519e-28 5.407820e-10 9.999992e-01 5.610554e-22
## [11] 9.999998e-01 1.118069e-14 2.485549e-19 9.999999e-01 1.226519e-28
## [16] 9.999889e-01 9.999999e-01 1.150578e-30 1.000000e+00 3.366285e-20
## [21] 9.999998e-01 9.999999e-01 5.693844e-08 2.249421e-03 2.245765e-25
## [26] 5.407820e-10 9.999991e-01 2.360405e-17 1.216285e-13 1.702232e-23
## [31] 4.415827e-21 3.964241e-24 6.619437e-07 1.566538e-16 2.208432e-13
## [36] 9.999984e-01 9.999997e-01 2.396530e-31 9.999998e-01 2.567724e-30
## [41] 5.610554e-22 9.999996e-01 5.007786e-26 9.999999e-01 2.245765e-25
## [46] 2.780950e-06 2.208432e-13 9.999954e-01 2.607595e-27 9.999987e-01
## [51] 1.150578e-30 5.860851e-29 9.999998e-01 2.396530e-31 9.999999e-01
## [56] 2.166457e-32 1.171986e-11 9.999999e-01 5.007786e-26 2.197484e-21
## [61] 9.999718e-01 9.501288e-25 9.999999e-01 1.150578e-30 9.999998e-01
## [66] 6.904497e-23 9.138711e-19 1.226519e-28 9.999970e-01 1.226519e-28
## [71] 1.566538e-16 9.999991e-01 9.501288e-25 5.368267e-16 9.999989e-01
## [76] 2.371539e-36 9.999984e-01 5.007786e-26 3.780937e-14 5.407820e-10
## [81] 3.444806e-18 1.135544e-21 1.231298e-17 9.743424e-01 4.340309e-17
## [86] 1.896924e-33 3.378847e-15 4.527272e-25 9.999889e-01 5.368267e-16
## [91] 9.999824e-01 1.135544e-21 9.999997e-01 5.136658e-31 9.999998e-01
## [96] 4.415827e-21 5.567522e-28 3.853295e-12 9.999997e-01 9.499220e-33
## [101] 9.984077e-01 2.761879e-22 9.999954e-01 9.501288e-25 8.260994e-17
## [106] 9.999997e-01 1.226519e-28 9.999999e-01 1.062272e-31 3.633572e-11
## [111] 2.664688e-29 9.999231e-01 9.499220e-33 3.362142e-23 9.999999e-01
## [116] 2.607595e-27 9.998999e-01 1.150703e-26 9.999998e-01 3.362142e-23
## [121] 9.811038e-01 4.340309e-17 3.366285e-20 9.999991e-01 1.150703e-26
## [126] 2.160778e-12 9.999995e-01 9.501288e-25 9.999534e-01 1.207126e-27
## [131] 9.999997e-01 9.138711e-19 5.441270e-01 1.135544e-21 9.999998e-01
## [136] 1.412684e-22 9.999996e-01 1.896924e-33 9.999980e-01 6.846227e-12
## [141] 6.666826e-20 9.999718e-01 2.245765e-25 5.136658e-31 1.135544e-21
## [146] 9.999989e-01 2.607595e-27 9.999984e-01 4.413904e-37 9.999998e-01
## [151] 1.321439e-35 5.368267e-16 9.996178e-01 1.150703e-26 1.544027e-09
## [156] 5.368267e-16 9.501288e-25 2.360405e-17 9.999998e-01 1.814207e-15
## [161] 9.999824e-01 1.777569e-18 9.999987e-01 2.208432e-13 1.244931e-08
## [166] 2.458872e-26 9.997780e-01 6.902243e-35 9.999994e-01 5.365220e-27
## [171] 9.999995e-01 9.999963e-01 2.458872e-26 1.762881e-06 9.138711e-19
## [176] 1.135544e-21 9.501288e-25 9.994916e-01 5.368267e-16 9.138711e-19
## [181] 9.999996e-01 2.458872e-26 3.366285e-20 2.160778e-12 9.999980e-01
## [186] 1.702232e-23 2.360405e-17 9.999998e-01 3.362142e-23 9.999861e-01
## [191] 1.150578e-30 9.999998e-01 1.150578e-30 1.231298e-17 1.881994e-15
## [196] 6.666826e-20 1.244931e-08 2.879751e-22 9.999638e-01 5.567522e-28
## [201] 9.999954e-01 5.610554e-22 3.444806e-18 9.999997e-01 1.150703e-26
## [206] 9.999999e-01 2.761879e-22 1.216285e-13 9.999994e-01 9.501288e-25
## [211] 9.997116e-01 1.226519e-28 9.999997e-01 2.208432e-13 1.526092e-07
## [216] 6.666826e-20 9.996178e-01 1.150578e-30 9.999987e-01 1.777569e-18
## [221] 9.999997e-01 1.693478e-20 9.999999e-01 1.986668e-24 5.368267e-16
## [226] 6.399360e-18 2.360405e-17 1.693478e-20 1.118069e-14 1.062445e-25
## [231] 2.360405e-17 9.994916e-01 4.869402e-19 9.999776e-01 8.229832e-24
## [236] 9.999996e-01 9.999997e-01 8.840873e-21 3.378847e-15 9.998713e-01
## [241] 1.777569e-18 9.998312e-01 1.896924e-33 1.125084e-01 2.458872e-26
## [246] 2.208432e-13 9.999963e-01 2.761879e-22 9.999930e-01 5.610554e-22
## [251] 9.999824e-01 8.229832e-24 1.544027e-09 9.501288e-25 1.777569e-18
## [256] 3.858606e-13 1.118069e-14 8.229832e-24 9.999987e-01 8.840873e-21
## [261] 5.447641e-30 2.245765e-25 9.999997e-01 3.366285e-20 9.999989e-01
## [266] 9.999638e-01 1.207060e-29 6.399360e-18 9.999912e-01 3.362142e-23
## [271] 9.999998e-01 3.964241e-24
# Soft assignments -> cluster labels
cl <- ifelse(fit_faithful$gamma1 > 0.5, 1, 2)
plot(faithful$waiting, faithful$eruptions, pch = 19, cex = 0.7,
col = c("#1b9e77","#d95f02")[cl],
xlab = "Waiting time (min)", ylab = "Eruption duration (min)",
main = "Old Faithful: EM-based soft clustering (2 components)")
legend("topleft", c("Comp 1","Comp 2"), pch=19, col=c("#1b9e77","#d95f02"), bty="n")
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Calcutta
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.35 R6_2.5.1 fastmap_1.2.0 xfun_0.50
## [5] cachem_1.1.0 knitr_1.48 htmltools_0.5.8.1 rmarkdown_2.27
## [9] lifecycle_1.0.4 cli_3.6.2 vctrs_0.6.5 sass_0.4.9
## [13] jquerylib_0.1.4 compiler_4.4.0 highr_0.11 rstudioapi_0.17.1
## [17] tools_4.4.0 evaluate_0.24.0 bslib_0.8.0 yaml_2.3.10
## [21] rlang_1.1.4 jsonlite_1.8.8