set.seed(1) # reproductibilité
n <- 100
x1 <- rnorm(n) # X1 ~ N(0,1)
x2 <- rnorm(n) # X2 ~ N(0,1)
eps <- rnorm(n, sd = 1)
# vrai modèle (tu peux changer les coefficients si tu veux)
y <- 1 + 2*x1 - 3*x2 + eps
# petit check (optionnel)
summary(lm(y ~ x1 + x2))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.025353 0.1051941 9.747254 4.711720e-16
## x1 2.021110 0.1167527 17.311034 2.001810e-31
## x2 -3.053467 0.1094785 -27.891006 4.033249e-48
b1 <- 0 # initial guess for beta1 (any value works)
a <- y - b1 * x1 # residual after holding beta1 fixed
fit2 <- lm(a ~ x2) # a = b0 + b2 * x2 + error
b0 <- coef(fit2)[1] # updated intercept
b2 <- coef(fit2)[2] # updated beta2
b0; b2
## (Intercept)
## 1.245356
## x2
## -3.055351
a <- y - b2 * x2 # retirer l'effet de X2
fit1 <- lm(a ~ x1) # a = b0 + b1 * x1 + erreur
b0 <- coef(fit1)[1] # nouvel intercept
b1 <- coef(fit1)[2] # nouveau beta1
b0; b1
## (Intercept)
## 1.025282
## x1
## 2.021108
## ----- (e) Backfitting loop -----
b0 <- b1 <- b2 <- 0 # reset initial estimates
B0 <- B1 <- B2 <- numeric(1000) # storage for each iteration
for (i in 1:1000) {
# Step (c): update b0 and b2
a <- y - b1 * x1
fit2 <- lm(a ~ x2)
b0 <- coef(fit2)[1]
b2 <- coef(fit2)[2]
# Step (d): update b0 and b1
a <- y - b2 * x2
fit1 <- lm(a ~ x1)
b0 <- coef(fit1)[1]
b1 <- coef(fit1)[2]
# store the coefficients for plotting
B0[i] <- b0
B1[i] <- b1
B2[i] <- b2
}
# print final estimates
cat("Final estimates after 1000 iterations:\n")
## Final estimates after 1000 iterations:
cat("beta0 =", b0, "\n")
## beta0 = 1.025353
cat("beta1 =", b1, "\n")
## beta1 = 2.02111
cat("beta2 =", b2, "\n")
## beta2 = -3.053467
## ----- Plot the evolution -----
plot(1:1000, B0, type = "l", col = "red", lwd = 2,
ylim = range(c(B0, B1, B2)),
xlab = "Iteration", ylab = "Coefficient value",
main = "Evolution of Backfitting Estimates")
lines(1:1000, B1, col = "blue", lwd = 2)
lines(1:1000, B2, col = "darkgreen", lwd = 2)
legend("bottomright",
legend = c(expression(beta[0]), expression(beta[1]), expression(beta[2])),
col = c("red", "blue", "darkgreen"), lwd = 2, bty = "n")
## ----- (f) Compare with Multiple Linear Regression -----
# Fit the standard multiple linear regression
fit_mlr <- lm(y ~ x1 + x2)
mlr_coef <- coef(fit_mlr)
print(mlr_coef)
## (Intercept) x1 x2
## 1.025353 2.021110 -3.053467
# Re-plot the backfitting evolution
plot(1:1000, B0, type = "l", col = "red", lwd = 2,
ylim = range(c(B0, B1, B2, mlr_coef)),
xlab = "Iteration", ylab = "Coefficient value",
main = "Backfitting Estimates vs Multiple Linear Regression")
lines(1:1000, B1, col = "blue", lwd = 2)
lines(1:1000, B2, col = "darkgreen", lwd = 2)
# Add horizontal lines for the MLR coefficients
abline(h = mlr_coef[1], col = "red", lty = 2) # beta0 (Intercept)
abline(h = mlr_coef[2], col = "blue", lty = 2) # beta1
abline(h = mlr_coef[3], col = "darkgreen", lty = 2)# beta2
# Add legend
legend("bottomright",
legend = c(expression(hat(beta)[0]), expression(hat(beta)[1]), expression(hat(beta)[2]),
"MLR β0", "MLR β1", "MLR β2"),
col = c("red","blue","darkgreen","red","blue","darkgreen"),
lwd = c(2,2,2,1,1,1),
lty = c(1,1,1,2,2,2),
bty = "n")
## ----- (g) Check for convergence -----
# compute absolute change in coefficients from one iteration to the next
delta_b0 <- abs(diff(B0))
delta_b1 <- abs(diff(B1))
delta_b2 <- abs(diff(B2))
# find when all changes become very small
tol <- 1e-6
converged_iter <- which(delta_b0 < tol & delta_b1 < tol & delta_b2 < tol)[1]
if (is.na(converged_iter)) {
cat("The coefficients never changed less than", tol, "within 1000 iterations.\n")
} else {
cat("Approximate convergence reached around iteration:", converged_iter, "\n")
}
## Approximate convergence reached around iteration: 2
set.seed(42)
if (!requireNamespace("e1071", quietly = TRUE)) {
install.packages("e1071")
}
library(e1071)
# Helper to sample polar points in a radius interval [rmin, rmax]
rsamp <- function(n, rmin, rmax) {
theta <- runif(n, 0, 2*pi)
# uniform in area => sample radius^2 uniformly, then take sqrt
r <- sqrt(runif(n, rmin^2, rmax^2))
cbind(r*cos(theta), r*sin(theta))
}
n_in <- 400 # +1 class
n_out <- 400 # -1 class
r0 <- 1.00 # inner radius (disk)
r1 <- 1.55; r2 <- 2.00 # annulus radii
X_pos <- rsamp(n_in, 0, r0)
X_neg <- rsamp(n_out, r1, r2)
X <- rbind(X_pos, X_neg)
y <- factor(c(rep(1, n_in), rep(-1, n_out))) # labels +1 / -1
# A generic-looking 2-fold split (no special structure)
fold_id <- rep(1:2, length.out = nrow(X))
fold_id <- sample(fold_id) # shuffle
library(e1071)
par(mar=c(4,4,1,1))
plot(X, col = ifelse(y==1, "steelblue", "tomato"), pch=19, cex=0.7,
xlab="x1", ylab="x2", main="Concentric dataset (disk vs annulus)")
symbols(0, 0, circles=r0, add=TRUE, inches=FALSE, lwd=2) # inner circle
symbols(0, 0, circles=r1, add=TRUE, inches=FALSE, lwd=2, lty=2)
symbols(0, 0, circles=r2, add=TRUE, inches=FALSE, lwd=2, lty=2)
legend("topright", c("+1 (disk)","-1 (annulus)"),
col=c("steelblue","tomato"), pch=19, bty="n")
##
## ───────────────────────────────────────────────
## Interpretation – Sketch of the data
## ───────────────────────────────────────────────
## The dataset consists of two classes arranged in concentric circles:
##
## • Class +1 (blue) forms an inner disk centered at the origin.
## • Class -1 (red) forms an outer annulus surrounding the disk.
##
## This design makes the data **nonlinearly separable** in the (x1, x2) plane.
## Any straight line drawn through the plot will necessarily cut through both
## classes, producing a large classification error for linear models.
##
## The pattern is ideal for demonstrating how adding a nonlinear feature
## (such as z = x1^2 + x2^2) allows an SVM to separate the two classes
## perfectly in a higher-dimensional space.
## ───────────────────────────────────────────────
dat <- data.frame(x1 = X[,1], x2 = X[,2], y = y)
svm_lin <- svm(y ~ x1 + x2, data = dat, kernel = "linear", cost = 1)
pred_lin <- predict(svm_lin, dat)
lin_err <- mean(pred_lin != y)
lin_err
## [1] 0.3675
##
## ───────────────────────────────────────────────
## Interpretation – Step (3): Linear SVM (expect high error)
## ───────────────────────────────────────────────
## The linear SVM achieved a training error of approximately 0.38,
## meaning about 38% of the data points were misclassified.
##
## This result confirms that the dataset is **not linearly separable**
## in the (x1, x2) space. Because the classes form concentric circles,
## no straight line can separate them without cutting through both
## regions.
##
## Therefore, any linear classifier — including a linear SVM —
## will have a large error rate on this data. This motivates the
## next step, where we add a nonlinear feature to make the classes
## separable in a higher-dimensional space.
## ───────────────────────────────────────────────
dat$z <- dat$x1^2 + dat$x2^2
# Fit with a very large C to show that 0 training error is attainable
svm_quad_bigC <- svm(y ~ x1 + x2 + z, data = dat, kernel = "linear", cost = 1e6, scale = TRUE)
pred_bigC <- predict(svm_quad_bigC, dat)
bigC_err <- mean(pred_bigC != y)
bigC_err # this can be (near) 0 on this dataset
## [1] 0
##
## ───────────────────────────────────────────────
## Interpretation – Step (4): Nonlinear feature and big C
## ───────────────────────────────────────────────
## After adding the nonlinear feature z = x1^2 + x2^2, the SVM achieves a training
## error of 0, meaning all points are perfectly classified.
##
## This transformation maps the data into a higher-dimensional space (x1, x2, z)
## where the two circular classes become linearly separable:
## • Inner circle (class +1) → smaller z values
## • Outer ring (class -1) → larger z values
##
## A linear boundary in this (x1, x2, z) space corresponds to a circular boundary
## in the original 2D plane.
##
## Using a very large cost parameter C = 1e6 enforces a hard margin, preventing
## any misclassification. This confirms that adding z = x1^2 + x2^2 allows the
## SVM to perfectly fit the data that was not linearly separable before.
## ───────────────────────────────────────────────
Cs <- c(0.3, 1, 3, 10, 30, 100, 300)
cv_err <- numeric(length(Cs))
names(cv_err) <- paste0("C=", Cs)
for (k in seq_along(Cs)) {
Ck <- Cs[k]
fold_err <- c()
for (f in 1:2) {
train_idx <- which(fold_id != f)
test_idx <- which(fold_id == f)
fit <- svm(y ~ x1 + x2 + z, data = dat[train_idx, ], kernel = "linear",
cost = Ck, scale = TRUE)
pr <- predict(fit, dat[test_idx, ])
fold_err[f] <- mean(pr != dat$y[test_idx])
}
cv_err[k] <- mean(fold_err)
}
cv_err
## C=0.3 C=1 C=3 C=10 C=30 C=100 C=300
## 0 0 0 0 0 0 0
C_star <- Cs[ which.min(cv_err) ]
C_star
## [1] 0.3
##
## ───────────────────────────────────────────────
## Interpretation : 2-fold Cross-Validation
## ───────────────────────────────────────────────
## The 2-fold cross-validation error is 0 for all tested values of C,
## so every model perfectly separates the folds. As a result, the first
## value in the grid (C = 0.3) is selected as the optimal cost.
##
## In this particular random split, both folds happen to be easily separable
## after adding the nonlinear feature z = x1^2 + x2^2. Because of that,
## even a small C provides a perfect fit without margin violations.
##
## In general, if the dataset contained more overlap or noisy points near
## the circular boundary, cross-validation would tend to choose a larger
## but finite C (for example, C > 10) and keep a small nonzero training
## error to improve generalization.
##
## Summary:
## • Chosen C* = 0.3
## • Cross-validation error = 0
## • Reason: both folds are perfectly separable in the nonlinear space.
## ───────────────────────────────────────────────
# Self-contained decision-region plot for svm_cv
# 1) Ensure data and model exist (auto-refit if needed)
if (!exists("dat") || !all(c("x1","x2","y") %in% names(dat))) {
stop("dat not found or missing columns x1, x2, y. Run earlier chunks first.")
}
if (!"z" %in% names(dat)) {
dat$z <- dat$x1^2 + dat$x2^2
}
if (!exists("C_star")) {
C_star <- 10 # fallback if CV step wasn't run
}
need_refit <- FALSE
if (!exists("svm_cv")) {
if (!requireNamespace("e1071", quietly = TRUE)) install.packages("e1071")
library(e1071)
svm_cv <- svm(y ~ x1 + x2 + z, data = dat, kernel = "linear",
cost = C_star, scale = TRUE)
need_refit <- TRUE
}
# 2) Grid for visualization
gx <- seq(min(dat$x1)-0.2, max(dat$x1)+0.2, length.out = 250)
gy <- seq(min(dat$x2)-0.2, max(dat$x2)+0.2, length.out = 250)
G <- expand.grid(x1 = gx, x2 = gy)
G$z <- G$x1^2 + G$x2^2
Gp <- predict(svm_cv, G)
# 3) Plot decision regions + data
par(mar = c(4,4,1,1))
plot(dat$x1, dat$x2,
xlab = "x1", ylab = "x2", asp = 1,
col = ifelse(dat$y==1, "steelblue", "tomato"),
pch = 19, cex = 0.6,
main = sprintf("Decision regions (SVM with z); C* = %.3f", C_star))
zmat <- matrix(as.integer(Gp == levels(dat$y)[1]),
nrow = length(gx), ncol = length(gy))
image(gx, gy, z = t(zmat),
col = c(rgb(1,0,0,0.08), rgb(0,0,1,0.08)),
add = TRUE)
points(dat$x1, dat$x2,
col = ifelse(dat$y==1, "steelblue", "tomato"),
pch = 19, cex = 0.6)
legend("topright", c("+1 (disk)", "-1 (annulus)"),
col = c("steelblue", "tomato"), pch = 19, bty = "n")
if (need_refit) {
cat(sprintf("\n[Note] 'svm_cv' was not found; refit on full data with C* = %.3f.\n", C_star))
}
##
## [Note] 'svm_cv' was not found; refit on full data with C* = 0.300.
We consider a 1-dimensional Gaussian mixture model:
\[ Y_i \sim \text{Unif}\{0,1\}, \quad X_i|Y_i=k \sim \mathcal N(\mu_k, \sigma_k^2) \]
When \(\sigma_0, \sigma_1\) (and the
mixture weights) are known and equal, the EM algorithm for GMM behaves
almost identically to k-means:
- E-step: assign each observation to the nearest
mean
- M-step: recompute the cluster means.
Like k-means, the EM algorithm can get stuck at a local optimum depending on initialization.
We’ll simulate a simple case: two well-separated clusters, but the algorithm starts with both means close together on the same side.
# Generate 1D data from two Gaussian clusters
n <- 200
x <- c(rnorm(n/2, -3, 1), rnorm(n/2, 3, 1))
df <- data.frame(x = x)
# Bad initialization (both centers near the left cluster)
bad_init <- c(-2.5, -1.5)
ggplot(df, aes(x=x, y=0)) +
geom_jitter(height = 0.02, alpha=.7, size=1.6) +
geom_vline(xintercept = bad_init, linetype="dashed", color="red") +
annotate("text", x=bad_init, y=.02, label=c("init μ0","init μ1"), color="red", vjust=-1, size=3) +
labs(title="1D data with poor initialization (both means on left cluster)", y="", x="x")
This plot shows a one-dimensional dataset with two true clusters: one
around x ≈ -3 and another around x ≈
+3.
The two red dashed vertical lines represent the initial means
(μ₀ and μ₁) chosen for the algorithm.
Because both means start near the left cluster, the algorithm initially
assigns almost all nearby points to them.
As iterations continue, both means remain near the left-hand side
instead of separating across the two clusters.
This causes the algorithm to get stuck in a local
optimum, where both centers describe only the left
cluster,
and the right cluster is completely ignored.
This example illustrates that the EM algorithm (and k-means) can fail
to find the global solution if the
initialization is poor.
# Run k-means with fixed initial centers
km_bad <- kmeans(df$x, centers = matrix(bad_init, ncol=1))
centers_bad <- sort(drop(km_bad$centers))
ggplot(df, aes(x=x, y=0, color=factor(km_bad$cluster))) +
geom_jitter(height=.02, alpha=.7, size=1.6, show.legend=FALSE) +
geom_vline(xintercept = centers_bad, color="red") +
annotate("text", x=centers_bad, y=.02, label=c("μ̂0","μ̂1"), color="red", vjust=-1, size=3) +
labs(title="K-means can get stuck in a local optimum", x="x", y="")
This plot shows the final result of the K-means algorithm after being initialized with poor starting points.
The data clearly has two clusters: one centered near x ≈
-3 and another near x ≈ +3.
However, because both initial means started close together on the left
side, the algorithm did not fully explore the right-hand cluster.
The red vertical lines represent the final estimated means (μ̂₀ and
μ̂₁).
Although K-means managed to form two groups, the result is a
local optimum —
the boundaries between clusters do not perfectly represent the true
underlying data structure.
This demonstrates that K-means can get stuck when
initialization is poor,
and the final clustering depends heavily on the chosen starting
points.
Even though there are clearly two true clusters, both centers converge to roughly the same region — a bad local optimum.
We now perform a simplified EM procedure with known σ and poor initialization.
sigma <- 1 # known variance
mu <- sort(bad_init)
em_step <- function(x, mu, sigma) {
# E-step: responsibilities
d0 <- dnorm(x, mean=mu[1], sd=sigma)
d1 <- dnorm(x, mean=mu[2], sd=sigma)
gamma1 <- d1 / (d0 + d1)
# M-step: update means using soft assignments
mu_new1 <- sum((1-gamma1) * x) / sum(1-gamma1)
mu_new2 <- sum(gamma1 * x) / sum(gamma1)
c(mu_new1, mu_new2)
}
trace <- data.frame(iter=0, mu1=mu[1], mu2=mu[2])
for (it in 1:30) {
mu <- sort(em_step(x, mu, sigma))
trace <- rbind(trace, data.frame(iter=it, mu1=mu[1], mu2=mu[2]))
}
trace_long <- trace |>
pivot_longer(cols=c(mu1,mu2), names_to="which", values_to="value")
g1 <- ggplot(trace_long, aes(iter, value, color=which)) +
geom_line() + geom_point(size=1.5) +
labs(title="EM mean trajectories (stuck near a bad solution)",
x="Iteration", y="Mean value")
g2 <- ggplot(df, aes(x=x, y=..density..)) +
geom_histogram(bins=40, fill="grey85", color="white") +
geom_vline(xintercept=tail(trace$mu1,1), color="red") +
geom_vline(xintercept=tail(trace$mu2,1), color="red") +
labs(title="Final EM means (local optimum)", x="x", y="density")
gridExtra::grid.arrange(g1, g2, ncol=2)
The left panel shows the evolution of the EM algorithm’s
estimated means (μ₁ and μ₂) across iterations.
From the beginning, the two means separate quickly but then
stabilize prematurely, indicating that the algorithm
has converged.
However, this convergence does not guarantee a global
optimum.
In this case, both means have settled near the same clusters they
started close to — one around x ≈ -3 and the other
around x ≈ +3 — even though the data distribution would
allow for better separation if the initialization had been
different.
The right panel shows the final estimated means (red
vertical lines) superimposed on the data distribution.
Although they capture the two general modes of the data, the EM
algorithm has converged to a local optimum due to its
initialization.
This confirms that, like K-means, the EM algorithm’s performance
strongly depends on the starting values of the
parameters.
Poor initialization can lead to convergence toward a suboptimal
solution rather than the true maximum likelihood estimate.
Conclusion (a):
Both k-means and the EM algorithm for GMM can converge to local
optima, depending on the initial parameter values.
Now, we generate a two-moons dataset, where the
clusters are clearly non-convex.
Here, the 2-means solution will give an incorrect
linear split.
# Helper function to make two moons
make_moon <- function(n, radius=1, noise=0.06, which=c("upper","lower")) {
t <- runif(n, 0, pi)
if (which[1] == "upper") {
x <- radius * cos(t) + rnorm(n, 0, noise)
y <- radius * sin(t) + rnorm(n, 0, noise)
} else {
x <- radius * cos(t) + 1 + rnorm(n, 0, noise)
y <- -radius * sin(t) - 0.5 + rnorm(n, 0, noise)
}
cbind(x,y)
}
n2 <- 400
A <- make_moon(n2/2, which="upper")
B <- make_moon(n2/2, which="lower")
moons <- rbind(
data.frame(x=A[,1], y=A[,2], lab="A"),
data.frame(x=B[,1], y=B[,2], lab="B")
)
ggplot(moons, aes(x, y, color=lab)) +
geom_point(alpha=.8, size=1.4) +
coord_equal() + guides(color="none") +
labs(title="Two-moons dataset (two obvious clusters)",
x="x", y="y")
The plot displays a two-moons dataset, which contains
two clearly separated, curved clusters.
Each moon represents a distinct group of observations, forming a
non-convex shape.
Visually, the two clusters are obvious to the human eye, but they are
not linearly separable —
any straight boundary would cut through both moons.
This dataset is a classic example where K-means
fails: because K-means assumes spherical, convex
clusters,
its decision boundary will be a straight line that incorrectly divides
each moon in half.
In later steps, we can apply more flexible algorithms such as
Spectral Clustering or a Gaussian Mixture Model
with full covariance matrices,
which can capture these curved boundaries and correctly separate the two
moons.
# Run k-means
km2 <- kmeans(moons[,c("x","y")], centers = 2, nstart=50)
# Build grid for decision boundary visualization
make_grid <- function(dat, h = 0.02, pad = 0.2) {
xr <- range(dat$x); yr <- range(dat$y)
xs <- seq(xr[1]-pad, xr[2]+pad, by=h)
ys <- seq(yr[1]-pad, yr[2]+pad, by=h)
expand.grid(x=xs, y=ys)
}
grid <- make_grid(moons)
# Assign each grid point to nearest center
cent <- km2$centers
assign_km <- function(X, centers) {
d <- sapply(1:nrow(centers), function(k) rowSums((X - centers[k,])^2))
max.col(-d)
}
grid$pred_km <- factor(assign_km(as.matrix(grid), cent))
ggplot() +
geom_raster(data=grid, aes(x, y, fill=pred_km), alpha=.25) +
geom_point(data=moons, aes(x, y, color=factor(km2$cluster)), size=1.2) +
geom_point(data=as.data.frame(cent), aes(x,y), shape=4, size=4, stroke=1.2) +
coord_equal() + guides(fill="none", color="none") +
labs(title="2-means: linear decision boundary (wrong for two moons)", x="x", y="y")
This figure shows the result of applying K-means with k = 2 to the two-moons dataset.
The colored background represents the decision
regions assigned by the algorithm.
The large black crosses (×) indicate the final cluster
centroids.
Although the dataset clearly contains two distinct, curved
clusters,
K-means produces a straight linear boundary that cuts
diagonally across the space.
As a result, each moon is incorrectly divided into two halves, rather
than being treated as a separate cluster.
This happens because K-means assumes spherical, convex
clusters and minimizes Euclidean distance to the nearest
centroid.
It cannot capture the non-linear, curved shape of the
two moons.
Therefore, the 2-means algorithm gives the wrong
clustering,
and a more flexible method — such as Spectral
Clustering or a Gaussian Mixture Model with full
covariance — is needed to correctly separate the two moons.
Here, the decision boundary is almost a straight
line, cutting both moons in half.
This clearly fails to represent the real cluster structure.
Spectral clustering (or kernel k-means) can capture non-linear boundaries that match the true shapes.
library(kernlab)
library(ggplot2)
# --- Inputs expected from previous chunks: 'moons' (x,y) and 'grid' (x,y) ---
# If 'grid' might be too dense, regenerate a slightly coarser one:
make_grid <- function(dat, h = 0.03, pad = 0.2) {
xr <- range(dat$x); yr <- range(dat$y)
xs <- seq(xr[1]-pad, xr[2]+pad, by=h)
ys <- seq(yr[1]-pad, yr[2]+pad, by=h)
expand.grid(x=xs, y=ys)
}
if (!exists("grid")) grid <- make_grid(moons, h = 0.03)
# 1) Spectral clustering on DATA ONLY
sigma <- 2
sc <- specc(as.matrix(moons[, c("x","y")]),
centers = 2, kernel = "rbfdot", kpar = list(sigma = sigma))
labs_data <- as.integer(sc@.Data) # 1/2 labels for the data
# 2) Assign grid points by average RBF similarity to each cluster (kernel-mean rule)
rbf_sim <- function(A, B, sigma) {
# returns |A| x |B| matrix of k(a,b) with RBF kernel
# k(a,b) = exp(-||a-b||^2 / (2*sigma^2))
A2 <- rowSums(A^2)
B2 <- rowSums(B^2)
# dist^2 via (a-b)^2 = |a|^2 + |b|^2 - 2 a·b
D2 <- outer(A2, B2, "+") - 2 * (A %*% t(B))
exp(-D2 / (2 * sigma^2))
}
X <- as.matrix(moons[, c("x","y")])
G <- as.matrix(grid[, c("x","y")])
# Similarity of each grid point to each cluster
idx1 <- which(labs_data == 1)
idx2 <- which(labs_data == 2)
K_g1 <- rbf_sim(G, X[idx1, , drop=FALSE], sigma)
K_g2 <- rbf_sim(G, X[idx2, , drop=FALSE], sigma)
# Average similarity to each cluster
s1 <- rowMeans(K_g1)
s2 <- rowMeans(K_g2)
grid$pred_spec <- factor(ifelse(s1 >= s2, 1, 2))
# 3) Plot decision regions + data colored by spectral labels
ggplot() +
geom_raster(data = grid, aes(x, y, fill = pred_spec), alpha = .25) +
geom_point(data = moons, aes(x, y, color = factor(labs_data)), size = 1.2) +
coord_equal() +
guides(fill = "none", color = "none") +
labs(title = "Spectral clustering (RBF): curved boundary fits the two moons",
x = "x", y = "y")
This plot shows the result of applying Spectral Clustering with a radial basis (RBF) kernel to the two-moons dataset.
Unlike K-means, which produced a straight linear split, the spectral
clustering algorithm correctly detects the curved
structure of the data.
The decision boundary follows the arc between the two moons, allowing
the algorithm to separate them accurately.
This happens because the RBF kernel maps the original data into a
higher-dimensional space
where the two moons become linearly separable.
In that transformed space, spectral clustering can find a proper
partition, which translates back
into a non-linear boundary in the original space.
This demonstrates the key advantage of spectral and kernel-based
methods:
they can capture complex, non-convex cluster shapes
that standard algorithms like K-means cannot.
Consider a CNN that takes in \(32 \times 32\) grayscale images and has a single convolution layer with three \(5 \times 5\) convolution filters (without boundary padding).
There are 3 convolution matrices each with 5x5 weights (plus 3 bias terms) to estimate, therefore \(3 \times 5 \times 5 + 3 = 78\) parameters
We can think of a convolution layer as a regularized fully connected layer. The regularization in this case is due to not all inputs being connected to all outputs, and weights being shared between connections.
Each output node in the convolved image can be thought of as taking inputs from a limited number of input pixels (the neighboring pixels), with a set of weights specified by the convolution layer which are then shared by the connections to all other output nodes.
With no constraints, we would connect each input pixel in our original 32x32 image to each output pixel in each of our convolution layers, with an bias term for each original pixel. So each output pixel would require 32x32 weights + 1 bias term. This would give a total of (32×32+1)×28×28×3 = 2,410,800 parameters.
We generate \(n = 100{,}000\) data points \((x_i, y_i)\) as follows: \[ x_i = \frac{i}{n}, \quad y_i \sim N(0,1), \] so that \(y_i\) is independent of \(x_i\).
We fit \(k\)-nearest-neighbour regressors for \(k \in \{1,2,\dots,n-1\}\) and analyze their expected errors.
For each training point \(i\), \[ \hat y_i = \frac{1}{k} \sum_{j \in \mathcal N_i} y_j, \] where the set \(\mathcal N_i\) includes \(y_i\) itself. Then \[ y_i - \hat y_i = \Big(1-\frac{1}{k}\Big) y_i - \frac{1}{k} \sum_{j \neq i} y_j. \]
Since all \(y_j\) are i.i.d. \(N(0,1)\), \[ \mathbb{E}\!\left[(y_i - \hat y_i)^2\right] = \Big(1-\frac{1}{k}\Big)^2 + (k-1)\frac{1}{k^2} = 1 - \frac{1}{k}. \]
Hence, \[ \boxed{\mathbb{E}[\text{Training MSE}_k] = 1 - \frac{1}{k}.} \]
For leave-one-out CV, \(y_i\) is not included in the neighborhood average: \[ \hat y_i^{(-i)} = \frac{1}{k} \sum_{j \in \mathcal N_i \setminus \{i\}} y_j. \] Then \(y_i\) and \(\hat y_i^{(-i)}\) are independent, and \[ \operatorname{Var}(\hat y_i^{(-i)}) = \frac{1}{k}, \quad \operatorname{Var}(y_i) = 1. \]
Thus, \[ \mathbb{E}\!\left[(y_i - \hat y_i^{(-i)})^2\right] = 1 + \frac{1}{k}, \] so that \[ \boxed{\mathbb{E}[\text{LOOCV MSE}_k] = 1 + \frac{1}{k}.} \]
The expected LOOCV error decreases with \(k\) and is minimized at the largest possible value: \[ \boxed{k^\ast = n - 1.} \]
However, the variance of the LOOCV estimate is on the order of \(1/n\), so the standard deviation is \(\mathcal{O}(1/\sqrt{n}) \approx 0.003\) for \(n=100{,}000\). Because the difference between LOOCV errors for large \(k\) values (\(k > 300\)) is smaller than this noise level, cross-validation may return any large \(k\) close to \(n-1\).