# Build a numeric vector with c() and print it.
x <- c(1, 3, 2, 5)
x
#> [1] 1 3 2 5
Chapter 1 — Introduction. Chapter 1 presents statistical learning as a broad toolkit for understanding data and modeling relationships, bridging statistics and computer science to support data-driven decision-making. It distinguishes supervised learning, where we predict an outcome $Y$ from inputs $X$ (e.g., regression for numeric targets and classification for categorical targets), from unsupervised learning, which uncovers structure without labeled outcomes (e.g., clustering or association rules such as “people who buy X also buy Y”). The chapter grounds these ideas with real-world examples like predicting sales from advertising or segmenting customers, and briefly traces the field’s growth alongside the explosion of data across biology, finance, and marketing. It also outlines the book’s roadmap—covering linear regression, classification, and unsupervised methods—and sets expectations for hands-on implementation in R (or Python in other editions).
Chapter 2 — Statistical Learning. Chapter 2 formalizes the core framework $Y = f(X) + \varepsilon$, emphasizing that statistical learning aims either to predict $Y$ accurately or to infer how $X$ relates to $Y$. It contrasts parametric methods, which assume a specific functional form (e.g., linear models) and estimate a small set of parameters, with non-parametric methods, which are more flexible but need more data and can overfit. The chapter highlights the tension between prediction accuracy and interpretability (flexible models may predict well yet be harder to explain) and develops the bias–variance trade-off, where optimal test performance balances underfitting (high bias) and overfitting (high variance) plus irreducible error. It revisits regression versus classification, underscores that training error is optimistic, and stresses using test error or cross-validation for honest assessment. Finally, it flags the curse of dimensionality, noting that as predictors proliferate, local methods and many algorithms degrade unless sample sizes grow commensurately.
# Two equal-length numeric vectors; `+` adds them element by element.
x <- c(1, 6, 2)
y <- c(1, 4, 3)
length(x)
#> [1] 3
length(y)
#> [1] 3
x + y
#> [1] 2 10 5
# Fill a 2 x 2 matrix column by column, then apply element-wise operations.
x <- matrix(c(1, 2, 3, 4), 2, 2)
x
#> [,1] [,2]
#> [1,] 1 3
#> [2,] 2 4
sqrt(x)
#> [,1] [,2]
#> [1,] 1.000000 1.732051
#> [2,] 1.414214 2.000000
x^2
#> [,1] [,2]
#> [1,] 1 9
#> [2,] 4 16
Random Numbers
# Seed the RNG so the 50 standard-normal draws printed below are reproducible.
set.seed(seed = 1303)
rnorm(n = 50)
#> [1] -1.1439763145 1.3421293656 2.1853904757 0.5363925179 0.0631929665
#> [6] 0.5022344825 -0.0004167247 0.5658198405 -0.5725226890 -1.1102250073
#> [11] -0.0486871234 -0.6956562176 0.8289174803 0.2066528551 -0.2356745091
#> [16] -0.5563104914 -0.3647543571 0.8623550343 -0.6307715354 0.3136021252
#> [21] -0.9314953177 0.8238676185 0.5233707021 0.7069214120 0.4202043256
#> [26] -0.2690521547 -1.5103172999 -0.6902124766 -0.1434719524 -1.0135274099
#> [31] 1.5732737361 0.0127465055 0.8726470499 0.4220661905 -0.0188157917
#> [36] 2.6157489689 -0.6931401748 -0.2663217810 -0.7206364412 1.3677342065
#> [41] 0.2640073322 0.6321868074 -1.3306509858 0.0268888182 1.0406363208
#> [46] 1.3120237985 -0.0300020767 -0.2500257125 0.0234144857 1.6598706557
# Draw 100 seeded standard-normal values and summarise them.
set.seed(3)
y <- rnorm(100)
mean(y)
#> [1] 0.01103557
var(y)
#> [1] 0.7328675
# sd() is the square root of var().
sd(y)
#> [1] 0.8560768
Graphics
# Scatterplot of two independent normal samples on the current device.
x <- rnorm(100)
y <- rnorm(100)
plot(x, y, xlab = "X-axis", ylab = "Y-axis", main = "Scatterplot")
# Redraw into a PDF file; dev.off() closes the PDF device so the file is written.
pdf("Figure.pdf")
plot(x, y, col = "green")
dev.off()
#> png
#> 2
Sequencing and More Plots
# Two equivalent ways to build the integers 1..10.
x <- seq(1, 10)
x <- 1:10
# 50 evenly spaced points on [-pi, pi]; `length.out` is spelled out instead of
# relying on partial matching of `length`.
x <- seq(-pi, pi, length.out = 50)
y <- x
# Evaluate cos(y) / (1 + x^2) on the full x-by-y grid.
f <- outer(x, y, function(x, y) cos(y) / (1 + x^2))
contour(x, y, f)
image(x, y, f)
persp(x, y, f)
Matrix Operations
# 4 x 4 matrix holding 1..16, filled column by column.
A <- matrix(1:16, 4, 4)
# Single element: row 2, column 3.
A[2, 3]
#> [1] 10
# Rows 1 and 3 crossed with columns 2 and 4.
A[c(1, 3), c(2, 4)]
#> [,1] [,2]
#> [1,] 5 13
#> [2,] 7 15
# Contiguous ranges of rows and columns.
A[1:3, 2:4]
#> [,1] [,2] [,3]
#> [1,] 5 9 13
#> [2,] 6 10 14
#> [3,] 7 11 15
dim(A)
#> [1] 4 4
# %*% is the matrix product (not the element-wise `*`).
A %*% A
#> [,1] [,2] [,3] [,4]
#> [1,] 90 202 314 426
#> [2,] 100 228 356 484
#> [3,] 110 254 398 542
#> [4,] 120 280 440 600
Loading Data
# install.packages("ISLR2")
library(ISLR2)
# Take a local copy of the Auto data set shipped with ISLR2 and inspect it.
Auto <- ISLR2::Auto
str(Auto)
#> 'data.frame': 392 obs. of 9 variables:
#> $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
#> $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
#> $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
#> $ horsepower : int 130 165 150 150 140 198 220 215 225 190 ...
#> $ weight : int 3504 3693 3436 3433 3449 4341 4354 4312 4425 3850 ...
#> $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
#> $ year : int 70 70 70 70 70 70 70 70 70 70 ...
#> $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
#> $ name : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...
#> - attr(*, "na.action")= 'omit' Named int [1:5] 33 127 331 337 355
#> ..- attr(*, "names")= chr [1:5] "33" "127" "331" "337" ...
# fix() opens a data editor that needs a user at the console, so skip it when
# the script is run non-interactively (e.g. via Rscript or when rendering).
if (interactive()) {
  fix(Auto)
}
dim(Auto)
#> [1] 392 9
names(Auto)
#> [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
#> [6] "acceleration" "year" "origin" "name"
Plots on Data
plot(Auto$cylinders, Auto$mpg)
# attach() puts the columns of Auto on the search path; it is kept because the
# following statements refer to columns by bare name.
attach(Auto)
plot(cylinders, mpg)
# With a factor on the x-axis, plot() draws boxplots of mpg per cylinder count.
cylinders <- as.factor(cylinders)
plot(cylinders, mpg, col = "red", varwidth = TRUE, xlab = "Cylinders", ylab = "MPG")
hist(mpg, col = 2, breaks = 15)
## Pairs and Summary
# Scatterplot matrix of every column, then of a chosen subset of columns.
pairs(Auto)
pairs(
  ~ mpg + displacement + horsepower + weight + acceleration,
  data = Auto
)
# identify() labels points picked on the plot with the corresponding `name`.
plot(x = horsepower, y = mpg)
identify(x = horsepower, y = mpg, labels = name)
#> integer(0)
summary(object = Auto)
#> mpg cylinders displacement horsepower weight
#> Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
#> 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
#> Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
#> Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
#> 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
#> Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
#>
#> acceleration year origin name
#> Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
#> 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
#> Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
#> Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
#> 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
#> Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
#> (Other) :365
Lab 1.1 — Train/Test Split & Simple vs. Flexible Fits
# Simulate noisy observations around a known curve, hold out 30% as a test
# set, and compare the test RMSE of a straight-line fit with a degree-5
# polynomial fit.
set.seed(123)
n <- 200
x <- sort(runif(n, -3, 3))
f_true <- function(x) sin(x) + 0.3 * x
y <- f_true(x) + rnorm(n, sd = 0.4)
dat <- data.frame(x, y)

# split 70/30
idx <- sample(seq_len(n), size = floor(0.7 * n))
train <- dat[idx, ]
test <- dat[-idx, ]

# two models: linear vs polynomial
m_lin <- lm(y ~ x, data = train)
m_poly <- lm(y ~ poly(x, 5), data = train)

pred_lin <- predict(m_lin, newdata = test)
pred_poly <- predict(m_poly, newdata = test)

# Root-mean-squared error between two numeric vectors.
rmse <- function(a, b) sqrt(mean((a - b)^2))
data.frame(
  Model = c("Linear", "5th-degree Polynomial"),
  RMSE = c(rmse(test$y, pred_lin), rmse(test$y, pred_poly))
)
Model | RMSE |
---|---|
Linear | 0.5105007 |
5th-degree Polynomial | 0.3241120 |
# Training points with the true curve and both fitted curves overlaid.
plot(train$x, train$y, pch = 19, col = "gray50", main = "Training Data & Fits")
# lines() joins points in the order given, so draw them sorted by x.
ord <- order(train$x)
lines(train$x[ord], f_true(train$x[ord]), lwd = 2, col = "black")
lines(train$x[ord], fitted(m_lin)[ord], lwd = 2, col = "steelblue")
lines(train$x[ord], fitted(m_poly)[ord], lwd = 2, col = "tomato")
legend("topleft", bty = "n",
       legend = c("Truth", "Linear", "Poly (deg 5)"),
       col = c("black", "steelblue", "tomato"), lwd = 2)
library(pROC)
library(caret)
library(dplyr)
library(tidyr)
library(ggplot2)
library(class)
Labs — Chapter 2
Lab 2.1 — Classification via Logistic Regression vs. KNN
Objective: Compare a parametric classifier vs. a nonparametric one.
# Simulate a two-predictor binary outcome, split it 70/30, and tune a logistic
# regression and a KNN classifier with 10-fold cross-validation scored by ROC.
set.seed(2024)
n <- 400
x1 <- rnorm(n); x2 <- rnorm(n)
# nonlinear boundary
p <- plogis(1.2 * x1 - 0.8 * x2 + 0.9 * x1 * x2)
y <- factor(ifelse(runif(n) < p, "Yes", "No"), levels = c("No", "Yes"))
df <- data.frame(x1, x2, y)

idx <- createDataPartition(df$y, p = 0.7, list = FALSE)
train <- df[idx, ]; test <- df[-idx, ]

ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE,
                     summaryFunction = twoClassSummary)

m_logit <- train(y ~ ., data = train, method = "glm", family = binomial(),
                 trControl = ctrl, metric = "ROC")
m_knn <- train(y ~ ., data = train, method = "knn",
               tuneGrid = expand.grid(k = c(3, 5, 7, 9, 11, 15)),
               trControl = ctrl, metric = "ROC")
# Collect the per-fold metrics of both models for a side-by-side summary.
resamps <- resamples(list(Logistic = m_logit, KNN = m_knn))
summary(resamps)
#>
#> Call:
#> summary.resamples(object = resamps)
#>
#> Models: Logistic, KNN
#> Number of resamples: 10
#>
#> ROC
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> Logistic 0.6510417 0.7111673 0.7968750 0.7750919 0.8151042 0.8802083 0
#> KNN 0.5859375 0.7239583 0.7851562 0.7674479 0.8248698 0.9114583 0
#>
#> Sens
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> Logistic 0.4166667 0.5000000 0.5833333 0.60 0.6666667 0.8333333 0
#> KNN 0.3333333 0.4166667 0.5416667 0.55 0.7291667 0.7500000 0
#>
#> Spec
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> Logistic 0.6875 0.7500000 0.78125 0.8011029 0.8621324 0.9375 0
#> KNN 0.6250 0.7169118 0.75000 0.7893382 0.8593750 1.0000 0
# Choose the model whose mean cross-validated ROC is highest.
S <- summary(resamps)$statistics$ROC
best_idx <- which.max(S[, "Mean"])
best_name <- resamps$models[best_idx]      # <-- FIX: no names()
cat("Best by CV ROC:", best_name, "\n")
#> Best by CV ROC: Logistic
# map name -> fitted model
models_map <- list(Logistic = m_logit, KNN = m_knn)
stopifnot(best_name %in% names(models_map))
best_fit <- models_map[[best_name]]

# ensure factor order is c("No","Yes")
test$y <- factor(test$y, levels = c("No", "Yes"))
pos <- levels(test$y)[2]       # "Yes"

# get probabilities for the positive class
probs <- predict(best_fit, newdata = test, type = "prob")[, pos]
stopifnot(length(probs) == nrow(test))  # sanity check
# ROC / AUC
roc_obj <- roc(response = test$y, predictor = probs, levels = c("No", "Yes"), direction = "<")
print(auc(roc_obj))
#> Area under the curve: 0.7523
# confusion matrix @ 0.5
pred_05 <- factor(ifelse(probs >= 0.5, "Yes", "No"), levels = c("No", "Yes"))
print(confusionMatrix(pred_05, test$y, positive = "Yes"))
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 25 12
#> Yes 26 56
#>
#> Accuracy : 0.6807
#> 95% CI : (0.589, 0.7631)
#> No Information Rate : 0.5714
#> P-Value [Acc > NIR] : 0.009564
#>
#> Kappa : 0.3249
#>
#> Mcnemar's Test P-Value : 0.034955
#>
#> Sensitivity : 0.8235
#> Specificity : 0.4902
#> Pos Pred Value : 0.6829
#> Neg Pred Value : 0.6757
#> Prevalence : 0.5714
#> Detection Rate : 0.4706
#> Detection Prevalence : 0.6891
#> Balanced Accuracy : 0.6569
#>
#> 'Positive' Class : Yes
#>
# confusion matrix @ Youden-optimal threshold
thr <- as.numeric(coords(roc_obj, "best", ret = "threshold", best.method = "youden"))
pred_best <- factor(ifelse(probs >= thr, "Yes", "No"), levels = c("No", "Yes"))
cat(sprintf("Youden threshold: %.3f\n", thr))
#> Youden threshold: 0.621
print(confusionMatrix(pred_best, test$y, positive = "Yes"))
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 36 20
#> Yes 15 48
#>
#> Accuracy : 0.7059
#> 95% CI : (0.6154, 0.7858)
#> No Information Rate : 0.5714
#> P-Value [Acc > NIR] : 0.001747
#>
#> Kappa : 0.4068
#>
#> Mcnemar's Test P-Value : 0.498962
#>
#> Sensitivity : 0.7059
#> Specificity : 0.7059
#> Pos Pred Value : 0.7619
#> Neg Pred Value : 0.6429
#> Prevalence : 0.5714
#> Detection Rate : 0.4034
#> Detection Prevalence : 0.5294
#> Balanced Accuracy : 0.7059
#>
#> 'Positive' Class : Yes
#>
Lab 2.2 — Bias–Variance Demo (KNN)
library(FNN)
set.seed(99)
ks <- c(1, 3, 5, 7, 11, 21)
# 75 x 75 evaluation grid over [-3, 3]^2; `length.out` is spelled out instead
# of relying on partial matching of `length`.
test_grid <- data.frame(x1 = seq(-3, 3, length.out = 75), x2 = seq(-3, 3, length.out = 75))
test_grid <- expand.grid(x1 = test_grid$x1, x2 = test_grid$x2)

# Test-set accuracy of KNN for each candidate k, one data-frame row per k.
bv <- lapply(ks, function(k) {
  pr <- knn(train = as.matrix(train[, c("x1", "x2")]),
            test = as.matrix(test[, c("x1", "x2")]),
            cl = train$y, k = k, prob = TRUE)
  acc <- mean(pr == test$y)
  data.frame(k = k, Test_Accuracy = acc)
})
do.call(rbind, bv)
k | Test_Accuracy |
---|---|
1 | 0.7142857 |
3 | 0.7394958 |
5 | 0.6974790 |
7 | 0.6974790 |
11 | 0.7058824 |
21 | 0.7310924 |
# Optionally, plot for different k in bv to show bias-variance
for (k in ks) {
  pr_grid <- knn(train = as.matrix(train[, c("x1", "x2")]),
                 test = as.matrix(test_grid[, c("x1", "x2")]),
                 cl = train$y, k = k)
  # Store each k's grid predictions in its own column, e.g. pred_k1, pred_k3.
  test_grid[[paste0("pred_k", k)]] <- pr_grid
  # Then plot similarly
}
train$y <- factor(train$y, levels = c("No", "Yes"))
test$y <- factor(test$y, levels = c("No", "Yes"))

ks <- c(1, 3, 5, 7, 11, 21)

# For every k, classify each grid point and stack the results into one long
# table with a `k` facet label, the predicted class and P(class == "Yes").
pred_grid_long <- lapply(ks, function(k) {
  pr <- class::knn(
    train = as.matrix(train[, c("x1", "x2")]),
    test = as.matrix(test_grid[, c("x1", "x2")]),
    cl = train$y,
    k = k,
    prob = TRUE
  )
  # With prob = TRUE the "prob" attribute holds the vote share of the winning class.
  p_win <- attr(pr, "prob")

  # Convert the winning-class share into the share for "Yes".
  p_yes <- ifelse(pr == "Yes", p_win, 1 - p_win)

  tibble(
    x1 = test_grid$x1,
    x2 = test_grid$x2,
    k = factor(k, levels = ks, labels = paste0("k = ", ks)),
    pred = pr,
    p_yes = p_yes
  )
}) %>% bind_rows()
# One panel per k: predicted class as a translucent raster, the p_yes = 0.5
# contour as a dashed line, and the training points drawn on top.
ggplot(data = pred_grid_long, mapping = aes(x = x1, y = x2)) +
  geom_raster(mapping = aes(fill = pred), alpha = 0.35, interpolate = TRUE) +
  stat_contour(mapping = aes(z = p_yes), breaks = 0.5, linetype = "dashed") +
  geom_point(mapping = aes(color = y), data = train, size = 1.2, alpha = 0.8) +
  facet_wrap(~ k, ncol = 3) +
  coord_equal() +
  scale_color_manual(values = c("No" = "#2c5282", "Yes" = "#b22222")) +
  scale_fill_manual(values = c("No" = "#7fa1c3", "Yes" = "#e28d8d")) +
  theme_minimal(base_size = 12) +
  labs(
    title = "KNN Decision Boundaries for Different k",
    fill = "Predicted",
    color = "Training class"
  )
# Train and test accuracy of KNN for each k, stacked into one long table with
# two rows (Train, Test) per k.
acc_df <- lapply(ks, function(k) {
  # Test predictions
  pr_test <- class::knn(
    train = as.matrix(train[, c("x1", "x2")]),
    test = as.matrix(test[, c("x1", "x2")]),
    cl = train$y,
    k = k
  )
  acc_test <- mean(pr_test == test$y)

  # Training predictions: the training set is scored against itself.
  pr_tr <- class::knn(
    train = as.matrix(train[, c("x1", "x2")]),
    test = as.matrix(train[, c("x1", "x2")]),
    cl = train$y,
    k = k
  )
  acc_train <- mean(pr_tr == train$y)

  tibble(k = k, Set = c("Train", "Test"), Accuracy = c(acc_train, acc_test))
}) %>% bind_rows()
# Accuracy against k, one line per data set (Train / Test), y-axis fixed to [0, 1].
ggplot(
  data = acc_df,
  mapping = aes(x = k, y = Accuracy, group = Set, shape = Set)
) +
  geom_line() +
  geom_point(size = 2) +
  ylim(0, 1) +
  scale_x_continuous(breaks = ks) +
  labs(
    title = "Bias–Variance Illustration for KNN",
    x = "k (number of neighbors)",
    y = "Accuracy"
  ) +
  theme_minimal(base_size = 12)
References
James, G., Witten, D., Hastie, T., & Tibshirani, R. (2021). An Introduction to Statistical Learning: with Applications in R (Second Edition). Springer. Available at: https://hastie.su.domains/ISLR2/ISLRv2_corrected_June_2023.pdf Course videos: YouTube playlist at https://www.youtube.com/playlist?list=PL5-da3qGB5IC8_kWZXDcmLx7_n4RTBkAS Slides for Chapter 1: https://web.stanford.edu/~hastie/ISLR2/Slides/Ch1_Introduction.pdf Slides for Chapter 2: https://web.stanford.edu/~hastie/ISLR2/Slides/Ch2_Statistical_Learning.pdf Lab resources: https://web.stanford.edu/~hastie/ISLR2/Labs/Rmarkdown_Notebooks/Ch2-statlearn-lab.Rmd GitHub solutions for exercises: https://github.com/asadoughi/stat-learning