Baseball salary determination is one of the most data-rich problems in sports analytics. The Hitters dataset (ISLR) contains 1986 season and career statistics for 322 Major League Baseball players, along with their 1987 salaries. We use these data to answer two questions: which statistics most strongly predict salary, and how well can statistical learning methods predict it?
library(ISLR)
library(glmnet)
library(randomForest)
library(gbm)
library(e1071)
library(tree)
library(cluster)
library(class)
library(boot)
library(pROC)
# Load Hitters — built into ISLR package
data(Hitters)
cat("Raw dimensions:", dim(Hitters), "\n")#> Raw dimensions: 322 20
cat("Missing salaries:", sum(is.na(Hitters$Salary)), "\n")
#> Missing salaries: 59
# Remove rows with missing Salary (59 players)
Hitters <- na.omit(Hitters)
cat("After removing NAs:", nrow(Hitters), "complete observations\n")#> After removing NAs: 263 complete observations
# Log-transform response (right-skewed salary)
Hitters$LogSalary <- log(Hitters$Salary)
# Binary classification response: above/below median salary
Hitters$HighSalary <- ifelse(Hitters$Salary > median(Hitters$Salary), 1, 0)
# Encode categorical variables as binary indicators
Hitters$League_N <- ifelse(Hitters$League == "N", 1, 0)
Hitters$Division_W <- ifelse(Hitters$Division == "W", 1, 0)
Hitters$NewLeague_N <- ifelse(Hitters$NewLeague == "N", 1, 0)
cat("\nSalary summary:\n")#>
#> Salary summary:
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 67.5 190.0 425.0 535.9 750.0 2460.0
cat("\nHigh salary count:", sum(Hitters$HighSalary),
"(", round(mean(Hitters$HighSalary)*100, 1), "%)\n")#>
#> High salary count: 129 ( 49 %)
summary(Hitters[, c("AtBat","Hits","HmRun","Years","CHits","CRuns","Salary","LogSalary")])
#> AtBat Hits HmRun Years
#> Min. : 19.0 Min. : 1.0 Min. : 0.00 Min. : 1.000
#> 1st Qu.:282.5 1st Qu.: 71.5 1st Qu.: 5.00 1st Qu.: 4.000
#> Median :413.0 Median :103.0 Median : 9.00 Median : 6.000
#> Mean :403.6 Mean :107.8 Mean :11.62 Mean : 7.312
#> 3rd Qu.:526.0 3rd Qu.:141.5 3rd Qu.:18.00 3rd Qu.:10.000
#> Max. :687.0 Max. :238.0 Max. :40.00 Max. :24.000
#> CHits CRuns Salary LogSalary
#> Min. : 4.0 Min. : 2.0 Min. : 67.5 Min. :4.212
#> 1st Qu.: 212.0 1st Qu.: 105.5 1st Qu.: 190.0 1st Qu.:5.247
#> Median : 516.0 Median : 250.0 Median : 425.0 Median :6.052
#> Mean : 722.2 Mean : 361.2 Mean : 535.9 Mean :5.927
#> 3rd Qu.:1054.0 3rd Qu.: 497.5 3rd Qu.: 750.0 3rd Qu.:6.620
#> Max. :4256.0 Max. :2165.0 Max. :2460.0 Max. :7.808
par(mfrow = c(1, 3))
# Raw salary
hist(Hitters$Salary, breaks = 35, col = "steelblue", border = "white",
main = "Raw Salary Distribution\n(Right-skewed)",
xlab = "Salary ($1,000s)")
abline(v = mean(Hitters$Salary), col = "red", lwd = 2, lty = 2)
abline(v = median(Hitters$Salary), col = "orange", lwd = 2, lty = 3)
legend("topright",
legend = c(paste("Mean =", round(mean(Hitters$Salary))),
paste("Median =", round(median(Hitters$Salary)))),
col = c("red","orange"), lty = c(2,3), lwd = 2, bty = "n", cex = 0.85)
# Log salary
hist(Hitters$LogSalary, breaks = 30, col = "seagreen", border = "white",
main = "log(Salary) Distribution\n(Approximately normal)",
xlab = "log(Salary)")
abline(v = mean(Hitters$LogSalary), col = "red", lwd = 2, lty = 2)
# Salary by League
boxplot(Salary ~ League, data = Hitters,
col = c("#AED6F1","#F1948A"),
main = "Salary by League", ylab = "Salary ($1,000s)")Key finding: Raw salary is strongly right-skewed (mean > median). Log transformation produces an approximately normal distribution, satisfying OLS regression assumptions.
par(mfrow = c(3, 3))
scat_vars <- c("Years","CHits","CRuns","Hits","Walks","HmRun","PutOuts","Assists","Errors")
for (v in scat_vars) {
plot(Hitters[[v]], Hitters$LogSalary,
col = ifelse(Hitters$League == "N", "#E84646", "#2E75B6"),
pch = 20, cex = 0.75,
xlab = v, ylab = "log(Salary)",
main = paste(v, "vs log(Salary)"))
abline(lm(Hitters$LogSalary ~ Hitters[[v]]), col = "black", lwd = 1.8)
r_val <- round(cor(Hitters[[v]], Hitters$LogSalary), 2)
legend("topleft", legend = paste("r =", r_val), bty = "n", cex = 0.9)
}

num_vars <- c("AtBat","Hits","HmRun","Runs","RBI","Walks","Years",
"CAtBat","CHits","CHmRun","CRuns","CRBI","CWalks",
"PutOuts","Assists","Errors","LogSalary")
corr_mat <- cor(Hitters[, num_vars])
# Print top correlations with LogSalary
cat("Correlations with log(Salary):\n")#> Correlations with log(Salary):
#> LogSalary CRuns CHits CAtBat CRBI CWalks
#> 1.00000000 0.62108935 0.61994113 0.61079998 0.60267128 0.54658066
#> Years CHmRun Hits RBI Walks Runs
#> 0.53737141 0.52394269 0.44958408 0.44415019 0.43243922 0.42559551
#> AtBat HmRun PutOuts Assists Errors
#> 0.41494587 0.33985428 0.22448262 0.04996914 -0.02076464
Key findings: career totals (CRuns, CHits, CAtBat, CRBI) correlate most strongly with log(Salary), all at r ≈ 0.55–0.62, with Years close behind (r = 0.54). Single-season batting statistics are only moderate (r ≈ 0.34–0.45), while fielding statistics (PutOuts, Assists, Errors) are essentially uncorrelated with salary.
set.seed(1)
feat_cols <- c("AtBat","Hits","HmRun","Runs","RBI","Walks","Years",
"CAtBat","CHits","CHmRun","CRuns","CRBI","CWalks",
"PutOuts","Assists","Errors",
"League_N","Division_W","NewLeague_N")
X <- as.matrix(Hitters[, feat_cols])
y <- Hitters$LogSalary
# 75/25 train-test split
train_id <- sample(1:nrow(Hitters), floor(0.75 * nrow(Hitters)))
test_id <- setdiff(1:nrow(Hitters), train_id)
X_tr <- X[train_id, ]; X_te <- X[test_id, ]
y_tr <- y[train_id]; y_te <- y[test_id]
# Standardise using training set statistics
X_tr_s <- scale(X_tr)
X_te_s <- scale(X_te,
center = attr(X_tr_s, "scaled:center"),
scale = attr(X_tr_s, "scaled:scale"))
cat("Training:", length(y_tr), "obs\n")#> Training: 197 obs
#> Test: 66 obs
ols_fit <- lm(y_tr ~ X_tr_s)
pred_ols <- cbind(1, X_te_s) %*% coef(ols_fit)
rmse_ols <- sqrt(mean((pred_ols - y_te)^2))
r2_ols <- cor(pred_ols, y_te)^2
cat("OLS — RMSE:", round(rmse_ols, 4), "| R²:", round(r2_ols, 4), "\n")#> OLS — RMSE: 0.6847 | R²: 0.5463
#>
#> Call:
#> lm(formula = y_tr ~ X_tr_s)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -2.07133 -0.45502 0.06055 0.40194 2.69209
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 5.91514 0.04389 134.776 < 2e-16 ***
#> X_tr_sAtBat -0.20264 0.22480 -0.901 0.36857
#> X_tr_sHits 0.28045 0.26769 1.048 0.29622
#> X_tr_sHmRun 0.01162 0.12876 0.090 0.92819
#> X_tr_sRuns -0.01062 0.18042 -0.059 0.95313
#> X_tr_sRBI 0.08773 0.16494 0.532 0.59550
#> X_tr_sWalks 0.14575 0.09082 1.605 0.11029
#> X_tr_sYears 0.12515 0.12211 1.025 0.30684
#> X_tr_sCAtBat -0.04362 0.66112 -0.066 0.94747
#> X_tr_sCHits 0.80287 0.90739 0.885 0.37746
#> X_tr_sCHmRun 0.12882 0.28872 0.446 0.65601
#> X_tr_sCRuns -0.13219 0.51707 -0.256 0.79852
#> X_tr_sCRBI -0.28713 0.49589 -0.579 0.56331
#> X_tr_sCWalks -0.15679 0.19414 -0.808 0.42037
#> X_tr_sPutOuts 0.14137 0.05012 2.821 0.00534 **
#> X_tr_sAssists 0.07555 0.07340 1.029 0.30476
#> X_tr_sErrors -0.09668 0.06659 -1.452 0.14830
#> X_tr_sLeague_N 0.22691 0.09363 2.424 0.01638 *
#> X_tr_sDivision_W -0.13236 0.04594 -2.881 0.00446 **
#> X_tr_sNewLeague_N -0.18176 0.09388 -1.936 0.05444 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.616 on 177 degrees of freedom
#> Multiple R-squared: 0.5445, Adjusted R-squared: 0.4956
#> F-statistic: 11.13 on 19 and 177 DF, p-value: < 2.2e-16
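One caveat worth flagging before moving on: the career totals (CAtBat, CHits, CRuns, CRBI) have very large standard errors despite their strong marginal correlations, a classic multicollinearity signature and part of the motivation for ridge and lasso below. For standardised predictors the variance inflation factors are simply the diagonal of the inverse correlation matrix, so a minimal check needs no extra packages:

# VIF_j = [R^-1]_jj for standardised predictors; values far above 10 flag collinearity
vif_vals <- diag(solve(cor(X_tr_s)))
round(sort(vif_vals, decreasing = TRUE), 1)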
set.seed(1)
ridge_cv <- cv.glmnet(X_tr_s, y_tr, alpha = 0, nfolds = 10)
plot(ridge_cv, main = "Ridge: CV Error vs log(lambda)")
cat("Ridge best lambda:", round(ridge_cv$lambda.min, 4), "\n")
#> Ridge best lambda: 0.3442
pred_ridge <- predict(ridge_cv, X_te_s, s = "lambda.min")
rmse_ridge <- sqrt(mean((pred_ridge - y_te)^2))
r2_ridge <- cor(pred_ridge, y_te)^2
cat("Ridge — RMSE:", round(rmse_ridge, 4), "| R²:", round(r2_ridge, 4), "\n")#> Ridge — RMSE: 0.6525 | R²: 0.561
set.seed(1)
lasso_cv <- cv.glmnet(X_tr_s, y_tr, alpha = 1, nfolds = 10)
plot(lasso_cv, main = "Lasso: CV Error vs log(lambda)")
cat("Lasso best lambda:", round(lasso_cv$lambda.min, 4), "\n")
#> Lasso best lambda: 0.0249
# Non-zero coefficients
lasso_coef <- coef(lasso_cv, s = "lambda.min")
nonzero <- lasso_coef[lasso_coef[, 1] != 0, ]
cat("Variables selected:", length(nonzero) - 1, "of", length(feat_cols), "\n")#> Variables selected: 9 of 19
#> Zeroed out: AtBat HmRun Runs CAtBat CHmRun CRuns CRBI CWalks Assists NewLeague_N
#> CHits Hits PutOuts Years Walks League_N RBI
#> 0.3702 0.1557 0.1121 0.0776 0.0563 0.0450 0.0338
#> Errors Division_W
#> -0.0164 -0.1123
pred_lasso <- predict(lasso_cv, X_te_s, s = "lambda.min")
rmse_lasso <- sqrt(mean((pred_lasso - y_te)^2))
r2_lasso <- cor(pred_lasso, y_te)^2
cat("Lasso — RMSE:", round(rmse_lasso, 4), "| R²:", round(r2_lasso, 4), "\n")#> Lasso — RMSE: 0.673 | R²: 0.5429
par(mfrow = c(1, 2))
plot(glmnet(X_tr_s, y_tr, alpha = 0), xvar = "lambda",
main = "Ridge Coefficient Paths")
abline(v = log(ridge_cv$lambda.min), col = "red", lty = 2, lwd = 2)
plot(glmnet(X_tr_s, y_tr, alpha = 1), xvar = "lambda",
main = "Lasso Coefficient Paths")
abline(v = log(lasso_cv$lambda.min), col = "red", lty = 2, lwd = 2)

# 10-fold CV to select polynomial degree for Years
set.seed(1)
Hitters_tr <- Hitters[train_id, ]
cv_errors_poly <- sapply(1:7, function(d) {
fit <- glm(LogSalary ~ poly(Years, d), data = Hitters_tr)
cv.glm(Hitters_tr, fit, K = 10)$delta[1]
})
best_deg <- which.min(cv_errors_poly)
cat("Optimal polynomial degree (CV):", best_deg, "\n")#> Optimal polynomial degree (CV): 3
# Plot
years_grid <- seq(min(Hitters$Years), max(Hitters$Years), length.out = 200)
plot(Hitters$Years[train_id], Hitters$LogSalary[train_id],
col = "gray60", pch = 20, cex = 0.8,
xlab = "Years in MLB", ylab = "log(Salary)",
main = "Polynomial Fits: log(Salary) ~ Years")
# Draw each candidate degree once (deduplicated, since the CV-best degree may be 1 or 3)
degs <- unique(c(1, 3, best_deg))
deg_cols <- setNames(c("steelblue", "tomato", "seagreen3")[seq_along(degs)], degs)
for (d in degs) {
  fit_poly <- lm(LogSalary ~ poly(Years, d), data = Hitters_tr)
  lines(years_grid,
        predict(fit_poly, data.frame(Years = years_grid)),
        lwd = 2,
        col = deg_cols[as.character(d)],
        lty = ifelse(d == best_deg, 1, 2))
}
legend("topleft",
       legend = paste0("d = ", degs, ifelse(degs == best_deg, " (CV best)", "")),
       col = deg_cols, lty = ifelse(degs == best_deg, 1, 2), lwd = 2, bty = "n")

set.seed(1)
tree_data <- data.frame(X_tr_s, y = y_tr)
tree_fit <- tree(y ~ ., data = tree_data)
# CV-based pruning
cv_tree <- cv.tree(tree_fit)
plot(cv_tree$size, cv_tree$dev, type = "b", pch = 19,
xlab = "Tree Size (# Terminal Nodes)", ylab = "CV Deviance",
main = "Regression Tree: CV Pruning")
best_leaves <- cv_tree$size[which.min(cv_tree$dev)]
abline(v = best_leaves, col = "red", lty = 2, lwd = 1.5)
cat("Best tree size by CV:", best_leaves, "\n")
#> Best tree size by CV: 9
# Pruned tree
pruned_tree <- prune.tree(tree_fit, best = best_leaves)
pred_pruned <- predict(pruned_tree, data.frame(X_te_s))
rmse_tree <- sqrt(mean((pred_pruned - y_te)^2))
cat("Pruned Tree — Test RMSE:", round(rmse_tree, 4), "\n")#> Pruned Tree — Test RMSE: 0.6514
set.seed(1)
rf_fit <- randomForest(y = y_tr, x = X_tr_s,
ntree = 500, importance = TRUE)
pred_rf <- predict(rf_fit, X_te_s)
rmse_rf <- sqrt(mean((pred_rf - y_te)^2))
r2_rf <- cor(pred_rf, y_te)^2
cat("Random Forest — RMSE:", round(rmse_rf, 4), "| R²:", round(r2_rf, 4), "\n")#> Random Forest — RMSE: 0.4713 | R²: 0.7583
#>
#> Call:
#> randomForest(x = X_tr_s, y = y_tr, ntree = 500, importance = TRUE)
#> Type of random forest: regression
#> Number of trees: 500
#> No. of variables tried at each split: 6
#>
#> Mean of squared residuals: 0.1689606
#> % Var explained: 77.42
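Since the forest was fit with importance = TRUE, randomForest's built-in importance plot is available:

# Permutation (%IncMSE) and node-purity importance, sorted
varImpPlot(rf_fit, main = "Random Forest Variable Importance", pch = 19)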
set.seed(1)
gb_fit <- gbm(y ~ ., data = data.frame(X_tr_s, y = y_tr),
distribution = "gaussian",
n.trees = 500,
shrinkage = 0.05,
interaction.depth = 3,
cv.folds = 5,
verbose = FALSE)
best_trees <- gbm.perf(gb_fit, method = "cv", plot.it = TRUE)
cat("Optimal trees (CV):", best_trees, "\n")
#> Optimal trees (CV): 352
pred_gb <- predict(gb_fit, data.frame(X_te_s), n.trees = best_trees)
rmse_gb <- sqrt(mean((pred_gb - y_te)^2))
r2_gb <- cor(pred_gb, y_te)^2
cat("Gradient Boosting — RMSE:", round(rmse_gb, 4), "| R²:", round(r2_gb, 4), "\n")#> Gradient Boosting — RMSE: 0.5077 | R²: 0.7328
par(mar = c(5, 10, 4, 2))
summary(gb_fit, n.trees = best_trees, las = 2,
main = "Gradient Boosting Variable Importance")#> var rel.inf
#> CAtBat CAtBat 16.8463850
#> CRuns CRuns 12.4242030
#> CWalks CWalks 10.0856758
#> CRBI CRBI 9.4061614
#> PutOuts PutOuts 6.9355504
#> RBI RBI 6.4704575
#> Walks Walks 5.6480015
#> Years Years 4.7190412
#> AtBat AtBat 4.2435843
#> CHits CHits 4.0267071
#> CHmRun CHmRun 3.6925256
#> Hits Hits 3.2395708
#> Assists Assists 2.9249340
#> Errors Errors 2.5194014
#> Runs Runs 2.4488674
#> HmRun HmRun 2.2106278
#> League_N League_N 1.0623577
#> Division_W Division_W 0.8322022
#> NewLeague_N NewLeague_N 0.2637458
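Relative influence says which variables matter but not how; partial dependence shows the shape of each marginal effect. A sketch for the top two variables, using return.grid = TRUE so the plotting stays in base graphics:

# Partial dependence of log(Salary) on the two most influential predictors
par(mfrow = c(1, 2))
for (v in c("CAtBat", "CRuns")) {
  pd <- plot(gb_fit, i.var = v, n.trees = best_trees, return.grid = TRUE)
  plot(pd[[1]], pd$y, type = "l", lwd = 2, col = "#2E75B6",
       xlab = paste(v, "(standardised)"), ylab = "Partial dependence",
       main = paste("GBM:", v))
}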
reg_results <- data.frame(
Method = c("OLS","Ridge","Lasso","Pruned Tree","Random Forest","Gradient Boosting"),
RMSE = round(c(rmse_ols, rmse_ridge, rmse_lasso,
rmse_tree, rmse_rf, rmse_gb), 4),
R2 = round(c(r2_ols, r2_ridge, r2_lasso,
cor(pred_pruned, y_te)^2, r2_rf, r2_gb), 4)
)
print(reg_results[order(reg_results$RMSE), ])
#> Method RMSE R2
#> 5 Random Forest 0.4713 0.7583
#> 6 Gradient Boosting 0.5077 0.7328
#> 4 Pruned Tree 0.6514 0.5437
#> 2 Ridge 0.6525 0.5610
#> 3 Lasso 0.6730 0.5429
#> 1 OLS 0.6847 0.5463
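With only 66 test observations, these RMSE gaps carry real sampling noise. A paired bootstrap over test cases (the boot package is already loaded) gives a rough interval for the OLS-minus-RF difference; a sketch:

# Paired bootstrap of the test-set RMSE gap: positive = random forest is better
set.seed(1)
rmse_gap <- function(d, idx) {
  d <- d[idx, ]
  sqrt(mean((d$ols - d$y)^2)) - sqrt(mean((d$rf - d$y)^2))
}
boot_df <- data.frame(y = y_te, ols = as.numeric(pred_ols), rf = pred_rf)
gap_boot <- boot(boot_df, rmse_gap, R = 2000)
boot.ci(gap_boot, type = "perc")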
par(mfrow = c(1, 2))
bar_cols <- c("#2E75B6","#5B9E5B","#2A9D8F","#7B5EA7","#E07B39","#E84646")
barplot(reg_results$RMSE, names.arg = reg_results$Method,
col = bar_cols, border = "white", las = 2, cex.names = 0.8,
main = "Test RMSE (lower = better)", ylab = "RMSE")
barplot(reg_results$R2, names.arg = reg_results$Method,
col = bar_cols, border = "white", las = 2, cex.names = 0.8,
main = "Test R² (higher = better)", ylab = "R²",
ylim = c(0, 1.1))

par(mfrow = c(1, 3))
plot_pa <- function(y_true, y_hat, title) {
lims <- range(c(y_true, y_hat))
plot(y_true, y_hat, col = "#2E75B6", pch = 20, cex = 0.8,
xlim = lims, ylim = lims,
xlab = "Actual log(Salary)", ylab = "Predicted log(Salary)",
main = title)
abline(0, 1, col = "red", lwd = 1.5, lty = 2)
legend("topleft",
legend = c(paste("RMSE =", round(sqrt(mean((y_true-y_hat)^2)),3)),
paste("R² =", round(cor(y_true,y_hat)^2, 3))),
bty = "n", cex = 0.85)
}
plot_pa(y_te, pred_ols, "OLS")
plot_pa(y_te, pred_ridge, "Ridge")
plot_pa(y_te, pred_gb, "Gradient Boosting")

par(mfrow = c(1, 3))
res_ols <- y_te - pred_ols; res_gb <- y_te - pred_gb
plot(pred_ols, res_ols, pch=20, col="#2E75B6", cex=0.8,
xlab="Fitted", ylab="Residual", main="OLS Residuals")
abline(h=0, col="red", lwd=1.5)
plot(pred_gb, res_gb, pch=20, col="#E84646", cex=0.8,
xlab="Fitted", ylab="Residual", main="GBM Residuals")
abline(h=0, col="red", lwd=1.5)
hist(res_gb, breaks=28, col="#E84646", border="white",
main="GBM Residual Distribution", xlab="Residual")y_clf <- as.factor(Hitters$HighSalary)
X_clf <- X # same feature matrix
X_tr_c <- X_clf[train_id, ]
X_te_c <- X_clf[test_id, ]
y_tr_c <- y_clf[train_id]
y_te_c <- y_clf[test_id]
X_tr_cs <- scale(X_tr_c)
X_te_cs <- scale(X_te_c,
center = attr(X_tr_cs, "scaled:center"),
scale = attr(X_tr_cs, "scaled:scale"))
cat("Class distribution (test):\n")#> Class distribution (test):
#> y_te_c
#> 0 1
#> 27 39
# Build proper data frames so predict() column names match exactly
y_tr_num <- as.numeric(as.character(y_tr_c)) # factor -> numeric 0/1
y_te_num <- as.numeric(as.character(y_te_c)) # factor -> numeric 0/1
train_df_lr <- data.frame(X_tr_cs, HighSalary = y_tr_num)
test_df_lr <- data.frame(X_te_cs)
lr_fit <- glm(HighSalary ~ ., data = train_df_lr, family = binomial)
prob_lr <- predict(lr_fit, newdata = test_df_lr, type = "response")
pred_lr <- ifelse(prob_lr > 0.5, 1, 0)
# Both vectors are plain numeric with length 66 — no names issue
stopifnot(length(pred_lr) == length(y_te_num))
acc_lr <- mean(pred_lr == y_te_num)
cat("Logistic Regression — Test Accuracy:", round(acc_lr, 4), "\n")#> Logistic Regression — Test Accuracy: 0.8485
#> Actual
#> Predicted 0 1
#> 0 25 8
#> 1 2 31
set.seed(1)
tune_lin <- tune(svm,
train.x = X_tr_cs,
train.y = y_tr_c,
kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 50, 100)),
tunecontrol = tune.control(cross = 10))
cat("Best cost (linear):", tune_lin$best.parameters$cost, "\n")#> Best cost (linear): 1
#> CV error: 0.1618
pred_svc <- predict(tune_lin$best.model, X_te_cs)
acc_svc <- mean(as.character(pred_svc) == as.character(y_te_c))
cat("SVC (Linear) — Test Accuracy:", round(acc_svc, 4), "\n")#> SVC (Linear) — Test Accuracy: 0.8788
#> Actual
#> Predicted 0 1
#> 0 26 7
#> 1 1 32
set.seed(1)
tune_rbf <- tune(svm,
train.x = X_tr_cs,
train.y = y_tr_c,
kernel = "radial",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 50, 100)),
tunecontrol = tune.control(cross = 10))
cat("Best cost (RBF):", tune_rbf$best.parameters$cost, "\n")#> Best cost (RBF): 1
pred_rbf <- predict(tune_rbf$best.model, X_te_cs)
acc_rbf <- mean(as.character(pred_rbf) == as.character(y_te_c))
cat("SVM (RBF) — Test Accuracy:", round(acc_rbf, 4), "\n")#> SVM (RBF) — Test Accuracy: 0.8182
# SVC with probability output — refit with probability=TRUE
svc_prob <- svm(X_tr_cs, y_tr_c, kernel = "linear",
cost = tune_lin$best.parameters$cost,
probability = TRUE)
probs_svc <- attr(predict(svc_prob, X_te_cs, probability = TRUE),
"probabilities")[, "1"]
# Strip any names so roc() gets clean numeric vectors
roc_svc <- roc(y_te_num, as.numeric(probs_svc), quiet = TRUE)
roc_lr <- roc(y_te_num, as.numeric(unname(prob_lr)), quiet = TRUE)
plot(roc_svc, col = "#2E75B6", lwd = 2,
main = "ROC Curves: High/Low Salary Classification")
plot(roc_lr, col = "#5B9E5B", lwd = 2, add = TRUE)
abline(0, 1, lty = 2, col = "gray")
legend("bottomright",
legend = c(paste("SVC Linear (AUC =", round(auc(roc_svc), 3), ")"),
paste("Logistic (AUC =", round(auc(roc_lr), 3), ")")),
col = c("#2E75B6","#5B9E5B"), lwd = 2, bty = "n")clf_results <- data.frame(
Method = c("Logistic Regression", "SVC (Linear)", "SVM (RBF)"),
Accuracy = round(c(acc_lr, acc_svc, acc_rbf), 4),
AUC = c(round(auc(roc_lr), 4), round(auc(roc_svc), 4), NA)
)
print(clf_results[order(-clf_results$Accuracy), ])
#> Method Accuracy AUC
#> 2 SVC (Linear) 0.8788 0.8955
#> 1 Logistic Regression 0.8485 0.9079
#> 3 SVM (RBF) 0.8182 NA
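The class package is loaded at the top but never used; for completeness, a KNN baseline on the same standardised features is a one-liner (k = 10 here is an arbitrary, untuned choice):

# KNN baseline; distances are meaningful because the features were standardised
set.seed(1)
pred_knn <- knn(train = X_tr_cs, test = X_te_cs, cl = y_tr_c, k = 10)
cat("KNN (k = 10) — Test Accuracy:", round(mean(pred_knn == y_te_c), 4), "\n")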
num_vars_uns <- c("AtBat","Hits","HmRun","Runs","RBI","Walks","Years",
"CAtBat","CHits","CHmRun","CRuns","CRBI","CWalks",
"PutOuts","Assists","Errors")
X_uns <- scale(as.matrix(Hitters[, num_vars_uns]))
pr_out <- prcomp(X_uns, scale = FALSE) # already scaled
pve <- pr_out$sdev^2 / sum(pr_out$sdev^2)
cat("Variance explained (first 5 PCs):\n")#> Variance explained (first 5 PCs):
#> [1] 0.4531 0.2569 0.1080 0.0544 0.0436
#> Cumulative PVE:
#> [1] 0.4531 0.7100 0.8180 0.8724 0.9160
par(mfrow = c(1, 3))
barplot(pve[1:10] * 100, names.arg = paste0("PC", 1:10),
col = "steelblue", border = "white",
main = "Scree Plot", ylab = "% Variance Explained", las = 2)
plot(1:10, cumsum(pve[1:10]) * 100, type = "b", pch = 19, col = "tomato",
xlab = "Number of PCs", ylab = "Cumulative % Variance",
main = "Cumulative PVE")
abline(h = 80, lty = 2, col = "gray", lwd = 1.5)
plot(pr_out$x[, 1], pr_out$x[, 2],
col = ifelse(Hitters$HighSalary == 1, "#E84646", "#2E75B6"),
pch = 20, cex = 0.9,
xlab = paste0("PC1 (", round(pve[1]*100, 1), "%)"),
ylab = paste0("PC2 (", round(pve[2]*100, 1), "%)"),
main = "PC1 vs PC2")
legend("topright", legend = c("High Salary","Low Salary"),
col = c("#E84646","#2E75B6"), pch = 20, bty = "n")par(mfrow = c(2, 1))
for (i in 1:2) {
load_i <- pr_out$rotation[, i]
cols_l <- ifelse(load_i > 0, "#5B9E5B", "#E84646")
barplot(load_i, col = cols_l, border = "white", las = 2,
main = paste0("PC", i, " Loadings (",
round(pve[i]*100, 1), "% variance)"),
ylab = "Loading")
abline(h = 0, col = "gray", lwd = 0.8)
}

PC1 interpretation: career stats (CAtBat, CHits, CRuns, CRBI) and Years all load positively — PC1 is a career volume / experience axis. Players scoring high on PC1 are veterans with long careers and high salaries.
PC2 interpretation: PutOuts loads positively and Assists negatively — PC2 contrasts fielding profiles (high-PutOuts positions such as catchers and first basemen vs high-Assists infielders).
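A biplot overlays the loading vectors on the score plot, making both interpretations visible in a single picture:

# Scores plus loading arrows; scale = 0 keeps arrows proportional to the loadings
biplot(pr_out, scale = 0, cex = c(0.45, 0.9),
       xlab = paste0("PC1 (", round(pve[1]*100, 1), "%)"),
       ylab = paste0("PC2 (", round(pve[2]*100, 1), "%)"))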
set.seed(1)
K_range <- 2:9
inertias <- numeric(length(K_range))
sil_scores <- numeric(length(K_range))
for (i in seq_along(K_range)) {
km_i <- kmeans(X_uns, centers = K_range[i], nstart = 20)
inertias[i] <- km_i$tot.withinss
sil_i <- silhouette(km_i$cluster, dist(X_uns))
sil_scores[i] <- mean(sil_i[, 3])
}
par(mfrow = c(1, 2))
plot(K_range, inertias, type = "b", pch = 19, col = "steelblue",
xlab = "K", ylab = "Total Within-Cluster SS", main = "Elbow Plot")
plot(K_range, sil_scores, type = "b", pch = 19, col = "tomato",
xlab = "K", ylab = "Average Silhouette Score",
main = "Silhouette Score vs K")
best_K <- K_range[which.max(sil_scores)]
abline(v = best_K, lty = 2, col = "gray", lwd = 1.5)
cat("Optimal K:", best_K, "\n")
#> Optimal K: 2

# Fit K-means at the silhouette-optimal K, then profile the clusters
set.seed(1)
km2 <- kmeans(X_uns, centers = best_K, nstart = 20)
cat("Cluster sizes:\n")
#> Cluster sizes:
table(km2$cluster)
#>
#> 1 2
#> 76 187
cat("\nCluster vs High Salary:\n")
#>
#> Cluster vs High Salary:
table(Cluster = km2$cluster, HighSalary = Hitters$HighSalary)
#> HighSalary
#> Cluster 0 1
#> 1 8 68
#> 2 126 61
cat("\nMean salary by cluster:\n")
#>
#> Mean salary by cluster:
round(tapply(Hitters$Salary, km2$cluster, mean), 3)
#> 1 2
#> 955.647 365.344
cat("\nMean Years by cluster:\n")
#>
#> Mean Years by cluster:
tapply(Hitters$Years, km2$cluster, mean)
#> 1 2
#> 12.434211 5.229947
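The elbow and silhouette plots both point to K = 2; the gap statistic (clusGap from the already-loaded cluster package) is a third, resampling-based check. A sketch with a modest number of reference sets:

# Gap statistic: observed within-cluster dispersion vs uniform reference sets
set.seed(1)
gap_stat <- clusGap(X_uns, FUNcluster = kmeans, nstart = 20, K.max = 9, B = 50)
plot(gap_stat, main = "Gap Statistic vs K")
print(gap_stat, method = "firstSEmax")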
par(mfrow = c(1, 2))
km_col <- c("#2E75B6","#E84646")[km2$cluster]
plot(pr_out$x[,1], pr_out$x[,2], col=km_col, pch=20, cex=0.9,
xlab=paste0("PC1 (",round(pve[1]*100,1),"%)"),
ylab=paste0("PC2 (",round(pve[2]*100,1),"%)"),
main="K-Means K=2 on PC Scores")
legend("topright", legend=c("Cluster 1","Cluster 2"),
col=c("#2E75B6","#E84646"), pch=20, bty="n")
# Cluster profiles — build a matrix: rows=variables, cols=clusters
prof_vars <- c("Years", "CHits", "Salary")
cluster_means <- sapply(prof_vars,
function(v) tapply(Hitters[[v]], km2$cluster, mean))
# cluster_means is now 2 x 3 (clusters x variables)
# Normalise each variable by its maximum so they fit on one axis
norm_means <- sweep(cluster_means, 2, apply(cluster_means, 2, max), "/")
# barplot wants bars-within-group in rows: transpose to 3 x 2 (variables x clusters)
barplot(t(norm_means),
        beside = TRUE,
        col = c("#2E75B6","#E84646","#5B9E5B"),  # one colour per variable (3 rows)
        names.arg = paste("Cluster", 1:2),
        border = "white",
        main = "Cluster Profiles (normalised to max = 1)",
        ylab = "Normalised Mean",
        legend.text = prof_vars,
        args.legend = list(bty = "n", x = "topright"))

# Full dataset
hc_full <- hclust(dist(X_uns), method = "ward.D2")
# Cut at K=2
hc_labs <- cutree(hc_full, k = 2)
cat("Hierarchical K=2 vs High Salary:\n")#> Hierarchical K=2 vs High Salary:
#> HighSalary
#> HC_Cluster 0 1
#> 1 129 96
#> 2 5 33
#>
#> Agreement with K-means:
#> Hierarchical
#> KMeans 1 2
#> 1 38 38
#> 2 187 0
# Dendrogram (60-player sample for readability)
set.seed(1)
samp <- sample(nrow(Hitters), 60)
hc_samp <- hclust(dist(X_uns[samp, ]), method = "ward.D2")
plot(hc_samp,
labels = row.names(Hitters)[samp],
main = "Hierarchical Clustering Dendrogram (Ward Linkage, 60-Player Sample)",
xlab = "", sub = "", cex = 0.65)
rect.hclust(hc_samp, k = 2, border = c("#2E75B6","#E84646"))

par(mfrow = c(2, 2))
for (meth in c("complete","single","average","ward.D2")) {
hc_m <- hclust(dist(X_uns[samp,]), method = meth)
plot(hc_m, labels = FALSE,
main = paste("Linkage:", meth),
xlab = "", sub = "")
}

# Final summary of all results
#> === REGRESSION (Test Set) ===
#> Method RMSE R2
#> 5 Random Forest 0.4713 0.7583
#> 6 Gradient Boosting 0.5077 0.7328
#> 4 Pruned Tree 0.6514 0.5437
#> 2 Ridge 0.6525 0.5610
#> 3 Lasso 0.6730 0.5429
#> 1 OLS 0.6847 0.5463
#>
#> === CLASSIFICATION (Test Set) ===
#> Method Accuracy AUC
#> 1 Logistic Regression 0.8485 0.9079
#> 2 SVC (Linear) 0.8788 0.8955
#> 3 SVM (RBF) 0.8182 NA
#>
#> === PCA ===
#> PC1: 45.3 % (Career volume)
#> PC2: 25.7 % (Fielding position)
#> PC1+PC2: 71 % cumulative
#>
#> === CLUSTERING ===
#> Optimal K (silhouette): 2
#> Mean salary Cluster 1: 956
#> Mean salary Cluster 2: 365
Regression: ensemble methods win decisively. The random forest cuts test RMSE by roughly 30% relative to OLS (0.47 vs 0.68) and explains about 76% of test-set variance in log(Salary), with gradient boosting close behind. Career volume statistics (CAtBat, CRuns, CWalks, CRBI) dominate the importance rankings.

Classification: all three classifiers separate high- from low-salary players well. The linear SVC posts the best test accuracy (87.9%), while logistic regression has the highest AUC (0.908); the small gap suggests the decision boundary is close to linear in these features.

Unsupervised: PCA compresses the 16 numeric statistics into two interpretable axes (career volume and fielding profile) that capture 71% of the variance, and K-means with K = 2 recovers a veteran, high-salary cluster (mean salary ≈ $956K, ≈ 12 years of service) against a younger, low-salary cluster (≈ $365K, ≈ 5 years).
sessionInfo()
#> R version 4.6.0 (2026-04-24 ucrt)
#> Platform: x86_64-w64-mingw32/x64
#> Running under: Windows 11 x64 (build 26200)
#>
#> Matrix products: default
#> LAPACK version 3.12.1
#>
#> locale:
#> [1] LC_COLLATE=English_United States.utf8
#> [2] LC_CTYPE=English_United States.utf8
#> [3] LC_MONETARY=English_United States.utf8
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United States.utf8
#>
#> time zone: America/Chicago
#> tzcode source: internal
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] pROC_1.19.0.1 boot_1.3-32 class_7.3-23
#> [4] cluster_2.1.8.2 tree_1.0-45 e1071_1.7-17
#> [7] gbm_2.2.3 randomForest_4.7-1.2 glmnet_5.0
#> [10] Matrix_1.7-5 ISLR_1.4
#>
#> loaded via a namespace (and not attached):
#> [1] cli_3.6.6 knitr_1.51 rlang_1.2.0 xfun_0.57
#> [5] jsonlite_2.0.0 htmltools_0.5.9 sass_0.4.10 rmarkdown_2.31
#> [9] grid_4.6.0 evaluate_1.0.5 jquerylib_0.1.4 fastmap_1.2.0
#> [13] yaml_2.3.12 foreach_1.5.2 lifecycle_1.0.5 compiler_4.6.0
#> [17] codetools_0.2-20 Rcpp_1.1.1-1.1 rstudioapi_0.18.0 lattice_0.22-9
#> [21] digest_0.6.39 R6_2.6.1 parallel_4.6.0 splines_4.6.0
#> [25] shape_1.4.6.1 bslib_0.10.0 proxy_0.4-29 tools_4.6.0
#> [29] iterators_1.0.14 survival_3.8-6 cachem_1.1.0