Q1: Does random forest generalize better across divisions than LARS?
Q2: Why is it reasonable given the way trees work?
Q3: Use plots and correlations to answer.
library(ISLR)
library(lars)
## Loaded lars 1.3
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
data(Hitters)
HittersAEclean <- subset(Hitters, League == "A" & Division == "E" & !is.na(Salary))
HittersAWclean <- subset(Hitters, League == "A" & Division == "W" & !is.na(Salary))
HittersNEclean <- subset(Hitters, League == "N" & Division == "E" & !is.na(Salary))
HittersNWclean <- subset(Hitters, League == "N" & Division == "W" & !is.na(Salary))
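A quick check of how many players fall in each league/division cell after dropping missing salaries (a small sketch, not required for the rest of the analysis):
# Sample size per League x Division combination among players with a recorded Salary
with(subset(Hitters, !is.na(Salary)), table(League, Division))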
# Numeric predictor matrices: drop League (14), Division (15), Salary (19),
# and NewLeague (20), keeping the 16 numeric predictors
X_AE <- as.matrix(HittersAEclean[, -c(20,19,14,15)])
X_AW <- as.matrix(HittersAWclean[, -c(20,19,14,15)])
X_NE <- as.matrix(HittersNEclean[, -c(20,19,14,15)])
X_NW <- as.matrix(HittersNWclean[, -c(20,19,14,15)])
# LARS MODELS
larsAE <- lars(X_AE, HittersAEclean$Salary)
larsAW <- lars(X_AW, HittersAWclean$Salary)
larsNE <- lars(X_NE, HittersNEclean$Salary)
larsNW <- lars(X_NW, HittersNWclean$Salary)
# Get coefficients at minimum Cp
I1_AE <- larsAE$Cp == min(larsAE$Cp)
I1_AW <- larsAW$Cp == min(larsAW$Cp)
I1_NE <- larsNE$Cp == min(larsNE$Cp)
I1_NW <- larsNW$Cp == min(larsNW$Cp)
betalarsAE <- larsAE$beta[I1_AE, ]
betalarsAW <- larsAW$beta[I1_AW, ]
betalarsNE <- larsNE$beta[I1_NE, ]
betalarsNW <- larsNW$beta[I1_NW, ]
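As a quick check on the minimum-Cp rule, the selected step for each model can also be listed directly; which.min picks out the same step as the logical indexing above (a small sketch):
# Position of the minimum-Cp step along each LARS path
sapply(list(AE = larsAE, AW = larsAW, NE = larsNE, NW = larsNW),
       function(m) which.min(m$Cp))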
# LARS CROSS PREDICTIONS
# The lars fit is Salary ~ mu + (x - meanx) %*% beta, so each model has a single
# intercept (mu minus the mean of its training-set linear predictor) that is
# added to its predictions for every division.
# AE model
intAE <- larsAE$mu - mean(X_AE %*% betalarsAE)
predAEAE <- X_AE %*% betalarsAE + intAE
predAEAW <- X_AW %*% betalarsAE + intAE
predAENE <- X_NE %*% betalarsAE + intAE
predAENW <- X_NW %*% betalarsAE + intAE
# AW model
intAW <- larsAW$mu - mean(X_AW %*% betalarsAW)
predAWAE <- X_AE %*% betalarsAW + intAW
predAWAW <- X_AW %*% betalarsAW + intAW
predAWNE <- X_NE %*% betalarsAW + intAW
predAWNW <- X_NW %*% betalarsAW + intAW
# NE model
intNE <- larsNE$mu - mean(X_NE %*% betalarsNE)
predNEAE <- X_AE %*% betalarsNE + intNE
predNEAW <- X_AW %*% betalarsNE + intNE
predNENE <- X_NE %*% betalarsNE + intNE
predNENW <- X_NW %*% betalarsNE + intNE
# NW model
intNW <- larsNW$mu - mean(X_NW %*% betalarsNW)
predNWAE <- X_AE %*% betalarsNW + intNW
predNWAW <- X_AW %*% betalarsNW + intNW
predNWNE <- X_NE %*% betalarsNW + intNW
predNWNW <- X_NW %*% betalarsNW + intNW
# LARS PLOTS: 16 scatterplots
par(mfrow = c(4, 4))
par(mar = c(3, 3, 2, 1))
plot(HittersAEclean$Salary, predAEAE, main = "LARS AE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predAEAW, main = "LARS AE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predAENE, main = "LARS AE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predAENW, main = "LARS AE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAEclean$Salary, predAWAE, main = "LARS AW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predAWAW, main = "LARS AW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predAWNE, main = "LARS AW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predAWNW, main = "LARS AW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAEclean$Salary, predNEAE, main = "LARS NE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predNEAW, main = "LARS NE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predNENE, main = "LARS NE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predNENW, main = "LARS NE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAEclean$Salary, predNWAE, main = "LARS NW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predNWAW, main = "LARS NW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predNWNE, main = "LARS NW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predNWNW, main = "LARS NW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
# LARS CORRELATION MATRIX
corr_lars <- matrix(NA, 4, 4)
rownames(corr_lars) <- c("AE model", "AW model", "NE model", "NW model")
colnames(corr_lars) <- c("AE data", "AW data", "NE data", "NW data")
corr_lars[1,1] <- cor(predAEAE, HittersAEclean$Salary)
corr_lars[1,2] <- cor(predAEAW, HittersAWclean$Salary)
corr_lars[1,3] <- cor(predAENE, HittersNEclean$Salary)
corr_lars[1,4] <- cor(predAENW, HittersNWclean$Salary)
corr_lars[2,1] <- cor(predAWAE, HittersAEclean$Salary)
corr_lars[2,2] <- cor(predAWAW, HittersAWclean$Salary)
corr_lars[2,3] <- cor(predAWNE, HittersNEclean$Salary)
corr_lars[2,4] <- cor(predAWNW, HittersNWclean$Salary)
corr_lars[3,1] <- cor(predNEAE, HittersAEclean$Salary)
corr_lars[3,2] <- cor(predNEAW, HittersAWclean$Salary)
corr_lars[3,3] <- cor(predNENE, HittersNEclean$Salary)
corr_lars[3,4] <- cor(predNENW, HittersNWclean$Salary)
corr_lars[4,1] <- cor(predNWAE, HittersAEclean$Salary)
corr_lars[4,2] <- cor(predNWAW, HittersAWclean$Salary)
corr_lars[4,3] <- cor(predNWNE, HittersNEclean$Salary)
corr_lars[4,4] <- cor(predNWNW, HittersNWclean$Salary)
cat("=== LARS CORRELATION MATRIX ===\n")
## === LARS CORRELATION MATRIX ===
print(round(corr_lars, 3))
## AE data AW data NE data NW data
## AE model 0.889 0.597 0.549 0.507
## AW model 0.737 0.785 0.579 0.572
## NE model 0.680 0.639 0.684 0.527
## NW model 0.455 0.564 0.145 0.767
Each LARS model was taken at its minimum-Cp step and applied to all four
division datasets, producing 16 cross-division predictions (e.g., predAEAW =
the AE model predicting AW players), with each model's intercept added so the
predictions sit on the original salary scale. The results were visualized in
16 scatterplots (actual vs. predicted Salary, red = perfect-prediction line)
and summarized in a 4×4 correlation matrix.
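The 16 prediction vectors and 16 cor() calls used here could also be produced with a short double loop. A compact sketch (X_list, sal_list, and beta_list are names introduced only for this illustration; the intercepts are omitted because adding a constant does not change a Pearson correlation):
# Compact equivalent of the LARS correlation matrix; rows = model, columns = data
X_list    <- list(AE = X_AE, AW = X_AW, NE = X_NE, NW = X_NW)
sal_list  <- list(AE = HittersAEclean$Salary, AW = HittersAWclean$Salary,
                  NE = HittersNEclean$Salary, NW = HittersNWclean$Salary)
beta_list <- list(AE = betalarsAE, AW = betalarsAW, NE = betalarsNE, NW = betalarsNW)
corr_lars_check <- sapply(names(X_list), function(d)   # columns: target data
  sapply(names(beta_list), function(m)                 # rows: fitted model
    cor(drop(X_list[[d]] %*% beta_list[[m]]), sal_list[[d]])))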
# RANDOM FOREST MODELS
# Keep only numeric predictors plus salary
rfAEdata <- HittersAEclean[, c("Salary", colnames(X_AE))]
rfAWdata <- HittersAWclean[, c("Salary", colnames(X_AW))]
rfNEdata <- HittersNEclean[, c("Salary", colnames(X_NE))]
rfNWdata <- HittersNWclean[, c("Salary", colnames(X_NW))]
set.seed(123)
rfAE <- randomForest(Salary ~ ., data = rfAEdata, importance = TRUE)
rfAW <- randomForest(Salary ~ ., data = rfAWdata, importance = TRUE)
rfNE <- randomForest(Salary ~ ., data = rfNEdata, importance = TRUE)
rfNW <- randomForest(Salary ~ ., data = rfNWdata, importance = TRUE)
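For reference, the calls above use randomForest's regression defaults: 500 trees and mtry = floor(p/3) candidate predictors at each split. Written out explicitly for the AE model (a sketch with the same specification; refitting will not reproduce the exact numbers unless the seed is reset):
# Same specification as rfAE with the defaults spelled out (p = 16 predictors here)
rfAE_explicit <- randomForest(Salary ~ ., data = rfAEdata, importance = TRUE,
                              ntree = 500, mtry = floor((ncol(rfAEdata) - 1) / 3))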
# RANDOM FOREST CROSS PREDICTIONS
# AE model
rfpredAEAE <- predict(rfAE, newdata = rfAEdata)
rfpredAEAW <- predict(rfAE, newdata = rfAWdata)
rfpredAENE <- predict(rfAE, newdata = rfNEdata)
rfpredAENW <- predict(rfAE, newdata = rfNWdata)
# AW model
rfpredAWAE <- predict(rfAW, newdata = rfAEdata)
rfpredAWAW <- predict(rfAW, newdata = rfAWdata)
rfpredAWNE <- predict(rfAW, newdata = rfNEdata)
rfpredAWNW <- predict(rfAW, newdata = rfNWdata)
# NE model
rfpredNEAE <- predict(rfNE, newdata = rfAEdata)
rfpredNEAW <- predict(rfNE, newdata = rfAWdata)
rfpredNENE <- predict(rfNE, newdata = rfNEdata)
rfpredNENW <- predict(rfNE, newdata = rfNWdata)
# NW model
rfpredNWAE <- predict(rfNW, newdata = rfAEdata)
rfpredNWAW <- predict(rfNW, newdata = rfAWdata)
rfpredNWNE <- predict(rfNW, newdata = rfNEdata)
rfpredNWNW <- predict(rfNW, newdata = rfNWdata)
# RANDOM FOREST PLOTS: 16 scatterplots
par(mfrow = c(4, 4))
par(mar = c(3, 3, 2, 1))
plot(HittersAEclean$Salary, rfpredAEAE, main = "RF AE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredAEAW, main = "RF AE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredAENE, main = "RF AE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredAENW, main = "RF AE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAEclean$Salary, rfpredAWAE, main = "RF AW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredAWAW, main = "RF AW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredAWNE, main = "RF AW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredAWNW, main = "RF AW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAEclean$Salary, rfpredNEAE, main = "RF NE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredNEAW, main = "RF NE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredNENE, main = "RF NE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredNENW, main = "RF NE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAEclean$Salary, rfpredNWAE, main = "RF NW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredNWAW, main = "RF NW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredNWNE, main = "RF NW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredNWNW, main = "RF NW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
# RANDOM FOREST CORRELATION MATRIX
corr_rf <- matrix(NA, 4, 4)
rownames(corr_rf) <- c("AE model", "AW model", "NE model", "NW model")
colnames(corr_rf) <- c("AE data", "AW data", "NE data", "NW data")
corr_rf[1,1] <- cor(rfpredAEAE, HittersAEclean$Salary)
corr_rf[1,2] <- cor(rfpredAEAW, HittersAWclean$Salary)
corr_rf[1,3] <- cor(rfpredAENE, HittersNEclean$Salary)
corr_rf[1,4] <- cor(rfpredAENW, HittersNWclean$Salary)
corr_rf[2,1] <- cor(rfpredAWAE, HittersAEclean$Salary)
corr_rf[2,2] <- cor(rfpredAWAW, HittersAWclean$Salary)
corr_rf[2,3] <- cor(rfpredAWNE, HittersNEclean$Salary)
corr_rf[2,4] <- cor(rfpredAWNW, HittersNWclean$Salary)
corr_rf[3,1] <- cor(rfpredNEAE, HittersAEclean$Salary)
corr_rf[3,2] <- cor(rfpredNEAW, HittersAWclean$Salary)
corr_rf[3,3] <- cor(rfpredNENE, HittersNEclean$Salary)
corr_rf[3,4] <- cor(rfpredNENW, HittersNWclean$Salary)
corr_rf[4,1] <- cor(rfpredNWAE, HittersAEclean$Salary)
corr_rf[4,2] <- cor(rfpredNWAW, HittersAWclean$Salary)
corr_rf[4,3] <- cor(rfpredNWNE, HittersNEclean$Salary)
corr_rf[4,4] <- cor(rfpredNWNW, HittersNWclean$Salary)
cat("=== RANDOM FOREST CORRELATION MATRIX ===\n")
## === RANDOM FOREST CORRELATION MATRIX ===
print(round(corr_rf, 3))
## AE data AW data NE data NW data
## AE model 0.959 0.855 0.708 0.702
## AW model 0.755 0.977 0.685 0.686
## NE model 0.763 0.857 0.967 0.784
## NW model 0.771 0.796 0.780 0.948
# Variable importance for each RF model
cat("\n=== RANDOM FOREST VARIABLE IMPORTANCE ===\n")
##
## === RANDOM FOREST VARIABLE IMPORTANCE ===
cat("\nAE model:\n")
##
## AE model:
print(importance(rfAE))
## %IncMSE IncNodePurity
## AtBat 4.588217 758684.6
## Hits 4.978932 1536578.2
## HmRun 4.376369 468982.8
## Runs 7.284818 1595008.8
## RBI 3.319673 1204347.4
## Walks 3.099129 1723029.2
## Years 6.431135 463462.3
## CAtBat 6.844453 850912.1
## CHits 8.392600 1341035.1
## CHmRun 6.966168 2145353.5
## CRuns 9.629997 2023871.9
## CRBI 7.105555 2209174.4
## CWalks 4.159051 786373.9
## PutOuts -3.498514 775555.6
## Assists 1.701594 235967.1
## Errors 2.243344 345421.2
cat("\nAW model:\n")
##
## AW model:
print(importance(rfAW))
## %IncMSE IncNodePurity
## AtBat 9.1107980 418413.6
## Hits 7.3172718 300242.2
## HmRun -0.3918957 106145.1
## Runs 7.7259287 356593.9
## RBI 6.2830783 242418.0
## Walks 2.0108237 163691.4
## Years 3.6546097 160582.2
## CAtBat 11.2979456 1024750.9
## CHits 11.6227897 1193041.4
## CHmRun 3.6951505 293474.7
## CRuns 14.3034110 1542461.7
## CRBI 8.5737913 621661.1
## CWalks 7.7428817 683213.6
## PutOuts -1.5747921 157250.5
## Assists 2.6422665 127784.3
## Errors 3.7222535 133326.3
cat("\nNE model:\n")
##
## NE model:
print(importance(rfNE))
## %IncMSE IncNodePurity
## AtBat 0.7876112 514674.21
## Hits 1.7341228 383485.30
## HmRun 0.8933922 271474.99
## Runs 3.3182479 645899.91
## RBI 0.3446691 621430.75
## Walks 8.3750591 1952415.47
## Years -0.3123428 260383.49
## CAtBat 9.7412979 1759801.57
## CHits 9.5602771 1767768.16
## CHmRun 3.9767567 511291.19
## CRuns 8.5318039 1646026.95
## CRBI 6.3704624 2034404.76
## CWalks 6.0547759 1386568.57
## PutOuts 11.4511072 1162258.70
## Assists -2.4212059 111636.66
## Errors -2.8344688 64840.85
cat("\nNW model:\n")
##
## NW model:
print(importance(rfNW))
## %IncMSE IncNodePurity
## AtBat 6.06339779 556082.0
## Hits 3.87941653 337328.3
## HmRun 2.34171671 598477.1
## Runs 2.20627523 180413.7
## RBI 4.88858755 509162.9
## Walks 0.05511147 269136.8
## Years 3.16848755 156943.2
## CAtBat 5.89617904 669158.1
## CHits 7.71440852 715604.1
## CHmRun 3.19007609 978182.7
## CRuns 4.02045575 506723.6
## CRBI 9.16229273 903154.7
## CWalks 4.03766505 492678.7
## PutOuts 1.61301857 186426.0
## Assists -2.87181694 148892.9
## Errors -1.03365618 114303.5
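The same importance information can also be viewed graphically; a sketch for one model (varImpPlot sorts predictors by each importance measure):
# Dot charts of %IncMSE and IncNodePurity for the AE forest
varImpPlot(rfAE, main = "AE model variable importance")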
Four random forests were fit, one per division, with importance = TRUE to
track variable contributions and set.seed(123) for reproducibility. Each
forest was then applied to all four division datasets, producing 16
cross-division predictions (e.g., rfpredAEAW = the RF trained on AE predicting
AW players), visualized in 16 scatterplots (actual vs. predicted Salary,
blue = perfect-prediction line) and summarized in a 4×4 correlation matrix.
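One caveat on the diagonal entries above: predicting a division with the forest trained on that same division uses in-sample fits, so those correlations are optimistic. Out-of-bag predictions, which predict() returns when newdata is omitted, give a fairer within-division baseline; a small sketch:
# OOB within-division correlations (predict() with no newdata returns OOB predictions)
oob_cor <- c(AE = cor(predict(rfAE), rfAEdata$Salary),
             AW = cor(predict(rfAW), rfAWdata$Salary),
             NE = cor(predict(rfNE), rfNEdata$Salary),
             NW = cor(predict(rfNW), rfNWdata$Salary))
round(oob_cor, 3)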
# COMPARE LARS VS RANDOM FOREST
diff_rf_lars <- corr_rf - corr_lars
cat("=== DIFFERENCE MATRIX (RF - LARS) ===\n")
## === DIFFERENCE MATRIX (RF - LARS) ===
cat("Positive values mean Random Forest is better\n\n")
## Positive values mean Random Forest is better
print(round(diff_rf_lars, 3))
## AE data AW data NE data NW data
## AE model 0.070 0.258 0.158 0.195
## AW model 0.018 0.192 0.106 0.114
## NE model 0.083 0.218 0.283 0.256
## NW model 0.316 0.232 0.634 0.181
# Mean correlations: within-division (diagonal), cross-division (off-diagonal), overall
diag_lars <- mean(diag(corr_lars))
diag_rf <- mean(diag(corr_rf))
off_lars <- mean(corr_lars[row(corr_lars) != col(corr_lars)])
off_rf <- mean(corr_rf[row(corr_rf) != col(corr_rf)])
overall_lars <- mean(corr_lars)
overall_rf <- mean(corr_rf)
cat("\n=== SUMMARY ===\n")
##
## === SUMMARY ===
cat("LARS within-division mean:", round(diag_lars, 3), "\n")
## LARS within-division mean: 0.781
cat("RF within-division mean:", round(diag_rf, 3), "\n\n")
## RF within-division mean: 0.963
cat("LARS cross-division mean:", round(off_lars, 3), "\n")
## LARS cross-division mean: 0.546
cat("RF cross-division mean:", round(off_rf, 3), "\n\n")
## RF cross-division mean: 0.762
cat("LARS overall mean:", round(overall_lars, 3), "\n")
## LARS overall mean: 0.605
cat("RF overall mean:", round(overall_rf, 3), "\n")
## RF overall mean: 0.812
Q1: Yes. The RF cross-division mean correlation (0.762) is substantially higher than the LARS mean (0.546), and every cell of the (RF - LARS) difference matrix is positive, meaning RF outperformed LARS in all 16 train/test division combinations.
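A small check of these claims using the matrices computed above:
# All 16 RF - LARS differences positive, and the cross-division (off-diagonal) means
all(diff_rf_lars > 0)
offd <- row(corr_rf) != col(corr_rf)
c(lars = mean(corr_lars[offd]), rf = mean(corr_rf[offd]))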
Q2: A single tree makes decisions with if/then split rules, which naturally capture nonlinear patterns and interactions between predictors (e.g., career hits mattering more at certain experience levels), something a model that is linear in the raw predictors, as LARS is here, cannot do. A random forest averages hundreds of such trees, which reduces overfitting to any one division's idiosyncrasies and yields more stable predictions on a new division. LARS fits a single linear relationship between the predictors and Salary, and that relationship breaks down when the salary structure differs across divisions.
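The nonlinearity argument can be illustrated on this data with partial dependence curves from one of the forests (a sketch; the choice of variables and of the AE model is only illustrative):
# Partial dependence of predicted Salary on two career variables in the AE forest;
# these curves are generally not straight lines, unlike a single LARS coefficient
par(mfrow = c(1, 2))
partialPlot(rfAE, rfAEdata, x.var = "CHits", main = "AE forest: CHits")
partialPlot(rfAE, rfAEdata, x.var = "Years", main = "AE forest: Years")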
Q3: The 16 RF cross-division scatterplots show points clustering more tightly around the blue 45° line than the corresponding LARS plots, which show more scatter and systematic bias at higher salaries. The correlation matrices confirm this numerically: the RF cross-division minimum (0.685) versus the LARS cross-division minimum (0.145) shows that RF maintains prediction quality across every division pairing, while LARS degrades sharply, most severely in the NW→NE case, where it nearly fails entirely.
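The minima quoted above can be pulled directly from the two correlation matrices:
# Worst cross-division correlation for each method, and where the LARS worst case sits
offd <- row(corr_lars) != col(corr_lars)
c(lars_min = min(corr_lars[offd]), rf_min = min(corr_rf[offd]))
which(corr_lars == min(corr_lars[offd]), arr.ind = TRUE)  # row = model, column = data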