Introduction

Questions To Answer:

1. Setup and Data Preparation

library(ISLR)
library(lars)
## Loaded lars 1.3
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
data(Hitters)

HittersAEclean <- subset(Hitters, League == "A" & Division == "E" & !is.na(Salary))
HittersAWclean <- subset(Hitters, League == "A" & Division == "W" & !is.na(Salary))
HittersNEclean <- subset(Hitters, League == "N" & Division == "E" & !is.na(Salary))
HittersNWclean <- subset(Hitters, League == "N" & Division == "W" & !is.na(Salary))

2. Predictor Matrix Setup

X_AE <- as.matrix(HittersAEclean[, -c(20,19,14,15)])
X_AW <- as.matrix(HittersAWclean[, -c(20,19,14,15)])
X_NE <- as.matrix(HittersNEclean[, -c(20,19,14,15)])
X_NW <- as.matrix(HittersNWclean[, -c(20,19,14,15)])

3. LARS Process

# LARS MODELS

larsAE <- lars(X_AE, HittersAEclean$Salary)
larsAW <- lars(X_AW, HittersAWclean$Salary)
larsNE <- lars(X_NE, HittersNEclean$Salary)
larsNW <- lars(X_NW, HittersNWclean$Salary)

# Get coefficients at minimum Cp
I1_AE <- larsAE$Cp == min(larsAE$Cp)
I1_AW <- larsAW$Cp == min(larsAW$Cp)
I1_NE <- larsNE$Cp == min(larsNE$Cp)
I1_NW <- larsNW$Cp == min(larsNW$Cp)

betalarsAE <- larsAE$beta[I1_AE, ]
betalarsAW <- larsAW$beta[I1_AW, ]
betalarsNE <- larsNE$beta[I1_NE, ]
betalarsNW <- larsNW$beta[I1_NW, ]

# LARS CROSS PREDICTIONS

# AE model
predAEAE <- X_AE %*% betalarsAE
predAEAE <- predAEAE + (larsAE$mu - mean(predAEAE))

predAEAW <- X_AW %*% betalarsAE
predAEAW <- predAEAW + (larsAE$mu - mean(predAEAE))

predAENE <- X_NE %*% betalarsAE
predAENE <- predAENE + (larsAE$mu - mean(predAEAE))

predAENW <- X_NW %*% betalarsAE
predAENW <- predAENW + (larsAE$mu - mean(predAEAE))

# AW model
predAWAE <- X_AE %*% betalarsAW
predAWAE <- predAWAE + (larsAW$mu - mean(predAWAE))

predAWAW <- X_AW %*% betalarsAW
predAWAW <- predAWAW + (larsAW$mu - mean(predAWAW))

predAWNE <- X_NE %*% betalarsAW
predAWNE <- predAWNE + (larsAW$mu - mean(predAWAW))

predAWNW <- X_NW %*% betalarsAW
predAWNW <- predAWNW + (larsAW$mu - mean(predAWAW))

# NE model
predNEAE <- X_AE %*% betalarsNE
predNEAE <- predNEAE + (larsNE$mu - mean(predNEAE))

predNEAW <- X_AW %*% betalarsNE
predNEAW <- predNEAW + (larsNE$mu - mean(predNEAE))

predNENE <- X_NE %*% betalarsNE
predNENE <- predNENE + (larsNE$mu - mean(predNENE))

predNENW <- X_NW %*% betalarsNE
predNENW <- predNENW + (larsNE$mu - mean(predNENW))

# NW model
predNWAE <- X_AE %*% betalarsNW
predNWAE <- predNWAE + (larsNW$mu - mean(predNWAE))

predNWAW <- X_AW %*% betalarsNW
predNWAW <- predNWAW + (larsNW$mu - mean(predNWAE))

predNWNE <- X_NE %*% betalarsNW
predNWNE <- predNWNE + (larsNW$mu - mean(predNWNE))

predNWNW <- X_NW %*% betalarsNW
predNWNW <- predNWNW + (larsNW$mu - mean(predNWNW))

# LARS PLOTS: 16 scatterplots

par(mfrow = c(4, 4))
par(mar = c(3, 3, 2, 1))

plot(HittersAEclean$Salary, predAEAE, main = "LARS AE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predAEAW, main = "LARS AE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predAENE, main = "LARS AE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predAENW, main = "LARS AE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")

plot(HittersAEclean$Salary, predAWAE, main = "LARS AW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predAWAW, main = "LARS AW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predAWNE, main = "LARS AW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predAWNW, main = "LARS AW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")

plot(HittersAEclean$Salary, predNEAE, main = "LARS NE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predNEAW, main = "LARS NE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predNENE, main = "LARS NE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predNENW, main = "LARS NE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")

plot(HittersAEclean$Salary, predNWAE, main = "LARS NW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersAWclean$Salary, predNWAW, main = "LARS NW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNEclean$Salary, predNWNE, main = "LARS NW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")
plot(HittersNWclean$Salary, predNWNW, main = "LARS NW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="red")

# LARS CORRELATION MATRIX

corr_lars <- matrix(NA, 4, 4)
rownames(corr_lars) <- c("AE model", "AW model", "NE model", "NW model")
colnames(corr_lars) <- c("AE data", "AW data", "NE data", "NW data")

corr_lars[1,1] <- cor(predAEAE, HittersAEclean$Salary)
corr_lars[1,2] <- cor(predAEAW, HittersAWclean$Salary)
corr_lars[1,3] <- cor(predAENE, HittersNEclean$Salary)
corr_lars[1,4] <- cor(predAENW, HittersNWclean$Salary)

corr_lars[2,1] <- cor(predAWAE, HittersAEclean$Salary)
corr_lars[2,2] <- cor(predAWAW, HittersAWclean$Salary)
corr_lars[2,3] <- cor(predAWNE, HittersNEclean$Salary)
corr_lars[2,4] <- cor(predAWNW, HittersNWclean$Salary)

corr_lars[3,1] <- cor(predNEAE, HittersAEclean$Salary)
corr_lars[3,2] <- cor(predNEAW, HittersAWclean$Salary)
corr_lars[3,3] <- cor(predNENE, HittersNEclean$Salary)
corr_lars[3,4] <- cor(predNENW, HittersNWclean$Salary)

corr_lars[4,1] <- cor(predNWAE, HittersAEclean$Salary)
corr_lars[4,2] <- cor(predNWAW, HittersAWclean$Salary)
corr_lars[4,3] <- cor(predNWNE, HittersNEclean$Salary)
corr_lars[4,4] <- cor(predNWNW, HittersNWclean$Salary)

cat("=== LARS CORRELATION MATRIX ===\n")
## === LARS CORRELATION MATRIX ===
print(round(corr_lars, 3))
##          AE data AW data NE data NW data
## AE model   0.889   0.597   0.549   0.507
## AW model   0.737   0.785   0.579   0.572
## NE model   0.680   0.639   0.684   0.527
## NW model   0.455   0.564   0.145   0.767

The Process Explanation:

  • We fit one LARS model per division (AE, AW, NE, NW) using 16 numeric baseball performance predictors, selecting the best coefficients at minimum Cp for each. Each model was then applied to all four division datasets, producing 16 cross-division predictions (e.g., predAEAW = AE model predicting AW players), with predictions re-centered to the correct salary scale. The results were visualized in 16 scatterplots (actual vs predicted Salary, red = perfect prediction line) and summarized in a 4×4 correlation matrix.

Result Explanation:

  • Within-division, LARS performed reasonably well (diagonal correlations: 0.68–0.89), but cross-division performance dropped significantly (off-diagonal correlations as low as 0.145 for NW→NE), with an average cross division correlation of only 0.546, indicating that LARS linear coefficients learned in one division do not transfer well to other divisions. The straight-line relationships it assumes appear to differ enough across divisions to cause meaningful prediction degradation.

Random Forest Process

# RANDOM FOREST MODELS

# Keep only numeric predictors plus salary
rfAEdata <- HittersAEclean[, c("Salary", colnames(X_AE))]
rfAWdata <- HittersAWclean[, c("Salary", colnames(X_AW))]
rfNEdata <- HittersNEclean[, c("Salary", colnames(X_NE))]
rfNWdata <- HittersNWclean[, c("Salary", colnames(X_NW))]

set.seed(123)

rfAE <- randomForest(Salary ~ ., data = rfAEdata, importance = TRUE)
rfAW <- randomForest(Salary ~ ., data = rfAWdata, importance = TRUE)
rfNE <- randomForest(Salary ~ ., data = rfNEdata, importance = TRUE)
rfNW <- randomForest(Salary ~ ., data = rfNWdata, importance = TRUE)

# RANDOM FOREST CROSS PREDICTIONS

# AE model
rfpredAEAE <- predict(rfAE, newdata = rfAEdata)
rfpredAEAW <- predict(rfAE, newdata = rfAWdata)
rfpredAENE <- predict(rfAE, newdata = rfNEdata)
rfpredAENW <- predict(rfAE, newdata = rfNWdata)

# AW model
rfpredAWAE <- predict(rfAW, newdata = rfAEdata)
rfpredAWAW <- predict(rfAW, newdata = rfAWdata)
rfpredAWNE <- predict(rfAW, newdata = rfNEdata)
rfpredAWNW <- predict(rfAW, newdata = rfNWdata)

# NE model
rfpredNEAE <- predict(rfNE, newdata = rfAEdata)
rfpredNEAW <- predict(rfNE, newdata = rfAWdata)
rfpredNENE <- predict(rfNE, newdata = rfNEdata)
rfpredNENW <- predict(rfNE, newdata = rfNWdata)

# NW model
rfpredNWAE <- predict(rfNW, newdata = rfAEdata)
rfpredNWAW <- predict(rfNW, newdata = rfAWdata)
rfpredNWNE <- predict(rfNW, newdata = rfNEdata)
rfpredNWNW <- predict(rfNW, newdata = rfNWdata)

# RANDOM FOREST PLOTS: 16 scatterplots

par(mfrow = c(4, 4))
par(mar = c(3, 3, 2, 1))

plot(HittersAEclean$Salary, rfpredAEAE, main = "RF AE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredAEAW, main = "RF AE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredAENE, main = "RF AE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredAENW, main = "RF AE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")

plot(HittersAEclean$Salary, rfpredAWAE, main = "RF AW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredAWAW, main = "RF AW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredAWNE, main = "RF AW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredAWNW, main = "RF AW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")

plot(HittersAEclean$Salary, rfpredNEAE, main = "RF NE->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredNEAW, main = "RF NE->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredNENE, main = "RF NE->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredNENW, main = "RF NE->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")

plot(HittersAEclean$Salary, rfpredNWAE, main = "RF NW->AE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersAWclean$Salary, rfpredNWAW, main = "RF NW->AW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNEclean$Salary, rfpredNWNE, main = "RF NW->NE", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")
plot(HittersNWclean$Salary, rfpredNWNW, main = "RF NW->NW", xlab = "Actual", ylab = "Predicted"); abline(0,1,col="blue")

# RANDOM FOREST CORRELATION MATRIX

corr_rf <- matrix(NA, 4, 4)
rownames(corr_rf) <- c("AE model", "AW model", "NE model", "NW model")
colnames(corr_rf) <- c("AE data", "AW data", "NE data", "NW data")

corr_rf[1,1] <- cor(rfpredAEAE, HittersAEclean$Salary)
corr_rf[1,2] <- cor(rfpredAEAW, HittersAWclean$Salary)
corr_rf[1,3] <- cor(rfpredAENE, HittersNEclean$Salary)
corr_rf[1,4] <- cor(rfpredAENW, HittersNWclean$Salary)

corr_rf[2,1] <- cor(rfpredAWAE, HittersAEclean$Salary)
corr_rf[2,2] <- cor(rfpredAWAW, HittersAWclean$Salary)
corr_rf[2,3] <- cor(rfpredAWNE, HittersNEclean$Salary)
corr_rf[2,4] <- cor(rfpredAWNW, HittersNWclean$Salary)

corr_rf[3,1] <- cor(rfpredNEAE, HittersAEclean$Salary)
corr_rf[3,2] <- cor(rfpredNEAW, HittersAWclean$Salary)
corr_rf[3,3] <- cor(rfpredNENE, HittersNEclean$Salary)
corr_rf[3,4] <- cor(rfpredNENW, HittersNWclean$Salary)

corr_rf[4,1] <- cor(rfpredNWAE, HittersAEclean$Salary)
corr_rf[4,2] <- cor(rfpredNWAW, HittersAWclean$Salary)
corr_rf[4,3] <- cor(rfpredNWNE, HittersNEclean$Salary)
corr_rf[4,4] <- cor(rfpredNWNW, HittersNWclean$Salary)

cat("=== RANDOM FOREST CORRELATION MATRIX ===\n")
## === RANDOM FOREST CORRELATION MATRIX ===
print(round(corr_rf, 3))
##          AE data AW data NE data NW data
## AE model   0.959   0.855   0.708   0.702
## AW model   0.755   0.977   0.685   0.686
## NE model   0.763   0.857   0.967   0.784
## NW model   0.771   0.796   0.780   0.948
# Variable importance for each RF model

cat("\n=== RANDOM FOREST VARIABLE IMPORTANCE ===\n")
## 
## === RANDOM FOREST VARIABLE IMPORTANCE ===
cat("\nAE model:\n")
## 
## AE model:
print(importance(rfAE))
##           %IncMSE IncNodePurity
## AtBat    4.588217      758684.6
## Hits     4.978932     1536578.2
## HmRun    4.376369      468982.8
## Runs     7.284818     1595008.8
## RBI      3.319673     1204347.4
## Walks    3.099129     1723029.2
## Years    6.431135      463462.3
## CAtBat   6.844453      850912.1
## CHits    8.392600     1341035.1
## CHmRun   6.966168     2145353.5
## CRuns    9.629997     2023871.9
## CRBI     7.105555     2209174.4
## CWalks   4.159051      786373.9
## PutOuts -3.498514      775555.6
## Assists  1.701594      235967.1
## Errors   2.243344      345421.2
cat("\nAW model:\n")
## 
## AW model:
print(importance(rfAW))
##            %IncMSE IncNodePurity
## AtBat    9.1107980      418413.6
## Hits     7.3172718      300242.2
## HmRun   -0.3918957      106145.1
## Runs     7.7259287      356593.9
## RBI      6.2830783      242418.0
## Walks    2.0108237      163691.4
## Years    3.6546097      160582.2
## CAtBat  11.2979456     1024750.9
## CHits   11.6227897     1193041.4
## CHmRun   3.6951505      293474.7
## CRuns   14.3034110     1542461.7
## CRBI     8.5737913      621661.1
## CWalks   7.7428817      683213.6
## PutOuts -1.5747921      157250.5
## Assists  2.6422665      127784.3
## Errors   3.7222535      133326.3
cat("\nNE model:\n")
## 
## NE model:
print(importance(rfNE))
##            %IncMSE IncNodePurity
## AtBat    0.7876112     514674.21
## Hits     1.7341228     383485.30
## HmRun    0.8933922     271474.99
## Runs     3.3182479     645899.91
## RBI      0.3446691     621430.75
## Walks    8.3750591    1952415.47
## Years   -0.3123428     260383.49
## CAtBat   9.7412979    1759801.57
## CHits    9.5602771    1767768.16
## CHmRun   3.9767567     511291.19
## CRuns    8.5318039    1646026.95
## CRBI     6.3704624    2034404.76
## CWalks   6.0547759    1386568.57
## PutOuts 11.4511072    1162258.70
## Assists -2.4212059     111636.66
## Errors  -2.8344688      64840.85
cat("\nNW model:\n")
## 
## NW model:
print(importance(rfNW))
##             %IncMSE IncNodePurity
## AtBat    6.06339779      556082.0
## Hits     3.87941653      337328.3
## HmRun    2.34171671      598477.1
## Runs     2.20627523      180413.7
## RBI      4.88858755      509162.9
## Walks    0.05511147      269136.8
## Years    3.16848755      156943.2
## CAtBat   5.89617904      669158.1
## CHits    7.71440852      715604.1
## CHmRun   3.19007609      978182.7
## CRuns    4.02045575      506723.6
## CRBI     9.16229273      903154.7
## CWalks   4.03766505      492678.7
## PutOuts  1.61301857      186426.0
## Assists -2.87181694      148892.9
## Errors  -1.03365618      114303.5

The Process Explanation:

  • We fit one Random Forest per division (AE, AW, NE, NW) using the same 16 numeric predictors, with importance = TRUE to track variable contributions and set.seed(123) for reproducibility. Each forest was then applied to all four division datasets producing 16 cross-division predictions (e.g., rfpredAEAW = RF trained on AE predicting AW players), visualized in 16 scatterplots (actual vs predicted Salary, blue = perfect prediction line) and summarized in a 4×4 correlation matrix.

Result Explanation:

  • Within-division RF performed exceptionally well (diagonal: 0.948–0.977), and critically, cross-division performance remained strong (off-diagonal minimum = 0.685), with an average cross-division correlation of 0.762, showing the forest’s learned salary patterns transfer reliably across divisions.

LARS vs. RF

# COMPARE LARS VS RANDOM FOREST

diff_rf_lars <- corr_rf - corr_lars

cat("=== DIFFERENCE MATRIX (RF - LARS) ===\n")
## === DIFFERENCE MATRIX (RF - LARS) ===
cat("Positive values mean Random Forest is better\n\n")
## Positive values mean Random Forest is better
print(round(diff_rf_lars, 3))
##          AE data AW data NE data NW data
## AE model   0.070   0.258   0.158   0.195
## AW model   0.018   0.192   0.106   0.114
## NE model   0.083   0.218   0.283   0.256
## NW model   0.316   0.232   0.634   0.181
# Means
diag_lars <- mean(diag(corr_lars))
diag_rf   <- mean(diag(corr_rf))

off_lars <- mean(corr_lars[row(corr_lars) != col(corr_lars)])
off_rf   <- mean(corr_rf[row(corr_rf) != col(corr_rf)])

overall_lars <- mean(corr_lars)
overall_rf   <- mean(corr_rf)

cat("\n=== SUMMARY ===\n")
## 
## === SUMMARY ===
cat("LARS within-division mean:", round(diag_lars, 3), "\n")
## LARS within-division mean: 0.781
cat("RF within-division mean:", round(diag_rf, 3), "\n\n")
## RF within-division mean: 0.963
cat("LARS cross-division mean:", round(off_lars, 3), "\n")
## LARS cross-division mean: 0.546
cat("RF cross-division mean:", round(off_rf, 3), "\n\n")
## RF cross-division mean: 0.762
cat("LARS overall mean:", round(overall_lars, 3), "\n")
## LARS overall mean: 0.605
cat("RF overall mean:", round(overall_rf, 3), "\n")
## RF overall mean: 0.812

The Process Explanation:

  • Subtracting the LARS correlation matrix from the RF matrix produced a difference matrix where every single value is positive, meaning RF outperformed LARS in all 16 train-test combinations.

Result Explanation:

  • The cross-division gap (0.762- 0.546 = 0.216) is the most important finding. Random Forest’s advantage is largest precisely where generalization matters most with the biggest individual gain at NW→NE (+0.634) and where LARS nearly failed (0.145). However, RF model remained strong (0.780). Variable importance further supports this: career statistics (CRuns, CRBI, CHmRun, CAtBat, CHits) were consistently the top predictors across all four divisions, suggesting RF successfully identified universal salary patterns through its nonlinear splits rather than division-specific linear weights. This is why it transfers better across divisions than LARS.

Final Answers to the 3 Questions:

  1. Yes. The RF cross-division mean correlation (0.762) is substantially higher than LARS (0.546), and every single cell in the difference matrix is positive, meaning RF outperformed LARS in all 16 train-test division combinations.

  2. A single tree makes decisions using if/then split rules that naturally capture nonlinear patterns and interactions between predictors (e.g., career hits mattering more at certain experience levels), which a linear model like LARS cannot do. A random forest averages hundreds of such trees, which reduces overfitting to any one division’s specific patterns and produces more stable predictions when applied to a new division. LARS assumes a straight-line relationship between predictors and Salary which breaks down when salary patterns differ across divisions.

  3. The 16 RF cross-division scatter plots show points clustering tighter around the blue 45° line compared to the LARS red-line plots, which show more scatter and systematic bias at higher salary values. The correlation matrices confirm this numerically: the RF cross-division minimum (0.685) versus the LARS cross-division minimum (0.145) shows that RF maintains prediction quality across all divisions while LARS degrades significantly, particularly in the NW→NE case where LARS nearly fails entirely.