Problem 1
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.2
# Build the modelling data set from the ISLR Hitters data:
#   1. drop the categorical columns (League, Division, NewLeague),
#   2. keep only rows with no missing values,
#   3. add the log-transformed response,
#   4. drop the raw Salary so it cannot be used as a predictor.
# Columns are removed by name rather than by position so the selection does
# not silently change if the column order ever differs.
hitters1 <- Hitters
categorical_cols <- c("League", "Division", "NewLeague")
hitters2 <- hitters1[, !(names(hitters1) %in% categorical_cols)]
hitters3 <- hitters2[complete.cases(hitters2), ]
hitters3$logSalary <- log(hitters3$Salary)
hitters4 <- hitters3[, names(hitters3) != "Salary"]
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
# Fixed seed so the 70/30 split is the same on every run.
set.seed(12345)
# Row indices for the training portion; list = FALSE returns them as a
# matrix that can be used directly for row subsetting.
data_partition <- createDataPartition(
  hitters4[["logSalary"]],
  p = 0.7,
  list = FALSE
)
training <- hitters4[data_partition, ]
testing <- hitters4[-data_partition, ]
Problem 2
library(car)
## Warning: package 'car' was built under R version 3.3.2
# Candidate linear models, narrowed from every predictor down to three.
# Each fit overwrites LM1, so only the last one is inspected below.
LM1 <- lm(formula = logSalary ~ ., data = training)
LM1 <- lm(
  formula = logSalary ~ AtBat + Hits + Walks + PutOuts + Assists,
  data = training
)
LM1 <- lm(formula = logSalary ~ AtBat + Hits + Walks, data = training)
# Variance inflation factors for the three-predictor fit.
vif(LM1)
## AtBat Hits Walks
## 16.374806 15.387746 1.551907
# I removed AtBat because it had the largest VIF (16.37).
# Without AtBat, the VIFs of both remaining predictors (about 1.45) are well below the commonly used threshold of 5.
# Final two-predictor model: season hits and walks only.
LM1 <- lm(formula = logSalary ~ Hits + Walks, data = training)
# Collinearity check for the reduced model.
vif(LM1)
## Hits Walks
## 1.44903 1.44903
# Coefficient table and training-set fit statistics.
summary(LM1)
##
## Call:
## lm(formula = logSalary ~ Hits + Walks, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5822 -0.6320 0.1774 0.5589 2.8355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.820657 0.156824 30.739 < 2e-16 ***
## Hits 0.006499 0.001533 4.239 3.56e-05 ***
## Walks 0.009822 0.003365 2.919 0.00395 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7901 on 182 degrees of freedom
## Multiple R-squared: 0.2428, Adjusted R-squared: 0.2345
## F-statistic: 29.18 on 2 and 182 DF, p-value: 1.02e-11
# R-squared for the training data is 0.2428
# Residuals against fitted values for LM1.
plot(LM1$fitted.values, LM1$residuals, pch = 20, xlab = "Predicted", ylab = "Residuals", main = "Linear Model")
# Reference line for a residual plot is the horizontal line at zero; the
# identity line (intercept 0, slope 1) belongs on an observed-vs-predicted
# plot, not here.
abline(h = 0)

# There seems to be no pattern in the residuals.
# Observed log salary against the LM1 fitted values, drawn with equal axis
# scaling (asp = 1).
plot(
  x = LM1$fitted.values,
  y = training[["logSalary"]],
  asp = 1,
  pch = 20,
  xlab = "Predicted",
  ylab = "Observed",
  main = "Linear Model"
)
# Identity line: points on it are predicted exactly.
abline(a = 0, b = 1)

# Re-check collinearity for the final two-predictor model.
vif(LM1)
## Hits Walks
## 1.44903 1.44903
# Both VIFs are about 1.45. A VIF of 1 would mean no correlation between the predictors, so this indicates only mild correlation and is nothing to be concerned about. Values in the range of 5 to 10 would indicate much stronger correlation.
# Score LM1 on the held-out rows: squared correlation between predicted and
# observed log salary.
predicted_LM1 <- predict(LM1, newdata = testing)
rsquared_LM1 <- cor(x = predicted_LM1, y = testing[["logSalary"]])^2
rsquared_LM1
## [1] 0.2522379
Problem 3
library(leaps)
## Warning: package 'leaps' was built under R version 3.3.2
# Forward stepwise selection by AIC.
# `null` (intercept only) is the starting model and lower bound of the
# search; `full` (all predictors) is the upper bound.
null <- lm(logSalary ~ 1, data = training)
full <- lm(logSalary ~ ., data = training)
# step() has no `data` argument (the data come from the fitted model), so
# the stray `data = training` that was being passed through `...` is removed.
LM2 <- step(
  null,
  scope = list(lower = null, upper = full),
  direction = "forward"
)
## Start: AIC=-36.75
## logSalary ~ 1
##
## Df Sum of Sq RSS AIC
## + CHits 1 62.279 87.758 -133.968
## + CRuns 1 61.152 88.885 -131.607
## + CAtBat 1 58.656 91.381 -126.484
## + CRBI 1 52.014 98.023 -113.503
## + CWalks 1 45.190 104.848 -101.052
## + Years 1 42.795 107.243 -96.873
## + CHmRun 1 35.394 114.643 -84.528
## + Hits 1 31.109 118.929 -77.739
## + RBI 1 28.142 121.895 -73.181
## + Runs 1 25.595 124.442 -69.355
## + AtBat 1 25.314 124.723 -68.938
## + Walks 1 25.209 124.828 -68.782
## + HmRun 1 16.054 133.983 -55.689
## + PutOuts 1 6.397 143.640 -42.813
## <none> 150.037 -36.752
## + Assists 1 0.680 149.357 -35.593
## + Errors 1 0.009 150.028 -34.763
##
## Step: AIC=-133.97
## logSalary ~ CHits
##
## Df Sum of Sq RSS AIC
## + Hits 1 13.7220 74.036 -163.42
## + Runs 1 11.9989 75.759 -159.17
## + AtBat 1 10.6337 77.124 -155.86
## + RBI 1 9.2378 78.520 -152.54
## + Walks 1 7.9072 79.851 -149.44
## + PutOuts 1 6.5531 81.205 -146.32
## + HmRun 1 4.2733 83.485 -141.20
## + CAtBat 1 3.3886 84.370 -139.25
## + Years 1 1.6035 86.155 -135.38
## + CRBI 1 1.1586 86.599 -134.43
## + Assists 1 1.0612 86.697 -134.22
## <none> 87.758 -133.97
## + CHmRun 1 0.8151 86.943 -133.69
## + CWalks 1 0.5362 87.222 -133.10
## + Errors 1 0.1170 87.641 -132.21
## + CRuns 1 0.0740 87.684 -132.12
##
## Step: AIC=-163.42
## logSalary ~ CHits + Hits
##
## Df Sum of Sq RSS AIC
## + PutOuts 1 2.30371 71.732 -167.27
## + AtBat 1 1.41545 72.621 -165.00
## + Walks 1 1.06455 72.972 -164.10
## <none> 74.036 -163.42
## + Errors 1 0.57543 73.461 -162.87
## + Years 1 0.42476 73.611 -162.49
## + CHmRun 1 0.35216 73.684 -162.31
## + CRBI 1 0.34062 73.696 -162.28
## + CAtBat 1 0.27880 73.757 -162.12
## + RBI 1 0.06229 73.974 -161.58
## + HmRun 1 0.05493 73.981 -161.56
## + Runs 1 0.05245 73.984 -161.56
## + CWalks 1 0.01684 74.019 -161.47
## + Assists 1 0.01205 74.024 -161.45
## + CRuns 1 0.00068 74.035 -161.43
##
## Step: AIC=-167.27
## logSalary ~ CHits + Hits + PutOuts
##
## Df Sum of Sq RSS AIC
## + AtBat 1 1.45545 70.277 -169.06
## <none> 71.732 -167.27
## + CRBI 1 0.69234 71.040 -167.07
## + Walks 1 0.58028 71.152 -166.77
## + Years 1 0.56417 71.168 -166.73
## + CHmRun 1 0.53506 71.197 -166.66
## + Errors 1 0.51310 71.219 -166.60
## + CAtBat 1 0.12139 71.611 -165.59
## + Runs 1 0.08114 71.651 -165.48
## + Assists 1 0.02542 71.707 -165.34
## + CRuns 1 0.01330 71.719 -165.31
## + HmRun 1 0.00192 71.731 -165.28
## + RBI 1 0.00161 71.731 -165.28
## + CWalks 1 0.00103 71.731 -165.27
##
## Step: AIC=-169.06
## logSalary ~ CHits + Hits + PutOuts + AtBat
##
## Df Sum of Sq RSS AIC
## + Walks 1 1.28214 68.995 -170.47
## <none> 70.277 -169.06
## + CRBI 1 0.57984 69.697 -168.60
## + Years 1 0.54125 69.736 -168.49
## + CHmRun 1 0.32888 69.948 -167.93
## + Assists 1 0.24828 70.029 -167.72
## + Runs 1 0.24429 70.033 -167.71
## + Errors 1 0.18449 70.093 -167.55
## + CRuns 1 0.11788 70.159 -167.37
## + RBI 1 0.06921 70.208 -167.25
## + HmRun 1 0.04273 70.234 -167.18
## + CWalks 1 0.02884 70.248 -167.14
## + CAtBat 1 0.01971 70.257 -167.12
##
## Step: AIC=-170.47
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks
##
## Df Sum of Sq RSS AIC
## + CRBI 1 0.97209 68.023 -171.09
## <none> 68.995 -170.47
## + CHmRun 1 0.66399 68.331 -170.26
## + Years 1 0.63599 68.359 -170.18
## + CWalks 1 0.58012 68.415 -170.03
## + Assists 1 0.46264 68.532 -169.72
## + Errors 1 0.06954 68.925 -168.66
## + CRuns 1 0.04122 68.954 -168.58
## + RBI 1 0.02362 68.971 -168.53
## + HmRun 1 0.01653 68.978 -168.51
## + Runs 1 0.00882 68.986 -168.49
## + CAtBat 1 0.00169 68.993 -168.47
##
## Step: AIC=-171.1
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI
##
## Df Sum of Sq RSS AIC
## + RBI 1 0.78089 67.242 -171.23
## <none> 68.023 -171.09
## + Years 1 0.54402 67.479 -170.58
## + HmRun 1 0.51320 67.510 -170.50
## + CWalks 1 0.32193 67.701 -169.97
## + Assists 1 0.22721 67.796 -169.71
## + Errors 1 0.12014 67.903 -169.42
## + CHmRun 1 0.04982 67.973 -169.23
## + CAtBat 1 0.02030 68.002 -169.15
## + CRuns 1 0.00374 68.019 -169.10
## + Runs 1 0.00359 68.019 -169.10
##
## Step: AIC=-171.23
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI + RBI
##
## Df Sum of Sq RSS AIC
## <none> 67.242 -171.23
## + Years 1 0.48548 66.756 -170.57
## + Assists 1 0.33082 66.911 -170.14
## + CWalks 1 0.21262 67.029 -169.82
## + Errors 1 0.15537 67.086 -169.66
## + CHmRun 1 0.07190 67.170 -169.43
## + CAtBat 1 0.02184 67.220 -169.29
## + Runs 1 0.01944 67.222 -169.28
## + HmRun 1 0.01869 67.223 -169.28
## + CRuns 1 0.00038 67.241 -169.23
# Score the forward-selected model on the held-out rows: squared correlation
# between predicted and observed log salary.
predicted_LM2 <- predict(LM2, newdata = testing)
Rsquared_LM2 <- cor(x = predicted_LM2, y = testing[["logSalary"]])^2
Rsquared_LM2
## [1] 0.4212
# Backward elimination by AIC, starting from the full model.
# The second positional argument of step() is `scope`, not the data, so the
# data frame previously passed there is removed; with no scope supplied,
# terms are only ever dropped from `full`.
LM3 <- step(full, direction = "backward")
## Start: AIC=-161.44
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts +
## Assists + Errors
##
## Df Sum of Sq RSS AIC
## - CAtBat 1 0.00277 64.326 -163.43
## - CHmRun 1 0.00643 64.330 -163.42
## - CRuns 1 0.08696 64.410 -163.19
## - RBI 1 0.11183 64.435 -163.12
## - Runs 1 0.11738 64.441 -163.10
## - CHits 1 0.15150 64.475 -163.00
## - HmRun 1 0.15473 64.478 -163.00
## - CRBI 1 0.18447 64.508 -162.91
## - CWalks 1 0.55591 64.879 -161.85
## - Years 1 0.68277 65.006 -161.49
## <none> 64.323 -161.44
## - Errors 1 1.00330 65.327 -160.58
## - Assists 1 1.61018 65.934 -158.87
## - PutOuts 1 2.12262 66.446 -157.43
## - Walks 1 2.15548 66.479 -157.34
## - Hits 1 2.36473 66.688 -156.76
## - AtBat 1 2.55156 66.875 -156.24
##
## Step: AIC=-163.43
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts + Assists +
## Errors
##
## Df Sum of Sq RSS AIC
## - CHmRun 1 0.00447 64.331 -165.42
## - RBI 1 0.10978 64.436 -165.12
## - CRuns 1 0.11018 64.436 -165.12
## - Runs 1 0.12591 64.452 -165.07
## - HmRun 1 0.15858 64.485 -164.98
## - CRBI 1 0.18660 64.513 -164.90
## - CHits 1 0.30816 64.634 -164.55
## <none> 64.326 -163.43
## - CWalks 1 0.76838 65.094 -163.24
## - Years 1 0.78033 65.106 -163.20
## - Errors 1 1.00053 65.327 -162.58
## - Assists 1 1.62479 65.951 -160.82
## - PutOuts 1 2.16701 66.493 -159.30
## - Walks 1 2.27754 66.604 -159.00
## - Hits 1 2.94702 67.273 -157.15
## - AtBat 1 3.11340 67.440 -156.69
##
## Step: AIC=-165.42
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CHits + CRuns + CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - RBI 1 0.10979 64.440 -167.10
## - Runs 1 0.15513 64.486 -166.97
## - CRuns 1 0.21506 64.546 -166.80
## - HmRun 1 0.24283 64.573 -166.72
## - CHits 1 0.62192 64.953 -165.64
## <none> 64.331 -165.42
## - Years 1 0.78668 65.117 -165.17
## - CWalks 1 0.87782 65.208 -164.91
## - CRBI 1 0.98899 65.320 -164.60
## - Errors 1 0.99788 65.328 -164.57
## - Assists 1 1.62108 65.952 -162.81
## - PutOuts 1 2.17446 66.505 -161.27
## - Walks 1 2.33502 66.666 -160.82
## - Hits 1 3.01160 67.342 -158.96
## - AtBat 1 3.16207 67.493 -158.54
##
## Step: AIC=-167.1
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits +
## CRuns + CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - CRuns 1 0.1849 64.625 -168.57
## - Runs 1 0.2007 64.641 -168.53
## - CHits 1 0.6080 65.048 -167.37
## <none> 64.440 -167.10
## - Years 1 0.8357 65.276 -166.72
## - CRBI 1 0.8806 65.321 -166.59
## - Errors 1 0.9422 65.383 -166.42
## - CWalks 1 0.9896 65.430 -166.28
## - HmRun 1 1.1478 65.588 -165.84
## - Assists 1 1.6562 66.097 -164.41
## - PutOuts 1 2.1321 66.573 -163.08
## - Walks 1 2.7494 67.190 -161.37
## - AtBat 1 3.1033 67.544 -160.40
## - Hits 1 3.7342 68.175 -158.68
##
## Step: AIC=-168.57
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits +
## CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - Runs 1 0.0761 64.701 -170.36
## - Years 1 0.6994 65.325 -168.58
## <none> 64.625 -168.57
## - CWalks 1 0.8078 65.433 -168.28
## - CRBI 1 0.8574 65.483 -168.13
## - Errors 1 1.0301 65.655 -167.65
## - HmRun 1 1.1341 65.759 -167.35
## - Assists 1 1.6612 66.287 -165.88
## - PutOuts 1 1.9977 66.623 -164.94
## - Walks 1 2.6085 67.234 -163.25
## - AtBat 1 2.9512 67.576 -162.31
## - Hits 1 3.6306 68.256 -160.46
## - CHits 1 4.4185 69.044 -158.34
##
## Step: AIC=-170.36
## logSalary ~ AtBat + Hits + HmRun + Walks + Years + CHits + CRBI +
## CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## <none> 64.701 -170.36
## - Years 1 0.7623 65.464 -170.19
## - CRBI 1 0.7913 65.493 -170.11
## - CWalks 1 0.8654 65.567 -169.90
## - Errors 1 0.9766 65.678 -169.59
## - HmRun 1 1.0704 65.772 -169.32
## - Assists 1 1.6525 66.354 -167.69
## - PutOuts 1 2.2163 66.918 -166.12
## - Walks 1 2.6465 67.348 -164.94
## - AtBat 1 2.9875 67.689 -164.00
## - Hits 1 4.0416 68.743 -161.15
## - CHits 1 4.3473 69.049 -160.33
# Score the backward-eliminated model on the held-out rows: squared
# correlation between predicted and observed log salary.
predicted_LM3 <- predict(LM3, newdata = testing)
Rsquared_LM3 <- cor(x = predicted_LM3, y = testing[["logSalary"]])^2
Rsquared_LM3
## [1] 0.4549475
Problem 4
# Linear regression on all predictors, fitted through caret with 10-fold
# cross-validation, then scored on the held-out rows.
LM4 <- train(
  logSalary ~ .,
  data = training,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_LM4 <- predict(LM4, newdata = testing)
Rsquared_LM4 <- cor(x = predicted_LM4, y = testing[["logSalary"]])^2
Rsquared_LM4
## [1] 0.4518025
# k-nearest-neighbours regression on all predictors, fitted through caret
# with 10-fold cross-validation, then scored on the held-out rows.
kNN1 <- train(
  logSalary ~ .,
  data = training,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_kNN1 <- predict(kNN1, newdata = testing)
Rsquared_kNN1 <- cor(x = predicted_kNN1, y = testing[["logSalary"]])^2
Rsquared_kNN1
## [1] 0.6818339
library(earth)
## Warning: package 'earth' was built under R version 3.3.2
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.3.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.3.2
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 3.3.2
# MARS (earth) model on all predictors, fitted through caret with 10-fold
# cross-validation, then scored on the held-out rows.
MARS1 <- train(
  logSalary ~ .,
  data = training,
  method = "earth",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_MARS1 <- predict(MARS1, newdata = testing)
# For this model cor() came back as a 1 x 1 matrix (row "y") rather than a
# plain number; as.numeric() reduces it to a scalar so Rsquared_MARS1 has the
# same form as the R-squared values of the other models.
Rsquared_MARS1 <- as.numeric(cor(predicted_MARS1, testing$logSalary))^2
Rsquared_MARS1
## [1] 0.6125382
Problem 5
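(a) The test-set R-squared values increase from LM1 (0.252) to LM2 (0.421) to LM3 (0.455); LM4 (0.452) is essentially level with LM3 rather than higher. The later models therefore predict the held-out salaries more accurately than LM1. LM1 uses only two predictors, while the others use a larger, varying number of predictors.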
(b) The k-nearest neighbors model seems to be the best because it has the highest test-set R-squared value (0.682), ahead of the MARS model (0.613) and all four linear models.