Problem 1

library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.2
# Build the modelling table from ISLR's Hitters data:
#   hitters1 - untouched copy of Hitters
#   hitters2 - League, Division and NewLeague removed
#   hitters3 - rows with any missing value removed, logSalary added
#   hitters4 - Salary removed, so logSalary is the only response column
# Columns are dropped by name rather than by position so that a change in the
# column layout cannot silently remove the wrong variables.
hitters1 <- Hitters
dropped_columns <- c("League", "Division", "NewLeague")
hitters2 <- hitters1[, !(names(hitters1) %in% dropped_columns), drop = FALSE]
hitters3 <- hitters2[complete.cases(hitters2), ]
# Assigning the column directly creates it; no NA placeholder is needed first.
hitters3$logSalary <- log(hitters3$Salary)
hitters4 <- hitters3[, names(hitters3) != "Salary", drop = FALSE]
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
# Split hitters4 into training and testing sets. The seed is fixed so the same
# rows are selected on every run; p = 0.7 asks caret to put about 70% of the
# rows into the training set, and list = FALSE returns the row indices as a
# matrix instead of a list.
set.seed(12345)
data_partition <- createDataPartition(
  y = hitters4[["logSalary"]],
  p = 0.7,
  list = FALSE
)
# Rows named by the partition form the training set; all other rows are held
# out for testing.
training <- hitters4[data_partition, ]
testing <- hitters4[-data_partition, ]

Problem 2

library(car)
## Warning: package 'car' was built under R version 3.3.2
# Build the hand-picked linear model LM1 on the training set.
# Candidates considered before this point were the model with all predictors
# and then AtBat + Hits + Walks + PutOuts + Assists. They are not refitted
# here: each fit was overwritten by the next assignment to LM1 before anything
# read it, so refitting them only cost time.
# Start from AtBat + Hits + Walks and screen for collinearity with VIFs.
LM1 <- lm(logSalary ~ AtBat + Hits + Walks, data = training)
vif(LM1)
##     AtBat      Hits     Walks 
## 16.374806 15.387746  1.551907
# I removed AtBat because it had the largest VIF value.
# Without AtBat, both predictors were under the acceptable value.
LM1 <- lm(logSalary ~ Hits + Walks, data = training)
vif(LM1)
##    Hits   Walks 
## 1.44903 1.44903
summary(LM1)
## 
## Call:
## lm(formula = logSalary ~ Hits + Walks, data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5822 -0.6320  0.1774  0.5589  2.8355 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.820657   0.156824  30.739  < 2e-16 ***
## Hits        0.006499   0.001533   4.239 3.56e-05 ***
## Walks       0.009822   0.003365   2.919  0.00395 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7901 on 182 degrees of freedom
## Multiple R-squared:  0.2428, Adjusted R-squared:  0.2345 
## F-statistic: 29.18 on 2 and 182 DF,  p-value: 1.02e-11
# R-squared for the training data is 0.2428

# Residuals against fitted values for LM1. Residuals of a well-specified model
# scatter around zero, so the reference line is the horizontal line y = 0
# (abline(0, 1) would draw the identity line y = x, which is only meaningful
# on the observed-vs-predicted plot below).
plot(
  fitted(LM1), residuals(LM1),
  pch = 20,
  xlab = "Predicted", ylab = "Residuals",
  main = "Linear Model"
)
abline(h = 0)

# There seems to be no pattern in the residuals.
# Observed against predicted log salary on the training set. asp = 1 gives
# both axes the same scale, and the identity line y = x marks perfect
# prediction.
plot(
  fitted(LM1), training$logSalary,
  asp = 1, pch = 20,
  xlab = "Predicted", ylab = "Observed",
  main = "Linear Model"
)
abline(a = 0, b = 1)

# Variance inflation factors of the final two-predictor model.
car::vif(LM1)
##    Hits   Walks 
## 1.44903 1.44903
# Both VIFs are above 1, so Hits and Walks are slightly correlated with each
# other, but not enough to be a concern. Values between 5 and 10 would point
# to much stronger correlation between the predictors.

# Test-set performance of LM1: predict the held-out rows, then take the
# squared correlation between predicted and observed log salary.
predicted_LM1 <- predict(object = LM1, newdata = testing)
rsquared_LM1 <- cor(x = predicted_LM1, y = testing[["logSalary"]])^2
print(rsquared_LM1)
## [1] 0.2522379

Problem 3

library(leaps)
## Warning: package 'leaps' was built under R version 3.3.2
# Forward stepwise selection by AIC.
#   null - intercept-only model, the starting point and lower bound
#   full - model with every predictor, the upper bound of the search
# step() has no `data` argument (the data come from the fitted model), so it
# is not passed. The scope is given as formulae, as step()'s documentation
# specifies; formula() extracts them from the two fitted models.
null <- lm(logSalary ~ 1, data = training)
full <- lm(logSalary ~ ., data = training)
LM2 <- step(
  null,
  scope = list(lower = formula(null), upper = formula(full)),
  direction = "forward"
)
## Start:  AIC=-36.75
## logSalary ~ 1
## 
##           Df Sum of Sq     RSS      AIC
## + CHits    1    62.279  87.758 -133.968
## + CRuns    1    61.152  88.885 -131.607
## + CAtBat   1    58.656  91.381 -126.484
## + CRBI     1    52.014  98.023 -113.503
## + CWalks   1    45.190 104.848 -101.052
## + Years    1    42.795 107.243  -96.873
## + CHmRun   1    35.394 114.643  -84.528
## + Hits     1    31.109 118.929  -77.739
## + RBI      1    28.142 121.895  -73.181
## + Runs     1    25.595 124.442  -69.355
## + AtBat    1    25.314 124.723  -68.938
## + Walks    1    25.209 124.828  -68.782
## + HmRun    1    16.054 133.983  -55.689
## + PutOuts  1     6.397 143.640  -42.813
## <none>                 150.037  -36.752
## + Assists  1     0.680 149.357  -35.593
## + Errors   1     0.009 150.028  -34.763
## 
## Step:  AIC=-133.97
## logSalary ~ CHits
## 
##           Df Sum of Sq    RSS     AIC
## + Hits     1   13.7220 74.036 -163.42
## + Runs     1   11.9989 75.759 -159.17
## + AtBat    1   10.6337 77.124 -155.86
## + RBI      1    9.2378 78.520 -152.54
## + Walks    1    7.9072 79.851 -149.44
## + PutOuts  1    6.5531 81.205 -146.32
## + HmRun    1    4.2733 83.485 -141.20
## + CAtBat   1    3.3886 84.370 -139.25
## + Years    1    1.6035 86.155 -135.38
## + CRBI     1    1.1586 86.599 -134.43
## + Assists  1    1.0612 86.697 -134.22
## <none>                 87.758 -133.97
## + CHmRun   1    0.8151 86.943 -133.69
## + CWalks   1    0.5362 87.222 -133.10
## + Errors   1    0.1170 87.641 -132.21
## + CRuns    1    0.0740 87.684 -132.12
## 
## Step:  AIC=-163.42
## logSalary ~ CHits + Hits
## 
##           Df Sum of Sq    RSS     AIC
## + PutOuts  1   2.30371 71.732 -167.27
## + AtBat    1   1.41545 72.621 -165.00
## + Walks    1   1.06455 72.972 -164.10
## <none>                 74.036 -163.42
## + Errors   1   0.57543 73.461 -162.87
## + Years    1   0.42476 73.611 -162.49
## + CHmRun   1   0.35216 73.684 -162.31
## + CRBI     1   0.34062 73.696 -162.28
## + CAtBat   1   0.27880 73.757 -162.12
## + RBI      1   0.06229 73.974 -161.58
## + HmRun    1   0.05493 73.981 -161.56
## + Runs     1   0.05245 73.984 -161.56
## + CWalks   1   0.01684 74.019 -161.47
## + Assists  1   0.01205 74.024 -161.45
## + CRuns    1   0.00068 74.035 -161.43
## 
## Step:  AIC=-167.27
## logSalary ~ CHits + Hits + PutOuts
## 
##           Df Sum of Sq    RSS     AIC
## + AtBat    1   1.45545 70.277 -169.06
## <none>                 71.732 -167.27
## + CRBI     1   0.69234 71.040 -167.07
## + Walks    1   0.58028 71.152 -166.77
## + Years    1   0.56417 71.168 -166.73
## + CHmRun   1   0.53506 71.197 -166.66
## + Errors   1   0.51310 71.219 -166.60
## + CAtBat   1   0.12139 71.611 -165.59
## + Runs     1   0.08114 71.651 -165.48
## + Assists  1   0.02542 71.707 -165.34
## + CRuns    1   0.01330 71.719 -165.31
## + HmRun    1   0.00192 71.731 -165.28
## + RBI      1   0.00161 71.731 -165.28
## + CWalks   1   0.00103 71.731 -165.27
## 
## Step:  AIC=-169.06
## logSalary ~ CHits + Hits + PutOuts + AtBat
## 
##           Df Sum of Sq    RSS     AIC
## + Walks    1   1.28214 68.995 -170.47
## <none>                 70.277 -169.06
## + CRBI     1   0.57984 69.697 -168.60
## + Years    1   0.54125 69.736 -168.49
## + CHmRun   1   0.32888 69.948 -167.93
## + Assists  1   0.24828 70.029 -167.72
## + Runs     1   0.24429 70.033 -167.71
## + Errors   1   0.18449 70.093 -167.55
## + CRuns    1   0.11788 70.159 -167.37
## + RBI      1   0.06921 70.208 -167.25
## + HmRun    1   0.04273 70.234 -167.18
## + CWalks   1   0.02884 70.248 -167.14
## + CAtBat   1   0.01971 70.257 -167.12
## 
## Step:  AIC=-170.47
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks
## 
##           Df Sum of Sq    RSS     AIC
## + CRBI     1   0.97209 68.023 -171.09
## <none>                 68.995 -170.47
## + CHmRun   1   0.66399 68.331 -170.26
## + Years    1   0.63599 68.359 -170.18
## + CWalks   1   0.58012 68.415 -170.03
## + Assists  1   0.46264 68.532 -169.72
## + Errors   1   0.06954 68.925 -168.66
## + CRuns    1   0.04122 68.954 -168.58
## + RBI      1   0.02362 68.971 -168.53
## + HmRun    1   0.01653 68.978 -168.51
## + Runs     1   0.00882 68.986 -168.49
## + CAtBat   1   0.00169 68.993 -168.47
## 
## Step:  AIC=-171.1
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI
## 
##           Df Sum of Sq    RSS     AIC
## + RBI      1   0.78089 67.242 -171.23
## <none>                 68.023 -171.09
## + Years    1   0.54402 67.479 -170.58
## + HmRun    1   0.51320 67.510 -170.50
## + CWalks   1   0.32193 67.701 -169.97
## + Assists  1   0.22721 67.796 -169.71
## + Errors   1   0.12014 67.903 -169.42
## + CHmRun   1   0.04982 67.973 -169.23
## + CAtBat   1   0.02030 68.002 -169.15
## + CRuns    1   0.00374 68.019 -169.10
## + Runs     1   0.00359 68.019 -169.10
## 
## Step:  AIC=-171.23
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI + RBI
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 67.242 -171.23
## + Years    1   0.48548 66.756 -170.57
## + Assists  1   0.33082 66.911 -170.14
## + CWalks   1   0.21262 67.029 -169.82
## + Errors   1   0.15537 67.086 -169.66
## + CHmRun   1   0.07190 67.170 -169.43
## + CAtBat   1   0.02184 67.220 -169.29
## + Runs     1   0.01944 67.222 -169.28
## + HmRun    1   0.01869 67.223 -169.28
## + CRuns    1   0.00038 67.241 -169.23
# Test-set performance of the forward-selected model LM2: squared correlation
# between predicted and observed log salary on the held-out rows.
predicted_LM2 <- predict(object = LM2, newdata = testing)
Rsquared_LM2 <- cor(x = predicted_LM2, y = testing[["logSalary"]])^2
print(Rsquared_LM2)
## [1] 0.4212
# Backward elimination by AIC, starting from the full model.
# The second positional argument of step() is `scope`, not a data set, so the
# training data frame must not be passed there; with no scope given, backward
# search considers dropping each term of the starting model.
LM3 <- step(full, direction = "backward")
## Start:  AIC=-161.44
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + 
##     CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts + 
##     Assists + Errors
## 
##           Df Sum of Sq    RSS     AIC
## - CAtBat   1   0.00277 64.326 -163.43
## - CHmRun   1   0.00643 64.330 -163.42
## - CRuns    1   0.08696 64.410 -163.19
## - RBI      1   0.11183 64.435 -163.12
## - Runs     1   0.11738 64.441 -163.10
## - CHits    1   0.15150 64.475 -163.00
## - HmRun    1   0.15473 64.478 -163.00
## - CRBI     1   0.18447 64.508 -162.91
## - CWalks   1   0.55591 64.879 -161.85
## - Years    1   0.68277 65.006 -161.49
## <none>                 64.323 -161.44
## - Errors   1   1.00330 65.327 -160.58
## - Assists  1   1.61018 65.934 -158.87
## - PutOuts  1   2.12262 66.446 -157.43
## - Walks    1   2.15548 66.479 -157.34
## - Hits     1   2.36473 66.688 -156.76
## - AtBat    1   2.55156 66.875 -156.24
## 
## Step:  AIC=-163.43
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + 
##     CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts + Assists + 
##     Errors
## 
##           Df Sum of Sq    RSS     AIC
## - CHmRun   1   0.00447 64.331 -165.42
## - RBI      1   0.10978 64.436 -165.12
## - CRuns    1   0.11018 64.436 -165.12
## - Runs     1   0.12591 64.452 -165.07
## - HmRun    1   0.15858 64.485 -164.98
## - CRBI     1   0.18660 64.513 -164.90
## - CHits    1   0.30816 64.634 -164.55
## <none>                 64.326 -163.43
## - CWalks   1   0.76838 65.094 -163.24
## - Years    1   0.78033 65.106 -163.20
## - Errors   1   1.00053 65.327 -162.58
## - Assists  1   1.62479 65.951 -160.82
## - PutOuts  1   2.16701 66.493 -159.30
## - Walks    1   2.27754 66.604 -159.00
## - Hits     1   2.94702 67.273 -157.15
## - AtBat    1   3.11340 67.440 -156.69
## 
## Step:  AIC=-165.42
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + 
##     CHits + CRuns + CRBI + CWalks + PutOuts + Assists + Errors
## 
##           Df Sum of Sq    RSS     AIC
## - RBI      1   0.10979 64.440 -167.10
## - Runs     1   0.15513 64.486 -166.97
## - CRuns    1   0.21506 64.546 -166.80
## - HmRun    1   0.24283 64.573 -166.72
## - CHits    1   0.62192 64.953 -165.64
## <none>                 64.331 -165.42
## - Years    1   0.78668 65.117 -165.17
## - CWalks   1   0.87782 65.208 -164.91
## - CRBI     1   0.98899 65.320 -164.60
## - Errors   1   0.99788 65.328 -164.57
## - Assists  1   1.62108 65.952 -162.81
## - PutOuts  1   2.17446 66.505 -161.27
## - Walks    1   2.33502 66.666 -160.82
## - Hits     1   3.01160 67.342 -158.96
## - AtBat    1   3.16207 67.493 -158.54
## 
## Step:  AIC=-167.1
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits + 
##     CRuns + CRBI + CWalks + PutOuts + Assists + Errors
## 
##           Df Sum of Sq    RSS     AIC
## - CRuns    1    0.1849 64.625 -168.57
## - Runs     1    0.2007 64.641 -168.53
## - CHits    1    0.6080 65.048 -167.37
## <none>                 64.440 -167.10
## - Years    1    0.8357 65.276 -166.72
## - CRBI     1    0.8806 65.321 -166.59
## - Errors   1    0.9422 65.383 -166.42
## - CWalks   1    0.9896 65.430 -166.28
## - HmRun    1    1.1478 65.588 -165.84
## - Assists  1    1.6562 66.097 -164.41
## - PutOuts  1    2.1321 66.573 -163.08
## - Walks    1    2.7494 67.190 -161.37
## - AtBat    1    3.1033 67.544 -160.40
## - Hits     1    3.7342 68.175 -158.68
## 
## Step:  AIC=-168.57
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits + 
##     CRBI + CWalks + PutOuts + Assists + Errors
## 
##           Df Sum of Sq    RSS     AIC
## - Runs     1    0.0761 64.701 -170.36
## - Years    1    0.6994 65.325 -168.58
## <none>                 64.625 -168.57
## - CWalks   1    0.8078 65.433 -168.28
## - CRBI     1    0.8574 65.483 -168.13
## - Errors   1    1.0301 65.655 -167.65
## - HmRun    1    1.1341 65.759 -167.35
## - Assists  1    1.6612 66.287 -165.88
## - PutOuts  1    1.9977 66.623 -164.94
## - Walks    1    2.6085 67.234 -163.25
## - AtBat    1    2.9512 67.576 -162.31
## - Hits     1    3.6306 68.256 -160.46
## - CHits    1    4.4185 69.044 -158.34
## 
## Step:  AIC=-170.36
## logSalary ~ AtBat + Hits + HmRun + Walks + Years + CHits + CRBI + 
##     CWalks + PutOuts + Assists + Errors
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 64.701 -170.36
## - Years    1    0.7623 65.464 -170.19
## - CRBI     1    0.7913 65.493 -170.11
## - CWalks   1    0.8654 65.567 -169.90
## - Errors   1    0.9766 65.678 -169.59
## - HmRun    1    1.0704 65.772 -169.32
## - Assists  1    1.6525 66.354 -167.69
## - PutOuts  1    2.2163 66.918 -166.12
## - Walks    1    2.6465 67.348 -164.94
## - AtBat    1    2.9875 67.689 -164.00
## - Hits     1    4.0416 68.743 -161.15
## - CHits    1    4.3473 69.049 -160.33
# Test-set performance of the backward-selected model LM3: squared correlation
# between predicted and observed log salary on the held-out rows.
predicted_LM3 <- predict(object = LM3, newdata = testing)
Rsquared_LM3 <- cor(x = predicted_LM3, y = testing[["logSalary"]])^2
print(Rsquared_LM3)
## [1] 0.4549475

Problem 4

# Linear model on all predictors, fitted through caret with 10-fold
# cross-validation as the resampling scheme.
LM4 <- train(
  logSalary ~ .,
  data = training,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)
# Score the fitted model on the held-out rows with the same squared-correlation
# measure used for the other models.
predicted_LM4 <- predict(object = LM4, newdata = testing)
Rsquared_LM4 <- cor(x = predicted_LM4, y = testing[["logSalary"]])^2
print(Rsquared_LM4)
## [1] 0.4518025
# k-nearest-neighbours regression on all predictors, fitted through caret with
# 10-fold cross-validation as the resampling scheme.
# NOTE(review): no preProcess is requested, so the predictors presumably enter
# the distance computation on their raw scales; consider centering and scaling
# them - TODO confirm the effect on the result below.
kNN1 <- train(
  logSalary ~ .,
  data = training,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10)
)
# Score the fitted model on the held-out rows with the same squared-correlation
# measure used for the other models.
predicted_kNN1 <- predict(object = kNN1, newdata = testing)
Rsquared_kNN1 <- cor(x = predicted_kNN1, y = testing[["logSalary"]])^2
print(Rsquared_kNN1)
## [1] 0.6818339
library(earth)
## Warning: package 'earth' was built under R version 3.3.2
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.3.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.3.2
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 3.3.2
# MARS (earth) regression on all predictors, fitted through caret with 10-fold
# cross-validation as the resampling scheme.
MARS1 <- train(
  logSalary ~ .,
  data = training,
  method = "earth",
  trControl = trainControl(method = "cv", number = 10)
)
predicted_MARS1 <- predict(MARS1, newdata = testing)
# The earth predictions come back as a one-column matrix, so cor() returns a
# 1x1 matrix; as.numeric() reduces it to a plain number so that Rsquared_MARS1
# has the same form as the R-squared values of the other models.
Rsquared_MARS1 <- as.numeric(cor(predicted_MARS1, testing$logSalary))^2
Rsquared_MARS1
## [1] 0.6125382

Problem 5

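(a) The test R-squared values of the four linear models are 0.252 (LM1), 0.421 (LM2, forward selection), 0.455 (LM3, backward selection), and 0.452 (LM4, all predictors). Test-set accuracy therefore improves markedly from LM1 to LM2 and again, slightly, to LM3, while LM4 is essentially level with LM3 rather than higher. LM1 uses only two predictors (Hits and Walks), while the others use a varying number of predictors: seven in LM2, eleven in LM3, and all sixteen in LM4.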

(b) The k-nearest neighbors model seems to be the best, because it has the highest test R-squared value (0.682, compared with 0.613 for MARS and at most 0.455 for the linear models).