library(ISLR)
## Warning: package 'ISLR' was built under R version 3.3.2
# Start from the Hitters data frame supplied by ISLR.
Hitters_1 <- Hitters
# Drop the three factor columns by name. The original used positions
# 14, 15 and 20, which silently select the wrong columns if the column
# order ever changes.
dropped_factors <- c("League", "Division", "NewLeague")
Hitters_2 <- Hitters_1[, !(names(Hitters_1) %in% dropped_factors)]
# Keep only the rows that have no missing values.
Hitters_3 <- Hitters_2[complete.cases(Hitters_2), ]
# Model salary on the log scale (no NA pre-initialisation is needed).
Hitters_3$logSalary <- log(Hitters_3$Salary)
# Remove the raw Salary column so `logSalary ~ .` cannot use it as a predictor.
Hitters_4 <- Hitters_3[, names(Hitters_3) != "Salary"]
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.2
# Fix the RNG seed so the 70/30 split below is reproducible.
set.seed(12345)
# Row indices of the training set, sampled by caret on the response.
data_partition <- createDataPartition(
  y = Hitters_4[["logSalary"]],
  p = 0.7,
  list = FALSE
)
training <- Hitters_4[data_partition, ]
testing <- Hitters_4[-data_partition, ]
# Baseline: regress logSalary on every remaining column.
LM1 <- lm(formula = logSalary ~ ., data = training)
summary(LM1)
##
## Call:
## lm(formula = logSalary ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27147 -0.44874 0.01607 0.40936 2.79909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.677e+00 1.959e-01 23.872 <2e-16 ***
## AtBat -3.931e-03 1.523e-03 -2.582 0.0107 *
## Hits 1.445e-02 5.813e-03 2.485 0.0139 *
## HmRun 9.406e-03 1.480e-02 0.636 0.5258
## Runs -3.868e-03 6.987e-03 -0.554 0.5805
## RBI 3.493e-03 6.463e-03 0.540 0.5896
## Walks 1.066e-02 4.492e-03 2.373 0.0188 *
## Years 4.177e-02 3.128e-02 1.335 0.1836
## CAtBat -2.807e-05 3.299e-04 -0.085 0.9323
## CHits 1.097e-03 1.745e-03 0.629 0.5302
## CHmRun 5.071e-04 3.912e-03 0.130 0.8970
## CRuns 8.409e-04 1.764e-03 0.477 0.6343
## CRBI -1.265e-03 1.822e-03 -0.694 0.4886
## CWalks -9.993e-04 8.293e-04 -1.205 0.2299
## PutOuts 4.154e-04 1.765e-04 2.355 0.0197 *
## Assists 1.052e-03 5.131e-04 2.051 0.0418 *
## Errors -1.647e-02 1.017e-02 -1.619 0.1074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6188 on 168 degrees of freedom
## Multiple R-squared: 0.5713, Adjusted R-squared: 0.5305
## F-statistic: 13.99 on 16 and 168 DF, p-value: < 2.2e-16
# Refit with only the predictors flagged at the 5% level in the full model.
LM1 <- lm(
  logSalary ~ AtBat + Hits + Walks + PutOuts + Assists,
  data = training
)
summary(LM1)
##
## Call:
## lm(formula = logSalary ~ AtBat + Hits + Walks + PutOuts + Assists,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5468 -0.5977 0.1007 0.5593 2.6496
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.048e+00 1.799e-01 28.069 < 2e-16 ***
## AtBat -4.240e-03 1.648e-03 -2.573 0.010895 *
## Hits 1.892e-02 5.039e-03 3.753 0.000236 ***
## Walks 1.191e-02 3.544e-03 3.361 0.000950 ***
## PutOuts 1.423e-04 2.081e-04 0.684 0.495085
## Assists 8.827e-05 4.392e-04 0.201 0.840930
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7804 on 179 degrees of freedom
## Multiple R-squared: 0.2734, Adjusted R-squared: 0.2531
## F-statistic: 13.47 on 5 and 179 DF, p-value: 3.739e-11
# PutOuts and Assists lost significance in the reduced fit, so drop them too.
LM1 <- lm(
  logSalary ~ AtBat + Hits + Walks,
  data = training
)
summary(LM1)
##
## Call:
## lm(formula = logSalary ~ AtBat + Hits + Walks, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4733 -0.6045 0.1040 0.5608 2.6699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.057128 0.177881 28.430 < 2e-16 ***
## AtBat -0.004171 0.001563 -2.669 0.008294 **
## Hits 0.018981 0.004913 3.863 0.000156 ***
## Walks 0.012176 0.003425 3.555 0.000482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7771 on 181 degrees of freedom
## Multiple R-squared: 0.2715, Adjusted R-squared: 0.2594
## F-statistic: 22.48 on 3 and 181 DF, p-value: 2.028e-12
library(car)
## Warning: package 'car' was built under R version 3.3.2
# Variance inflation factors for the three-predictor model.
vif(LM1)
## AtBat Hits Walks
## 16.374806 15.387746 1.551907
# AtBat and Hits both have VIFs above 15, so drop AtBat and re-check.
LM1 <- lm(logSalary ~ Hits + Walks, data = training)
vif(LM1)
## Hits Walks
## 1.44903 1.44903
If we draw a horizontal line at y = 0 through the residual plot, approximately half of the points fall above the line and approximately half fall below it, which is what we want to see. There are perhaps two outlying residuals in the top-left corner of the plot, but these should not skew the fit too much. The training \(R^2\) for LM1 is .2428
# Final hand-picked linear model: logSalary explained by Hits and Walks.
LM1 <- lm(
  logSalary ~ Hits + Walks,
  data = training
)
summary(LM1)
##
## Call:
## lm(formula = logSalary ~ Hits + Walks, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5822 -0.6320 0.1774 0.5589 2.8355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.820657 0.156824 30.739 < 2e-16 ***
## Hits 0.006499 0.001533 4.239 3.56e-05 ***
## Walks 0.009822 0.003365 2.919 0.00395 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7901 on 182 degrees of freedom
## Multiple R-squared: 0.2428, Adjusted R-squared: 0.2345
## F-statistic: 29.18 on 2 and 182 DF, p-value: 1.02e-11
# Residuals against fitted values for LM1, with a dashed reference line at
# zero so the spread above and below it can be judged by eye.
plot(
  fitted(LM1), resid(LM1),
  xlab = "Fitted values", ylab = "Residuals"
)
abline(h = 0, lty = 2)
# In-sample predictions plotted against the observed response.
predicted <- predict(LM1)
plot(predicted, training$logSalary)
# Squared correlation between fitted and observed values on the training data.
RSquared_LM1 <- cor(predicted, training$logSalary)^2
RSquared_LM1
## [1] 0.2427942
# Re-check collinearity of the model being evaluated.
vif(LM1)
## Hits Walks
## 1.44903 1.44903
The testing \(R^2\) for LM1 is .2522
# Held-out performance of LM1: squared correlation between its predictions
# and the observed log salaries in `testing`.
predicted_LM1 <- predict(object = LM1, newdata = testing)
RSquared_LM1Testing <- cor(predicted_LM1, testing[["logSalary"]])^2
RSquared_LM1Testing
## [1] 0.2522379
The testing \(R^2\) for the forward-selection model (LM2) is .4212
library(leaps)
## Warning: package 'leaps' was built under R version 3.3.2
# Forward stepwise selection by AIC, growing from the intercept-only model
# towards the model that contains every predictor.
null <- lm(logSalary ~ 1, data = training)
full <- lm(logSalary ~ ., data = training)
# `step()` has no `data` argument (the fitted models already carry their
# data), and `scope` is documented as a list of formulae, so pass formulas
# rather than the fitted objects.
LM2 <- step(
  null,
  scope = list(lower = formula(null), upper = formula(full)),
  direction = "forward"
)
## Start: AIC=-36.75
## logSalary ~ 1
##
## Df Sum of Sq RSS AIC
## + CHits 1 62.279 87.758 -133.968
## + CRuns 1 61.152 88.885 -131.607
## + CAtBat 1 58.656 91.381 -126.484
## + CRBI 1 52.014 98.023 -113.503
## + CWalks 1 45.190 104.848 -101.052
## + Years 1 42.795 107.243 -96.873
## + CHmRun 1 35.394 114.643 -84.528
## + Hits 1 31.109 118.929 -77.739
## + RBI 1 28.142 121.895 -73.181
## + Runs 1 25.595 124.442 -69.355
## + AtBat 1 25.314 124.723 -68.938
## + Walks 1 25.209 124.828 -68.782
## + HmRun 1 16.054 133.983 -55.689
## + PutOuts 1 6.397 143.640 -42.813
## <none> 150.037 -36.752
## + Assists 1 0.680 149.357 -35.593
## + Errors 1 0.009 150.028 -34.763
##
## Step: AIC=-133.97
## logSalary ~ CHits
##
## Df Sum of Sq RSS AIC
## + Hits 1 13.7220 74.036 -163.42
## + Runs 1 11.9989 75.759 -159.17
## + AtBat 1 10.6337 77.124 -155.86
## + RBI 1 9.2378 78.520 -152.54
## + Walks 1 7.9072 79.851 -149.44
## + PutOuts 1 6.5531 81.205 -146.32
## + HmRun 1 4.2733 83.485 -141.20
## + CAtBat 1 3.3886 84.370 -139.25
## + Years 1 1.6035 86.155 -135.38
## + CRBI 1 1.1586 86.599 -134.43
## + Assists 1 1.0612 86.697 -134.22
## <none> 87.758 -133.97
## + CHmRun 1 0.8151 86.943 -133.69
## + CWalks 1 0.5362 87.222 -133.10
## + Errors 1 0.1170 87.641 -132.21
## + CRuns 1 0.0740 87.684 -132.12
##
## Step: AIC=-163.42
## logSalary ~ CHits + Hits
##
## Df Sum of Sq RSS AIC
## + PutOuts 1 2.30371 71.732 -167.27
## + AtBat 1 1.41545 72.621 -165.00
## + Walks 1 1.06455 72.972 -164.10
## <none> 74.036 -163.42
## + Errors 1 0.57543 73.461 -162.87
## + Years 1 0.42476 73.611 -162.49
## + CHmRun 1 0.35216 73.684 -162.31
## + CRBI 1 0.34062 73.696 -162.28
## + CAtBat 1 0.27880 73.757 -162.12
## + RBI 1 0.06229 73.974 -161.58
## + HmRun 1 0.05493 73.981 -161.56
## + Runs 1 0.05245 73.984 -161.56
## + CWalks 1 0.01684 74.019 -161.47
## + Assists 1 0.01205 74.024 -161.45
## + CRuns 1 0.00068 74.035 -161.43
##
## Step: AIC=-167.27
## logSalary ~ CHits + Hits + PutOuts
##
## Df Sum of Sq RSS AIC
## + AtBat 1 1.45545 70.277 -169.06
## <none> 71.732 -167.27
## + CRBI 1 0.69234 71.040 -167.07
## + Walks 1 0.58028 71.152 -166.77
## + Years 1 0.56417 71.168 -166.73
## + CHmRun 1 0.53506 71.197 -166.66
## + Errors 1 0.51310 71.219 -166.60
## + CAtBat 1 0.12139 71.611 -165.59
## + Runs 1 0.08114 71.651 -165.48
## + Assists 1 0.02542 71.707 -165.34
## + CRuns 1 0.01330 71.719 -165.31
## + HmRun 1 0.00192 71.731 -165.28
## + RBI 1 0.00161 71.731 -165.28
## + CWalks 1 0.00103 71.731 -165.27
##
## Step: AIC=-169.06
## logSalary ~ CHits + Hits + PutOuts + AtBat
##
## Df Sum of Sq RSS AIC
## + Walks 1 1.28214 68.995 -170.47
## <none> 70.277 -169.06
## + CRBI 1 0.57984 69.697 -168.60
## + Years 1 0.54125 69.736 -168.49
## + CHmRun 1 0.32888 69.948 -167.93
## + Assists 1 0.24828 70.029 -167.72
## + Runs 1 0.24429 70.033 -167.71
## + Errors 1 0.18449 70.093 -167.55
## + CRuns 1 0.11788 70.159 -167.37
## + RBI 1 0.06921 70.208 -167.25
## + HmRun 1 0.04273 70.234 -167.18
## + CWalks 1 0.02884 70.248 -167.14
## + CAtBat 1 0.01971 70.257 -167.12
##
## Step: AIC=-170.47
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks
##
## Df Sum of Sq RSS AIC
## + CRBI 1 0.97209 68.023 -171.09
## <none> 68.995 -170.47
## + CHmRun 1 0.66399 68.331 -170.26
## + Years 1 0.63599 68.359 -170.18
## + CWalks 1 0.58012 68.415 -170.03
## + Assists 1 0.46264 68.532 -169.72
## + Errors 1 0.06954 68.925 -168.66
## + CRuns 1 0.04122 68.954 -168.58
## + RBI 1 0.02362 68.971 -168.53
## + HmRun 1 0.01653 68.978 -168.51
## + Runs 1 0.00882 68.986 -168.49
## + CAtBat 1 0.00169 68.993 -168.47
##
## Step: AIC=-171.1
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI
##
## Df Sum of Sq RSS AIC
## + RBI 1 0.78089 67.242 -171.23
## <none> 68.023 -171.09
## + Years 1 0.54402 67.479 -170.58
## + HmRun 1 0.51320 67.510 -170.50
## + CWalks 1 0.32193 67.701 -169.97
## + Assists 1 0.22721 67.796 -169.71
## + Errors 1 0.12014 67.903 -169.42
## + CHmRun 1 0.04982 67.973 -169.23
## + CAtBat 1 0.02030 68.002 -169.15
## + CRuns 1 0.00374 68.019 -169.10
## + Runs 1 0.00359 68.019 -169.10
##
## Step: AIC=-171.23
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI + RBI
##
## Df Sum of Sq RSS AIC
## <none> 67.242 -171.23
## + Years 1 0.48548 66.756 -170.57
## + Assists 1 0.33082 66.911 -170.14
## + CWalks 1 0.21262 67.029 -169.82
## + Errors 1 0.15537 67.086 -169.66
## + CHmRun 1 0.07190 67.170 -169.43
## + CAtBat 1 0.02184 67.220 -169.29
## + Runs 1 0.01944 67.222 -169.28
## + HmRun 1 0.01869 67.223 -169.28
## + CRuns 1 0.00038 67.241 -169.23
# Held-out performance of the forward-selected model.
predicted_LM2 <- predict(object = LM2, newdata = testing)
RSquared_LM2Testing <- cor(predicted_LM2, testing[["logSalary"]])^2
RSquared_LM2Testing
## [1] 0.4212
The testing \(R^2\) for the backward-elimination model (LM3) is .4549
library(leaps)
# Backward elimination by AIC, starting from the all-predictors model `full`.
# `step()` has no `data` argument, so the stray `data = training` is dropped.
LM3 <- step(full, direction = "backward")
## Start: AIC=-161.44
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts +
## Assists + Errors
##
## Df Sum of Sq RSS AIC
## - CAtBat 1 0.00277 64.326 -163.43
## - CHmRun 1 0.00643 64.330 -163.42
## - CRuns 1 0.08696 64.410 -163.19
## - RBI 1 0.11183 64.435 -163.12
## - Runs 1 0.11738 64.441 -163.10
## - CHits 1 0.15150 64.475 -163.00
## - HmRun 1 0.15473 64.478 -163.00
## - CRBI 1 0.18447 64.508 -162.91
## - CWalks 1 0.55591 64.879 -161.85
## - Years 1 0.68277 65.006 -161.49
## <none> 64.323 -161.44
## - Errors 1 1.00330 65.327 -160.58
## - Assists 1 1.61018 65.934 -158.87
## - PutOuts 1 2.12262 66.446 -157.43
## - Walks 1 2.15548 66.479 -157.34
## - Hits 1 2.36473 66.688 -156.76
## - AtBat 1 2.55156 66.875 -156.24
##
## Step: AIC=-163.43
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts + Assists +
## Errors
##
## Df Sum of Sq RSS AIC
## - CHmRun 1 0.00447 64.331 -165.42
## - RBI 1 0.10978 64.436 -165.12
## - CRuns 1 0.11018 64.436 -165.12
## - Runs 1 0.12591 64.452 -165.07
## - HmRun 1 0.15858 64.485 -164.98
## - CRBI 1 0.18660 64.513 -164.90
## - CHits 1 0.30816 64.634 -164.55
## <none> 64.326 -163.43
## - CWalks 1 0.76838 65.094 -163.24
## - Years 1 0.78033 65.106 -163.20
## - Errors 1 1.00053 65.327 -162.58
## - Assists 1 1.62479 65.951 -160.82
## - PutOuts 1 2.16701 66.493 -159.30
## - Walks 1 2.27754 66.604 -159.00
## - Hits 1 2.94702 67.273 -157.15
## - AtBat 1 3.11340 67.440 -156.69
##
## Step: AIC=-165.42
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CHits + CRuns + CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - RBI 1 0.10979 64.440 -167.10
## - Runs 1 0.15513 64.486 -166.97
## - CRuns 1 0.21506 64.546 -166.80
## - HmRun 1 0.24283 64.573 -166.72
## - CHits 1 0.62192 64.953 -165.64
## <none> 64.331 -165.42
## - Years 1 0.78668 65.117 -165.17
## - CWalks 1 0.87782 65.208 -164.91
## - CRBI 1 0.98899 65.320 -164.60
## - Errors 1 0.99788 65.328 -164.57
## - Assists 1 1.62108 65.952 -162.81
## - PutOuts 1 2.17446 66.505 -161.27
## - Walks 1 2.33502 66.666 -160.82
## - Hits 1 3.01160 67.342 -158.96
## - AtBat 1 3.16207 67.493 -158.54
##
## Step: AIC=-167.1
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits +
## CRuns + CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - CRuns 1 0.1849 64.625 -168.57
## - Runs 1 0.2007 64.641 -168.53
## - CHits 1 0.6080 65.048 -167.37
## <none> 64.440 -167.10
## - Years 1 0.8357 65.276 -166.72
## - CRBI 1 0.8806 65.321 -166.59
## - Errors 1 0.9422 65.383 -166.42
## - CWalks 1 0.9896 65.430 -166.28
## - HmRun 1 1.1478 65.588 -165.84
## - Assists 1 1.6562 66.097 -164.41
## - PutOuts 1 2.1321 66.573 -163.08
## - Walks 1 2.7494 67.190 -161.37
## - AtBat 1 3.1033 67.544 -160.40
## - Hits 1 3.7342 68.175 -158.68
##
## Step: AIC=-168.57
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits +
## CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - Runs 1 0.0761 64.701 -170.36
## - Years 1 0.6994 65.325 -168.58
## <none> 64.625 -168.57
## - CWalks 1 0.8078 65.433 -168.28
## - CRBI 1 0.8574 65.483 -168.13
## - Errors 1 1.0301 65.655 -167.65
## - HmRun 1 1.1341 65.759 -167.35
## - Assists 1 1.6612 66.287 -165.88
## - PutOuts 1 1.9977 66.623 -164.94
## - Walks 1 2.6085 67.234 -163.25
## - AtBat 1 2.9512 67.576 -162.31
## - Hits 1 3.6306 68.256 -160.46
## - CHits 1 4.4185 69.044 -158.34
##
## Step: AIC=-170.36
## logSalary ~ AtBat + Hits + HmRun + Walks + Years + CHits + CRBI +
## CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## <none> 64.701 -170.36
## - Years 1 0.7623 65.464 -170.19
## - CRBI 1 0.7913 65.493 -170.11
## - CWalks 1 0.8654 65.567 -169.90
## - Errors 1 0.9766 65.678 -169.59
## - HmRun 1 1.0704 65.772 -169.32
## - Assists 1 1.6525 66.354 -167.69
## - PutOuts 1 2.2163 66.918 -166.12
## - Walks 1 2.6465 67.348 -164.94
## - AtBat 1 2.9875 67.689 -164.00
## - Hits 1 4.0416 68.743 -161.15
## - CHits 1 4.3473 69.049 -160.33
# Held-out performance of the backward-eliminated model.
predicted_LM3 <- predict(object = LM3, newdata = testing)
RSquared_LM3Testing <- cor(predicted_LM3, testing[["logSalary"]])^2
RSquared_LM3Testing
## [1] 0.4549475
The testing \(R^2\) for the full linear model (LM4) is .4518
# All-predictors linear model, stored under its own name for the model
# comparison. Uses `<-` for assignment like the rest of the script.
LM4 <- lm(logSalary ~ ., data = training)
summary(LM4)
##
## Call:
## lm(formula = logSalary ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.27147 -0.44874 0.01607 0.40936 2.79909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.677e+00 1.959e-01 23.872 <2e-16 ***
## AtBat -3.931e-03 1.523e-03 -2.582 0.0107 *
## Hits 1.445e-02 5.813e-03 2.485 0.0139 *
## HmRun 9.406e-03 1.480e-02 0.636 0.5258
## Runs -3.868e-03 6.987e-03 -0.554 0.5805
## RBI 3.493e-03 6.463e-03 0.540 0.5896
## Walks 1.066e-02 4.492e-03 2.373 0.0188 *
## Years 4.177e-02 3.128e-02 1.335 0.1836
## CAtBat -2.807e-05 3.299e-04 -0.085 0.9323
## CHits 1.097e-03 1.745e-03 0.629 0.5302
## CHmRun 5.071e-04 3.912e-03 0.130 0.8970
## CRuns 8.409e-04 1.764e-03 0.477 0.6343
## CRBI -1.265e-03 1.822e-03 -0.694 0.4886
## CWalks -9.993e-04 8.293e-04 -1.205 0.2299
## PutOuts 4.154e-04 1.765e-04 2.355 0.0197 *
## Assists 1.052e-03 5.131e-04 2.051 0.0418 *
## Errors -1.647e-02 1.017e-02 -1.619 0.1074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6188 on 168 degrees of freedom
## Multiple R-squared: 0.5713, Adjusted R-squared: 0.5305
## F-statistic: 13.99 on 16 and 168 DF, p-value: < 2.2e-16
# Held-out performance of the all-predictors model.
# Note: this reuses (and overwrites) the global name `predicted`.
predicted <- predict(object = LM4, newdata = testing)
RSquared_LM4Testing <- cor(predicted, testing[["logSalary"]])^2
RSquared_LM4Testing
## [1] 0.4518025
The testing \(R^2\) for the k-nearest-neighbors model is .6818
# k-nearest-neighbours regression fitted through caret, tuned by 10-fold
# cross-validation with R-squared as the selection metric.
# NOTE(review): the predictors are passed in unscaled; distance-based
# methods are usually centred/scaled first, e.g. with
# `preProcess = c("center", "scale")` — confirm whether this is intended.
kNN1 <- train(
  logSalary ~ .,
  data = training,
  method = "knn",
  metric = "Rsquared",
  maximize = TRUE,
  trControl = trainControl(method = "cv", number = 10)
)
# Held-out performance of the tuned kNN model.
predicted_kNN1 <- predict(kNN1, newdata = testing)
RSquared_kNN1 <- cor(predicted_kNN1, testing[["logSalary"]])^2
RSquared_kNN1
## [1] 0.6818339
The testing \(R^2\) for the MARS model is .6334
# MARS model fitted through caret's "earth" method, tuned by 10-fold
# cross-validation using caret's default selection metric.
MARS1 <- train(
  logSalary ~ .,
  data = training,
  method = "earth",
  trControl = trainControl(method = "cv", number = 10)
)
## Loading required package: earth
## Warning: package 'earth' was built under R version 3.3.2
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.3.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.3.2
## Loading required package: TeachingDemos
## Warning: package 'TeachingDemos' was built under R version 3.3.2
# Held-out performance of the MARS model.
predicted_MARS1 <- predict(MARS1, newdata = testing)
# The predictions come back as a one-column matrix here, which made `cor()`
# return a 1x1 matrix; flatten them so the result is a plain scalar like the
# other R-squared values.
RSquared_MARS1 <- cor(as.vector(predicted_MARS1), testing$logSalary)^2
RSquared_MARS1
## [1] 0.633359
LM1: The \(R^2\) value for the training data is .24 whereas the \(R^2\) value for the testing data is .25. They are practically the same.
LM2: The \(R^2\) value is .42
LM3: The \(R^2\) value is .4549
LM4: The \(R^2\) value is .4518
kNN: The \(R^2\) value is .6818
MARS: The \(R^2\) value is .6334
Because the k-nearest-neighbors model has the largest testing \(R^2\) value (.6818), I believe it is the best of the models considered.