library(ISLR)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Remove the three categorical columns so only the remaining fields are used.
NewHit <- Hitters[, !(names(Hitters) %in% c("League", "Division", "NewLeague"))]

# Keep only the rows that have no missing values.
CleanHit <- NewHit[complete.cases(NewHit), ]

# The response is modelled on the log10 scale; rename it to make that explicit.
names(CleanHit)[names(CleanHit) == "Salary"] <- "logSalary"
CleanHit$logSalary <- log10(CleanHit$logSalary)

# Reproducible 70/30 split on the response.
set.seed(12345)
SeperateData <- createDataPartition(
  y = CleanHit$logSalary,
  p = 0.7,
  times = 1,
  list = FALSE
)
Train <- CleanHit[SeperateData, ]
Test <- CleanHit[-SeperateData, ]
# Baseline: least-squares fit on every remaining predictor, run through caret.
# No trControl is supplied, so caret's default resampling scheme applies.
LM1 <- train(
  logSalary ~ .,
  data = Train,
  method = "lm",
  metric = "Rsquared",
  maximize = TRUE
)
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55219 -0.19488 0.00698 0.17778 1.21563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.031e+00 8.509e-02 23.872 <2e-16 ***
## AtBat -1.707e-03 6.613e-04 -2.582 0.0107 *
## Hits 6.274e-03 2.525e-03 2.485 0.0139 *
## HmRun 4.085e-03 6.426e-03 0.636 0.5258
## Runs -1.680e-03 3.034e-03 -0.554 0.5805
## RBI 1.517e-03 2.807e-03 0.540 0.5896
## Walks 4.629e-03 1.951e-03 2.373 0.0188 *
## Years 1.814e-02 1.359e-02 1.335 0.1836
## CAtBat -1.219e-05 1.433e-04 -0.085 0.9323
## CHits 4.766e-04 7.577e-04 0.629 0.5302
## CHmRun 2.202e-04 1.699e-03 0.130 0.8970
## CRuns 3.652e-04 7.663e-04 0.477 0.6343
## CRBI -5.492e-04 7.912e-04 -0.694 0.4886
## CWalks -4.340e-04 3.602e-04 -1.205 0.2299
## PutOuts 1.804e-04 7.663e-05 2.355 0.0197 *
## Assists 4.570e-04 2.228e-04 2.051 0.0418 *
## Errors -7.152e-03 4.418e-03 -1.619 0.1074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2687 on 168 degrees of freedom
## Multiple R-squared: 0.5713, Adjusted R-squared: 0.5305
## F-statistic: 13.99 on 16 and 168 DF, p-value: < 2.2e-16
# Refit using only the predictors flagged at the 5% level in the full model
# (AtBat, Hits, Walks, PutOuts, Assists). This overwrites the previous LM1.
LM1 <- train(
  logSalary ~ AtBat + Hits + Walks + PutOuts + Assists,
  data = Train,
  method = "lm",
  metric = "Rsquared",
  maximize = TRUE
)
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67178 -0.25957 0.04372 0.24289 1.15069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.193e+00 7.811e-02 28.069 < 2e-16 ***
## AtBat -1.841e-03 7.156e-04 -2.573 0.010895 *
## Hits 8.215e-03 2.189e-03 3.753 0.000236 ***
## Walks 5.173e-03 1.539e-03 3.361 0.000950 ***
## PutOuts 6.180e-05 9.039e-05 0.684 0.495085
## Assists 3.834e-05 1.907e-04 0.201 0.840930
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3389 on 179 degrees of freedom
## Multiple R-squared: 0.2734, Adjusted R-squared: 0.2531
## F-statistic: 13.47 on 5 and 179 DF, p-value: 3.739e-11
# PutOuts and Assists were not significant in the five-predictor fit, so they
# are dropped. This overwrites the previous LM1.
LM1 <- train(
  logSalary ~ AtBat + Hits + Walks,
  data = Train,
  method = "lm",
  metric = "Rsquared",
  maximize = TRUE
)
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63985 -0.26253 0.04515 0.24357 1.15954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1962828 0.0772528 28.430 < 2e-16 ***
## AtBat -0.0018114 0.0006786 -2.669 0.008294 **
## Hits 0.0082434 0.0021339 3.863 0.000156 ***
## Walks 0.0052878 0.0014874 3.555 0.000482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3375 on 181 degrees of freedom
## Multiple R-squared: 0.2715, Adjusted R-squared: 0.2594
## F-statistic: 22.48 on 3 and 181 DF, p-value: 2.028e-12
library(car)
## Loading required package: carData
# Collinearity check: variance inflation factors for the three-predictor model.
TestingCor <- lm(logSalary ~ AtBat + Hits + Walks, data = Train)
vif(TestingCor)
## AtBat Hits Walks
## 16.374806 15.387746 1.551907
# Same check after removing AtBat from the model.
TestingCor <- update(TestingCor, . ~ . - AtBat)
vif(TestingCor)
## Hits Walks
## 1.44903 1.44903
##In the first VIF check, AtBat had a VIF of 16. This is a very large VIF, indicating that the variable is extremely correlated with the other predictors. I removed that variable from my model and tested the VIFs again to see whether the VIF for Hits would decrease with the removal of AtBat, which it did. After running the new model, the VIF for Hits decreased to 1.449.
# Fitted values on the training data.
# The original call passed `data = traindata`. predict() takes `newdata`, not
# `data`, and `traindata` is never defined, so the argument was silently
# ignored. Request the training-set predictions explicitly instead.
# NOTE(review): LM1 is still the AtBat + Hits + Walks fit at this point; the
# VIF check above did not refit LM1 without AtBat -- confirm which model is
# intended here.
PV <- predict(LM1, newdata = Train)
LM1R <- resid(LM1)

# Residuals vs. fitted values.
plot(PV, LM1R, col = "blue", xlab = "Predicted", ylab = "Residuals")
## There are no apparent patterns

# Observed vs. fitted values.
plot(PV, Train$logSalary, col = "blue", xlab = "Predicted", ylab = "Observed")
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63985 -0.26253 0.04515 0.24357 1.15954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1962828 0.0772528 28.430 < 2e-16 ***
## AtBat -0.0018114 0.0006786 -2.669 0.008294 **
## Hits 0.0082434 0.0021339 3.863 0.000156 ***
## Walks 0.0052878 0.0014874 3.555 0.000482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3375 on 181 degrees of freedom
## Multiple R-squared: 0.2715, Adjusted R-squared: 0.2594
## F-statistic: 22.48 on 3 and 181 DF, p-value: 2.028e-12
# VIFs for the Hits + Walks model. The formula now uses bare column names with
# `data = Train` rather than `Train$...` terms, so the coefficients carry the
# plain variable names and the fit can be reused with other data frames.
vif(lm(logSalary ~ Hits + Walks, data = Train))
## Hits Walks
## 1.44903 1.44903
# Hold-out predictions from LM1 and a summary of their distribution.
PLM1 <- predict(LM1, newdata = Test)
summary(PLM1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.217 2.381 2.540 2.562 2.684 3.407
# Test-set R-squared: 1 - SSE / SST, with SST taken about the test-set mean.
RLM1 <- with(
  Test,
  1 - sum((logSalary - PLM1)^2) / sum((logSalary - mean(logSalary))^2)
)
RLM1
## [1] 0.2278484
# Training-set R-squared for LM1.
# Fix: the original subtracted PLM1 (predictions for the Test rows) from
# Train$logSalary. The two vectors have different lengths, so R recycled the
# shorter one with a warning and the result (-0.323474) was meaningless.
# Predictions for the training rows are used instead.
# NOTE(review): RLM2 is reassigned later with the LM2 test-set R-squared.
PLM1Train <- predict(LM1, newdata = Train)
RLM2 <- 1 - sum((Train$logSalary - PLM1Train)^2) /
  sum((Train$logSalary - mean(Train$logSalary))^2)
RLM2
## Should equal the Multiple R-squared reported by summary(LM1) (0.2715),
## because this is 1 - RSS/TSS on the data the model was fit to.
library(leaps)
# Forward stepwise selection by AIC. It starts from the intercept-only model
# and may add any of the 16 predictors named in `scope`.
# The original call also passed `data = Train` to step() itself. step() has no
# such argument (it falls into `...`), so it is dropped; the data set is
# already carried by the starting lm() fit. The redundant parentheses around
# the starting formula are removed as well.
LM2 <- step(
  lm(logSalary ~ 1, data = Train),
  scope = ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + CAtBat +
    CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts + Assists + Errors,
  direction = "forward"
)
## Start: AIC=-345.34
## logSalary ~ 1
##
## Df Sum of Sq RSS AIC
## + CHits 1 11.7466 16.552 -442.56
## + CRuns 1 11.5340 16.765 -440.20
## + CAtBat 1 11.0633 17.235 -435.08
## + CRBI 1 9.8105 18.488 -422.10
## + CWalks 1 8.5233 19.776 -409.64
## + Years 1 8.0716 20.227 -405.47
## + CHmRun 1 6.6758 21.623 -393.12
## + Hits 1 5.8675 22.431 -386.33
## + RBI 1 5.3079 22.991 -381.77
## + Runs 1 4.8275 23.471 -377.95
## + AtBat 1 4.7745 23.524 -377.53
## + Walks 1 4.7547 23.544 -377.37
## + HmRun 1 3.0280 25.271 -364.28
## + PutOuts 1 1.2066 27.092 -351.41
## <none> 28.299 -345.34
## + Assists 1 0.1283 28.170 -344.19
## + Errors 1 0.0017 28.297 -343.36
##
## Step: AIC=-442.56
## logSalary ~ CHits
##
## Df Sum of Sq RSS AIC
## + Hits 1 2.58812 13.964 -472.02
## + Runs 1 2.26313 14.289 -467.76
## + AtBat 1 2.00564 14.547 -464.46
## + RBI 1 1.74236 14.810 -461.14
## + Walks 1 1.49138 15.061 -458.03
## + PutOuts 1 1.23599 15.316 -454.92
## + HmRun 1 0.80599 15.746 -449.79
## + CAtBat 1 0.63913 15.913 -447.84
## + Years 1 0.30244 16.250 -443.97
## + CRBI 1 0.21853 16.334 -443.02
## + Assists 1 0.20015 16.352 -442.81
## <none> 16.552 -442.56
## + CHmRun 1 0.15373 16.398 -442.29
## + CWalks 1 0.10113 16.451 -441.69
## + Errors 1 0.02206 16.530 -440.81
## + CRuns 1 0.01395 16.538 -440.72
##
## Step: AIC=-472.02
## logSalary ~ CHits + Hits
##
## Df Sum of Sq RSS AIC
## + PutOuts 1 0.43451 13.530 -475.86
## + AtBat 1 0.26697 13.697 -473.59
## + Walks 1 0.20079 13.763 -472.69
## <none> 13.964 -472.02
## + Errors 1 0.10853 13.856 -471.46
## + Years 1 0.08011 13.884 -471.08
## + CHmRun 1 0.06642 13.898 -470.90
## + CRBI 1 0.06425 13.900 -470.87
## + CAtBat 1 0.05259 13.912 -470.71
## + RBI 1 0.01175 13.952 -470.17
## + HmRun 1 0.01036 13.954 -470.15
## + Runs 1 0.00989 13.954 -470.15
## + CWalks 1 0.00318 13.961 -470.06
## + Assists 1 0.00227 13.962 -470.05
## + CRuns 1 0.00013 13.964 -470.02
##
## Step: AIC=-475.86
## logSalary ~ CHits + Hits + PutOuts
##
## Df Sum of Sq RSS AIC
## + AtBat 1 0.274515 13.255 -477.66
## <none> 13.530 -475.86
## + CRBI 1 0.130584 13.399 -475.66
## + Walks 1 0.109448 13.420 -475.37
## + Years 1 0.106408 13.423 -475.32
## + CHmRun 1 0.100918 13.429 -475.25
## + Errors 1 0.096776 13.433 -475.19
## + CAtBat 1 0.022896 13.507 -474.18
## + Runs 1 0.015303 13.514 -474.07
## + Assists 1 0.004794 13.525 -473.93
## + CRuns 1 0.002509 13.527 -473.90
## + HmRun 1 0.000363 13.529 -473.87
## + RBI 1 0.000304 13.529 -473.87
## + CWalks 1 0.000195 13.529 -473.87
##
## Step: AIC=-477.66
## logSalary ~ CHits + Hits + PutOuts + AtBat
##
## Df Sum of Sq RSS AIC
## + Walks 1 0.241828 13.013 -479.06
## <none> 13.255 -477.66
## + CRBI 1 0.109365 13.146 -477.19
## + Years 1 0.102086 13.153 -477.09
## + CHmRun 1 0.062031 13.193 -476.52
## + Assists 1 0.046828 13.208 -476.31
## + Runs 1 0.046075 13.209 -476.30
## + Errors 1 0.034797 13.220 -476.14
## + CRuns 1 0.022234 13.233 -475.97
## + RBI 1 0.013055 13.242 -475.84
## + HmRun 1 0.008060 13.247 -475.77
## + CWalks 1 0.005440 13.250 -475.73
## + CAtBat 1 0.003718 13.251 -475.71
##
## Step: AIC=-479.06
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks
##
## Df Sum of Sq RSS AIC
## + CRBI 1 0.183348 12.830 -479.69
## <none> 13.013 -479.06
## + CHmRun 1 0.125236 12.888 -478.85
## + Years 1 0.119955 12.893 -478.78
## + CWalks 1 0.109417 12.904 -478.62
## + Assists 1 0.087260 12.926 -478.31
## + Errors 1 0.013117 13.000 -477.25
## + CRuns 1 0.007775 13.005 -477.17
## + RBI 1 0.004455 13.009 -477.13
## + HmRun 1 0.003118 13.010 -477.11
## + Runs 1 0.001663 13.012 -477.09
## + CAtBat 1 0.000319 13.013 -477.07
##
## Step: AIC=-479.69
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI
##
## Df Sum of Sq RSS AIC
## + RBI 1 0.147284 12.683 -479.82
## <none> 12.830 -479.69
## + Years 1 0.102608 12.727 -479.17
## + HmRun 1 0.096796 12.733 -479.09
## + CWalks 1 0.060720 12.769 -478.56
## + Assists 1 0.042854 12.787 -478.31
## + Errors 1 0.022660 12.807 -478.01
## + CHmRun 1 0.009396 12.820 -477.82
## + CAtBat 1 0.003829 12.826 -477.74
## + CRuns 1 0.000706 12.829 -477.70
## + Runs 1 0.000676 12.829 -477.70
##
## Step: AIC=-479.82
## logSalary ~ CHits + Hits + PutOuts + AtBat + Walks + CRBI + RBI
##
## Df Sum of Sq RSS AIC
## <none> 12.683 -479.82
## + Years 1 0.091567 12.591 -479.16
## + Assists 1 0.062397 12.620 -478.74
## + CWalks 1 0.040103 12.643 -478.41
## + Errors 1 0.029305 12.653 -478.25
## + CHmRun 1 0.013561 12.669 -478.02
## + CAtBat 1 0.004120 12.678 -477.88
## + Runs 1 0.003667 12.679 -477.88
## + HmRun 1 0.003525 12.679 -477.87
## + CRuns 1 0.000071 12.682 -477.82
# Test-set predictions and R-squared (1 - SSE / SST) for the forward-selected
# model.
PLM2 <- predict(LM2, newdata = Test)
RLM2 <- local({
  sse <- sum((Test$logSalary - PLM2)^2)
  sst <- sum((Test$logSalary - mean(Test$logSalary))^2)
  1 - sse / sst
})
RLM2
## [1] 0.3068857
library(leaps)
# Backward elimination by AIC, starting from the model with all 16 predictors.
LM3 <- step(
  lm(
    logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + CAtBat +
      CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts + Assists + Errors,
    data = Train
  ),
  direction = "backward"
)
## Start: AIC=-470.03
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CAtBat + CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts +
## Assists + Errors
##
## Df Sum of Sq RSS AIC
## - CAtBat 1 0.00052 12.133 -472.02
## - CHmRun 1 0.00121 12.133 -472.01
## - CRuns 1 0.01640 12.149 -471.78
## - RBI 1 0.02109 12.153 -471.71
## - Runs 1 0.02214 12.154 -471.69
## - CHits 1 0.02858 12.161 -471.60
## - HmRun 1 0.02918 12.161 -471.59
## - CRBI 1 0.03479 12.167 -471.50
## - CWalks 1 0.10485 12.237 -470.44
## - Years 1 0.12878 12.261 -470.08
## <none> 12.132 -470.03
## - Errors 1 0.18923 12.321 -469.17
## - Assists 1 0.30370 12.436 -467.46
## - PutOuts 1 0.40035 12.533 -466.03
## - Walks 1 0.40655 12.539 -465.93
## - Hits 1 0.44602 12.578 -465.35
## - AtBat 1 0.48125 12.613 -464.84
##
## Step: AIC=-472.02
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CHits + CHmRun + CRuns + CRBI + CWalks + PutOuts + Assists +
## Errors
##
## Df Sum of Sq RSS AIC
## - CHmRun 1 0.00084 12.133 -474.01
## - RBI 1 0.02071 12.153 -473.71
## - CRuns 1 0.02078 12.153 -473.71
## - Runs 1 0.02375 12.156 -473.66
## - HmRun 1 0.02991 12.163 -473.57
## - CRBI 1 0.03519 12.168 -473.49
## - CHits 1 0.05812 12.191 -473.14
## <none> 12.133 -472.02
## - CWalks 1 0.14493 12.278 -471.83
## - Years 1 0.14718 12.280 -471.79
## - Errors 1 0.18871 12.321 -471.17
## - Assists 1 0.30645 12.439 -469.41
## - PutOuts 1 0.40872 12.541 -467.89
## - Walks 1 0.42957 12.562 -467.59
## - Hits 1 0.55584 12.688 -465.74
## - AtBat 1 0.58722 12.720 -465.28
##
## Step: AIC=-474.01
## logSalary ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years +
## CHits + CRuns + CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - RBI 1 0.02071 12.154 -475.70
## - Runs 1 0.02926 12.163 -475.57
## - CRuns 1 0.04056 12.174 -475.39
## - HmRun 1 0.04580 12.179 -475.31
## - CHits 1 0.11730 12.251 -474.23
## <none> 12.133 -474.01
## - Years 1 0.14838 12.282 -473.76
## - CWalks 1 0.16557 12.299 -473.50
## - CRBI 1 0.18653 12.320 -473.19
## - Errors 1 0.18821 12.322 -473.16
## - Assists 1 0.30575 12.439 -471.41
## - PutOuts 1 0.41013 12.544 -469.86
## - Walks 1 0.44041 12.574 -469.42
## - Hits 1 0.56802 12.701 -467.55
## - AtBat 1 0.59640 12.730 -467.13
##
## Step: AIC=-475.7
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits +
## CRuns + CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - CRuns 1 0.03487 12.189 -477.17
## - Runs 1 0.03785 12.192 -477.12
## - CHits 1 0.11468 12.269 -475.96
## <none> 12.154 -475.70
## - Years 1 0.15762 12.312 -475.31
## - CRBI 1 0.16609 12.320 -475.18
## - Errors 1 0.17770 12.332 -475.01
## - CWalks 1 0.18665 12.341 -474.88
## - HmRun 1 0.21648 12.371 -474.43
## - Assists 1 0.31239 12.467 -473.00
## - PutOuts 1 0.40215 12.556 -471.67
## - Walks 1 0.51856 12.673 -469.97
## - AtBat 1 0.58532 12.739 -468.99
## - Hits 1 0.70431 12.858 -467.27
##
## Step: AIC=-477.17
## logSalary ~ AtBat + Hits + HmRun + Runs + Walks + Years + CHits +
## CRBI + CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## - Runs 1 0.01435 12.203 -478.95
## - Years 1 0.13192 12.321 -477.17
## <none> 12.189 -477.17
## - CWalks 1 0.15235 12.341 -476.87
## - CRBI 1 0.16172 12.351 -476.73
## - Errors 1 0.19429 12.383 -476.24
## - HmRun 1 0.21391 12.403 -475.95
## - Assists 1 0.31333 12.502 -474.47
## - PutOuts 1 0.37678 12.566 -473.53
## - Walks 1 0.49200 12.681 -471.85
## - AtBat 1 0.55664 12.746 -470.90
## - Hits 1 0.68477 12.874 -469.05
## - CHits 1 0.83339 13.023 -466.93
##
## Step: AIC=-478.95
## logSalary ~ AtBat + Hits + HmRun + Walks + Years + CHits + CRBI +
## CWalks + PutOuts + Assists + Errors
##
## Df Sum of Sq RSS AIC
## <none> 12.203 -478.95
## - Years 1 0.14377 12.347 -478.78
## - CRBI 1 0.14924 12.353 -478.70
## - CWalks 1 0.16323 12.367 -478.49
## - Errors 1 0.18420 12.388 -478.18
## - HmRun 1 0.20189 12.405 -477.91
## - Assists 1 0.31167 12.515 -476.28
## - PutOuts 1 0.41803 12.621 -474.72
## - Walks 1 0.49916 12.703 -473.53
## - AtBat 1 0.56348 12.767 -472.60
## - Hits 1 0.76229 12.966 -469.74
## - CHits 1 0.81996 13.023 -468.92
# Test-set predictions and R-squared (1 - SSE / SST) for the backward-selected
# model.
PLM3 <- predict(LM3, newdata = Test)
RLM3 <- with(
  Test,
  1 - sum((logSalary - PLM3)^2) / sum((logSalary - mean(logSalary))^2)
)
RLM3
## [1] 0.3632116
# Full linear model again, this time assessed with 10-fold cross-validated
# RMSE.
# Fix: RMSE is an error measure and must be minimised, so `maximize` is now
# FALSE (the original passed TRUE). The "lm" method has no tuning candidates
# to choose between, so the fitted model and the test-set R-squared below are
# not expected to change.
LM4 <- train(
  logSalary ~ .,
  data = Train,
  method = "lm",
  metric = "RMSE",
  maximize = FALSE,
  trControl = trainControl(method = "cv", number = 10)
)

# Test-set predictions and R-squared (1 - SSE / SST).
PLM4 <- predict(LM4, newdata = Test)
RLM4 <- 1 - sum((Test$logSalary - PLM4)^2) /
  sum((Test$logSalary - mean(Test$logSalary))^2)
RLM4
## [1] 0.3497916
# k-nearest-neighbours regression, with k chosen by 10-fold cross-validation.
# Fix: RMSE must be minimised. With the original `maximize = TRUE`, caret
# selected the candidate k with the *largest* cross-validated RMSE.
# NOTE(review): the selected k, and any test-set figures recorded from the
# earlier run, may change after this correction -- re-run to refresh them.
# NOTE(review): predictors are not centred or scaled before distances are
# computed; consider preProcess = c("center", "scale"). Left unchanged here.
KNN1 <- train(
  logSalary ~ .,
  data = Train,
  method = "knn",
  metric = "RMSE",
  maximize = FALSE,
  trControl = trainControl(method = "cv", number = 10)
)
summary(KNN1)
## Length Class Mode
## learn 2 -none- list
## k 1 -none- numeric
## theDots 0 -none- list
## xNames 16 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 1 -none- logical
## param 0 -none- list
# Test-set predictions and R-squared (1 - SSE / SST) for the KNN model.
PKNN1 <- predict(KNN1, newdata = Test)
RKNN1 <- local({
  sse <- sum((Test$logSalary - PKNN1)^2)
  sst <- sum((Test$logSalary - mean(Test$logSalary))^2)
  1 - sse / sst
})
RKNN1
## [1] 0.6898984
library(earth)
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
# MARS (earth) fit on all predictors; tuning is judged by 10-fold
# cross-validated R-squared, which is maximised.
MARS1 <- train(
  logSalary ~ .,
  data = Train,
  method = "earth",
  metric = "Rsquared",
  maximize = TRUE,
  trControl = trainControl(method = "cv", number = 10)
)

# Test-set predictions and R-squared (1 - SSE / SST).
PMARS1 <- predict(MARS1, newdata = Test)
RMARS1 <- 1 - sum((Test$logSalary - PMARS1)^2) /
  sum((Test$logSalary - mean(Test$logSalary))^2)
RMARS1
## [1] 0.5692355
LM1 was the simplest linear model of the group. As a result, LM1's R-squared was extremely poor: its adjusted R-squared on the training data was 0.2594, the lowest R-squared in the group. LM3 had a test-set R-squared of 0.36. LM3 considered one predictor at a time, while LM1 used multiple predictors at the same time. LM3 used backward stepwise selection and LM2 used forward stepwise selection. LM2 had a test-set R-squared of 0.30688 and LM3 had a test-set R-squared of 0.36. LM4 left all the variables in the model, whereas LM1 selected variables with p < 0.05. For LM1, the VIF was calculated for the variables and those with a VIF above 10 were removed.
The KNN model was the best of all my models, with a test-set R-squared of 0.6898 in problem 4B. It had the highest R-squared in the group. The KNN model gives higher weights to the neighbors that are closer, so that more distant neighbors do not contribute as much. The KNN model also focuses only on the data at hand, since it is a nonparametric model.