knitr::opts_chunk$set(echo = TRUE)
library(ISLR)
Hitters1 <- subset(Hitters, select = -c(League, Division, NewLeague))
Hitters2 <- na.omit(Hitters1)
logSalary <- log10(Hitters2$Salary)
Hitters2$Salary <- logSalary
names(Hitters2)[names(Hitters2) == "Salary" ] <- "logSalary"
library(caret)
set.seed(12345)
trainingIndices <- createDataPartition(Hitters2$logSalary, p=0.7, list=FALSE)
training <- Hitters2[trainingIndices, ]
testing <- Hitters2[-trainingIndices, ]
LM1 <- train(logSalary~., data = training, method = 'lm')
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99149 -0.18939 0.00944 0.19067 0.49502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.915e+00 8.538e-02 22.426 <2e-16 ***
## AtBat -1.185e-03 6.687e-04 -1.772 0.0783 .
## Hits 5.732e-03 2.453e-03 2.337 0.0206 *
## HmRun 5.987e-03 6.189e-03 0.967 0.3347
## Runs -1.486e-03 3.037e-03 -0.489 0.6252
## RBI 3.102e-04 2.685e-03 0.116 0.9082
## Walks 4.925e-03 1.864e-03 2.643 0.0090 **
## Years 2.065e-02 1.181e-02 1.748 0.0822 .
## CAtBat 8.394e-05 1.408e-04 0.596 0.5518
## CHits -9.382e-05 7.283e-04 -0.129 0.8977
## CHmRun -6.723e-05 1.625e-03 -0.041 0.9670
## CRuns 6.681e-04 7.506e-04 0.890 0.3748
## CRBI -4.246e-04 7.352e-04 -0.577 0.5644
## CWalks -4.682e-04 3.448e-04 -1.358 0.1764
## PutOuts 1.086e-04 7.559e-05 1.437 0.1527
## Assists 9.614e-05 2.290e-04 0.420 0.6751
## Errors -4.120e-03 4.807e-03 -0.857 0.3927
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2614 on 168 degrees of freedom
## Multiple R-squared: 0.5899, Adjusted R-squared: 0.5508
## F-statistic: 15.1 on 16 and 168 DF, p-value: < 2.2e-16
# From the summary in 2A, only Hits and Walks had p-values below .05, so every other predictor is dropped from the model.
LM1 <- train(logSalary~Hits+Walks, data = training, method = 'lm')
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99970 -0.26649 0.07084 0.22775 0.72103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0339649 0.0660700 30.785 < 2e-16 ***
## Hits 0.0030419 0.0006671 4.560 9.39e-06 ***
## Walks 0.0047658 0.0013415 3.553 0.000486 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3292 on 182 degrees of freedom
## Multiple R-squared: 0.2952, Adjusted R-squared: 0.2875
## F-statistic: 38.11 on 2 and 182 DF, p-value: 1.493e-14
# Neither Hits nor Walks has a p-value above .05, so we keep both of them in the model.
library(car)
vifmodel <- lm(logSalary ~ Hits + Walks, data = training)
vif(vifmodel)
## Hits Walks
## 1.472601 1.472601
# The variance inflation factors for Hits and Walks are fairly low, which indicates that the two predictors are not strongly correlated with each other. Therefore, we leave them both in the model.
LM1 <- train(logSalary~Hits+Walks, data = training, method = 'lm')
predicted <- predict(LM1, newdata = training)
LM1resid <- resid(LM1)
plot(predicted, LM1resid, pch = 21, xlab = "Predicted", ylab = "Residuals", col = "red")
abline(0,0)
# The residual plot seems to show no real pattern. The residuals appear scattered randomly.
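# As an extra diagnostic (a minimal sketch, not part of the original analysis), the standard
# lm diagnostic plots can be drawn from the fitted model that caret stores in the train object:
oldpar <- par(mfrow = c(2, 2))   # arrange the four diagnostic panels in a 2x2 grid
plot(LM1$finalModel)             # residuals vs fitted, normal Q-Q, scale-location, leverage
par(oldpar)                      # restore the previous plotting layout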
plot(predicted, training$logSalary, pch = 21, xlab = "Predicted", ylab = "Observed", col = "purple")
abline(lm(training$logSalary~predicted))
summary(LM1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.99970 -0.26649 0.07084 0.22775 0.72103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0339649 0.0660700 30.785 < 2e-16 ***
## Hits 0.0030419 0.0006671 4.560 9.39e-06 ***
## Walks 0.0047658 0.0013415 3.553 0.000486 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3292 on 182 degrees of freedom
## Multiple R-squared: 0.2952, Adjusted R-squared: 0.2875
## F-statistic: 38.11 on 2 and 182 DF, p-value: 1.493e-14
# R-squared is 0.2952 in this model, which is fairly low.
vifmodel1 <- lm(training$logSalary~training$Hits+training$Walks)
vif(vifmodel1)
## training$Hits training$Walks
## 1.472601 1.472601
# The variance inflation factors for Hits and Walks are both 1.472601, which, as pointed out earlier, is fairly low.
LM1predict <- predict(LM1, newdata = testing)
summary(LM1predict)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.037 2.386 2.509 2.544 2.716 3.164
cor(LM1predict, testing$logSalary)^2
## [1] 0.153787
# When we run LM1 on the testing data, we end up with an Rsquared value of .153787, which is quite a bit lower than the training Rsquared of .2952.
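# As a sanity check (a sketch using caret's postResample() helper), the training-set and
# test-set fits can be summarized side by side with the objects already created above:
postResample(pred = predicted,  obs = training$logSalary)  # RMSE / Rsquared / MAE on training
postResample(pred = LM1predict, obs = testing$logSalary)   # RMSE / Rsquared / MAE on testing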
# step() comes with base R's stats package, so no extra package is needed for stepwise selection.
LM2 <- step(lm(logSalary ~ 1, data = training), direction = "forward",
            scope = ~ AtBat + Hits + HmRun + Runs + RBI + Walks + Years + CAtBat + CHits +
              CHmRun + CRuns + CRBI + CWalks + PutOuts + Assists + Errors)
LM2predict <- predict(LM2, newdata = testing)
cor(LM2predict, testing$logSalary)^2
# When we run LM2 on the testing data, we end up with an Rsquared value of .4384341, which is higher than we got in LM1.
LM3 <- step(lm(logSalary ~ ., data = training), direction = "backward")
LM3predict <- predict(LM3, newdata = testing)
cor(LM3predict, testing$logSalary)^2
# When we run LM3 on the testing data, we end up with an Rsquared value of .4384241, which is essentially the same Rsquared that we got for LM2.
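# To see exactly which predictors each stepwise procedure retained (a quick sketch; step()
# returns ordinary lm objects, so coef() works on them directly):
names(coef(LM2))[-1]   # predictors kept by the forward stepwise model
names(coef(LM3))[-1]   # predictors kept by the backward stepwise model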
LM4 <- train(logSalary~., data = training, method = 'lm', trControl = trainControl(method = "cv", number = 10))
LM4predict <- predict(LM4, newdata = testing)
cor(LM4predict, testing$logSalary)^2
## [1] 0.4251033
# When we run LM4 on the testing data, we end up with an Rsquared value of .4251033, which is slightly lower than LM2 and LM3.
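# The cross-validated estimates that caret computed while fitting LM4 can be inspected directly
# (a sketch; the exact numbers depend on the random fold assignment):
LM4$results    # RMSE, Rsquared and MAE averaged over the 10 folds
LM4$resample   # the same metrics for each individual fold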
kNN1 <- train(logSalary~., data = training, method = 'knn', trControl = trainControl(method = "cv", number = 10))
kNN1predict <- predict(kNN1, newdata = testing)
cor(kNN1predict, testing$logSalary)^2
## [1] 0.5343042
# When we run kNN1 on the testing data, we end up with an Rsquared value of .5343042, which is higher than all of the LM models.
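# caret tunes the number of neighbors k for the kNN model by cross-validation; the candidate
# values it tried and the one it selected can be checked (a sketch):
kNN1$results    # CV performance for each candidate k
kNN1$bestTune   # the value of k that was chosen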
MARS1 <- train(logSalary~., data = training, method = 'earth', trControl = trainControl(method = "cv", number = 10))
MARS1predict <- predict(MARS1, newdata = testing)
cor(MARS1predict, testing$logSalary)^2
## [,1]
## y 0.59092
# When we run MARS1 on the testing data, we end up with an Rsquared value of .59092, which is the highest out of all the models we ran for this project.
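# The tuning parameters caret selected for the MARS model, and the hinge functions kept in the
# final fit, can be examined through the underlying earth object (a sketch):
MARS1$bestTune              # selected nprune (number of terms) and degree
summary(MARS1$finalModel)   # the basis functions retained in the final MARS model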
LM1-LM4 are all linear models that are coded in different ways and use slightly different methods. In LM1, we built the model ourselves by running a summary and manually removing the predictors with p-values above .05. LM2 was a forward stepwise model that started with no predictors and added them one at a time as long as doing so lowered the AIC. LM3 was a backward stepwise model that started with all of the predictors and eliminated the ones whose removal lowered the AIC. LM4 was another linear model fit with 10-fold cross-validation, which repeatedly trains on nine folds and validates on the held-out fold, giving a less biased picture of how the model will perform on new data than the training fit alone. We got a very low Rsquared for LM1 of .153787, which makes sense because that model used only two predictors. LM2 and LM3 had nearly identical Rsquared values of about .438, the highest of the four linear models, which means their predictions generalize best to new data among the linear fits. LM2 kept 10 of the original predictors and LM3 kept 6 after their stepwise procedures. LM4 had an Rsquared value of .4251033, slightly lower than LM2 and LM3. LM4 used all of the original predictors (cross-validation only estimates performance; it does not remove variables), so collinearity among the predictors may explain why its Rsquared is not quite as high as LM2's and LM3's. All four of the LM models are highly interpretable: the coefficients describe how logSalary changes with each one-unit increase in a predictor, holding everything else equal.
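To make the comparison above easier to see at a glance, the test-set Rsquared values for all six models can be collected into a single table. This is a minimal sketch that simply reuses the prediction objects created earlier; the helper vector testRsq is introduced here purely for presentation.

testRsq <- c(LM1   = cor(LM1predict,  testing$logSalary)^2,
             LM2   = cor(LM2predict,  testing$logSalary)^2,
             LM3   = cor(LM3predict,  testing$logSalary)^2,
             LM4   = cor(LM4predict,  testing$logSalary)^2,
             kNN1  = cor(kNN1predict, testing$logSalary)^2,
             MARS1 = cor(as.vector(MARS1predict), testing$logSalary)^2)
round(sort(testRsq, decreasing = TRUE), 4)   # models ordered from best to worst test Rsquared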
Out of the six models, we believe that the MARS1 model is the best because it has the highest Rsquared of all the models (.59092). This means that the MARS1 model has the highest predictive ability when given new data (such as the testing data).
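As a complementary check, caret's resamples() helper can compare the cross-validated performance of the three models that were trained with 10-fold cross-validation. This is only a rough comparison, since the models were not forced to use identical folds, but it gives another view of the same conclusion.

cvComparison <- resamples(list(LM4 = LM4, kNN1 = kNN1, MARS1 = MARS1))
summary(cvComparison)   # distribution of RMSE and Rsquared across the folds for each model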