# Attach readr and read the Hitters CSV from the working directory.
library(readr)
Data <- read_csv(file = "hitters.csv")
## Rows: 322 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): League, Division, NewLeague
## dbl (17): AtBat, Hits, HmRun, Runs, RBI, Walks, Years, CAtBat, CHits, CHmRun...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop every row that contains at least one missing value.
Data <- na.omit(object = Data)
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
# Fix the RNG seed so the partition below is reproducible.
set.seed(seed = 1234)

# Sample the training-row indices from Salary with p = 0.7; list = FALSE
# asks for the indices as a matrix rather than a list.
index <- createDataPartition(y = Data[["Salary"]], p = 0.7, list = FALSE)

# Rows named in `index` form the training set; the remaining rows form the
# test set.
train <- Data[index, ]
test <- Data[-index, ]
# Resampling scheme shared by the model fits below: method "cv" with number = 10.
trctrl <- trainControl(number = 10, method = "cv")
# Re-seed so the resampling folds are reproducible, then fit Salary on all
# other columns with method "rpart". tuneLength = 10 requests ten candidate
# tuning values; the one with the lowest resampled RMSE is selected.
set.seed(seed = 1234)
rtree_fit <- train(
  Salary ~ .,
  data = train,
  method = "rpart",
  metric = "RMSE",
  tuneLength = 10,
  trControl = trctrl
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Show the resampling results for each candidate tuning value.
print(rtree_fit)
## CART
##
## 187 samples
## 19 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 168, 170, 167, 168, 167, 170, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.001169156 305.5222 0.6102353 204.4492
## 0.005371228 306.9931 0.6083438 208.7565
## 0.006441837 307.4189 0.6141347 212.1654
## 0.007502022 309.7125 0.6073551 215.0887
## 0.015732260 311.5067 0.5924867 215.3907
## 0.037506815 318.6875 0.5630123 219.2902
## 0.041443657 318.5133 0.5644374 218.4298
## 0.069667656 332.0187 0.4936072 241.5012
## 0.156996992 354.1323 0.4336791 256.7751
## 0.438517298 388.2926 0.3827415 283.6147
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.001169156.
The final complexity parameter (cp) selected by cross-validation is approximately 0.0012 (cp = 0.001169156).
library(rpart.plot)
## Loading required package: rpart
# Draw the final model stored inside the fitted caret object.
rpart.plot(x = rtree_fit[["finalModel"]])
The final tree uses 8 unique predictor variables in its splits.
# Predict Salary for the held-out rows, then compare the predictions with
# the observed values (RMSE, R-squared, MAE).
yhat <- predict(rtree_fit, newdata = test)
postResample(obs = test[["Salary"]], pred = yhat)
## RMSE Rsquared MAE
## 329.270830 0.501331 205.055710
On the test data, the regression tree achieves an RMSE of 329.3 and an R-squared of 0.50.
According to the tree, a player with more than 310 career runs batted in, more than 53 walks in 1986, and more than 257 career home runs is predicted to earn an annual salary of about $1,680,000 in 1987.
# Re-seed so the resampling folds are reproducible, then fit Salary on all
# other columns with method "rf". tuneLength = 5 requests five candidate
# tuning values; the one with the lowest resampled RMSE is selected.
set.seed(seed = 1234)
model.rf <- train(
  Salary ~ .,
  data = train,
  method = "rf",
  trControl = trctrl,
  tuneLength = 5,
  metric = "RMSE"
)
# Show the resampling results for each candidate tuning value.
print(model.rf)
## Random Forest
##
## 187 samples
## 19 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 168, 170, 167, 168, 167, 170, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 246.5996 0.7372052 158.5701
## 6 248.4012 0.7321114 157.5011
## 10 249.9406 0.7324121 159.1240
## 14 252.8591 0.7256929 161.8102
## 19 252.7659 0.7259220 160.9355
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
The selected value of mtry is 2.
The mtry parameter is the number of predictors (variables) randomly sampled as candidates for splitting at each node of each individual decision tree in the forest.
# Predict Salary for the held-out rows with the random forest, then compare
# the predictions with the observed values (RMSE, R-squared, MAE).
pred <- predict(model.rf, newdata = test)
postResample(pred = pred, obs = test[["Salary"]])
## RMSE Rsquared MAE
## 273.4527833 0.5901216 179.0163477
On the test data, the random forest achieves an RMSE of 273.45 and an R-squared of 0.59.
# Compute variable importance for the fitted random forest and plot it.
model.rf |>
  varImp() |>
  plot()
In a random forest, the importance of a variable is the total decrease in
node impurity from splits on that variable, averaged over all trees.
In our case, because Salary is quantitative (a regression problem), node
impurity is measured by the residual sum of squares.
The number of times at bat during a player's career (CAtBat) is the most important variable.
The number of runs batted in during a player's career (CRBI) is the second most important variable.