library(magrittr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)
knitr::kable(head(mlb <- read.csv("MLB_cleaned.csv"), n = 5))
| First.Name | Last.Name | Team | Position | Height.inches. | Weight.pounds. | Age |
|---|---|---|---|---|---|---|
| Jeff | Mathis | ANA | Catcher | 72 | 180 | 23.92 |
| Mike | Napoli | ANA | Catcher | 72 | 205 | 25.33 |
| Jose | Molina | ANA | Catcher | 74 | 220 | 31.74 |
| Howie | Kendrick | ANA | First Baseman | 70 | 180 | 23.64 |
| Kendry | Morales | ANA | First Baseman | 73 | 220 | 23.70 |
head(mlb)
## First.Name Last.Name Team Position Height.inches. Weight.pounds. Age
## 1 Jeff Mathis ANA Catcher 72 180 23.92
## 2 Mike Napoli ANA Catcher 72 205 25.33
## 3 Jose Molina ANA Catcher 74 220 31.74
## 4 Howie Kendrick ANA First Baseman 70 180 23.64
## 5 Kendry Morales ANA First Baseman 73 220 23.70
## 6 Casey Kotchman ANA First Baseman 75 210 24.02
summary(mlb)
## First.Name Last.Name Team Position
## Length:1034 Length:1034 Length:1034 Length:1034
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Height.inches. Weight.pounds. Age
## Min. :67.0 Min. :150.0 Min. :20.90
## 1st Qu.:72.0 1st Qu.:187.0 1st Qu.:25.44
## Median :74.0 Median :200.0 Median :27.93
## Mean :73.7 Mean :201.7 Mean :28.74
## 3rd Qu.:75.0 3rd Qu.:215.0 3rd Qu.:31.23
## Max. :83.0 Max. :290.0 Max. :48.52
mlb <- mlb %>% mutate(
Team = factor(Team),
Position = factor(Position),
Height.inches.= as.numeric(Height.inches.),
Weight.pounds.= as.numeric(Weight.pounds.),
Age = as.numeric(Age)
)
head(mlb)
## First.Name Last.Name Team Position Height.inches. Weight.pounds. Age
## 1 Jeff Mathis ANA Catcher 72 180 23.92
## 2 Mike Napoli ANA Catcher 72 205 25.33
## 3 Jose Molina ANA Catcher 74 220 31.74
## 4 Howie Kendrick ANA First Baseman 70 180 23.64
## 5 Kendry Morales ANA First Baseman 73 220 23.70
## 6 Casey Kotchman ANA First Baseman 75 210 24.02
summary(mlb)
## First.Name Last.Name Team Position
## Length:1034 Length:1034 NYM : 38 Relief Pitcher :315
## Class :character Class :character ATL : 37 Starting Pitcher:221
## Mode :character Mode :character DET : 37 Outfielder :194
## OAK : 37 Catcher : 76
## BOS : 36 Second Baseman : 58
## CHC : 36 First Baseman : 55
## (Other):813 (Other) :115
## Height.inches. Weight.pounds. Age
## Min. :67.0 Min. :150.0 Min. :20.90
## 1st Qu.:72.0 1st Qu.:187.0 1st Qu.:25.44
## Median :74.0 Median :200.0 Median :27.93
## Mean :73.7 Mean :201.7 Mean :28.74
## 3rd Qu.:75.0 3rd Qu.:215.0 3rd Qu.:31.23
## Max. :83.0 Max. :290.0 Max. :48.52
##
ggplot(mlb, aes(x = Height.inches., y = Weight.pounds.)) + geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
The following predictive models predict weight based on other variables in the data (position, height, age) using machine learning algorithms. I chose to implement three different algorithms in my model-Ordinary Least Squares, Ridge Regression, and Lasso Regression-to see how they compared to one another, as well as how they compared to the actual weight values observed in the data.
# Package Installation
#install.packages("caret", dependencies = c("Depends", "Suggests"))
#install.packages('gower', dependencies = TRUE)
#install.packages('parallelly', dependencies = TRUE)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Models
fitControl <- trainControl(method="repeatedcv", number=10, repeats=3)
# OLS (Lm) Model
modelOls <- train(Weight.pounds.~Position + Height.inches. + Age, data=mlb, method="lm", trControl=fitControl)
pred1 <- predict(modelOls, newdata = mlb)
# Ridge Model
modelRidge <- train(Weight.pounds.~Position + Height.inches. + Age, data=mlb, method="ridge", trControl=fitControl)
pred2 <- predict(modelRidge, newdata = mlb)
# Lasso Model
modelLasso <- train(Weight.pounds.~Position + Height.inches. + Age, data=mlb, method="lasso", trControl=fitControl)
pred3 <- predict(modelLasso, newdata = mlb)
# Summary and Box Plot Comparison
results <- resamples(list(OLS=modelOls, Ridge=modelRidge, Lasso=modelLasso))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: OLS, Ridge, Lasso
## Number of resamples: 30
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 11.58320 12.40738 13.04977 13.10605 13.69247 15.03658 0
## Ridge 10.92560 12.40732 13.28987 13.09372 13.67323 14.91554 0
## Lasso 11.52366 12.43168 13.12807 13.10145 13.80846 14.87318 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 14.11878 15.97827 16.69680 16.82685 17.45877 20.53683 0
## Ridge 13.68838 16.15525 16.93469 16.82497 17.75779 18.76432 0
## Lasso 14.72216 15.92962 16.70848 16.82537 17.51604 20.45946 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## OLS 0.1911469 0.2943103 0.3524004 0.3634273 0.4274540 0.5215746 0
## Ridge 0.1970274 0.3203493 0.3688352 0.3624446 0.4117538 0.4845154 0
## Lasso 0.2282287 0.3075130 0.3701521 0.3612728 0.3986607 0.5010668 0
bwplot(results)
# Weight Prediction Comparison Table
knitr::kable(head(mlb2 <- mlb %>% transmute(First.Name, Last.Name, Weight.pounds., Predicted.Weight.OLS = pred1, Predicted.Weight.Ridge = pred2, Predicted.Weight.Lasso = pred3)))
| First.Name | Last.Name | Weight.pounds. | Predicted.Weight.OLS | Predicted.Weight.Ridge | Predicted.Weight.Lasso |
|---|---|---|---|---|---|
| Jeff | Mathis | 180 | 195.9313 | 195.9246 | 194.4047 |
| Mike | Napoli | 205 | 197.1680 | 197.1613 | 195.6249 |
| Jose | Molina | 220 | 212.3180 | 212.3110 | 210.5862 |
| Howie | Kendrick | 180 | 188.9359 | 188.9376 | 188.9062 |
| Kendry | Morales | 220 | 203.2802 | 203.2812 | 203.0794 |
| Casey | Kotchman | 210 | 213.0886 | 213.0892 | 212.7705 |
The OLS method had the lowest MAE and RMSE and highest R-squared, indicating that it was the best fit of the 3 models in estimating weight. However, all three algorithms resulted in relatively low accuracy, suggesting that there’s a relatively weak correlation between the dependent variable (weight) and the predictor variables (height, position, and age). The models may benefit from more data and predictor variables.
The following classification models predict players’ positions using player height, weight, and age. The two methods used for comparison are KNN and LDA models.
fitControl <- trainControl(method="repeatedcv", number=10, repeats=3)
# KNN Model
modelKNN <- train(Position ~ Height.inches. + Weight.pounds. + Age, data=mlb, method="knn", trControl=fitControl)
pred4 <- predict(modelKNN, newdata = mlb)
# LDA Model
modelLDA <- train(Position ~ Height.inches. + Weight.pounds. + Age, data=mlb, method="lda", trControl=fitControl)
pred5 <- predict(modelLDA, newdata = mlb)
# Summary and Box Plot Comparison
results2 <- resamples(list(KNN=modelKNN, LDA=modelLDA))
summary(results2)
##
## Call:
## summary.resamples(object = results2)
##
## Models: KNN, LDA
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN 0.1960784 0.2409681 0.2647313 0.2675703 0.291455 0.3653846 0
## LDA 0.2403846 0.2850175 0.3170569 0.3135014 0.338989 0.3800000 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## KNN -0.05141420 0.02978701 0.06516431 0.06082350 0.08961945 0.1664845 0
## LDA -0.02176346 0.01871796 0.06974121 0.06256161 0.10332679 0.1522078 0
bwplot(results2)
# Weight Prediction Comparison Table
knitr::kable(head(mlb3 <- mlb %>% transmute(First.Name, Last.Name, Weight.pounds., Position, KNN = pred4, LDA = pred5)))
| First.Name | Last.Name | Weight.pounds. | Position | KNN | LDA |
|---|---|---|---|---|---|
| Jeff | Mathis | 180 | Catcher | Shortstop | Relief Pitcher |
| Mike | Napoli | 205 | Catcher | Relief Pitcher | Outfielder |
| Jose | Molina | 220 | Catcher | Catcher | Relief Pitcher |
| Howie | Kendrick | 180 | First Baseman | Catcher | Outfielder |
| Kendry | Morales | 220 | First Baseman | Outfielder | Relief Pitcher |
| Casey | Kotchman | 210 | First Baseman | Relief Pitcher | Relief Pitcher |
The LDA had a greater mean accuracy, but both had relatively low mean accuracy at 0.26 (KNN) and 0.32 (LDA). As expected, the accuracy decreased with the inclusion of Team as an x variable, so this was omitted from the predictor variables.