Loading the Dataset

library(magrittr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
library(ggplot2)
knitr::kable(head(mlb <- read.csv("MLB_cleaned.csv"), n = 5))
First.Name Last.Name Team Position Height.inches. Weight.pounds. Age
Jeff Mathis ANA Catcher 72 180 23.92
Mike Napoli ANA Catcher 72 205 25.33
Jose Molina ANA Catcher 74 220 31.74
Howie Kendrick ANA First Baseman 70 180 23.64
Kendry Morales ANA First Baseman 73 220 23.70
head(mlb)
##   First.Name Last.Name Team      Position Height.inches. Weight.pounds.   Age
## 1       Jeff    Mathis  ANA       Catcher             72            180 23.92
## 2       Mike    Napoli  ANA       Catcher             72            205 25.33
## 3       Jose    Molina  ANA       Catcher             74            220 31.74
## 4      Howie  Kendrick  ANA First Baseman             70            180 23.64
## 5     Kendry   Morales  ANA First Baseman             73            220 23.70
## 6      Casey  Kotchman  ANA First Baseman             75            210 24.02
summary(mlb)
##   First.Name         Last.Name             Team             Position        
##  Length:1034        Length:1034        Length:1034        Length:1034       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Height.inches. Weight.pounds.       Age       
##  Min.   :67.0   Min.   :150.0   Min.   :20.90  
##  1st Qu.:72.0   1st Qu.:187.0   1st Qu.:25.44  
##  Median :74.0   Median :200.0   Median :27.93  
##  Mean   :73.7   Mean   :201.7   Mean   :28.74  
##  3rd Qu.:75.0   3rd Qu.:215.0   3rd Qu.:31.23  
##  Max.   :83.0   Max.   :290.0   Max.   :48.52

Converting Data Types

mlb <- mlb %>% mutate(
  Team = factor(Team), 
  Position = factor(Position),
  Height.inches.= as.numeric(Height.inches.),
  Weight.pounds.= as.numeric(Weight.pounds.),
  Age = as.numeric(Age)
)
head(mlb)
##   First.Name Last.Name Team      Position Height.inches. Weight.pounds.   Age
## 1       Jeff    Mathis  ANA       Catcher             72            180 23.92
## 2       Mike    Napoli  ANA       Catcher             72            205 25.33
## 3       Jose    Molina  ANA       Catcher             74            220 31.74
## 4      Howie  Kendrick  ANA First Baseman             70            180 23.64
## 5     Kendry   Morales  ANA First Baseman             73            220 23.70
## 6      Casey  Kotchman  ANA First Baseman             75            210 24.02
summary(mlb)
##   First.Name         Last.Name              Team                 Position  
##  Length:1034        Length:1034        NYM    : 38   Relief Pitcher  :315  
##  Class :character   Class :character   ATL    : 37   Starting Pitcher:221  
##  Mode  :character   Mode  :character   DET    : 37   Outfielder      :194  
##                                        OAK    : 37   Catcher         : 76  
##                                        BOS    : 36   Second Baseman  : 58  
##                                        CHC    : 36   First Baseman   : 55  
##                                        (Other):813   (Other)         :115  
##  Height.inches. Weight.pounds.       Age       
##  Min.   :67.0   Min.   :150.0   Min.   :20.90  
##  1st Qu.:72.0   1st Qu.:187.0   1st Qu.:25.44  
##  Median :74.0   Median :200.0   Median :27.93  
##  Mean   :73.7   Mean   :201.7   Mean   :28.74  
##  3rd Qu.:75.0   3rd Qu.:215.0   3rd Qu.:31.23  
##  Max.   :83.0   Max.   :290.0   Max.   :48.52  
## 

Weight by Height Scatter Plot

ggplot(mlb, aes(x = Height.inches., y = Weight.pounds.)) + geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Predictive Models

The following predictive models predict weight based on other variables in the data (position, height, age) using machine learning algorithms. I chose to implement three different algorithms in my model-Ordinary Least Squares, Ridge Regression, and Lasso Regression-to see how they compared to one another, as well as how they compared to the actual weight values observed in the data.

# Package Installation
#install.packages("caret", dependencies = c("Depends", "Suggests"))
#install.packages('gower', dependencies = TRUE)
#install.packages('parallelly', dependencies = TRUE)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Models
fitControl <- trainControl(method="repeatedcv", number=10, repeats=3)

# OLS (Lm) Model
modelOls <- train(Weight.pounds.~Position + Height.inches. + Age, data=mlb, method="lm", trControl=fitControl)
pred1 <- predict(modelOls, newdata = mlb)

# Ridge Model
modelRidge <- train(Weight.pounds.~Position + Height.inches. + Age, data=mlb, method="ridge", trControl=fitControl)
pred2 <- predict(modelRidge, newdata = mlb)

# Lasso Model
modelLasso <- train(Weight.pounds.~Position + Height.inches. + Age, data=mlb, method="lasso", trControl=fitControl)
pred3 <- predict(modelLasso, newdata = mlb)

# Summary and Box Plot Comparison
results <- resamples(list(OLS=modelOls, Ridge=modelRidge, Lasso=modelLasso))
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: OLS, Ridge, Lasso 
## Number of resamples: 30 
## 
## MAE 
##           Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## OLS   11.58320 12.40738 13.04977 13.10605 13.69247 15.03658    0
## Ridge 10.92560 12.40732 13.28987 13.09372 13.67323 14.91554    0
## Lasso 11.52366 12.43168 13.12807 13.10145 13.80846 14.87318    0
## 
## RMSE 
##           Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## OLS   14.11878 15.97827 16.69680 16.82685 17.45877 20.53683    0
## Ridge 13.68838 16.15525 16.93469 16.82497 17.75779 18.76432    0
## Lasso 14.72216 15.92962 16.70848 16.82537 17.51604 20.45946    0
## 
## Rsquared 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## OLS   0.1911469 0.2943103 0.3524004 0.3634273 0.4274540 0.5215746    0
## Ridge 0.1970274 0.3203493 0.3688352 0.3624446 0.4117538 0.4845154    0
## Lasso 0.2282287 0.3075130 0.3701521 0.3612728 0.3986607 0.5010668    0
bwplot(results)

# Weight Prediction Comparison Table
knitr::kable(head(mlb2 <- mlb %>% transmute(First.Name, Last.Name, Weight.pounds., Predicted.Weight.OLS = pred1, Predicted.Weight.Ridge = pred2, Predicted.Weight.Lasso = pred3)))
First.Name Last.Name Weight.pounds. Predicted.Weight.OLS Predicted.Weight.Ridge Predicted.Weight.Lasso
Jeff Mathis 180 195.9313 195.9246 194.4047
Mike Napoli 205 197.1680 197.1613 195.6249
Jose Molina 220 212.3180 212.3110 210.5862
Howie Kendrick 180 188.9359 188.9376 188.9062
Kendry Morales 220 203.2802 203.2812 203.0794
Casey Kotchman 210 213.0886 213.0892 212.7705

The OLS method had the lowest MAE and RMSE and highest R-squared, indicating that it was the best fit of the 3 models in estimating weight. However, all three algorithms resulted in relatively low accuracy, suggesting that there’s a relatively weak correlation between the dependent variable (weight) and the predictor variables (height, position, and age). The models may benefit from more data and predictor variables.

Classification Models

The following classification models predict players’ positions using player height, weight, and age. The two methods used for comparison are KNN and LDA models.

fitControl <- trainControl(method="repeatedcv", number=10, repeats=3)

# KNN Model
modelKNN <- train(Position ~ Height.inches. + Weight.pounds. + Age, data=mlb, method="knn", trControl=fitControl)
pred4 <- predict(modelKNN, newdata = mlb)

# LDA Model
modelLDA <- train(Position ~ Height.inches. + Weight.pounds. + Age, data=mlb, method="lda", trControl=fitControl)
pred5 <- predict(modelLDA, newdata = mlb)

# Summary and Box Plot Comparison
results2 <- resamples(list(KNN=modelKNN, LDA=modelLDA))
summary(results2)
## 
## Call:
## summary.resamples(object = results2)
## 
## Models: KNN, LDA 
## Number of resamples: 30 
## 
## Accuracy 
##          Min.   1st Qu.    Median      Mean  3rd Qu.      Max. NA's
## KNN 0.1960784 0.2409681 0.2647313 0.2675703 0.291455 0.3653846    0
## LDA 0.2403846 0.2850175 0.3170569 0.3135014 0.338989 0.3800000    0
## 
## Kappa 
##            Min.    1st Qu.     Median       Mean    3rd Qu.      Max. NA's
## KNN -0.05141420 0.02978701 0.06516431 0.06082350 0.08961945 0.1664845    0
## LDA -0.02176346 0.01871796 0.06974121 0.06256161 0.10332679 0.1522078    0
bwplot(results2)

# Weight Prediction Comparison Table
knitr::kable(head(mlb3 <- mlb %>% transmute(First.Name, Last.Name, Weight.pounds., Position, KNN = pred4, LDA = pred5)))
First.Name Last.Name Weight.pounds. Position KNN LDA
Jeff Mathis 180 Catcher Shortstop Relief Pitcher
Mike Napoli 205 Catcher Relief Pitcher Outfielder
Jose Molina 220 Catcher Catcher Relief Pitcher
Howie Kendrick 180 First Baseman Catcher Outfielder
Kendry Morales 220 First Baseman Outfielder Relief Pitcher
Casey Kotchman 210 First Baseman Relief Pitcher Relief Pitcher

The LDA had a greater mean accuracy, but both had relatively low mean accuracy at 0.26 (KNN) and 0.32 (LDA). As expected, the accuracy decreased with the inclusion of Team as an x variable, so this was omitted from the predictor variables.