library(MASS)
library(rpart.plot)
## Loading required package: rpart
library(ggplot2)
library(ggfortify)
library(gridExtra)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ fma       2.5     ✔ expsmooth 2.3
## 
library(fma)
library(kableExtra)
library(e1071)
library(mlbench)
library(ggcorrplot)
library(DataExplorer)
library(timeDate)
## 
## Attaching package: 'timeDate'
## The following objects are masked from 'package:e1071':
## 
##     kurtosis, skewness
library(caret)
## Loading required package: lattice
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:fma':
## 
##     pigs
library(corrplot)
## corrplot 0.92 loaded
library(RColorBrewer)
library(tibble)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine()    masks gridExtra::combine()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ purrr::lift()       masks caret::lift()
## ✖ dplyr::select()     masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(mixtools)
## mixtools package, version 2.0.0, Released 2022-12-04
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772 and the Chan Zuckerberg Initiative: Essential Open Source Software for Science (Grant No. 2020-255193).
library(naniar)
library(ggpmisc)
## Loading required package: ggpp
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
## 
## Attaching package: 'ggpp'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(regclass)
## Loading required package: bestglm
## Loading required package: leaps
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## 
## The following object is masked from 'package:caret':
## 
##     predictors
## 
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
## 
## 
## Attaching package: 'regclass'
## 
## The following object is masked from 'package:lattice':
## 
##     qq
library(tidymodels)
## Registered S3 method overwritten by 'parsnip':
##   method          from     
##   autoplot.glmnet ggfortify
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.2.1     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ yardstick::accuracy()       masks forecast::accuracy()
## ✖ ggpp::annotate()            masks ggplot2::annotate()
## ✖ randomForest::combine()     masks dplyr::combine(), gridExtra::combine()
## ✖ scales::discard()           masks purrr::discard()
## ✖ dplyr::filter()             masks stats::filter()
## ✖ recipes::fixed()            masks stringr::fixed()
## ✖ dplyr::group_rows()         masks kableExtra::group_rows()
## ✖ dplyr::lag()                masks stats::lag()
## ✖ purrr::lift()               masks caret::lift()
## ✖ randomForest::margin()      masks ggplot2::margin()
## ✖ rsample::permutations()     masks e1071::permutations()
## ✖ yardstick::precision()      masks caret::precision()
## ✖ dials::prune()              masks rpart::prune()
## ✖ yardstick::recall()         masks caret::recall()
## ✖ dplyr::select()             masks MASS::select()
## ✖ yardstick::sensitivity()    masks caret::sensitivity()
## ✖ yardstick::spec()           masks readr::spec()
## ✖ yardstick::specificity()    masks caret::specificity()
## ✖ recipes::step()             masks stats::step()
## ✖ tune::tune()                masks parsnip::tune(), e1071::tune()
## ✖ recipes::update()           masks stats4::update(), stats::update()
## ✖ workflows::update_formula() masks VGAM::update_formula()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(predictmeans)
## Loading required package: glmmTMB
## 
## Attaching package: 'glmmTMB'
## 
## The following objects are masked from 'package:VGAM':
## 
##     betabinomial, lognormal
## 
## Loading required package: lme4
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:lme4':
## 
##     lmList
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## The following object is masked from 'package:forecast':
## 
##     getResponse
## 
## Loading required package: lmerTest
## 
## Attaching package: 'lmerTest'
## 
## The following object is masked from 'package:lme4':
## 
##     lmer
## 
## The following object is masked from 'package:recipes':
## 
##     step
## 
## The following object is masked from 'package:stats':
## 
##     step
# Print a side-by-side Histogram and QQPlot of Residuals
residPlot <- function(model) {
  if (is.null(model)) {
    return
  }
  
  layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
  plot(residuals(model))
  hist(model[["residuals"]], freq = FALSE, breaks = "fd", main = "Residual Histogram",
       xlab = "Residuals",col="lightgreen")
  lines(density(model[["residuals"]], kernel = "ep"),col="blue", lwd=3)
  curve(dnorm(x,mean=mean(model[["residuals"]]), sd=sd(model[["residuals"]])), col="red", lwd=3, lty="dotted", add=T)
  qqnorm(model[["residuals"]], main = "Residual Q-Q plot")
  qqline(model[["residuals"]],col="red", lwd=3, lty="dotted")
  par(mfrow = c(1, 1))
}

# Print a Variable Importance Plot for the provided model
variableImportancePlot <- function(model=NULL, chart_title='Variable Importance Plot') {
  if (is.null(model)) {
    return
  }


# use caret and gglot to print a variable importance plot
varImp(model) %>% as.data.frame() %>% 
  ggplot(aes(x = reorder(rownames(.), desc(Overall)), y = Overall)) +
  geom_col(aes(fill = Overall)) +
  theme(panel.background = element_blank(),
        panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90)) +
  scale_fill_gradient() +
  labs(title = chart_title,
        x = "Parameter",
        y = "Relative Importance")
}

# Extract key performance results from a model

model_performance_extraction <- function(model=NULL) {
  if (is.null(model)) {
    return
  }
  
  data.frame("RSE" = model$sigma,
             "Adj R2" = model$adj.r.squared,
             "F-Statistic" = model$fstatistic[1])
}

Introduction

In professional sports, there is a significant interest in leveraging historical statistics to predict future outcomes, such as wins and losses, and to identify opportunities for improving team or individual performance. This data-driven approach has gained immense popularity over the past decade, showing up in mass media through fantasy leagues, movies like “Moneyball,” and platforms such as FiveThirtyEight.

In this analysis, we will utilize a classic baseball dataset to build several models capable of predicting a team’s wins over a season based on various team statistics (e.g., home runs, strikeouts, base hits). The approach will be systematic and thorough, beginning with an exploration of the data to identify potential issues or challenges such as missing data, outliers, coding errors, and multicollinearity.

After addressing these challenges through appropriate data-cleaning steps, three different linear models to predict seasonal wins will be built and evaluated. The goal is to select a final model that strikes the best balance between accuracy and simplicity, providing a robust tool for predicting team performance.

Data Exploration

The moneyball training set contains 17 columns - including the target variable “TARGET_WINS” - and 2276 rows, covering baseball team performance statistics from the years 1871 to 2006 inclusive. The data has been adjusted to match the performance of a typical 162 game season. The data-set was entirely numerical and contained no categorical variables.

Below is a chart that describes each variable in the data set and the theoretical effect it will have on the number of wins projected for a team.

After loading the data set in R, the following summary statistics were calculated:

df <- read.csv("C:/moneyball-training-data.csv")

str(df)
## 'data.frame':    2276 obs. of  17 variables:
##  $ INDEX           : int  1 2 3 4 5 6 7 8 11 12 ...
##  $ TARGET_WINS     : int  39 70 86 70 82 75 80 85 86 76 ...
##  $ TEAM_BATTING_H  : int  1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
##  $ TEAM_BATTING_2B : int  194 219 232 209 186 200 179 171 197 213 ...
##  $ TEAM_BATTING_3B : int  39 22 35 38 27 36 54 37 40 18 ...
##  $ TEAM_BATTING_HR : int  13 190 137 96 102 92 122 115 114 96 ...
##  $ TEAM_BATTING_BB : int  143 685 602 451 472 443 525 456 447 441 ...
##  $ TEAM_BATTING_SO : int  842 1075 917 922 920 973 1062 1027 922 827 ...
##  $ TEAM_BASERUN_SB : int  NA 37 46 43 49 107 80 40 69 72 ...
##  $ TEAM_BASERUN_CS : int  NA 28 27 30 39 59 54 36 27 34 ...
##  $ TEAM_BATTING_HBP: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ TEAM_PITCHING_H : int  9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
##  $ TEAM_PITCHING_HR: int  84 191 137 97 102 92 122 116 114 96 ...
##  $ TEAM_PITCHING_BB: int  927 689 602 454 472 443 525 459 447 441 ...
##  $ TEAM_PITCHING_SO: int  5456 1082 917 928 920 973 1062 1033 922 827 ...
##  $ TEAM_FIELDING_E : int  1011 193 175 164 138 123 136 112 127 131 ...
##  $ TEAM_FIELDING_DP: int  NA 155 153 156 168 149 186 136 169 159 ...
summary(df)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286

From the outputs of the str and summary functions it was noted that there was many NA values throughout the data set which needed to be transformed or cleaned. Additionally, the average wins per season for a team is about 81, which is exactly half of the total games played in a typical MLB season, batters hit, on average, about 9 base hits per game, pitchers throw about 4 base-on-balls (walks) per game, and pitchers record about 5 strikeouts per game when calculated out over a 162-game season.

From there it was important to understand the different variables and visualize the data by showing the amount of missing values for each variable in the data set, plot the distributions of values for each variable with histograms and density curves as well as show a box plot and scatter plot comparison. Additionally, the index variable did not hold any statistical value so it was removed from the data.

##Remove "TEAM" from variable names
names(df) <- names(df) %>% 
str_replace_all('TEAM_', '')

## Drop the INDEX column
df <- df %>%  
  dplyr::select(-INDEX)
gg_miss_var(df, show_pct = TRUE)

# Prepare data for ggplot
gather_df <- df %>% 
  gather(key = 'variable', value = 'value')

# Histogram plots of each variable
ggplot(gather_df) + 
  geom_histogram(aes(x = value, y = ..density..), bins=30) + 
  geom_density(aes(x = value), color='blue') +
  facet_wrap(. ~variable, scales='free', ncol=4)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_density()`).

The distribution profiles show the prevalence of kurtosis, specifically right skew in variables BASERUN_CS, BASERUN_SB, FIELDING_E, PITCHING_BB, PITCHING_H and PITCHING_SO. These deviations from a traditional normal distribution can be problematic for linear regression assumptions, meaning the data may have to be transformed. Furthermore BATTING_HBP, BATTING_HR, PITCHING_HR and BATTING_SO appear bimodal. Bimodal data suggests that there are possibly two different groups or classes of baseball season data. During those seasons, teams tended to score higher or lower for the bimodal feature.

# Boxplots for each variable
ggplot(gather_df, aes(variable, value)) + 
  geom_boxplot() + 
  facet_wrap(. ~variable, scales='free', ncol=6)
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

The box-plots reveal significant outliers and uniform (zero-only) values for many of our predictor variables. Outliers will need to imputed if necessary, and sparse data-sets might need to be dropped.

Finally, the scatter plots of each variable versus the target variable of TARGET_WINS wer plotted to get an idea of the relationship between them.

# Scatter plots of each variable versus the target variable
featurePlot(x = df[, -1], 
            y = df$TARGET_WINS,
            plot = "scatter")

The plots indicate some clear relationships, such as hitting more doubles or more home runs clearly improves the number of wins. However, they also reveal some significant issues with the data. Most of the predictor variables are skewed or non-normally distributed and will need to be transformed. Additionally, many data points contain missing data that will need to be either imputed or discarded. It also appears that there is some missing data encoded as 0 and some nonsensical outliers. There is a team with 0 wins in the dataset. This seems unlikely. Many of the hitting categories include teams at 0; it is unlikely that a team hit 0 home runs throughout a season. The pitching variables also include many 0’s. There are multiple teams with 0 strikeouts by their pitchers over the season, which is extremely unlikely. The pitching data also includes strange outliers such as a team logging 20,000 strikeouts which would be an average of 160 strikeouts per game. Also, team pitching walks and team pitching hits have strange outliers. Lastly, the error variable does make sense. teams usually score 2 or fewer errors per game, which would lead to an overall team error of approximately 320 throughout a season. This does not match the scale of the error variable. Errors may have become much less frequent over time which would account for the high error data.

Data Preparation

Removed Fields

BATTING_HBP field was removed from the data as it was missing >90% of the data and the INDEX field as it offers no information for a model.

Missing Values

There were missing values found in BASERUN_CS (Caught Stolen Bases), FIELDING_DP (Double plays), BASERUN_SB (Stolen Bases), BATTING_SO (Batter Strike outs), PITCHING_SO (Pitcher Strike outs) so the missing values were replaced with their corresponding median values.

Outliers

There are unreasonable outliers found in PITCHING_SO (pitching strikeouts), PITCHING_H (allowed hits per game), PITCHING_BB (walks), and FIELDING_E (fielding errors) that exceed what is reasonable or possible given standard game length. Any outliers were replaced with the median for the data set. Limits that were set included: > 4000 PITCHING_SO (25 strikeouts per game), > 5000 PITCHING_H (30 hits allowed per game), > 2000 PITCHING_BB (13 walks per game), and > 480 FIELDING_E (3 errors per game).

Transform non-normal variables

Finally, some of the variables were highly skewed. To address this, boxcox, log and inverse log transformations were used to make them more normally distributed.

# Identify missing data by Feature and display percent breakout
missing <- colSums(df %>% sapply(is.na))
missing_pct <- round(missing / nrow(df) * 100, 2)
stack(sort(missing_pct, decreasing = TRUE))
# Drop the BATTING_HBP field
df <- df %>% 
  select(-BATTING_HBP)
no_outlier_df <- df


no_outlier_df$PITCHING_SO <- ifelse(no_outlier_df$PITCHING_SO > 4000, NA, no_outlier_df$PITCHING_SO)


no_outlier_df$PITCHING_H <- ifelse(no_outlier_df$PITCHING_H > 5000, NA, no_outlier_df$PITCHING_H)


no_outlier_df$PITCHING_BB <- ifelse(no_outlier_df$PITCHING_BB > 2000, NA, no_outlier_df$PITCHING_BB)


no_outlier_df$FIELDING_E <- ifelse(no_outlier_df$FIELDING_E > 480, NA, no_outlier_df$FIELDING_E)

clean_df <- no_outlier_df %>% 
  impute(what = 'median') %>% as.data.frame()

# From the plots, the team with 0 wins has 0 in multiple other categories. not valid data.
clean_df <- clean_df %>% 
  filter(TARGET_WINS != 0)

gather_clean_df <- clean_df %>% 
  gather(key = 'variable', value = 'value', -TARGET_WINS)

# created empty data frame to store transformed variables
df_temp <- data.frame(matrix(ncol = 1, nrow = length(clean_df$TARGET_WINS)))

# performed boxcox transformation after identifying proper lambda
df_temp$BATTING_3B <- clean_df$BATTING_3B
batting3b_lambda <- BoxCox.lambda(clean_df$BATTING_3B)
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
df_temp$BATTING_3B_transform <- log(clean_df$BATTING_3B)

# performed boxcox transformation after identifying proper lambda
df_temp$BATTING_HR <- clean_df$BATTING_HR
battingHR_lambda <- BoxCox.lambda(clean_df$BATTING_HR)
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
df_temp$BATTING_HR_transform <- BoxCox(clean_df$BATTING_HR, battingHR_lambda)

# performed a log transformation
df_temp$PITCHING_BB <- clean_df$PITCHING_BB
df_temp$PITCHING_BB_transform <- log(clean_df$PITCHING_BB)

# performed a log transformation
df_temp$PITCHING_SO <- clean_df$PITCHING_SO
df_temp$PITCHING_SO_transform <- log(clean_df$PITCHING_SO)

# performed an inverse log transformation
df_temp$FIELDING_E <- clean_df$FIELDING_E
df_temp$FIELDING_E_transform <- 1/log(clean_df$FIELDING_E)

# performed a log transformation
df_temp$BASERUN_SB <- clean_df$BASERUN_SB
df_temp$BASERUN_SB_transform <- log(clean_df$BASERUN_SB)

df_temp <- df_temp[, 2:13]

# Build clean dataframe with transformation
clean_df <- data.frame(cbind(clean_df, 
                        BATTING_3B_transform = df_temp$BATTING_3B_transform,
                        BATTING_HR_transform = df_temp$BATTING_HR_transform,
                        BASERUN_SB_transform = df_temp$BASERUN_SB_transform,
                        PITCHING_BB_transform = df_temp$PITCHING_BB_transform,
                        PITCHING_SO_transform = df_temp$PITCHING_SO_transform,
                        FIELDING_E_transform = df_temp$FIELDING_E_transform))

is.na(clean_df) <- sapply(clean_df, is.infinite)

# Impute missing value with the mean
mean = mean(clean_df$BATTING_3B_transform, na.rm = TRUE)
mean2 = mean(clean_df$BASERUN_SB_transform, na.rm = TRUE)
mean3 = mean(clean_df$PITCHING_SO_transform, na.rm = TRUE)

clean_df$BATTING_3B_transform[is.na(clean_df$BATTING_3B_transform)] <- mean
clean_df$BASERUN_SB_transform[is.na(clean_df$BASERUN_SB_transform)] <- mean2
clean_df$PITCHING_SO_transform[is.na(clean_df$PITCHING_SO_transform)] <- mean3

gather_clean_df <- clean_df %>% 
  gather(key = 'variable', value = 'value', -TARGET_WINS)
## Warning: attributes are not identical across measure variables; they will be
## dropped

After preparing the clean data frame it was necessary to run similar data visualization methods used previously to understand the transformed variables and determine their validity within the data set.

# Histogram plots of each variable
ggplot(gather_clean_df) + 
  geom_histogram(aes(x=value, y = ..density..), bins=30) + 
  geom_density(aes(x=value), color='blue') +
  facet_wrap(. ~variable, scales='free', ncol=4)

# Boxplots for each variable
ggplot(gather_clean_df, aes(variable, value)) + 
  geom_boxplot() + 
  facet_wrap(. ~variable, scales='free', ncol=6)

Finally, a correlation table was produced to understand each of the variable’s relationship to each other. These values determine the statistical significance of each variable and show the importance of including them in regression models.

# Correlation Table
M = cor(clean_df, use = "na.or.complete")
corrplot(M, method = 'number', type = 'lower', diag = FALSE, number.cex = 0.5, tl.cex = 0.5, cl.cex = 0.5)

Build Models

Multiple linear regression model (Model #1) included all non-transformed features that hadn’t been removed following the data cleaning process mentioned above. Next, in Model #2, to select the optimal set of features, the stepAIC function in R was used to perform step wise selection on the initial model to help reduce the number of features and address multicollinearity issues from Model #1. Finally, the last model (Model #3) included all data from the cleaned dataset, but also contained some of the transformed features in replacement of their non-transformed counterparts. Additionally, this model utilized stepwise selection to determine the optimal set of features and to reduce multicollinearity as much as possible.

Negative values for coefficients that expected to be positive

Positive values for coefficients expected to be negative

This is a trend seen throughout the three models that were built which can likely be due to multicollinearity. Since it was noticed in the initial data exploration that many variables in the dataset were highly correlated with one another, this phenomenon likely is increasing the variance of the coefficient estimates, making them difficult to interpret. This was also supported by the Variance Inflation Factor (VIF) tests, which showed high values for features such as BATTING_HR, BATTING_SO, PITCHING_HR, and PITCHING_SO. In the final model (Model #3 certain variables were removed that had high VIF scores through the stepwise selection process.

Model 1

# Model 1 - Include all features and leverage stepAIC to help identify ideal features

multi_lm <- lm(TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP, clean_df)

lm_s <- summary(multi_lm)
print(lm_s)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + 
##     BATTING_HR + BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS + 
##     PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + 
##     FIELDING_DP, data = clean_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.246  -8.040   0.149   8.459  64.706 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.8791382  5.5609672   3.575 0.000358 ***
## BATTING_H    0.0435068  0.0037271  11.673  < 2e-16 ***
## BATTING_2B  -0.0181980  0.0091059  -1.998 0.045785 *  
## BATTING_3B   0.0887800  0.0166055   5.346 9.87e-08 ***
## BATTING_HR  -0.0498772  0.0307896  -1.620 0.105385    
## BATTING_BB   0.0668013  0.0051197  13.048  < 2e-16 ***
## BATTING_SO   0.0043819  0.0039536   1.108 0.267841    
## BASERUN_SB   0.0199677  0.0041255   4.840 1.39e-06 ***
## BASERUN_CS   0.0054960  0.0154463   0.356 0.722013    
## PITCHING_H   0.0031034  0.0009168   3.385 0.000723 ***
## PITCHING_HR  0.0875913  0.0272043   3.220 0.001301 ** 
## PITCHING_BB -0.0343166  0.0043823  -7.831 7.39e-15 ***
## PITCHING_SO -0.0059484  0.0028921  -2.057 0.039826 *  
## FIELDING_E  -0.0431197  0.0051761  -8.331  < 2e-16 ***
## FIELDING_DP -0.1453152  0.0132805 -10.942  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.01 on 2260 degrees of freedom
## Multiple R-squared:  0.3141, Adjusted R-squared:  0.3099 
## F-statistic: 73.93 on 14 and 2260 DF,  p-value: < 2.2e-16
residPlot(lm_s)

# Model 1 - Display Variable feature importance plot
variableImportancePlot(multi_lm, "Model 1 LM Variable Importance")

# model 1 - print variable inflation factor score
print('VIF scores of predictors')
## [1] "VIF scores of predictors"
VIF(multi_lm)
##   BATTING_H  BATTING_2B  BATTING_3B  BATTING_HR  BATTING_BB  BATTING_SO 
##    3.874288    2.434501    2.886646   46.634071    5.260219   12.340845 
##  BASERUN_SB  BASERUN_CS  PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO 
##    1.666296    1.122657    1.987088   37.311322    4.129284    7.258820 
##  FIELDING_E FIELDING_DP 
##    1.970051    1.426686

Model 2

The second model, step wise selection was used to determine the optimal set of features from the original cleaned data set.

# Build model 2 - this is Model 1 with only significant features (using stepAIC)
mult_lm_final <- stepAIC(multi_lm, direction = "both",
                         scope = list(upper = multi_lm, lower = ~ 1),
                         scale = 0, trace = FALSE)
# Display Model 2 Summary
lmf_s <- summary(mult_lm_final)
print(lmf_s)
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + 
##     BATTING_BB + BASERUN_SB + PITCHING_H + PITCHING_HR + PITCHING_BB + 
##     PITCHING_SO + FIELDING_E + FIELDING_DP, data = clean_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.051  -7.965   0.164   8.359  65.549 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 21.7189057  4.9999664   4.344 1.46e-05 ***
## BATTING_H    0.0428227  0.0036457  11.746  < 2e-16 ***
## BATTING_2B  -0.0172282  0.0089438  -1.926 0.054197 .  
## BATTING_3B   0.0917609  0.0161863   5.669 1.62e-08 ***
## BATTING_BB   0.0643346  0.0041503  15.501  < 2e-16 ***
## BASERUN_SB   0.0209334  0.0039963   5.238 1.77e-07 ***
## PITCHING_H   0.0027985  0.0008467   3.305 0.000964 ***
## PITCHING_HR  0.0466418  0.0083051   5.616 2.19e-08 ***
## PITCHING_BB -0.0323388  0.0034282  -9.433  < 2e-16 ***
## PITCHING_SO -0.0031173  0.0016399  -1.901 0.057446 .  
## FIELDING_E  -0.0425346  0.0051451  -8.267 2.31e-16 ***
## FIELDING_DP -0.1466897  0.0132112 -11.103  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.01 on 2263 degrees of freedom
## Multiple R-squared:  0.3132, Adjusted R-squared:  0.3098 
## F-statistic:  93.8 on 11 and 2263 DF,  p-value: < 2.2e-16
# Display Model 2 REsidual plots
residPlot(lmf_s)

# model 2 - Display Variable feature importance plot
variableImportancePlot(mult_lm_final, "Model 2 LM Variable Importance")

# model 2 - print variable inflation factor score
print('VIF scores of predictors')
## [1] "VIF scores of predictors"
VIF(mult_lm_final)
##   BATTING_H  BATTING_2B  BATTING_3B  BATTING_BB  BASERUN_SB  PITCHING_H 
##    3.706640    2.348442    2.742530    3.456414    1.563497    1.694787 
## PITCHING_HR PITCHING_BB PITCHING_SO  FIELDING_E FIELDING_DP 
##    3.477116    2.526799    2.333742    1.946421    1.411744

Model 3

The third model utilizes some of the transformed variables to compare against the initial model. Additionally, similar to model 2, stepwise selection was used to determine feature importance and select the simplest model possible.

# Model 3 - Build linear model
multi_lm_2 <- lm(TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B_transform + BATTING_HR_transform + BATTING_BB + BATTING_SO + BASERUN_SB_transform + BASERUN_CS + PITCHING_H + PITCHING_HR + PITCHING_BB_transform + PITCHING_SO_transform + FIELDING_E_transform + FIELDING_DP, clean_df)

# Model 3 - Show model summary info
lm_s_2 <- summary(multi_lm_2)
lm_s_2
## 
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B_transform + 
##     BATTING_HR_transform + BATTING_BB + BATTING_SO + BASERUN_SB_transform + 
##     BASERUN_CS + PITCHING_H + PITCHING_HR + PITCHING_BB_transform + 
##     PITCHING_SO_transform + FIELDING_E_transform + FIELDING_DP, 
##     data = clean_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.074  -7.938   0.122   8.547  64.835 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1.054e+01  1.907e+01   0.553    0.581    
## BATTING_H              4.316e-02  3.717e-03  11.611  < 2e-16 ***
## BATTING_2B            -1.953e-02  9.047e-03  -2.159    0.031 *  
## BATTING_3B_transform   6.194e+00  9.093e-01   6.812 1.23e-11 ***
## BATTING_HR_transform   3.564e-01  3.070e-01   1.161    0.246    
## BATTING_BB             6.532e-02  5.689e-03  11.482  < 2e-16 ***
## BATTING_SO            -4.883e-03  3.794e-03  -1.287    0.198    
## BASERUN_SB_transform   2.742e+00  5.782e-01   4.743 2.24e-06 ***
## BASERUN_CS             1.887e-03  1.605e-02   0.118    0.906    
## PITCHING_H             1.361e-03  9.483e-04   1.435    0.151    
## PITCHING_HR            2.305e-02  1.804e-02   1.278    0.201    
## PITCHING_BB_transform -1.641e+01  2.391e+00  -6.862 8.71e-12 ***
## PITCHING_SO_transform  1.444e+00  2.512e+00   0.575    0.566    
## FIELDING_E_transform   2.590e+02  3.210e+01   8.070 1.13e-15 ***
## FIELDING_DP           -1.438e-01  1.363e-02 -10.549  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.03 on 2260 degrees of freedom
## Multiple R-squared:  0.3121, Adjusted R-squared:  0.3079 
## F-statistic: 73.25 on 14 and 2260 DF,  p-value: < 2.2e-16
residPlot(lm_s_2)

# Model 3 - use stepAIC to select significant features
mult_lm_final_2 <- stepAIC(multi_lm_2, direction = "both",
                         scope = list(upper = multi_lm_2, lower = ~ 1),
                         scale = 0, trace = FALSE)

# Model 3 - Show feature importance in final model after stepAIC
variableImportancePlot(mult_lm_final_2, "Model 3 LM Variable Importance")

# Model 3 - show Variance Inflation Factor score
print('VIF scores of predictors')
## [1] "VIF scores of predictors"
VIF(mult_lm_final_2)
##             BATTING_H            BATTING_2B  BATTING_3B_transform 
##              2.571455              2.179680              2.626343 
##            BATTING_BB  BASERUN_SB_transform            PITCHING_H 
##              3.858783              1.334910              1.630391 
##           PITCHING_HR PITCHING_BB_transform  FIELDING_E_transform 
##              2.993639              2.997260              2.284460 
##           FIELDING_DP 
##              1.418338

Model Selection and Analysis

The following table discusses each of the model performance metrics on the data set. These values indicate there is a minor improvement in model performance after applying transformations and selecting for significant parameters.

All models achieved a positive adjusted R^2 pointing to a weakly-positive relationship between the target variable TOTAL_WINs and the model predictor variables.

From the previous section and residual diagnostics (both the residual histogram and QQ plot)–all three models have normally distributed residuals with mean 0. There appears to be no heteroskedasticity (non-constant variance) or serial-correlation within the residuals–thus strictly from a linear regression perspective, none of the assumptions are violated and thus we can make inferential predictions based on our model.

The F-statistics for all models were large and achieved a statistically significant p-value less than a default alpha of .05; thus for all three models the Null hypothesis that is rejected that 0s for the regression coefficients would fit the data better.

From the exploratory data analysis, many of the predictor variables were correlated with each other. However, this could be an issue from a multicolinearity, that can lead to unstable regression fits. From the previous section VIF (Variable Inflation Factor) was used to gauge better models by preferring VIF values less than 5.

The following figure represents the performance of 3 of the models on both the testing and on the data set. for this multiple linear regression problem, the adjusted R2 value was used as the metric for performance on predicting the target wins.

# Build summary table with each model performance metrics
SummaryTable <- bind_rows(
  model_performance_extraction(lm_s),
  model_performance_extraction(lmf_s),
  model_performance_extraction(lm_s_2)
)

# Add row names
rownames(SummaryTable) <- c("Model 1", "Model 2", "Model 3")

# Display nice format table
SummaryTable %>% 
  kable() %>% 
  kable_styling(
    bootstrap_options = c("hover", "condensed", "responsive"),
    full_width = F)
RSE Adj.R2 F.Statistic
Model 1 13.01296 0.3098646 73.92897
Model 2 13.01342 0.3098152 93.79725
Model 3 13.03182 0.3078626 73.24820
# Splitting the dataset into train/testing set with 80/20 split
set.seed(3)
df_split <- initial_split(clean_df, prop = 0.8)
df_train <- training(df_split)
df_test <- testing(df_split)

# Build prediction results dataset for plotting (each model with training and test data)
results <- multi_lm %>%
  predict(df_train) %>% 
  as.data.frame() %>% 
  mutate(Predicted = df_train$TARGET_WINS, model = "Model 1") %>% 
  bind_rows(mult_lm_final %>% 
              predict(df_train) %>% 
              as.data.frame() %>% 
              mutate(Predicted = df_train$TARGET_WINS, model = "Model 2"),
            mult_lm_final_2 %>% 
              predict(df_train) %>% 
              as.data.frame() %>% 
              mutate(Predicted = df_train$TARGET_WINS, model = "Model 3")) %>% 
  rename("Observed" = ".") %>% 
  mutate(dataset = "Training Set") %>% 
  bind_rows(
    results_test <- multi_lm %>%
      predict(df_test) %>% 
      as.data.frame() %>%
      mutate(Predicted = df_test$TARGET_WINS, model = "Model 1") %>% 
      bind_rows(mult_lm_final %>% 
                  predict(df_test) %>% 
                  as.data.frame() %>% 
                  mutate(Predicted = df_test$TARGET_WINS, model = "Model 2"),
                mult_lm_final_2 %>% 
                  predict(df_test) %>% as.data.frame() %>% 
                  mutate(Predicted = df_test$TARGET_WINS, model = "Model 3")) %>% 
      rename("Observed" = ".") %>% 
      mutate(dataset = "Testing Set")
  )

# Plot each model with datapoints, model prediction line and R^2
results %>% 
  ggplot(mapping = aes(x = Observed, y = Predicted)) +
  geom_point(pch =21, alpha = 0.3, fill = "skyblue") +
  geom_smooth(method = "lm", color = "red3") +
  facet_wrap(dataset~model) +
  stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")), 
                   label.x.npc = "left", label.y.npc = .9,
                   formula = y~x, parse = TRUE, size = 3.5)+
  theme(panel.background = element_blank(),
        panel.grid = element_blank())
## Warning in stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")), :
## Ignoring unknown parameters: `label.x.npc` and `label.y.npc`
## `geom_smooth()` using formula = 'y ~ x'

The analysis shows the following results:

Model 1 - This model is based on the all predictors on an un-transformed data set. Results for this model provided an R^2 of about of 0.31 on the training set and 0.3 on the testing data set.

Model 2 - This model is based on a subset of significant predictors on an un-transformed data set. Results for this model provided an R^2 of about of 0.31 on the training set and 0.31 on the testing data set.

Model 3 - This model is based on a subset of significant predictors on a transformed data set. Results for this model provided an R^2 of about 0.31 on the training set and 0.31 on the testing data set.

Based on these analyses, Model 2 and Model 3 both performed marginally better than Model 1 and could be selected as the linear model of choice when looking at adjusted R2. However, there is greater statistical significance under the third model relative to the others, and uses fewer unnecessary variables to compute our prediction without sacrificing much in terms of adjusted R^2 value. Additionally, Model 3 seems to have lower VIF scores than Models 1 and 2. Because of these factors, as well as its better adjustment for multicollinearity (we noticeably saw less sign-flipping in coefficients), model 3 would be the model of choice.