library(MASS)
library(rpart.plot)
## Loading required package: rpart
library(ggplot2)
library(ggfortify)
library(gridExtra)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ fma 2.5 ✔ expsmooth 2.3
##
library(fma)
library(kableExtra)
library(e1071)
library(mlbench)
library(ggcorrplot)
library(DataExplorer)
library(timeDate)
##
## Attaching package: 'timeDate'
## The following objects are masked from 'package:e1071':
##
## kurtosis, skewness
library(caret)
## Loading required package: lattice
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:fma':
##
## pigs
library(corrplot)
## corrplot 0.92 loaded
library(RColorBrewer)
library(tibble)
library(tidyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(mixtools)
## mixtools package, version 2.0.0, Released 2022-12-04
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772 and the Chan Zuckerberg Initiative: Essential Open Source Software for Science (Grant No. 2020-255193).
library(naniar)
library(ggpmisc)
## Loading required package: ggpp
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
##
## Attaching package: 'ggpp'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(regclass)
## Loading required package: bestglm
## Loading required package: leaps
## Loading required package: VGAM
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
##
## The following object is masked from 'package:caret':
##
## predictors
##
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## Important regclass change from 1.3:
## All functions that had a . in the name now have an _
## all.correlations -> all_correlations, cor.demo -> cor_demo, etc.
##
##
## Attaching package: 'regclass'
##
## The following object is masked from 'package:lattice':
##
## qq
library(tidymodels)
## Registered S3 method overwritten by 'parsnip':
## method from
## autoplot.glmnet ggfortify
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.2.1
## ✔ infer 1.0.7 ✔ workflows 1.1.4
## ✔ modeldata 1.4.0 ✔ workflowsets 1.1.0
## ✔ parsnip 1.2.1 ✔ yardstick 1.3.1
## ✔ recipes 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ yardstick::accuracy() masks forecast::accuracy()
## ✖ ggpp::annotate() masks ggplot2::annotate()
## ✖ randomForest::combine() masks dplyr::combine(), gridExtra::combine()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::group_rows() masks kableExtra::group_rows()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ rsample::permutations() masks e1071::permutations()
## ✖ yardstick::precision() masks caret::precision()
## ✖ dials::prune() masks rpart::prune()
## ✖ yardstick::recall() masks caret::recall()
## ✖ dplyr::select() masks MASS::select()
## ✖ yardstick::sensitivity() masks caret::sensitivity()
## ✖ yardstick::spec() masks readr::spec()
## ✖ yardstick::specificity() masks caret::specificity()
## ✖ recipes::step() masks stats::step()
## ✖ tune::tune() masks parsnip::tune(), e1071::tune()
## ✖ recipes::update() masks stats4::update(), stats::update()
## ✖ workflows::update_formula() masks VGAM::update_formula()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(predictmeans)
## Loading required package: glmmTMB
##
## Attaching package: 'glmmTMB'
##
## The following objects are masked from 'package:VGAM':
##
## betabinomial, lognormal
##
## Loading required package: lme4
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:lme4':
##
## lmList
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## The following object is masked from 'package:forecast':
##
## getResponse
##
## Loading required package: lmerTest
##
## Attaching package: 'lmerTest'
##
## The following object is masked from 'package:lme4':
##
## lmer
##
## The following object is masked from 'package:recipes':
##
## step
##
## The following object is masked from 'package:stats':
##
## step
# Print a side-by-side Histogram and QQPlot of Residuals
residPlot <- function(model) {
if (is.null(model)) {
return
}
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
plot(residuals(model))
hist(model[["residuals"]], freq = FALSE, breaks = "fd", main = "Residual Histogram",
xlab = "Residuals",col="lightgreen")
lines(density(model[["residuals"]], kernel = "ep"),col="blue", lwd=3)
curve(dnorm(x,mean=mean(model[["residuals"]]), sd=sd(model[["residuals"]])), col="red", lwd=3, lty="dotted", add=T)
qqnorm(model[["residuals"]], main = "Residual Q-Q plot")
qqline(model[["residuals"]],col="red", lwd=3, lty="dotted")
par(mfrow = c(1, 1))
}
# Print a Variable Importance Plot for the provided model
variableImportancePlot <- function(model=NULL, chart_title='Variable Importance Plot') {
if (is.null(model)) {
return
}
# use caret and gglot to print a variable importance plot
varImp(model) %>% as.data.frame() %>%
ggplot(aes(x = reorder(rownames(.), desc(Overall)), y = Overall)) +
geom_col(aes(fill = Overall)) +
theme(panel.background = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(angle = 90)) +
scale_fill_gradient() +
labs(title = chart_title,
x = "Parameter",
y = "Relative Importance")
}
# Extract key performance results from a model
model_performance_extraction <- function(model=NULL) {
if (is.null(model)) {
return
}
data.frame("RSE" = model$sigma,
"Adj R2" = model$adj.r.squared,
"F-Statistic" = model$fstatistic[1])
}
In professional sports, there is a significant interest in leveraging historical statistics to predict future outcomes, such as wins and losses, and to identify opportunities for improving team or individual performance. This data-driven approach has gained immense popularity over the past decade, showing up in mass media through fantasy leagues, movies like “Moneyball,” and platforms such as FiveThirtyEight.
In this analysis, we will utilize a classic baseball dataset to build several models capable of predicting a team’s wins over a season based on various team statistics (e.g., home runs, strikeouts, base hits). The approach will be systematic and thorough, beginning with an exploration of the data to identify potential issues or challenges such as missing data, outliers, coding errors, and multicollinearity.
After addressing these challenges through appropriate data-cleaning steps, three different linear models to predict seasonal wins will be built and evaluated. The goal is to select a final model that strikes the best balance between accuracy and simplicity, providing a robust tool for predicting team performance.
The moneyball training set contains 17 columns - including the target variable “TARGET_WINS” - and 2276 rows, covering baseball team performance statistics from the years 1871 to 2006 inclusive. The data has been adjusted to match the performance of a typical 162 game season. The data-set was entirely numerical and contained no categorical variables.
Below is a chart that describes each variable in the data set and the theoretical effect it will have on the number of wins projected for a team.
After loading the data set in R, the following summary statistics were calculated:
df <- read.csv("C:/moneyball-training-data.csv")
str(df)
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1 2 3 4 5 6 7 8 11 12 ...
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
summary(df)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
From the outputs of the str and summary functions it was noted that there was many NA values throughout the data set which needed to be transformed or cleaned. Additionally, the average wins per season for a team is about 81, which is exactly half of the total games played in a typical MLB season, batters hit, on average, about 9 base hits per game, pitchers throw about 4 base-on-balls (walks) per game, and pitchers record about 5 strikeouts per game when calculated out over a 162-game season.
From there it was important to understand the different variables and visualize the data by showing the amount of missing values for each variable in the data set, plot the distributions of values for each variable with histograms and density curves as well as show a box plot and scatter plot comparison. Additionally, the index variable did not hold any statistical value so it was removed from the data.
##Remove "TEAM" from variable names
names(df) <- names(df) %>%
str_replace_all('TEAM_', '')
## Drop the INDEX column
df <- df %>%
dplyr::select(-INDEX)
gg_miss_var(df, show_pct = TRUE)
# Prepare data for ggplot
gather_df <- df %>%
gather(key = 'variable', value = 'value')
# Histogram plots of each variable
ggplot(gather_df) +
geom_histogram(aes(x = value, y = ..density..), bins=30) +
geom_density(aes(x = value), color='blue') +
facet_wrap(. ~variable, scales='free', ncol=4)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_density()`).
The distribution profiles show the prevalence of kurtosis, specifically right skew in variables BASERUN_CS, BASERUN_SB, FIELDING_E, PITCHING_BB, PITCHING_H and PITCHING_SO. These deviations from a traditional normal distribution can be problematic for linear regression assumptions, meaning the data may have to be transformed. Furthermore BATTING_HBP, BATTING_HR, PITCHING_HR and BATTING_SO appear bimodal. Bimodal data suggests that there are possibly two different groups or classes of baseball season data. During those seasons, teams tended to score higher or lower for the bimodal feature.
# Boxplots for each variable
ggplot(gather_df, aes(variable, value)) +
geom_boxplot() +
facet_wrap(. ~variable, scales='free', ncol=6)
## Warning: Removed 3478 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
The box-plots reveal significant outliers and uniform (zero-only) values for many of our predictor variables. Outliers will need to imputed if necessary, and sparse data-sets might need to be dropped.
Finally, the scatter plots of each variable versus the target variable of TARGET_WINS wer plotted to get an idea of the relationship between them.
# Scatter plots of each variable versus the target variable
featurePlot(x = df[, -1],
y = df$TARGET_WINS,
plot = "scatter")
The plots indicate some clear relationships, such as hitting more doubles or more home runs clearly improves the number of wins. However, they also reveal some significant issues with the data. Most of the predictor variables are skewed or non-normally distributed and will need to be transformed. Additionally, many data points contain missing data that will need to be either imputed or discarded. It also appears that there is some missing data encoded as 0 and some nonsensical outliers. There is a team with 0 wins in the dataset. This seems unlikely. Many of the hitting categories include teams at 0; it is unlikely that a team hit 0 home runs throughout a season. The pitching variables also include many 0’s. There are multiple teams with 0 strikeouts by their pitchers over the season, which is extremely unlikely. The pitching data also includes strange outliers such as a team logging 20,000 strikeouts which would be an average of 160 strikeouts per game. Also, team pitching walks and team pitching hits have strange outliers. Lastly, the error variable does make sense. teams usually score 2 or fewer errors per game, which would lead to an overall team error of approximately 320 throughout a season. This does not match the scale of the error variable. Errors may have become much less frequent over time which would account for the high error data.
BATTING_HBP field was removed from the data as it was missing >90% of the data and the INDEX field as it offers no information for a model.
There were missing values found in BASERUN_CS (Caught Stolen Bases), FIELDING_DP (Double plays), BASERUN_SB (Stolen Bases), BATTING_SO (Batter Strike outs), PITCHING_SO (Pitcher Strike outs) so the missing values were replaced with their corresponding median values.
There are unreasonable outliers found in PITCHING_SO (pitching strikeouts), PITCHING_H (allowed hits per game), PITCHING_BB (walks), and FIELDING_E (fielding errors) that exceed what is reasonable or possible given standard game length. Any outliers were replaced with the median for the data set. Limits that were set included: > 4000 PITCHING_SO (25 strikeouts per game), > 5000 PITCHING_H (30 hits allowed per game), > 2000 PITCHING_BB (13 walks per game), and > 480 FIELDING_E (3 errors per game).
Finally, some of the variables were highly skewed. To address this, boxcox, log and inverse log transformations were used to make them more normally distributed.
# Identify missing data by Feature and display percent breakout
missing <- colSums(df %>% sapply(is.na))
missing_pct <- round(missing / nrow(df) * 100, 2)
stack(sort(missing_pct, decreasing = TRUE))
# Drop the BATTING_HBP field
df <- df %>%
select(-BATTING_HBP)
no_outlier_df <- df
no_outlier_df$PITCHING_SO <- ifelse(no_outlier_df$PITCHING_SO > 4000, NA, no_outlier_df$PITCHING_SO)
no_outlier_df$PITCHING_H <- ifelse(no_outlier_df$PITCHING_H > 5000, NA, no_outlier_df$PITCHING_H)
no_outlier_df$PITCHING_BB <- ifelse(no_outlier_df$PITCHING_BB > 2000, NA, no_outlier_df$PITCHING_BB)
no_outlier_df$FIELDING_E <- ifelse(no_outlier_df$FIELDING_E > 480, NA, no_outlier_df$FIELDING_E)
clean_df <- no_outlier_df %>%
impute(what = 'median') %>% as.data.frame()
# From the plots, the team with 0 wins has 0 in multiple other categories. not valid data.
clean_df <- clean_df %>%
filter(TARGET_WINS != 0)
gather_clean_df <- clean_df %>%
gather(key = 'variable', value = 'value', -TARGET_WINS)
# created empty data frame to store transformed variables
df_temp <- data.frame(matrix(ncol = 1, nrow = length(clean_df$TARGET_WINS)))
# performed boxcox transformation after identifying proper lambda
df_temp$BATTING_3B <- clean_df$BATTING_3B
batting3b_lambda <- BoxCox.lambda(clean_df$BATTING_3B)
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
df_temp$BATTING_3B_transform <- log(clean_df$BATTING_3B)
# performed boxcox transformation after identifying proper lambda
df_temp$BATTING_HR <- clean_df$BATTING_HR
battingHR_lambda <- BoxCox.lambda(clean_df$BATTING_HR)
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
df_temp$BATTING_HR_transform <- BoxCox(clean_df$BATTING_HR, battingHR_lambda)
# performed a log transformation
df_temp$PITCHING_BB <- clean_df$PITCHING_BB
df_temp$PITCHING_BB_transform <- log(clean_df$PITCHING_BB)
# performed a log transformation
df_temp$PITCHING_SO <- clean_df$PITCHING_SO
df_temp$PITCHING_SO_transform <- log(clean_df$PITCHING_SO)
# performed an inverse log transformation
df_temp$FIELDING_E <- clean_df$FIELDING_E
df_temp$FIELDING_E_transform <- 1/log(clean_df$FIELDING_E)
# performed a log transformation
df_temp$BASERUN_SB <- clean_df$BASERUN_SB
df_temp$BASERUN_SB_transform <- log(clean_df$BASERUN_SB)
df_temp <- df_temp[, 2:13]
# Build clean dataframe with transformation
clean_df <- data.frame(cbind(clean_df,
BATTING_3B_transform = df_temp$BATTING_3B_transform,
BATTING_HR_transform = df_temp$BATTING_HR_transform,
BASERUN_SB_transform = df_temp$BASERUN_SB_transform,
PITCHING_BB_transform = df_temp$PITCHING_BB_transform,
PITCHING_SO_transform = df_temp$PITCHING_SO_transform,
FIELDING_E_transform = df_temp$FIELDING_E_transform))
is.na(clean_df) <- sapply(clean_df, is.infinite)
# Impute missing value with the mean
mean = mean(clean_df$BATTING_3B_transform, na.rm = TRUE)
mean2 = mean(clean_df$BASERUN_SB_transform, na.rm = TRUE)
mean3 = mean(clean_df$PITCHING_SO_transform, na.rm = TRUE)
clean_df$BATTING_3B_transform[is.na(clean_df$BATTING_3B_transform)] <- mean
clean_df$BASERUN_SB_transform[is.na(clean_df$BASERUN_SB_transform)] <- mean2
clean_df$PITCHING_SO_transform[is.na(clean_df$PITCHING_SO_transform)] <- mean3
gather_clean_df <- clean_df %>%
gather(key = 'variable', value = 'value', -TARGET_WINS)
## Warning: attributes are not identical across measure variables; they will be
## dropped
After preparing the clean data frame it was necessary to run similar data visualization methods used previously to understand the transformed variables and determine their validity within the data set.
# Histogram plots of each variable
ggplot(gather_clean_df) +
geom_histogram(aes(x=value, y = ..density..), bins=30) +
geom_density(aes(x=value), color='blue') +
facet_wrap(. ~variable, scales='free', ncol=4)
# Boxplots for each variable
ggplot(gather_clean_df, aes(variable, value)) +
geom_boxplot() +
facet_wrap(. ~variable, scales='free', ncol=6)
Finally, a correlation table was produced to understand each of the variable’s relationship to each other. These values determine the statistical significance of each variable and show the importance of including them in regression models.
# Correlation Table
M = cor(clean_df, use = "na.or.complete")
corrplot(M, method = 'number', type = 'lower', diag = FALSE, number.cex = 0.5, tl.cex = 0.5, cl.cex = 0.5)
Multiple linear regression model (Model #1) included all
non-transformed features that hadn’t been removed following the data
cleaning process mentioned above. Next, in Model #2, to select the
optimal set of features, the stepAIC function in R was used
to perform step wise selection on the initial model to help reduce the
number of features and address multicollinearity issues from Model #1.
Finally, the last model (Model #3) included all data from the cleaned
dataset, but also contained some of the transformed features in
replacement of their non-transformed counterparts. Additionally, this
model utilized stepwise selection to determine the optimal set of
features and to reduce multicollinearity as much as possible.
Negative values for coefficients that expected to be positive
BATTING_2B - logically more doubles hit by batters would increase the likelihood of winning more games
BATTING_HR - logically more homeruns hit by batters would increase the likelihood of winning more games
PITCHING_SO - logically more strikeouts thrown by pitchers would increase the likelihood of winning more games
FIELDING_DP - logically more double plays turned by a team would increase the likelihood of winning more games
Positive values for coefficients expected to be negative
BATTING_SO - logically more strikeouts by batters would lead to less of a likelihood of winning more games
PITCHING_H - logically more hits given up by pitchers would lead to less of a likelihood of winning more games
PITCHING_HR - logically more homeruns given up by pitchers would lead to less of a likelihood of winning more games
This is a trend seen throughout the three models that were built which can likely be due to multicollinearity. Since it was noticed in the initial data exploration that many variables in the dataset were highly correlated with one another, this phenomenon likely is increasing the variance of the coefficient estimates, making them difficult to interpret. This was also supported by the Variance Inflation Factor (VIF) tests, which showed high values for features such as BATTING_HR, BATTING_SO, PITCHING_HR, and PITCHING_SO. In the final model (Model #3 certain variables were removed that had high VIF scores through the stepwise selection process.
# Model 1 - Include all features and leverage stepAIC to help identify ideal features
multi_lm <- lm(TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B + BATTING_HR + BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS + PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E + FIELDING_DP, clean_df)
lm_s <- summary(multi_lm)
print(lm_s)
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B +
## BATTING_HR + BATTING_BB + BATTING_SO + BASERUN_SB + BASERUN_CS +
## PITCHING_H + PITCHING_HR + PITCHING_BB + PITCHING_SO + FIELDING_E +
## FIELDING_DP, data = clean_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.246 -8.040 0.149 8.459 64.706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.8791382 5.5609672 3.575 0.000358 ***
## BATTING_H 0.0435068 0.0037271 11.673 < 2e-16 ***
## BATTING_2B -0.0181980 0.0091059 -1.998 0.045785 *
## BATTING_3B 0.0887800 0.0166055 5.346 9.87e-08 ***
## BATTING_HR -0.0498772 0.0307896 -1.620 0.105385
## BATTING_BB 0.0668013 0.0051197 13.048 < 2e-16 ***
## BATTING_SO 0.0043819 0.0039536 1.108 0.267841
## BASERUN_SB 0.0199677 0.0041255 4.840 1.39e-06 ***
## BASERUN_CS 0.0054960 0.0154463 0.356 0.722013
## PITCHING_H 0.0031034 0.0009168 3.385 0.000723 ***
## PITCHING_HR 0.0875913 0.0272043 3.220 0.001301 **
## PITCHING_BB -0.0343166 0.0043823 -7.831 7.39e-15 ***
## PITCHING_SO -0.0059484 0.0028921 -2.057 0.039826 *
## FIELDING_E -0.0431197 0.0051761 -8.331 < 2e-16 ***
## FIELDING_DP -0.1453152 0.0132805 -10.942 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.01 on 2260 degrees of freedom
## Multiple R-squared: 0.3141, Adjusted R-squared: 0.3099
## F-statistic: 73.93 on 14 and 2260 DF, p-value: < 2.2e-16
residPlot(lm_s)
# Model 1 - Display Variable feature importance plot
variableImportancePlot(multi_lm, "Model 1 LM Variable Importance")
# model 1 - print variable inflation factor score
print('VIF scores of predictors')
## [1] "VIF scores of predictors"
VIF(multi_lm)
## BATTING_H BATTING_2B BATTING_3B BATTING_HR BATTING_BB BATTING_SO
## 3.874288 2.434501 2.886646 46.634071 5.260219 12.340845
## BASERUN_SB BASERUN_CS PITCHING_H PITCHING_HR PITCHING_BB PITCHING_SO
## 1.666296 1.122657 1.987088 37.311322 4.129284 7.258820
## FIELDING_E FIELDING_DP
## 1.970051 1.426686
The second model, step wise selection was used to determine the optimal set of features from the original cleaned data set.
# Build model 2 - this is Model 1 with only significant features (using stepAIC)
mult_lm_final <- stepAIC(multi_lm, direction = "both",
scope = list(upper = multi_lm, lower = ~ 1),
scale = 0, trace = FALSE)
# Display Model 2 Summary
lmf_s <- summary(mult_lm_final)
print(lmf_s)
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B +
## BATTING_BB + BASERUN_SB + PITCHING_H + PITCHING_HR + PITCHING_BB +
## PITCHING_SO + FIELDING_E + FIELDING_DP, data = clean_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.051 -7.965 0.164 8.359 65.549
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.7189057 4.9999664 4.344 1.46e-05 ***
## BATTING_H 0.0428227 0.0036457 11.746 < 2e-16 ***
## BATTING_2B -0.0172282 0.0089438 -1.926 0.054197 .
## BATTING_3B 0.0917609 0.0161863 5.669 1.62e-08 ***
## BATTING_BB 0.0643346 0.0041503 15.501 < 2e-16 ***
## BASERUN_SB 0.0209334 0.0039963 5.238 1.77e-07 ***
## PITCHING_H 0.0027985 0.0008467 3.305 0.000964 ***
## PITCHING_HR 0.0466418 0.0083051 5.616 2.19e-08 ***
## PITCHING_BB -0.0323388 0.0034282 -9.433 < 2e-16 ***
## PITCHING_SO -0.0031173 0.0016399 -1.901 0.057446 .
## FIELDING_E -0.0425346 0.0051451 -8.267 2.31e-16 ***
## FIELDING_DP -0.1466897 0.0132112 -11.103 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.01 on 2263 degrees of freedom
## Multiple R-squared: 0.3132, Adjusted R-squared: 0.3098
## F-statistic: 93.8 on 11 and 2263 DF, p-value: < 2.2e-16
# Display Model 2 REsidual plots
residPlot(lmf_s)
# model 2 - Display Variable feature importance plot
variableImportancePlot(mult_lm_final, "Model 2 LM Variable Importance")
# model 2 - print variable inflation factor score
print('VIF scores of predictors')
## [1] "VIF scores of predictors"
VIF(mult_lm_final)
## BATTING_H BATTING_2B BATTING_3B BATTING_BB BASERUN_SB PITCHING_H
## 3.706640 2.348442 2.742530 3.456414 1.563497 1.694787
## PITCHING_HR PITCHING_BB PITCHING_SO FIELDING_E FIELDING_DP
## 3.477116 2.526799 2.333742 1.946421 1.411744
The third model utilizes some of the transformed variables to compare against the initial model. Additionally, similar to model 2, stepwise selection was used to determine feature importance and select the simplest model possible.
# Model 3 - Build linear model
multi_lm_2 <- lm(TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B_transform + BATTING_HR_transform + BATTING_BB + BATTING_SO + BASERUN_SB_transform + BASERUN_CS + PITCHING_H + PITCHING_HR + PITCHING_BB_transform + PITCHING_SO_transform + FIELDING_E_transform + FIELDING_DP, clean_df)
# Model 3 - Show model summary info
lm_s_2 <- summary(multi_lm_2)
lm_s_2
##
## Call:
## lm(formula = TARGET_WINS ~ BATTING_H + BATTING_2B + BATTING_3B_transform +
## BATTING_HR_transform + BATTING_BB + BATTING_SO + BASERUN_SB_transform +
## BASERUN_CS + PITCHING_H + PITCHING_HR + PITCHING_BB_transform +
## PITCHING_SO_transform + FIELDING_E_transform + FIELDING_DP,
## data = clean_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.074 -7.938 0.122 8.547 64.835
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.054e+01 1.907e+01 0.553 0.581
## BATTING_H 4.316e-02 3.717e-03 11.611 < 2e-16 ***
## BATTING_2B -1.953e-02 9.047e-03 -2.159 0.031 *
## BATTING_3B_transform 6.194e+00 9.093e-01 6.812 1.23e-11 ***
## BATTING_HR_transform 3.564e-01 3.070e-01 1.161 0.246
## BATTING_BB 6.532e-02 5.689e-03 11.482 < 2e-16 ***
## BATTING_SO -4.883e-03 3.794e-03 -1.287 0.198
## BASERUN_SB_transform 2.742e+00 5.782e-01 4.743 2.24e-06 ***
## BASERUN_CS 1.887e-03 1.605e-02 0.118 0.906
## PITCHING_H 1.361e-03 9.483e-04 1.435 0.151
## PITCHING_HR 2.305e-02 1.804e-02 1.278 0.201
## PITCHING_BB_transform -1.641e+01 2.391e+00 -6.862 8.71e-12 ***
## PITCHING_SO_transform 1.444e+00 2.512e+00 0.575 0.566
## FIELDING_E_transform 2.590e+02 3.210e+01 8.070 1.13e-15 ***
## FIELDING_DP -1.438e-01 1.363e-02 -10.549 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.03 on 2260 degrees of freedom
## Multiple R-squared: 0.3121, Adjusted R-squared: 0.3079
## F-statistic: 73.25 on 14 and 2260 DF, p-value: < 2.2e-16
residPlot(lm_s_2)
# Model 3 - use stepAIC to select significant features
mult_lm_final_2 <- stepAIC(multi_lm_2, direction = "both",
scope = list(upper = multi_lm_2, lower = ~ 1),
scale = 0, trace = FALSE)
# Model 3 - Show feature importance in final model after stepAIC
variableImportancePlot(mult_lm_final_2, "Model 3 LM Variable Importance")
# Model 3 - show Variance Inflation Factor score
print('VIF scores of predictors')
## [1] "VIF scores of predictors"
VIF(mult_lm_final_2)
## BATTING_H BATTING_2B BATTING_3B_transform
## 2.571455 2.179680 2.626343
## BATTING_BB BASERUN_SB_transform PITCHING_H
## 3.858783 1.334910 1.630391
## PITCHING_HR PITCHING_BB_transform FIELDING_E_transform
## 2.993639 2.997260 2.284460
## FIELDING_DP
## 1.418338
The following table discusses each of the model performance metrics on the data set. These values indicate there is a minor improvement in model performance after applying transformations and selecting for significant parameters.
All models achieved a positive adjusted R^2 pointing to a weakly-positive relationship between the target variable TOTAL_WINs and the model predictor variables.
From the previous section and residual diagnostics (both the residual histogram and QQ plot)–all three models have normally distributed residuals with mean 0. There appears to be no heteroskedasticity (non-constant variance) or serial-correlation within the residuals–thus strictly from a linear regression perspective, none of the assumptions are violated and thus we can make inferential predictions based on our model.
The F-statistics for all models were large and achieved a statistically significant p-value less than a default alpha of .05; thus for all three models the Null hypothesis that is rejected that 0s for the regression coefficients would fit the data better.
From the exploratory data analysis, many of the predictor variables were correlated with each other. However, this could be an issue from a multicolinearity, that can lead to unstable regression fits. From the previous section VIF (Variable Inflation Factor) was used to gauge better models by preferring VIF values less than 5.
The following figure represents the performance of 3 of the models on both the testing and on the data set. for this multiple linear regression problem, the adjusted R2 value was used as the metric for performance on predicting the target wins.
# Build summary table with each model performance metrics
SummaryTable <- bind_rows(
model_performance_extraction(lm_s),
model_performance_extraction(lmf_s),
model_performance_extraction(lm_s_2)
)
# Add row names
rownames(SummaryTable) <- c("Model 1", "Model 2", "Model 3")
# Display nice format table
SummaryTable %>%
kable() %>%
kable_styling(
bootstrap_options = c("hover", "condensed", "responsive"),
full_width = F)
| RSE | Adj.R2 | F.Statistic | |
|---|---|---|---|
| Model 1 | 13.01296 | 0.3098646 | 73.92897 |
| Model 2 | 13.01342 | 0.3098152 | 93.79725 |
| Model 3 | 13.03182 | 0.3078626 | 73.24820 |
# Splitting the dataset into train/testing set with 80/20 split
set.seed(3)
df_split <- initial_split(clean_df, prop = 0.8)
df_train <- training(df_split)
df_test <- testing(df_split)
# Build prediction results dataset for plotting (each model with training and test data)
results <- multi_lm %>%
predict(df_train) %>%
as.data.frame() %>%
mutate(Predicted = df_train$TARGET_WINS, model = "Model 1") %>%
bind_rows(mult_lm_final %>%
predict(df_train) %>%
as.data.frame() %>%
mutate(Predicted = df_train$TARGET_WINS, model = "Model 2"),
mult_lm_final_2 %>%
predict(df_train) %>%
as.data.frame() %>%
mutate(Predicted = df_train$TARGET_WINS, model = "Model 3")) %>%
rename("Observed" = ".") %>%
mutate(dataset = "Training Set") %>%
bind_rows(
results_test <- multi_lm %>%
predict(df_test) %>%
as.data.frame() %>%
mutate(Predicted = df_test$TARGET_WINS, model = "Model 1") %>%
bind_rows(mult_lm_final %>%
predict(df_test) %>%
as.data.frame() %>%
mutate(Predicted = df_test$TARGET_WINS, model = "Model 2"),
mult_lm_final_2 %>%
predict(df_test) %>% as.data.frame() %>%
mutate(Predicted = df_test$TARGET_WINS, model = "Model 3")) %>%
rename("Observed" = ".") %>%
mutate(dataset = "Testing Set")
)
# Plot each model with datapoints, model prediction line and R^2
results %>%
ggplot(mapping = aes(x = Observed, y = Predicted)) +
geom_point(pch =21, alpha = 0.3, fill = "skyblue") +
geom_smooth(method = "lm", color = "red3") +
facet_wrap(dataset~model) +
stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")),
label.x.npc = "left", label.y.npc = .9,
formula = y~x, parse = TRUE, size = 3.5)+
theme(panel.background = element_blank(),
panel.grid = element_blank())
## Warning in stat_poly_eq(aes(label = paste(..adj.rr.label.., sep = "~~~")), :
## Ignoring unknown parameters: `label.x.npc` and `label.y.npc`
## `geom_smooth()` using formula = 'y ~ x'
The analysis shows the following results:
Model 1 - This model is based on the all predictors on an un-transformed data set. Results for this model provided an R^2 of about of 0.31 on the training set and 0.3 on the testing data set.
Model 2 - This model is based on a subset of significant predictors on an un-transformed data set. Results for this model provided an R^2 of about of 0.31 on the training set and 0.31 on the testing data set.
Model 3 - This model is based on a subset of significant predictors on a transformed data set. Results for this model provided an R^2 of about 0.31 on the training set and 0.31 on the testing data set.
Based on these analyses, Model 2 and Model 3 both performed marginally better than Model 1 and could be selected as the linear model of choice when looking at adjusted R2. However, there is greater statistical significance under the third model relative to the others, and uses fewer unnecessary variables to compute our prediction without sacrificing much in terms of adjusted R^2 value. Additionally, Model 3 seems to have lower VIF scores than Models 1 and 2. Because of these factors, as well as its better adjustment for multicollinearity (we noticeably saw less sign-flipping in coefficients), model 3 would be the model of choice.