Setup
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(visdat)
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(readr)
library(dplyr)
Import Data
df_train <- read.csv("moneyball-training-data.csv")
vis_dat(df_train)

Cleaning the Data
vis_dat(df_train)

df_train1 <- df_train[ , c(-1, -11)]
df_train1 <- na.omit(df_train1)
df_train2 <- df_train1
df_train3 <- df_train1
df_train4 <- df_train1
vis_dat(df_train1)

Plotting the Data
ggplot(data = df_train1,
mapping = aes(x = TARGET_WINS)) +
geom_histogram(color = "black", fill = "red", bins = 30) +
labs(title = "Histogram of Target Wins",
x = "Wins", y = "Count")

Regressions
reg1 <- stepAIC(object = lm(data = df_train1, TARGET_WINS ~ .),
direction = c("backward"))
## Start: AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_BB 1 0.8 134324 6721.2
## - TEAM_PITCHING_HR 1 7.2 134330 6721.3
## - TEAM_BATTING_SO 1 55.2 134378 6721.8
## - TEAM_BATTING_H 1 56.5 134380 6721.8
## - TEAM_BATTING_HR 1 68.5 134392 6721.9
## - TEAM_BATTING_BB 1 81.0 134404 6722.1
## - TEAM_PITCHING_H 1 98.0 134421 6722.3
## <none> 134323 6723.2
## - TEAM_PITCHING_SO 1 264.1 134587 6724.1
## - TEAM_BASERUN_CS 1 746.8 135070 6729.4
## - TEAM_BASERUN_SB 1 1557.8 135881 6738.3
## - TEAM_BATTING_3B 1 4838.9 139162 6773.8
## - TEAM_BATTING_2B 1 5166.3 139489 6777.3
## - TEAM_FIELDING_DP 1 6742.5 141066 6794.0
## - TEAM_FIELDING_E 1 22427.4 156751 6950.6
##
## Step: AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_HR 1 6.4 134330 6719.3
## - TEAM_BATTING_SO 1 56.2 134380 6719.8
## - TEAM_BATTING_HR 1 77.9 134402 6720.1
## - TEAM_BATTING_H 1 147.2 134471 6720.8
## <none> 134324 6721.2
## - TEAM_PITCHING_H 1 197.5 134521 6721.4
## - TEAM_PITCHING_SO 1 266.3 134590 6722.1
## - TEAM_BASERUN_CS 1 746.5 135070 6727.4
## - TEAM_BASERUN_SB 1 1564.2 135888 6736.4
## - TEAM_BATTING_3B 1 4840.8 139165 6771.8
## - TEAM_BATTING_2B 1 5175.9 139500 6775.4
## - TEAM_FIELDING_DP 1 6744.6 141069 6792.0
## - TEAM_BATTING_BB 1 12568.9 146893 6852.1
## - TEAM_FIELDING_E 1 22491.7 156816 6949.2
##
## Step: AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_SO 1 51.2 134382 6717.8
## - TEAM_BATTING_H 1 144.7 134475 6718.9
## <none> 134330 6719.3
## - TEAM_PITCHING_H 1 202.0 134532 6719.5
## - TEAM_PITCHING_SO 1 298.0 134628 6720.6
## - TEAM_BASERUN_CS 1 742.6 135073 6725.5
## - TEAM_BASERUN_SB 1 1570.4 135901 6734.5
## - TEAM_BATTING_3B 1 4842.6 139173 6769.9
## - TEAM_BATTING_2B 1 5198.7 139529 6773.7
## - TEAM_FIELDING_DP 1 6744.4 141075 6790.1
## - TEAM_BATTING_HR 1 9780.8 144111 6821.7
## - TEAM_BATTING_BB 1 12606.9 146937 6850.6
## - TEAM_FIELDING_E 1 22525.1 156855 6947.6
##
## Step: AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 134382 6717.8
## - TEAM_BASERUN_CS 1 737.6 135119 6724.0
## - TEAM_PITCHING_H 1 1355.1 135737 6730.7
## - TEAM_BASERUN_SB 1 1575.6 135957 6733.2
## - TEAM_BATTING_H 1 1740.1 136122 6734.9
## - TEAM_BATTING_3B 1 4849.8 139231 6768.5
## - TEAM_BATTING_2B 1 5148.1 139530 6771.7
## - TEAM_FIELDING_DP 1 6779.2 141161 6789.0
## - TEAM_PITCHING_SO 1 7395.1 141777 6795.4
## - TEAM_BATTING_HR 1 9785.1 144167 6820.3
## - TEAM_BATTING_BB 1 12619.7 147001 6849.2
## - TEAM_FIELDING_E 1 22552.0 156934 6946.4
final_reg <- reg1
stargazer(final_reg, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## TARGET_WINS
## -----------------------------------------------
## TEAM_BATTING_H 0.026***
## (0.006)
##
## TEAM_BATTING_2B -0.070***
## (0.009)
##
## TEAM_BATTING_3B 0.162***
## (0.022)
##
## TEAM_BATTING_HR 0.098***
## (0.009)
##
## TEAM_BATTING_BB 0.039***
## (0.003)
##
## TEAM_BASERUN_SB 0.036***
## (0.009)
##
## TEAM_BASERUN_CS 0.052***
## (0.018)
##
## TEAM_PITCHING_H 0.009***
## (0.002)
##
## TEAM_PITCHING_SO -0.021***
## (0.002)
##
## TEAM_FIELDING_E -0.156***
## (0.010)
##
## TEAM_FIELDING_DP -0.113***
## (0.013)
##
## Constant 58.446***
## (6.589)
##
## -----------------------------------------------
## Observations 1,486
## R2 0.438
## Adjusted R2 0.434
## Residual Std. Error 9.548 (df = 1474)
## F Statistic 104.596*** (df = 11; 1474)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Logistic
Regression
logistic_model <- glm(data = df_train1,
formula = final_reg,
family = gaussian(link = "identity"))
ols_model <- lm(data = df_train1,
formula = final_reg)
stargazer(ols_model, logistic_model, type = "text")
##
## =========================================================
## Dependent variable:
## -------------------------------------
## TARGET_WINS
## OLS normal
## (1) (2)
## ---------------------------------------------------------
## TEAM_BATTING_H 0.026*** 0.026***
## (0.006) (0.006)
##
## TEAM_BATTING_2B -0.070*** -0.070***
## (0.009) (0.009)
##
## TEAM_BATTING_3B 0.162*** 0.162***
## (0.022) (0.022)
##
## TEAM_BATTING_HR 0.098*** 0.098***
## (0.009) (0.009)
##
## TEAM_BATTING_BB 0.039*** 0.039***
## (0.003) (0.003)
##
## TEAM_BASERUN_SB 0.036*** 0.036***
## (0.009) (0.009)
##
## TEAM_BASERUN_CS 0.052*** 0.052***
## (0.018) (0.018)
##
## TEAM_PITCHING_H 0.009*** 0.009***
## (0.002) (0.002)
##
## TEAM_PITCHING_SO -0.021*** -0.021***
## (0.002) (0.002)
##
## TEAM_FIELDING_E -0.156*** -0.156***
## (0.010) (0.010)
##
## TEAM_FIELDING_DP -0.113*** -0.113***
## (0.013) (0.013)
##
## Constant 58.446*** 58.446***
## (6.589) (6.589)
##
## ---------------------------------------------------------
## Observations 1,486 1,486
## R2 0.438
## Adjusted R2 0.434
## Log Likelihood -5,456.457
## Akaike Inf. Crit. 10,936.910
## Residual Std. Error 9.548 (df = 1474)
## F Statistic 104.596*** (df = 11; 1474)
## =========================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Prediction
predicted <- predict(object = logistic_model,
newdata = df_train1)
summary(predicted)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 46.79 75.46 80.85 81.00 86.66 106.92
predicted_logistic_binary <- ifelse(test = predicted > 80.79,
yes = 1, no = 0)
table(predicted_logistic_binary)
## predicted_logistic_binary
## 0 1
## 732 754
summary(df_train2$TARGET_WINS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41.0 72.0 81.5 81.0 90.0 117.0
target_wins_binary <- ifelse(test = df_train2$TARGET_WINS > 80.79,
yes = 1, no =0)
table(target_wins_binary)
## target_wins_binary
## 0 1
## 707 779
Confusion Matrix
confusion_matrix_by_hand <- table(target_wins_binary,
predicted_logistic_binary)
rownames(confusion_matrix_by_hand) <- c("Actual Losing Season", "Actual Winning Season")
colnames(confusion_matrix_by_hand) <- c("Predicted Losing Season", "Predicted Winning Season")
confusion_matrix_by_hand
## predicted_logistic_binary
## target_wins_binary Predicted Losing Season Predicted Winning Season
## Actual Losing Season 516 191
## Actual Winning Season 216 563
sensitivity <- 789/(789+414)
specificity <- 695/(695+378)
sensitivity
## [1] 0.6558603
specificity
## [1] 0.6477167
confusionMatrix(data = factor(predicted_logistic_binary),
reference = factor(target_wins_binary),
positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 516 216
## 1 191 563
##
## Accuracy : 0.7261
## 95% CI : (0.7027, 0.7487)
## No Information Rate : 0.5242
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.4518
##
## Mcnemar's Test P-Value : 0.2342
##
## Sensitivity : 0.7227
## Specificity : 0.7298
## Pos Pred Value : 0.7467
## Neg Pred Value : 0.7049
## Prevalence : 0.5242
## Detection Rate : 0.3789
## Detection Prevalence : 0.5074
## Balanced Accuracy : 0.7263
##
## 'Positive' Class : 1
##
Residuals
res <- resid(final_reg)
plot(df_train4$TARGET_WINS,
res,
xlab = "Target Wins", ylab = "Residuals", main = "Moneyball Target Wins")
abline(0,0)

qqnorm(res)
qqline(res)

plot(density(res))

Graphs and Models
Hits
ggplot(data = df_train1, aes(x = TEAM_BATTING_H, y = TARGET_WINS)) + geom_point() + geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'

Doubles
ggplot(data = df_train1, aes(x = TEAM_BATTING_2B, y = TARGET_WINS)) + geom_point() + geom_smooth( method = lm)
## `geom_smooth()` using formula = 'y ~ x'

Triples
ggplot(data = df_train1, aes(x = TEAM_BATTING_3B, y = TARGET_WINS)) + geom_point() + geom_smooth( method = lm)
## `geom_smooth()` using formula = 'y ~ x'

Walks
ggplot(data = df_train1, aes(x = TEAM_BATTING_BB, y = TARGET_WINS)) + geom_point() + geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Homerun
ggplot(data = df_train1, aes(x = TEAM_BATTING_HR, y = TARGET_WINS)) + geom_point() + geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Hits Allowed
ggplot(data = df_train1, aes(x = TEAM_PITCHING_H, y = TARGET_WINS)) + geom_point() + geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Strike-Outs
ggplot(data = df_train1, aes(x = TEAM_PITCHING_SO, y = TARGET_WINS)) + geom_point() + geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Errors
ggplot(data = df_train1, aes(x = TEAM_FIELDING_E, y = TARGET_WINS)) + geom_point() + geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Double-Plays
ggplot(data = df_train1, aes(x = TEAM_FIELDING_DP, y = TARGET_WINS)) + geom_point() + geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
