1 Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(visdat)
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(readr)
library(dplyr)

2 Import Data

df_train <- read.csv("moneyball-training-data.csv")
vis_dat(df_train)

3 Cleaning the Data

vis_dat(df_train)

df_train1 <- df_train[ , c(-1, -11)]
df_train1 <- na.omit(df_train1)
df_train2 <- df_train1
df_train3 <- df_train1
df_train4 <- df_train1
vis_dat(df_train1)

4 Plotting the Data

ggplot(data = df_train1, 
       mapping = aes(x = TARGET_WINS)) + 
  geom_histogram(color = "black", fill = "red", bins = 30) + 
  labs(title = "Histogram of Target Wins",
       x = "Wins", y = "Count")

5 Regressions

reg1 <- stepAIC(object = lm(data = df_train1, TARGET_WINS ~ .), 
                direction = c("backward"))
## Start:  AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_BB  1       0.8 134324 6721.2
## - TEAM_PITCHING_HR  1       7.2 134330 6721.3
## - TEAM_BATTING_SO   1      55.2 134378 6721.8
## - TEAM_BATTING_H    1      56.5 134380 6721.8
## - TEAM_BATTING_HR   1      68.5 134392 6721.9
## - TEAM_BATTING_BB   1      81.0 134404 6722.1
## - TEAM_PITCHING_H   1      98.0 134421 6722.3
## <none>                          134323 6723.2
## - TEAM_PITCHING_SO  1     264.1 134587 6724.1
## - TEAM_BASERUN_CS   1     746.8 135070 6729.4
## - TEAM_BASERUN_SB   1    1557.8 135881 6738.3
## - TEAM_BATTING_3B   1    4838.9 139162 6773.8
## - TEAM_BATTING_2B   1    5166.3 139489 6777.3
## - TEAM_FIELDING_DP  1    6742.5 141066 6794.0
## - TEAM_FIELDING_E   1   22427.4 156751 6950.6
## 
## Step:  AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_PITCHING_HR  1       6.4 134330 6719.3
## - TEAM_BATTING_SO   1      56.2 134380 6719.8
## - TEAM_BATTING_HR   1      77.9 134402 6720.1
## - TEAM_BATTING_H    1     147.2 134471 6720.8
## <none>                          134324 6721.2
## - TEAM_PITCHING_H   1     197.5 134521 6721.4
## - TEAM_PITCHING_SO  1     266.3 134590 6722.1
## - TEAM_BASERUN_CS   1     746.5 135070 6727.4
## - TEAM_BASERUN_SB   1    1564.2 135888 6736.4
## - TEAM_BATTING_3B   1    4840.8 139165 6771.8
## - TEAM_BATTING_2B   1    5175.9 139500 6775.4
## - TEAM_FIELDING_DP  1    6744.6 141069 6792.0
## - TEAM_BATTING_BB   1   12568.9 146893 6852.1
## - TEAM_FIELDING_E   1   22491.7 156816 6949.2
## 
## Step:  AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## - TEAM_BATTING_SO   1      51.2 134382 6717.8
## - TEAM_BATTING_H    1     144.7 134475 6718.9
## <none>                          134330 6719.3
## - TEAM_PITCHING_H   1     202.0 134532 6719.5
## - TEAM_PITCHING_SO  1     298.0 134628 6720.6
## - TEAM_BASERUN_CS   1     742.6 135073 6725.5
## - TEAM_BASERUN_SB   1    1570.4 135901 6734.5
## - TEAM_BATTING_3B   1    4842.6 139173 6769.9
## - TEAM_BATTING_2B   1    5198.7 139529 6773.7
## - TEAM_FIELDING_DP  1    6744.4 141075 6790.1
## - TEAM_BATTING_HR   1    9780.8 144111 6821.7
## - TEAM_BATTING_BB   1   12606.9 146937 6850.6
## - TEAM_FIELDING_E   1   22525.1 156855 6947.6
## 
## Step:  AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          134382 6717.8
## - TEAM_BASERUN_CS   1     737.6 135119 6724.0
## - TEAM_PITCHING_H   1    1355.1 135737 6730.7
## - TEAM_BASERUN_SB   1    1575.6 135957 6733.2
## - TEAM_BATTING_H    1    1740.1 136122 6734.9
## - TEAM_BATTING_3B   1    4849.8 139231 6768.5
## - TEAM_BATTING_2B   1    5148.1 139530 6771.7
## - TEAM_FIELDING_DP  1    6779.2 141161 6789.0
## - TEAM_PITCHING_SO  1    7395.1 141777 6795.4
## - TEAM_BATTING_HR   1    9785.1 144167 6820.3
## - TEAM_BATTING_BB   1   12619.7 147001 6849.2
## - TEAM_FIELDING_E   1   22552.0 156934 6946.4
final_reg <- reg1
stargazer(final_reg, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                             TARGET_WINS        
## -----------------------------------------------
## TEAM_BATTING_H               0.026***          
##                               (0.006)          
##                                                
## TEAM_BATTING_2B              -0.070***         
##                               (0.009)          
##                                                
## TEAM_BATTING_3B              0.162***          
##                               (0.022)          
##                                                
## TEAM_BATTING_HR              0.098***          
##                               (0.009)          
##                                                
## TEAM_BATTING_BB              0.039***          
##                               (0.003)          
##                                                
## TEAM_BASERUN_SB              0.036***          
##                               (0.009)          
##                                                
## TEAM_BASERUN_CS              0.052***          
##                               (0.018)          
##                                                
## TEAM_PITCHING_H              0.009***          
##                               (0.002)          
##                                                
## TEAM_PITCHING_SO             -0.021***         
##                               (0.002)          
##                                                
## TEAM_FIELDING_E              -0.156***         
##                               (0.010)          
##                                                
## TEAM_FIELDING_DP             -0.113***         
##                               (0.013)          
##                                                
## Constant                     58.446***         
##                               (6.589)          
##                                                
## -----------------------------------------------
## Observations                   1,486           
## R2                             0.438           
## Adjusted R2                    0.434           
## Residual Std. Error      9.548 (df = 1474)     
## F Statistic         104.596*** (df = 11; 1474) 
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

5.1 Logistic Regression

logistic_model <- glm(data = df_train1, 
                      formula = final_reg,
                      family = gaussian(link = "identity"))

ols_model <- lm(data = df_train1, 
                formula = final_reg)

stargazer(ols_model, logistic_model, type = "text")
## 
## =========================================================
##                              Dependent variable:         
##                     -------------------------------------
##                                  TARGET_WINS             
##                                OLS               normal  
##                                (1)                (2)    
## ---------------------------------------------------------
## TEAM_BATTING_H               0.026***           0.026*** 
##                              (0.006)            (0.006)  
##                                                          
## TEAM_BATTING_2B             -0.070***          -0.070*** 
##                              (0.009)            (0.009)  
##                                                          
## TEAM_BATTING_3B              0.162***           0.162*** 
##                              (0.022)            (0.022)  
##                                                          
## TEAM_BATTING_HR              0.098***           0.098*** 
##                              (0.009)            (0.009)  
##                                                          
## TEAM_BATTING_BB              0.039***           0.039*** 
##                              (0.003)            (0.003)  
##                                                          
## TEAM_BASERUN_SB              0.036***           0.036*** 
##                              (0.009)            (0.009)  
##                                                          
## TEAM_BASERUN_CS              0.052***           0.052*** 
##                              (0.018)            (0.018)  
##                                                          
## TEAM_PITCHING_H              0.009***           0.009*** 
##                              (0.002)            (0.002)  
##                                                          
## TEAM_PITCHING_SO            -0.021***          -0.021*** 
##                              (0.002)            (0.002)  
##                                                          
## TEAM_FIELDING_E             -0.156***          -0.156*** 
##                              (0.010)            (0.010)  
##                                                          
## TEAM_FIELDING_DP            -0.113***          -0.113*** 
##                              (0.013)            (0.013)  
##                                                          
## Constant                    58.446***          58.446*** 
##                              (6.589)            (6.589)  
##                                                          
## ---------------------------------------------------------
## Observations                  1,486              1,486   
## R2                            0.438                      
## Adjusted R2                   0.434                      
## Log Likelihood                                 -5,456.457
## Akaike Inf. Crit.                              10,936.910
## Residual Std. Error     9.548 (df = 1474)                
## F Statistic         104.596*** (df = 11; 1474)           
## =========================================================
## Note:                         *p<0.1; **p<0.05; ***p<0.01

6 Prediction

predicted <- predict(object = logistic_model, 
                     newdata = df_train1)
summary(predicted)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   46.79   75.46   80.85   81.00   86.66  106.92
predicted_logistic_binary <- ifelse(test = predicted > 80.79, 
                                    yes = 1, no = 0)

table(predicted_logistic_binary)
## predicted_logistic_binary
##   0   1 
## 732 754
summary(df_train2$TARGET_WINS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    41.0    72.0    81.5    81.0    90.0   117.0
target_wins_binary <- ifelse(test = df_train2$TARGET_WINS > 80.79,
                             yes = 1, no =0)
table(target_wins_binary)
## target_wins_binary
##   0   1 
## 707 779

7 Confusion Matrix

confusion_matrix_by_hand <- table(target_wins_binary, 
                                  predicted_logistic_binary)

rownames(confusion_matrix_by_hand) <- c("Actual Losing Season", "Actual Winning Season")
colnames(confusion_matrix_by_hand) <- c("Predicted Losing Season", "Predicted Winning Season")

confusion_matrix_by_hand
##                        predicted_logistic_binary
## target_wins_binary      Predicted Losing Season Predicted Winning Season
##   Actual Losing Season                      516                      191
##   Actual Winning Season                     216                      563
sensitivity <- 789/(789+414)
specificity <- 695/(695+378)

sensitivity
## [1] 0.6558603
specificity
## [1] 0.6477167
confusionMatrix(data = factor(predicted_logistic_binary),
                reference = factor(target_wins_binary),
                positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 516 216
##          1 191 563
##                                           
##                Accuracy : 0.7261          
##                  95% CI : (0.7027, 0.7487)
##     No Information Rate : 0.5242          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4518          
##                                           
##  Mcnemar's Test P-Value : 0.2342          
##                                           
##             Sensitivity : 0.7227          
##             Specificity : 0.7298          
##          Pos Pred Value : 0.7467          
##          Neg Pred Value : 0.7049          
##              Prevalence : 0.5242          
##          Detection Rate : 0.3789          
##    Detection Prevalence : 0.5074          
##       Balanced Accuracy : 0.7263          
##                                           
##        'Positive' Class : 1               
## 

8 Residuals

res <- resid(final_reg)

plot(df_train4$TARGET_WINS, 
     res, 
     xlab = "Target Wins", ylab = "Residuals", main = "Moneyball Target Wins")
abline(0,0)

qqnorm(res)
qqline(res)

plot(density(res))

9 Graphs and Models

9.1 Hits

ggplot(data = df_train1, aes(x = TEAM_BATTING_H, y = TARGET_WINS)) + geom_point() +  geom_smooth(method = lm)
## `geom_smooth()` using formula = 'y ~ x'

9.2 Doubles

ggplot(data = df_train1, aes(x = TEAM_BATTING_2B, y = TARGET_WINS)) + geom_point() +  geom_smooth( method = lm)
## `geom_smooth()` using formula = 'y ~ x'

9.3 Triples

ggplot(data = df_train1, aes(x = TEAM_BATTING_3B, y = TARGET_WINS)) + geom_point() +  geom_smooth( method = lm)
## `geom_smooth()` using formula = 'y ~ x'

9.4 Walks

ggplot(data = df_train1, aes(x = TEAM_BATTING_BB, y = TARGET_WINS)) +   geom_point() +   geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

9.5 Homerun

ggplot(data = df_train1, aes(x = TEAM_BATTING_HR, y = TARGET_WINS)) +   geom_point() +   geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

9.6 Hits Allowed

ggplot(data = df_train1, aes(x = TEAM_PITCHING_H, y = TARGET_WINS)) +   geom_point() +   geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

9.7 Strike-Outs

ggplot(data = df_train1, aes(x = TEAM_PITCHING_SO, y = TARGET_WINS)) +   geom_point() +   geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

9.8 Errors

ggplot(data = df_train1, aes(x = TEAM_FIELDING_E, y = TARGET_WINS)) +   geom_point() +   geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

9.9 Double-Plays

ggplot(data = df_train1, aes(x = TEAM_FIELDING_DP, y = TARGET_WINS)) +   geom_point() +   geom_smooth(model = lm)
## Warning in geom_smooth(model = lm): Ignoring unknown parameters: `model`
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'