# NOTE: run this script in a fresh R session (e.g. `Rscript`, or "Restart R"
# in the IDE). The former `rm(list = ls())` call was removed on purpose: it
# deleted every object in the caller's workspace whenever the file was
# sourced, yet left attached packages and changed options in place, so it
# never produced a truly clean state. The code visible in this script only
# reads objects that the script itself creates first.
library(ggplot2)
library(visdat)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(reshape2)
# Read the raw training data from the current working directory.
dfmoneyball <- read.csv(file = "moneyball-training-data.csv")

# Draw an at-a-glance overview of the raw data frame.
vis_dat(dfmoneyball)

# Remove the 11th column by position, then discard every row that still
# contains a missing value.
# NOTE(review): the column is selected by index, so this silently drops a
# different column if the CSV layout changes -- confirm which column is meant
# and consider dropping it by name.
dfmoneyball2 <- dfmoneyball[, -11]
dfmoneyballclean <- na.omit(object = dfmoneyball2)

# Show the first rows of the cleaned data as a sanity check.
head(dfmoneyballclean, n = 6L)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 2 2 70 1339 219 22
## 3 3 86 1377 232 35
## 4 4 70 1387 209 38
## 5 5 82 1297 186 27
## 6 6 75 1279 200 36
## 7 7 80 1244 179 54
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 2 190 685 1075 37
## 3 137 602 917 46
## 4 96 451 922 43
## 5 102 472 920 49
## 6 92 443 973 107
## 7 122 525 1062 80
## TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## 2 28 1347 191 689
## 3 27 1377 137 602
## 4 30 1396 97 454
## 5 39 1297 102 472
## 6 59 1279 92 443
## 7 54 1244 122 525
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 2 1082 193 155
## 3 917 175 153
## 4 928 164 156
## 5 920 138 168
## 6 973 123 149
## 7 1062 136 186
# Plain-text table of summary statistics (mean, standard deviation, minimum,
# maximum) for every column of the cleaned data.
# The labels in `covariate.labels` are applied by position, so they must stay
# in the same order as the columns of `dfmoneyballclean`.
stargazer(
  dfmoneyballclean,
  type = "text",
  title = "Money Ball Statistics",
  # Use the lowercase statistic codes listed in the stargazer documentation.
  summary.stat = c("mean", "sd", "min", "max"),
  # Take the observation count from the data instead of hard-coding it, so the
  # note cannot drift out of sync when the cleaning steps change.
  notes = paste(nrow(dfmoneyballclean), "Trials for all Stats"),
  covariate.labels = c(
    "Index",
    "Wins",
    "Batting Base Hits ",
    "Batting Doubles",
    "Batting Triples",
    "Batting Homeruns",
    "Batting Walks",
    "Batting Strikeouts",
    "Baserunning Stolen Bases",
    "Baserunning Caught Stealing",
    "Pitching Hits Allowed",
    "Pitching Homeruns Allowed",
    "Pitching Walks Allowed",
    "Pitching Strikeouts",
    "Fielding Errors",
    "Fielding Double Plays"
  )
)
##
## Money Ball Statistics
## ==========================================================
## Statistic Mean St. Dev. Min Max
## ----------------------------------------------------------
## Index 1,273.812 725.508 2 2,534
## Wins 80.997 12.694 41 117
## Batting Base Hits 1,452.157 104.336 1,137 1,786
## Batting Doubles 250.970 42.026 154 377
## Batting Triples 42.905 18.649 11 129
## Batting Homeruns 129.842 48.609 11 264
## Batting Walks 541.888 80.567 309 878
## Batting Strikeouts 841.743 200.355 326 1,399
## Baserunning Stolen Bases 95.858 44.345 18 314
## Baserunning Caught Stealing 52.963 22.851 11 201
## Pitching Hits Allowed 1,505.122 173.472 1,137 2,394
## Pitching Homeruns Allowed 134.069 50.902 12 343
## Pitching Walks Allowed 561.570 97.347 325 1,090
## Pitching Strikeouts 869.253 211.859 345 1,781
## Fielding Errors 143.145 38.954 65 360
## Fielding Double Plays 153.743 20.321 87 228
## ----------------------------------------------------------
## 1486 Trials for all Stats
# Reshape the cleaned data to long format (columns `variable` and `value`).
# No id or measure variables are named, so melt() chooses them itself and
# reports its choice in the message recorded below.
df_melted <- melt(data = dfmoneyballclean)
## No id variables; using all as measure variables

# One histogram panel per original column; `scales = "free_x"` gives each
# panel its own x-axis range.
ggplot(data = df_melted, mapping = aes(x = value)) +
  facet_wrap(facets = ~variable, scales = "free_x") +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Drop the first column (INDEX) so it is not used as a predictor, then fit a
# linear model of TARGET_WINS on every remaining column and print its summary.
dfmoneyballclean2 <- dfmoneyballclean[, -1]
moneyballregression <- lm(formula = TARGET_WINS ~ ., data = dfmoneyballclean2)
summary(object = moneyballregression)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfmoneyballclean2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.5627 -6.6932 -0.1328 6.5249 27.8525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.912438 6.642839 8.718 < 2e-16 ***
## TEAM_BATTING_H 0.015434 0.019626 0.786 0.4318
## TEAM_BATTING_2B -0.070472 0.009369 -7.522 9.36e-14 ***
## TEAM_BATTING_3B 0.161551 0.022192 7.280 5.43e-13 ***
## TEAM_BATTING_HR 0.073952 0.085392 0.866 0.3866
## TEAM_BATTING_BB 0.043765 0.046454 0.942 0.3463
## TEAM_BATTING_SO 0.018250 0.023463 0.778 0.4368
## TEAM_BASERUN_SB 0.035880 0.008687 4.130 3.83e-05 ***
## TEAM_BASERUN_CS 0.052124 0.018227 2.860 0.0043 **
## TEAM_PITCHING_H 0.019044 0.018381 1.036 0.3003
## TEAM_PITCHING_HR 0.022997 0.082092 0.280 0.7794
## TEAM_PITCHING_BB -0.004180 0.044692 -0.094 0.9255
## TEAM_PITCHING_SO -0.038176 0.022447 -1.701 0.0892 .
## TEAM_FIELDING_E -0.155876 0.009946 -15.672 < 2e-16 ***
## TEAM_FIELDING_DP -0.112885 0.013137 -8.593 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.556 on 1471 degrees of freedom
## Multiple R-squared: 0.4386, Adjusted R-squared: 0.4333
## F-statistic: 82.1 on 14 and 1471 DF, p-value: < 2.2e-16
library(MASS)
# Backward stepwise selection by AIC, starting from the full model.
# The result is not assigned: the search trace and the selected model are
# only printed.
MASS::stepAIC(moneyballregression, direction = "backward")
## Start: AIC=6723.18
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_BB 1 0.8 134324 6721.2
## - TEAM_PITCHING_HR 1 7.2 134330 6721.3
## - TEAM_BATTING_SO 1 55.2 134378 6721.8
## - TEAM_BATTING_H 1 56.5 134380 6721.8
## - TEAM_BATTING_HR 1 68.5 134392 6721.9
## - TEAM_BATTING_BB 1 81.0 134404 6722.1
## - TEAM_PITCHING_H 1 98.0 134421 6722.3
## <none> 134323 6723.2
## - TEAM_PITCHING_SO 1 264.1 134587 6724.1
## - TEAM_BASERUN_CS 1 746.8 135070 6729.4
## - TEAM_BASERUN_SB 1 1557.8 135881 6738.3
## - TEAM_BATTING_3B 1 4838.9 139162 6773.8
## - TEAM_BATTING_2B 1 5166.3 139489 6777.3
## - TEAM_FIELDING_DP 1 6742.5 141066 6794.0
## - TEAM_FIELDING_E 1 22427.4 156751 6950.6
##
## Step: AIC=6721.19
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_HR 1 6.4 134330 6719.3
## - TEAM_BATTING_SO 1 56.2 134380 6719.8
## - TEAM_BATTING_HR 1 77.9 134402 6720.1
## - TEAM_BATTING_H 1 147.2 134471 6720.8
## <none> 134324 6721.2
## - TEAM_PITCHING_H 1 197.5 134521 6721.4
## - TEAM_PITCHING_SO 1 266.3 134590 6722.1
## - TEAM_BASERUN_CS 1 746.5 135070 6727.4
## - TEAM_BASERUN_SB 1 1564.2 135888 6736.4
## - TEAM_BATTING_3B 1 4840.8 139165 6771.8
## - TEAM_BATTING_2B 1 5175.9 139500 6775.4
## - TEAM_FIELDING_DP 1 6744.6 141069 6792.0
## - TEAM_BATTING_BB 1 12568.9 146893 6852.1
## - TEAM_FIELDING_E 1 22491.7 156816 6949.2
##
## Step: AIC=6719.26
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_SO 1 51.2 134382 6717.8
## - TEAM_BATTING_H 1 144.7 134475 6718.9
## <none> 134330 6719.3
## - TEAM_PITCHING_H 1 202.0 134532 6719.5
## - TEAM_PITCHING_SO 1 298.0 134628 6720.6
## - TEAM_BASERUN_CS 1 742.6 135073 6725.5
## - TEAM_BASERUN_SB 1 1570.4 135901 6734.5
## - TEAM_BATTING_3B 1 4842.6 139173 6769.9
## - TEAM_BATTING_2B 1 5198.7 139529 6773.7
## - TEAM_FIELDING_DP 1 6744.4 141075 6790.1
## - TEAM_BATTING_HR 1 9780.8 144111 6821.7
## - TEAM_BATTING_BB 1 12606.9 146937 6850.6
## - TEAM_FIELDING_E 1 22525.1 156855 6947.6
##
## Step: AIC=6717.83
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 134382 6717.8
## - TEAM_BASERUN_CS 1 737.6 135119 6724.0
## - TEAM_PITCHING_H 1 1355.1 135737 6730.7
## - TEAM_BASERUN_SB 1 1575.6 135957 6733.2
## - TEAM_BATTING_H 1 1740.1 136122 6734.9
## - TEAM_BATTING_3B 1 4849.8 139231 6768.5
## - TEAM_BATTING_2B 1 5148.1 139530 6771.7
## - TEAM_FIELDING_DP 1 6779.2 141161 6789.0
## - TEAM_PITCHING_SO 1 7395.1 141777 6795.4
## - TEAM_BATTING_HR 1 9785.1 144167 6820.3
## - TEAM_BATTING_BB 1 12619.7 147001 6849.2
## - TEAM_FIELDING_E 1 22552.0 156934 6946.4
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = dfmoneyballclean2)
##
## Coefficients:
## (Intercept) TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 58.44606 0.02550 -0.06983 0.16162
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BASERUN_SB TEAM_BASERUN_CS
## 0.09775 0.03948 0.03603 0.05177
## TEAM_PITCHING_H TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 0.00907 -0.02083 -0.15597 -0.11315
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Final model: remove the two "hits" columns and regress TARGET_WINS on all
# remaining columns. `dplyr::select` is namespaced because loading the
# tidyverse masks MASS::select.
# NOTE(review): this predictor set differs from the one chosen by the
# backward stepAIC run recorded earlier in this file (which kept
# TEAM_BATTING_H and TEAM_PITCHING_H and dropped TEAM_BATTING_SO,
# TEAM_PITCHING_HR and TEAM_PITCHING_BB) -- confirm the choice is intentional.
dfselected <- dplyr::select(
  dfmoneyballclean2,
  -TEAM_BATTING_H,
  -TEAM_PITCHING_H
)
finalregression <- lm(formula = TARGET_WINS ~ ., data = dfselected)
summary(object = finalregression)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfselected)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.549 -6.714 0.261 6.518 32.134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.800657 4.380426 21.870 < 2e-16 ***
## TEAM_BATTING_2B -0.021020 0.006810 -3.087 0.00206 **
## TEAM_BATTING_3B 0.220307 0.021134 10.424 < 2e-16 ***
## TEAM_BATTING_HR 0.107772 0.085814 1.256 0.20936
## TEAM_BATTING_BB 0.015782 0.031239 0.505 0.61348
## TEAM_BATTING_SO -0.006358 0.020565 -0.309 0.75725
## TEAM_BASERUN_SB 0.044526 0.008771 5.076 4.34e-07 ***
## TEAM_BASERUN_CS 0.048784 0.018560 2.628 0.00867 **
## TEAM_PITCHING_HR 0.022182 0.082430 0.269 0.78789
## TEAM_PITCHING_BB 0.021527 0.029861 0.721 0.47108
## TEAM_PITCHING_SO -0.023125 0.019683 -1.175 0.24024
## TEAM_FIELDING_E -0.153743 0.010117 -15.196 < 2e-16 ***
## TEAM_FIELDING_DP -0.103948 0.013323 -7.802 1.14e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.733 on 1473 degrees of freedom
## Multiple R-squared: 0.4168, Adjusted R-squared: 0.412
## F-statistic: 87.72 on 12 and 1473 DF, p-value: < 2.2e-16