# Moneyball Linear Regression
# NOTE(review): clearing the workspace from inside a script also wipes the
# caller's objects; consider running in a fresh R session instead.
rm(list = ls()) # Cleaning the environment
money <- read.csv(
  file = "~/Downloads/moneyball-training-data.csv"
) # Importing the data
Importing the Data
Cleaning the Data
head(is.na(money)) # search for missing values in data set (TRUE marks a missing cell; first rows only)
#> INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
[1,] FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE FALSE
TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
[1,] FALSE FALSE FALSE TRUE
[2,] FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE
TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
[1,] TRUE TRUE FALSE FALSE
[2,] FALSE TRUE FALSE FALSE
[3,] FALSE TRUE FALSE FALSE
[4,] FALSE TRUE FALSE FALSE
[5,] FALSE TRUE FALSE FALSE
[6,] FALSE TRUE FALSE FALSE
TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
[1,] FALSE FALSE FALSE TRUE
[2,] FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE
head(colSums(is.na(money))) # number of missing values in each variable of the data set
# NOTE(review): head() keeps only the first six columns, all of which are
# complete, so the columns that do contain NAs never appear in this summary.
#> INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
0 0 0 0 0
TEAM_BATTING_HR
0
?head # open the help page for head() (interactive use)
head(money, n = 10) # first 10 entries
#> INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
1 1 39 1445 194 39
2 2 70 1339 219 22
3 3 86 1377 232 35
4 4 70 1387 209 38
5 5 82 1297 186 27
6 6 75 1279 200 36
7 7 80 1244 179 54
8 8 85 1273 171 37
9 11 86 1391 197 40
10 12 76 1271 213 18
TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
1 13 143 842 NA
2 190 685 1075 37
3 137 602 917 46
4 96 451 922 43
5 102 472 920 49
6 92 443 973 107
7 122 525 1062 80
8 115 456 1027 40
9 114 447 922 69
10 96 441 827 72
TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
1 NA NA 9364 84
2 28 NA 1347 191
3 27 NA 1377 137
4 30 NA 1396 97
5 39 NA 1297 102
6 59 NA 1279 92
7 54 NA 1244 122
8 36 NA 1281 116
9 27 NA 1391 114
10 34 NA 1271 96
TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
1 927 5456 1011 NA
2 689 1082 193 155
3 602 917 175 153
4 454 928 164 156
5 472 920 138 168
6 443 973 123 149
7 525 1062 136 186
8 459 1033 112 136
9 447 922 127 169
10 441 827 131 159
# install.packages("visdat") # run once if visdat is not installed
library(visdat)
# vis_miss(df)
vis_dat(money) # visualise the raw data before any cleaning
# Removing the variables with the most missing values
# Work on a copy so the raw `money` data stays untouched, then drop the
# hit-by-pitch and caught-stealing columns.
# The second column is named TEAM_BASERUN_CS in the data; the previous
# TEAM_BATTING_CS does not exist, so assigning NULL to it removed nothing and
# the column stayed in every later model.
# NOTE(review): the results recorded further down were produced while
# TEAM_BASERUN_CS was still present -- re-run the analysis to refresh them.
money_clean <- money
money_clean$TEAM_BATTING_HBP <- NULL
money_clean$TEAM_BASERUN_CS <- NULL
# Re-check missingness after the column removal, then keep the cleaned data
# under the name used by the rest of the analysis.
vis_miss(money_clean)
clean_df <- money_clean
# Ggplot ----
library(ggplot2)
# Reshape to long format (one row per variable/value pair) so that every
# variable can be drawn in its own facet.
df_melted <- reshape2::melt(clean_df)
#> No id variables; using all as measure variables
# One histogram per variable; the x axis is freed per facet because the
# variables are on different scales.
ggplot(df_melted, aes(value)) +
  geom_histogram() +
  facet_wrap(~ variable, scales = "free_x")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1393 rows containing non-finite outside the scale range
(`stat_bin()`).
# Baseline model: wins explained by hits and doubles only.
reg1 <- lm(
  formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B,
  data = clean_df
)
# "Kitchen sink" model: every other column of clean_df as a predictor.
# NOTE(review): `.` also pulls in INDEX, which looks like a row identifier
# rather than a team statistic -- confirm whether it belongs in the model.
# NOTE(review): the recorded table reports 2,276 observations for reg1 but
# 1,486 for this model, so the two fits do not use the same rows.
kitchen_sink <- lm(formula = TARGET_WINS ~ ., data = clean_df)
library(stargazer)
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Print both fitted models side by side as a plain-text regression table
stargazer(reg1, kitchen_sink, type = "text")
=======================================================================
Dependent variable:
---------------------------------------------------
TARGET_WINS
(1) (2)
-----------------------------------------------------------------------
INDEX 0.0003
(0.0004)
TEAM_BATTING_H 0.036*** 0.016
(0.003) (0.020)
TEAM_BATTING_2B 0.035*** -0.070***
(0.008) (0.009)
TEAM_BATTING_3B 0.160***
(0.022)
TEAM_BATTING_HR 0.075
(0.085)
TEAM_BATTING_BB 0.043
(0.046)
TEAM_BATTING_SO 0.018
(0.023)
TEAM_BASERUN_SB 0.035***
(0.009)
TEAM_BASERUN_CS 0.054***
(0.018)
TEAM_PITCHING_H 0.019
(0.018)
TEAM_PITCHING_HR 0.022
(0.082)
TEAM_PITCHING_BB -0.003
(0.045)
TEAM_PITCHING_SO -0.038*
(0.022)
TEAM_FIELDING_E -0.156***
(0.010)
TEAM_FIELDING_DP -0.113***
(0.013)
Constant 19.477*** 57.843***
(3.102) (6.645)
-----------------------------------------------------------------------
Observations 2,276 1,486
R2 0.158 0.439
Adjusted R2 0.158 0.433
Residual Std. Error 14.457 (df = 2273) 9.557 (df = 1470)
F Statistic 213.858*** (df = 2; 2273) 76.630*** (df = 15; 1470)
=======================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
library(MASS)
?stepAIC() # help page for stepAIC (interactive use)
# Finding the best model: start from the kitchen-sink fit and remove one
# term at a time, going backward only; the trace of each step is printed.
best_model <- stepAIC(
  object = kitchen_sink,
  direction = "backward"
)
#> Start: AIC=6724.68
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
- TEAM_PITCHING_BB 1 0.5 134278 6722.7
- TEAM_PITCHING_HR 1 6.6 134285 6722.8
- INDEX 1 45.3 134323 6723.2
- TEAM_BATTING_SO 1 52.0 134330 6723.3
- TEAM_BATTING_H 1 59.8 134338 6723.3
- TEAM_BATTING_HR 1 70.3 134348 6723.5
- TEAM_BATTING_BB 1 78.1 134356 6723.5
- TEAM_PITCHING_H 1 93.3 134371 6723.7
<none> 134278 6724.7
- TEAM_PITCHING_SO 1 260.2 134538 6725.6
- TEAM_BASERUN_CS 1 780.9 135059 6731.3
- TEAM_BASERUN_SB 1 1430.9 135709 6738.4
- TEAM_BATTING_3B 1 4664.4 138942 6773.4
- TEAM_BATTING_2B 1 5147.7 139426 6778.6
- TEAM_FIELDING_DP 1 6780.8 141059 6795.9
- TEAM_FIELDING_E 1 22451.0 156729 6952.4
Step: AIC=6722.69
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
TEAM_FIELDING_E + TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
- TEAM_PITCHING_HR 1 6.2 134285 6720.8
- INDEX 1 45.6 134324 6721.2
- TEAM_BATTING_SO 1 52.7 134331 6721.3
- TEAM_BATTING_HR 1 78.9 134357 6721.6
- TEAM_BATTING_H 1 149.9 134428 6722.3
<none> 134278 6722.7
- TEAM_PITCHING_H 1 193.8 134472 6722.8
- TEAM_PITCHING_SO 1 262.0 134540 6723.6
- TEAM_BASERUN_CS 1 780.8 135059 6729.3
- TEAM_BASERUN_SB 1 1435.3 135714 6736.5
- TEAM_BATTING_3B 1 4668.0 138946 6771.5
- TEAM_BATTING_2B 1 5156.1 139434 6776.7
- TEAM_FIELDING_DP 1 6782.9 141061 6793.9
- TEAM_BATTING_BB 1 12608.7 146887 6854.1
- TEAM_FIELDING_E 1 22517.9 156796 6951.1
Step: AIC=6720.76
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
- INDEX 1 45.8 134330 6719.3
- TEAM_BATTING_SO 1 47.9 134332 6719.3
- TEAM_BATTING_H 1 147.4 134432 6720.4
<none> 134285 6720.8
- TEAM_PITCHING_H 1 198.2 134483 6720.9
- TEAM_PITCHING_SO 1 293.6 134578 6722.0
- TEAM_BASERUN_CS 1 777.0 135062 6727.3
- TEAM_BASERUN_SB 1 1440.7 135725 6734.6
- TEAM_BATTING_3B 1 4669.4 138954 6769.5
- TEAM_BATTING_2B 1 5178.5 139463 6775.0
- TEAM_FIELDING_DP 1 6783.0 141067 6792.0
- TEAM_BATTING_HR 1 9801.3 144086 6823.4
- TEAM_BATTING_BB 1 12647.1 146932 6852.5
- TEAM_FIELDING_E 1 22551.2 156836 6949.4
Step: AIC=6719.26
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
- TEAM_BATTING_SO 1 51.2 134382 6717.8
- TEAM_BATTING_H 1 144.7 134475 6718.9
<none> 134330 6719.3
- TEAM_PITCHING_H 1 202.0 134532 6719.5
- TEAM_PITCHING_SO 1 298.0 134628 6720.6
- TEAM_BASERUN_CS 1 742.6 135073 6725.5
- TEAM_BASERUN_SB 1 1570.4 135901 6734.5
- TEAM_BATTING_3B 1 4842.6 139173 6769.9
- TEAM_BATTING_2B 1 5198.7 139529 6773.7
- TEAM_FIELDING_DP 1 6744.4 141075 6790.1
- TEAM_BATTING_HR 1 9780.8 144111 6821.7
- TEAM_BATTING_BB 1 12606.9 146937 6850.6
- TEAM_FIELDING_E 1 22525.1 156855 6947.6
Step: AIC=6717.83
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
<none> 134382 6717.8
- TEAM_BASERUN_CS 1 737.6 135119 6724.0
- TEAM_PITCHING_H 1 1355.1 135737 6730.7
- TEAM_BASERUN_SB 1 1575.6 135957 6733.2
- TEAM_BATTING_H 1 1740.1 136122 6734.9
- TEAM_BATTING_3B 1 4849.8 139231 6768.5
- TEAM_BATTING_2B 1 5148.1 139530 6771.7
- TEAM_FIELDING_DP 1 6779.2 141161 6789.0
- TEAM_PITCHING_SO 1 7395.1 141777 6795.4
- TEAM_BATTING_HR 1 9785.1 144167 6820.3
- TEAM_BATTING_BB 1 12619.7 147001 6849.2
- TEAM_FIELDING_E 1 22552.0 156934 6946.4
- In our final model, TARGET_WINS is the response variable, and we kept the following predictors: TEAM_BATTING_H, TEAM_BATTING_2B, TEAM_BATTING_3B, TEAM_BATTING_HR, TEAM_BATTING_BB, TEAM_BASERUN_SB, TEAM_BASERUN_CS, TEAM_PITCHING_H, TEAM_PITCHING_SO, TEAM_FIELDING_E, and TEAM_FIELDING_DP.
- For the most part, these variables make sense because they all positively impact the team's performance and its chances of scoring and winning.