Import and Clean Data
# Load the moneyball training data into `moneyball.training.data`.
#
# The former `rm(list = ls())` call is intentionally gone: it deletes every
# object in the caller's global environment, yet does not detach packages or
# reset options, so it neither protects the user's work nor gives a truly
# clean session. Restart R (or knit the document) for a fresh environment.
#
# NOTE(review): the path below is absolute and specific to one machine;
# consider a project-relative path so the analysis runs elsewhere.
moneyball.training.data <- read.csv(
  "C:/Users/aarav/Downloads/moneyball-training-data.csv",
  header = TRUE
)
library (dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library (visdat)
# Build the modelling table in one step: discard the 10th and 11th columns of
# the raw data by position, then keep only rows with no missing values in the
# remaining columns.
# NOTE(review): the dropped columns are identified by position only; their
# names are not visible here -- confirm they are the intended ones.
df_clean <- na.omit(
  moneyball.training.data[, -c(10, 11), drop = FALSE]
)
Kitchen Sink Model
# Kitchen-sink fit: regress TARGET_WINS on every other column of df_clean.
# NOTE(review): `.` pulls in all remaining columns, including INDEX, which
# looks like a row identifier rather than a team statistic -- confirm it
# belongs among the predictors.
HomeRunsModel <- lm(formula = TARGET_WINS ~ ., data = df_clean)

# Print coefficient estimates, residual quantiles, and overall fit statistics.
summary(object = HomeRunsModel)
Call:
lm(formula = TARGET_WINS ~ ., data = df_clean)
Residuals:
Min 1Q Median 3Q Max
-32.261 -7.222 0.149 6.876 29.754
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 58.9585552 6.0320374 9.774 < 2e-16 ***
INDEX -0.0001211 0.0003329 -0.364 0.716143
TEAM_BATTING_H -0.0316007 0.0164330 -1.923 0.054635 .
TEAM_BATTING_2B -0.0493788 0.0088806 -5.560 3.09e-08 ***
TEAM_BATTING_3B 0.1842087 0.0190656 9.662 < 2e-16 ***
TEAM_BATTING_HR 0.1406899 0.0814215 1.728 0.084172 .
TEAM_BATTING_BB 0.1137803 0.0425467 2.674 0.007557 **
TEAM_BATTING_SO 0.0267560 0.0219903 1.217 0.223868
TEAM_BASERUN_SB 0.0695120 0.0055539 12.516 < 2e-16 ***
TEAM_PITCHING_H 0.0577947 0.0149601 3.863 0.000116 ***
TEAM_PITCHING_HR -0.0389564 0.0779770 -0.500 0.617425
TEAM_PITCHING_BB -0.0759516 0.0404578 -1.877 0.060637 .
TEAM_PITCHING_SO -0.0471155 0.0209273 -2.251 0.024480 *
TEAM_FIELDING_E -0.1192872 0.0071568 -16.668 < 2e-16 ***
TEAM_FIELDING_DP -0.1120094 0.0122863 -9.117 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.18 on 1820 degrees of freedom
Multiple R-squared: 0.406, Adjusted R-squared: 0.4014
F-statistic: 88.84 on 14 and 1820 DF, p-value: < 2.2e-16
Backward Selection
# Backward elimination starting from the kitchen-sink model: at each step the
# term whose removal lowers AIC the most is dropped, and the search stops once
# no single removal improves AIC. The per-step tables are printed as a trace.
BackwardModel <- step(HomeRunsModel, direction = "backward")

# Report the coefficients and fit statistics of the selected model explicitly,
# so the summary shown for the reduced model has a statement that produces it.
summary(BackwardModel)
Start: AIC=8531.16
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
TEAM_FIELDING_E + TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
- INDEX 1 13.7 188651 8529.3
- TEAM_PITCHING_HR 1 25.9 188663 8529.4
- TEAM_BATTING_SO 1 153.4 188791 8530.6
<none> 188637 8531.2
- TEAM_BATTING_HR 1 309.5 188947 8532.2
- TEAM_PITCHING_BB 1 365.3 189003 8532.7
- TEAM_BATTING_H 1 383.3 189021 8532.9
- TEAM_PITCHING_SO 1 525.4 189163 8534.3
- TEAM_BATTING_BB 1 741.2 189379 8536.4
- TEAM_PITCHING_H 1 1546.9 190184 8544.1
- TEAM_BATTING_2B 1 3204.4 191842 8560.1
- TEAM_FIELDING_DP 1 8614.4 197252 8611.1
- TEAM_BATTING_3B 1 9675.6 198313 8620.9
- TEAM_BASERUN_SB 1 16235.9 204873 8680.7
- TEAM_FIELDING_E 1 28794.4 217432 8789.8
Step: AIC=8529.29
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
TEAM_FIELDING_E + TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
- TEAM_PITCHING_HR 1 27.3 188679 8527.6
- TEAM_BATTING_SO 1 150.8 188802 8528.8
<none> 188651 8529.3
- TEAM_BATTING_HR 1 314.7 188966 8530.3
- TEAM_PITCHING_BB 1 361.1 189012 8530.8
- TEAM_BATTING_H 1 380.6 189032 8531.0
- TEAM_PITCHING_SO 1 522.1 189173 8532.4
- TEAM_BATTING_BB 1 736.4 189388 8534.4
- TEAM_PITCHING_H 1 1539.1 190190 8542.2
- TEAM_BATTING_2B 1 3196.2 191847 8558.1
- TEAM_FIELDING_DP 1 8636.8 197288 8609.4
- TEAM_BATTING_3B 1 9685.4 198337 8619.2
- TEAM_BASERUN_SB 1 16250.9 204902 8678.9
- TEAM_FIELDING_E 1 28809.2 217460 8788.1
Step: AIC=8527.56
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E +
TEAM_FIELDING_DP
Df Sum of Sq RSS AIC
<none> 188679 8527.6
- TEAM_BATTING_SO 1 374.1 189053 8529.2
- TEAM_PITCHING_BB 1 432.2 189111 8529.8
- TEAM_BATTING_H 1 477.1 189156 8530.2
- TEAM_BATTING_BB 1 847.1 189526 8533.8
- TEAM_PITCHING_SO 1 1081.9 189760 8536.0
- TEAM_PITCHING_H 1 1805.9 190484 8543.0
- TEAM_BATTING_2B 1 3192.1 191871 8556.3
- TEAM_FIELDING_DP 1 8675.6 197354 8608.0
- TEAM_BATTING_3B 1 9674.9 198353 8617.3
- TEAM_BATTING_HR 1 12421.7 201100 8642.6
- TEAM_BASERUN_SB 1 16314.6 204993 8677.7
- TEAM_FIELDING_E 1 28845.1 217524 8786.6
Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
TEAM_FIELDING_E + TEAM_FIELDING_DP, data = df_clean)
Residuals:
Min 1Q Median 3Q Max
-32.168 -7.256 0.142 6.945 29.896
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 59.054832 6.025125 9.801 < 2e-16 ***
TEAM_BATTING_H -0.033844 0.015767 -2.146 0.03197 *
TEAM_BATTING_2B -0.049268 0.008874 -5.552 3.24e-08 ***
TEAM_BATTING_3B 0.183497 0.018984 9.666 < 2e-16 ***
TEAM_BATTING_HR 0.100263 0.009155 10.952 < 2e-16 ***
TEAM_BATTING_BB 0.118363 0.041385 2.860 0.00428 **
TEAM_BATTING_SO 0.033316 0.017529 1.901 0.05751 .
TEAM_BASERUN_SB 0.069465 0.005534 12.552 < 2e-16 ***
TEAM_PITCHING_H 0.059813 0.014323 4.176 3.11e-05 ***
TEAM_PITCHING_BB -0.080309 0.039308 -2.043 0.04119 *
TEAM_PITCHING_SO -0.053523 0.016559 -3.232 0.00125 **
TEAM_FIELDING_E -0.118864 0.007122 -16.690 < 2e-16 ***
TEAM_FIELDING_DP -0.112317 0.012271 -9.153 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.18 on 1822 degrees of freedom
Multiple R-squared: 0.4058, Adjusted R-squared: 0.4019
F-statistic: 103.7 on 12 and 1822 DF, p-value: < 2.2e-16
Insights
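Backward selection by AIC removed only INDEX and TEAM_PITCHING_HR; the final regression keeps the other twelve predictors. TEAM_BATTING_SO, TEAM_PITCHING_BB, TEAM_BATTING_H, TEAM_BATTING_BB, and TEAM_PITCHING_SO are the retained predictors whose removal would raise the AIC the least (from 8527.6 to between 8529.2 and 8536.0), so they should still be included in a final regression, although TEAM_BATTING_SO is only marginally significant (p = 0.058).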