Day 7 HW

Import and Clean Data

# Load the raw training data from CSV; the first row holds the column names.
# The former `rm(list = ls())` call was removed: it only clears objects in the
# global environment (attached packages and options are untouched) and wipes
# the caller's workspace if this file is ever sourced. Restart R for a clean
# session instead.
# NOTE(review): the absolute path below is specific to one machine; consider a
# project-relative path so the analysis can be reproduced elsewhere.
moneyball.training.data <- read.csv(
  "C:/Users/aarav/Downloads/moneyball-training-data.csv",
  header = TRUE
)

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(visdat)



# Build the modelling data set in one step: drop the 10th and 11th columns by
# position, then discard every row that still contains a missing value.
# NOTE(review): positional indices silently select different columns if the CSV
# layout changes; confirm which variables sit at positions 10 and 11.
# NOTE(review): INDEX is not dropped here, so it enters the models below as a
# predictor; confirm that is intended.
df_clean <- na.omit(moneyball.training.data[, -c(10, 11)])

Kitchen Sink Model

# "Kitchen sink" fit: regress TARGET_WINS on every other column of df_clean.
HomeRunsModel <- lm(formula = TARGET_WINS ~ ., data = df_clean)

# Print the coefficient table and overall fit statistics.
summary(object = HomeRunsModel)

Call:
lm(formula = TARGET_WINS ~ ., data = df_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-32.261  -7.222   0.149   6.876  29.754 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      58.9585552  6.0320374   9.774  < 2e-16 ***
INDEX            -0.0001211  0.0003329  -0.364 0.716143    
TEAM_BATTING_H   -0.0316007  0.0164330  -1.923 0.054635 .  
TEAM_BATTING_2B  -0.0493788  0.0088806  -5.560 3.09e-08 ***
TEAM_BATTING_3B   0.1842087  0.0190656   9.662  < 2e-16 ***
TEAM_BATTING_HR   0.1406899  0.0814215   1.728 0.084172 .  
TEAM_BATTING_BB   0.1137803  0.0425467   2.674 0.007557 ** 
TEAM_BATTING_SO   0.0267560  0.0219903   1.217 0.223868    
TEAM_BASERUN_SB   0.0695120  0.0055539  12.516  < 2e-16 ***
TEAM_PITCHING_H   0.0577947  0.0149601   3.863 0.000116 ***
TEAM_PITCHING_HR -0.0389564  0.0779770  -0.500 0.617425    
TEAM_PITCHING_BB -0.0759516  0.0404578  -1.877 0.060637 .  
TEAM_PITCHING_SO -0.0471155  0.0209273  -2.251 0.024480 *  
TEAM_FIELDING_E  -0.1192872  0.0071568 -16.668  < 2e-16 ***
TEAM_FIELDING_DP -0.1120094  0.0122863  -9.117  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.18 on 1820 degrees of freedom
Multiple R-squared:  0.406, Adjusted R-squared:  0.4014 
F-statistic: 88.84 on 14 and 1820 DF,  p-value: < 2.2e-16

Backward Selection

# Backward elimination starting from the full model; step() compares candidate
# models by AIC and prints the trace of each elimination round.
BackwardModel <- step(object = HomeRunsModel, direction = "backward")
Start:  AIC=8531.16
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
- INDEX             1      13.7 188651 8529.3
- TEAM_PITCHING_HR  1      25.9 188663 8529.4
- TEAM_BATTING_SO   1     153.4 188791 8530.6
<none>                          188637 8531.2
- TEAM_BATTING_HR   1     309.5 188947 8532.2
- TEAM_PITCHING_BB  1     365.3 189003 8532.7
- TEAM_BATTING_H    1     383.3 189021 8532.9
- TEAM_PITCHING_SO  1     525.4 189163 8534.3
- TEAM_BATTING_BB   1     741.2 189379 8536.4
- TEAM_PITCHING_H   1    1546.9 190184 8544.1
- TEAM_BATTING_2B   1    3204.4 191842 8560.1
- TEAM_FIELDING_DP  1    8614.4 197252 8611.1
- TEAM_BATTING_3B   1    9675.6 198313 8620.9
- TEAM_BASERUN_SB   1   16235.9 204873 8680.7
- TEAM_FIELDING_E   1   28794.4 217432 8789.8

Step:  AIC=8529.29
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
- TEAM_PITCHING_HR  1      27.3 188679 8527.6
- TEAM_BATTING_SO   1     150.8 188802 8528.8
<none>                          188651 8529.3
- TEAM_BATTING_HR   1     314.7 188966 8530.3
- TEAM_PITCHING_BB  1     361.1 189012 8530.8
- TEAM_BATTING_H    1     380.6 189032 8531.0
- TEAM_PITCHING_SO  1     522.1 189173 8532.4
- TEAM_BATTING_BB   1     736.4 189388 8534.4
- TEAM_PITCHING_H   1    1539.1 190190 8542.2
- TEAM_BATTING_2B   1    3196.2 191847 8558.1
- TEAM_FIELDING_DP  1    8636.8 197288 8609.4
- TEAM_BATTING_3B   1    9685.4 198337 8619.2
- TEAM_BASERUN_SB   1   16250.9 204902 8678.9
- TEAM_FIELDING_E   1   28809.2 217460 8788.1

Step:  AIC=8527.56
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
    TEAM_FIELDING_DP

                   Df Sum of Sq    RSS    AIC
<none>                          188679 8527.6
- TEAM_BATTING_SO   1     374.1 189053 8529.2
- TEAM_PITCHING_BB  1     432.2 189111 8529.8
- TEAM_BATTING_H    1     477.1 189156 8530.2
- TEAM_BATTING_BB   1     847.1 189526 8533.8
- TEAM_PITCHING_SO  1    1081.9 189760 8536.0
- TEAM_PITCHING_H   1    1805.9 190484 8543.0
- TEAM_BATTING_2B   1    3192.1 191871 8556.3
- TEAM_FIELDING_DP  1    8675.6 197354 8608.0
- TEAM_BATTING_3B   1    9674.9 198353 8617.3
- TEAM_BATTING_HR   1   12421.7 201100 8642.6
- TEAM_BASERUN_SB   1   16314.6 204993 8677.7
- TEAM_FIELDING_E   1   28845.1 217524 8786.6
# Print the coefficient table and fit statistics of the selected model.
summary(object = BackwardModel)

Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
    TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
    TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP, data = df_clean)

Residuals:
    Min      1Q  Median      3Q     Max 
-32.168  -7.256   0.142   6.945  29.896 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      59.054832   6.025125   9.801  < 2e-16 ***
TEAM_BATTING_H   -0.033844   0.015767  -2.146  0.03197 *  
TEAM_BATTING_2B  -0.049268   0.008874  -5.552 3.24e-08 ***
TEAM_BATTING_3B   0.183497   0.018984   9.666  < 2e-16 ***
TEAM_BATTING_HR   0.100263   0.009155  10.952  < 2e-16 ***
TEAM_BATTING_BB   0.118363   0.041385   2.860  0.00428 ** 
TEAM_BATTING_SO   0.033316   0.017529   1.901  0.05751 .  
TEAM_BASERUN_SB   0.069465   0.005534  12.552  < 2e-16 ***
TEAM_PITCHING_H   0.059813   0.014323   4.176 3.11e-05 ***
TEAM_PITCHING_BB -0.080309   0.039308  -2.043  0.04119 *  
TEAM_PITCHING_SO -0.053523   0.016559  -3.232  0.00125 ** 
TEAM_FIELDING_E  -0.118864   0.007122 -16.690  < 2e-16 ***
TEAM_FIELDING_DP -0.112317   0.012271  -9.153  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 10.18 on 1822 degrees of freedom
Multiple R-squared:  0.4058,    Adjusted R-squared:  0.4019 
F-statistic: 103.7 on 12 and 1822 DF,  p-value: < 2.2e-16

Insights

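Based on the backward selection (AIC fell from 8531.16 to 8527.56), INDEX and TEAM_PITCHING_HR were dropped and the remaining twelve predictors were retained. TEAM_BATTING_SO, TEAM_PITCHING_BB, TEAM_BATTING_H, TEAM_BATTING_BB, and TEAM_PITCHING_SO are the five retained variables whose removal would raise AIC the least, but removing any of them still makes the model worse, so they should be included in a final regression together with TEAM_PITCHING_H, TEAM_BATTING_2B, TEAM_FIELDING_DP, TEAM_BATTING_3B, TEAM_BATTING_HR, TEAM_BASERUN_SB, and TEAM_FIELDING_E. Note that TEAM_BATTING_SO is only marginally significant in the final model (p = 0.058).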