Homework 7

Install and load packages

# install.packages("tidyverse")
# install.packages("dplyr")

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(MASS)

Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

Load the Data Set

# Read the Moneyball training data from a local CSV file into a data frame.
moneyball_data <- read.csv(
  file = "C:/Users/Adi/Downloads/moneyball-training-data (1).csv"
)

Cleaning the data

# Count the missing (NA) values in each column of the raw data.
colSums(is.na(moneyball_data))
           INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
               0                0                0                0 
 TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
               0                0                0              102 
 TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H 
             131              772             2085                0 
TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
               0                0              102                0 
TEAM_FIELDING_DP 
             286 
# Keep only the rows that have no missing value in any column: na.omit() drops
# a row as soon as a single field in it is NA.
# NOTE(review): per the colSums() output, TEAM_BATTING_HBP is NA in 2085 rows,
# so this row-wise deletion discards most of the data set (the models below
# report 174 residual df with 17 coefficients, i.e. 191 rows). If the intent
# was to drop the sparse columns instead of the rows, filter columns first --
# confirm intent.
moneyball_data_clean <- na.omit(moneyball_data)
# Keep only the columns that contain no NA. Because na.omit() has already
# removed every row with an NA, the condition is TRUE for every column here,
# so this step selects all columns.
clean_df <- moneyball_data_clean[, colSums(is.na(moneyball_data_clean)) == 0]

Creating a Kitchen Sink Model

# Fit the "kitchen sink" linear model: TARGET_WINS regressed on every other
# column of clean_df (the `.` in the formula expands to all remaining columns).
# NOTE(review): this also uses INDEX as a predictor; it looks like a row
# identifier rather than a team statistic -- confirm whether to exclude it.
kitchen_sink_model <- lm(TARGET_WINS ~ ., data = clean_df)
# Print the residual summary, coefficient table and overall fit statistics.
summary(kitchen_sink_model)

Call:
lm(formula = TARGET_WINS ~ ., data = clean_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-20.0626  -5.4196  -0.0423   5.2111  22.9355 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)      60.4562317 19.7385030   3.063  0.00254 ** 
INDEX            -0.0002478  0.0008508  -0.291  0.77122    
TEAM_BATTING_H    1.8111103  2.7908648   0.649  0.51723    
TEAM_BATTING_2B   0.0267462  0.0303941   0.880  0.38008    
TEAM_BATTING_3B  -0.1018043  0.0777401  -1.310  0.19208    
TEAM_BATTING_HR  -4.6100155 10.5666083  -0.436  0.66317    
TEAM_BATTING_BB  -4.4606275  3.6457882  -1.224  0.22279    
TEAM_BATTING_SO   0.4303282  2.6231874   0.164  0.86988    
TEAM_BASERUN_SB   0.0335937  0.0288100   1.166  0.24519    
TEAM_BASERUN_CS  -0.0130338  0.0719436  -0.181  0.85645    
TEAM_BATTING_HBP  0.0837038  0.0499097   1.677  0.09532 .  
TEAM_PITCHING_H  -1.7887761  2.7903398  -0.641  0.52233    
TEAM_PITCHING_HR  4.6958245 10.5649821   0.444  0.65725    
TEAM_PITCHING_BB  4.5120283  3.6432611   1.238  0.21721    
TEAM_PITCHING_SO -0.4618971  2.6214432  -0.176  0.86034    
TEAM_FIELDING_E  -0.1724513  0.0415365  -4.152 5.16e-05 ***
TEAM_FIELDING_DP -0.1063200  0.0371964  -2.858  0.00478 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.489 on 174 degrees of freedom
Multiple R-squared:  0.5503,    Adjusted R-squared:  0.509 
F-statistic: 13.31 on 16 and 174 DF,  p-value: < 2.2e-16

Backward Selection

# Backward elimination by AIC, starting from the full kitchen-sink model.
# Each step prints the candidate removals and drops the term whose removal
# gives the lowest AIC, stopping when no removal improves on the current model.
kitchen_model <- MASS::stepAIC(
  object = kitchen_sink_model,
  direction = "backward"
)
Start:  AIC=833.22
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
    TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_BATTING_SO   1      1.94 12542 831.25
- TEAM_PITCHING_SO  1      2.24 12542 831.25
- TEAM_BASERUN_CS   1      2.37 12542 831.25
- INDEX             1      6.11 12546 831.31
- TEAM_BATTING_HR   1     13.72 12554 831.43
- TEAM_PITCHING_HR  1     14.24 12554 831.43
- TEAM_PITCHING_H   1     29.62 12569 831.67
- TEAM_BATTING_H    1     30.35 12570 831.68
- TEAM_BATTING_2B   1     55.81 12596 832.07
- TEAM_BASERUN_SB   1     97.99 12638 832.70
- TEAM_BATTING_BB   1    107.88 12648 832.85
- TEAM_PITCHING_BB  1    110.54 12650 832.89
- TEAM_BATTING_3B   1    123.59 12663 833.09
<none>                          12540 833.22
- TEAM_BATTING_HBP  1    202.70 12742 834.28
- TEAM_FIELDING_DP  1    588.80 13129 839.98
- TEAM_FIELDING_E   1   1242.26 13782 849.26

Step:  AIC=831.25
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
    TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_BASERUN_CS   1      2.14 12544 829.28
- INDEX             1      5.41 12547 829.33
- TEAM_BATTING_HR   1     14.50 12556 829.47
- TEAM_PITCHING_HR  1     15.04 12557 829.48
- TEAM_BATTING_2B   1     54.49 12596 830.08
- TEAM_PITCHING_H   1     85.44 12627 830.54
- TEAM_BATTING_H    1     87.23 12629 830.57
- TEAM_BASERUN_SB   1     96.49 12638 830.71
- TEAM_BATTING_BB   1    108.02 12650 830.89
- TEAM_PITCHING_BB  1    110.68 12652 830.93
- TEAM_BATTING_3B   1    123.53 12665 831.12
<none>                          12542 831.25
- TEAM_BATTING_HBP  1    201.03 12743 832.28
- TEAM_FIELDING_DP  1    593.51 13135 838.08
- TEAM_FIELDING_E   1   1245.15 13787 847.33
- TEAM_PITCHING_SO  1   1307.82 13850 848.19

Step:  AIC=829.28
TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP + 
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- INDEX             1      4.86 12549 827.35
- TEAM_BATTING_HR   1     14.84 12559 827.51
- TEAM_PITCHING_HR  1     15.39 12559 827.51
- TEAM_BATTING_2B   1     53.95 12598 828.10
- TEAM_PITCHING_H   1     87.62 12632 828.61
- TEAM_BATTING_H    1     89.46 12633 828.64
- TEAM_BATTING_BB   1    110.65 12654 828.96
- TEAM_PITCHING_BB  1    113.36 12657 829.00
- TEAM_BASERUN_SB   1    123.69 12668 829.15
- TEAM_BATTING_3B   1    131.20 12675 829.27
<none>                          12544 829.28
- TEAM_BATTING_HBP  1    200.88 12745 830.31
- TEAM_FIELDING_DP  1    600.78 13145 836.22
- TEAM_PITCHING_SO  1   1305.90 13850 846.20
- TEAM_FIELDING_E   1   1326.90 13871 846.49

Step:  AIC=827.35
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP + 
    TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_BATTING_HR   1     16.06 12565 825.60
- TEAM_PITCHING_HR  1     16.64 12565 825.61
- TEAM_BATTING_2B   1     53.05 12602 826.16
- TEAM_PITCHING_H   1     90.24 12639 826.72
- TEAM_BATTING_H    1     92.13 12641 826.75
- TEAM_BATTING_BB   1    110.31 12659 827.03
- TEAM_PITCHING_BB  1    113.00 12662 827.07
- TEAM_BASERUN_SB   1    123.42 12672 827.22
- TEAM_BATTING_3B   1    129.33 12678 827.31
<none>                          12549 827.35
- TEAM_BATTING_HBP  1    197.23 12746 828.33
- TEAM_FIELDING_DP  1    635.62 13184 834.79
- TEAM_PITCHING_SO  1   1311.88 13861 844.35
- TEAM_FIELDING_E   1   1322.05 13871 844.49

Step:  AIC=825.6
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
    TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_BATTING_2B   1     55.48 12620 824.44
- TEAM_PITCHING_H   1     89.26 12654 824.95
- TEAM_BATTING_H    1     91.97 12657 824.99
- TEAM_BATTING_BB   1    104.58 12669 825.18
- TEAM_PITCHING_BB  1    107.19 12672 825.22
<none>                          12565 825.60
- TEAM_BATTING_3B   1    137.48 12702 825.68
- TEAM_BASERUN_SB   1    146.90 12712 825.82
- TEAM_BATTING_HBP  1    200.36 12765 826.62
- TEAM_FIELDING_DP  1    628.95 13194 832.93
- TEAM_PITCHING_HR  1    853.54 13418 836.15
- TEAM_PITCHING_SO  1   1316.68 13882 842.63
- TEAM_FIELDING_E   1   1333.15 13898 842.86

Step:  AIC=824.44
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB + 
    TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_PITCHING_H   1     84.47 12705 823.71
- TEAM_BATTING_H    1     87.79 12708 823.76
- TEAM_BATTING_BB   1     98.92 12719 823.93
- TEAM_PITCHING_BB  1    101.48 12722 823.97
- TEAM_BASERUN_SB   1    109.27 12730 824.09
<none>                          12620 824.44
- TEAM_BATTING_3B   1    147.01 12767 824.65
- TEAM_BATTING_HBP  1    204.39 12825 825.51
- TEAM_FIELDING_DP  1    649.12 13269 832.02
- TEAM_PITCHING_HR  1    812.92 13433 834.36
- TEAM_PITCHING_SO  1   1262.90 13883 840.66
- TEAM_FIELDING_E   1   1379.34 14000 842.25

Step:  AIC=823.71
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB + 
    TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_BATTING_BB   1     32.85 12738 822.21
- TEAM_PITCHING_BB  1     43.42 12748 822.37
- TEAM_BASERUN_SB   1    105.16 12810 823.29
<none>                          12705 823.71
- TEAM_BATTING_3B   1    153.13 12858 824.00
- TEAM_BATTING_HBP  1    183.82 12888 824.46
- TEAM_BATTING_H    1    504.11 13209 829.15
- TEAM_FIELDING_DP  1    602.80 13308 830.57
- TEAM_PITCHING_HR  1    850.25 13555 834.09
- TEAM_PITCHING_SO  1   1259.72 13964 839.77
- TEAM_FIELDING_E   1   1419.39 14124 841.94

Step:  AIC=822.21
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BASERUN_SB + 
    TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
    TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_BASERUN_SB   1    109.99 12848 821.85
<none>                          12738 822.21
- TEAM_BATTING_3B   1    156.45 12894 822.54
- TEAM_BATTING_HBP  1    186.58 12924 822.98
- TEAM_BATTING_H    1    485.67 13223 827.35
- TEAM_FIELDING_DP  1    623.19 13361 829.33
- TEAM_PITCHING_HR  1    843.83 13581 832.46
- TEAM_PITCHING_SO  1   1267.25 14005 838.32
- TEAM_FIELDING_E   1   1395.02 14133 840.06
- TEAM_PITCHING_BB  1   2364.81 15102 852.73

Step:  AIC=821.85
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_HBP + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
- TEAM_BATTING_3B   1    133.47 12981 821.82
<none>                          12848 821.85
- TEAM_BATTING_HBP  1    177.11 13025 822.46
- TEAM_BATTING_H    1    566.11 13414 828.09
- TEAM_FIELDING_DP  1    737.46 13585 830.51
- TEAM_PITCHING_HR  1    756.49 13604 830.78
- TEAM_PITCHING_SO  1   1257.91 14106 837.69
- TEAM_FIELDING_E   1   1330.40 14178 838.67
- TEAM_PITCHING_BB  1   2371.12 15219 852.20

Step:  AIC=821.82
TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP + TEAM_PITCHING_HR + 
    TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP

                   Df Sum of Sq   RSS    AIC
<none>                          12981 821.82
- TEAM_BATTING_HBP  1    228.70 13210 823.16
- TEAM_BATTING_H    1    449.87 13431 826.33
- TEAM_FIELDING_DP  1    813.17 13794 831.43
- TEAM_PITCHING_HR  1    990.20 13971 833.86
- TEAM_PITCHING_SO  1   1316.56 14298 838.27
- TEAM_FIELDING_E   1   1334.60 14316 838.52
- TEAM_PITCHING_BB  1   2583.00 15564 854.49

Summarize

# Summarize the reduced model selected by the backward AIC search.
summary(kitchen_model)

Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP + 
    TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
    TEAM_FIELDING_E + TEAM_FIELDING_DP, data = clean_df)

Residuals:
     Min       1Q   Median       3Q      Max 
-20.2248  -5.6294  -0.0212   5.0439  21.3065 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      60.95454   19.10292   3.191 0.001670 ** 
TEAM_BATTING_H    0.02541    0.01009   2.518 0.012648 *  
TEAM_BATTING_HBP  0.08712    0.04852   1.796 0.074211 .  
TEAM_PITCHING_HR  0.08945    0.02394   3.736 0.000249 ***
TEAM_PITCHING_BB  0.05672    0.00940   6.034 8.66e-09 ***
TEAM_PITCHING_SO -0.03136    0.00728  -4.308 2.68e-05 ***
TEAM_FIELDING_E  -0.17218    0.03970  -4.338 2.38e-05 ***
TEAM_FIELDING_DP -0.11904    0.03516  -3.386 0.000869 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.422 on 183 degrees of freedom
Multiple R-squared:  0.5345,    Adjusted R-squared:  0.5167 
F-statistic: 30.02 on 7 and 183 DF,  p-value: < 2.2e-16

I believe the variables that are not missing any data should be kept.