Load Necessary Packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
# Load the Moneyball training set from the user's Downloads folder.
# The path is machine-specific; adjust it if the CSV lives elsewhere.
moneyball.training.data <- read.csv(
  file = "~/Downloads/moneyball-training-data.csv"
)
Clean Data
# remove(list = ls())
# Count the missing values in each column of the raw training data.
vapply(
  moneyball.training.data,
  function(column) sum(is.na(column)),
  numeric(1)
)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## 0 0 0 0
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## 0 0 0 102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## 131 772 2085 0
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## 0 0 102 0
## TEAM_FIELDING_DP
## 286
# Listwise deletion: keep only the rows that have no missing value in any
# column.
# NOTE(review): TEAM_BATTING_HBP is missing in 2085 rows (see the NA counts
# above), so this drops most of the data. The kitchen sink fit below reports
# 174 residual degrees of freedom for 17 coefficients, i.e. only 191 rows
# survive. Consider dropping the sparse columns before omitting rows.
moneyball_data_clean <- na.omit(moneyball.training.data)

# Keep only the columns that contain no NA.
# NOTE(review): after na.omit() no NA can remain, so this filter keeps every
# column; it would only have an effect if it ran before the row deletion.
clean_df <- moneyball_data_clean[
  ,
  !vapply(moneyball_data_clean, anyNA, logical(1))
]
Kitchen Sink Model
# Kitchen sink fit: regress TARGET_WINS on every other column of clean_df.
# NOTE(review): "." also pulls in INDEX, which looks like a row identifier
# rather than a team statistic; confirm it belongs among the predictors.
kitchen_sink_model <- lm(formula = TARGET_WINS ~ ., data = clean_df)

# Print the coefficient table and fit statistics for the full model.
summary(object = kitchen_sink_model)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = clean_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0626 -5.4196 -0.0423 5.2111 22.9355
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.4562317 19.7385030 3.063 0.00254 **
## INDEX -0.0002478 0.0008508 -0.291 0.77122
## TEAM_BATTING_H 1.8111103 2.7908648 0.649 0.51723
## TEAM_BATTING_2B 0.0267462 0.0303941 0.880 0.38008
## TEAM_BATTING_3B -0.1018043 0.0777401 -1.310 0.19208
## TEAM_BATTING_HR -4.6100155 10.5666083 -0.436 0.66317
## TEAM_BATTING_BB -4.4606275 3.6457882 -1.224 0.22279
## TEAM_BATTING_SO 0.4303282 2.6231874 0.164 0.86988
## TEAM_BASERUN_SB 0.0335937 0.0288100 1.166 0.24519
## TEAM_BASERUN_CS -0.0130338 0.0719436 -0.181 0.85645
## TEAM_BATTING_HBP 0.0837038 0.0499097 1.677 0.09532 .
## TEAM_PITCHING_H -1.7887761 2.7903398 -0.641 0.52233
## TEAM_PITCHING_HR 4.6958245 10.5649821 0.444 0.65725
## TEAM_PITCHING_BB 4.5120283 3.6432611 1.238 0.21721
## TEAM_PITCHING_SO -0.4618971 2.6214432 -0.176 0.86034
## TEAM_FIELDING_E -0.1724513 0.0415365 -4.152 5.16e-05 ***
## TEAM_FIELDING_DP -0.1063200 0.0371964 -2.858 0.00478 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.489 on 174 degrees of freedom
## Multiple R-squared: 0.5503, Adjusted R-squared: 0.509
## F-statistic: 13.31 on 16 and 174 DF, p-value: < 2.2e-16
# Backward elimination by AIC, starting from the kitchen sink fit. The
# step-by-step trace is printed because stepAIC()'s trace argument is on by
# default.
kitchen_model <- MASS::stepAIC(
  object = kitchen_sink_model,
  direction = "backward"
)
## Start: AIC=833.22
## TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_SO 1 1.94 12542 831.25
## - TEAM_PITCHING_SO 1 2.24 12542 831.25
## - TEAM_BASERUN_CS 1 2.37 12542 831.25
## - INDEX 1 6.11 12546 831.31
## - TEAM_BATTING_HR 1 13.72 12554 831.43
## - TEAM_PITCHING_HR 1 14.24 12554 831.43
## - TEAM_PITCHING_H 1 29.62 12569 831.67
## - TEAM_BATTING_H 1 30.35 12570 831.68
## - TEAM_BATTING_2B 1 55.81 12596 832.07
## - TEAM_BASERUN_SB 1 97.99 12638 832.70
## - TEAM_BATTING_BB 1 107.88 12648 832.85
## - TEAM_PITCHING_BB 1 110.54 12650 832.89
## - TEAM_BATTING_3B 1 123.59 12663 833.09
## <none> 12540 833.22
## - TEAM_BATTING_HBP 1 202.70 12742 834.28
## - TEAM_FIELDING_DP 1 588.80 13129 839.98
## - TEAM_FIELDING_E 1 1242.26 13782 849.26
##
## Step: AIC=831.25
## TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BASERUN_CS 1 2.14 12544 829.28
## - INDEX 1 5.41 12547 829.33
## - TEAM_BATTING_HR 1 14.50 12556 829.47
## - TEAM_PITCHING_HR 1 15.04 12557 829.48
## - TEAM_BATTING_2B 1 54.49 12596 830.08
## - TEAM_PITCHING_H 1 85.44 12627 830.54
## - TEAM_BATTING_H 1 87.23 12629 830.57
## - TEAM_BASERUN_SB 1 96.49 12638 830.71
## - TEAM_BATTING_BB 1 108.02 12650 830.89
## - TEAM_PITCHING_BB 1 110.68 12652 830.93
## - TEAM_BATTING_3B 1 123.53 12665 831.12
## <none> 12542 831.25
## - TEAM_BATTING_HBP 1 201.03 12743 832.28
## - TEAM_FIELDING_DP 1 593.51 13135 838.08
## - TEAM_FIELDING_E 1 1245.15 13787 847.33
## - TEAM_PITCHING_SO 1 1307.82 13850 848.19
##
## Step: AIC=829.28
## TARGET_WINS ~ INDEX + TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP +
## TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - INDEX 1 4.86 12549 827.35
## - TEAM_BATTING_HR 1 14.84 12559 827.51
## - TEAM_PITCHING_HR 1 15.39 12559 827.51
## - TEAM_BATTING_2B 1 53.95 12598 828.10
## - TEAM_PITCHING_H 1 87.62 12632 828.61
## - TEAM_BATTING_H 1 89.46 12633 828.64
## - TEAM_BATTING_BB 1 110.65 12654 828.96
## - TEAM_PITCHING_BB 1 113.36 12657 829.00
## - TEAM_BASERUN_SB 1 123.69 12668 829.15
## - TEAM_BATTING_3B 1 131.20 12675 829.27
## <none> 12544 829.28
## - TEAM_BATTING_HBP 1 200.88 12745 830.31
## - TEAM_FIELDING_DP 1 600.78 13145 836.22
## - TEAM_PITCHING_SO 1 1305.90 13850 846.20
## - TEAM_FIELDING_E 1 1326.90 13871 846.49
##
## Step: AIC=827.35
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP +
## TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_HR 1 16.06 12565 825.60
## - TEAM_PITCHING_HR 1 16.64 12565 825.61
## - TEAM_BATTING_2B 1 53.05 12602 826.16
## - TEAM_PITCHING_H 1 90.24 12639 826.72
## - TEAM_BATTING_H 1 92.13 12641 826.75
## - TEAM_BATTING_BB 1 110.31 12659 827.03
## - TEAM_PITCHING_BB 1 113.00 12662 827.07
## - TEAM_BASERUN_SB 1 123.42 12672 827.22
## - TEAM_BATTING_3B 1 129.33 12678 827.31
## <none> 12549 827.35
## - TEAM_BATTING_HBP 1 197.23 12746 828.33
## - TEAM_FIELDING_DP 1 635.62 13184 834.79
## - TEAM_PITCHING_SO 1 1311.88 13861 844.35
## - TEAM_FIELDING_E 1 1322.05 13871 844.49
##
## Step: AIC=825.6
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_2B 1 55.48 12620 824.44
## - TEAM_PITCHING_H 1 89.26 12654 824.95
## - TEAM_BATTING_H 1 91.97 12657 824.99
## - TEAM_BATTING_BB 1 104.58 12669 825.18
## - TEAM_PITCHING_BB 1 107.19 12672 825.22
## <none> 12565 825.60
## - TEAM_BATTING_3B 1 137.48 12702 825.68
## - TEAM_BASERUN_SB 1 146.90 12712 825.82
## - TEAM_BATTING_HBP 1 200.36 12765 826.62
## - TEAM_FIELDING_DP 1 628.95 13194 832.93
## - TEAM_PITCHING_HR 1 853.54 13418 836.15
## - TEAM_PITCHING_SO 1 1316.68 13882 842.63
## - TEAM_FIELDING_E 1 1333.15 13898 842.86
##
## Step: AIC=824.44
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB +
## TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_H 1 84.47 12705 823.71
## - TEAM_BATTING_H 1 87.79 12708 823.76
## - TEAM_BATTING_BB 1 98.92 12719 823.93
## - TEAM_PITCHING_BB 1 101.48 12722 823.97
## - TEAM_BASERUN_SB 1 109.27 12730 824.09
## <none> 12620 824.44
## - TEAM_BATTING_3B 1 147.01 12767 824.65
## - TEAM_BATTING_HBP 1 204.39 12825 825.51
## - TEAM_FIELDING_DP 1 649.12 13269 832.02
## - TEAM_PITCHING_HR 1 812.92 13433 834.36
## - TEAM_PITCHING_SO 1 1262.90 13883 840.66
## - TEAM_FIELDING_E 1 1379.34 14000 842.25
##
## Step: AIC=823.71
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB +
## TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_BB 1 32.85 12738 822.21
## - TEAM_PITCHING_BB 1 43.42 12748 822.37
## - TEAM_BASERUN_SB 1 105.16 12810 823.29
## <none> 12705 823.71
## - TEAM_BATTING_3B 1 153.13 12858 824.00
## - TEAM_BATTING_HBP 1 183.82 12888 824.46
## - TEAM_BATTING_H 1 504.11 13209 829.15
## - TEAM_FIELDING_DP 1 602.80 13308 830.57
## - TEAM_PITCHING_HR 1 850.25 13555 834.09
## - TEAM_PITCHING_SO 1 1259.72 13964 839.77
## - TEAM_FIELDING_E 1 1419.39 14124 841.94
##
## Step: AIC=822.21
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BASERUN_SB +
## TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BASERUN_SB 1 109.99 12848 821.85
## <none> 12738 822.21
## - TEAM_BATTING_3B 1 156.45 12894 822.54
## - TEAM_BATTING_HBP 1 186.58 12924 822.98
## - TEAM_BATTING_H 1 485.67 13223 827.35
## - TEAM_FIELDING_DP 1 623.19 13361 829.33
## - TEAM_PITCHING_HR 1 843.83 13581 832.46
## - TEAM_PITCHING_SO 1 1267.25 14005 838.32
## - TEAM_FIELDING_E 1 1395.02 14133 840.06
## - TEAM_PITCHING_BB 1 2364.81 15102 852.73
##
## Step: AIC=821.85
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_HBP +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_3B 1 133.47 12981 821.82
## <none> 12848 821.85
## - TEAM_BATTING_HBP 1 177.11 13025 822.46
## - TEAM_BATTING_H 1 566.11 13414 828.09
## - TEAM_FIELDING_DP 1 737.46 13585 830.51
## - TEAM_PITCHING_HR 1 756.49 13604 830.78
## - TEAM_PITCHING_SO 1 1257.91 14106 837.69
## - TEAM_FIELDING_E 1 1330.40 14178 838.67
## - TEAM_PITCHING_BB 1 2371.12 15219 852.20
##
## Step: AIC=821.82
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 12981 821.82
## - TEAM_BATTING_HBP 1 228.70 13210 823.16
## - TEAM_BATTING_H 1 449.87 13431 826.33
## - TEAM_FIELDING_DP 1 813.17 13794 831.43
## - TEAM_PITCHING_HR 1 990.20 13971 833.86
## - TEAM_PITCHING_SO 1 1316.56 14298 838.27
## - TEAM_FIELDING_E 1 1334.60 14316 838.52
## - TEAM_PITCHING_BB 1 2583.00 15564 854.49
# Coefficient table and fit statistics for the model chosen by stepAIC().
summary(object = kitchen_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = clean_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2248 -5.6294 -0.0212 5.0439 21.3065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.95454 19.10292 3.191 0.001670 **
## TEAM_BATTING_H 0.02541 0.01009 2.518 0.012648 *
## TEAM_BATTING_HBP 0.08712 0.04852 1.796 0.074211 .
## TEAM_PITCHING_HR 0.08945 0.02394 3.736 0.000249 ***
## TEAM_PITCHING_BB 0.05672 0.00940 6.034 8.66e-09 ***
## TEAM_PITCHING_SO -0.03136 0.00728 -4.308 2.68e-05 ***
## TEAM_FIELDING_E -0.17218 0.03970 -4.338 2.38e-05 ***
## TEAM_FIELDING_DP -0.11904 0.03516 -3.386 0.000869 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.422 on 183 degrees of freedom
## Multiple R-squared: 0.5345, Adjusted R-squared: 0.5167
## F-statistic: 30.02 on 7 and 183 DF, p-value: < 2.2e-16
Summarize
# Same summary as the one printed right after the stepwise selection,
# repeated here so the discussion below has the final model next to it.
summary(object = kitchen_model)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = clean_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2248 -5.6294 -0.0212 5.0439 21.3065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.95454 19.10292 3.191 0.001670 **
## TEAM_BATTING_H 0.02541 0.01009 2.518 0.012648 *
## TEAM_BATTING_HBP 0.08712 0.04852 1.796 0.074211 .
## TEAM_PITCHING_HR 0.08945 0.02394 3.736 0.000249 ***
## TEAM_PITCHING_BB 0.05672 0.00940 6.034 8.66e-09 ***
## TEAM_PITCHING_SO -0.03136 0.00728 -4.308 2.68e-05 ***
## TEAM_FIELDING_E -0.17218 0.03970 -4.338 2.38e-05 ***
## TEAM_FIELDING_DP -0.11904 0.03516 -3.386 0.000869 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.422 on 183 degrees of freedom
## Multiple R-squared: 0.5345, Adjusted R-squared: 0.5167
## F-statistic: 30.02 on 7 and 183 DF, p-value: < 2.2e-16
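- The stepAIC function performed backward elimination on the kitchen sink
model: at each step it dropped the variable whose removal lowered the AIC
the most, and it stopped once no removal lowered the AIC any further.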
- Variables are kept or removed according to the AIC, not the
positive/negative sign of their coefficients. Starting from the kitchen
sink model, the stepwise trace shows that INDEX, TEAM_BATTING_2B,
TEAM_BATTING_3B, TEAM_BATTING_HR, TEAM_BATTING_BB, TEAM_BATTING_SO,
TEAM_BASERUN_SB, TEAM_BASERUN_CS, and TEAM_PITCHING_H were removed.
The final regression keeps TEAM_BATTING_H, TEAM_BATTING_HBP,
TEAM_PITCHING_HR, TEAM_PITCHING_BB, TEAM_PITCHING_SO, TEAM_FIELDING_E,
and TEAM_FIELDING_DP.