#install.packages("tidyverse")
#install.packages("visdat")
#install.packages("stargazer")
#install.packages("MASS")
library("MASS")
library("psych")
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("visdat") #for visualizing the data
library("stargazer")
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library("ggplot2")
library("reshape2") #making data long
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
Step 1: Import the Data
# Load the raw Moneyball training data from its local CSV export.
moneyball <- read.csv(
  file = "C:/Users/Alex Law/Documents/BC Data Analysis/moneyball-training-data (3).csv"
)
Step 2: Clean Data
# Drop TEAM_BATTING_HBP by name rather than by position (it is column 11 of the
# raw data), so this step keeps working if the column order ever changes.
# NOTE(review): presumably dropped because it is mostly missing -- confirm.
# drop = FALSE guarantees the result is still a data frame.
moneyball2 <- moneyball[, names(moneyball) != "TEAM_BATTING_HBP", drop = FALSE]

# Keep only the rows that have a value in every remaining column.
clean_moneyball <- na.omit(moneyball2)
Step 3: Summary Statistics
# Print a plain-text table of descriptive statistics (N, mean, standard
# deviation, min, max) for every column of the cleaned data.
stargazer::stargazer(clean_moneyball, type = "text")
##
## =====================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------
## INDEX 1,486 1,273.812 725.508 2 2,534
## TARGET_WINS 1,486 80.997 12.694 41 117
## TEAM_BATTING_H 1,486 1,452.157 104.336 1,137 1,786
## TEAM_BATTING_2B 1,486 250.970 42.026 154 377
## TEAM_BATTING_3B 1,486 42.905 18.649 11 129
## TEAM_BATTING_HR 1,486 129.842 48.609 11 264
## TEAM_BATTING_BB 1,486 541.888 80.567 309 878
## TEAM_BATTING_SO 1,486 841.743 200.355 326 1,399
## TEAM_BASERUN_SB 1,486 95.858 44.345 18 314
## TEAM_BASERUN_CS 1,486 52.963 22.851 11 201
## TEAM_PITCHING_H 1,486 1,505.122 173.472 1,137 2,394
## TEAM_PITCHING_HR 1,486 134.069 50.902 12 343
## TEAM_PITCHING_BB 1,486 561.570 97.347 325 1,090
## TEAM_PITCHING_SO 1,486 869.253 211.859 345 1,781
## TEAM_FIELDING_E 1,486 143.145 38.954 65 360
## TEAM_FIELDING_DP 1,486 153.743 20.321 87 228
## -----------------------------------------------------
Step 4: Graphs
# Reshape the cleaned data to long format (columns `variable` and `value`)
# so every column can be plotted as its own facet.
moneyball_melted <- reshape2::melt(data = clean_moneyball)
## No id variables; using all as measure variables
# One histogram per variable; each panel gets its own x-axis range because the
# variables are on very different scales.
ggplot(moneyball_melted, mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~ variable, scales = "free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7/25 Homework:
1. Unique identifier
# Attach the tidyverse with library() rather than require(): library() stops
# with an error when the package is missing, whereas require() only warns,
# returns FALSE and lets the script carry on.
library("tidyverse")

# INDEX is the unique row identifier, not a predictor, so exclude it.
clean_df <- dplyr::select(moneyball, -INDEX)

# "Kitchen sink" model: regress TARGET_WINS on every remaining column.
# lm() drops any row with a missing value by default, so check the
# "observations deleted due to missingness" line of the summary to see how
# many rows the fit actually used.
kitchen_sink <- lm(formula = TARGET_WINS ~ ., data = clean_df)
summary(kitchen_sink)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = clean_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.8708 -5.6564 -0.0599 5.2545 22.9274
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.28826 19.67842 3.064 0.00253 **
## TEAM_BATTING_H 1.91348 2.76139 0.693 0.48927
## TEAM_BATTING_2B 0.02639 0.03029 0.871 0.38484
## TEAM_BATTING_3B -0.10118 0.07751 -1.305 0.19348
## TEAM_BATTING_HR -4.84371 10.50851 -0.461 0.64542
## TEAM_BATTING_BB -4.45969 3.63624 -1.226 0.22167
## TEAM_BATTING_SO 0.34196 2.59876 0.132 0.89546
## TEAM_BASERUN_SB 0.03304 0.02867 1.152 0.25071
## TEAM_BASERUN_CS -0.01104 0.07143 -0.155 0.87730
## TEAM_BATTING_HBP 0.08247 0.04960 1.663 0.09815 .
## TEAM_PITCHING_H -1.89096 2.76095 -0.685 0.49432
## TEAM_PITCHING_HR 4.93043 10.50664 0.469 0.63946
## TEAM_PITCHING_BB 4.51089 3.63372 1.241 0.21612
## TEAM_PITCHING_SO -0.37364 2.59705 -0.144 0.88577
## TEAM_FIELDING_E -0.17204 0.04140 -4.155 5.08e-05 ***
## TEAM_FIELDING_DP -0.10819 0.03654 -2.961 0.00349 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.467 on 175 degrees of freedom
## (2085 observations deleted due to missingness)
## Multiple R-squared: 0.5501, Adjusted R-squared: 0.5116
## F-statistic: 14.27 on 15 and 175 DF, p-value: < 2.2e-16
# Backward elimination starting from the full model: at each step the term
# whose removal gives the lowest AIC is dropped, until no removal improves AIC.
MASS::stepAIC(kitchen_sink, direction = "backward")
## Start: AIC=831.31
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB +
## TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_SO 1 1.24 12547 829.33
## - TEAM_PITCHING_SO 1 1.48 12547 829.33
## - TEAM_BASERUN_CS 1 1.71 12548 829.34
## - TEAM_BATTING_HR 1 15.23 12561 829.54
## - TEAM_PITCHING_HR 1 15.79 12562 829.55
## - TEAM_PITCHING_H 1 33.63 12580 829.82
## - TEAM_BATTING_H 1 34.42 12580 829.83
## - TEAM_BATTING_2B 1 54.41 12600 830.14
## - TEAM_BASERUN_SB 1 95.22 12641 830.76
## - TEAM_BATTING_BB 1 107.84 12654 830.95
## - TEAM_PITCHING_BB 1 110.48 12656 830.99
## - TEAM_BATTING_3B 1 122.16 12668 831.16
## <none> 12546 831.31
## - TEAM_BATTING_HBP 1 198.21 12744 832.31
## - TEAM_FIELDING_DP 1 628.49 13174 838.65
## - TEAM_FIELDING_E 1 1237.79 13784 847.28
##
## Step: AIC=829.33
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS +
## TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BASERUN_CS 1 1.59 12549 827.35
## - TEAM_BATTING_HR 1 15.82 12563 827.57
## - TEAM_PITCHING_HR 1 16.39 12564 827.58
## - TEAM_BATTING_2B 1 53.47 12601 828.14
## - TEAM_PITCHING_H 1 88.45 12636 828.67
## - TEAM_BATTING_H 1 90.30 12637 828.70
## - TEAM_BASERUN_SB 1 94.19 12641 828.76
## - TEAM_BATTING_BB 1 107.95 12655 828.97
## - TEAM_PITCHING_BB 1 110.60 12658 829.01
## - TEAM_BATTING_3B 1 122.20 12669 829.18
## <none> 12547 829.33
## - TEAM_BATTING_HBP 1 197.11 12744 830.31
## - TEAM_FIELDING_DP 1 630.68 13178 836.70
## - TEAM_FIELDING_E 1 1240.80 13788 845.34
## - TEAM_PITCHING_SO 1 1312.89 13860 846.34
##
## Step: AIC=827.35
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP +
## TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_HR 1 16.06 12565 825.60
## - TEAM_PITCHING_HR 1 16.64 12565 825.61
## - TEAM_BATTING_2B 1 53.05 12602 826.16
## - TEAM_PITCHING_H 1 90.24 12639 826.72
## - TEAM_BATTING_H 1 92.13 12641 826.75
## - TEAM_BATTING_BB 1 110.31 12659 827.03
## - TEAM_PITCHING_BB 1 113.00 12662 827.07
## - TEAM_BASERUN_SB 1 123.42 12672 827.22
## - TEAM_BATTING_3B 1 129.33 12678 827.31
## <none> 12549 827.35
## - TEAM_BATTING_HBP 1 197.23 12746 828.33
## - TEAM_FIELDING_DP 1 635.62 13184 834.79
## - TEAM_PITCHING_SO 1 1311.88 13861 844.35
## - TEAM_FIELDING_E 1 1322.05 13871 844.49
##
## Step: AIC=825.6
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B +
## TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_2B 1 55.48 12620 824.44
## - TEAM_PITCHING_H 1 89.26 12654 824.95
## - TEAM_BATTING_H 1 91.97 12657 824.99
## - TEAM_BATTING_BB 1 104.58 12669 825.18
## - TEAM_PITCHING_BB 1 107.19 12672 825.22
## <none> 12565 825.60
## - TEAM_BATTING_3B 1 137.48 12702 825.68
## - TEAM_BASERUN_SB 1 146.90 12712 825.82
## - TEAM_BATTING_HBP 1 200.36 12765 826.62
## - TEAM_FIELDING_DP 1 628.95 13194 832.93
## - TEAM_PITCHING_HR 1 853.54 13418 836.15
## - TEAM_PITCHING_SO 1 1316.68 13882 842.63
## - TEAM_FIELDING_E 1 1333.15 13898 842.86
##
## Step: AIC=824.44
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB +
## TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_PITCHING_H 1 84.47 12705 823.71
## - TEAM_BATTING_H 1 87.79 12708 823.76
## - TEAM_BATTING_BB 1 98.92 12719 823.93
## - TEAM_PITCHING_BB 1 101.48 12722 823.97
## - TEAM_BASERUN_SB 1 109.27 12730 824.09
## <none> 12620 824.44
## - TEAM_BATTING_3B 1 147.01 12767 824.65
## - TEAM_BATTING_HBP 1 204.39 12825 825.51
## - TEAM_FIELDING_DP 1 649.12 13269 832.02
## - TEAM_PITCHING_HR 1 812.92 13433 834.36
## - TEAM_PITCHING_SO 1 1262.90 13883 840.66
## - TEAM_FIELDING_E 1 1379.34 14000 842.25
##
## Step: AIC=823.71
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB +
## TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_BB 1 32.85 12738 822.21
## - TEAM_PITCHING_BB 1 43.42 12748 822.37
## - TEAM_BASERUN_SB 1 105.16 12810 823.29
## <none> 12705 823.71
## - TEAM_BATTING_3B 1 153.13 12858 824.00
## - TEAM_BATTING_HBP 1 183.82 12888 824.46
## - TEAM_BATTING_H 1 504.11 13209 829.15
## - TEAM_FIELDING_DP 1 602.80 13308 830.57
## - TEAM_PITCHING_HR 1 850.25 13555 834.09
## - TEAM_PITCHING_SO 1 1259.72 13964 839.77
## - TEAM_FIELDING_E 1 1419.39 14124 841.94
##
## Step: AIC=822.21
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BASERUN_SB +
## TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB +
## TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BASERUN_SB 1 109.99 12848 821.85
## <none> 12738 822.21
## - TEAM_BATTING_3B 1 156.45 12894 822.54
## - TEAM_BATTING_HBP 1 186.58 12924 822.98
## - TEAM_BATTING_H 1 485.67 13223 827.35
## - TEAM_FIELDING_DP 1 623.19 13361 829.33
## - TEAM_PITCHING_HR 1 843.83 13581 832.46
## - TEAM_PITCHING_SO 1 1267.25 14005 838.32
## - TEAM_FIELDING_E 1 1395.02 14133 840.06
## - TEAM_PITCHING_BB 1 2364.81 15102 852.73
##
## Step: AIC=821.85
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_HBP +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## - TEAM_BATTING_3B 1 133.47 12981 821.82
## <none> 12848 821.85
## - TEAM_BATTING_HBP 1 177.11 13025 822.46
## - TEAM_BATTING_H 1 566.11 13414 828.09
## - TEAM_FIELDING_DP 1 737.46 13585 830.51
## - TEAM_PITCHING_HR 1 756.49 13604 830.78
## - TEAM_PITCHING_SO 1 1257.91 14106 837.69
## - TEAM_FIELDING_E 1 1330.40 14178 838.67
## - TEAM_PITCHING_BB 1 2371.12 15219 852.20
##
## Step: AIC=821.82
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP + TEAM_PITCHING_HR +
## TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
##
## Df Sum of Sq RSS AIC
## <none> 12981 821.82
## - TEAM_BATTING_HBP 1 228.70 13210 823.16
## - TEAM_BATTING_H 1 449.87 13431 826.33
## - TEAM_FIELDING_DP 1 813.17 13794 831.43
## - TEAM_PITCHING_HR 1 990.20 13971 833.86
## - TEAM_PITCHING_SO 1 1316.56 14298 838.27
## - TEAM_FIELDING_E 1 1334.60 14316 838.52
## - TEAM_PITCHING_BB 1 2583.00 15564 854.49
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = clean_df)
##
## Coefficients:
## (Intercept) TEAM_BATTING_H TEAM_BATTING_HBP TEAM_PITCHING_HR
## 60.95454 0.02541 0.08712 0.08945
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 0.05672 -0.03136 -0.17218 -0.11904