#install.packages("tidyverse")
#install.packages("visdat")
#install.packages("stargazer")
#install.packages("MASS")

library("MASS")
library("psych")
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::select()  masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("visdat") #for visualizing the data
library("stargazer")
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library("ggplot2")
library("reshape2")  # reshaping data to long format
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

Step 1: Import the Data

# Read the raw training data from CSV into a data frame.
# NOTE(review): the path is absolute and specific to one machine; anyone else
# running this will need to point it at their own copy of the file.
moneyball <- read.csv(
  file = "C:/Users/Alex Law/Documents/BC Data Analysis/moneyball-training-data (3).csv"
)

Step 2: Clean Data

# Drop TEAM_BATTING_HBP by name instead of by position (it was column 11), so
# this step keeps removing the intended column even if the CSV layout changes.
moneyball2 <- moneyball[, names(moneyball) != "TEAM_BATTING_HBP", drop = FALSE]
# Keep only the rows that have no missing values in the remaining columns.
clean_moneyball <- na.omit(moneyball2)

Step 3: Summary Statistics

# Print a plain-text table of descriptive statistics (N, mean, standard
# deviation, min, max) for each column of the cleaned data.
stargazer(
  clean_moneyball,
  type = "text"
)
## 
## =====================================================
## Statistic          N     Mean    St. Dev.  Min   Max 
## -----------------------------------------------------
## INDEX            1,486 1,273.812 725.508    2   2,534
## TARGET_WINS      1,486  80.997    12.694   41    117 
## TEAM_BATTING_H   1,486 1,452.157 104.336  1,137 1,786
## TEAM_BATTING_2B  1,486  250.970   42.026   154   377 
## TEAM_BATTING_3B  1,486  42.905    18.649   11    129 
## TEAM_BATTING_HR  1,486  129.842   48.609   11    264 
## TEAM_BATTING_BB  1,486  541.888   80.567   309   878 
## TEAM_BATTING_SO  1,486  841.743  200.355   326  1,399
## TEAM_BASERUN_SB  1,486  95.858    44.345   18    314 
## TEAM_BASERUN_CS  1,486  52.963    22.851   11    201 
## TEAM_PITCHING_H  1,486 1,505.122 173.472  1,137 2,394
## TEAM_PITCHING_HR 1,486  134.069   50.902   12    343 
## TEAM_PITCHING_BB 1,486  561.570   97.347   325  1,090
## TEAM_PITCHING_SO 1,486  869.253  211.859   345  1,781
## TEAM_FIELDING_E  1,486  143.145   38.954   65    360 
## TEAM_FIELDING_DP 1,486  153.743   20.321   87    228 
## -----------------------------------------------------

Step 4: Graphs

# Reshape the cleaned data to long format: with no id variables given, every
# column is treated as a measure variable and stacked into the `variable` /
# `value` pair that the faceted plot below relies on.
moneyball_melted <- reshape2::melt(data = clean_moneyball)
## No id variables; using all as measure variables
# One histogram per variable; each panel gets its own x-axis range because the
# variables are on very different scales.
ggplot(moneyball_melted, mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~ variable, scales = "free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7/25 Homework:

1. Unique identifier

# Homework model: remove the INDEX identifier column, then regress TARGET_WINS
# on every remaining column (the "kitchen sink" model).
# library() is used instead of require() so that a missing tidyverse install
# stops the script here rather than returning FALSE and failing further down.
library(tidyverse)
clean_df <- dplyr::select(moneyball, -INDEX)
# This starts from the raw `moneyball` data, so TEAM_BATTING_HBP is still a
# predictor. lm() drops rows containing missing values; the summary below
# reports 2085 observations deleted due to missingness.
kitchen_sink <- lm(formula = TARGET_WINS ~ ., data = clean_df)
summary(kitchen_sink)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = clean_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.8708  -5.6564  -0.0599   5.2545  22.9274 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      60.28826   19.67842   3.064  0.00253 ** 
## TEAM_BATTING_H    1.91348    2.76139   0.693  0.48927    
## TEAM_BATTING_2B   0.02639    0.03029   0.871  0.38484    
## TEAM_BATTING_3B  -0.10118    0.07751  -1.305  0.19348    
## TEAM_BATTING_HR  -4.84371   10.50851  -0.461  0.64542    
## TEAM_BATTING_BB  -4.45969    3.63624  -1.226  0.22167    
## TEAM_BATTING_SO   0.34196    2.59876   0.132  0.89546    
## TEAM_BASERUN_SB   0.03304    0.02867   1.152  0.25071    
## TEAM_BASERUN_CS  -0.01104    0.07143  -0.155  0.87730    
## TEAM_BATTING_HBP  0.08247    0.04960   1.663  0.09815 .  
## TEAM_PITCHING_H  -1.89096    2.76095  -0.685  0.49432    
## TEAM_PITCHING_HR  4.93043   10.50664   0.469  0.63946    
## TEAM_PITCHING_BB  4.51089    3.63372   1.241  0.21612    
## TEAM_PITCHING_SO -0.37364    2.59705  -0.144  0.88577    
## TEAM_FIELDING_E  -0.17204    0.04140  -4.155 5.08e-05 ***
## TEAM_FIELDING_DP -0.10819    0.03654  -2.961  0.00349 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.467 on 175 degrees of freedom
##   (2085 observations deleted due to missingness)
## Multiple R-squared:  0.5501, Adjusted R-squared:  0.5116 
## F-statistic: 14.27 on 15 and 175 DF,  p-value: < 2.2e-16
# Backward stepwise selection by AIC, starting from the full model and removing
# one term per step; the trace of each step and the final model are printed.
MASS::stepAIC(kitchen_sink, direction = "backward")
## Start:  AIC=831.31
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + 
##     TEAM_BASERUN_CS + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_BATTING_SO   1      1.24 12547 829.33
## - TEAM_PITCHING_SO  1      1.48 12547 829.33
## - TEAM_BASERUN_CS   1      1.71 12548 829.34
## - TEAM_BATTING_HR   1     15.23 12561 829.54
## - TEAM_PITCHING_HR  1     15.79 12562 829.55
## - TEAM_PITCHING_H   1     33.63 12580 829.82
## - TEAM_BATTING_H    1     34.42 12580 829.83
## - TEAM_BATTING_2B   1     54.41 12600 830.14
## - TEAM_BASERUN_SB   1     95.22 12641 830.76
## - TEAM_BATTING_BB   1    107.84 12654 830.95
## - TEAM_PITCHING_BB  1    110.48 12656 830.99
## - TEAM_BATTING_3B   1    122.16 12668 831.16
## <none>                          12546 831.31
## - TEAM_BATTING_HBP  1    198.21 12744 832.31
## - TEAM_FIELDING_DP  1    628.49 13174 838.65
## - TEAM_FIELDING_E   1   1237.79 13784 847.28
## 
## Step:  AIC=829.33
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BASERUN_CS + 
##     TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_BASERUN_CS   1      1.59 12549 827.35
## - TEAM_BATTING_HR   1     15.82 12563 827.57
## - TEAM_PITCHING_HR  1     16.39 12564 827.58
## - TEAM_BATTING_2B   1     53.47 12601 828.14
## - TEAM_PITCHING_H   1     88.45 12636 828.67
## - TEAM_BATTING_H    1     90.30 12637 828.70
## - TEAM_BASERUN_SB   1     94.19 12641 828.76
## - TEAM_BATTING_BB   1    107.95 12655 828.97
## - TEAM_PITCHING_BB  1    110.60 12658 829.01
## - TEAM_BATTING_3B   1    122.20 12669 829.18
## <none>                          12547 829.33
## - TEAM_BATTING_HBP  1    197.11 12744 830.31
## - TEAM_FIELDING_DP  1    630.68 13178 836.70
## - TEAM_FIELDING_E   1   1240.80 13788 845.34
## - TEAM_PITCHING_SO  1   1312.89 13860 846.34
## 
## Step:  AIC=827.35
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP + 
##     TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_BATTING_HR   1     16.06 12565 825.60
## - TEAM_PITCHING_HR  1     16.64 12565 825.61
## - TEAM_BATTING_2B   1     53.05 12602 826.16
## - TEAM_PITCHING_H   1     90.24 12639 826.72
## - TEAM_BATTING_H    1     92.13 12641 826.75
## - TEAM_BATTING_BB   1    110.31 12659 827.03
## - TEAM_PITCHING_BB  1    113.00 12662 827.07
## - TEAM_BASERUN_SB   1    123.42 12672 827.22
## - TEAM_BATTING_3B   1    129.33 12678 827.31
## <none>                          12549 827.35
## - TEAM_BATTING_HBP  1    197.23 12746 828.33
## - TEAM_FIELDING_DP  1    635.62 13184 834.79
## - TEAM_PITCHING_SO  1   1311.88 13861 844.35
## - TEAM_FIELDING_E   1   1322.05 13871 844.49
## 
## Step:  AIC=825.6
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + TEAM_BATTING_3B + 
##     TEAM_BATTING_BB + TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_BATTING_2B   1     55.48 12620 824.44
## - TEAM_PITCHING_H   1     89.26 12654 824.95
## - TEAM_BATTING_H    1     91.97 12657 824.99
## - TEAM_BATTING_BB   1    104.58 12669 825.18
## - TEAM_PITCHING_BB  1    107.19 12672 825.22
## <none>                          12565 825.60
## - TEAM_BATTING_3B   1    137.48 12702 825.68
## - TEAM_BASERUN_SB   1    146.90 12712 825.82
## - TEAM_BATTING_HBP  1    200.36 12765 826.62
## - TEAM_FIELDING_DP  1    628.95 13194 832.93
## - TEAM_PITCHING_HR  1    853.54 13418 836.15
## - TEAM_PITCHING_SO  1   1316.68 13882 842.63
## - TEAM_FIELDING_E   1   1333.15 13898 842.86
## 
## Step:  AIC=824.44
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB + 
##     TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_H + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_PITCHING_H   1     84.47 12705 823.71
## - TEAM_BATTING_H    1     87.79 12708 823.76
## - TEAM_BATTING_BB   1     98.92 12719 823.93
## - TEAM_PITCHING_BB  1    101.48 12722 823.97
## - TEAM_BASERUN_SB   1    109.27 12730 824.09
## <none>                          12620 824.44
## - TEAM_BATTING_3B   1    147.01 12767 824.65
## - TEAM_BATTING_HBP  1    204.39 12825 825.51
## - TEAM_FIELDING_DP  1    649.12 13269 832.02
## - TEAM_PITCHING_HR  1    812.92 13433 834.36
## - TEAM_PITCHING_SO  1   1262.90 13883 840.66
## - TEAM_FIELDING_E   1   1379.34 14000 842.25
## 
## Step:  AIC=823.71
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_BB + 
##     TEAM_BASERUN_SB + TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_BATTING_BB   1     32.85 12738 822.21
## - TEAM_PITCHING_BB  1     43.42 12748 822.37
## - TEAM_BASERUN_SB   1    105.16 12810 823.29
## <none>                          12705 823.71
## - TEAM_BATTING_3B   1    153.13 12858 824.00
## - TEAM_BATTING_HBP  1    183.82 12888 824.46
## - TEAM_BATTING_H    1    504.11 13209 829.15
## - TEAM_FIELDING_DP  1    602.80 13308 830.57
## - TEAM_PITCHING_HR  1    850.25 13555 834.09
## - TEAM_PITCHING_SO  1   1259.72 13964 839.77
## - TEAM_FIELDING_E   1   1419.39 14124 841.94
## 
## Step:  AIC=822.21
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BASERUN_SB + 
##     TEAM_BATTING_HBP + TEAM_PITCHING_HR + TEAM_PITCHING_BB + 
##     TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_BASERUN_SB   1    109.99 12848 821.85
## <none>                          12738 822.21
## - TEAM_BATTING_3B   1    156.45 12894 822.54
## - TEAM_BATTING_HBP  1    186.58 12924 822.98
## - TEAM_BATTING_H    1    485.67 13223 827.35
## - TEAM_FIELDING_DP  1    623.19 13361 829.33
## - TEAM_PITCHING_HR  1    843.83 13581 832.46
## - TEAM_PITCHING_SO  1   1267.25 14005 838.32
## - TEAM_FIELDING_E   1   1395.02 14133 840.06
## - TEAM_PITCHING_BB  1   2364.81 15102 852.73
## 
## Step:  AIC=821.85
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_3B + TEAM_BATTING_HBP + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## - TEAM_BATTING_3B   1    133.47 12981 821.82
## <none>                          12848 821.85
## - TEAM_BATTING_HBP  1    177.11 13025 822.46
## - TEAM_BATTING_H    1    566.11 13414 828.09
## - TEAM_FIELDING_DP  1    737.46 13585 830.51
## - TEAM_PITCHING_HR  1    756.49 13604 830.78
## - TEAM_PITCHING_SO  1   1257.91 14106 837.69
## - TEAM_FIELDING_E   1   1330.40 14178 838.67
## - TEAM_PITCHING_BB  1   2371.12 15219 852.20
## 
## Step:  AIC=821.82
## TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP + TEAM_PITCHING_HR + 
##     TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP
## 
##                    Df Sum of Sq   RSS    AIC
## <none>                          12981 821.82
## - TEAM_BATTING_HBP  1    228.70 13210 823.16
## - TEAM_BATTING_H    1    449.87 13431 826.33
## - TEAM_FIELDING_DP  1    813.17 13794 831.43
## - TEAM_PITCHING_HR  1    990.20 13971 833.86
## - TEAM_PITCHING_SO  1   1316.56 14298 838.27
## - TEAM_FIELDING_E   1   1334.60 14316 838.52
## - TEAM_PITCHING_BB  1   2583.00 15564 854.49
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP, data = clean_df)
## 
## Coefficients:
##      (Intercept)    TEAM_BATTING_H  TEAM_BATTING_HBP  TEAM_PITCHING_HR  
##         60.95454           0.02541           0.08712           0.08945  
## TEAM_PITCHING_BB  TEAM_PITCHING_SO   TEAM_FIELDING_E  TEAM_FIELDING_DP  
##          0.05672          -0.03136          -0.17218          -0.11904