# Remove every visible object from the current environment before the
# analysis starts.
# NOTE(review): this also wipes the workspace of anyone who source()s this
# script; running it in a fresh R session would make this line unnecessary.
rm(list = ls())

# Read the training data from the current user's Downloads folder.
moneyball.training.data <- read.csv(
  file = "~/Downloads/moneyball-training-data.csv"
)

# Clean Data ----

# Report how many values are missing in each column of the raw data.
colSums(is.na(moneyball.training.data))
##            INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
##                0                0                0                0 
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##                0                0                0              102 
##  TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H 
##              131              772             2085                0 
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##                0                0              102                0 
## TEAM_FIELDING_DP 
##              286

# Keep only the rows that have no missing value in any column.
moneyball_data_clean <- na.omit(moneyball.training.data)

# Keep only the columns that contain no missing values.
# NOTE(review): na.omit() above has already removed every row holding an NA,
# so this mask is TRUE for every column and the subset keeps all of them.
# If the intent was to drop sparse columns (e.g. TEAM_BATTING_HBP) instead of
# rows, the column filter would need to run on the raw data -- confirm.
complete_cols <- colSums(is.na(moneyball_data_clean)) == 0
money_clean <- moneyball_data_clean[, complete_cols]

# Summary Statistics Table ----

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggplot2)

# Print a plain-text summary table (mean, standard deviation, min, max) for
# every column of the cleaned data.
# The per-variable N column is omitted, so the sample size is reported once in
# the table note. It is computed from the data being summarised: money_clean
# only holds the complete rows (TEAM_BATTING_HBP alone is missing in 2085
# rows), so the previously hard-coded "n = 2276" overstated the sample size.
stargazer(
  money_clean,
  type = "text",
  title = "Summary Statistics",
  digits = 2,
  omit.summary.stat = "n",
  notes = paste("n =", nrow(money_clean))
)
## 
## Summary Statistics
## ==============================================
## Statistic          Mean   St. Dev.  Min   Max 
## ----------------------------------------------
## INDEX            1,383.59  765.24   41   2,534
## TARGET_WINS       80.93    12.12    43    116 
## TEAM_BATTING_H   1,478.63  76.15   1,308 1,667
## TEAM_BATTING_2B   297.20   26.33    201   373 
## TEAM_BATTING_3B   30.74     9.04    12    61  
## TEAM_BATTING_HR   178.05   32.41    116   260 
## TEAM_BATTING_BB   543.32   74.84    365   775 
## TEAM_BATTING_SO  1,051.03  104.16   805  1,399
## TEAM_BASERUN_SB   90.91    29.92    31    177 
## TEAM_BASERUN_CS   39.94    11.90    12    74  
## TEAM_BATTING_HBP  59.36    12.97    29    95  
## TEAM_PITCHING_H  1,479.70  75.79   1,312 1,667
## TEAM_PITCHING_HR  178.18   32.39    116   260 
## TEAM_PITCHING_BB  543.72   74.92    367   775 
## TEAM_PITCHING_SO 1,051.82  104.35   805  1,399
## TEAM_FIELDING_E   107.05   16.63    65    145 
## TEAM_FIELDING_DP  152.34   17.61    113   204 
## ----------------------------------------------
## n = 191
# Scatter plot of TARGET_WINS (y) against TEAM_BATTING_HR (x), one point per
# row of the cleaned data.
ggplot(money_clean, aes(x = TEAM_BATTING_HR, y = TARGET_WINS)) +
  geom_point()

# Move Data to Excel ----

# install.packages("writexl")
library(writexl)

# Write the cleaned data to an Excel workbook; the relative path resolves
# against the current working directory.
write_xlsx(x = money_clean, path = "GroupProject.xlsx")