Import Data

# Load the raw Moneyball training data and take a working copy of it.
# The former `rm(list = ls())` was dropped: wiping the global environment is a
# hidden side effect that destroys whatever the caller already had loaded.
# NOTE(review): the path below is machine-specific; point `moneyball_path` at
# a project-relative location before sharing this script.
moneyball_path <- "C:/Users/aarav/Downloads/moneyball-training-data.csv"
if (!file.exists(moneyball_path)) {
  # Fail with a readable message instead of an opaque connection error.
  stop("Training data not found at: ", moneyball_path, call. = FALSE)
}
moneyball.training.data <- read.csv(moneyball_path, header = TRUE)
# `df` is the copy that is cleaned and plotted below; the raw frame is kept
# as loaded so it can still be inspected afterwards.
df <- moneyball.training.data

Clean Data

# Number of missing cells in the raw data, before any imputation.
sum(is.na(moneyball.training.data))
## [1] 3478

# Median-impute every column of the working copy: each NA is replaced by the
# median of the observed values in the same column. `df[] <-` keeps `df` a
# data frame with its original column names.
df[] <- lapply(df, function(column) {
  column[is.na(column)] <- median(column, na.rm = TRUE)
  column
})

# Verify the imputation on the cleaned frame. The original check looked at the
# raw frame again, which can never confirm that `df` is free of NAs.
sum(is.na(df))
## [1] 0

Data Table

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Summary statistics for every column except INDEX.
# stargazer assigns `covariate.labels` by position, so the labels must follow
# the column order of `df` (with INDEX omitted). The previous vector listed
# hit-by-pitch before strike-outs and walks allowed before hits allowed, which
# shifted the labels on seven rows onto the wrong statistics.
stargazer(df, type = "text",
          title = "Moneyball Summary Stats",
          omit = "INDEX",
          covariate.labels = c(
            "Number of wins",          # TARGET_WINS
            "Base Hits by batters",    # TEAM_BATTING_H
            "Doubles by batters",      # TEAM_BATTING_2B
            "Triples by batters",      # TEAM_BATTING_3B
            "Homeruns by batters",     # TEAM_BATTING_HR
            "Walks by batters",        # TEAM_BATTING_BB
            "Strikeouts by batters",   # TEAM_BATTING_SO
            "Stolen bases",            # TEAM_BASERUN_SB
            "Caught stealing",         # TEAM_BASERUN_CS
            "Batters hit by pitch",    # TEAM_BATTING_HBP
            "Hits allowed",            # TEAM_PITCHING_H
            "Homeruns allowed",        # TEAM_PITCHING_HR
            "Walks allowed",           # TEAM_PITCHING_BB
            "Strikeouts by pitchers",  # TEAM_PITCHING_SO
            "Errors Negative",         # TEAM_FIELDING_E
            "Double Plays"             # TEAM_FIELDING_DP
          ))
## 
## Moneyball Summary Stats
## =================================================================
## Statistic                N     Mean    St. Dev.   Min     Max    
## -----------------------------------------------------------------
## Number of wins         2,276  80.791    15.752     0      146    
## Base Hits by batters   2,276 1,469.270  144.591   891    2,554   
## Doubles by batters     2,276  241.247   46.801    69      458    
## Triples by batters     2,276  55.250    27.939     0      223    
## Homeruns by batters    2,276  99.612    60.547     0      264    
## Walks by batters       2,276  501.559   122.671    0      878    
## Strikeouts by batters  2,276  736.250   242.909    0     1,399   
## Stolen bases           2,276  123.394   85.406     0      697    
## Caught stealing        2,276  51.514    18.746     0      201    
## Batters hit by pitch   2,276  58.114     3.766    29       95    
## Hits allowed           2,276 1,779.210 1,406.843 1,137   30,132  
## Homeruns allowed       2,276  105.699   61.299     0      343    
## Walks allowed          2,276  553.008   166.357    0     3,645   
## Strikeouts by pitchers 2,276  817.541   540.545  0.000 19,278.000
## Errors Negative        2,276  246.481   227.771   65     1,898   
## Double Plays           2,276  146.716   24.538    52      228    
## -----------------------------------------------------------------

Data Visualization

Homerun Distribution

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
# Histogram of team home runs.
# The interactive `?ggplot` help lookup was removed: it starts the help server
# every time the script runs and contributes nothing to the analysis.
# `bins = 30` spells out the default that ggplot2 was already falling back to,
# so the plot is unchanged and the `stat_bin()` message no longer appears.
ggplot(data = df,
       mapping = aes(x = TEAM_BATTING_HR)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "darkblue") +
  labs(title = "Distribution of Homeruns by Team",
       x = "Number of Homeruns by Team",
       y = "Frequency")

Distribution of Walks Allowed

# Horizontal boxplot of walks allowed per team.
# Only `x` is mapped, so the vertical axis carries no data; the former
# "Frequency" label was misleading. The y label, tick text and tick marks are
# therefore suppressed.
ggplot(data = df,
       mapping = aes(x = TEAM_PITCHING_BB)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(title = "Distribution of Walks Allowed",
       x = "Number of Walks Allowed by Team",
       y = NULL) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())
# This statistic is right-skewed.

Win Distribution in Relation to Strike-outs

# Keep only rows with fewer than 2000 pitcher strike-outs. The filtered frame
# replaces `df`, so every later step works on the trimmed data.
df <- dplyr::filter(df, TEAM_PITCHING_SO < 2000)

# Scatter of wins against pitcher strike-outs with a fitted linear trend line.
ggplot(df, aes(x = TEAM_PITCHING_SO, y = TARGET_WINS)) +
  geom_point(color = "darkblue", fill = "skyblue") +
  geom_smooth(color = "red", method = "lm") +
  ggtitle("Distribution of Wins according to strike-outs") +
  xlab("Strike-outs by Pitchers") +
  ylab("Number of Wins")
## `geom_smooth()` using formula = 'y ~ x'

Faceted Histograms

# Reshape to long format: one row per (team row, statistic) pair.
# INDEX looks like a row identifier (the summary table omits it as well), so
# it is declared as the id variable instead of being melted as a measure.
# Previously it was stacked with the statistics and got its own histogram
# panel. `df_melted` now has the columns INDEX, variable and value.
df_melted <- reshape2::melt(data = df, id.vars = "INDEX")

# Row count per statistic: each should equal the number of rows left in `df`.
table(df_melted$variable)
## 
##      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B  TEAM_BATTING_3B 
##             2267             2267             2267             2267 
##  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO  TEAM_BASERUN_SB 
##             2267             2267             2267             2267 
##  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H TEAM_PITCHING_HR 
##             2267             2267             2267             2267 
## TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E TEAM_FIELDING_DP 
##             2267             2267             2267             2267

# One histogram per statistic. `scales = "free_x"` gives each panel its own
# x range; `bins = 30` makes the previously implicit default explicit.
ggplot(data = df_melted,
       mapping = aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable,
             scales = "free_x")

Key Insights