Import Data

# Load the Moneyball training set.
# NOTE: the former `rm(list = ls())` was removed. It deletes whatever objects
# the caller already has in the workspace, yet does not detach packages or
# reset options, so it never gave a truly clean session. Restart R instead.
moneyball.training.data <- read.csv(
  "C:/Users/aarav/Downloads/moneyball-training-data.csv",
  header = TRUE
)
# `df` is the working copy that the later steps modify;
# `moneyball.training.data` keeps the values exactly as imported.
df <- moneyball.training.data

Clean Data

# Median imputation: in every column, each NA is replaced by the median of
# that column's non-missing values.
df[] <- lapply(df, function(column) {
  column[is.na(column)] <- median(column, na.rm = TRUE)
  column
})

# Number of missing values in the raw import (this counts the untouched
# `moneyball.training.data`, not the imputed `df`).
sum(is.na(moneyball.training.data))

## [1] 3478

Data Table

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

# Summary-statistics table for every column except INDEX.
#
# Labels are keyed by column name instead of being passed as a bare positional
# vector. The positional version did not follow the data's column order
# (..., TEAM_BATTING_SO, TEAM_BASERUN_SB, TEAM_BASERUN_CS, TEAM_BATTING_HBP,
# TEAM_PITCHING_H, TEAM_PITCHING_HR, TEAM_PITCHING_BB, ...), so seven rows
# were printed under the wrong label.
stat_labels <- c(
  TARGET_WINS      = "Number of wins",
  TEAM_BATTING_H   = "Base Hits by batters",
  TEAM_BATTING_2B  = "Doubles by batters",
  TEAM_BATTING_3B  = "Triples by batters",
  TEAM_BATTING_HR  = "Homeruns by batters",
  TEAM_BATTING_BB  = "Walks by batters",
  TEAM_BATTING_SO  = "Strikeouts by batters",
  TEAM_BASERUN_SB  = "Stolen bases",
  TEAM_BASERUN_CS  = "Caught stealing",
  TEAM_BATTING_HBP = "Batters hit by pitch",
  TEAM_PITCHING_H  = "Hits allowed",
  TEAM_PITCHING_HR = "Homeruns allowed",
  TEAM_PITCHING_BB = "Walks allowed",
  TEAM_PITCHING_SO = "Strikeouts by pitchers",
  TEAM_FIELDING_E  = "Errors Negative",
  TEAM_FIELDING_DP = "Double Plays"
)

# Columns that remain after `omit = "INDEX"` (same regular expression), in
# the order they appear in `df`.
shown_cols <- grep("INDEX", colnames(df), value = TRUE, invert = TRUE)

# Look each label up by name; a column without an entry keeps its raw name
# rather than shifting every later label by one row.
row_labels <- unname(stat_labels[shown_cols])
row_labels[is.na(row_labels)] <- shown_cols[is.na(row_labels)]

stargazer(df, type = "text",
          title = "Moneyball Summary Stats",
          omit = "INDEX",
          covariate.labels = row_labels)

## 
## Moneyball Summary Stats
## =================================================================
## Statistic                N     Mean    St. Dev.   Min     Max    
## -----------------------------------------------------------------
## Number of wins         2,276  80.791    15.752     0      146    
## Base Hits by batters   2,276 1,469.270  144.591   891    2,554   
## Doubles by batters     2,276  241.247   46.801    69      458    
## Triples by batters     2,276  55.250    27.939     0      223    
## Homeruns by batters    2,276  99.612    60.547     0      264    
## Walks by batters       2,276  501.559   122.671    0      878    
## Strikeouts by batters  2,276  736.250   242.909    0     1,399   
## Stolen bases           2,276  123.394   85.406     0      697    
## Caught stealing        2,276  51.514    18.746     0      201    
## Batters hit by pitch   2,276  58.114     3.766    29       95    
## Hits allowed           2,276 1,779.210 1,406.843 1,137   30,132  
## Homeruns allowed       2,276  105.699   61.299     0      343    
## Walks allowed          2,276  553.008   166.357    0     3,645   
## Strikeouts by pitchers 2,276  817.541   540.545  0.000 19,278.000
## Errors Negative        2,276  246.481   227.771   65     1,898   
## Double Plays           2,276  146.716   24.538    52      228    
## -----------------------------------------------------------------

Data Visualization

Homerun Distribution

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# reshape2 provides melt(), which builds the long-format data for the faceted
# histograms. The interactive `?ggplot` help lookup that followed was removed:
# it started the help server on every run and contributes nothing to the
# analysis.
library(reshape2)

# Histogram of team home-run totals, using stat_bin()'s default binning.
ggplot(df, aes(x = TEAM_BATTING_HR)) +
  geom_histogram(color = "darkblue", fill = "skyblue") +
  ggtitle("Distribution of Homeruns by Team") +
  xlab("Number of Homeruns by Team") +
  ylab("Frequency")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of Walks Given

# Horizontal boxplot of walks allowed per team. This statistic is
# right-skewed.
#
# Only `x` is mapped, so the y axis is just a placeholder position for the
# single box, not a count. It was previously titled "Frequency", which is
# misleading; the title is dropped and the meaningless ticks are hidden.
ggplot(data = df,
       mapping = aes(x = TEAM_PITCHING_BB)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(title = "Distribution of Walks Allowed",
       x = "Number of Walks Allowed by Team",
       y = NULL) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

Win Distribution in Relation to Strike-outs

# Keep only rows with fewer than 2000 strikeouts by pitchers. `df` itself is
# overwritten, so every later step works on the filtered rows.
df <- dplyr::filter(df, TEAM_PITCHING_SO < 2000)

# Scatter plot of wins against pitcher strikeouts, with a red linear-model
# fit drawn on top.
ggplot(df, aes(x = TEAM_PITCHING_SO, y = TARGET_WINS)) +
  geom_point(color = "darkblue", fill = "skyblue") +
  geom_smooth(color = "red", method = "lm") +
  ggtitle("Distribution of Wins according to strike-outs") +
  xlab("Strike-outs by Pitchers") +
  ylab("Number of Wins")

## `geom_smooth()` using formula = 'y ~ x'

Faceted Tables

# Reshape to long format: one row per (variable, value) pair. No id columns
# are given, so every column is treated as a measure variable.
df_melted <- reshape2::melt(df)

## No id variables; using all as measure variables

# Number of melted rows contributed by each original column.
table(df_melted[["variable"]])

## 
##            INDEX      TARGET_WINS   TEAM_BATTING_H  TEAM_BATTING_2B 
##             2267             2267             2267             2267 
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB  TEAM_BATTING_SO 
##             2267             2267             2267             2267 
##  TEAM_BASERUN_SB  TEAM_BASERUN_CS TEAM_BATTING_HBP  TEAM_PITCHING_H 
##             2267             2267             2267             2267 
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##             2267             2267             2267             2267 
## TEAM_FIELDING_DP 
##             2267

# One histogram per variable, each facet with its own x scale.
# INDEX is excluded so the plot matches the summary table, which also omits
# it (presumably a row identifier rather than a team statistic).
# facet_wrap() drops the now-empty INDEX panel by default.
ggplot(data = dplyr::filter(df_melted, variable != "INDEX"),
       mapping = aes(value)) +
  geom_histogram() +
  facet_wrap(~ variable,
             scales = "free_x")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Key Insights

There is a fairly normal distribution of home-runs made by teams, centered at about 100
The number of walks allowed is skewed to the right, with a median of around 500 and outliers that exceed 3,000
There is only a weak correlation between strikeouts by pitchers and the number of wins per team
No individual team has ever exceeded 10,000 hits
Most teams have been caught stealing about 50 times

Weekend Project

Aarav Agrawal

2024-07-27

Import Data

Clean Data

Data Table

Data Visualization

Homerun Distribution

Distribution of Walks Given

Win Distribution in Relation to Strike-outs

Faceted Tables

Key Insights