Data Table
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Display labels keyed by column name, listed in the data frame's column
# order. stargazer applies covariate.labels purely by position, and the
# previous unnamed vector was not in column order ("Batters hit by pitch" and
# the three "... allowed" labels were misplaced), so seven rows of the table
# carried the wrong label. Keying by name keeps label and statistic together.
summary_labels <- c(
  TARGET_WINS = "Number of wins",
  TEAM_BATTING_H = "Base Hits by batters",
  TEAM_BATTING_2B = "Doubles by batters",
  TEAM_BATTING_3B = "Triples by batters",
  TEAM_BATTING_HR = "Homeruns by batters",
  TEAM_BATTING_BB = "Walks by batters",
  TEAM_BATTING_SO = "Strikeouts by batters",
  TEAM_BASERUN_SB = "Stolen bases",
  TEAM_BASERUN_CS = "Caught stealing",
  TEAM_BATTING_HBP = "Batters hit by pitch",
  TEAM_PITCHING_H = "Hits allowed",
  TEAM_PITCHING_HR = "Homeruns allowed",
  TEAM_PITCHING_BB = "Walks allowed",
  TEAM_PITCHING_SO = "Strikeouts by pitchers",
  TEAM_FIELDING_E = "Errors Negative",
  TEAM_FIELDING_DP = "Double Plays"
)

# Selecting exactly the labelled columns also leaves INDEX out of the table,
# so a separate `omit = "INDEX"` is no longer needed.
stargazer(
  df[names(summary_labels)],
  type = "text",
  title = "Moneyball Summary Stats",
  covariate.labels = unname(summary_labels)
)
##
## Moneyball Summary Stats
## =================================================================
## Statistic N Mean St. Dev. Min Max
## -----------------------------------------------------------------
## Number of wins 2,276 80.791 15.752 0 146
## Base Hits by batters 2,276 1,469.270 144.591 891 2,554
## Doubles by batters 2,276 241.247 46.801 69 458
## Triples by batters 2,276 55.250 27.939 0 223
## Homeruns by batters 2,276 99.612 60.547 0 264
## Walks by batters 2,276 501.559 122.671 0 878
## Strikeouts by batters 2,276 736.250 242.909 0 1,399
## Stolen bases 2,276 123.394 85.406 0 697
## Caught stealing 2,276 51.514 18.746 0 201
## Batters hit by pitch 2,276 58.114 3.766 29 95
## Hits allowed 2,276 1,779.210 1,406.843 1,137 30,132
## Homeruns allowed 2,276 105.699 61.299 0 343
## Walks allowed 2,276 553.008 166.357 0 3,645
## Strikeouts by pitchers 2,276 817.541 540.545 0.000 19,278.000
## Errors Negative 2,276 246.481 227.771 65 1,898
## Double Plays 2,276 146.716 24.538 52 228
## -----------------------------------------------------------------
Data Visualization
Homerun Distribution
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
# The interactive help lookup `?ggplot` was removed from this chunk: it starts
# the help server on every render and adds nothing to the report. Run it from
# the console when the documentation is needed.
# Histogram of home runs hit by batters (TEAM_BATTING_HR), one count per row
# of df.
ggplot(data = df, mapping = aes(x = TEAM_BATTING_HR)) +
  # bins = 30 is the value stat_bin() was already falling back to; stating it
  # keeps the plot unchanged and silences the "Pick better value" message.
  geom_histogram(bins = 30, fill = "skyblue", color = "darkblue") +
  labs(
    title = "Distribution of Homeruns by Team",
    x = "Number of Homeruns by Team",
    y = "Frequency"
  )

Distribution of Walks Allowed
# Horizontal boxplot of walks allowed (TEAM_PITCHING_BB).
# This statistic is right-skewed.
ggplot(data = df, mapping = aes(x = TEAM_PITCHING_BB)) +
  geom_boxplot(fill = "skyblue", color = "darkblue") +
  labs(
    title = "Distribution of Walks Allowed",
    x = "Number of Walks Allowed by Team",
    # A boxplot has no frequency axis, so the former "Frequency" label was
    # misleading; the y position of a single box carries no information.
    y = NULL
  ) +
  # Hide the y tick marks and tick text for the same reason.
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

Win Distribution in Relation to Strike-outs
# Keep only the rows with fewer than 2000 pitching strikeouts. This overwrites
# `df`, so every later step in the script works on the filtered rows.
df <- dplyr::filter(df, TEAM_PITCHING_SO < 2000)

# Scatter plot of wins against pitcher strike-outs with a fitted linear trend.
ggplot(df, aes(TEAM_PITCHING_SO, TARGET_WINS)) +
  geom_point(fill = "skyblue", color = "darkblue") +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("Distribution of Wins according to strike-outs") +
  xlab("Strike-outs by Pitchers") +
  ylab("Number of Wins")
## `geom_smooth()` using formula = 'y ~ x'

Faceted Histograms
# Reshape df to long format (one row per INDEX/statistic pair) so that each
# statistic can be drawn in its own facet.
# INDEX is declared as the id column instead of being melted with the
# statistics: it appears to be a row identifier (the summary table omits it
# as well), so a histogram of it would be meaningless. Naming the id column
# also avoids melt()'s "No id variables" message.
df_melted <- reshape2::melt(data = df, id.vars = "INDEX")

# Number of values available for each statistic.
table(df_melted$variable)
##
## TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 2267 2267 2267 2267
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 2267 2267 2267 2267
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 2267 2267 2267 2267
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 2267 2267 2267 2267
# One histogram per statistic. Each panel gets its own x-axis range
# (scales = "free_x") because the statistics are on very different scales.
ggplot(data = df_melted, mapping = aes(x = value)) +
  # bins = 30 is the value stat_bin() was already falling back to; stating it
  # keeps the plot unchanged and silences the "Pick better value" message.
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scales = "free_x")
