First let’s install and upload some of the libraries we’ll be using
library(ggplot2)
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.2
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.2
install.packages("ggrepel")
## Installing package into 'C:/Users/james/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'ggrepel' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'ggrepel'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\james\AppData\Local\R\win-library\4.3\00LOCK\ggrepel\libs\x64\ggrepel.dll
## to C:\Users\james\AppData\Local\R\win-library\4.3\ggrepel\libs\x64\ggrepel.dll:
## Permission denied
## Warning: restored 'ggrepel'
##
## The downloaded binary packages are in
## C:\Users\james\AppData\Local\Temp\RtmpY7gJDH\downloaded_packages
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("Lahman")
## Installing package into 'C:/Users/james/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'Lahman' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\james\AppData\Local\Temp\RtmpY7gJDH\downloaded_packages
Let’s get into evaluating the Lahman data now
github_pull("beanumber/baseball_R/tree/master/data")
## [1] "beanumber/baseball_R/tree/master/data"
## attr(,"class")
## [1] "github_pull"
Batting <- Lahman::Batting
halloffame <- Lahman::HallOfFame
setwd("C:\\Users\\james\\R_Working_Directory\\Analyzing_Baseball_Data_With_R")
Batting <- Batting %>%
group_by(playerID) %>%
mutate(From = min(yearID)) %>%
mutate(To = max(yearID))
batting_summary <- Batting %>%
group_by(playerID) %>%
summarise(From = mean(From), To = mean(To))
hof_lahman <- halloffame %>%
left_join(select(batting_summary, playerID, From, To), by = "playerID")
hof_lahman <- hof_lahman %>%
filter(inducted == "Y" & category == "Player" & !is.na(From))
hof <- read.csv("hofbatting.csv")
First, we define a player’s mid-career as the average of the first and last seasons of baseball, we then use mutate() and cut() functions to create a new factor variable Era
hof <- hof %>%
mutate(MidCareer = (From + To) / 2,
Era = cut(MidCareer,
breaks = c(1800, 1900, 1919, 1941, 1960, 1976, 1993, 2050),
labels = c("19th Century", "Dead Ball", "Lively Ball", "Integration", "Expansion", "Free Agency", "Long Ball")))
hof_eras <- summarise(group_by(hof, Era), N=n())
hof_eras
ggplot(hof, aes(x=Era)) +
geom_bar() +
xlab("Baseball Era") +
ylab("Frequency") +
ggtitle("Era of the Nonpitching Hall of Famers")
ggsave("chapter3_BarGraph.png")
## Saving 7 x 5 in image
ggplot(hof_eras, aes(Era, N)) +
geom_point() +
xlab("Baseball Era") +
ylab("Frequency") +
ggtitle("Era of the Nonpitching Hall of Famers") +
coord_flip()
Saving a pdf of the graphs
pdf("Chapter3_Multiple_Graphs.pdf")
ggplot(hof, aes(Era)) + geom_bar()
ggplot(hof_eras, aes(Era, N)) + geom_point() + coord_flip()
dev.off()
## png
## 2
3.4 Numeric Variable: One-Dimensional Scatterplot and Histogram
ggplot(hof, aes(x=OPS, y=1)) +
geom_jitter(height = 0.6) + ylim(-1,3) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
coord_fixed(ratio = 0.03)
lmOPS_hof = lm(OPS ~ SLG, data= hof)
summary(lmOPS_hof)
##
## Call:
## lm(formula = OPS ~ SLG, data = hof)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.132071 -0.018723 0.000047 0.017066 0.090489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.22288 0.01492 14.94 <2e-16 ***
## SLG 1.32785 0.03206 41.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02722 on 154 degrees of freedom
## Multiple R-squared: 0.9176, Adjusted R-squared: 0.9171
## F-statistic: 1715 on 1 and 154 DF, p-value: < 2.2e-16
#let's try a qplot
library(ggrepel)
p <- qplot(HR, OPS, data = hof, color = Era, size = AB)
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
data = subset(hof, OPS > quantile(OPS, 0.75) + 1.5 * IQR(OPS) | OPS < quantile(OPS, 0.25) - 1.5 * IQR(OPS) | HR > quantile(HR, 0.75) + 1.5 * IQR(HR) | HR < quantile(HR, 0.25) - 1.5 * IQR(HR) ),
aes(x = HR, y = OPS, label = Player),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red",
)
print(r)
p <- qplot(MidCareer, OBP, data = hof, color = Era, size = HR)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
data = subset(hof, OBP > quantile(OBP, 0.75) + 1.5 * IQR(OBP) | OBP < quantile(OBP, 0.25) - 1.5 * IQR(OBP)),
aes(x = MidCareer, y = OBP, label = Player),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red",
)
print(r)
p <- qplot(MidCareer, SLG, data = hof, color = Era, size = HR)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
data = subset(hof, SLG > quantile(SLG, 0.75) + 1.5 * IQR(SLG) | SLG < quantile(SLG, 0.25) - 1.5 * IQR(SLG)),
aes(x = MidCareer, y = SLG, label = Player),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red",
)
print(r)
p <- qplot(MidCareer, OPS, data = hof, color = Era, size = HR)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
data = subset(hof, OPS > quantile(OPS, 0.75) + 1.5 * IQR(OPS) | OPS < quantile(OPS, 0.25) - 1.5 * IQR(OPS)),
aes(x = MidCareer, y = OPS, label = Player),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red",
)
# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$OPS, 0.75) + 1.5 * IQR(hof$OPS), color = "red", linetype = "dotted")
# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$OPS, 0.25) - 1.5 * IQR(hof$OPS), color = "blue", linetype = "dotted")
print(r)
p <- qplot(MidCareer, HR, data = hof, color = Era, size = OBP)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
data = subset(hof, HR > quantile(HR, 0.75) + 1.5 * IQR(HR) | HR< quantile(HR, 0.25) - 1.5 * IQR(HR)),
aes(x = MidCareer, y = HR, label = Player),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red",
)
# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$HR, 0.75) + 1.5 * IQR(hof$HR), color = "red", linetype = "dotted")
# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$HR, 0.25) - 1.5 * IQR(hof$HR), color = "blue", linetype = "dotted")
print(r)
Now let’s make a histogram
ggplot(hof, aes(x=OPS)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Let’s specify bin size
ggplot(hof, aes(x=OPS)) +
geom_histogram(breaks = seq(0.4, 1.2, by = 0.1),
color = "blue", fill = "white")
Chapter 3.5 Two Numeric Values, Scatter Plot with Smoothing
ggplot(hof, aes(MidCareer, OPS)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
(p <- ggplot(hof, aes(OBP,SLG)) + geom_point())
lm_fit <- lm(SLG ~OBP, data = hof)
(p <- p +
xlim(0.25, 0.50) + ylim(0.28, 0.75) +
xlab("On Base Percentage") +
ylab("Slugging Percentage"))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
#Alternatively we could change the limits and the labels by appealing directly to the scale_x_continuous() and scale_y_continuous() functions
p <- p +
scale_x_continuous("On Base Percentage",
limits = c(0.25, 0.50)) +
scale_y_continuous("Slugging Percentage",
limits = c(0.28, 0.75))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
p
## Warning: Removed 1 rows containing missing values (`geom_point()`).
q = p + geom_abline(intercept = coef(lm_fit)[1], slope = coef(lm_fit)[2], color="red")
q
## Warning: Removed 1 rows containing missing values (`geom_point()`).
Suppose we wanted to draw the function y=0.7-x on the graph:
(p <- p + geom_abline(slope = -1,
intercept = seq(0.7, 1, by=0.1),
color="black",
linetype = "dotted"))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
#In our final iteration, we want to add labels to the lines showing the constant values of OPS
#First Let's create a dataframe which contains the coordiates
ops_labels <- tibble(
OBP = rep(0.3, 4),
SLG = seq(0.4, 0.7, by = 0.1),
label = paste("OPS = ", OBP + SLG)
)
p + geom_text(data = ops_labels, hjust = "right",
aes(label = label))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
Ch 3.6.1 Parallel Strip Charts
#create a new column that's HR rate (home run per at bat) as opposed to total numbers
hof <- mutate(hof, hr_rate = HR/AB)
ggplot(hof, aes(hr_rate, Era)) +
geom_jitter(height=0.1)
r <- ggplot(hof, aes(Era, hr_rate)) +
geom_boxplot(outlier.colour = "red") +
coord_flip()
outliers <- subset(hof, hr_rate > quantile(hr_rate, 0.75) + 1.5 * IQR(hr_rate) | hr_rate < quantile(hr_rate, 0.25) - 1.5 * IQR(hr_rate))
r <- r + geom_text_repel(
data = outliers,
aes(x = hr_rate, y = Era, label = Player), # Swap x and y aesthetics
color = "#000000",
size = 4,
nudge_x = -0.2, nudge_y = 0.4,
segment.color = "red",
)
print(r)
3.7 Comparing Ruth Aaron, Bons, and A-Rod Now we’ll start working with
the Lahman data
#First we need to adjust birthyear, MLB defines age season by the age of te player on June 30 of the season
#First let's create a function which will get us the correct age of each player
library(Lahman)
## Warning: package 'Lahman' was built under R version 4.3.2
##
## Attaching package: 'Lahman'
## The following object is masked _by_ '.GlobalEnv':
##
## Batting
Master <- Lahman::People #book said to use master.csv file, it looks like Lahman changed this to player.csv
Batting <- Lahman::Batting
get_birthyear <- function(Name) {
Names <- unlist(strsplit(Name, " ")) #first we'll need to split the Name string into first and last name
Master %>%
filter(nameFirst == Names[1],
nameLast == Names[2]) %>%
mutate(birthyear = ifelse(birthMonth >= 7,
birthYear + 1, birthYear),
Player = paste(nameFirst, nameLast)) %>%
select(playerID, Player, birthyear)
}
PlayerInfo <- bind_rows(get_birthyear("Babe Ruth"),
get_birthyear("Hank Aaron"),
get_birthyear("Barry Bonds"),
get_birthyear("Alex Rodriguez"))
Batting %>%
inner_join(PlayerInfo, by="playerID") %>%
mutate(Age = yearID - birthyear) %>%
select(Player, Age, HR) %>%
group_by(Player)%>%
mutate(CHR = cumsum(HR)) -> HRdata
#After the data cleaning let's construct a graph
lineplot <- ggplot(HRdata, aes(x=Age, y = CHR, linetype = Player)) +
geom_line(aes(colour = Player))
print(lineplot)
Now it’s time to plot the Sammy Sosa vs Mark mcGuire Home Run Race
fields <- read.csv("baseball_R/data/fields.csv")
data1998 <- read.csv("baseball_R/data/all1998.csv",
col.names = fields$Header)
#First thing we need to do after pulling in the data is to retrieve Mac and Sammy's ids
sosa_id <- Master %>%
filter(nameFirst == "Sammy", nameLast == "Sosa") %>%
pull(retroID)
mac_id <- Master %>%
filter(nameFirst == "Mark", nameLast == "McGwire") %>%
pull(retroID)
#now that we have their id's let's go to the play by play data
hr_race <- data1998 %>%
filter(BAT_ID %in% c(sosa_id, mac_id))
3.8.2 Extracting the variables
library(lubridate)
cum_hr <- function(d) {
d%>%
mutate(Date = ymd(str_sub(GAME_ID, 4, 11))) %>%
arrange(Date) %>%
mutate(HR = ifelse(EVENT_CD ==23,1,0),
cumHR = cumsum(HR)) %>%
select(Date, cumHR)
}
#Next, we will use map_df() function to itrate our new function cum_hr() twice, once for SOSAand once for McGwire's batting data, and obtaining a new data frame...hr_ytd
hr_ytd <- hr_race %>%
split(pull(., BAT_ID)) %>%
map_df(cum_hr, .id = "BAT_ID") %>%
inner_join(Master, by = c("BAT_ID" = "retroID"))
#contructing the graph
ggplot(hr_ytd, aes(Date, cumHR, linetype=nameLast)) +
geom_line() +
geom_hline(yintercept=62, color="blue") +
annotate("text", ymd("1998-04-15"), 65, label = "62", color="blue") +
ylab("Home Runs in the 1998 Season")
Next let’s try the chapter 3 exercises 1. Hall of Fame Pitchers
hofpitching <- read.csv("baseball_R/data/hofpitching.csv")
#BF variable is the number of batters faced in a pitcher's career. Let's group the pitchers using the intervals designated in the book
hofpitching <- hofpitching %>%
mutate(BF.group = cut(BF,
c(0, 10000, 15000, 20000, 30000),
labels = c("Less than 10000", "(10000, 14999)", ("15000, 19999"), "20000 or More")))
1.a Construct a Frequency Table
#Let's do pitching era like we did with batting:
hofpitching <- hofpitching %>%
mutate(MidCareer = (From + To) / 2,
Era = cut(MidCareer,
breaks = c(1800, 1900, 1919, 1941, 1960, 1976, 1993, 2050),
labels = c("19th Century", "Dead Ball", "Lively Ball", "Integration", "Expansion", "Free Agency", "Long Ball")))
hofpitching_bfgroupfreqtable <- hofpitching %>%
group_by(BF.group) %>%
summarise(
Frequency = n(),
WAR = sum(WAR),
SO = sum(SO),
BB = sum(BB),
HBP = sum(HBP),
CG = sum(CG)
)
hofpitching_yearfreqtable <- hofpitching %>%
group_by(Era) %>%
summarise(
Frequency = n(),
WAR = sum(WAR),
SO = sum(SO),
BB = sum(BB),
HBP = sum(HBP),
CG = sum(CG)
)
Construct the histogram
ggplot(hofpitching, aes(x=BF)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hofpitching, aes(x=HBP)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).
ggplot(hofpitching, aes(x=SO)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hofpitching, aes(x=BB)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hofpitching, aes(x=WAR)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hofpitching, aes(x=CG)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hofpitching, aes(x = BF, fill = BF > 20000)) +
geom_histogram(binwidth = 1000) +
scale_fill_manual(values = c("FALSE" = "gray", "TRUE" = "red")) +
ylab("Count of Pitchers") +
xlab("Batters Faced (BF)") +
ggtitle("Histogram of Batters Faced (BF)") +
theme_minimal()
ggplot(hofpitching, aes(x = BF)) +
geom_density(aes(fill = BF > 20000, color = BF > 20000), alpha = 0.5) +
scale_fill_manual(values = c("FALSE" = "gray", "TRUE" = "blue")) +
scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "blue")) +
ylab("Density") +
xlab("Batters Faced (BF)") +
ggtitle("Density Curve with Two Colors for Batters Faced (BF)") +
theme_minimal()
Now let’s do a bar graph by BF.group
ggplot(hofpitching, aes(BF.group)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count), vjust=-0.5), color="blue") +
ylab("Count of Pitchers") +
xlab("Batters Faced Group") +
ggtitle("Number of Pitchers in Each Batters Faced Group")
ggsave("HOFPitchingBarChart.png")
## Saving 7 x 5 in image
ggplot(hofpitching_bfgroupfreqtable, aes(BF.group, Frequency)) +
geom_point() +
ylab("Count of Pitchers") +
xlab("Batters Faced Group") +
ggtitle("Number of Pitchers in Each Batters Faced Group")
ggsave("HOFPitchingdotplot.png")
## Saving 7 x 5 in image
ggplot(hofpitching, aes(Era)) +
geom_bar() +
geom_text(stat = "count", aes(label = after_stat(count), vjust=-0.5), color="blue") +
ylab("Count of Pitchers") +
xlab("Era") +
ggtitle("Number of Pitchers in Each Batters Faced Group")
ggsave("HOFPitchingbyeya.png")
## Saving 7 x 5 in image
ggplot(hofpitching, aes(WAR)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
linear_model <- lm(WAR ~ G, data=hofpitching)
ggplot(hofpitching, aes(G, WAR)) +
geom_point() +
geom_text_repel(
data = subset(hofpitching, WAR > quantile(WAR, 0.75) + 1.5 * IQR(WAR) | WAR < quantile(WAR, 0.25) - 1.5 * IQR(WAR)),
aes(x = G, y = WAR, label = X),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red") +
geom_abline(intercept=coef(linear_model)[1], slope = coef(linear_model)[2], color = "blue")
linear_model2 <- lm(WAR ~ SO, data=hofpitching)
ggplot(hofpitching, aes(SO, WAR)) +
geom_point() +
geom_text_repel(
data = subset(hofpitching, WAR > quantile(WAR, 0.75) + 1.5 * IQR(WAR) | WAR < quantile(WAR, 0.25) - 1.5 * IQR(WAR)),
aes(x = SO, y = WAR, label = X),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red") +
geom_abline(intercept=coef(linear_model2)[1], slope = coef(linear_model2)[2], color = "blue")
Create a variable which shows WAR per season
hofpitching <- hofpitching %>%
mutate(WAR.season = WAR/Yrs)
Let’s graph
linear_model2 <- lm(WAR.season ~ WAR, data=hofpitching)
ggplot(hofpitching, aes(WAR, WAR.season)) +
geom_point() +
geom_text_repel(
data = subset(hofpitching, WAR.season > quantile(WAR.season, 0.75) + 1.5 * IQR(WAR.season) | WAR.season < quantile(WAR.season, 0.25) - 1.5 * IQR(WAR.season)),
aes(x = WAR, y = WAR.season, label = X),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red") +
geom_abline(intercept=coef(linear_model2)[1], slope = coef(linear_model2)[2], color = "blue")
ggplot(hofpitching, aes(WAR.season, y=1)) +
geom_jitter(height = 0.6) + ylim(-1,3) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
coord_fixed(ratio = 0.03) +
geom_text_repel(
data = subset(hofpitching, WAR.season > 7 | WAR.season < 2.5),
aes(x = WAR.season, y = 1, label = X),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red") +
geom_vline(xintercept=7, color="red") +
geom_vline(xintercept=2.5, color = "blue") +
coord_cartesian(xlim = c(0,10))
## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.
## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggplot(hofpitching, aes(WAR.season, BF.group)) +
geom_jitter(height = 0.1) +
geom_vline(xintercept=7, color="red") +
geom_vline(xintercept=2.5, color = "blue") +
coord_cartesian(xlim = c(0,10))
ggplot(hofpitching, aes(MidCareer, WAR)) +
geom_point() +
geom_smooth() +
ggtitle("WAR by Individual HOF Player MidCareer Season") +
geom_text_repel(
data = subset(hofpitching, WAR.season < 1.75 & MidCareer <1900) ,
aes(x=MidCareer, y=WAR, label=X)
)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
hofpitching <- hofpitching %>%
arrange(MidCareer)
hofpitching.recent <- hofpitching
ggplot(hofpitching, aes(MidCareer, WAR.season)) +
geom_point() +
geom_text_repel(
data = subset(hofpitching, WAR.season > 4.2 & MidCareer >1959),
aes(x = MidCareer, y = WAR.season, label = X),
color = "#000000",
size = 4,
nudge_x = 0.2, nudge_y = 0.4,
segment.color = "red")
Exercise 3.6 Working with the Lahman Batting Dataset
#Batting %>%
# inner_join(PlayerInfo, by="playerID") %>%
# mutate(Age = yearID - birthyear) %>%
# select(Player, Age, HR) %>%
# group_by(Player)%>%
# mutate(CHR = cumsum(HR)) -> HRdata
get_player_id <- function(player_name) {
Names <- unlist(strsplit(player_name, " "))
Master %>%
filter(nameFirst == Names[1],
nameLast == Names[2]) %>%
mutate(birthyear = ifelse(birthMonth >= 7,
birthYear+1, birthYear),
Player = paste(nameFirst, nameLast)) %>%
select(playerID, Player, birthyear, height, weight, bats, throws, birthCountry, birthState, birthCity)
}
TedWilliams_PeteRose_TyCobb_Info <- bind_rows(get_player_id("Ted Williams"),
get_player_id("Pete Rose"),
get_player_id("Ty Cobb"))
TedWilliams_PeteRose_TyCobb_Info <- TedWilliams_PeteRose_TyCobb_Info %>%
filter(playerID != "rosepe02")
Batting_TC_PR_TW <- Batting %>%
inner_join(TedWilliams_PeteRose_TyCobb_Info, by="playerID") %>%
mutate(age = yearID - birthyear) %>%
group_by(Player) %>%
arrange(Player, yearID) %>%
mutate(cumH = cumsum(H)) -> Hdata
Pete_Rose_Lineplot <- ggplot(Hdata, aes(x=age, y=cumH, linetype=Player)) +
geom_line(aes(colour=Player)) +
theme_minimal()
print(Pete_Rose_Lineplot)
As you can see from the above graph, Ted Williams’ starting accumulating
hits after Ty Cobb (around the age 20 season). However, there were large
periods of stagnation resulting in him finishing with significantly less
hits than Pete Rose and Ty Cobb. The slope of the line however, appears
to be similar in his first two periods before stagnation. The eriods of
stagnation are seasons Ted did not play, because he was in the war. It
may be reasonable to project a similar slope to his line as Pete and Ty,
however, we do see a change in the slope of the line in his 30s and
beyond. It’s tough to project what would have happened had he not been
absent from the game.
Next, let’s start exercise 3.7 working with Play by Play data
The goal of this exercise is to recreate the homerun race, by comparing space between homeruns
#first let's create a Mark M data frame
mac.data <- data1998 %>%
filter(BAT_ID == mac_id)
#next, a Sammy S data frame
sosa.data <- data1998 %>%
filter(BAT_ID == sosa_id)
Now we need to restrict the two data frames to plays where a batting event occurred.
mac.data <- filter(mac.data, BAT_EVENT_FL == TRUE)
sosa.data <- filter(sosa.data, BAT_EVENT_FL == TRUE)
Next, we’ll need to create a new variable that numbers the plate appearances. This should simply equal the number of rows, so we’ll just use the nrow() function
mac.data <- mutate(mac.data, PA = 1:nrow(mac.data))
sosa.data <- mutate(sosa.data, PA = 1:nrow(sosa.data))
#Now let's create a command that returns the number of plate appearances when the player hits a home run
mac.HR.PA <- mac.data %>%
filter(EVENT_CD == 23) %>%
pull(PA)
sosa.HR.PA <- sosa.data %>%
filter(EVENT_CD == 23) %>%
pull(PA)
#Now let's compute the number of spacings between occurrences of home runs
mac.spacings <- diff(c(0,mac.HR.PA))
sosa.spacings <- diff(c(0, sosa.HR.PA))
#Let's create a new df with Player Name and Spacing Value
homerunSpacingData <- data.frame(
Player = c("Mark McGuire", "Sammy Sosa"),
Spacing = c(mac.spacings, sosa.spacings)
)
print(homerunSpacingData)
## Player Spacing
## 1 Mark McGuire 2
## 2 Sammy Sosa 25
## 3 Mark McGuire 8
## 4 Sammy Sosa 8
## 5 Mark McGuire 5
## 6 Sammy Sosa 19
## 7 Mark McGuire 1
## 8 Sammy Sosa 22
## 9 Mark McGuire 2
## 10 Sammy Sosa 29
## 11 Mark McGuire 11
## 12 Sammy Sosa 1
## 13 Mark McGuire 4
## 14 Sammy Sosa 1
## 15 Mark McGuire 7
## 16 Sammy Sosa 6
## 17 Mark McGuire 22
## 18 Sammy Sosa 8
## 19 Mark McGuire 12
## 20 Sammy Sosa 7
## 21 Mark McGuire 13
## 22 Sammy Sosa 5
## 23 Mark McGuire 2
## 24 Sammy Sosa 17
## 25 Mark McGuire 7
## 26 Sammy Sosa 1
## 27 Mark McGuire 2
## 28 Sammy Sosa 13
## 29 Mark McGuire 8
## 30 Sammy Sosa 5
## 31 Mark McGuire 1
## 32 Sammy Sosa 11
## 33 Mark McGuire 35
## 34 Sammy Sosa 8
## 35 Mark McGuire 3
## 36 Sammy Sosa 6
## 37 Mark McGuire 8
## 38 Sammy Sosa 1
## 39 Mark McGuire 2
## 40 Sammy Sosa 7
## 41 Mark McGuire 26
## 42 Sammy Sosa 8
## 43 Mark McGuire 9
## 44 Sammy Sosa 9
## 45 Mark McGuire 5
## 46 Sammy Sosa 5
## 47 Mark McGuire 1
## 48 Sammy Sosa 8
## 49 Mark McGuire 3
## 50 Sammy Sosa 12
## 51 Mark McGuire 40
## 52 Sammy Sosa 22
## 53 Mark McGuire 1
## 54 Sammy Sosa 3
## 55 Mark McGuire 17
## 56 Sammy Sosa 3
## 57 Mark McGuire 15
## 58 Sammy Sosa 15
## 59 Mark McGuire 15
## 60 Sammy Sosa 33
## 61 Mark McGuire 15
## 62 Sammy Sosa 5
## 63 Mark McGuire 8
## 64 Sammy Sosa 5
## 65 Mark McGuire 7
## 66 Sammy Sosa 25
## 67 Mark McGuire 4
## 68 Sammy Sosa 2
## 69 Mark McGuire 3
## 70 Sammy Sosa 2
## 71 Mark McGuire 12
## 72 Sammy Sosa 1
## 73 Mark McGuire 3
## 74 Sammy Sosa 12
## 75 Mark McGuire 15
## 76 Sammy Sosa 2
## 77 Mark McGuire 16
## 78 Sammy Sosa 46
## 79 Mark McGuire 16
## 80 Sammy Sosa 48
## 81 Mark McGuire 1
## 82 Sammy Sosa 6
## 83 Mark McGuire 4
## 84 Sammy Sosa 7
## 85 Mark McGuire 5
## 86 Sammy Sosa 7
## 87 Mark McGuire 4
## 88 Sammy Sosa 2
## 89 Mark McGuire 1
## 90 Sammy Sosa 2
## 91 Mark McGuire 5
## 92 Sammy Sosa 8
## 93 Mark McGuire 2
## 94 Sammy Sosa 5
## 95 Mark McGuire 2
## 96 Sammy Sosa 3
## 97 Mark McGuire 15
## 98 Sammy Sosa 22
## 99 Mark McGuire 18
## 100 Sammy Sosa 2
## 101 Mark McGuire 22
## 102 Sammy Sosa 11
## 103 Mark McGuire 10
## 104 Sammy Sosa 10
## 105 Mark McGuire 1
## 106 Sammy Sosa 6
## 107 Mark McGuire 9
## 108 Sammy Sosa 12
## 109 Mark McGuire 6
## 110 Sammy Sosa 4
## 111 Mark McGuire 2
## 112 Sammy Sosa 22
## 113 Mark McGuire 15
## 114 Sammy Sosa 20
## 115 Mark McGuire 9
## 116 Sammy Sosa 4
## 117 Mark McGuire 7
## 118 Sammy Sosa 20
## 119 Mark McGuire 23
## 120 Sammy Sosa 5
## 121 Mark McGuire 25
## 122 Sammy Sosa 13
## 123 Mark McGuire 4
## 124 Sammy Sosa 20
## 125 Mark McGuire 1
## 126 Sammy Sosa 3
## 127 Mark McGuire 20
## 128 Sammy Sosa 13
## 129 Mark McGuire 12
## 130 Sammy Sosa 20
## 131 Mark McGuire 7
## 132 Sammy Sosa 8
## 133 Mark McGuire 21
## 134 Sammy Sosa 8
## 135 Mark McGuire 1
## 136 Sammy Sosa 19
#contructing the graph
ggplot(homerunSpacingData, aes(Spacing, color=Player, fill = Player)) +
geom_histogram(alpha = 0.5, bins= 10, position = "identity")
ggplot(homerunSpacingData, aes(Spacing)) +
geom_histogram() + facet_wrap(~ Player, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
homerunSpacingData %>% group_by(Player) %>%
summarize(M = median(Spacing),
Mean = mean(Spacing),
Max = max(Spacing),
Min = min(Spacing))