Analyzing baseball Data with R

First let’s install and upload some of the libraries we’ll be using

library(ggplot2)
library(devtools)

## Warning: package 'devtools' was built under R version 4.3.2

## Loading required package: usethis

## Warning: package 'usethis' was built under R version 4.3.2

install.packages("ggrepel")

## Installing package into 'C:/Users/james/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)

## package 'ggrepel' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'ggrepel'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\james\AppData\Local\R\win-library\4.3\00LOCK\ggrepel\libs\x64\ggrepel.dll
## to C:\Users\james\AppData\Local\R\win-library\4.3\ggrepel\libs\x64\ggrepel.dll:
## Permission denied

## Warning: restored 'ggrepel'

## 
## The downloaded binary packages are in
##  C:\Users\james\AppData\Local\Temp\RtmpY7gJDH\downloaded_packages

library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.3.2

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

install.packages("Lahman")

## Installing package into 'C:/Users/james/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)

## package 'Lahman' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\james\AppData\Local\Temp\RtmpY7gJDH\downloaded_packages

Let’s get into evaluating the Lahman data now

github_pull("beanumber/baseball_R/tree/master/data")

## [1] "beanumber/baseball_R/tree/master/data"
## attr(,"class")
## [1] "github_pull"

Batting <- Lahman::Batting
halloffame <- Lahman::HallOfFame

setwd("C:\\Users\\james\\R_Working_Directory\\Analyzing_Baseball_Data_With_R")

Batting <- Batting %>%
  group_by(playerID) %>%
  mutate(From = min(yearID)) %>%
  mutate(To = max(yearID))

batting_summary <- Batting %>%
  group_by(playerID) %>%
  summarise(From = mean(From), To = mean(To))

hof_lahman <- halloffame %>%
  left_join(select(batting_summary, playerID, From, To), by = "playerID")

hof_lahman <- hof_lahman %>%
  filter(inducted == "Y" & category == "Player" & !is.na(From))


hof <- read.csv("hofbatting.csv")

First, we define a player’s mid-career as the average of the first and last seasons of baseball, we then use mutate() and cut() functions to create a new factor variable Era

hof <- hof %>%
  mutate(MidCareer = (From + To) / 2,
         Era = cut(MidCareer,
                   breaks = c(1800, 1900, 1919, 1941, 1960, 1976, 1993, 2050),
                   labels = c("19th Century", "Dead Ball", "Lively Ball", "Integration", "Expansion", "Free Agency", "Long Ball")))

hof_eras <- summarise(group_by(hof, Era), N=n())
hof_eras

ggplot(hof, aes(x=Era)) + 
  geom_bar() +
  xlab("Baseball Era") +
  ylab("Frequency") +
  ggtitle("Era of the Nonpitching Hall of Famers")

ggsave("chapter3_BarGraph.png")

## Saving 7 x 5 in image

ggplot(hof_eras, aes(Era, N)) +
  geom_point() +
  xlab("Baseball Era") +
  ylab("Frequency") +
  ggtitle("Era of the Nonpitching Hall of Famers") +
  coord_flip()

Saving a pdf of the graphs

pdf("Chapter3_Multiple_Graphs.pdf")
ggplot(hof, aes(Era)) + geom_bar() 
ggplot(hof_eras, aes(Era, N)) + geom_point() + coord_flip()
dev.off()

## png 
##   2

3.4 Numeric Variable: One-Dimensional Scatterplot and Histogram

ggplot(hof, aes(x=OPS, y=1)) + 
  geom_jitter(height = 0.6) + ylim(-1,3) +
  theme(axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  coord_fixed(ratio = 0.03)

lmOPS_hof = lm(OPS ~ SLG, data= hof)
summary(lmOPS_hof)

## 
## Call:
## lm(formula = OPS ~ SLG, data = hof)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.132071 -0.018723  0.000047  0.017066  0.090489 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.22288    0.01492   14.94   <2e-16 ***
## SLG          1.32785    0.03206   41.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02722 on 154 degrees of freedom
## Multiple R-squared:  0.9176, Adjusted R-squared:  0.9171 
## F-statistic:  1715 on 1 and 154 DF,  p-value: < 2.2e-16

#let's try a qplot
library(ggrepel)
p <- qplot(HR, OPS, data = hof, color = Era, size = AB)

## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
  data = subset(hof, OPS > quantile(OPS, 0.75) + 1.5 * IQR(OPS) | OPS < quantile(OPS, 0.25) - 1.5 * IQR(OPS) | HR > quantile(HR, 0.75) + 1.5 * IQR(HR) | HR < quantile(HR, 0.25) - 1.5 * IQR(HR) ),
  aes(x = HR, y = OPS, label = Player),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red",
)

print(r)

p <- qplot(MidCareer, OBP, data = hof, color = Era, size = HR)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
  data = subset(hof, OBP > quantile(OBP, 0.75) + 1.5 * IQR(OBP) | OBP < quantile(OBP, 0.25) - 1.5 * IQR(OBP)),
  aes(x = MidCareer, y = OBP, label = Player),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red",
)

print(r)

p <- qplot(MidCareer, SLG, data = hof, color = Era, size = HR)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
  data = subset(hof, SLG > quantile(SLG, 0.75) + 1.5 * IQR(SLG) | SLG < quantile(SLG, 0.25) - 1.5 * IQR(SLG)),
  aes(x = MidCareer, y = SLG, label = Player),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red",
)

print(r)

p <- qplot(MidCareer, OPS, data = hof, color = Era, size = HR)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
  data = subset(hof, OPS > quantile(OPS, 0.75) + 1.5 * IQR(OPS) | OPS < quantile(OPS, 0.25) - 1.5 * IQR(OPS)),
  aes(x = MidCareer, y = OPS, label = Player),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red",
)

# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$OPS, 0.75) + 1.5 * IQR(hof$OPS), color = "red", linetype = "dotted")
# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$OPS, 0.25) - 1.5 * IQR(hof$OPS), color = "blue", linetype = "dotted")



print(r)

p <- qplot(MidCareer, HR, data = hof, color = Era, size = OBP)
# Label speed outliers using geom_text_repel
r <- p + geom_text_repel(
  data = subset(hof, HR > quantile(HR, 0.75) + 1.5 * IQR(HR) | HR< quantile(HR, 0.25) - 1.5 * IQR(HR)),
  aes(x = MidCareer, y = HR, label = Player),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red",
)

# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$HR, 0.75) + 1.5 * IQR(hof$HR), color = "red", linetype = "dotted")
# Add a horizontal line at the y-value corresponding to the OPS outlier threshold
r <- r + geom_hline(yintercept = quantile(hof$HR, 0.25) - 1.5 * IQR(hof$HR), color = "blue", linetype = "dotted")

print(r)

Now let’s make a histogram

ggplot(hof, aes(x=OPS)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Let’s specify bin size

ggplot(hof, aes(x=OPS)) +
  geom_histogram(breaks = seq(0.4, 1.2, by = 0.1),
                 color = "blue", fill = "white")

Chapter 3.5 Two Numeric Values, Scatter Plot with Smoothing

ggplot(hof, aes(MidCareer, OPS)) + 
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

(p <- ggplot(hof, aes(OBP,SLG)) + geom_point())

lm_fit <- lm(SLG ~OBP, data = hof)

(p <- p + 
    xlim(0.25, 0.50) + ylim(0.28, 0.75) +
    xlab("On Base Percentage") + 
    ylab("Slugging Percentage"))

## Warning: Removed 1 rows containing missing values (`geom_point()`).

#Alternatively we could change the limits and the labels by appealing directly to the scale_x_continuous() and scale_y_continuous() functions

p <- p +
  scale_x_continuous("On Base Percentage", 
                     limits = c(0.25, 0.50)) +
  scale_y_continuous("Slugging  Percentage", 
                     limits = c(0.28, 0.75))

## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.

## Warning: Removed 1 rows containing missing values (`geom_point()`).

q = p + geom_abline(intercept = coef(lm_fit)[1], slope = coef(lm_fit)[2], color="red")
q

## Warning: Removed 1 rows containing missing values (`geom_point()`).

Suppose we wanted to draw the function y=0.7-x on the graph:

(p <- p + geom_abline(slope = -1,
                      intercept = seq(0.7, 1, by=0.1),
                      color="black",
                      linetype = "dotted"))

## Warning: Removed 1 rows containing missing values (`geom_point()`).

#In our final iteration, we want to add labels to the lines showing the constant values of OPS
#First Let's create a dataframe which contains the coordiates
ops_labels <- tibble(
  OBP = rep(0.3, 4),
  SLG = seq(0.4, 0.7, by = 0.1),
  label = paste("OPS = ", OBP + SLG)
)

p + geom_text(data = ops_labels, hjust = "right", 
              aes(label = label))

## Warning: Removed 1 rows containing missing values (`geom_point()`).

Ch 3.6.1 Parallel Strip Charts

#create a new column that's HR rate (home run per at bat) as opposed to total numbers
hof <- mutate(hof, hr_rate = HR/AB)

ggplot(hof, aes(hr_rate, Era)) + 
  geom_jitter(height=0.1)

r <- ggplot(hof, aes(Era, hr_rate)) + 
  geom_boxplot(outlier.colour = "red") + 
  coord_flip()

outliers <- subset(hof, hr_rate > quantile(hr_rate, 0.75) + 1.5 * IQR(hr_rate) | hr_rate < quantile(hr_rate, 0.25) - 1.5 * IQR(hr_rate))

r <- r + geom_text_repel(
  data = outliers,
  aes(x = hr_rate, y = Era, label = Player),  # Swap x and y aesthetics
  color = "#000000", 
  size = 4,
  nudge_x = -0.2, nudge_y = 0.4,
  segment.color = "red",
)

print(r)

3.7 Comparing Ruth Aaron, Bons, and A-Rod Now we’ll start working with the Lahman data

#First we need to adjust birthyear, MLB defines age season by the age of te player on June 30 of the season
#First let's create a function which will get us the correct age of each player
library(Lahman)

## Warning: package 'Lahman' was built under R version 4.3.2

## 
## Attaching package: 'Lahman'

## The following object is masked _by_ '.GlobalEnv':
## 
##     Batting

Master <- Lahman::People #book said to use master.csv file, it looks like Lahman changed this to player.csv
Batting <- Lahman::Batting

get_birthyear <- function(Name) {
  Names <- unlist(strsplit(Name, " "))  #first we'll need to split the Name string into first and last name
  Master %>%
    filter(nameFirst == Names[1],
           nameLast == Names[2]) %>%
    mutate(birthyear = ifelse(birthMonth >= 7,
                              birthYear + 1, birthYear),
  Player = paste(nameFirst, nameLast)) %>%
  select(playerID, Player, birthyear)
}

PlayerInfo <- bind_rows(get_birthyear("Babe Ruth"),
                        get_birthyear("Hank Aaron"),
                        get_birthyear("Barry Bonds"),
                        get_birthyear("Alex Rodriguez"))

Batting %>%
  inner_join(PlayerInfo, by="playerID") %>%
  mutate(Age = yearID - birthyear) %>%
  select(Player, Age, HR) %>%
  group_by(Player)%>%
  mutate(CHR = cumsum(HR)) -> HRdata


#After the data cleaning let's construct a graph
lineplot <- ggplot(HRdata, aes(x=Age, y = CHR, linetype = Player)) +
  geom_line(aes(colour = Player))

print(lineplot)

Now it’s time to plot the Sammy Sosa vs Mark mcGuire Home Run Race

fields <- read.csv("baseball_R/data/fields.csv")
data1998 <- read.csv("baseball_R/data/all1998.csv",
                     col.names = fields$Header)

#First thing we need to do after pulling in the data is to retrieve Mac and Sammy's ids
sosa_id <- Master %>%
  filter(nameFirst == "Sammy", nameLast == "Sosa") %>%
  pull(retroID)

mac_id <- Master %>%
  filter(nameFirst == "Mark", nameLast == "McGwire") %>%
  pull(retroID)


#now that we have their id's let's go to the play by play data
hr_race <- data1998 %>%
  filter(BAT_ID %in% c(sosa_id, mac_id))

3.8.2 Extracting the variables

library(lubridate)
cum_hr <- function(d) {
  d%>%
    mutate(Date = ymd(str_sub(GAME_ID, 4, 11))) %>%
    arrange(Date) %>%
    mutate(HR = ifelse(EVENT_CD ==23,1,0),
                       cumHR = cumsum(HR)) %>%
    select(Date, cumHR)
}

#Next, we will use map_df() function to itrate our new function cum_hr() twice, once for SOSAand once for McGwire's batting data, and obtaining a new data frame...hr_ytd
hr_ytd <- hr_race %>%
  split(pull(., BAT_ID)) %>%
  map_df(cum_hr, .id = "BAT_ID") %>%
  inner_join(Master, by = c("BAT_ID" = "retroID"))

#contructing the graph
ggplot(hr_ytd, aes(Date, cumHR, linetype=nameLast)) +
  geom_line() +
  geom_hline(yintercept=62, color="blue") +
  annotate("text", ymd("1998-04-15"), 65, label = "62", color="blue") +
  ylab("Home Runs in the 1998 Season")

Next let’s try the chapter 3 exercises 1. Hall of Fame Pitchers

hofpitching <- read.csv("baseball_R/data/hofpitching.csv")

#BF variable is the number of batters faced in a pitcher's career.  Let's group the pitchers using the intervals designated in the book

hofpitching <- hofpitching %>% 
  mutate(BF.group = cut(BF,
                        c(0, 10000, 15000, 20000, 30000),
                        labels = c("Less than 10000", "(10000, 14999)", ("15000, 19999"), "20000 or More")))

1.a Construct a Frequency Table

#Let's do pitching era like we did with batting:
hofpitching <- hofpitching %>%
  mutate(MidCareer = (From + To) / 2,
         Era = cut(MidCareer,
                   breaks = c(1800, 1900, 1919, 1941, 1960, 1976, 1993, 2050),
                   labels = c("19th Century", "Dead Ball", "Lively Ball", "Integration", "Expansion", "Free Agency", "Long Ball")))

hofpitching_bfgroupfreqtable <- hofpitching %>%
  group_by(BF.group) %>%
  summarise(
    Frequency = n(),
    WAR = sum(WAR),
    SO = sum(SO),
    BB = sum(BB),
    HBP = sum(HBP),
    CG = sum(CG)
  )

hofpitching_yearfreqtable <- hofpitching %>%
  group_by(Era) %>%
  summarise(
    Frequency = n(),
    WAR = sum(WAR),
    SO = sum(SO),
    BB = sum(BB),
    HBP = sum(HBP),
    CG = sum(CG)
  )

Construct the histogram

ggplot(hofpitching, aes(x=BF)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(hofpitching, aes(x=HBP)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

ggplot(hofpitching, aes(x=SO)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(hofpitching, aes(x=BB)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(hofpitching, aes(x=WAR)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(hofpitching, aes(x=CG)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(hofpitching, aes(x = BF, fill = BF > 20000)) +
  geom_histogram(binwidth = 1000) +
  scale_fill_manual(values = c("FALSE" = "gray", "TRUE" = "red")) +
  ylab("Count of Pitchers") +
  xlab("Batters Faced (BF)") +
  ggtitle("Histogram of Batters Faced (BF)") +
  theme_minimal()

ggplot(hofpitching, aes(x = BF)) +
  geom_density(aes(fill = BF > 20000, color = BF > 20000), alpha = 0.5) +
  scale_fill_manual(values = c("FALSE" = "gray", "TRUE" = "blue")) +
  scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "blue")) +
  ylab("Density") +
  xlab("Batters Faced (BF)") +
  ggtitle("Density Curve with Two Colors for Batters Faced (BF)") +
  theme_minimal()

Now let’s do a bar graph by BF.group

ggplot(hofpitching, aes(BF.group)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count), vjust=-0.5), color="blue") +
  ylab("Count of Pitchers") + 
  xlab("Batters Faced Group") + 
  ggtitle("Number of Pitchers in Each Batters Faced Group")

ggsave("HOFPitchingBarChart.png")

## Saving 7 x 5 in image

ggplot(hofpitching_bfgroupfreqtable, aes(BF.group, Frequency)) +
  geom_point() +
  ylab("Count of Pitchers") + 
  xlab("Batters Faced Group") + 
  ggtitle("Number of Pitchers in Each Batters Faced Group")

ggsave("HOFPitchingdotplot.png")

## Saving 7 x 5 in image

ggplot(hofpitching, aes(Era)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = after_stat(count), vjust=-0.5), color="blue") +
  ylab("Count of Pitchers") + 
  xlab("Era") + 
  ggtitle("Number of Pitchers in Each Batters Faced Group")

  ggsave("HOFPitchingbyeya.png")

## Saving 7 x 5 in image

ggplot(hofpitching, aes(WAR)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

linear_model <- lm(WAR ~ G, data=hofpitching)

ggplot(hofpitching, aes(G, WAR)) +
  geom_point() +
  geom_text_repel(
    data = subset(hofpitching, WAR > quantile(WAR, 0.75) + 1.5 * IQR(WAR) | WAR < quantile(WAR, 0.25) - 1.5 * IQR(WAR)),
  aes(x = G, y = WAR, label = X),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red") +
  geom_abline(intercept=coef(linear_model)[1], slope = coef(linear_model)[2], color = "blue")

linear_model2 <- lm(WAR ~ SO, data=hofpitching)

ggplot(hofpitching, aes(SO, WAR)) +
  geom_point() +
  geom_text_repel(
    data = subset(hofpitching, WAR > quantile(WAR, 0.75) + 1.5 * IQR(WAR) | WAR < quantile(WAR, 0.25) - 1.5 * IQR(WAR)),
  aes(x = SO, y = WAR, label = X),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red") +
  geom_abline(intercept=coef(linear_model2)[1], slope = coef(linear_model2)[2], color = "blue")

Create a variable which shows WAR per season

hofpitching <- hofpitching %>%
  mutate(WAR.season = WAR/Yrs)

Let’s graph

linear_model2 <- lm(WAR.season ~ WAR, data=hofpitching)

ggplot(hofpitching, aes(WAR, WAR.season)) +
  geom_point() +
  geom_text_repel(
    data = subset(hofpitching, WAR.season > quantile(WAR.season, 0.75) + 1.5 * IQR(WAR.season) | WAR.season < quantile(WAR.season, 0.25) - 1.5 * IQR(WAR.season)),
  aes(x = WAR, y = WAR.season, label = X),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red") +
  geom_abline(intercept=coef(linear_model2)[1], slope = coef(linear_model2)[2], color = "blue")

ggplot(hofpitching, aes(WAR.season, y=1)) + 
  geom_jitter(height = 0.6) + ylim(-1,3) +
  theme(axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank()) +
  coord_fixed(ratio = 0.03) +
  geom_text_repel(
    data = subset(hofpitching, WAR.season > 7 | WAR.season < 2.5),
  aes(x = WAR.season, y = 1, label = X),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red") +
  geom_vline(xintercept=7, color="red") +
  geom_vline(xintercept=2.5, color = "blue") +
  coord_cartesian(xlim = c(0,10))

## Coordinate system already present. Adding new coordinate system, which will
## replace the existing one.

## Warning: ggrepel: 18 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggplot(hofpitching, aes(WAR.season, BF.group)) + 
  geom_jitter(height = 0.1) +
  geom_vline(xintercept=7, color="red") +
  geom_vline(xintercept=2.5, color = "blue") +
  coord_cartesian(xlim = c(0,10))

ggplot(hofpitching, aes(MidCareer, WAR)) + 
  geom_point() +
  geom_smooth() + 
  ggtitle("WAR by Individual HOF Player MidCareer Season") +
  geom_text_repel(
    data = subset(hofpitching, WAR.season < 1.75 & MidCareer <1900) , 
    aes(x=MidCareer, y=WAR, label=X)
  )

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

hofpitching <- hofpitching %>%
  arrange(MidCareer)

hofpitching.recent <- hofpitching
ggplot(hofpitching, aes(MidCareer, WAR.season)) +
  geom_point() +
  geom_text_repel(
    data = subset(hofpitching, WAR.season > 4.2 & MidCareer >1959),
  aes(x = MidCareer, y = WAR.season, label = X),
  color = "#000000", 
  size = 4,
  nudge_x = 0.2, nudge_y = 0.4,
  segment.color = "red")

Exercise 3.6 Working with the Lahman Batting Dataset

Read in the Lahman Master and Batting Data Frames (this was already done above)
Collect in a single data frame the season batting statistics for the great hitters ty Cobb, Ted Williams, and Pete Rose

#Batting %>%
#  inner_join(PlayerInfo, by="playerID") %>%
#  mutate(Age = yearID - birthyear) %>%
#  select(Player, Age, HR) %>%
#  group_by(Player)%>%
#  mutate(CHR = cumsum(HR)) -> HRdata

get_player_id <- function(player_name) {
  Names <- unlist(strsplit(player_name, " "))
  Master %>%
    filter(nameFirst == Names[1],
           nameLast == Names[2]) %>%
    mutate(birthyear = ifelse(birthMonth >= 7, 
                              birthYear+1, birthYear),
           Player =  paste(nameFirst, nameLast)) %>%
    select(playerID, Player, birthyear, height, weight, bats, throws, birthCountry, birthState, birthCity)
}

TedWilliams_PeteRose_TyCobb_Info <- bind_rows(get_player_id("Ted Williams"),
                                              get_player_id("Pete Rose"),
                                              get_player_id("Ty Cobb"))

TedWilliams_PeteRose_TyCobb_Info <- TedWilliams_PeteRose_TyCobb_Info %>%
  filter(playerID != "rosepe02")

Batting_TC_PR_TW <-  Batting %>%
  inner_join(TedWilliams_PeteRose_TyCobb_Info, by="playerID") %>%
  mutate(age = yearID - birthyear) %>%
  group_by(Player) %>%
  arrange(Player, yearID) %>%
  mutate(cumH = cumsum(H)) -> Hdata
  
Pete_Rose_Lineplot <- ggplot(Hdata, aes(x=age, y=cumH, linetype=Player)) +
  geom_line(aes(colour=Player)) +
  theme_minimal()

print(Pete_Rose_Lineplot)

As you can see from the above graph, Ted Williams’ starting accumulating hits after Ty Cobb (around the age 20 season). However, there were large periods of stagnation resulting in him finishing with significantly less hits than Pete Rose and Ty Cobb. The slope of the line however, appears to be similar in his first two periods before stagnation. The eriods of stagnation are seasons Ted did not play, because he was in the war. It may be reasonable to project a similar slope to his line as Pete and Ty, however, we do see a change in the slope of the line in his 30s and beyond. It’s tough to project what would have happened had he not been absent from the game.

Pete Rose and Ty Cobb had similar slopes, meaning they acucmulated hits at a similar rate. However, Ty Cobb achieved 4000 hits before his age 40 season, whereas Pete Rose accumlated 4000 after his age 40 season. However, since the slopes are very similar, we notice that this is largely due to Pete beginning to accumulate hits later in his career (after age 20) than Ty Cobb (stated. before age 20).

Next, let’s start exercise 3.7 working with Play by Play data

The goal of this exercise is to recreate the homerun race, by comparing space between homeruns

#first let's create a Mark M data frame
mac.data <- data1998 %>%
  filter(BAT_ID == mac_id)

#next, a Sammy S data frame
sosa.data <- data1998 %>%
  filter(BAT_ID == sosa_id)

Now we need to restrict the two data frames to plays where a batting event occurred.

mac.data <- filter(mac.data, BAT_EVENT_FL == TRUE)
sosa.data <- filter(sosa.data, BAT_EVENT_FL == TRUE)

Next, we’ll need to create a new variable that numbers the plate appearances. This should simply equal the number of rows, so we’ll just use the nrow() function

mac.data <- mutate(mac.data, PA = 1:nrow(mac.data))
sosa.data <- mutate(sosa.data, PA = 1:nrow(sosa.data))

#Now let's  create a command that returns the number of plate appearances when the player hits a home run
mac.HR.PA <- mac.data %>%
  filter(EVENT_CD == 23) %>%
  pull(PA)

sosa.HR.PA <- sosa.data %>%
  filter(EVENT_CD == 23) %>%
  pull(PA)

#Now let's compute the number of spacings between occurrences of home runs
mac.spacings <- diff(c(0,mac.HR.PA))
sosa.spacings <- diff(c(0, sosa.HR.PA))

#Let's create a new df with Player Name and Spacing Value
homerunSpacingData <- data.frame(
  Player = c("Mark McGuire", "Sammy Sosa"),
  Spacing = c(mac.spacings, sosa.spacings)
)

print(homerunSpacingData)

##           Player Spacing
## 1   Mark McGuire       2
## 2     Sammy Sosa      25
## 3   Mark McGuire       8
## 4     Sammy Sosa       8
## 5   Mark McGuire       5
## 6     Sammy Sosa      19
## 7   Mark McGuire       1
## 8     Sammy Sosa      22
## 9   Mark McGuire       2
## 10    Sammy Sosa      29
## 11  Mark McGuire      11
## 12    Sammy Sosa       1
## 13  Mark McGuire       4
## 14    Sammy Sosa       1
## 15  Mark McGuire       7
## 16    Sammy Sosa       6
## 17  Mark McGuire      22
## 18    Sammy Sosa       8
## 19  Mark McGuire      12
## 20    Sammy Sosa       7
## 21  Mark McGuire      13
## 22    Sammy Sosa       5
## 23  Mark McGuire       2
## 24    Sammy Sosa      17
## 25  Mark McGuire       7
## 26    Sammy Sosa       1
## 27  Mark McGuire       2
## 28    Sammy Sosa      13
## 29  Mark McGuire       8
## 30    Sammy Sosa       5
## 31  Mark McGuire       1
## 32    Sammy Sosa      11
## 33  Mark McGuire      35
## 34    Sammy Sosa       8
## 35  Mark McGuire       3
## 36    Sammy Sosa       6
## 37  Mark McGuire       8
## 38    Sammy Sosa       1
## 39  Mark McGuire       2
## 40    Sammy Sosa       7
## 41  Mark McGuire      26
## 42    Sammy Sosa       8
## 43  Mark McGuire       9
## 44    Sammy Sosa       9
## 45  Mark McGuire       5
## 46    Sammy Sosa       5
## 47  Mark McGuire       1
## 48    Sammy Sosa       8
## 49  Mark McGuire       3
## 50    Sammy Sosa      12
## 51  Mark McGuire      40
## 52    Sammy Sosa      22
## 53  Mark McGuire       1
## 54    Sammy Sosa       3
## 55  Mark McGuire      17
## 56    Sammy Sosa       3
## 57  Mark McGuire      15
## 58    Sammy Sosa      15
## 59  Mark McGuire      15
## 60    Sammy Sosa      33
## 61  Mark McGuire      15
## 62    Sammy Sosa       5
## 63  Mark McGuire       8
## 64    Sammy Sosa       5
## 65  Mark McGuire       7
## 66    Sammy Sosa      25
## 67  Mark McGuire       4
## 68    Sammy Sosa       2
## 69  Mark McGuire       3
## 70    Sammy Sosa       2
## 71  Mark McGuire      12
## 72    Sammy Sosa       1
## 73  Mark McGuire       3
## 74    Sammy Sosa      12
## 75  Mark McGuire      15
## 76    Sammy Sosa       2
## 77  Mark McGuire      16
## 78    Sammy Sosa      46
## 79  Mark McGuire      16
## 80    Sammy Sosa      48
## 81  Mark McGuire       1
## 82    Sammy Sosa       6
## 83  Mark McGuire       4
## 84    Sammy Sosa       7
## 85  Mark McGuire       5
## 86    Sammy Sosa       7
## 87  Mark McGuire       4
## 88    Sammy Sosa       2
## 89  Mark McGuire       1
## 90    Sammy Sosa       2
## 91  Mark McGuire       5
## 92    Sammy Sosa       8
## 93  Mark McGuire       2
## 94    Sammy Sosa       5
## 95  Mark McGuire       2
## 96    Sammy Sosa       3
## 97  Mark McGuire      15
## 98    Sammy Sosa      22
## 99  Mark McGuire      18
## 100   Sammy Sosa       2
## 101 Mark McGuire      22
## 102   Sammy Sosa      11
## 103 Mark McGuire      10
## 104   Sammy Sosa      10
## 105 Mark McGuire       1
## 106   Sammy Sosa       6
## 107 Mark McGuire       9
## 108   Sammy Sosa      12
## 109 Mark McGuire       6
## 110   Sammy Sosa       4
## 111 Mark McGuire       2
## 112   Sammy Sosa      22
## 113 Mark McGuire      15
## 114   Sammy Sosa      20
## 115 Mark McGuire       9
## 116   Sammy Sosa       4
## 117 Mark McGuire       7
## 118   Sammy Sosa      20
## 119 Mark McGuire      23
## 120   Sammy Sosa       5
## 121 Mark McGuire      25
## 122   Sammy Sosa      13
## 123 Mark McGuire       4
## 124   Sammy Sosa      20
## 125 Mark McGuire       1
## 126   Sammy Sosa       3
## 127 Mark McGuire      20
## 128   Sammy Sosa      13
## 129 Mark McGuire      12
## 130   Sammy Sosa      20
## 131 Mark McGuire       7
## 132   Sammy Sosa       8
## 133 Mark McGuire      21
## 134   Sammy Sosa       8
## 135 Mark McGuire       1
## 136   Sammy Sosa      19

#contructing the graph
ggplot(homerunSpacingData, aes(Spacing, color=Player, fill = Player)) +
  geom_histogram(alpha = 0.5, bins= 10, position = "identity")

ggplot(homerunSpacingData, aes(Spacing)) +
  geom_histogram() + facet_wrap(~ Player, ncol = 1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

homerunSpacingData %>% group_by(Player) %>% 
  summarize(M = median(Spacing),
            Mean = mean(Spacing),
            Max = max(Spacing),
            Min = min(Spacing))