Import Libraries

# import libraries
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──

## ✓ broom        0.7.9      ✓ rsample      0.1.0 
## ✓ dials        0.0.10     ✓ tune         0.1.6 
## ✓ infer        1.0.0      ✓ workflows    0.2.3 
## ✓ modeldata    0.1.1      ✓ workflowsets 0.1.0 
## ✓ parsnip      0.1.7      ✓ yardstick    0.0.8 
## ✓ recipes      0.1.16

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

library(dplyr)
library(tidyr)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity

## The following object is masked from 'package:purrr':
## 
##     lift

library(ggplot2)

Load Data Files

# Load data files
battingdf <- read.csv(file = "Batters-1960-2019.csv", header=TRUE)
pitchingdf <- read.csv(file = "Pitchers-1960-2019.csv", header=TRUE)

Data Wrangling

battingdf <- battingdf %>%
  mutate(College = ifelse(College =='#N/A',NA, College)) %>%
  mutate(CollegeOrHS = ifelse(!is.na(College),1,0))

pitchingdf <- pitchingdf %>%
  mutate(College = ifelse(College.or.HS =='#N/A',NA, College.or.HS)) %>%
  mutate(CollegeOrHS = ifelse(!is.na(College),1,0)) %>%
  filter(Year <= 2019)

Drop unneeded columns

battingdf <- select(battingdf, -College.or.HS., -Country)
pitchingdf <- select(pitchingdf, -X, -College.or.HS)

———————————————————

BATTERS

Note: High School Batters means batters who went to HS before MLB College Batters means batters who played in College before MLB

All High School and College Batters

Criteria: More than 400 PA in back to back years Played in consecutive years

Min_PA = 400
  
battingdfByPAs <- battingdf %>%
  mutate(PA_400 = ifelse(PA >= Min_PA, PA, NA)) %>%
  mutate(WAR_400 = ifelse(PA >= Min_PA, WAR, NA)) %>%
  group_by(bbrefID) %>%
  arrange(Year) %>%
  mutate(WAR_change = WAR_400 - lag(WAR_400)) %>%
  mutate(Standardized_WAR = (WAR_400 / PA_400)*(Min_PA)) %>% 
  mutate(Standardized_WAR_change = Standardized_WAR - lag(Standardized_WAR))

#Frequency Table of Ages - Batters Made decision to drop those ages with frequencies below 100

battingdfByPAs %>%
  group_by(Age) %>%
  count(Age)

## # A tibble: 36 × 2
## # Groups:   Age [36]
##      Age     n
##    <int> <int>
##  1    17     6
##  2    18    44
##  3    19   182
##  4    20   521
##  5    21  1308
##  6    22  2544
##  7    23  4033
##  8    24  5471
##  9    25  6283
## 10    26  6341
## # … with 26 more rows

Dropping Ages based on small sample size

Made decision to drop those ages with frequencies below 100

battingdfByPAs <- battingdfByPAs %>%
  filter(Age > 18) %>%
  filter(Age < 42)

# Average WAR changes for all players who meet criteria (HS and College)
Bat_WAR_All_by_age <- battingdfByPAs %>%
  filter(Age < 41) %>%
  filter(Age > 20) %>%
  group_by(Age) %>%
  arrange(Age) %>%
  # take the mean of each of the variables listed
  summarize_at(vars(WAR_change, WAR_400, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)

# Create Chained WAR
Bat_WAR_All_by_age$Chain_WAR <- cumsum(coalesce(Bat_WAR_All_by_age$WAR_change, 0)) + Bat_WAR_All_by_age$WAR_change * 0

# Created Chain Standardized WAR Change (Cumulamative )
Bat_WAR_All_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Bat_WAR_All_by_age$Standardized_WAR_change, 0)) + 
  Bat_WAR_All_by_age$Standardized_WAR_change * 0

# Change order of columns
Bat_WAR_All_by_age <- Bat_WAR_All_by_age[, c("Age", "WAR_400", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows

#Rename Columns
Bat_WAR_All_by_age <- Bat_WAR_All_by_age %>%
  rename(Avg_WAR_400 = WAR_400,
         Avg_WAR_change = WAR_change,
         Chain_WAR_400 = Chain_WAR)

College Batters

# Do same for colleges 
# College or HS = College

Min_PA = 400

batcollegeByPAs <- battingdfByPAs %>% 
  filter(CollegeOrHS == 1)

# Average war changes for all college players who meet PA criteria
Bat_WAR_avg_College_by_age <- batcollegeByPAs %>%
  filter(Age < 41) %>%
  filter(Age > 21) %>%
  group_by(Age) %>%
  arrange(Age) %>%
  # take the mean of each of the variables listed
  summarize_at(vars(WAR_change, WAR_400, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)

# Standard deviation of war changes for all college players who meet PA criteria
WAR_sd_College_by_age <- batcollegeByPAs %>%
  group_by(Age) %>%
  arrange(Age) %>% 
  summarize_at(vars(WAR_change, WAR_400), (sd), na.rm=TRUE)

# Create Chained WAR for college players
Bat_WAR_avg_College_by_age$Chain_WAR <- cumsum(coalesce(Bat_WAR_avg_College_by_age$WAR_change, 0)) + Bat_WAR_avg_College_by_age$WAR_change * 0

# Created Chain Standardized WAR Change
Bat_WAR_avg_College_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Bat_WAR_avg_College_by_age$Standardized_WAR_change, 0)) + 
  Bat_WAR_avg_College_by_age$Standardized_WAR_change * 0

# Change order of columns
Bat_WAR_avg_College_by_age <- Bat_WAR_avg_College_by_age[, c("Age", "WAR_400", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows

#Rename Columns
Bat_WAR_avg_College_by_age <- Bat_WAR_avg_College_by_age %>%
  rename(Avg_WAR_400 = WAR_400,
         Avg_WAR_change = WAR_change,
         Chain_WAR_400 = Chain_WAR)

High School Batters

# Do same for High Schools 
# College or HS = HS

Min_PA = 400

batHSByPAs <- battingdfByPAs %>% 
  filter(CollegeOrHS == 0)

# Average WAR changes and WAR for all high school players who meet PA criteria
Bat_WAR_avg_HS_by_age <- batHSByPAs %>%
  filter(Age < 41) %>%
  filter(Age > 21) %>%
  group_by(Age) %>%
  arrange(Age) %>% 
  # take the mean of each of the variables listed
  summarize_at(vars(WAR_change, WAR_400, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)

# Standard Deviation of WAR changes and WAR for all high school players who meet PA criteria
WAR_sd_HS_by_age <- batHSByPAs %>%
  group_by(Age) %>%
  arrange(Age) %>% 
  summarize_at(vars(WAR_change, WAR_400), sd, na.rm=TRUE)

# Create Chained WAR for college players
Bat_WAR_avg_HS_by_age$Chain_WAR <- cumsum(coalesce(Bat_WAR_avg_HS_by_age$WAR_change, 0)) + 
  Bat_WAR_avg_HS_by_age$WAR_change * 0

# Created Chain Standardized WAR Change
Bat_WAR_avg_HS_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Bat_WAR_avg_HS_by_age$Standardized_WAR_change, 0)) + 
  Bat_WAR_avg_HS_by_age$Standardized_WAR_change * 0

# Change order of columns
Bat_WAR_avg_HS_by_age <- Bat_WAR_avg_HS_by_age[, c("Age", "WAR_400", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows

#Rename Columns
Bat_WAR_avg_HS_by_age <- Bat_WAR_avg_HS_by_age %>%
  rename(Avg_WAR_400 = WAR_400,
         Avg_WAR_change = WAR_change,
         Chain_WAR_400 = Chain_WAR)

Aging Curves - Batters

Professor Luea Suggestion: Maybe conduct aging curves for different cohorts of players - maybe young star players like a Trea Turner get own aging curve. etc

Delta Method - Chained WAR

# Plot aging curves for all players who meet PA criteria

aging_curve <- ggplot(Bat_WAR_All_by_age, aes(x = Age, y = Chain_WAR_400)) + 
  geom_line() + 
  geom_point() +
  labs(title= "All Batters Aging Curve", y="Chained WAR", x = "Age") + xlim(15, 45)
print(aging_curve)

Delta Method - All Batters - Standardized WAR per 400 PA

aging_curve <- ggplot(Bat_WAR_All_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "All Batters Aging Curve", y="WAR Per 400 PA", x = "Age") + xlim(15, 45)
aging_curve

Comparison to see which is better Blue is Delta Method Black is Delta Method per 400 PA

combined_plotALLBatters <- ggplot() +               
  geom_point(data = Bat_WAR_All_by_age, aes(Age, Chain_WAR_400), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data=Bat_WAR_All_by_age, mapping = aes(Age, Chain_WAR_400,), colour="blue") + 
  geom_point(data = Bat_WAR_All_by_age, aes(Age, Chain_Standardized_WAR_change), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data = Bat_WAR_All_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
  labs(title = "All Batters - WAR vs WAR Per 400 PA Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 2))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotALLBatters

Delta Method - College Batters

aging_curve_college <- ggplot(Bat_WAR_avg_College_by_age, aes(x = Age, y = Chain_WAR_400)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "College Batters Aging Curve", y=" Chained WAR", x = "Age") + 
  xlim(15, 45)
aging_curve_college

Delta Method - College Batters - WAR per 400 PA

aging_curve_college2 <- ggplot(Bat_WAR_avg_College_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "College Batters Aging Curve", y="WAR Per 400 PA", x = "Age") + xlim(15, 45)
aging_curve_college2

Comparison to see which is better Blue is Delta Method Black is Delta Method per 400 PA

combined_plotALLCollegeBatters <- ggplot() +               
  geom_point(data = Bat_WAR_avg_College_by_age, aes(Age, Chain_WAR_400), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data= Bat_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_400), colour="blue") + 
  geom_point(data =  Bat_WAR_avg_College_by_age, aes(Age, Chain_Standardized_WAR_change), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data = Bat_WAR_avg_College_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
  labs(title = "College Batters - WAR vs WAR Per 400 PA Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 2))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotALLCollegeBatters

Delta Method - High School Batters

aging_curve_HSBatters <- ggplot(Bat_WAR_avg_HS_by_age, aes(x = Age, y = Chain_WAR_400)) + geom_point() + geom_line() + 
  labs(title= "High School Batters Aging Curve", y=" Chained WAR", x = "Age") + 
  xlim(15, 45)
aging_curve_HSBatters

Standardized WAR per 400 PA - High School Batters

aging_curve_HS2 <- ggplot(Bat_WAR_avg_HS_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "High School Batters Aging Curve", y="WAR Per 400 PA", x = "Age") + xlim(15, 45)
aging_curve_HS2

Combined Plot - High School Batters - Goal is to determine which is better, WAR or WAR Per 400 PA - must make decision regarding this

combined_plotALLHSBatters <- ggplot() +               
  geom_point(data = Bat_WAR_avg_HS_by_age, aes(Age, Chain_WAR_400), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data= Bat_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_400), colour="blue") + 
  geom_point(data =  Bat_WAR_avg_HS_by_age, aes(Age, Chain_Standardized_WAR_change), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data = Bat_WAR_avg_HS_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
  labs(title = "High School Batters - WAR vs WAR Per 400 PA Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 2))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotALLHSBatters

High School Versus College Battters Comparison Plots - Delta Method - not per 400 PA

combined_plotDelta <- ggplot() + 
  
  geom_point(data = Bat_WAR_avg_College_by_age, aes(Age, Chain_WAR_400), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data=Bat_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_400,), colour="blue") + 
  
  geom_point(data = Bat_WAR_avg_HS_by_age, aes(Age, Chain_WAR_400), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data=Bat_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_400), colour="black") +
  
  labs(title = "College vs High School Aging Curves - Chained WAR", x = "Age", y = "Chained WAR") + xlim(19, 41) + theme_light() + scale_x_continuous(breaks = seq(19, 41, 2))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotDelta

Plotted a regression on the aging curve to see how it looks but not sure this tells me anything.

combined_plotregress <- ggplot() + 
  stat_smooth(data=Bat_WAR_avg_College_by_age, mapping = aes(Age, Avg_WAR_400), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="blue", na.rm = TRUE) + stat_smooth(data=Bat_WAR_avg_HS_by_age, mapping = aes(Age, Avg_WAR_400), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="black", na.rm = TRUE) +
  labs(title = "College vs High School Aging Curve Regression - WAR", x = "Age", y = "WAR") + 
  xlim(19, 41) + 
  theme_light() + 
  scale_x_continuous(breaks = seq(19, 41, 2))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotregress

Regressions - Batters

Plot Regression for all batters

View the data first in a scatter plot

ggplot(battingdfByPAs, aes(Age, WAR_400)) + geom_point()

## Warning: Removed 50544 rows containing missing values (geom_point).

Suggestion from Professor Luea: Potentially think of non-linear transformations on the age variable

Function to fit model

fit_model <- function(battingdfByPAs) {
  fit <- lm(WAR_400 ~ I((Age-30)^2) + CollegeOrHS , data = battingdfByPAs)
  b <- coef(fit)
  Age.max <- 30- b[2] / b[3] / 2
  Max <- b[1] - b[2] ^ 2 / b[3] / 4
  list(fit = fit, Age.max = Age.max, Max = Max)
}

F2 <- fit_model(battingdfByPAs)
coef(F2$fit)

##     (Intercept) I((Age - 30)^2)     CollegeOrHS 
##     2.738226258    -0.005562773    -0.156034743

c(F2$Age.max, F2$Max)

## I((Age - 30)^2)     (Intercept) 
##       29.982175        2.738276

ggplot(battingdfByPAs, aes(Age, WAR_400)) + geom_point() +
  geom_smooth(method = "lm", se = FALSE, size = 1.5, 
              formula = y ~ poly(x, 2, raw = TRUE)) +
  geom_vline(xintercept = F2$Age.max, 
             linetype = "dashed", color = "darkgrey") +
  geom_hline(yintercept = F2$Max, 
             linetype = "dashed", color = "darkgrey") + 
  annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1), 
           label = c("Peak Age", "Max WAR"), size = 5) +
  labs(title = "All Players Regression")

## Warning: Removed 50544 rows containing non-finite values (stat_smooth).

## Warning: Removed 50544 rows containing missing values (geom_point).

F2 %>% pluck("fit") %>% summary()

## 
## Call:
## lm(formula = WAR_400 ~ I((Age - 30)^2) + CollegeOrHS, data = battingdfByPAs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5380 -1.4894 -0.1992  1.2362  9.7118 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.738226   0.032852  83.351  < 2e-16 ***
## I((Age - 30)^2) -0.005563   0.001003  -5.546    3e-08 ***
## CollegeOrHS     -0.156035   0.041613  -3.750 0.000178 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.112 on 10909 degrees of freedom
##   (50544 observations deleted due to missingness)
## Multiple R-squared:  0.003673,   Adjusted R-squared:  0.00349 
## F-statistic: 20.11 on 2 and 10909 DF,  p-value: 1.922e-09

Suggestion from Professor Luea on how to achieve better model fit: For example, run model with WAR ~ Age, then WAR ~ Age + Age^2, then WAR ~ Age + Age^2 + CollegeOrHS For each model, write down coefficient estimates and R^2 and if estimates are statistically significant or not Also look at what else is being done in the space by researchers

Plot regression for college batters

View the data first in a scatter plot

ggplot(batcollegeByPAs, aes(Age, WAR_400)) + geom_point()

## Warning: Removed 23573 rows containing missing values (geom_point).

Function to fit model

fit_model2 <- function(batcollegeByPAs) {
  fit <- lm(WAR_400 ~ I(Age-30) + I((Age-30)^2), data = batcollegeByPAs)
  b <- coef(fit)
  Age.max <- 30- b[2] / b[3] / 2
  Max <- b[1] - b[2] ^ 2 / b[3] / 4
  list(fit = fit, Age.max = Age.max, Max = Max)
}

F3 <- fit_model2(batcollegeByPAs)
coef(F3$fit)

##     (Intercept)     I(Age - 30) I((Age - 30)^2) 
##     2.579065367    -0.045916244    -0.008282423

c(F3$Age.max, F2$Max)

## I(Age - 30) (Intercept) 
##   27.228091    2.738276

ggplot(batcollegeByPAs, aes(Age, WAR_400)) + geom_point() +
  geom_smooth(method = "lm", se = FALSE, size = 1.5, 
              formula = y ~ poly(x, 2, raw = TRUE)) +
  geom_vline(xintercept = F3$Age.max, 
             linetype = "dashed", color = "darkgrey") +
  geom_hline(yintercept = F3$Max, 
             linetype = "dashed", color = "darkgrey") + 
  annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1), 
           label = c("Peak Age", "Max WAR"), size = 5) +
  labs(title = "College Players Regression")

## Warning: Removed 23573 rows containing non-finite values (stat_smooth).

## Warning: Removed 23573 rows containing missing values (geom_point).

F3 %>% pluck("fit") %>% summary()

## 
## Call:
## lm(formula = WAR_400 ~ I(Age - 30) + I((Age - 30)^2), data = batcollegeByPAs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7541 -1.4423 -0.2302  1.1751  9.8946 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.579065   0.040961  62.963  < 2e-16 ***
## I(Age - 30)     -0.045916   0.008344  -5.503 3.96e-08 ***
## I((Age - 30)^2) -0.008282   0.001758  -4.711 2.55e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.046 on 4356 degrees of freedom
##   (23573 observations deleted due to missingness)
## Multiple R-squared:  0.01164,    Adjusted R-squared:  0.01118 
## F-statistic: 25.64 on 2 and 4356 DF,  p-value: 8.496e-12

Plot regression for High School batters

View the data first in a scatter plot

ggplot(batHSByPAs, aes(Age, WAR_400)) + geom_point()

## Warning: Removed 26971 rows containing missing values (geom_point).

Function to fit model

fit_model3 <- function(batHSByPAs) {
  fit <- lm(WAR_400 ~ I(Age-30) + I((Age-30)^2), data = batHSByPAs)
  b <- coef(fit)
  Age.max <- 30- b[2] / b[3] / 2
  Max <- b[1] - b[2] ^ 2 / b[3] / 4
  list(fit = fit, Age.max = Age.max, Max = Max)
}

F4 <- fit_model3(batHSByPAs)
coef(F4$fit)

##     (Intercept)     I(Age - 30) I((Age - 30)^2) 
##     2.687670933    -0.051732102    -0.008141429

c(F4$Age.max, F4$Max)

## I(Age - 30) (Intercept) 
##    26.82291     2.76985

ggplot(batHSByPAs, aes(Age, WAR_400)) + geom_point() +
  geom_smooth(method = "lm", se = FALSE, size = 1.5, 
              formula = y ~ poly(x, 2, raw = TRUE)) +
  geom_vline(xintercept = F4$Age.max, 
             linetype = "dashed", color = "darkgrey") +
  geom_hline(yintercept = F4$Max, 
             linetype = "dashed", color = "darkgrey") + 
  annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1), 
           label = c("Peak Age", "Max WAR"), size = 5) +
  labs(title = "College Players Regression")

## Warning: Removed 26971 rows containing non-finite values (stat_smooth).

## Warning: Removed 26971 rows containing missing values (geom_point).

F4 %>% pluck("fit") %>% summary()

## 
## Call:
## lm(formula = WAR_400 ~ I(Age - 30) + I((Age - 30)^2), data = batHSByPAs)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.705 -1.543 -0.205  1.295  9.630 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.687671   0.036142  74.364  < 2e-16 ***
## I(Age - 30)     -0.051732   0.007162  -7.223 5.67e-13 ***
## I((Age - 30)^2) -0.008141   0.001323  -6.156 7.93e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.142 on 6550 degrees of freedom
##   (26971 observations deleted due to missingness)
## Multiple R-squared:  0.009909,   Adjusted R-squared:  0.009606 
## F-statistic: 32.78 on 2 and 6550 DF,  p-value: 6.866e-15

———————————————————

PITCHERS

# Convert chr types to numeric
pitchingdf$IP <- as.numeric(pitchingdf$IP)

## Warning: NAs introduced by coercion

pitchingdf$WAR <- as.numeric(pitchingdf$WAR)

## Warning: NAs introduced by coercion

All High School and College Pitchers

Note: High School Pitchers means Pitchers who went to HS before MLB College Pitchers means Pitchers who played in College before MLB

Criteria: More than 150 IP in back to back years Played in consecutive years

Min_IP = 150
  
pitchingdfByIP <- pitchingdf %>%
  mutate(IP_150 = ifelse(IP >= Min_IP, IP, NA)) %>%
  mutate(WAR_150 = ifelse(IP >= Min_IP, WAR, NA)) %>%
  group_by(bbrefID) %>%
  arrange(Year) %>%
  mutate(WAR_change = WAR_150 - lag(WAR_150)) %>%
  mutate(Standardized_WAR = (WAR_150 / IP_150)*(Min_IP)) %>% 
  mutate(Standardized_WAR_change = Standardized_WAR - lag(Standardized_WAR))

#Frequency Table of Ages - Pitchers Made decision to drop those ages with frequencies below 100

pitchingdfByIP %>%
  group_by(Age) %>%
  count(Age)

## # A tibble: 34 × 2
## # Groups:   Age [34]
##      Age     n
##    <int> <int>
##  1    17     2
##  2    18    19
##  3    19    82
##  4    20   228
##  5    21   613
##  6    22  1231
##  7    23  1969
##  8    24  2745
##  9    25  3245
## 10    26  3327
## # … with 24 more rows

Dropping Ages based on small sample size

pitchingdfByIP <- pitchingdfByIP %>%
  filter(Age > 19) %>%
  filter(Age < 41)

# Average WAR changes for all pitchers who meet criteria (HS and College)
Pitch_WAR_All_by_age <- pitchingdfByIP %>%
  filter(Age > 20) %>%
  group_by(Age) %>%
  arrange(Age) %>%
  # take the mean of each of the variables listed
  summarize_at(vars(WAR_150, WAR_change, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)

# Create Chained WAR
Pitch_WAR_All_by_age$Chain_WAR <- cumsum(coalesce(Pitch_WAR_All_by_age$WAR_change, 0)) + Pitch_WAR_All_by_age$WAR_change * 0

# Created Chain Standardized WAR Change
Pitch_WAR_All_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Pitch_WAR_All_by_age$Standardized_WAR_change, 0)) + 
  Pitch_WAR_All_by_age$Standardized_WAR_change * 0

# Change order of columns
Pitch_WAR_All_by_age <- Pitch_WAR_All_by_age[, c("Age", "WAR_150", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows

#Rename Columns
Pitch_WAR_All_by_age <- Pitch_WAR_All_by_age %>%
  rename(Avg_WAR_150 = WAR_150,
         Avg_WAR_change = WAR_change,
         Chain_WAR_150 = Chain_WAR)

College Pitchers

# Do same for college pitchers 
# College or HS = College

Min_IP = 150
  
PitchCollegeByIP <- pitchingdfByIP %>%
  filter(CollegeOrHS == 1)

# Average war changes for all college pitchers who meet IP criteria
Pitch_WAR_avg_College_by_age <- PitchCollegeByIP %>%
  filter(Age > 20) %>%
  group_by(Age) %>%
  arrange(Age) %>%
  # take the mean of each of the variables listed
  summarize_at(vars(WAR_150, WAR_change, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)

# Standard deviation of war changes for all college players who meet IP criteria
WAR_sd_College_by_age <- batcollegeByPAs %>%
  group_by(Age) %>%
  arrange(Age) %>% 
  summarize_at(vars(WAR_change, WAR_400), (sd), na.rm=TRUE)

# Create Chained WAR for college players
Pitch_WAR_avg_College_by_age$Chain_WAR <- cumsum(coalesce(Pitch_WAR_avg_College_by_age$WAR_change, 0)) + Pitch_WAR_avg_College_by_age$WAR_change * 0

# Created Chain Standardized WAR Change
Pitch_WAR_avg_College_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Pitch_WAR_avg_College_by_age$Standardized_WAR_change, 0)) + 
  Pitch_WAR_avg_College_by_age$Standardized_WAR_change * 0

# Change order of columns
Pitch_WAR_avg_College_by_age <- Pitch_WAR_avg_College_by_age[, c("Age", "WAR_150", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows

# Rename Columns
Pitch_WAR_avg_College_by_age <- Pitch_WAR_avg_College_by_age %>%
rename(Avg_WAR_150 = WAR_150,
         Avg_WAR_change = WAR_change,
         Chain_WAR_150 = Chain_WAR)

High School Pitchers

# Do same for High Schools 
# College or HS = HS

Min_IP = 150
  
PitchHSByIP <- pitchingdfByIP %>%
  filter(CollegeOrHS == 0)

# Average WAR changes and WAR for all high school pitchers who meet IP criteria
Pitch_WAR_avg_HS_by_age <- PitchHSByIP %>%
  filter(Age > 20) %>%
  group_by(Age) %>%
  arrange(Age) %>% 
  # take the mean of each of the variables listed
  summarize_at(vars(WAR_150, WAR_change, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)

# Standard Deviation of WAR changes and WAR for all high school players who meet PA criteria
WAR_sd_HS_by_age <- batHSByPAs %>%
  group_by(Age) %>%
  arrange(Age) %>% 
  summarize_at(vars(WAR_change, WAR_400), sd, na.rm=TRUE)

# Create Chained WAR for high school players
Pitch_WAR_avg_HS_by_age$Chain_WAR <- cumsum(coalesce(Pitch_WAR_avg_HS_by_age$WAR_change, 0)) + 
  Pitch_WAR_avg_HS_by_age$WAR_change * 0

# Created Chain Standardized WAR Change
Pitch_WAR_avg_HS_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Pitch_WAR_avg_HS_by_age$Standardized_WAR_change, 0)) + 
  Pitch_WAR_avg_HS_by_age$Standardized_WAR_change * 0

# Change order of columns
Pitch_WAR_avg_HS_by_age <- Pitch_WAR_avg_HS_by_age[, c("Age", "WAR_150", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows

# Rename Columns
Pitch_WAR_avg_HS_by_age <- Pitch_WAR_avg_HS_by_age %>%
rename(Avg_WAR_150 = WAR_150,
         Avg_WAR_change = WAR_change,
         Chain_WAR_150 = Chain_WAR)

Aging Curves - Pitchers

Delta Method

aging_curve <- ggplot(Pitch_WAR_All_by_age, aes(x = Age, y = Chain_WAR_150)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "All Pitchers Aging Curve", y="WAR", x = "Age") + xlim(15, 45)
print(aging_curve)

Delta Method - Standardized Per 150 IP

aging_curve <- ggplot(Pitch_WAR_All_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "All Pitchers Aging Curve", y="WAR Per 150 IP", x = "Age") + xlim(15, 45)
print(aging_curve)

Comparison of the two approaches to determine which is best Comparison to see which is better Blue is Delta Method Black is Delta Method per 150 IP

combined_plotALL <- ggplot() +               
  geom_point(data = Pitch_WAR_All_by_age, aes(Age, Chain_WAR_150), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data=Pitch_WAR_All_by_age, mapping = aes(Age, Chain_WAR_150,), colour="blue") + 
  geom_point(data = Pitch_WAR_All_by_age, aes(Age, Chain_Standardized_WAR_change), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data = Pitch_WAR_All_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
  labs(title = "All Pitchers - WAR vs WAR Per 150 IP Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 1))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotALL

College Pitchers

Delta Method

aging_curve_college <- ggplot(Pitch_WAR_avg_College_by_age, aes(x = Age, y = Chain_WAR_150)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "College Pitchers Aging Curve", y="WAR", x = "Age") + xlim(15, 45)
aging_curve_college

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

Delta Method - Standardized per 150IP

aging_curve_college <- ggplot(Pitch_WAR_All_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "College Pitchers Aging Curve", y="WAR Per 150 IP", x = "Age") + xlim(15, 45)
aging_curve_college

Comparsion of two approaches to see which is best Comparison to see which is better Blue is Delta Method Black is Delta Method per 150 IP

combined_plotCollege <- ggplot() +               
  geom_point(data = Pitch_WAR_avg_College_by_age, aes(Age, Chain_WAR_150), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data= Pitch_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_150,), colour="blue") + 
  geom_point(data = Pitch_WAR_avg_College_by_age, aes(Age, Chain_Standardized_WAR_change), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data = Pitch_WAR_avg_College_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
  labs(title = "College Pitchers - WAR vs WAR Per 150 IP Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 1))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotCollege

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

High School Pitchers

Delta Method

aging_curve_HS <- ggplot(Pitch_WAR_avg_HS_by_age, aes(x = Age, y = Chain_WAR_150)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "High School Pitchers Aging Curve", y="WAR", x = "Age") + xlim(15, 45)
aging_curve_HS

Delta Method - Standardized Per 150IP

aging_curve_HS <- ggplot(Pitch_WAR_avg_HS_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) + 
  geom_point() + 
  geom_line() + 
  labs(title= "High School Pitchers Aging Curve", y="WAR Per 150 IP", x = "Age") + xlim(15, 45)
aging_curve_HS

Comparsion to see which is best Comparison to see which is better Blue is Delta Method Black is Delta Method per 150 IP

combined_plotHS <- ggplot() +               
  geom_point(data = Pitch_WAR_avg_HS_by_age, aes(Age, Chain_WAR_150), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data= Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_150,), colour="blue") + 
  geom_point(data = Pitch_WAR_avg_HS_by_age, aes(Age, Chain_Standardized_WAR_change), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data = Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
  labs(title = "High School Pitchers - WAR vs WAR Per 150 IP Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 1))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotHS

High School Versus College Pitchers Comparison Plots

Blue is College Black is HS

combined_plotDeltaPitchers <- ggplot() + 
  
  geom_point(data = Pitch_WAR_avg_College_by_age, aes(Age, Chain_WAR_150), 
             fill = "blue", colour = "blue", size = 2, shape = 21) + 
  geom_line(data= Pitch_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_150), colour="blue") + 
  
  geom_point(data = Pitch_WAR_avg_HS_by_age, aes(Age, Chain_WAR_150), 
             fill = "black", colour = "black", 
             size = 2, shape = 21) + geom_line(data= Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_150), colour="black") +
  
  labs(title = "College vs High School Pitchers Aging Curves - Chained WAR", x = "Age", y = "Chained WAR") + xlim(19, 41) + theme_light() + scale_x_continuous(breaks = seq(19, 41, 2))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotDeltaPitchers

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 row(s) containing missing values (geom_path).

Ran regressions on the aging curves, again don’t know if tells me anything and also do not know why upside down lol

combined_plotregressPitchers <- ggplot() + 
  stat_smooth(data=Pitch_WAR_avg_College_by_age, mapping = aes(Age, Avg_WAR_150), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="blue", na.rm = TRUE) + stat_smooth(data=Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Avg_WAR_150), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="black", na.rm = TRUE) +
  labs(title = "College vs High School Aging Curve Regression - WAR", x = "Age", y = "WAR") + 
  xlim(19, 41) + 
  theme_light() + 
  scale_x_continuous(breaks = seq(19, 41, 2))

## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.

combined_plotregressPitchers

Regressions - Pitchers

Plot Regression for all pitchers

View the data first in a scatter plot

ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point()

## Warning: Removed 26389 rows containing missing values (geom_point).

Function to fit model

fit_model <- function(pitchingdfByIP) {
  fit <- lm(WAR_150 ~ I(Age-30) + I((Age-30)^2) + CollegeOrHS , data = pitchingdfByIP)
  b <- coef(fit)
  Age.max <- 30- b[2] / b[3] / 2
  Max <- b[1] - b[2] ^ 2 / b[3] / 4
  list(fit = fit, Age.max = Age.max, Max = Max)
}

F5 <- fit_model(pitchingdfByIP)
coef(F5$fit)

##     (Intercept)     I(Age - 30) I((Age - 30)^2)     CollegeOrHS 
##    2.8253902959    0.0138487604    0.0003813992   -0.0804159260

c(F5$Age.max, F5$Max)

## I(Age - 30) (Intercept) 
##   11.844797    2.699677

ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point() +
  geom_smooth(method = "lm", se = FALSE, size = 1.5, 
              formula = y ~ poly(x, 2, raw = TRUE)) +
  geom_vline(xintercept = F5$Age.max, 
             linetype = "dashed", color = "darkgrey") +
  geom_hline(yintercept = F5$Max, 
             linetype = "dashed", color = "darkgrey") + 
  annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1), 
           label = c("Peak Age", "Max WAR"), size = 5) +
  labs(title = "All Pitchers Regression")

## Warning: Removed 26389 rows containing non-finite values (stat_smooth).

## Warning: Removed 26389 rows containing missing values (geom_point).

F5 %>% pluck("fit") %>% summary()

## 
## Call:
## lm(formula = WAR_150 ~ I(Age - 30) + I((Age - 30)^2) + CollegeOrHS, 
##     data = pitchingdfByIP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1415 -1.4457 -0.1996  1.2504  9.4550 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.8253903  0.0529327  53.377   <2e-16 ***
## I(Age - 30)      0.0138488  0.0078437   1.766   0.0775 .  
## I((Age - 30)^2)  0.0003814  0.0014969   0.255   0.7989    
## CollegeOrHS     -0.0804159  0.0610854  -1.316   0.1881    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.124 on 4977 degrees of freedom
##   (26389 observations deleted due to missingness)
## Multiple R-squared:  0.0009256,  Adjusted R-squared:  0.0003234 
## F-statistic: 1.537 on 3 and 4977 DF,  p-value: 0.2028

Plot regression for college pitchers

View the data first in a scatter plot

ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point()

## Warning: Removed 26389 rows containing missing values (geom_point).

Function to fit model

fit_model <- function(pitchingdfByIP) {
  fit <- lm(WAR_150 ~ I(Age-30) + I((Age-30)^2) + CollegeOrHS , data = pitchingdfByIP)
  b <- coef(fit)
  Age.max <- 30- b[2] / b[3] / 2
  Max <- b[1] - b[2] ^ 2 / b[3] / 4
  list(fit = fit, Age.max = Age.max, Max = Max)
}

F5 <- fit_model(pitchingdfByIP)
coef(F5$fit)

##     (Intercept)     I(Age - 30) I((Age - 30)^2)     CollegeOrHS 
##    2.8253902959    0.0138487604    0.0003813992   -0.0804159260

c(F5$Age.max, F5$Max)

## I(Age - 30) (Intercept) 
##   11.844797    2.699677

ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point() +
  geom_smooth(method = "lm", se = FALSE, size = 1.5, 
              formula = y ~ poly(x, 2, raw = TRUE)) +
  geom_vline(xintercept = F5$Age.max, 
             linetype = "dashed", color = "darkgrey") +
  geom_hline(yintercept = F5$Max, 
             linetype = "dashed", color = "darkgrey") + 
  annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1), 
           label = c("Peak Age", "Max WAR"), size = 5) +
  labs(title = "All Pitchers Regression")

## Warning: Removed 26389 rows containing non-finite values (stat_smooth).

## Warning: Removed 26389 rows containing missing values (geom_point).

F5 %>% pluck("fit") %>% summary()

## 
## Call:
## lm(formula = WAR_150 ~ I(Age - 30) + I((Age - 30)^2) + CollegeOrHS, 
##     data = pitchingdfByIP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1415 -1.4457 -0.1996  1.2504  9.4550 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.8253903  0.0529327  53.377   <2e-16 ***
## I(Age - 30)      0.0138488  0.0078437   1.766   0.0775 .  
## I((Age - 30)^2)  0.0003814  0.0014969   0.255   0.7989    
## CollegeOrHS     -0.0804159  0.0610854  -1.316   0.1881    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.124 on 4977 degrees of freedom
##   (26389 observations deleted due to missingness)
## Multiple R-squared:  0.0009256,  Adjusted R-squared:  0.0003234 
## F-statistic: 1.537 on 3 and 4977 DF,  p-value: 0.2028

Statistical Analysis of Batting Data

Bobby Kent

11/3/2021

Import Libraries

Load Data Files

Data Wrangling

Drop unneeded columns

———————————————————

BATTERS

All High School and College Batters

Dropping Ages based on small sample size

College Batters

High School Batters

Aging Curves - Batters

High School Versus College Battters Comparison Plots - Delta Method - not per 400 PA

Regressions - Batters

Plot Regression for all batters

Plot regression for college batters

Plot regression for High School batters

———————————————————

PITCHERS

All High School and College Pitchers

Dropping Ages based on small sample size

College Pitchers

High School Pitchers

Aging Curves - Pitchers

High School Versus College Pitchers Comparison Plots

Regressions - Pitchers

Plot Regression for all pitchers

Plot regression for college pitchers

Plot regression for high school pitchers

TODO