# import libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──
## ✓ broom 0.7.9 ✓ rsample 0.1.0
## ✓ dials 0.0.10 ✓ tune 0.1.6
## ✓ infer 1.0.0 ✓ workflows 0.2.3
## ✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
## ✓ parsnip 0.1.7 ✓ yardstick 0.0.8
## ✓ recipes 0.1.16
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
library(dplyr)
library(tidyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
# Load data files
battingdf <- read.csv(file = "Batters-1960-2019.csv", header=TRUE)
pitchingdf <- read.csv(file = "Pitchers-1960-2019.csv", header=TRUE)
battingdf <- battingdf %>%
mutate(College = ifelse(College =='#N/A',NA, College)) %>%
mutate(CollegeOrHS = ifelse(!is.na(College),1,0))
pitchingdf <- pitchingdf %>%
mutate(College = ifelse(College.or.HS =='#N/A',NA, College.or.HS)) %>%
mutate(CollegeOrHS = ifelse(!is.na(College),1,0)) %>%
filter(Year <= 2019)
battingdf <- select(battingdf, -College.or.HS., -Country)
pitchingdf <- select(pitchingdf, -X, -College.or.HS)
Note: High School Batters means batters who went to HS before MLB College Batters means batters who played in College before MLB
Criteria: More than 400 PA in back to back years Played in consecutive years
Min_PA = 400
battingdfByPAs <- battingdf %>%
mutate(PA_400 = ifelse(PA >= Min_PA, PA, NA)) %>%
mutate(WAR_400 = ifelse(PA >= Min_PA, WAR, NA)) %>%
group_by(bbrefID) %>%
arrange(Year) %>%
mutate(WAR_change = WAR_400 - lag(WAR_400)) %>%
mutate(Standardized_WAR = (WAR_400 / PA_400)*(Min_PA)) %>%
mutate(Standardized_WAR_change = Standardized_WAR - lag(Standardized_WAR))
#Frequency Table of Ages - Batters Made decision to drop those ages with frequencies below 100
battingdfByPAs %>%
group_by(Age) %>%
count(Age)
## # A tibble: 36 × 2
## # Groups: Age [36]
## Age n
## <int> <int>
## 1 17 6
## 2 18 44
## 3 19 182
## 4 20 521
## 5 21 1308
## 6 22 2544
## 7 23 4033
## 8 24 5471
## 9 25 6283
## 10 26 6341
## # … with 26 more rows
Made decision to drop those ages with frequencies below 100
battingdfByPAs <- battingdfByPAs %>%
filter(Age > 18) %>%
filter(Age < 42)
# Average WAR changes for all players who meet criteria (HS and College)
Bat_WAR_All_by_age <- battingdfByPAs %>%
filter(Age < 41) %>%
filter(Age > 20) %>%
group_by(Age) %>%
arrange(Age) %>%
# take the mean of each of the variables listed
summarize_at(vars(WAR_change, WAR_400, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)
# Create Chained WAR
Bat_WAR_All_by_age$Chain_WAR <- cumsum(coalesce(Bat_WAR_All_by_age$WAR_change, 0)) + Bat_WAR_All_by_age$WAR_change * 0
# Created Chain Standardized WAR Change (Cumulamative )
Bat_WAR_All_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Bat_WAR_All_by_age$Standardized_WAR_change, 0)) +
Bat_WAR_All_by_age$Standardized_WAR_change * 0
# Change order of columns
Bat_WAR_All_by_age <- Bat_WAR_All_by_age[, c("Age", "WAR_400", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows
#Rename Columns
Bat_WAR_All_by_age <- Bat_WAR_All_by_age %>%
rename(Avg_WAR_400 = WAR_400,
Avg_WAR_change = WAR_change,
Chain_WAR_400 = Chain_WAR)
# Do same for colleges
# College or HS = College
Min_PA = 400
batcollegeByPAs <- battingdfByPAs %>%
filter(CollegeOrHS == 1)
# Average war changes for all college players who meet PA criteria
Bat_WAR_avg_College_by_age <- batcollegeByPAs %>%
filter(Age < 41) %>%
filter(Age > 21) %>%
group_by(Age) %>%
arrange(Age) %>%
# take the mean of each of the variables listed
summarize_at(vars(WAR_change, WAR_400, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)
# Standard deviation of war changes for all college players who meet PA criteria
WAR_sd_College_by_age <- batcollegeByPAs %>%
group_by(Age) %>%
arrange(Age) %>%
summarize_at(vars(WAR_change, WAR_400), (sd), na.rm=TRUE)
# Create Chained WAR for college players
Bat_WAR_avg_College_by_age$Chain_WAR <- cumsum(coalesce(Bat_WAR_avg_College_by_age$WAR_change, 0)) + Bat_WAR_avg_College_by_age$WAR_change * 0
# Created Chain Standardized WAR Change
Bat_WAR_avg_College_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Bat_WAR_avg_College_by_age$Standardized_WAR_change, 0)) +
Bat_WAR_avg_College_by_age$Standardized_WAR_change * 0
# Change order of columns
Bat_WAR_avg_College_by_age <- Bat_WAR_avg_College_by_age[, c("Age", "WAR_400", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows
#Rename Columns
Bat_WAR_avg_College_by_age <- Bat_WAR_avg_College_by_age %>%
rename(Avg_WAR_400 = WAR_400,
Avg_WAR_change = WAR_change,
Chain_WAR_400 = Chain_WAR)
# Do same for High Schools
# College or HS = HS
Min_PA = 400
batHSByPAs <- battingdfByPAs %>%
filter(CollegeOrHS == 0)
# Average WAR changes and WAR for all high school players who meet PA criteria
Bat_WAR_avg_HS_by_age <- batHSByPAs %>%
filter(Age < 41) %>%
filter(Age > 21) %>%
group_by(Age) %>%
arrange(Age) %>%
# take the mean of each of the variables listed
summarize_at(vars(WAR_change, WAR_400, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)
# Standard Deviation of WAR changes and WAR for all high school players who meet PA criteria
WAR_sd_HS_by_age <- batHSByPAs %>%
group_by(Age) %>%
arrange(Age) %>%
summarize_at(vars(WAR_change, WAR_400), sd, na.rm=TRUE)
# Create Chained WAR for college players
Bat_WAR_avg_HS_by_age$Chain_WAR <- cumsum(coalesce(Bat_WAR_avg_HS_by_age$WAR_change, 0)) +
Bat_WAR_avg_HS_by_age$WAR_change * 0
# Created Chain Standardized WAR Change
Bat_WAR_avg_HS_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Bat_WAR_avg_HS_by_age$Standardized_WAR_change, 0)) +
Bat_WAR_avg_HS_by_age$Standardized_WAR_change * 0
# Change order of columns
Bat_WAR_avg_HS_by_age <- Bat_WAR_avg_HS_by_age[, c("Age", "WAR_400", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows
#Rename Columns
Bat_WAR_avg_HS_by_age <- Bat_WAR_avg_HS_by_age %>%
rename(Avg_WAR_400 = WAR_400,
Avg_WAR_change = WAR_change,
Chain_WAR_400 = Chain_WAR)
Professor Luea Suggestion: Maybe conduct aging curves for different cohorts of players - maybe young star players like a Trea Turner get own aging curve. etc
Delta Method - Chained WAR
# Plot aging curves for all players who meet PA criteria
aging_curve <- ggplot(Bat_WAR_All_by_age, aes(x = Age, y = Chain_WAR_400)) +
geom_line() +
geom_point() +
labs(title= "All Batters Aging Curve", y="Chained WAR", x = "Age") + xlim(15, 45)
print(aging_curve)
Delta Method - All Batters - Standardized WAR per 400 PA
aging_curve <- ggplot(Bat_WAR_All_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) +
geom_point() +
geom_line() +
labs(title= "All Batters Aging Curve", y="WAR Per 400 PA", x = "Age") + xlim(15, 45)
aging_curve
Comparison to see which is better Blue is Delta Method Black is Delta Method per 400 PA
combined_plotALLBatters <- ggplot() +
geom_point(data = Bat_WAR_All_by_age, aes(Age, Chain_WAR_400),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data=Bat_WAR_All_by_age, mapping = aes(Age, Chain_WAR_400,), colour="blue") +
geom_point(data = Bat_WAR_All_by_age, aes(Age, Chain_Standardized_WAR_change),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data = Bat_WAR_All_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
labs(title = "All Batters - WAR vs WAR Per 400 PA Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 2))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotALLBatters
Delta Method - College Batters
aging_curve_college <- ggplot(Bat_WAR_avg_College_by_age, aes(x = Age, y = Chain_WAR_400)) +
geom_point() +
geom_line() +
labs(title= "College Batters Aging Curve", y=" Chained WAR", x = "Age") +
xlim(15, 45)
aging_curve_college
Delta Method - College Batters - WAR per 400 PA
aging_curve_college2 <- ggplot(Bat_WAR_avg_College_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) +
geom_point() +
geom_line() +
labs(title= "College Batters Aging Curve", y="WAR Per 400 PA", x = "Age") + xlim(15, 45)
aging_curve_college2
Comparison to see which is better Blue is Delta Method Black is Delta Method per 400 PA
combined_plotALLCollegeBatters <- ggplot() +
geom_point(data = Bat_WAR_avg_College_by_age, aes(Age, Chain_WAR_400),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data= Bat_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_400), colour="blue") +
geom_point(data = Bat_WAR_avg_College_by_age, aes(Age, Chain_Standardized_WAR_change),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data = Bat_WAR_avg_College_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
labs(title = "College Batters - WAR vs WAR Per 400 PA Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 2))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotALLCollegeBatters
Delta Method - High School Batters
aging_curve_HSBatters <- ggplot(Bat_WAR_avg_HS_by_age, aes(x = Age, y = Chain_WAR_400)) + geom_point() + geom_line() +
labs(title= "High School Batters Aging Curve", y=" Chained WAR", x = "Age") +
xlim(15, 45)
aging_curve_HSBatters
Standardized WAR per 400 PA - High School Batters
aging_curve_HS2 <- ggplot(Bat_WAR_avg_HS_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) +
geom_point() +
geom_line() +
labs(title= "High School Batters Aging Curve", y="WAR Per 400 PA", x = "Age") + xlim(15, 45)
aging_curve_HS2
Combined Plot - High School Batters - Goal is to determine which is better, WAR or WAR Per 400 PA - must make decision regarding this
combined_plotALLHSBatters <- ggplot() +
geom_point(data = Bat_WAR_avg_HS_by_age, aes(Age, Chain_WAR_400),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data= Bat_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_400), colour="blue") +
geom_point(data = Bat_WAR_avg_HS_by_age, aes(Age, Chain_Standardized_WAR_change),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data = Bat_WAR_avg_HS_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
labs(title = "High School Batters - WAR vs WAR Per 400 PA Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 2))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotALLHSBatters
combined_plotDelta <- ggplot() +
geom_point(data = Bat_WAR_avg_College_by_age, aes(Age, Chain_WAR_400),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data=Bat_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_400,), colour="blue") +
geom_point(data = Bat_WAR_avg_HS_by_age, aes(Age, Chain_WAR_400),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data=Bat_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_400), colour="black") +
labs(title = "College vs High School Aging Curves - Chained WAR", x = "Age", y = "Chained WAR") + xlim(19, 41) + theme_light() + scale_x_continuous(breaks = seq(19, 41, 2))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotDelta
Plotted a regression on the aging curve to see how it looks but not sure this tells me anything.
combined_plotregress <- ggplot() +
stat_smooth(data=Bat_WAR_avg_College_by_age, mapping = aes(Age, Avg_WAR_400), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="blue", na.rm = TRUE) + stat_smooth(data=Bat_WAR_avg_HS_by_age, mapping = aes(Age, Avg_WAR_400), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="black", na.rm = TRUE) +
labs(title = "College vs High School Aging Curve Regression - WAR", x = "Age", y = "WAR") +
xlim(19, 41) +
theme_light() +
scale_x_continuous(breaks = seq(19, 41, 2))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotregress
View the data first in a scatter plot
ggplot(battingdfByPAs, aes(Age, WAR_400)) + geom_point()
## Warning: Removed 50544 rows containing missing values (geom_point).
Suggestion from Professor Luea: Potentially think of non-linear transformations on the age variable
Function to fit model
fit_model <- function(battingdfByPAs) {
fit <- lm(WAR_400 ~ I((Age-30)^2) + CollegeOrHS , data = battingdfByPAs)
b <- coef(fit)
Age.max <- 30- b[2] / b[3] / 2
Max <- b[1] - b[2] ^ 2 / b[3] / 4
list(fit = fit, Age.max = Age.max, Max = Max)
}
F2 <- fit_model(battingdfByPAs)
coef(F2$fit)
## (Intercept) I((Age - 30)^2) CollegeOrHS
## 2.738226258 -0.005562773 -0.156034743
c(F2$Age.max, F2$Max)
## I((Age - 30)^2) (Intercept)
## 29.982175 2.738276
ggplot(battingdfByPAs, aes(Age, WAR_400)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, size = 1.5,
formula = y ~ poly(x, 2, raw = TRUE)) +
geom_vline(xintercept = F2$Age.max,
linetype = "dashed", color = "darkgrey") +
geom_hline(yintercept = F2$Max,
linetype = "dashed", color = "darkgrey") +
annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1),
label = c("Peak Age", "Max WAR"), size = 5) +
labs(title = "All Players Regression")
## Warning: Removed 50544 rows containing non-finite values (stat_smooth).
## Warning: Removed 50544 rows containing missing values (geom_point).
F2 %>% pluck("fit") %>% summary()
##
## Call:
## lm(formula = WAR_400 ~ I((Age - 30)^2) + CollegeOrHS, data = battingdfByPAs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5380 -1.4894 -0.1992 1.2362 9.7118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.738226 0.032852 83.351 < 2e-16 ***
## I((Age - 30)^2) -0.005563 0.001003 -5.546 3e-08 ***
## CollegeOrHS -0.156035 0.041613 -3.750 0.000178 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.112 on 10909 degrees of freedom
## (50544 observations deleted due to missingness)
## Multiple R-squared: 0.003673, Adjusted R-squared: 0.00349
## F-statistic: 20.11 on 2 and 10909 DF, p-value: 1.922e-09
Suggestion from Professor Luea on how to achieve better model fit: For example, run model with WAR ~ Age, then WAR ~ Age + Age^2, then WAR ~ Age + Age^2 + CollegeOrHS For each model, write down coefficient estimates and R^2 and if estimates are statistically significant or not Also look at what else is being done in the space by researchers
View the data first in a scatter plot
ggplot(batcollegeByPAs, aes(Age, WAR_400)) + geom_point()
## Warning: Removed 23573 rows containing missing values (geom_point).
Function to fit model
fit_model2 <- function(batcollegeByPAs) {
fit <- lm(WAR_400 ~ I(Age-30) + I((Age-30)^2), data = batcollegeByPAs)
b <- coef(fit)
Age.max <- 30- b[2] / b[3] / 2
Max <- b[1] - b[2] ^ 2 / b[3] / 4
list(fit = fit, Age.max = Age.max, Max = Max)
}
F3 <- fit_model2(batcollegeByPAs)
coef(F3$fit)
## (Intercept) I(Age - 30) I((Age - 30)^2)
## 2.579065367 -0.045916244 -0.008282423
c(F3$Age.max, F2$Max)
## I(Age - 30) (Intercept)
## 27.228091 2.738276
ggplot(batcollegeByPAs, aes(Age, WAR_400)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, size = 1.5,
formula = y ~ poly(x, 2, raw = TRUE)) +
geom_vline(xintercept = F3$Age.max,
linetype = "dashed", color = "darkgrey") +
geom_hline(yintercept = F3$Max,
linetype = "dashed", color = "darkgrey") +
annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1),
label = c("Peak Age", "Max WAR"), size = 5) +
labs(title = "College Players Regression")
## Warning: Removed 23573 rows containing non-finite values (stat_smooth).
## Warning: Removed 23573 rows containing missing values (geom_point).
F3 %>% pluck("fit") %>% summary()
##
## Call:
## lm(formula = WAR_400 ~ I(Age - 30) + I((Age - 30)^2), data = batcollegeByPAs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7541 -1.4423 -0.2302 1.1751 9.8946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.579065 0.040961 62.963 < 2e-16 ***
## I(Age - 30) -0.045916 0.008344 -5.503 3.96e-08 ***
## I((Age - 30)^2) -0.008282 0.001758 -4.711 2.55e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.046 on 4356 degrees of freedom
## (23573 observations deleted due to missingness)
## Multiple R-squared: 0.01164, Adjusted R-squared: 0.01118
## F-statistic: 25.64 on 2 and 4356 DF, p-value: 8.496e-12
View the data first in a scatter plot
ggplot(batHSByPAs, aes(Age, WAR_400)) + geom_point()
## Warning: Removed 26971 rows containing missing values (geom_point).
Function to fit model
fit_model3 <- function(batHSByPAs) {
fit <- lm(WAR_400 ~ I(Age-30) + I((Age-30)^2), data = batHSByPAs)
b <- coef(fit)
Age.max <- 30- b[2] / b[3] / 2
Max <- b[1] - b[2] ^ 2 / b[3] / 4
list(fit = fit, Age.max = Age.max, Max = Max)
}
F4 <- fit_model3(batHSByPAs)
coef(F4$fit)
## (Intercept) I(Age - 30) I((Age - 30)^2)
## 2.687670933 -0.051732102 -0.008141429
c(F4$Age.max, F4$Max)
## I(Age - 30) (Intercept)
## 26.82291 2.76985
ggplot(batHSByPAs, aes(Age, WAR_400)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, size = 1.5,
formula = y ~ poly(x, 2, raw = TRUE)) +
geom_vline(xintercept = F4$Age.max,
linetype = "dashed", color = "darkgrey") +
geom_hline(yintercept = F4$Max,
linetype = "dashed", color = "darkgrey") +
annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1),
label = c("Peak Age", "Max WAR"), size = 5) +
labs(title = "College Players Regression")
## Warning: Removed 26971 rows containing non-finite values (stat_smooth).
## Warning: Removed 26971 rows containing missing values (geom_point).
F4 %>% pluck("fit") %>% summary()
##
## Call:
## lm(formula = WAR_400 ~ I(Age - 30) + I((Age - 30)^2), data = batHSByPAs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.705 -1.543 -0.205 1.295 9.630
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.687671 0.036142 74.364 < 2e-16 ***
## I(Age - 30) -0.051732 0.007162 -7.223 5.67e-13 ***
## I((Age - 30)^2) -0.008141 0.001323 -6.156 7.93e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.142 on 6550 degrees of freedom
## (26971 observations deleted due to missingness)
## Multiple R-squared: 0.009909, Adjusted R-squared: 0.009606
## F-statistic: 32.78 on 2 and 6550 DF, p-value: 6.866e-15
# Convert chr types to numeric
pitchingdf$IP <- as.numeric(pitchingdf$IP)
## Warning: NAs introduced by coercion
pitchingdf$WAR <- as.numeric(pitchingdf$WAR)
## Warning: NAs introduced by coercion
Note: High School Pitchers means Pitchers who went to HS before MLB College Pitchers means Pitchers who played in College before MLB
Criteria: More than 150 IP in back to back years Played in consecutive years
Min_IP = 150
pitchingdfByIP <- pitchingdf %>%
mutate(IP_150 = ifelse(IP >= Min_IP, IP, NA)) %>%
mutate(WAR_150 = ifelse(IP >= Min_IP, WAR, NA)) %>%
group_by(bbrefID) %>%
arrange(Year) %>%
mutate(WAR_change = WAR_150 - lag(WAR_150)) %>%
mutate(Standardized_WAR = (WAR_150 / IP_150)*(Min_IP)) %>%
mutate(Standardized_WAR_change = Standardized_WAR - lag(Standardized_WAR))
#Frequency Table of Ages - Pitchers Made decision to drop those ages with frequencies below 100
pitchingdfByIP %>%
group_by(Age) %>%
count(Age)
## # A tibble: 34 × 2
## # Groups: Age [34]
## Age n
## <int> <int>
## 1 17 2
## 2 18 19
## 3 19 82
## 4 20 228
## 5 21 613
## 6 22 1231
## 7 23 1969
## 8 24 2745
## 9 25 3245
## 10 26 3327
## # … with 24 more rows
pitchingdfByIP <- pitchingdfByIP %>%
filter(Age > 19) %>%
filter(Age < 41)
# Average WAR changes for all pitchers who meet criteria (HS and College)
Pitch_WAR_All_by_age <- pitchingdfByIP %>%
filter(Age > 20) %>%
group_by(Age) %>%
arrange(Age) %>%
# take the mean of each of the variables listed
summarize_at(vars(WAR_150, WAR_change, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)
# Create Chained WAR
Pitch_WAR_All_by_age$Chain_WAR <- cumsum(coalesce(Pitch_WAR_All_by_age$WAR_change, 0)) + Pitch_WAR_All_by_age$WAR_change * 0
# Created Chain Standardized WAR Change
Pitch_WAR_All_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Pitch_WAR_All_by_age$Standardized_WAR_change, 0)) +
Pitch_WAR_All_by_age$Standardized_WAR_change * 0
# Change order of columns
Pitch_WAR_All_by_age <- Pitch_WAR_All_by_age[, c("Age", "WAR_150", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows
#Rename Columns
Pitch_WAR_All_by_age <- Pitch_WAR_All_by_age %>%
rename(Avg_WAR_150 = WAR_150,
Avg_WAR_change = WAR_change,
Chain_WAR_150 = Chain_WAR)
# Do same for college pitchers
# College or HS = College
Min_IP = 150
PitchCollegeByIP <- pitchingdfByIP %>%
filter(CollegeOrHS == 1)
# Average war changes for all college pitchers who meet IP criteria
Pitch_WAR_avg_College_by_age <- PitchCollegeByIP %>%
filter(Age > 20) %>%
group_by(Age) %>%
arrange(Age) %>%
# take the mean of each of the variables listed
summarize_at(vars(WAR_150, WAR_change, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)
# Standard deviation of war changes for all college players who meet IP criteria
WAR_sd_College_by_age <- batcollegeByPAs %>%
group_by(Age) %>%
arrange(Age) %>%
summarize_at(vars(WAR_change, WAR_400), (sd), na.rm=TRUE)
# Create Chained WAR for college players
Pitch_WAR_avg_College_by_age$Chain_WAR <- cumsum(coalesce(Pitch_WAR_avg_College_by_age$WAR_change, 0)) + Pitch_WAR_avg_College_by_age$WAR_change * 0
# Created Chain Standardized WAR Change
Pitch_WAR_avg_College_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Pitch_WAR_avg_College_by_age$Standardized_WAR_change, 0)) +
Pitch_WAR_avg_College_by_age$Standardized_WAR_change * 0
# Change order of columns
Pitch_WAR_avg_College_by_age <- Pitch_WAR_avg_College_by_age[, c("Age", "WAR_150", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows
# Rename Columns
Pitch_WAR_avg_College_by_age <- Pitch_WAR_avg_College_by_age %>%
rename(Avg_WAR_150 = WAR_150,
Avg_WAR_change = WAR_change,
Chain_WAR_150 = Chain_WAR)
# Do same for High Schools
# College or HS = HS
Min_IP = 150
PitchHSByIP <- pitchingdfByIP %>%
filter(CollegeOrHS == 0)
# Average WAR changes and WAR for all high school pitchers who meet IP criteria
Pitch_WAR_avg_HS_by_age <- PitchHSByIP %>%
filter(Age > 20) %>%
group_by(Age) %>%
arrange(Age) %>%
# take the mean of each of the variables listed
summarize_at(vars(WAR_150, WAR_change, Standardized_WAR, Standardized_WAR_change), mean, na.rm=TRUE)
# Standard Deviation of WAR changes and WAR for all high school players who meet PA criteria
WAR_sd_HS_by_age <- batHSByPAs %>%
group_by(Age) %>%
arrange(Age) %>%
summarize_at(vars(WAR_change, WAR_400), sd, na.rm=TRUE)
# Create Chained WAR for high school players
Pitch_WAR_avg_HS_by_age$Chain_WAR <- cumsum(coalesce(Pitch_WAR_avg_HS_by_age$WAR_change, 0)) +
Pitch_WAR_avg_HS_by_age$WAR_change * 0
# Created Chain Standardized WAR Change
Pitch_WAR_avg_HS_by_age$Chain_Standardized_WAR_change <- cumsum(coalesce(Pitch_WAR_avg_HS_by_age$Standardized_WAR_change, 0)) +
Pitch_WAR_avg_HS_by_age$Standardized_WAR_change * 0
# Change order of columns
Pitch_WAR_avg_HS_by_age <- Pitch_WAR_avg_HS_by_age[, c("Age", "WAR_150", "WAR_change", "Chain_WAR", "Standardized_WAR", "Standardized_WAR_change", "Chain_Standardized_WAR_change")] # leave the row index blank to keep all rows
# Rename Columns
Pitch_WAR_avg_HS_by_age <- Pitch_WAR_avg_HS_by_age %>%
rename(Avg_WAR_150 = WAR_150,
Avg_WAR_change = WAR_change,
Chain_WAR_150 = Chain_WAR)
Delta Method
aging_curve <- ggplot(Pitch_WAR_All_by_age, aes(x = Age, y = Chain_WAR_150)) +
geom_point() +
geom_line() +
labs(title= "All Pitchers Aging Curve", y="WAR", x = "Age") + xlim(15, 45)
print(aging_curve)
Delta Method - Standardized Per 150 IP
aging_curve <- ggplot(Pitch_WAR_All_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) +
geom_point() +
geom_line() +
labs(title= "All Pitchers Aging Curve", y="WAR Per 150 IP", x = "Age") + xlim(15, 45)
print(aging_curve)
Comparison of the two approaches to determine which is best Comparison to see which is better Blue is Delta Method Black is Delta Method per 150 IP
combined_plotALL <- ggplot() +
geom_point(data = Pitch_WAR_All_by_age, aes(Age, Chain_WAR_150),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data=Pitch_WAR_All_by_age, mapping = aes(Age, Chain_WAR_150,), colour="blue") +
geom_point(data = Pitch_WAR_All_by_age, aes(Age, Chain_Standardized_WAR_change),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data = Pitch_WAR_All_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
labs(title = "All Pitchers - WAR vs WAR Per 150 IP Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 1))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotALL
College Pitchers
Delta Method
aging_curve_college <- ggplot(Pitch_WAR_avg_College_by_age, aes(x = Age, y = Chain_WAR_150)) +
geom_point() +
geom_line() +
labs(title= "College Pitchers Aging Curve", y="WAR", x = "Age") + xlim(15, 45)
aging_curve_college
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
Delta Method - Standardized per 150IP
aging_curve_college <- ggplot(Pitch_WAR_All_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) +
geom_point() +
geom_line() +
labs(title= "College Pitchers Aging Curve", y="WAR Per 150 IP", x = "Age") + xlim(15, 45)
aging_curve_college
Comparsion of two approaches to see which is best Comparison to see which is better Blue is Delta Method Black is Delta Method per 150 IP
combined_plotCollege <- ggplot() +
geom_point(data = Pitch_WAR_avg_College_by_age, aes(Age, Chain_WAR_150),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data= Pitch_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_150,), colour="blue") +
geom_point(data = Pitch_WAR_avg_College_by_age, aes(Age, Chain_Standardized_WAR_change),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data = Pitch_WAR_avg_College_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
labs(title = "College Pitchers - WAR vs WAR Per 150 IP Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 1))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotCollege
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
High School Pitchers
Delta Method
aging_curve_HS <- ggplot(Pitch_WAR_avg_HS_by_age, aes(x = Age, y = Chain_WAR_150)) +
geom_point() +
geom_line() +
labs(title= "High School Pitchers Aging Curve", y="WAR", x = "Age") + xlim(15, 45)
aging_curve_HS
Delta Method - Standardized Per 150IP
aging_curve_HS <- ggplot(Pitch_WAR_avg_HS_by_age, aes(x = Age, y = Chain_Standardized_WAR_change)) +
geom_point() +
geom_line() +
labs(title= "High School Pitchers Aging Curve", y="WAR Per 150 IP", x = "Age") + xlim(15, 45)
aging_curve_HS
Comparsion to see which is best Comparison to see which is better Blue is Delta Method Black is Delta Method per 150 IP
combined_plotHS <- ggplot() +
geom_point(data = Pitch_WAR_avg_HS_by_age, aes(Age, Chain_WAR_150),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data= Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_150,), colour="blue") +
geom_point(data = Pitch_WAR_avg_HS_by_age, aes(Age, Chain_Standardized_WAR_change),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data = Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Chain_Standardized_WAR_change), colour="black") +
labs(title = "High School Pitchers - WAR vs WAR Per 150 IP Plot", x = "Age", y = "WAR") + xlim(21, 47) + theme_light() + scale_x_continuous(breaks = seq(21, 47, 1))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotHS
Blue is College Black is HS
combined_plotDeltaPitchers <- ggplot() +
geom_point(data = Pitch_WAR_avg_College_by_age, aes(Age, Chain_WAR_150),
fill = "blue", colour = "blue", size = 2, shape = 21) +
geom_line(data= Pitch_WAR_avg_College_by_age, mapping = aes(Age, Chain_WAR_150), colour="blue") +
geom_point(data = Pitch_WAR_avg_HS_by_age, aes(Age, Chain_WAR_150),
fill = "black", colour = "black",
size = 2, shape = 21) + geom_line(data= Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Chain_WAR_150), colour="black") +
labs(title = "College vs High School Pitchers Aging Curves - Chained WAR", x = "Age", y = "Chained WAR") + xlim(19, 41) + theme_light() + scale_x_continuous(breaks = seq(19, 41, 2))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotDeltaPitchers
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 row(s) containing missing values (geom_path).
Ran regressions on the aging curves, again don’t know if tells me anything and also do not know why upside down lol
combined_plotregressPitchers <- ggplot() +
stat_smooth(data=Pitch_WAR_avg_College_by_age, mapping = aes(Age, Avg_WAR_150), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="blue", na.rm = TRUE) + stat_smooth(data=Pitch_WAR_avg_HS_by_age, mapping = aes(Age, Avg_WAR_150), method = "lm", formula = y ~ poly(x, 2), size = 1, se=FALSE, colour="black", na.rm = TRUE) +
labs(title = "College vs High School Aging Curve Regression - WAR", x = "Age", y = "WAR") +
xlim(19, 41) +
theme_light() +
scale_x_continuous(breaks = seq(19, 41, 2))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
combined_plotregressPitchers
View the data first in a scatter plot
ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point()
## Warning: Removed 26389 rows containing missing values (geom_point).
Function to fit model
fit_model <- function(pitchingdfByIP) {
fit <- lm(WAR_150 ~ I(Age-30) + I((Age-30)^2) + CollegeOrHS , data = pitchingdfByIP)
b <- coef(fit)
Age.max <- 30- b[2] / b[3] / 2
Max <- b[1] - b[2] ^ 2 / b[3] / 4
list(fit = fit, Age.max = Age.max, Max = Max)
}
F5 <- fit_model(pitchingdfByIP)
coef(F5$fit)
## (Intercept) I(Age - 30) I((Age - 30)^2) CollegeOrHS
## 2.8253902959 0.0138487604 0.0003813992 -0.0804159260
c(F5$Age.max, F5$Max)
## I(Age - 30) (Intercept)
## 11.844797 2.699677
ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, size = 1.5,
formula = y ~ poly(x, 2, raw = TRUE)) +
geom_vline(xintercept = F5$Age.max,
linetype = "dashed", color = "darkgrey") +
geom_hline(yintercept = F5$Max,
linetype = "dashed", color = "darkgrey") +
annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1),
label = c("Peak Age", "Max WAR"), size = 5) +
labs(title = "All Pitchers Regression")
## Warning: Removed 26389 rows containing non-finite values (stat_smooth).
## Warning: Removed 26389 rows containing missing values (geom_point).
F5 %>% pluck("fit") %>% summary()
##
## Call:
## lm(formula = WAR_150 ~ I(Age - 30) + I((Age - 30)^2) + CollegeOrHS,
## data = pitchingdfByIP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1415 -1.4457 -0.1996 1.2504 9.4550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8253903 0.0529327 53.377 <2e-16 ***
## I(Age - 30) 0.0138488 0.0078437 1.766 0.0775 .
## I((Age - 30)^2) 0.0003814 0.0014969 0.255 0.7989
## CollegeOrHS -0.0804159 0.0610854 -1.316 0.1881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.124 on 4977 degrees of freedom
## (26389 observations deleted due to missingness)
## Multiple R-squared: 0.0009256, Adjusted R-squared: 0.0003234
## F-statistic: 1.537 on 3 and 4977 DF, p-value: 0.2028
View the data first in a scatter plot
ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point()
## Warning: Removed 26389 rows containing missing values (geom_point).
Function to fit model
fit_model <- function(pitchingdfByIP) {
fit <- lm(WAR_150 ~ I(Age-30) + I((Age-30)^2) + CollegeOrHS , data = pitchingdfByIP)
b <- coef(fit)
Age.max <- 30- b[2] / b[3] / 2
Max <- b[1] - b[2] ^ 2 / b[3] / 4
list(fit = fit, Age.max = Age.max, Max = Max)
}
F5 <- fit_model(pitchingdfByIP)
coef(F5$fit)
## (Intercept) I(Age - 30) I((Age - 30)^2) CollegeOrHS
## 2.8253902959 0.0138487604 0.0003813992 -0.0804159260
c(F5$Age.max, F5$Max)
## I(Age - 30) (Intercept)
## 11.844797 2.699677
ggplot(pitchingdfByIP, aes(Age, WAR_150)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, size = 1.5,
formula = y ~ poly(x, 2, raw = TRUE)) +
geom_vline(xintercept = F5$Age.max,
linetype = "dashed", color = "darkgrey") +
geom_hline(yintercept = F5$Max,
linetype = "dashed", color = "darkgrey") +
annotate(geom = "text", x = c(29,20), y = c(0.72, 1.1),
label = c("Peak Age", "Max WAR"), size = 5) +
labs(title = "All Pitchers Regression")
## Warning: Removed 26389 rows containing non-finite values (stat_smooth).
## Warning: Removed 26389 rows containing missing values (geom_point).
F5 %>% pluck("fit") %>% summary()
##
## Call:
## lm(formula = WAR_150 ~ I(Age - 30) + I((Age - 30)^2) + CollegeOrHS,
## data = pitchingdfByIP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1415 -1.4457 -0.1996 1.2504 9.4550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8253903 0.0529327 53.377 <2e-16 ***
## I(Age - 30) 0.0138488 0.0078437 1.766 0.0775 .
## I((Age - 30)^2) 0.0003814 0.0014969 0.255 0.7989
## CollegeOrHS -0.0804159 0.0610854 -1.316 0.1881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.124 on 4977 degrees of freedom
## (26389 observations deleted due to missingness)
## Multiple R-squared: 0.0009256, Adjusted R-squared: 0.0003234
## F-statistic: 1.537 on 3 and 4977 DF, p-value: 0.2028