library(baseballr)
## Warning: package 'baseballr' was built under R version 4.2.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Load prospect ranking and statistics spreadsheets from Fangraphs
top_prospects_19 <- read_excel('top_prospects_data.xlsx', sheet = 'top_prospects_19')
top_prospects_23 <- read_excel('top_prospects_data.xlsx', sheet = 'top_prospects_23')
## Remove players who are still ranked prospects in 2023
top_prospects_19 <- top_prospects_19 %>%
anti_join(top_prospects_23, by = "Name")
batter_stats <- read_excel('top_prospects_data.xlsx', sheet = 'batter_stats') %>%
select(Name, G, WAR)
batter_stats$WAR_162 <- batter_stats$WAR / (batter_stats$G / 162)
pitcher_stats <- read_excel('top_prospects_data.xlsx', sheet = 'pitcher_stats') %>%
select(Name, IP, WAR)
pitcher_stats$WAR_200 <- pitcher_stats$WAR / (pitcher_stats$IP / 200)
joined_table <- top_prospects_19 %>%
left_join(batter_stats, by = 'Name') %>%
left_join(pitcher_stats, by = 'Name') %>%
select(Rk, Name, Position, G, IP, WAR_162, WAR_200, Age)
## Combine WAR_162 (batters) and WAR_200 (pitchers)
joined_table$WAR_season <- if_else(joined_table$G >= 81 | joined_table$IP >= 100,
coalesce(joined_table$WAR_162, 0) +
coalesce(joined_table$WAR_200, 0),
0)
joined_table <- joined_table %>%
select(Rk, Name, Position, WAR_season, G, IP, WAR_162, WAR_200, Age)
joined_table <- joined_table %>%
rename(Rank2019 = Rk)
## Create 2023 approximate age column
joined_table$Age <- round(joined_table$Age,0) + 4
## Re-rank prospects based on MLB production
### Age is used as differentiator among those with no MLB production, with younger players ranking above older players.
joined_table <- joined_table %>%
arrange(desc(WAR_season), desc((coalesce(G, 0) * (200 / 162)) + coalesce(IP, 0)), Age) %>%
mutate(Rerank = row_number())
joined_table <- joined_table %>%
select(Rerank, Rank2019, Name, Position, Age, WAR_season, WAR_162, WAR_200, G, IP)
## print the first 20 rows
print(joined_table[1:20,])
## # A tibble: 20 × 10
## Rerank Rank2019 Name Position Age WAR_season WAR_162 WAR_200 G IP
## <int> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3 Fernan… SS 24 7.58 7.58 NA 374 NA
## 2 2 125 Yordan… DH 26 6.13 6.13 NA 444 NA
## 3 3 35 Sean M… C 28 5.76 5.76 NA 416 NA
## 4 4 2 Wander… SS 22 5.75 5.75 NA 265 NA
## 5 5 43 Luis R… CF 26 5.58 5.58 NA 337 NA
## 6 6 80 Will S… C 28 5.41 5.41 NA 452 NA
## 7 7 10 Kyle T… RF 26 5.29 5.29 NA 490 NA
## 8 8 9 Bo Bic… SS 25 4.90 4.90 NA 499 NA
## 9 9 46 Brando… 2B 29 4.55 4.55 NA 434 NA
## 10 10 105 Nico H… 2B 26 4.48 4.48 NA 358 NA
## 11 11 52 Andrés… SS 24 4.37 4.37 NA 378 NA
## 12 12 20 Ke'Bry… 3B 26 4.05 4.05 NA 344 NA
## 13 13 33 Austin… 3B 26 4.01 4.01 NA 570 NA
## 14 14 58 Dylan … RHP 27 3.91 NA 3.91 NA 614.
## 15 15 127 Sandy … RHP 27 3.85 NA 3.85 NA 832
## 16 16 55 Willia… C 25 3.76 3.76 NA 254 NA
## 17 17 36 Michae… RHP 26 3.68 NA 3.68 NA 217.
## 18 18 48 Pete A… 1B 28 3.65 3.65 NA 643 NA
## 19 19 21 Dustin… RHP 25 3.56 NA 3.56 NA 191.
## 20 20 31 Jazz C… SS 25 3.50 3.50 NA 264 NA
Run correlation tests between 2019 rankings and MLB production ranking to determine accuracy. First, test the accuracy of the entire ranking, followed by position groups.
# Analyze ranking accuracy by position groups
of_vector <- c('CF', 'RF', 'LF')
inf_vector <- c('1B', '2B', '3B', 'SS')
p_vector <- c('RHP', 'LHP')
joined_table$Pos_group <- if_else(joined_table$Position %in% of_vector, 'OF',
if_else(joined_table$Position %in% inf_vector, 'INF',
if_else(joined_table$Position %in% p_vector, 'P',
if_else(joined_table$Position == 'C', 'C', 'other'))))
position_group_frequency <- table(joined_table$Pos_group)
print(position_group_frequency)
##
## C INF OF other P
## 12 36 28 2 47
# Correlation for entire data set
joined_table %>%
ggplot(aes(x = Rank2019, y = Rerank)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = 'Rank2019 vs Rerank')
## `geom_smooth()` using formula = 'y ~ x'
corr_result <- corr.test(joined_table$Rank2019, joined_table$Rerank)
print(corr_result)
## Call:corr.test(x = joined_table$Rank2019, y = joined_table$Rerank)
## Correlation matrix
## [1] 0.32
## Sample Size
## [1] 125
## These are the unadjusted probability values.
## The probability values adjusted for multiple tests are in the p.adj object.
## [1] 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
This model indicates a somewhat weak, but significant positive correlation. This implies that the 2019 overall rankings were somewhat accurate.
# Correlation for Outfielders
of_table <- joined_table %>%
filter(Pos_group == 'OF')
of_table %>%
ggplot(aes(x = Rank2019, y = Rerank)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = 'Outfielders')
## `geom_smooth()` using formula = 'y ~ x'
of_corr_result <- corr.test(of_table$Rank2019, of_table$Rerank)
print(of_corr_result)
## Call:corr.test(x = of_table$Rank2019, y = of_table$Rerank)
## Correlation matrix
## [1] 0.38
## Sample Size
## [1] 28
## These are the unadjusted probability values.
## The probability values adjusted for multiple tests are in the p.adj object.
## [1] 0.05
##
## To see confidence intervals of the correlations, print with the short=FALSE option
Like the overall model, the Outfielders model shows a somewhat weak, but significant positive correlation.
#Infielders
inf_table <- joined_table %>%
filter(Pos_group == 'INF')
inf_table %>%
ggplot(aes(x = Rank2019, y = Rerank)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = 'Infielders')
## `geom_smooth()` using formula = 'y ~ x'
inf_corr_result <- corr.test(inf_table$Rank2019, inf_table$Rerank)
print(inf_corr_result)
## Call:corr.test(x = inf_table$Rank2019, y = inf_table$Rerank)
## Correlation matrix
## [1] 0.5
## Sample Size
## [1] 36
## These are the unadjusted probability values.
## The probability values adjusted for multiple tests are in the p.adj object.
## [1] 0
##
## To see confidence intervals of the correlations, print with the short=FALSE option
This model shows a stronger positive correlation. This indicates that Infielders in the 2019 rankings were accurately ranked.
# Pitchers
p_table <- joined_table %>%
filter(Pos_group == 'P')
p_table %>%
ggplot(aes(x = Rank2019, y = Rerank)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = 'Pitchers')
## `geom_smooth()` using formula = 'y ~ x'
p_corr_result <- corr.test(p_table$Rank2019, p_table$Rerank)
print(p_corr_result)
## Call:corr.test(x = p_table$Rank2019, y = p_table$Rerank)
## Correlation matrix
## [1] 0.24
## Sample Size
## [1] 47
## These are the unadjusted probability values.
## The probability values adjusted for multiple tests are in the p.adj object.
## [1] 0.11
##
## To see confidence intervals of the correlations, print with the short=FALSE option
This shows an insignificant relationship between the original rankings and re-rank. This implies that a prospect’s ranking within the top 100 gave no indication of his future MLB production relative to other top 100 prospects.
# Catchers
c_table <- joined_table %>%
filter(Pos_group == 'C')
c_table %>%
ggplot(aes(x = Rank2019, y = Rerank)) +
geom_point() +
geom_smooth(method = 'lm') +
labs(title = 'Catchers')
## `geom_smooth()` using formula = 'y ~ x'
c_corr_result <- corr.test(c_table$Rank2019, c_table$Rerank)
print(c_corr_result)
## Call:corr.test(x = c_table$Rank2019, y = c_table$Rerank)
## Correlation matrix
## [1] 0.19
## Sample Size
## [1] 12
## These are the unadjusted probability values.
## The probability values adjusted for multiple tests are in the p.adj object.
## [1] 0.56
##
## To see confidence intervals of the correlations, print with the short=FALSE option
As with pitchers, there is no significant correlation for catchers; however, the smaller sample size should be noted.
Within the top 100, a prospect’s ranking provides some, but not an overwhelming indication of that player’s future production. Therefore, a player ranked in the lower parts of the top 100 should be valued just as highly as one in the higher parts. This is especially true for pitchers and catchers.
This process can be used in hindsight to evaluate the performance of internal scouting operations. It can also be used to give context to the value of rankings. For example, By how much should a top 10 prospect be valued above a top 50 prospect? These insights can be used in trade and acquisition strategies.