Setting up the basic framework for the projection model. First, we'll source a file that reads in the data and does some basic data cleaning and management, as well as load relavent libraries.
library(boot)
source("readfgData.R")
There are 967 players in the dataset.
Here is a quick look at the distribution of some variables and their corresponding normal approximations, FStrike, the percentage of first strikes seen by the batter, SwStr, the batter's swinging strike rate, and ZContact, the batter's contact percentage within the strikezone.
par(mfrow = c(1, 3))
hist(fg$FStrike, breaks = 20, freq = FALSE, main = "Distribution of the Percentage of \n First Strikes Seen",
cex.main = 1.3, ylab = "", xlab = "First Strike Percentage", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$FStrike), max(fg$FStrike), length = 100), mean = mean(fg$FStrike),
sd = sd(fg$FStrike))
points(seq(min(fg$FStrike), max(fg$FStrike), length = 100), normFit, type = "l",
col = "red")
hist(fg$SwStr, breaks = 20, freq = FALSE, main = "Distribution of Batter's Swinging Strike Rates",
cex.main = 1.3, ylab = "", xlab = "Swinging Strike Percentage", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$SwStr), max(fg$SwStr), length = 100), mean = mean(fg$SwStr),
sd = sd(fg$SwStr))
points(seq(min(fg$SwStr), max(fg$SwStr), length = 100), normFit, type = "l",
col = "red")
hist(fg$ZContact, breaks = 20, freq = FALSE, main = "Distribution of Batter's Contact Rate \n on Pitches Within The Stike Zone",
cex.main = 1.3, ylab = "", xlab = "In-Zone Contact Rate", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$ZContact), max(fg$ZContact), length = 100), mean = mean(fg$ZContact),
sd = sd(fg$ZContact))
points(seq(min(fg$ZContact), max(fg$ZContact), length = 100), normFit, type = "l",
col = "red")
fg[fg$playerid == min(fg$playerid), ]
## Season Name Team playerid PA AVG OSwing ZSwing Swing
## 23 2006 Alfredo Amezaga Marlins 1 378 0.260 0.223 0.628 0.449
## 457 2007 Alfredo Amezaga Marlins 1 448 0.263 0.237 0.640 0.447
## 896 2008 Alfredo Amezaga Marlins 1 337 0.264 0.250 0.643 0.461
## OContact ZContact Contact Zone FStrike SwStr G AB H Singles
## 23 0.701 0.885 0.845 0.559 0.579 0.067 132 334 87 72
## 457 0.714 0.938 0.881 0.522 0.565 0.052 133 400 105 80
## 896 0.667 0.918 0.855 0.536 0.599 0.067 125 311 82 61
## Doubles Triples HR R RBI BB IBB SO HBP SF SH GDP SB CS BBpercent
## 23 9 3 3 42 19 33 4 46 3 1 7 5 20 12 0.087
## 457 14 9 2 46 30 35 0 52 4 5 4 4 13 7 0.078
## 896 13 5 3 41 32 19 1 47 3 0 4 6 8 2 0.056
## Kpercent BBperK OBP SLG OPS ISO Spd BABIP UBR wSB wRC wRAA
## 23 0.122 0.72 0.332 0.332 0.664 0.072 5.9 0.294 3.8 -1.4 36 -11.5
## 457 0.116 0.67 0.324 0.358 0.682 0.095 6.4 0.293 -0.3 -0.8 45 -9.9
## 896 0.139 0.40 0.312 0.367 0.679 0.103 6.8 0.303 2.6 0.5 33 -7.5
## wOBA wRCadd PowFac
## 23 0.296 75 1.277
## 457 0.305 79 1.361
## 896 0.301 77 1.390
A relavent thing to look at it how stable some of the data is from year to year. Let's look at the year-to-year correlation for data throughout this dataset.
for (var in 5:length(names(fg))) {
fgyear <- fg[fg$Season == 2006, ]
varPair <- c()
for (year in 2007:2013) {
fglyear <- fgyear
fgyear <- fg[fg$Season == year, ]
for (pid in intersect(fgyear$playerid, fglyear$playerid)) {
varPair <- rbind(varPair, c(fglyear[fglyear$playerid == pid, var],
fgyear[fgyear$playerid == pid, var]))
}
}
cat("The year-to-year correlation for ", names(fg)[var], "is: ", cor(varPair[,
2], varPair[, 1]), "\n")
}
## The year-to-year correlation for PA is: 0.5881
## The year-to-year correlation for AVG is: 0.3944
## The year-to-year correlation for OSwing is: 0.8024
## The year-to-year correlation for ZSwing is: 0.7921
## The year-to-year correlation for Swing is: 0.8025
## The year-to-year correlation for OContact is: 0.8096
## The year-to-year correlation for ZContact is: 0.8055
## The year-to-year correlation for Contact is: 0.8657
## The year-to-year correlation for Zone is: 0.7341
## The year-to-year correlation for FStrike is: 0.4482
## The year-to-year correlation for SwStr is: 0.8592
## The year-to-year correlation for G is: 0.4428
## The year-to-year correlation for AB is: 0.5821
## The year-to-year correlation for H is: 0.6088
## The year-to-year correlation for Singles is: 0.6073
## The year-to-year correlation for Doubles is: 0.5382
## The year-to-year correlation for Triples is: 0.5068
## The year-to-year correlation for HR is: 0.6775
## The year-to-year correlation for R is: 0.6266
## The year-to-year correlation for RBI is: 0.6403
## The year-to-year correlation for BB is: 0.6852
## The year-to-year correlation for IBB is: 0.6509
## The year-to-year correlation for SO is: 0.6379
## The year-to-year correlation for HBP is: 0.5522
## The year-to-year correlation for SF is: 0.3251
## The year-to-year correlation for SH is: 0.6132
## The year-to-year correlation for GDP is: 0.5267
## The year-to-year correlation for SB is: 0.7562
## The year-to-year correlation for CS is: 0.5961
## The year-to-year correlation for BBpercent is: 0.6663
## The year-to-year correlation for Kpercent is: 0.8188
## The year-to-year correlation for BBperK is: 0.6575
## The year-to-year correlation for OBP is: 0.4784
## The year-to-year correlation for SLG is: 0.5034
## The year-to-year correlation for OPS is: 0.4909
## The year-to-year correlation for ISO is: 0.6265
## The year-to-year correlation for Spd is: 0.7322
## The year-to-year correlation for BABIP is: 0.2759
## The year-to-year correlation for UBR is: 0.4773
## The year-to-year correlation for wSB is: 0.4677
## The year-to-year correlation for wRC is: 0.6373
## The year-to-year correlation for wRAA is: 0.5503
## The year-to-year correlation for wOBA is: 0.4747
## The year-to-year correlation for wRCadd is: 0.4544
## The year-to-year correlation for PowFac is: 0.6678
Here's some stuff I added just now for you Jake.
par(mfrow = c(1, 2))
HRregress <- lm(HR ~ PowFac + SB + HBP, data = fg)
plot(fg$PowFac, fg$HR, pch = 19, col = "blue", main = "Basic Scatterplot", xlab = "Power Factor",
ylab = "HRs", cex.main = 1.5, cex.lab = 1.3)
plot(fg$PowFac, HRregress$residuals, main = "Residuals of the Regression of HR \n on Power Factor, Stolen Bases, and HBP",
ylab = "Residuals", xlab = "Power Factor", cex.lab = 1.3, pch = 19, col = "dark red")
summary(HRregress)
##
## Call:
## lm(formula = HR ~ PowFac + SB + HBP, data = fg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.818 -3.646 -0.306 3.298 28.834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -38.8337 0.7705 -50.4 <2e-16 ***
## PowFac 29.4040 0.4874 60.3 <2e-16 ***
## SB 0.1490 0.0113 13.2 <2e-16 ***
## HBP 0.6769 0.0304 22.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.14 on 3550 degrees of freedom
## Multiple R-squared: 0.588, Adjusted R-squared: 0.587
## F-statistic: 1.69e+03 on 3 and 3550 DF, p-value: <2e-16
Player ID 1744 is Miguel Cabrera. Lets try using the league-wide distribution of Power Factor to be the prior and use his season data to get a posterior distribution for his “true” power factor for each season.
First let's figure out a good approximation to the prior distribution of power factor:
hist(fg$PowFac, breaks = 20, freq = FALSE, main = "Distribution of Power Factor",
cex.main = 1.3, ylab = "", xlab = "Power Factor", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$PowFac), max(fg$PowFac), length = 100), mean = mean(fg$PowFac),
sd = sd(fg$PowFac))
points(seq(min(fg$PowFac), max(fg$PowFac), length = 100), normFit, type = "l",
col = "red")
As we can see, the distribution of power factor for qualifying MLB players between 2006 and 2013 can be reasonably appoximated by a Normal distribution with a mean of 1.5709 and a standard deviation of 0.2192. The player with the maximum power factor was Jose Bautista in 2010
So now power factor is basically the average number of bases per hit a player achieved. As a random variable who is the mean of a collection of random variables (the value for the number of bases for each hit), it can be treated as normal with a standard devation of \( \frac{ \sigma}{\sqrt{n}} \) where \( \sigma \) here will be the prior standard deviation and \( n \) is the number of hits in the season. Strictly this is not the proper treatment for \( \sigma \) but will be adequate for the first look.
Because we have a normal prior and normal likelihood (a conjugate pair), we can do all the calculations very straightforwardly. So now we can get our posterior for Cabrera for each season as:
player <- fg[fg$playerid == 1744, ]
priorMean <- mean(fg$PowFac)
priorSD <- sd(fg$PowFac)
postMean <- rep(0, length(player$Season))
postSD <- rep(0, length(player$Season))
min(fg$PowFac)
## [1] 1
cols <- rgb(1:9/9, 1:9/18, 1 - 1:9/9, 0.35)
# Plot the prior distribution
datRange <- seq(min(fg$PowFac), max(fg$PowFac), length = 200)
plot(datRange, dnorm(datRange, priorMean, priorSD)/max(dnorm(datRange, priorMean,
priorSD)), type = "l", col = cols[1], lwd = 3, xlab = "Power Factor", ylab = "",
main = "Prior and Yearly Posterior Distributions for \n Miguel Cabrera's Power Factor",
cex.main = 1.3, cex.lab = 1.5, ylim = c(0, 1.1))
for (ind in 1:length(player$Season)) {
yearMean <- player$PowFac[ind]
yearSD <- priorSD/sqrt(player$H[ind])
postMean[ind] <- (priorMean/priorSD^2 + yearMean/yearSD^2)/(1/priorSD^2 +
1/yearSD^2)
postSD[ind] <- sqrt(1/(1/priorSD^2 + 1/yearSD^2))
polygon(datRange, dnorm(datRange, postMean[ind], postSD[ind])/max(dnorm(datRange,
postMean[ind], postSD[ind])), col = cols[ind + 1])
cat("The posterior for Miguel Cabrera's Power Factor in ", player$Season[ind],
" has a mean of ", postMean[ind], "with a standard deviation of", postSD[ind],
"\n")
}
## The posterior for Miguel Cabrera's Power Factor in 2006 has a mean of 1.675 with a standard deviation of 0.01566
## The posterior for Miguel Cabrera's Power Factor in 2007 has a mean of 1.765 with a standard deviation of 0.01594
## The posterior for Miguel Cabrera's Power Factor in 2008 has a mean of 1.838 with a standard deviation of 0.01629
## The posterior for Miguel Cabrera's Power Factor in 2009 has a mean of 1.688 with a standard deviation of 0.01554
## The posterior for Miguel Cabrera's Power Factor in 2010 has a mean of 1.895 with a standard deviation of 0.01629
## The posterior for Miguel Cabrera's Power Factor in 2011 has a mean of 1.703 with a standard deviation of 0.01558
## The posterior for Miguel Cabrera's Power Factor in 2012 has a mean of 1.835 with a standard deviation of 0.01527
## The posterior for Miguel Cabrera's Power Factor in 2013 has a mean of 1.826 with a standard deviation of 0.01574
legend("topleft", c("Prior", player$Season), col = cols, lty = 1, lwd = 3)
plot(player$Season, postMean, pch = 19, cex = 1.4, main = "Posterior Power Factor vs Season For Miguel Cabrera",
xlab = "Season", ylab = "Power Factor", cex.lab = 1.4, ylim = c(min(fg$PowFac),
max(fg$PowFac)))
arrows(player$Season, postMean - 2 * postSD, player$Season, postMean + 2 * postSD,
angle = 90, code = 3)
This analysis assumes that Cabrera's power factor is constant throughout each year. We see that Cabrera's power factor is not terribly consistent across years. Most notably, there were two seasons in which Cabrera's power factor was well below what we would expect from the adjacent years.
Let's take a look at a few other players:
for (pName in c("Hanley Ramirez", "Grady Sizemore", "Giancarlo Stanton", "Jay Bruce",
"Jose Bautista", "Elvis Andrus")) {
player <- fg[fg$Name == pName, ]
playerName = unique(as.character(player$Name))
priorMean <- mean(fg$PowFac)
priorSD <- sd(fg$PowFac)
postMean <- rep(0, length(player$Season))
postSD <- rep(0, length(player$Season))
min(fg$PowFac)
cols <- rgb(1:9/9, 1:9/18, 1 - 1:9/9, 0.35)
# Plot the prior distribution
datRange <- seq(min(fg$PowFac) - 0.25, max(fg$PowFac) + 0.25, length = 200)
plot(datRange, dnorm(datRange, priorMean, priorSD)/max(dnorm(datRange, priorMean,
priorSD)), type = "l", col = cols[1], lwd = 3, xlab = "Power Factor",
ylab = "", main = paste("Prior and Yearly Posterior Distributions for \n ",
playerName, "'s Power Factor"), cex.main = 1.3, cex.lab = 1.5, ylim = c(0,
1.1))
for (ind in 1:length(player$Season)) {
yearMean <- player$PowFac[ind]
yearSD <- priorSD/sqrt(player$H[ind])
postMean[ind] <- (priorMean/priorSD^2 + yearMean/yearSD^2)/(1/priorSD^2 +
1/yearSD^2)
postSD[ind] <- sqrt(1/(1/priorSD^2 + 1/yearSD^2))
polygon(datRange, dnorm(datRange, postMean[ind], postSD[ind])/max(dnorm(datRange,
postMean[ind], postSD[ind])), col = cols[ind + 1])
cat("The posterior for", playerName, "'s Power Factor in ", player$Season[ind],
" has a mean of ", postMean[ind], "with a standard deviation of",
postSD[ind], "\n")
}
legend("topleft", c("Prior", player$Season), col = cols, lty = 1, lwd = 3)
plot(player$Season, postMean, pch = 19, cex = 1.4, main = paste("Posterior Power Factor vs Season For",
playerName), xlab = "Season", ylab = "Power Factor", cex.lab = 1.4,
ylim = c(min(fg$PowFac), max(fg$PowFac)))
arrows(player$Season, postMean - 2 * postSD, player$Season, postMean + 2 *
postSD, angle = 90, code = 3)
}
## The posterior for Hanley Ramirez 's Power Factor in 2006 has a mean of 1.643 with a standard deviation of 0.01607
## The posterior for Hanley Ramirez 's Power Factor in 2007 has a mean of 1.692 with a standard deviation of 0.01502
## The posterior for Hanley Ramirez 's Power Factor in 2008 has a mean of 1.793 with a standard deviation of 0.01643
## The posterior for Hanley Ramirez 's Power Factor in 2009 has a mean of 1.588 with a standard deviation of 0.01558
## The posterior for Hanley Ramirez 's Power Factor in 2010 has a mean of 1.583 with a standard deviation of 0.01712
## The posterior for Hanley Ramirez 's Power Factor in 2011 has a mean of 1.56 with a standard deviation of 0.02406
## The posterior for Hanley Ramirez 's Power Factor in 2012 has a mean of 1.7 with a standard deviation of 0.01755
## The posterior for Hanley Ramirez 's Power Factor in 2013 has a mean of 1.847 with a standard deviation of 0.02129
## The posterior for Grady Sizemore 's Power Factor in 2006 has a mean of 1.837 with a standard deviation of 0.01586
## The posterior for Grady Sizemore 's Power Factor in 2007 has a mean of 1.667 with a standard deviation of 0.01657
## The posterior for Grady Sizemore 's Power Factor in 2008 has a mean of 1.871 with a standard deviation of 0.01676
## The posterior for Grady Sizemore 's Power Factor in 2009 has a mean of 1.792 with a standard deviation of 0.021
## The posterior for Grady Sizemore 's Power Factor in 2010 has a mean of 1.377 with a standard deviation of 0.04143
## The posterior for Grady Sizemore 's Power Factor in 2011 has a mean of 1.879 with a standard deviation of 0.02807
## The posterior for Giancarlo Stanton 's Power Factor in 2010 has a mean of 1.953 with a standard deviation of 0.02261
## The posterior for Giancarlo Stanton 's Power Factor in 2011 has a mean of 2.046 with a standard deviation of 0.0188
## The posterior for Giancarlo Stanton 's Power Factor in 2012 has a mean of 2.093 with a standard deviation of 0.01915
## The posterior for Giancarlo Stanton 's Power Factor in 2013 has a mean of 1.924 with a standard deviation of 0.02119
## The posterior for Jay Bruce 's Power Factor in 2008 has a mean of 1.781 with a standard deviation of 0.02129
## The posterior for Jay Bruce 's Power Factor in 2009 has a mean of 2.101 with a standard deviation of 0.02482
## The posterior for Jay Bruce 's Power Factor in 2010 has a mean of 1.753 with a standard deviation of 0.01827
## The posterior for Jay Bruce 's Power Factor in 2011 has a mean of 1.85 with a standard deviation of 0.01784
## The posterior for Jay Bruce 's Power Factor in 2012 has a mean of 2.036 with a standard deviation of 0.0184
## The posterior for Jay Bruce 's Power Factor in 2013 has a mean of 1.823 with a standard deviation of 0.01707
## The posterior for Jose Bautista 's Power Factor in 2006 has a mean of 1.785 with a standard deviation of 0.02249
## The posterior for Jose Bautista 's Power Factor in 2007 has a mean of 1.629 with a standard deviation of 0.0188
## The posterior for Jose Bautista 's Power Factor in 2008 has a mean of 1.7 with a standard deviation of 0.02324
## The posterior for Jose Bautista 's Power Factor in 2009 has a mean of 1.734 with a standard deviation of 0.02451
## The posterior for Jose Bautista 's Power Factor in 2010 has a mean of 2.368 with a standard deviation of 0.01796
## The posterior for Jose Bautista 's Power Factor in 2011 has a mean of 2.01 with a standard deviation of 0.01755
## The posterior for Jose Bautista 's Power Factor in 2012 has a mean of 2.179 with a standard deviation of 0.02436
## The posterior for Jose Bautista 's Power Factor in 2013 has a mean of 1.92 with a standard deviation of 0.02018
## The posterior for Elvis Andrus 's Power Factor in 2009 has a mean of 1.398 with a standard deviation of 0.0193
## The posterior for Elvis Andrus 's Power Factor in 2010 has a mean of 1.139 with a standard deviation of 0.01749
## The posterior for Elvis Andrus 's Power Factor in 2011 has a mean of 1.296 with a standard deviation of 0.01707
## The posterior for Elvis Andrus 's Power Factor in 2012 has a mean of 1.323 with a standard deviation of 0.01629
## The posterior for Elvis Andrus 's Power Factor in 2013 has a mean of 1.223 with a standard deviation of 0.01686
One approach to projecting players performance is to use straightforward, unadjusted outcomes and rates on these. We can think of a players at-bat being a sequence of stochastic outcomes. The result of a plate appearance can be anything in the set: {K, BB, HBP, Ball in play}, which can be modeled as a multinomial random variable. Conditional on the result being a ball in play, the outcome can then be anything from another set: {single, double, triple, HR, out, sacrifice}, which can again be treated as a multinomial. The parametersfor these multinomials can be estimated from the players historical data with appropriate uncertainty applied. Once we have estimates for these quantities, we could do a Monte Carlo study to build a projection for a player given a number of plate appearances.
Lets try estimating these parameters for Miguel Cabrera from his 2013 numbers and then simulate some seasons using those estimated parameters as the true parameters:
par(mfrow = c(1, 3))
for (pName in c("Miguel Cabrera")) {
player <- fg[fg$Name == pName & fg$Season == 2013, ]
playerName = unique(as.character(player$Name))
Krate <- player$SO/player$PA
BBrate <- player$BB/player$PA
HBPrate <- player$HBP/player$PA
BIPrate <- 1 - Krate - BBrate - HBPrate
BIPs <- BIPrate * player$PA
singleRate <- player$Singles/BIPs
doubleRate <- player$Doubles/BIPs
tripleRate <- player$Triples/BIPs
hrRate <- player$HR/BIPs
sacRate <- (player$SF + player$SH)/BIPs
outRate <- 1 - singleRate - doubleRate - tripleRate - hrRate - sacRate
simSeasons <- data.frame()
for (i in 1:5000) {
simData <- t(rmultinom(1, size = player$PA, c(Krate, BBrate, HBPrate,
BIPrate)))
simData2 <- t(rmultinom(1, size = simData[4], c(singleRate, doubleRate,
tripleRate, hrRate, sacRate, outRate)))
simSeasons <- rbind(simSeasons, cbind(simData, simData2))
}
names(simSeasons) <- c("K", "BB", "HBP", "BIP", "Singles", "Doubles", "Triples",
"HR", "Sac", "Outs")
print(head(simSeasons))
hist(simSeasons$HR, breaks = 20, xlab = "HRs", ylab = "", main = paste("Simulated Number of Home Runs for \n Miguel Cabrera \n using rates from the 2013 season"),
cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
hist((simSeasons$Singles + simSeasons$Doubles + simSeasons$Triples)/(simSeasons$BIP -
simSeasons$HR), breaks = 20, xlab = "BABIP", ylab = "", main = paste("Simulated BABIP for Miguel Cabrera \n using rates from the 2013 season"),
cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
hist((4 * simSeasons$HR + 3 * simSeasons$Triples + 2 * simSeasons$Doubles +
simSeasons$Singles)/(simSeasons$HR + simSeasons$Triples + simSeasons$Doubles +
simSeasons$Singles), breaks = 20, xlab = "Power Factor", ylab = "",
main = paste("Simulated Power Factor for \n Miguel Cabrera \n using rates from the 2013 season"),
cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
}
## K BB HBP BIP Singles Doubles Triples HR Sac Outs
## 1 91 94 4 463 129 24 0 45 4 261
## 2 68 97 4 483 137 33 3 46 0 264
## 3 106 97 5 444 110 25 0 44 1 264
## 4 113 99 5 435 132 26 0 39 2 236
## 5 97 104 5 446 107 30 0 46 2 261
## 6 99 92 5 456 119 23 1 38 0 275