Setting up the basic framework for the projection model. First, we'll source a file that reads in the data and does some basic data cleaning and management, as well as load relavent libraries.
library(boot)
source("readfgData.R")
There are 1668 players in the dataset.
Here is a quick look at the distribution of some variables and their corresponding normal approximations, FStrike, the percentage of first strikes seen by the batter, SwStr, the batter's swinging strike rate, and ZContact, the batter's contact percentage within the strikezone.
par(mfrow = c(1, 3))
hist(fg$FStrike, breaks = 20, freq = FALSE, main = "Distribution of the Percentage of \n First Strikes Seen",
cex.main = 1.3, ylab = "", xlab = "First Strike Percentage", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$FStrike), max(fg$FStrike), length = 100), mean = mean(fg$FStrike),
sd = sd(fg$FStrike))
## Error: missing value where TRUE/FALSE needed
points(seq(min(fg$FStrike), max(fg$FStrike), length = 100), normFit, type = "l",
col = "red")
## Error: missing value where TRUE/FALSE needed
hist(fg$SwStr, breaks = 20, freq = FALSE, main = "Distribution of Batter's Swinging Strike Rates",
cex.main = 1.3, ylab = "", xlab = "Swinging Strike Percentage", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$SwStr), max(fg$SwStr), length = 100), mean = mean(fg$SwStr),
sd = sd(fg$SwStr))
## Error: missing value where TRUE/FALSE needed
points(seq(min(fg$SwStr), max(fg$SwStr), length = 100), normFit, type = "l",
col = "red")
## Error: missing value where TRUE/FALSE needed
hist(fg$ZContact, breaks = 20, freq = FALSE, main = "Distribution of Batter's Contact Rate \n on Pitches Within The Stike Zone",
cex.main = 1.3, ylab = "", xlab = "In-Zone Contact Rate", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$ZContact), max(fg$ZContact), length = 100), mean = mean(fg$ZContact),
sd = sd(fg$ZContact))
## Error: missing value where TRUE/FALSE needed
points(seq(min(fg$ZContact), max(fg$ZContact), length = 100), normFit, type = "l",
col = "red")
## Error: missing value where TRUE/FALSE needed
fg[fg$playerid == min(fg$playerid), ]
## Season Name Team G AB PA H Singles Doubles
## 5897 2007 Alfredo Amezaga Marlins 133 400 448 105 80 14
## 6157 2008 Alfredo Amezaga Marlins 125 311 337 82 61 13
## 6488 2006 Alfredo Amezaga Marlins 132 334 378 87 72 9
## 7655 2003 Alfredo Amezaga Angels 37 105 120 22 15 3
## 8610 2004 Alfredo Amezaga Angels 73 93 105 15 11 2
## Triples HR R RBI BB IBB SO HBP SF SH GDP SB CS AVG OBP SLG
## 5897 9 2 46 30 35 0 52 4 5 4 4 13 7 0.263 0.324 0.358
## 6157 5 3 41 32 19 1 47 3 0 4 6 8 2 0.264 0.312 0.367
## 6488 3 3 42 19 33 4 46 3 1 7 5 20 12 0.260 0.332 0.332
## 7655 2 2 15 7 9 0 23 1 0 5 2 2 2 0.210 0.278 0.333
## 8610 0 2 12 11 3 0 24 3 0 6 2 3 2 0.161 0.212 0.247
## ISO BABIP GB.FB LD. GB. FB. IFFB. HR.FB IFH. BUH.
## 5897 0.095 0.293 1.42 20.6 % 46.6 % 32.7 % 11.7 % 1.8 % 7.0 % 38.9 %
## 6157 0.103 0.303 1.38 19.5 % 46.6 % 33.9 % 12.9 % 3.5 % 6.0 % 37.5 %
## 6488 0.072 0.294 1.55 16.8 % 50.5 % 32.6 % 14.3 % 3.3 % 11.3 % 23.5 %
## 7655 0.124 0.250 1.22 23.1 % 42.3 % 34.6 % 14.8 % 7.4 % 3.0 % 11.1 %
## 8610 0.086 0.194 2.38 8.5 % 64.4 % 27.1 % 25.0 % 12.5 % 5.3 % 18.8 %
## SL. CB. CH. OSwing ZSwing Swing OContact ZContact Contact
## 5897 7.1 % 6.6 % 13.0 % 0.237 0.640 0.447 0.714 0.938 0.881
## 6157 9.2 % 6.6 % 12.0 % 0.250 0.643 0.461 0.667 0.918 0.855
## 6488 10.4 % 6.1 % 9.4 % 0.223 0.628 0.449 0.701 0.885 0.845
## 7655 3.2 % 7.3 % 16.1 % 0.309 0.616 0.472 0.609 0.895 0.807
## 8610 7.6 % 6.1 % 12.1 % 0.197 0.688 0.497 0.292 0.871 0.782
## Zone FStrike SwStr Off Lg Age playerid PowFac
## 5897 0.522 0.565 0.052 -13.5 1.1 29 1 1.361
## 6157 0.536 0.599 0.067 -6.5 0.9 30 1 1.390
## 6488 0.559 0.579 0.067 -10.0 1.2 28 1 1.277
## 7655 0.529 0.692 0.093 -4.5 0.5 25 1 1.586
## 8610 0.612 0.581 0.108 -10.6 0.3 26 1 1.534
A relavent thing to look at it how stable some of the data is from year to year. Let's look at the year-to-year correlation for data throughout this dataset.
for (var in 5:length(names(fg))) {
fgyear <- fg[fg$Season == 2006, ]
varPair <- c()
for (year in 2007:2013) {
fglyear <- fgyear
fgyear <- fg[fg$Season == year, ]
for (pid in intersect(fgyear$playerid, fglyear$playerid)) {
varPair <- rbind(varPair, c(fglyear[fglyear$playerid == pid, var],
fgyear[fgyear$playerid == pid, var]))
}
}
cat("The year-to-year correlation for ", names(fg)[var], "is: ", cor(varPair[,
2], varPair[, 1]), "\n")
}
## The year-to-year correlation for AB is: 0.5817
## The year-to-year correlation for PA is: 0.5877
## The year-to-year correlation for H is: 0.6085
## The year-to-year correlation for Singles is: 0.6071
## The year-to-year correlation for Doubles is: 0.538
## The year-to-year correlation for Triples is: 0.5067
## The year-to-year correlation for HR is: 0.6774
## The year-to-year correlation for R is: 0.6262
## The year-to-year correlation for RBI is: 0.6402
## The year-to-year correlation for BB is: 0.6851
## The year-to-year correlation for IBB is: 0.6509
## The year-to-year correlation for SO is: 0.6378
## The year-to-year correlation for HBP is: 0.552
## The year-to-year correlation for SF is: 0.325
## The year-to-year correlation for SH is: 0.6132
## The year-to-year correlation for GDP is: 0.527
## The year-to-year correlation for SB is: 0.7562
## The year-to-year correlation for CS is: 0.596
## The year-to-year correlation for AVG is: 0.3946
## The year-to-year correlation for OBP is: 0.4784
## The year-to-year correlation for SLG is: 0.5032
## The year-to-year correlation for ISO is: 0.6263
## The year-to-year correlation for BABIP is: 0.2766
## The year-to-year correlation for GB.FB is: 0.7588
## The year-to-year correlation for LD. is: 0.2365
## The year-to-year correlation for GB. is: 0.7219
## The year-to-year correlation for FB. is: 0.7204
## The year-to-year correlation for IFFB. is: 0.1607
## The year-to-year correlation for HR.FB is: 0.1827
## The year-to-year correlation for IFH. is: 0.08121
## The year-to-year correlation for BUH. is: 0.3537
## The year-to-year correlation for SL. is: 0.3665
## The year-to-year correlation for CB. is: 0.2751
## The year-to-year correlation for CH. is: 0.3535
## The year-to-year correlation for OSwing is: 0.8024
## The year-to-year correlation for ZSwing is: 0.7922
## The year-to-year correlation for Swing is: 0.8026
## The year-to-year correlation for OContact is: 0.8098
## The year-to-year correlation for ZContact is: 0.8052
## The year-to-year correlation for Contact is: 0.8657
## The year-to-year correlation for Zone is: 0.734
## The year-to-year correlation for FStrike is: 0.4485
## The year-to-year correlation for SwStr is: 0.8592
## The year-to-year correlation for Off is: 0.5197
## The year-to-year correlation for Lg is: 0.5398
## The year-to-year correlation for Age is: 0.9999
## The year-to-year correlation for playerid is: 1
## The year-to-year correlation for PowFac is: 0.6676
Here's some stuff I added just now for you Jake.
par(mfrow = c(1, 2))
HRregress <- lm(HR ~ PowFac + SB + HBP, data = fg)
plot(fg$PowFac, fg$HR, pch = 19, col = "blue", main = "Basic Scatterplot", xlab = "Power Factor",
ylab = "HRs", cex.main = 1.5, cex.lab = 1.3)
plot(fg$PowFac, HRregress$residuals, main = "Residuals of the Regression of HR \n on Power Factor, Stolen Bases, and HBP",
ylab = "Residuals", xlab = "Power Factor", cex.lab = 1.3, pch = 19, col = "dark red")
summary(HRregress)
##
## Call:
## lm(formula = HR ~ PowFac + SB + HBP, data = fg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.74 -3.79 -0.31 3.30 38.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -41.98204 0.50722 -82.8 <2e-16 ***
## PowFac 31.60702 0.31981 98.8 <2e-16 ***
## SB 0.16223 0.00756 21.4 <2e-16 ***
## HBP 0.63970 0.01966 32.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.43 on 8656 degrees of freedom
## Multiple R-squared: 0.601, Adjusted R-squared: 0.601
## F-statistic: 4.34e+03 on 3 and 8656 DF, p-value: <2e-16
Player ID 1744 is Miguel Cabrera. Lets try using the league-wide distribution of Power Factor to be the prior and use his season data to get a posterior distribution for his “true” power factor for each season.
First let's figure out a good approximation to the prior distribution of power factor:
hist(fg$PowFac, breaks = 20, freq = FALSE, main = "Distribution of Power Factor",
cex.main = 1.3, ylab = "", xlab = "Power Factor", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$PowFac), max(fg$PowFac), length = 100), mean = mean(fg$PowFac),
sd = sd(fg$PowFac))
points(seq(min(fg$PowFac), max(fg$PowFac), length = 100), normFit, type = "l",
col = "red")
As we can see, the distribution of power factor for qualifying MLB players between 2006 and 2013 can be reasonably appoximated by a Normal distribution with a mean of 1.5736 and a standard deviation of 0.2231. The player with the maximum power factor was Frank Thomas in 2005
So now power factor is the ratio between a weighted sum of multinomial random variables and an unweighted sum of those variables, but when \( n \gg 1 \) it is asymptotically normal with a standard devation that we can solve for analytically using the delta method. The code below uses the function powfacSD to calculate the plug-in estimate for this standard deviation.
Because we have a normal prior and normal likelihood (a conjugate pair), we can do all the calculations very straightforwardly. So now we can get our posterior for Cabrera for each season as:
powfacSD <- function(player) {
# Calculating the estimated sqrt(variance) of the players power factor via
# the delta method
Krate <- player$SO/player$PA
BBrate <- player$BB/player$PA
HBPrate <- player$HBP/player$PA
BIPrate <- 1 - Krate - BBrate - HBPrate
BIPs <- BIPrate * player$PA
singleRate <- player$Singles/BIPs
doubleRate <- player$Doubles/BIPs
tripleRate <- player$Triples/BIPs
hrRate <- player$HR/BIPs
num <- 4 * hrRate + 3 * tripleRate + 2 * doubleRate + singleRate
denom <- hrRate + tripleRate + doubleRate + singleRate
deriv <- matrix(c((4 * denom - num)/denom^2, (3 * denom - num)/denom^2,
(2 * denom - num)/denom^2, (denom - num)/denom^2), nrow = 4, ncol = 1)
covar <- matrix(c(hrRate * (1 - hrRate), -hrRate * tripleRate, -hrRate *
doubleRate, -hrRate * singleRate, -hrRate * tripleRate, tripleRate *
(1 - tripleRate), -tripleRate * doubleRate, -tripleRate * singleRate,
-doubleRate * hrRate, -tripleRate * doubleRate, doubleRate * (1 - doubleRate),
-doubleRate * singleRate, -singleRate * hrRate, -singleRate * tripleRate,
-singleRate * doubleRate, singleRate * (1 - singleRate)), nrow = 4,
ncol = 4)
return(sqrt(1/BIPs * t(deriv) %*% covar %*% deriv))
}
player <- fg[fg$playerid == 1744, ]
priorMean <- mean(fg$PowFac)
priorSD <- sd(fg$PowFac)
postMean <- rep(0, length(player$Season))
postSD <- rep(0, length(player$Season))
min(fg$PowFac)
## [1] 1
cols <- rgb(1:9/9, 1:9/18, 1 - 1:9/9, 0.35)
# Plot the prior distribution
datRange <- seq(min(fg$PowFac), max(fg$PowFac), length = 200)
plot(datRange, dnorm(datRange, priorMean, priorSD)/max(dnorm(datRange, priorMean,
priorSD)), type = "l", col = cols[1], lwd = 3, xlab = "Power Factor", ylab = "",
main = "Prior and Yearly Posterior Distributions for \n Miguel Cabrera's Power Factor",
cex.main = 1.3, cex.lab = 1.5, ylim = c(0, 1.1))
for (ind in 1:length(player$Season)) {
yearMean <- player$PowFac[ind]
yearSD <- powfacSD(player[ind, ])
# yearSD <- priorSD/sqrt(player$H[ind])
postMean[ind] <- (priorMean/priorSD^2 + yearMean/yearSD^2)/(1/priorSD^2 +
1/yearSD^2)
postSD[ind] <- sqrt(1/(1/priorSD^2 + 1/yearSD^2))
polygon(datRange, dnorm(datRange, postMean[ind], postSD[ind])/max(dnorm(datRange,
postMean[ind], postSD[ind])), col = cols[ind + 1])
cat("The posterior for Miguel Cabrera's Power Factor in ", player$Season[ind],
" has a mean of ", postMean[ind], "with a standard deviation of", postSD[ind],
"\n")
}
## The posterior for Miguel Cabrera's Power Factor in 2013 has a mean of 1.793 with a standard deviation of 0.08246
## The posterior for Miguel Cabrera's Power Factor in 2011 has a mean of 1.69 with a standard deviation of 0.07153
## The posterior for Miguel Cabrera's Power Factor in 2010 has a mean of 1.853 with a standard deviation of 0.08131
## The posterior for Miguel Cabrera's Power Factor in 2012 has a mean of 1.804 with a standard deviation of 0.07805
## The posterior for Miguel Cabrera's Power Factor in 2006 has a mean of 1.666 with a standard deviation of 0.06938
## The posterior for Miguel Cabrera's Power Factor in 2007 has a mean of 1.742 with a standard deviation of 0.07754
## The posterior for Miguel Cabrera's Power Factor in 2009 has a mean of 1.675 with a standard deviation of 0.07473
## The posterior for Miguel Cabrera's Power Factor in 2005 has a mean of 1.719 with a standard deviation of 0.07388
## The posterior for Miguel Cabrera's Power Factor in 2008 has a mean of 1.803 with a standard deviation of 0.08178
## The posterior for Miguel Cabrera's Power Factor in 2004 has a mean of 1.72 with a standard deviation of 0.08052
## The posterior for Miguel Cabrera's Power Factor in 2003 has a mean of 1.71 with a standard deviation of 0.1024
legend("topleft", c("Prior", player$Season), col = cols, lty = 1, lwd = 3)
plot(player$Season, postMean, pch = 19, cex = 1.4, main = "Posterior Power Factor vs Season For Miguel Cabrera",
xlab = "Season", ylab = "Power Factor", cex.lab = 1.4, ylim = c(min(fg$PowFac),
max(fg$PowFac)))
arrows(player$Season, postMean - 2 * postSD, player$Season, postMean + 2 * postSD,
angle = 90, code = 3)
This analysis assumes that Cabrera's power factor is constant throughout each year. We see that Cabrera's power factor is not terribly consistent across years. Most notably, there were two seasons in which Cabrera's power factor was well below what we would expect from the adjacent years.
Let's take a look at a few other players:
for (pName in c("Hanley Ramirez", "Grady Sizemore", "Giancarlo Stanton", "Jay Bruce",
"Jose Bautista", "Elvis Andrus", "Chris Carter", "Starlin Castro", "Bobby Higginson",
"Juan Gonzalez")) {
player <- fg[fg$Name == pName, ]
playerName = unique(as.character(player$Name))
priorMean <- mean(fg$PowFac)
priorSD <- sd(fg$PowFac)
postMean <- rep(0, length(player$Season))
postSD <- rep(0, length(player$Season))
min(fg$PowFac)
cols <- rgb(1:9/9, 1:9/18, 1 - 1:9/9, 0.35)
# Plot the prior distribution
datRange <- seq(min(fg$PowFac) - 0.25, max(fg$PowFac) + 0.25, length = 200)
plot(datRange, dnorm(datRange, priorMean, priorSD)/max(dnorm(datRange, priorMean,
priorSD)), type = "l", col = cols[1], lwd = 3, xlab = "Power Factor",
ylab = "", main = paste("Prior and Yearly Posterior Distributions for \n ",
playerName, "'s Power Factor"), cex.main = 1.3, cex.lab = 1.5, ylim = c(0,
1.1))
for (ind in 1:length(player$Season)) {
yearMean <- player$PowFac[ind]
yearSD <- powfacSD(player[ind, ])
# yearSD <- priorSD/sqrt(player$H[ind])
postMean[ind] <- (priorMean/priorSD^2 + yearMean/yearSD^2)/(1/priorSD^2 +
1/yearSD^2)
postSD[ind] <- sqrt(1/(1/priorSD^2 + 1/yearSD^2))
polygon(datRange, dnorm(datRange, postMean[ind], postSD[ind])/max(dnorm(datRange,
postMean[ind], postSD[ind])), col = cols[ind + 1])
cat("The posterior for", playerName, "'s Power Factor in ", player$Season[ind],
" has a mean of ", postMean[ind], "with a standard deviation of",
postSD[ind], "\n")
}
legend("topleft", c("Prior", player$Season), col = cols, lty = 1, lwd = 3)
plot(player$Season, postMean, pch = 19, cex = 1.4, main = paste("Posterior Power Factor vs Season For",
playerName), xlab = "Season", ylab = "Power Factor", cex.lab = 1.4,
ylim = c(min(fg$PowFac), max(fg$PowFac)))
arrows(player$Season, postMean - 2 * postSD, player$Season, postMean + 2 *
postSD, angle = 90, code = 3)
}
## The posterior for Hanley Ramirez 's Power Factor in 2013 has a mean of 1.794 with a standard deviation of 0.09988
## The posterior for Hanley Ramirez 's Power Factor in 2009 has a mean of 1.586 with a standard deviation of 0.06735
## The posterior for Hanley Ramirez 's Power Factor in 2007 has a mean of 1.682 with a standard deviation of 0.06826
## The posterior for Hanley Ramirez 's Power Factor in 2008 has a mean of 1.765 with a standard deviation of 0.08069
## The posterior for Hanley Ramirez 's Power Factor in 2010 has a mean of 1.582 with a standard deviation of 0.07522
## The posterior for Hanley Ramirez 's Power Factor in 2006 has a mean of 1.638 with a standard deviation of 0.06656
## The posterior for Hanley Ramirez 's Power Factor in 2012 has a mean of 1.683 with a standard deviation of 0.08154
## The posterior for Hanley Ramirez 's Power Factor in 2011 has a mean of 1.562 with a standard deviation of 0.09809
## The posterior for Grady Sizemore 's Power Factor in 2006 has a mean of 1.81 with a standard deviation of 0.07307
## The posterior for Grady Sizemore 's Power Factor in 2008 has a mean of 1.832 with a standard deviation of 0.08239
## The posterior for Grady Sizemore 's Power Factor in 2007 has a mean of 1.657 with a standard deviation of 0.07511
## The posterior for Grady Sizemore 's Power Factor in 2005 has a mean of 1.664 with a standard deviation of 0.07142
## The posterior for Grady Sizemore 's Power Factor in 2009 has a mean of 1.752 with a standard deviation of 0.09763
## The posterior for Grady Sizemore 's Power Factor in 2004 has a mean of 1.621 with a standard deviation of 0.1381
## The posterior for Grady Sizemore 's Power Factor in 2011 has a mean of 1.798 with a standard deviation of 0.1171
## The posterior for Grady Sizemore 's Power Factor in 2010 has a mean of 1.415 with a standard deviation of 0.1049
## The posterior for Giancarlo Stanton 's Power Factor in 2012 has a mean of 1.992 with a standard deviation of 0.09966
## The posterior for Giancarlo Stanton 's Power Factor in 2011 has a mean of 1.961 with a standard deviation of 0.09609
## The posterior for Giancarlo Stanton 's Power Factor in 2013 has a mean of 1.852 with a standard deviation of 0.103
## The posterior for Giancarlo Stanton 's Power Factor in 2010 has a mean of 1.864 with a standard deviation of 0.1099
## The posterior for Jay Bruce 's Power Factor in 2010 has a mean of 1.726 with a standard deviation of 0.08778
## The posterior for Jay Bruce 's Power Factor in 2012 has a mean of 1.959 with a standard deviation of 0.09293
## The posterior for Jay Bruce 's Power Factor in 2011 has a mean of 1.807 with a standard deviation of 0.08962
## The posterior for Jay Bruce 's Power Factor in 2013 has a mean of 1.791 with a standard deviation of 0.08145
## The posterior for Jay Bruce 's Power Factor in 2008 has a mean of 1.739 with a standard deviation of 0.1023
## The posterior for Jay Bruce 's Power Factor in 2009 has a mean of 1.947 with a standard deviation of 0.1224
## The posterior for Jose Bautista 's Power Factor in 2011 has a mean of 1.935 with a standard deviation of 0.09428
## The posterior for Jose Bautista 's Power Factor in 2010 has a mean of 2.222 with a standard deviation of 0.09706
## The posterior for Jose Bautista 's Power Factor in 2012 has a mean of 1.996 with a standard deviation of 0.1245
## The posterior for Jose Bautista 's Power Factor in 2013 has a mean of 1.851 with a standard deviation of 0.1012
## The posterior for Jose Bautista 's Power Factor in 2009 has a mean of 1.697 with a standard deviation of 0.1098
## The posterior for Jose Bautista 's Power Factor in 2007 has a mean of 1.623 with a standard deviation of 0.07777
## The posterior for Jose Bautista 's Power Factor in 2006 has a mean of 1.742 with a standard deviation of 0.1025
## The posterior for Jose Bautista 's Power Factor in 2008 has a mean of 1.674 with a standard deviation of 0.1045
## The posterior for Elvis Andrus 's Power Factor in 2012 has a mean of 1.333 with a standard deviation of 0.04712
## The posterior for Elvis Andrus 's Power Factor in 2011 has a mean of 1.308 with a standard deviation of 0.04972
## The posterior for Elvis Andrus 's Power Factor in 2009 has a mean of 1.413 with a standard deviation of 0.06769
## The posterior for Elvis Andrus 's Power Factor in 2010 has a mean of 1.144 with a standard deviation of 0.03121
## The posterior for Elvis Andrus 's Power Factor in 2013 has a mean of 1.236 with a standard deviation of 0.04546
## The posterior for Chris Carter 's Power Factor in 2012 has a mean of 1.924 with a standard deviation of 0.1397
## The posterior for Chris Carter 's Power Factor in 2013 has a mean of 1.925 with a standard deviation of 0.1039
## The posterior for Starlin Castro 's Power Factor in 2011 has a mean of 1.417 with a standard deviation of 0.05307
## The posterior for Starlin Castro 's Power Factor in 2010 has a mean of 1.373 with a standard deviation of 0.05407
## The posterior for Starlin Castro 's Power Factor in 2012 has a mean of 1.524 with a standard deviation of 0.06482
## The posterior for Starlin Castro 's Power Factor in 2013 has a mean of 1.428 with a standard deviation of 0.06014
## The posterior for Bobby Higginson 's Power Factor in 1996 has a mean of 1.768 with a standard deviation of 0.0872
## The posterior for Bobby Higginson 's Power Factor in 1997 has a mean of 1.717 with a standard deviation of 0.08142
## The posterior for Bobby Higginson 's Power Factor in 2000 has a mean of 1.767 with a standard deviation of 0.07726
## The posterior for Bobby Higginson 's Power Factor in 1998 has a mean of 1.677 with a standard deviation of 0.07556
## The posterior for Bobby Higginson 's Power Factor in 2001 has a mean of 1.603 with a standard deviation of 0.07663
## The posterior for Bobby Higginson 's Power Factor in 2002 has a mean of 1.489 with a standard deviation of 0.07436
## The posterior for Bobby Higginson 's Power Factor in 1999 has a mean of 1.594 with a standard deviation of 0.09684
## The posterior for Bobby Higginson 's Power Factor in 2004 has a mean of 1.577 with a standard deviation of 0.08517
## The posterior for Bobby Higginson 's Power Factor in 1995 has a mean of 1.717 with a standard deviation of 0.102
## The posterior for Bobby Higginson 's Power Factor in 2003 has a mean of 1.571 with a standard deviation of 0.09059
## The posterior for Juan Gonzalez 's Power Factor in 1996 has a mean of 1.971 with a standard deviation of 0.08973
## The posterior for Juan Gonzalez 's Power Factor in 1998 has a mean of 1.928 with a standard deviation of 0.08055
## The posterior for Juan Gonzalez 's Power Factor in 1999 has a mean of 1.807 with a standard deviation of 0.08191
## The posterior for Juan Gonzalez 's Power Factor in 2001 has a mean of 1.782 with a standard deviation of 0.08282
## The posterior for Juan Gonzalez 's Power Factor in 1997 has a mean of 1.918 with a standard deviation of 0.0928
## The posterior for Juan Gonzalez 's Power Factor in 1995 has a mean of 1.91 with a standard deviation of 0.1081
## The posterior for Juan Gonzalez 's Power Factor in 2003 has a mean of 1.854 with a standard deviation of 0.111
## The posterior for Juan Gonzalez 's Power Factor in 2000 has a mean of 1.721 with a standard deviation of 0.08767
## The posterior for Juan Gonzalez 's Power Factor in 1994 has a mean of 1.691 with a standard deviation of 0.09428
## The posterior for Juan Gonzalez 's Power Factor in 2002 has a mean of 1.595 with a standard deviation of 0.09592
## The posterior for Juan Gonzalez 's Power Factor in 2004 has a mean of 1.588 with a standard deviation of 0.1408
One approach to projecting players performance is to use straightforward, unadjusted outcomes and rates on these. We can think of a players at-bat being a sequence of stochastic outcomes. The result of a plate appearance can be anything in the set: {K, BB, HBP, Ball in play}, which can be modeled as a multinomial random variable. Conditional on the result being a ball in play, the outcome can then be anything from another set: {single, double, triple, HR, out, sacrifice}, which can again be treated as a multinomial. The parametersfor these multinomials can be estimated from the players historical data with appropriate uncertainty applied. Once we have estimates for these quantities, we could do a Monte Carlo study to build a projection for a player given a number of plate appearances.
Lets try estimating these parameters for Miguel Cabrera from his 2013 numbers and then simulate some seasons using those estimated parameters as the true parameters:
par(mfrow = c(1, 3))
for (pName in c("Miguel Cabrera")) {
player <- fg[fg$Name == pName & fg$Season == 2013, ]
playerName = unique(as.character(player$Name))
Krate <- player$SO/player$PA
BBrate <- player$BB/player$PA
HBPrate <- player$HBP/player$PA
BIPrate <- 1 - Krate - BBrate - HBPrate
BIPs <- BIPrate * player$PA
singleRate <- player$Singles/BIPs
doubleRate <- player$Doubles/BIPs
tripleRate <- player$Triples/BIPs
hrRate <- player$HR/BIPs
sacRate <- (player$SF + player$SH)/BIPs
outRate <- 1 - singleRate - doubleRate - tripleRate - hrRate - sacRate
print(16 * hrRate * (1 - hrRate) + 9 * tripleRate * (1 - tripleRate) + 4 *
doubleRate * (1 - doubleRate) + singleRate * (1 - singleRate))
simSeasons <- data.frame()
for (i in 1:5000) {
simData <- t(rmultinom(1, size = player$PA, c(Krate, BBrate, HBPrate,
BIPrate)))
simData2 <- t(rmultinom(1, size = simData[4], c(singleRate, doubleRate,
tripleRate, hrRate, sacRate, outRate)))
simSeasons <- rbind(simSeasons, cbind(simData, simData2))
}
names(simSeasons) <- c("K", "BB", "HBP", "BIP", "Singles", "Doubles", "Triples",
"HR", "Sac", "Outs")
print(head(simSeasons))
hist(simSeasons$HR, breaks = 20, xlab = "HRs", ylab = "", main = paste("Simulated Number of Home Runs for \n Miguel Cabrera \n using rates from the 2013 season"),
cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
hist((simSeasons$Singles + simSeasons$Doubles + simSeasons$Triples)/(simSeasons$BIP -
simSeasons$HR), breaks = 20, xlab = "BABIP", ylab = "", main = paste("Simulated BABIP for Miguel Cabrera \n using rates from the 2013 season"),
cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
hist((4 * simSeasons$HR + 3 * simSeasons$Triples + 2 * simSeasons$Doubles +
simSeasons$Singles)/(simSeasons$HR + simSeasons$Triples + simSeasons$Doubles +
simSeasons$Singles), breaks = 40, xlab = "Power Factor", ylab = "",
main = paste("Simulated Power Factor for \n Miguel Cabrera \n using rates from the 2013 season"),
cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
xseq <- seq(1, 2.5, length = 200)
simSeasons$PowFac = (4 * simSeasons$HR + 3 * simSeasons$Triples + 2 * simSeasons$Doubles +
simSeasons$Singles)/(simSeasons$HR + simSeasons$Triples + simSeasons$Doubles +
simSeasons$Singles)
points(xseq, dnorm(xseq, mean = mean(simSeasons$PowFac), sd = powfacSD(player)),
type = "l", col = "red", lwd = 3)
}
## [1] 1.801
## K BB HBP BIP Singles Doubles Triples HR Sac Outs
## 1 87 94 6 465 126 19 2 51 6 261
## 2 75 85 4 488 116 37 2 60 4 269
## 3 100 91 4 457 123 23 1 50 3 257
## 4 88 76 5 483 131 32 1 51 2 266
## 5 81 107 8 456 117 21 0 46 5 267
## 6 108 80 6 458 121 17 1 50 2 267