Looking at some batter data from Fangraphs.

Setting up the basic framework for the projection model. First, we'll source a file that reads in the data and does some basic data cleaning and management, as well as load relavent libraries.

library(boot)
source("readfgData.R")

There are 1668 players in the dataset.

Here is a quick look at the distribution of some variables and their corresponding normal approximations, FStrike, the percentage of first strikes seen by the batter, SwStr, the batter's swinging strike rate, and ZContact, the batter's contact percentage within the strikezone.

par(mfrow = c(1, 3))
hist(fg$FStrike, breaks = 20, freq = FALSE, main = "Distribution of the Percentage of \n First Strikes Seen", 
    cex.main = 1.3, ylab = "", xlab = "First Strike Percentage", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$FStrike), max(fg$FStrike), length = 100), mean = mean(fg$FStrike), 
    sd = sd(fg$FStrike))

## Error: missing value where TRUE/FALSE needed

points(seq(min(fg$FStrike), max(fg$FStrike), length = 100), normFit, type = "l", 
    col = "red")

## Error: missing value where TRUE/FALSE needed

hist(fg$SwStr, breaks = 20, freq = FALSE, main = "Distribution of Batter's Swinging Strike Rates", 
    cex.main = 1.3, ylab = "", xlab = "Swinging Strike Percentage", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$SwStr), max(fg$SwStr), length = 100), mean = mean(fg$SwStr), 
    sd = sd(fg$SwStr))

## Error: missing value where TRUE/FALSE needed

points(seq(min(fg$SwStr), max(fg$SwStr), length = 100), normFit, type = "l", 
    col = "red")

## Error: missing value where TRUE/FALSE needed

hist(fg$ZContact, breaks = 20, freq = FALSE, main = "Distribution of Batter's Contact Rate \n on Pitches Within The Stike Zone", 
    cex.main = 1.3, ylab = "", xlab = "In-Zone Contact Rate", cex.lab = 1.5)

plot of chunk unnamed-chunk-2

normFit <- dnorm(seq(min(fg$ZContact), max(fg$ZContact), length = 100), mean = mean(fg$ZContact), 
    sd = sd(fg$ZContact))

## Error: missing value where TRUE/FALSE needed

points(seq(min(fg$ZContact), max(fg$ZContact), length = 100), normFit, type = "l", 
    col = "red")

## Error: missing value where TRUE/FALSE needed


fg[fg$playerid == min(fg$playerid), ]

##      Season            Name    Team   G  AB  PA   H Singles Doubles
## 5897   2007 Alfredo Amezaga Marlins 133 400 448 105      80      14
## 6157   2008 Alfredo Amezaga Marlins 125 311 337  82      61      13
## 6488   2006 Alfredo Amezaga Marlins 132 334 378  87      72       9
## 7655   2003 Alfredo Amezaga  Angels  37 105 120  22      15       3
## 8610   2004 Alfredo Amezaga  Angels  73  93 105  15      11       2
##      Triples HR  R RBI BB IBB SO HBP SF SH GDP SB CS   AVG   OBP   SLG
## 5897       9  2 46  30 35   0 52   4  5  4   4 13  7 0.263 0.324 0.358
## 6157       5  3 41  32 19   1 47   3  0  4   6  8  2 0.264 0.312 0.367
## 6488       3  3 42  19 33   4 46   3  1  7   5 20 12 0.260 0.332 0.332
## 7655       2  2 15   7  9   0 23   1  0  5   2  2  2 0.210 0.278 0.333
## 8610       0  2 12  11  3   0 24   3  0  6   2  3  2 0.161 0.212 0.247
##        ISO BABIP GB.FB    LD.    GB.    FB.  IFFB.  HR.FB   IFH.   BUH.
## 5897 0.095 0.293  1.42 20.6 % 46.6 % 32.7 % 11.7 %  1.8 %  7.0 % 38.9 %
## 6157 0.103 0.303  1.38 19.5 % 46.6 % 33.9 % 12.9 %  3.5 %  6.0 % 37.5 %
## 6488 0.072 0.294  1.55 16.8 % 50.5 % 32.6 % 14.3 %  3.3 % 11.3 % 23.5 %
## 7655 0.124 0.250  1.22 23.1 % 42.3 % 34.6 % 14.8 %  7.4 %  3.0 % 11.1 %
## 8610 0.086 0.194  2.38  8.5 % 64.4 % 27.1 % 25.0 % 12.5 %  5.3 % 18.8 %
##         SL.   CB.    CH. OSwing ZSwing Swing OContact ZContact Contact
## 5897  7.1 % 6.6 % 13.0 %  0.237  0.640 0.447    0.714    0.938   0.881
## 6157  9.2 % 6.6 % 12.0 %  0.250  0.643 0.461    0.667    0.918   0.855
## 6488 10.4 % 6.1 %  9.4 %  0.223  0.628 0.449    0.701    0.885   0.845
## 7655  3.2 % 7.3 % 16.1 %  0.309  0.616 0.472    0.609    0.895   0.807
## 8610  7.6 % 6.1 % 12.1 %  0.197  0.688 0.497    0.292    0.871   0.782
##       Zone FStrike SwStr   Off  Lg Age playerid PowFac
## 5897 0.522   0.565 0.052 -13.5 1.1  29        1  1.361
## 6157 0.536   0.599 0.067  -6.5 0.9  30        1  1.390
## 6488 0.559   0.579 0.067 -10.0 1.2  28        1  1.277
## 7655 0.529   0.692 0.093  -4.5 0.5  25        1  1.586
## 8610 0.612   0.581 0.108 -10.6 0.3  26        1  1.534

A relavent thing to look at it how stable some of the data is from year to year. Let's look at the year-to-year correlation for data throughout this dataset.

for (var in 5:length(names(fg))) {
    fgyear <- fg[fg$Season == 2006, ]
    varPair <- c()
    for (year in 2007:2013) {
        fglyear <- fgyear
        fgyear <- fg[fg$Season == year, ]
        for (pid in intersect(fgyear$playerid, fglyear$playerid)) {
            varPair <- rbind(varPair, c(fglyear[fglyear$playerid == pid, var], 
                fgyear[fgyear$playerid == pid, var]))
        }
    }
    cat("The year-to-year correlation for ", names(fg)[var], "is: ", cor(varPair[, 
        2], varPair[, 1]), "\n")
}

## The year-to-year correlation for  AB is:  0.5817 
## The year-to-year correlation for  PA is:  0.5877 
## The year-to-year correlation for  H is:  0.6085 
## The year-to-year correlation for  Singles is:  0.6071 
## The year-to-year correlation for  Doubles is:  0.538 
## The year-to-year correlation for  Triples is:  0.5067 
## The year-to-year correlation for  HR is:  0.6774 
## The year-to-year correlation for  R is:  0.6262 
## The year-to-year correlation for  RBI is:  0.6402 
## The year-to-year correlation for  BB is:  0.6851 
## The year-to-year correlation for  IBB is:  0.6509 
## The year-to-year correlation for  SO is:  0.6378 
## The year-to-year correlation for  HBP is:  0.552 
## The year-to-year correlation for  SF is:  0.325 
## The year-to-year correlation for  SH is:  0.6132 
## The year-to-year correlation for  GDP is:  0.527 
## The year-to-year correlation for  SB is:  0.7562 
## The year-to-year correlation for  CS is:  0.596 
## The year-to-year correlation for  AVG is:  0.3946 
## The year-to-year correlation for  OBP is:  0.4784 
## The year-to-year correlation for  SLG is:  0.5032 
## The year-to-year correlation for  ISO is:  0.6263 
## The year-to-year correlation for  BABIP is:  0.2766 
## The year-to-year correlation for  GB.FB is:  0.7588 
## The year-to-year correlation for  LD. is:  0.2365 
## The year-to-year correlation for  GB. is:  0.7219 
## The year-to-year correlation for  FB. is:  0.7204 
## The year-to-year correlation for  IFFB. is:  0.1607 
## The year-to-year correlation for  HR.FB is:  0.1827 
## The year-to-year correlation for  IFH. is:  0.08121 
## The year-to-year correlation for  BUH. is:  0.3537 
## The year-to-year correlation for  SL. is:  0.3665 
## The year-to-year correlation for  CB. is:  0.2751 
## The year-to-year correlation for  CH. is:  0.3535 
## The year-to-year correlation for  OSwing is:  0.8024 
## The year-to-year correlation for  ZSwing is:  0.7922 
## The year-to-year correlation for  Swing is:  0.8026 
## The year-to-year correlation for  OContact is:  0.8098 
## The year-to-year correlation for  ZContact is:  0.8052 
## The year-to-year correlation for  Contact is:  0.8657 
## The year-to-year correlation for  Zone is:  0.734 
## The year-to-year correlation for  FStrike is:  0.4485 
## The year-to-year correlation for  SwStr is:  0.8592 
## The year-to-year correlation for  Off is:  0.5197 
## The year-to-year correlation for  Lg is:  0.5398 
## The year-to-year correlation for  Age is:  0.9999 
## The year-to-year correlation for  playerid is:  1 
## The year-to-year correlation for  PowFac is:  0.6676

Here's some stuff I added just now for you Jake.

par(mfrow = c(1, 2))
HRregress <- lm(HR ~ PowFac + SB + HBP, data = fg)
plot(fg$PowFac, fg$HR, pch = 19, col = "blue", main = "Basic Scatterplot", xlab = "Power Factor", 
    ylab = "HRs", cex.main = 1.5, cex.lab = 1.3)
plot(fg$PowFac, HRregress$residuals, main = "Residuals of the Regression of HR \n on Power Factor, Stolen Bases, and HBP", 
    ylab = "Residuals", xlab = "Power Factor", cex.lab = 1.3, pch = 19, col = "dark red")

plot of chunk unnamed-chunk-4


summary(HRregress)

## 
## Call:
## lm(formula = HR ~ PowFac + SB + HBP, data = fg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31.74  -3.79  -0.31   3.30  38.03 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -41.98204    0.50722   -82.8   <2e-16 ***
## PowFac       31.60702    0.31981    98.8   <2e-16 ***
## SB            0.16223    0.00756    21.4   <2e-16 ***
## HBP           0.63970    0.01966    32.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.43 on 8656 degrees of freedom
## Multiple R-squared:  0.601,  Adjusted R-squared:  0.601 
## F-statistic: 4.34e+03 on 3 and 8656 DF,  p-value: <2e-16

This basic regression above indicates a statistically significant association between HBP and HRs when conditioning on the other two predictors, as well as between power factor and HR. Power Factor being defined as the ratio of a batter's slugging percentage and their batting average. These hypothesis testing results are questionable due to the heteroskedasticty in the data.

Some quick tests of Bayesian updating for a players info.

Player ID 1744 is Miguel Cabrera. Lets try using the league-wide distribution of Power Factor to be the prior and use his season data to get a posterior distribution for his “true” power factor for each season.

First let's figure out a good approximation to the prior distribution of power factor:

hist(fg$PowFac, breaks = 20, freq = FALSE, main = "Distribution of Power Factor", 
    cex.main = 1.3, ylab = "", xlab = "Power Factor", cex.lab = 1.5)
normFit <- dnorm(seq(min(fg$PowFac), max(fg$PowFac), length = 100), mean = mean(fg$PowFac), 
    sd = sd(fg$PowFac))
points(seq(min(fg$PowFac), max(fg$PowFac), length = 100), normFit, type = "l", 
    col = "red")

plot of chunk unnamed-chunk-5

As we can see, the distribution of power factor for qualifying MLB players between 2006 and 2013 can be reasonably appoximated by a Normal distribution with a mean of 1.5736 and a standard deviation of 0.2231. The player with the maximum power factor was Frank Thomas in 2005

So now power factor is the ratio between a weighted sum of multinomial random variables and an unweighted sum of those variables, but when \( n \gg 1 \) it is asymptotically normal with a standard devation that we can solve for analytically using the delta method. The code below uses the function powfacSD to calculate the plug-in estimate for this standard deviation.

Because we have a normal prior and normal likelihood (a conjugate pair), we can do all the calculations very straightforwardly. So now we can get our posterior for Cabrera for each season as:

powfacSD <- function(player) {
    # Calculating the estimated sqrt(variance) of the players power factor via
    # the delta method
    Krate <- player$SO/player$PA
    BBrate <- player$BB/player$PA
    HBPrate <- player$HBP/player$PA
    BIPrate <- 1 - Krate - BBrate - HBPrate
    BIPs <- BIPrate * player$PA
    singleRate <- player$Singles/BIPs
    doubleRate <- player$Doubles/BIPs
    tripleRate <- player$Triples/BIPs
    hrRate <- player$HR/BIPs
    num <- 4 * hrRate + 3 * tripleRate + 2 * doubleRate + singleRate
    denom <- hrRate + tripleRate + doubleRate + singleRate
    deriv <- matrix(c((4 * denom - num)/denom^2, (3 * denom - num)/denom^2, 
        (2 * denom - num)/denom^2, (denom - num)/denom^2), nrow = 4, ncol = 1)
    covar <- matrix(c(hrRate * (1 - hrRate), -hrRate * tripleRate, -hrRate * 
        doubleRate, -hrRate * singleRate, -hrRate * tripleRate, tripleRate * 
        (1 - tripleRate), -tripleRate * doubleRate, -tripleRate * singleRate, 
        -doubleRate * hrRate, -tripleRate * doubleRate, doubleRate * (1 - doubleRate), 
        -doubleRate * singleRate, -singleRate * hrRate, -singleRate * tripleRate, 
        -singleRate * doubleRate, singleRate * (1 - singleRate)), nrow = 4, 
        ncol = 4)
    return(sqrt(1/BIPs * t(deriv) %*% covar %*% deriv))
}
player <- fg[fg$playerid == 1744, ]
priorMean <- mean(fg$PowFac)
priorSD <- sd(fg$PowFac)
postMean <- rep(0, length(player$Season))
postSD <- rep(0, length(player$Season))
min(fg$PowFac)

## [1] 1

cols <- rgb(1:9/9, 1:9/18, 1 - 1:9/9, 0.35)
# Plot the prior distribution
datRange <- seq(min(fg$PowFac), max(fg$PowFac), length = 200)
plot(datRange, dnorm(datRange, priorMean, priorSD)/max(dnorm(datRange, priorMean, 
    priorSD)), type = "l", col = cols[1], lwd = 3, xlab = "Power Factor", ylab = "", 
    main = "Prior and Yearly Posterior Distributions for \n Miguel Cabrera's Power Factor", 
    cex.main = 1.3, cex.lab = 1.5, ylim = c(0, 1.1))

for (ind in 1:length(player$Season)) {
    yearMean <- player$PowFac[ind]
    yearSD <- powfacSD(player[ind, ])
    # yearSD <- priorSD/sqrt(player$H[ind])
    postMean[ind] <- (priorMean/priorSD^2 + yearMean/yearSD^2)/(1/priorSD^2 + 
        1/yearSD^2)
    postSD[ind] <- sqrt(1/(1/priorSD^2 + 1/yearSD^2))
    polygon(datRange, dnorm(datRange, postMean[ind], postSD[ind])/max(dnorm(datRange, 
        postMean[ind], postSD[ind])), col = cols[ind + 1])
    cat("The posterior for Miguel Cabrera's Power Factor in ", player$Season[ind], 
        " has a mean of ", postMean[ind], "with a standard deviation of", postSD[ind], 
        "\n")
}

## The posterior for Miguel Cabrera's Power Factor in  2013  has a mean of  1.793 with a standard deviation of 0.08246 
## The posterior for Miguel Cabrera's Power Factor in  2011  has a mean of  1.69 with a standard deviation of 0.07153 
## The posterior for Miguel Cabrera's Power Factor in  2010  has a mean of  1.853 with a standard deviation of 0.08131 
## The posterior for Miguel Cabrera's Power Factor in  2012  has a mean of  1.804 with a standard deviation of 0.07805 
## The posterior for Miguel Cabrera's Power Factor in  2006  has a mean of  1.666 with a standard deviation of 0.06938 
## The posterior for Miguel Cabrera's Power Factor in  2007  has a mean of  1.742 with a standard deviation of 0.07754 
## The posterior for Miguel Cabrera's Power Factor in  2009  has a mean of  1.675 with a standard deviation of 0.07473 
## The posterior for Miguel Cabrera's Power Factor in  2005  has a mean of  1.719 with a standard deviation of 0.07388 
## The posterior for Miguel Cabrera's Power Factor in  2008  has a mean of  1.803 with a standard deviation of 0.08178 
## The posterior for Miguel Cabrera's Power Factor in  2004  has a mean of  1.72 with a standard deviation of 0.08052 
## The posterior for Miguel Cabrera's Power Factor in  2003  has a mean of  1.71 with a standard deviation of 0.1024


legend("topleft", c("Prior", player$Season), col = cols, lty = 1, lwd = 3)

plot of chunk unnamed-chunk-6

plot(player$Season, postMean, pch = 19, cex = 1.4, main = "Posterior Power Factor vs Season For Miguel Cabrera", 
    xlab = "Season", ylab = "Power Factor", cex.lab = 1.4, ylim = c(min(fg$PowFac), 
        max(fg$PowFac)))
arrows(player$Season, postMean - 2 * postSD, player$Season, postMean + 2 * postSD, 
    angle = 90, code = 3)

plot of chunk unnamed-chunk-6

This analysis assumes that Cabrera's power factor is constant throughout each year. We see that Cabrera's power factor is not terribly consistent across years. Most notably, there were two seasons in which Cabrera's power factor was well below what we would expect from the adjacent years.

Let's take a look at a few other players:

for (pName in c("Hanley Ramirez", "Grady Sizemore", "Giancarlo Stanton", "Jay Bruce", 
    "Jose Bautista", "Elvis Andrus", "Chris Carter", "Starlin Castro", "Bobby Higginson", 
    "Juan Gonzalez")) {
    player <- fg[fg$Name == pName, ]
    playerName = unique(as.character(player$Name))
    priorMean <- mean(fg$PowFac)
    priorSD <- sd(fg$PowFac)
    postMean <- rep(0, length(player$Season))
    postSD <- rep(0, length(player$Season))
    min(fg$PowFac)
    cols <- rgb(1:9/9, 1:9/18, 1 - 1:9/9, 0.35)
    # Plot the prior distribution
    datRange <- seq(min(fg$PowFac) - 0.25, max(fg$PowFac) + 0.25, length = 200)
    plot(datRange, dnorm(datRange, priorMean, priorSD)/max(dnorm(datRange, priorMean, 
        priorSD)), type = "l", col = cols[1], lwd = 3, xlab = "Power Factor", 
        ylab = "", main = paste("Prior and Yearly Posterior Distributions for \n ", 
            playerName, "'s Power Factor"), cex.main = 1.3, cex.lab = 1.5, ylim = c(0, 
            1.1))

    for (ind in 1:length(player$Season)) {
        yearMean <- player$PowFac[ind]
        yearSD <- powfacSD(player[ind, ])
        # yearSD <- priorSD/sqrt(player$H[ind])
        postMean[ind] <- (priorMean/priorSD^2 + yearMean/yearSD^2)/(1/priorSD^2 + 
            1/yearSD^2)
        postSD[ind] <- sqrt(1/(1/priorSD^2 + 1/yearSD^2))
        polygon(datRange, dnorm(datRange, postMean[ind], postSD[ind])/max(dnorm(datRange, 
            postMean[ind], postSD[ind])), col = cols[ind + 1])
        cat("The posterior for", playerName, "'s Power Factor in ", player$Season[ind], 
            " has a mean of ", postMean[ind], "with a standard deviation of", 
            postSD[ind], "\n")
    }

    legend("topleft", c("Prior", player$Season), col = cols, lty = 1, lwd = 3)
    plot(player$Season, postMean, pch = 19, cex = 1.4, main = paste("Posterior Power Factor vs Season For", 
        playerName), xlab = "Season", ylab = "Power Factor", cex.lab = 1.4, 
        ylim = c(min(fg$PowFac), max(fg$PowFac)))
    arrows(player$Season, postMean - 2 * postSD, player$Season, postMean + 2 * 
        postSD, angle = 90, code = 3)
}

plot of chunk unnamed-chunk-7

## The posterior for Hanley Ramirez 's Power Factor in  2013  has a mean of  1.794 with a standard deviation of 0.09988 
## The posterior for Hanley Ramirez 's Power Factor in  2009  has a mean of  1.586 with a standard deviation of 0.06735 
## The posterior for Hanley Ramirez 's Power Factor in  2007  has a mean of  1.682 with a standard deviation of 0.06826 
## The posterior for Hanley Ramirez 's Power Factor in  2008  has a mean of  1.765 with a standard deviation of 0.08069 
## The posterior for Hanley Ramirez 's Power Factor in  2010  has a mean of  1.582 with a standard deviation of 0.07522 
## The posterior for Hanley Ramirez 's Power Factor in  2006  has a mean of  1.638 with a standard deviation of 0.06656 
## The posterior for Hanley Ramirez 's Power Factor in  2012  has a mean of  1.683 with a standard deviation of 0.08154 
## The posterior for Hanley Ramirez 's Power Factor in  2011  has a mean of  1.562 with a standard deviation of 0.09809

plot of chunk unnamed-chunk-7

## The posterior for Grady Sizemore 's Power Factor in  2006  has a mean of  1.81 with a standard deviation of 0.07307 
## The posterior for Grady Sizemore 's Power Factor in  2008  has a mean of  1.832 with a standard deviation of 0.08239 
## The posterior for Grady Sizemore 's Power Factor in  2007  has a mean of  1.657 with a standard deviation of 0.07511 
## The posterior for Grady Sizemore 's Power Factor in  2005  has a mean of  1.664 with a standard deviation of 0.07142 
## The posterior for Grady Sizemore 's Power Factor in  2009  has a mean of  1.752 with a standard deviation of 0.09763 
## The posterior for Grady Sizemore 's Power Factor in  2004  has a mean of  1.621 with a standard deviation of 0.1381 
## The posterior for Grady Sizemore 's Power Factor in  2011  has a mean of  1.798 with a standard deviation of 0.1171 
## The posterior for Grady Sizemore 's Power Factor in  2010  has a mean of  1.415 with a standard deviation of 0.1049

plot of chunk unnamed-chunk-7

## The posterior for Giancarlo Stanton 's Power Factor in  2012  has a mean of  1.992 with a standard deviation of 0.09966 
## The posterior for Giancarlo Stanton 's Power Factor in  2011  has a mean of  1.961 with a standard deviation of 0.09609 
## The posterior for Giancarlo Stanton 's Power Factor in  2013  has a mean of  1.852 with a standard deviation of 0.103 
## The posterior for Giancarlo Stanton 's Power Factor in  2010  has a mean of  1.864 with a standard deviation of 0.1099

plot of chunk unnamed-chunk-7

## The posterior for Jay Bruce 's Power Factor in  2010  has a mean of  1.726 with a standard deviation of 0.08778 
## The posterior for Jay Bruce 's Power Factor in  2012  has a mean of  1.959 with a standard deviation of 0.09293 
## The posterior for Jay Bruce 's Power Factor in  2011  has a mean of  1.807 with a standard deviation of 0.08962 
## The posterior for Jay Bruce 's Power Factor in  2013  has a mean of  1.791 with a standard deviation of 0.08145 
## The posterior for Jay Bruce 's Power Factor in  2008  has a mean of  1.739 with a standard deviation of 0.1023 
## The posterior for Jay Bruce 's Power Factor in  2009  has a mean of  1.947 with a standard deviation of 0.1224

plot of chunk unnamed-chunk-7

## The posterior for Jose Bautista 's Power Factor in  2011  has a mean of  1.935 with a standard deviation of 0.09428 
## The posterior for Jose Bautista 's Power Factor in  2010  has a mean of  2.222 with a standard deviation of 0.09706 
## The posterior for Jose Bautista 's Power Factor in  2012  has a mean of  1.996 with a standard deviation of 0.1245 
## The posterior for Jose Bautista 's Power Factor in  2013  has a mean of  1.851 with a standard deviation of 0.1012 
## The posterior for Jose Bautista 's Power Factor in  2009  has a mean of  1.697 with a standard deviation of 0.1098 
## The posterior for Jose Bautista 's Power Factor in  2007  has a mean of  1.623 with a standard deviation of 0.07777 
## The posterior for Jose Bautista 's Power Factor in  2006  has a mean of  1.742 with a standard deviation of 0.1025 
## The posterior for Jose Bautista 's Power Factor in  2008  has a mean of  1.674 with a standard deviation of 0.1045

plot of chunk unnamed-chunk-7

## The posterior for Elvis Andrus 's Power Factor in  2012  has a mean of  1.333 with a standard deviation of 0.04712 
## The posterior for Elvis Andrus 's Power Factor in  2011  has a mean of  1.308 with a standard deviation of 0.04972 
## The posterior for Elvis Andrus 's Power Factor in  2009  has a mean of  1.413 with a standard deviation of 0.06769 
## The posterior for Elvis Andrus 's Power Factor in  2010  has a mean of  1.144 with a standard deviation of 0.03121 
## The posterior for Elvis Andrus 's Power Factor in  2013  has a mean of  1.236 with a standard deviation of 0.04546

plot of chunk unnamed-chunk-7

## The posterior for Chris Carter 's Power Factor in  2012  has a mean of  1.924 with a standard deviation of 0.1397 
## The posterior for Chris Carter 's Power Factor in  2013  has a mean of  1.925 with a standard deviation of 0.1039

plot of chunk unnamed-chunk-7

## The posterior for Starlin Castro 's Power Factor in  2011  has a mean of  1.417 with a standard deviation of 0.05307 
## The posterior for Starlin Castro 's Power Factor in  2010  has a mean of  1.373 with a standard deviation of 0.05407 
## The posterior for Starlin Castro 's Power Factor in  2012  has a mean of  1.524 with a standard deviation of 0.06482 
## The posterior for Starlin Castro 's Power Factor in  2013  has a mean of  1.428 with a standard deviation of 0.06014

plot of chunk unnamed-chunk-7

## The posterior for Bobby Higginson 's Power Factor in  1996  has a mean of  1.768 with a standard deviation of 0.0872 
## The posterior for Bobby Higginson 's Power Factor in  1997  has a mean of  1.717 with a standard deviation of 0.08142 
## The posterior for Bobby Higginson 's Power Factor in  2000  has a mean of  1.767 with a standard deviation of 0.07726 
## The posterior for Bobby Higginson 's Power Factor in  1998  has a mean of  1.677 with a standard deviation of 0.07556 
## The posterior for Bobby Higginson 's Power Factor in  2001  has a mean of  1.603 with a standard deviation of 0.07663 
## The posterior for Bobby Higginson 's Power Factor in  2002  has a mean of  1.489 with a standard deviation of 0.07436 
## The posterior for Bobby Higginson 's Power Factor in  1999  has a mean of  1.594 with a standard deviation of 0.09684 
## The posterior for Bobby Higginson 's Power Factor in  2004  has a mean of  1.577 with a standard deviation of 0.08517 
## The posterior for Bobby Higginson 's Power Factor in  1995  has a mean of  1.717 with a standard deviation of 0.102 
## The posterior for Bobby Higginson 's Power Factor in  2003  has a mean of  1.571 with a standard deviation of 0.09059

plot of chunk unnamed-chunk-7

## The posterior for Juan Gonzalez 's Power Factor in  1996  has a mean of  1.971 with a standard deviation of 0.08973 
## The posterior for Juan Gonzalez 's Power Factor in  1998  has a mean of  1.928 with a standard deviation of 0.08055 
## The posterior for Juan Gonzalez 's Power Factor in  1999  has a mean of  1.807 with a standard deviation of 0.08191 
## The posterior for Juan Gonzalez 's Power Factor in  2001  has a mean of  1.782 with a standard deviation of 0.08282 
## The posterior for Juan Gonzalez 's Power Factor in  1997  has a mean of  1.918 with a standard deviation of 0.0928 
## The posterior for Juan Gonzalez 's Power Factor in  1995  has a mean of  1.91 with a standard deviation of 0.1081 
## The posterior for Juan Gonzalez 's Power Factor in  2003  has a mean of  1.854 with a standard deviation of 0.111 
## The posterior for Juan Gonzalez 's Power Factor in  2000  has a mean of  1.721 with a standard deviation of 0.08767 
## The posterior for Juan Gonzalez 's Power Factor in  1994  has a mean of  1.691 with a standard deviation of 0.09428 
## The posterior for Juan Gonzalez 's Power Factor in  2002  has a mean of  1.595 with a standard deviation of 0.09592 
## The posterior for Juan Gonzalez 's Power Factor in  2004  has a mean of  1.588 with a standard deviation of 0.1408

plot of chunk unnamed-chunk-7

One approach to projecting players performance is to use straightforward, unadjusted outcomes and rates on these. We can think of a players at-bat being a sequence of stochastic outcomes. The result of a plate appearance can be anything in the set: {K, BB, HBP, Ball in play}, which can be modeled as a multinomial random variable. Conditional on the result being a ball in play, the outcome can then be anything from another set: {single, double, triple, HR, out, sacrifice}, which can again be treated as a multinomial. The parametersfor these multinomials can be estimated from the players historical data with appropriate uncertainty applied. Once we have estimates for these quantities, we could do a Monte Carlo study to build a projection for a player given a number of plate appearances.

Lets try estimating these parameters for Miguel Cabrera from his 2013 numbers and then simulate some seasons using those estimated parameters as the true parameters:

par(mfrow = c(1, 3))
for (pName in c("Miguel Cabrera")) {
    player <- fg[fg$Name == pName & fg$Season == 2013, ]
    playerName = unique(as.character(player$Name))
    Krate <- player$SO/player$PA
    BBrate <- player$BB/player$PA
    HBPrate <- player$HBP/player$PA
    BIPrate <- 1 - Krate - BBrate - HBPrate
    BIPs <- BIPrate * player$PA
    singleRate <- player$Singles/BIPs
    doubleRate <- player$Doubles/BIPs
    tripleRate <- player$Triples/BIPs
    hrRate <- player$HR/BIPs
    sacRate <- (player$SF + player$SH)/BIPs
    outRate <- 1 - singleRate - doubleRate - tripleRate - hrRate - sacRate
    print(16 * hrRate * (1 - hrRate) + 9 * tripleRate * (1 - tripleRate) + 4 * 
        doubleRate * (1 - doubleRate) + singleRate * (1 - singleRate))
    simSeasons <- data.frame()
    for (i in 1:5000) {
        simData <- t(rmultinom(1, size = player$PA, c(Krate, BBrate, HBPrate, 
            BIPrate)))
        simData2 <- t(rmultinom(1, size = simData[4], c(singleRate, doubleRate, 
            tripleRate, hrRate, sacRate, outRate)))
        simSeasons <- rbind(simSeasons, cbind(simData, simData2))
    }
    names(simSeasons) <- c("K", "BB", "HBP", "BIP", "Singles", "Doubles", "Triples", 
        "HR", "Sac", "Outs")
    print(head(simSeasons))
    hist(simSeasons$HR, breaks = 20, xlab = "HRs", ylab = "", main = paste("Simulated Number of Home Runs for \n Miguel Cabrera \n using rates from the 2013 season"), 
        cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
    hist((simSeasons$Singles + simSeasons$Doubles + simSeasons$Triples)/(simSeasons$BIP - 
        simSeasons$HR), breaks = 20, xlab = "BABIP", ylab = "", main = paste("Simulated BABIP for Miguel Cabrera \n using rates from the 2013 season"), 
        cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
    hist((4 * simSeasons$HR + 3 * simSeasons$Triples + 2 * simSeasons$Doubles + 
        simSeasons$Singles)/(simSeasons$HR + simSeasons$Triples + simSeasons$Doubles + 
        simSeasons$Singles), breaks = 40, xlab = "Power Factor", ylab = "", 
        main = paste("Simulated Power Factor for \n Miguel Cabrera \n using rates from the 2013 season"), 
        cex.axis = 1.3, cex.main = 1.5, cex.lab = 1.5, freq = FALSE)
    xseq <- seq(1, 2.5, length = 200)
    simSeasons$PowFac = (4 * simSeasons$HR + 3 * simSeasons$Triples + 2 * simSeasons$Doubles + 
        simSeasons$Singles)/(simSeasons$HR + simSeasons$Triples + simSeasons$Doubles + 
        simSeasons$Singles)
    points(xseq, dnorm(xseq, mean = mean(simSeasons$PowFac), sd = powfacSD(player)), 
        type = "l", col = "red", lwd = 3)

}

## [1] 1.801
##     K  BB HBP BIP Singles Doubles Triples HR Sac Outs
## 1  87  94   6 465     126      19       2 51   6  261
## 2  75  85   4 488     116      37       2 60   4  269
## 3 100  91   4 457     123      23       1 50   3  257
## 4  88  76   5 483     131      32       1 51   2  266
## 5  81 107   8 456     117      21       0 46   5  267
## 6 108  80   6 458     121      17       1 50   2  267

plot of chunk unnamed-chunk-8