After excluding the players whose at bats, AB, either in the first or second period, are no larger than 10 (\(N_{i1}\leq10\) or \(N_{i2}\leq10\)), 491 players are left in the analysis.
raw_data <- read.table("data/Bat.dat")
data <- raw_data[raw_data$AB.1>10 & raw_data$AB.2>10,]
data_shape <- as.data.frame(dim(data)[1])
colnames(data_shape) <- "numbers"
rownames(data_shape) <- "players"
data_shape
## numbers
## players 491
# data information
data_hitting_ability.1 <- matrix(0,dim(data)[1],1)
colnames(data_hitting_ability.1) <- c("Batting Average")
for (i in 1:dim(data)[1]) {
data_hitting_ability.1[i,1] <- data$H.1[i]/data$AB.1[i]
}
summary(data_hitting_ability.1)
## Batting Average
## Min. :0.0000
## 1st Qu.:0.1345
## Median :0.1691
## Mean :0.1638
## 3rd Qu.:0.1972
## Max. :0.3846
# monte carlo
posterior_hitting_ability_p2 <- matrix(0,dim(data)[1],1)
colnames(posterior_hitting_ability_p2) <- c("Batting Average")
set.seed(1)
for (i in 1:dim(data)[1]) {
a <- 1
b <- 4
p.mc5000 <- rbeta(5000,data$H.1[i]+a,data$AB.1[i]-data$H.1[i]+b)
posterior_hitting_ability_p2[i,1] <- mean(p.mc5000)
}
data$X <- 0
data$var <- 0
for (i in 1:dim(data)[1]) {
value <- (data$H.1[i]+0.25)/(data$AB.1[i]+0.5)
data$X[i] <- asin(sqrt(value))
data$var[i] <- 1/(4*data$AB.1[i])
}
# weakly informative priors
eta0 <- 1 ; t20 <- 1
mu0 <- 0.4 ; g20 <- 0.01
# starting values
m <- dim(data)[1]
n <- 1
theta <- ybar <- data$X
sigma2 <- data$var
mu <- mean(theta)
tau2 <- var(theta)
# setup MCMC
set.seed(1)
S <- 5000
THETA <- matrix(nrow=S,ncol=m)
MST <- matrix(nrow=S,ncol=2)
# MCMC algorithm
for(s in 1:S)
{
# sample new values of the thetas
for(j in 1:m)
{
vtheta <- 1/(n/sigma2[j]+1/tau2)
etheta <- vtheta*(ybar[j]*n/sigma2[j]+mu/tau2)
theta[j] <- rnorm(1,etheta,sqrt(vtheta))
}
# sample a new value of mu
vmu <- 1/(m/tau2+1/g20)
emu <- vmu*(m*mean(theta)/tau2+mu0/g20)
mu <- rnorm(1,emu,sqrt(vmu))
# sample a new value of tau2
etam <- eta0+m
ss <- eta0*t20+sum((theta-mu)^2)
tau2 <- 1/rgamma(1,etam/2,ss/2)
# store results
THETA[s,] <- theta
MST[s,] <- c(mu,tau2)
}
mcmc <- list(THETA=THETA,MST=MST)
theta.mc5000 <- apply(THETA, 2, mean)
posterior_hitting_ability_p3 <- (sin(theta.mc5000))^2
The information and histograms of our estimates \(\frac{H_{i2}}{N_{i2}}\) from Problem 2 and Problem 3 are presented below.
# estimates (Problem 2)
summary(posterior_hitting_ability_p2)
## Batting Average
## Min. :0.02968
## 1st Qu.:0.13649
## Median :0.17047
## Mean :0.16707
## 3rd Qu.:0.19700
## Max. :0.35481
# histogram (Problem 2)
hist(posterior_hitting_ability_p2,main=expression("Posterior "~p[i]),
xlab="Batting Average",breaks=25)
# estimates (Problem 3)
posterior_hitting_ability_p3 <- matrix(posterior_hitting_ability_p3,dim(data)[1],1)
colnames(posterior_hitting_ability_p3) <- c("Batting Average")
summary(posterior_hitting_ability_p3)
## Batting Average
## Min. :0.07102
## 1st Qu.:0.14612
## Median :0.16926
## Mean :0.16813
## 3rd Qu.:0.19003
## Max. :0.28837
# histogram (Problem 3)
hist(posterior_hitting_ability_p3,main=expression("Posterior "~p[i]),
xlab="Batting Average",breaks=25)
The MSE of the estimates from Problem 2 is slightly larger than the MSE of the estimates from Problem 3. We believe that the reason for this is because the variance of the estimates from Problem 3 is smaller than the variance of the estimates from Problem 2 and the biases of the estimates from Problem 2 and Problem 3 are similar based on the information and histograms in Problem 4.
data_hitting_ability.2 <- matrix(0,dim(data)[1],1)
colnames(data_hitting_ability.2) <- c("Batting Average")
for (i in 1:dim(data)[1]) {
data_hitting_ability.2[i,1] <- data$H.2[i]/data$AB.2[i]
}
#MSE
mse2 <- sum((posterior_hitting_ability_p2-data_hitting_ability.2)^2)/nrow(data)
mse3 <- sum((posterior_hitting_ability_p3-data_hitting_ability.2)^2)/nrow(data)
MSE <- matrix(c(mse2,mse3),1,2)
rownames(MSE) <- c("MSE")
colnames(MSE) <- c("Problem 2","Problem 3")
MSE
## Problem 2 Problem 3
## MSE 0.01176792 0.01073689