#full sample space of possible theta values
thetas <- seq(.1, 5, by = .1)
y <- 36
theta_prob <- .2
#calculate the sum of joint probabilities for the denominator
sum <- 0
for(i in 1:length(thetas)){
sum = sum + dpois(y, 20 *thetas[i]) * theta_prob
}
y_marginal <- sum
posterior1 <- c()
for(i in 1:length(thetas)){
new_post <- (dpois(y, 20 * thetas[i])*theta_prob)/
y_marginal
posterior1 <- c(posterior1, new_post)
}
# I used a line plot because it looked a bit nicer, but it is important to note
# that we were given discrete values of theta
plot(thetas, posterior1, main = "P(theta | Y = 36); n = 20", xlab = "Theta", ylab = "Posterior probability", type = "l")
ev <- thetas*posterior1
sum(ev)
## [1] 1.85
plot(thetas, posterior1, main = "P(theta | Y = 36)", xlab = "Theta", ylab = "Posterior probability", type = "l")
abline(v = sum(ev), col = "red")
The posterior mean of theta is 1.85.
data <- data.frame(thetas, posterior1)
cdf <- function(data, percentile){
lower = 0
for(i in 1:nrow(data)){
sum <- sum(data[1:i, 2])
if(sum >= percentile){
bound <- data[i,1]
break()
}
}
bound
}
lower <- cdf(data, .025)
lower
## [1] 1.3
upper <- cdf(data, .975)
upper
## [1] 2.5
So the 95 percent confidence interval for the posterior mean is (1.3, 2.5).
This time, n = 40 and y = 72. I essentially copied and pasted the code from before, but changed the neccesary values.
thetas <- seq(.1, 5, by = .1)
y <- 72
theta_prob <- .2
n <- 40
#calculate the sum of joint probabilities for the denominator
sum <- 0
for(i in 1:length(thetas)){
sum = sum + dpois(y, n *thetas[i]) * theta_prob
}
y_marginal <- sum
posterior2 <- c()
for(i in 1:length(thetas)){
new_post <- (dpois(y, n * thetas[i])*theta_prob)/
y_marginal
posterior2 <- c(posterior2, new_post)
}
# I used a line plot because it looked a bit nicer, but it is important to note
# that we were given discrete values of theta
plot(thetas, posterior2, main = "P(theta | Y = 72); n = 40", xlab = "Theta", ylab = "Posterior probability", type = "l")
Here are both distributions graphed on the same coordinate system.
plot(thetas, posterior1, main = "Probability plot comparison", xlab = "Theta", ylab = "Posterior probability", type = "l", col = "blue", ylim = c(0, .20))
lines(thetas, posterior2, col = "red")
legend(x = "topright", legend = c("Y = 36, n = 20", "Y = 72, n = 40"), col = c("blue", "red"), lty = 1)
Although the modes for both distributions are around the same theta value, the distribution with a higher sample size is much narrower with a higher peak. That is to say the distribution with a higher sample size is more precise in predicting the value of theta. This makes sense since Y and n are both doubled, so each observed nest is still averaging 1.8 eggs in both cases, the only difference being the larger sample size in the second plot.