Ex-4.1

The posterior distribution for the 2nd sample survey is \(Beta(31, 21)\), obtained from the general posterior \(Beta(a + \sum_{i=1}^{n} y_i, ~ b + n - \sum_{i=1}^{n} y_i)\) that corresponds to the general prior \(Beta(a, b)\). The posterior for the 1st sample survey, from \(Ex-3.1\), is \(Beta(58, 44)\). Using these posteriors we will estimate \(Pr(\theta_1 < \theta_2 \mid \text{data and prior})\). The solution is simpler than I thought it would be: I initially assumed I would have to derive the pdf of the difference between the two posterior random variables corresponding to \(\theta_1\) and \(\theta_2\).
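
The Monte-Carlo route is enough: draw paired samples from the two posteriors and record how often \(\theta_1^{(s)} < \theta_2^{(s)}\),

\[
Pr(\theta_1 < \theta_2 \mid \text{data and prior}) \approx \frac{1}{S} \sum_{s=1}^{S} \mathbf{1}\left\{\theta_1^{(s)} < \theta_2^{(s)}\right\}, \qquad \theta_1^{(s)} \sim Beta(58, 44), \quad \theta_2^{(s)} \sim Beta(31, 21),
\]

which is exactly what mean(theta1_mc < theta2_mc) computes below.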

set.seed(27)

# 5000 Monte-Carlo draws from the posterior of theta_1 (1st survey)
theta1_mc <- rbeta(n = 5000, shape1 = 58, shape2 = 44)

# 5000 Monte-Carlo draws from the posterior of theta_2 (2nd survey)
theta2_mc <- rbeta(n = 5000, shape1 = 31, shape2 = 21)

# answer: proportion of draws with theta_1 < theta_2
mean(theta1_mc < theta2_mc)
## [1] 0.6194

Ex-4.2(a)

set.seed(27)

# Sampling from Posterior A
mcA <- rgamma(n = 5000, shape = 237, rate = 20)

# Sampling from Posterior B
mcB <- rgamma(n = 5000, shape = 125, rate = 14)

# required probability estimate
mean(mcB < mcA)
## [1] 0.9944

Ex-4.2(b)

# prior sample size for strain B
n0 <- 1:1000

# function for computing proportion of successful events
props <- sapply(
  X = n0,
  FUN = function(n) {
    set.seed(27)
    
    # Sampling from Posterior A
    mcA <- rgamma(n = 5000, shape = 237, rate = 20)
    
    # Sampling from Posterior B
    mcB <- rgamma(n = 5000,
                  shape = (12 * n + 113),
                  rate = (n + 13))
    
    # required probability estimate
    mean(mcB < mcA)
    
  }
)

# graphics
tibble(prior_sample_size = n0, proportions = props) %>% 
  ggplot(aes(x = prior_sample_size, y = proportions)) + 
  geom_line(color = "blue", 
            linewidth = 1)
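
The shape and rate inside sapply() come from the conjugate Gamma-Poisson update (spelled out under \(Ex-4.2(c)\) below) applied to the part-(b) prior \(\theta_B \sim Gamma(12 n_0, n_0)\): with \(\sum_i y_{B,i} = 113\) and \(n_B = 13\), the posterior is \(\theta_B \mid \boldsymbol{y}_B \sim Gamma(12 n_0 + 113, ~ n_0 + 13)\).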

Ex-4.2(c)

The posterior distributions were computed by hand and the procedures can be found in HomeCopy-I; information carried over from \(Ex-3.3\) is noted there on sticky notes instead of repeating the derivations.
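
For reference, the conjugate Gamma-Poisson update behind those hand computations is

\[
\theta \sim Gamma(a, b), \quad y_1, \dots, y_n \mid \theta \overset{iid}{\sim} Poisson(\theta) \implies \theta \mid \boldsymbol{y} \sim Gamma\left(a + \textstyle\sum_{i=1}^{n} y_i, ~ b + n\right).
\]

Assuming the \(Gamma(120, 10)\) and \(Gamma(12, 1)\) priors from \(Ex-3.3\), this gives \(Gamma(120 + 117, ~ 10 + 10) = Gamma(237, 20)\) for strain A and \(Gamma(12 + 113, ~ 1 + 13) = Gamma(125, 14)\) for strain B. The corresponding posterior predictive for one new observation is negative binomial,

\[
\tilde{y} \mid \boldsymbol{y} \sim NegBin\left(a + \textstyle\sum_{i=1}^{n} y_i, ~ \tfrac{b + n}{b + n + 1}\right),
\]

which is why rnbinom() appears below with size = 237, prob = 20/21 for strain A and size = 125, prob = 14/15 for strain B.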

###############################################################

# Repeating for part(a)

set.seed(27)

# predicting 5000 values for strain A
A_pred_a <- rnbinom(n = 5000, size = 237, prob = (20/21))

# predicting 5000 values for strain B from part(a)'s distribution
B_pred_a <- rnbinom(n = 5000, size = 125, prob = (14/15))

# Answer for part(a): required probability estimate
mean(B_pred_a < A_pred_a)
## [1] 0.6918
###############################################################

# Repeating for part(b)

# generating prior sample sizes for part(b)
n_0 <- 1:1000

# function for computing proportions of successful events corresponding to part(b)
props <- sapply(
  X = n_0,
  FUN = function(n0) {
    set.seed(27)
    
    # Sampling from Posterior Predictive of Strain A
    A_pred_b <- rnbinom(n = 5000, size = 237, prob = (20/21))
    
    # Sampling from Posterior Predictive of strain B
    B_pred_b <- rnbinom(n = 5000, 
                        size = (12*n0 + 113), 
                        prob = ((n0 + 13)/(n0 + 14)))
    
    # required probability estimate, Pr(B_pred < A_pred), as in part (a)
    mean(B_pred_b < A_pred_b)
    
  }
)

# Answer for part(b)
# graphics
tibble(prior_sample_size = n_0, proportions = props) %>% 
  ggplot(aes(x = prior_sample_size, y = proportions)) + 
  geom_line(color = "blue", 
            linewidth = 1)

Ex-4.3(a)

They want me to run posterior predictive checks, to investigate the adequacy of the given (Poisson) sampling model, and to assess the fit of the Poisson model for these data.

These are all just fancy ways of telling me to check whether the data support our assumed Poisson sampling model. We're operating under the assumption that the data came from a Poisson sampling model. If that is truly the case, then our observed sample statistic should not look exceptional compared to the statistics simulated from the posterior predictive distribution.
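
Concretely, each posterior predictive replicate draws one \(\theta^{(s)}\) from the posterior and then a fresh dataset of the same size from the sampling model, before computing the test statistic \(t = \bar{y}/s_y\):

\[
\theta^{(s)} \sim Gamma(237, 20), \qquad \tilde{y}_1^{(s)}, \dots, \tilde{y}_{10}^{(s)} \mid \theta^{(s)} \overset{iid}{\sim} Poisson(\theta^{(s)}), \qquad t^{(s)} = \bar{\tilde{y}}^{(s)} / s_{\tilde{y}}^{(s)}.
\]

Note that all ten values within a replicate must share the same \(\theta^{(s)}\); drawing each value independently from the marginal negative-binomial predictive would break that dependence.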

# strain A data
yA <- c(12, 9, 12, 14, 13, 13, 15, 8, 15, 6)

# observed sample statistic
T_stat <- mean(yA)/sd(yA)

# prescribed sample size
nA <- 10

set.seed(27)

# simulated statistics
ts <- sapply(
  X = 1:1000,
  FUN = function(b) {
    
    # one theta per replicate, drawn from strain A's posterior Gamma(237, 20)
    theta <- rgamma(n = 1, shape = 237, rate = 20)
    
    # 10 new observations sharing that theta, per the Poisson sampling model
    a <- rpois(n = nA, lambda = theta)
    
    mean(a)/sd(a)
  }
)

# graphics
tibble(stat = ts) %>% 
  ggplot(aes(x = ts)) + 
  geom_histogram(binwidth = 0.4,
                 # (max - min)/20 recommended width for continuous data  
                 fill = "darkgreen", 
                 color = "black") + 
  geom_vline(xintercept = T_stat, # observed sample statistic 
             color = "red", 
             linetype = "dashed", 
             linewidth = 1.5) + 
  geom_vline(xintercept = quantile(x = ts, 
                                   probs = 0.025),  
             color = "darkblue", 
             linetype = "dashed", 
             linewidth = 1.5) +
  geom_vline(xintercept = quantile(x = ts, 
                                   probs = 0.975),  
             color = "darkblue", 
             linetype = "dashed", 
             linewidth = 1.5) 

Conclusion

The observed sample statistic falls within the quantile-based \(95\)% confidence interval of the posterior predictive distribution of our statistic of interest. Hence, it is not unusual relative to the bulk of the simulated sampling distribution, and so we conclude that the Poisson sampling model is adequate for strain A.

Ex-4.3(b)

We’re running the same posterior predictive check for data from strain B.

# strain B data
yB <- c(11, 11, 10, 9, 9, 8, 7, 10, 6, 8, 8, 9, 7)

# observed sample statistic
T_stat <- mean(yB)/sd(yB)

# prescribed sample size
nB <- 13

set.seed(27)

# simulated statistics
ts <- sapply(
  X = 1:1000,
  FUN = function(b) {
    
    # one theta per replicate, drawn from strain B's posterior Gamma(125, 14)
    theta <- rgamma(n = 1, shape = 125, rate = 14)
    
    # 13 new observations sharing that theta, per the Poisson sampling model
    a <- rpois(n = nB, lambda = theta)
    
    mean(a)/sd(a)
  }
)

# graphics
tibble(stat = ts) %>% 
  ggplot(aes(x = ts)) + 
  geom_histogram(binwidth = 0.4,
                 # (max - min)/20 recommended width for continuous data  
                 fill = "darkgreen", 
                 color = "black") + 
  geom_vline(xintercept = T_stat, # observed sample statistic 
             color = "red", 
             linetype = "dashed", 
             linewidth = 1.5) + 
  geom_vline(xintercept = quantile(x = ts, 
                                   probs = 0.025),  
             color = "darkblue", 
             linetype = "dashed", 
             linewidth = 1.5) +
  geom_vline(xintercept = quantile(x = ts, 
                                   probs = 0.975),  
             color = "darkblue", 
             linetype = "dashed", 
             linewidth = 1.5) 

Conclusion

The observed sample statistic falls outside the quantile-based \(95\)% confidence interval of the posterior predictive distribution of our statistic of interest. Hence, it is unusual relative to the bulk of the simulated sampling distribution, and so we conclude that the Poisson sampling model is not adequate for strain B.

Ex-4.4(a)

Discrete approximation is taught in ch-\(6\), which is, admittedly, a little more advanced than ch-\(4\). We will use the discretized posterior distribution for interval estimation. We resort to this method because the actual posterior pdf has a non-standard, mixture form, making the interval difficult to compute analytically.

Discrete Approximation for Interval Estimation

We’re gonna compute the \(95\)% quantile-based confidence interval.
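
For reference, the unnormalized posterior evaluated below is the prior kernel times the binomial likelihood. With the exercise's mixture prior \(p(\theta) \propto 3\theta(1-\theta)^7 + \theta^7(1-\theta)\) (a 3-to-1 mix of \(Beta(2, 8)\) and \(Beta(8, 2)\) kernels, as the exponents in the code imply) and \(y\) successes in \(n\) trials,

\[
q(\theta) = \left[3\theta(1-\theta)^7 + \theta^7(1-\theta)\right]\theta^{y}(1-\theta)^{n-y} = 3\theta^{y+1}(1-\theta)^{n+7-y} + \theta^{y+7}(1-\theta)^{n+1-y}.
\]

Normalizing over the grid, \(\hat{p}(\theta_k) = q(\theta_k)/\sum_{j} q(\theta_j)\), and cumulatively summing gives the approximate CDF, from which the \(0.025\) and \(0.975\) quantiles are read off.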

# discrete set of parameter values, very dense sequence
D <- seq(from = 0, to = 1, length.out = 300)

# given
n <- 43
y <- 15

# unnormalized mixed posterior: prior kernel times likelihood
mixed.post <- 3*(D^(y+1))*((1-D)^(n+7-y)) + (D^(y+7))*((1-D)^(n+1-y))

# discrete approximation of mixed posterior
mixed.post <- mixed.post/sum(mixed.post)

# approximate cumulative distribution function (CDF)
mixed.post.cdf <- cumsum(mixed.post) 

# 95% quantile-based interval
(
disc.approx.CI <- c(
  Lower = max(D[mixed.post.cdf <= 0.025]), 
  Upper = min(D[mixed.post.cdf >= 0.975])
)
)
##     Lower     Upper 
## 0.2006689 0.4581940
# approximate CDF graph
tibble(
  theta = D, 
  approx.cdf = mixed.post.cdf 
) %>% 
  ggplot(aes(x = theta, y = approx.cdf)) + 
  geom_line(
    color = "red", 
    linewidth = 1.5
  ) + 
  geom_hline(
    yintercept = c(0.025, 0.975), # desired quantiles
    linewidth = 1, 
    linetype = "dashed"
  ) + 
  geom_vline(
    xintercept = disc.approx.CI, 
    linewidth = 1, 
    linetype = "dashed"
  )

Plotting the Actual Mixed Posterior

We have the actual pdf at hand, derived in HomeCopy-II. Hence, there is no need for discrete approximation when it comes to plotting the density.
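
That derived pdf is a two-component Beta mixture (this is what dMixedPosterior() below implements):

\[
p(\theta \mid y) = w_1 \, Beta(\theta; ~ y+2, ~ n+8-y) + w_2 \, Beta(\theta; ~ y+8, ~ n+2-y),
\]

with

\[
C^* = 3B(y+2, n+8-y) + B(y+8, n+2-y), \qquad w_1 = \frac{3B(y+2, n+8-y)}{C^*}, \qquad w_2 = \frac{B(y+8, n+2-y)}{C^*},
\]

where \(B(\cdot, \cdot)\) is the beta function. With \(n = 43\) and \(y = 15\) the components are \(Beta(17, 36)\) and \(Beta(23, 30)\), and nearly all of the mixture weight sits on the first.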

# discrete set of parameter values, very dense sequence
D <- seq(from = 0, to = 1, length.out = 300)

# pdf of mixed posterior
dMixedPosterior <- function(theta, n = 43, y = 15){
  
  # normalizing constant
  `C*` <- (
    3*beta(a = (y+2), b = (n+8-y)) + beta(a = (y+8), b = (n+2-y))
  )
  
  # weight for 1st component
  w1 <- (3 / `C*`) * beta(a = (y+2), b = (n+8-y))
  
  # weight for 2nd component
  w2 <- (1 / `C*`) * beta(a = (y+8), b = (n+2-y))
  
  # 1st component of mixed posterior pdf
  post1 <- dbeta(x = theta,
                 shape1 = (y + 2),
                 shape2 = (n + 8 - y))
  
  # 2nd component of mixed posterior pdf
  post2 <- dbeta(x = theta,
                 shape1 = (y + 8),
                 shape2 = (n + 2 - y))
  
  # mixed posterior pdf (weighted sum)
  posteriorPDF <- w1*post1 + w2*post2 
  
  return(data.frame(post1, post2, posteriorPDF))
  
}

# plotting begins; evaluate the pdf once and reuse all three columns
dens <- dMixedPosterior(theta = D)

tibble(
  theta = D,
  mixedPDF = dens$posteriorPDF,
  post1 = dens$post1,
  post2 = dens$post2
) %>%
  ggplot(aes(x = theta)) +
  geom_line(
    aes(
      y = post1,
      color = "Beta(17, 36)",
      linetype = "Beta(17, 36)",
      linewidth = "Beta(17, 36)"
    ),
  ) +
  geom_line(
    aes(
      y = post2,
      color = "Beta(23, 30)",
      linetype = "Beta(23, 30)",
      linewidth = "Beta(23, 30)"
    ),
  ) +
  geom_line(
    aes(
      y = mixedPDF,
      color = "Mixed Posterior",
      linetype = "Mixed Posterior",
      linewidth = "Mixed Posterior"
    ),
  ) +
  scale_color_manual(values = c(
    "Beta(17, 36)" = "red",
    "Beta(23, 30)" = "darkgreen",
    "Mixed Posterior" = "blue"
  )) +
  scale_linetype_manual(
    values = c(
      "Beta(17, 36)" = "solid",
      "Beta(23, 30)" = "solid",
      "Mixed Posterior" = "dashed"
    )
  ) +
  scale_linewidth_manual(values = c(
    "Beta(17, 36)" = 2,
    "Beta(23, 30)" = 2,
    "Mixed Posterior" = 1.5
  )) +
  labs(color = "", 
       linetype = "", 
       linewidth = "", 
       title = "Mixed Posterior vs Pure Posteriors") +
  ylab(label = "Density") + 
  xlab(label = expression(theta)) + 
  theme_minimal()

Conclusion

Because the \(1^{st}\) pure posterior carries almost all of the mixture weight, the mixed posterior nearly superimposes itself on the \(1^{st}\) pure posterior, i.e., it is pulled towards \(Beta(17, 36)\). The question asked for a plot of just the mixed posterior, but I'm also leaving the pure posteriors in for comparison (and fun).

Ex-4.4(b)

We will construct a Monte-Carlo approximation of the mixed posterior distribution plotted in \(Ex-4.4(a)\). For that we need to define a random number generator, analogous to rbinom or rgamma, that samples by composition: flip a coin with success probability equal to the \(1^{st}\) component's weight to pick a component, then draw from that component's Beta distribution. Once the numbers have been drawn, we'll plot them via an empirical histogram and an empirical density graph.

# random number generating function (composition sampling)
rMixedPosterior <- function(size = 1000, n = 43, y = 15){
  
  # for storage purposes
  number <- numeric(size)
  
  # normalizing constant
  `C*` <- (
    3*beta(a = (y+2), b = (n+8-y)) + beta(a = (y+8), b = (n+2-y))
  )
  
  # mixture weight of the 1st pure posterior, Beta(y+2, n+8-y)
  w <- (3 / `C*`) * beta(a = (y+2), b = (n+8-y))
  
  for(i in 1:size){
    
    # with probability w, draw from the 1st component;
    # otherwise, draw from the 2nd component
    if(rbinom(n = 1, size = 1, prob = w) == 1){
      
      number[i] <- rbeta(n = 1, shape1 = (y+2), shape2 = (n+8-y))
      
    }
    else{
      
      number[i] <- rbeta(n = 1, shape1 = (y+8), shape2 = (n+2-y))
      
    }
    
  }
  
  return(number)
  
}
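
As an aside, the loop above can be vectorized with the same composition idea. A minimal sketch, assuming size, w, y, and n are defined as inside rMixedPosterior():

# draw all component labels first, then fill in the Beta draws per component
z <- rbinom(n = size, size = 1, prob = w)
number <- numeric(size)
number[z == 1] <- rbeta(n = sum(z), shape1 = (y+2), shape2 = (n+8-y))
number[z == 0] <- rbeta(n = size - sum(z), shape1 = (y+8), shape2 = (n+2-y))
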
set.seed(27)

# random numbers generated for 4(b)
rn4b <- rMixedPosterior(size = 10000)

# Monte-Carlo approximation of the quantile-based 95% confidence interval;
# it should agree closely with disc.approx.CI from Ex-4.4(a)
(
mc.approx.CI <- c(
  Lower = quantile(x = rn4b, probs = 0.025), 
  Upper = quantile(x = rn4b, probs = 0.975)
)
)
# empirical histogram
tibble(
  zahlen = rn4b
) %>% 
  ggplot(aes(x = zahlen)) + 
  geom_histogram(bins = 30, 
                 fill = "red", 
                 color = "white") + 
  labs(title = "Empirical Histogram for Mixed Beta Posterior") + 
  xlab(label = expression(theta)) + 
  ylab(label = "Frequency") + 
  theme_minimal()

# empirical density graph
tibble(zahlen = rn4b) %>%
  ggplot(aes(x = zahlen)) +
  geom_density(fill = "red", color = "white", alpha = 0.8) +

  # Monte Carlo CI (2 lines)
  geom_vline(
    data = tibble(mc.CI = mc.approx.CI),
    aes(color = "Monte Carlo", 
        linetype = "Monte Carlo", 
        linewidth = "Monte Carlo", 
        xintercept = mc.CI)
  ) +

  # Discrete Approximation CI (2 lines)
  geom_vline(
    data = tibble(disc.CI = disc.approx.CI),
    aes(color = "Discrete Approximation", 
        linetype = "Discrete Approximation", 
        linewidth = "Discrete Approximation", 
        xintercept = disc.CI)
  ) +

  scale_color_manual(values = c(
    "Discrete Approximation" = "black",
    "Monte Carlo" = "blue"
  )) +
  scale_linetype_manual(values = c(
    "Discrete Approximation" = "dashed",
    "Monte Carlo" = "dotdash"
  )) +
  scale_linewidth_manual(values = c(
    "Discrete Approximation" = 2,
    "Monte Carlo" = 2
  )) +
  labs(title = "Empirical Density for Mixed Beta Posterior", 
       color = "", 
       linetype = "", 
       linewidth = "") +
  xlab(expression(theta)) +
  ylab("Density") +
  theme_economist() # from the ggthemes package