Note: I struggled a lot more with this HW than with the first three. I’m sure you’ll notice a few oddities throughout; Problems 1 and 3 were the most confusing for me. I want to make sure I fully understand everything (conceptually, at least). Your office hours are booked this week, but let me know if you have a 30-minute slot at some point.

Question 1: Influence of sample size on inferential strength – Grid Approximation

set.seed(0)
#True parameter values for the two data sets
p1 = .4
p2 = .6
#Vector of the sample sizes we will try
samples = seq(from=5, to=100, by=5)
#Grid of candidate parameter values
parameters = seq(from=0, to=1, by=.01)

#Uniform prior over the parameter grid (every candidate equally likely)
uform.prior = rep(1/length(parameters), length(parameters))
d.frame = data.frame()
par(mfrow=c(3, 2))

##:-----Main loop------:###
for (n in 1:20) {
  #One simulated success count per data set at sample size samples[n]
  p1.distr = rbinom(1, samples[n], p1)
  p2.distr = rbinom(1, samples[n], p2)
  
  #Likelihood of each observed count across the whole parameter grid
  #(dbinom is vectorized over prob, so no explicit loop is needed)
  p1.likelihood = dbinom(p1.distr, size=samples[n], prob=parameters)
  p2.likelihood = dbinom(p2.distr, size=samples[n], prob=parameters)
  
  #Calculate non-normalized posteriors
  p1.nnposterior = p1.likelihood * uform.prior
  p2.nnposterior = p2.likelihood * uform.prior
  
  #Calculate normalized posteriors
  p1.normpost = p1.nnposterior / sum(p1.nnposterior)
  p2.normpost = p2.nnposterior / sum(p2.nnposterior)
  
  ###:-------Plots--------:###
  if (n == 1 || n == 10 || n == 20) {
    #p1 plot
    plot(parameters, p1.normpost, xlab="Parameter",
         ylab="Normalized posterior",
         main=paste("n.size= ", samples[n]))
    #p2 plot
    plot(parameters, p2.normpost, xlab="Parameter",
         ylab="Normalized posterior",
         main=paste("n.size= ", samples[n]))
  }
  
  #95% credible interval: with the uniform prior the posterior is the
  #conjugate Beta(successes + 1, failures + 1), so we can read the
  #.025 and .975 quantiles straight off qbeta
  p1.CI25 = qbeta(.025, p1.distr + 1, samples[n] - p1.distr + 1)
  p1.CI975 = qbeta(.975, p1.distr + 1, samples[n] - p1.distr + 1)
  p2.CI25 = qbeta(.025, p2.distr + 1, samples[n] - p2.distr + 1)
  p2.CI975 = qbeta(.975, p2.distr + 1, samples[n] - p2.distr + 1)
  
  #Overlap between the 95% credible intervals
  overlap = ((p1.CI975 - p2.CI25) > 0)
  
  #Bayes factor: likelihood at the posterior mode vs. the p = .5 null
  p1.MLE = parameters[which.max(p1.normpost)]
  p2.MLE = parameters[which.max(p2.normpost)]
  p1.lik = dbinom(p1.distr, size=samples[n], prob=p1.MLE)
  p2.lik = dbinom(p2.distr, size=samples[n], prob=p2.MLE)
  p1.null.lhood = dbinom(p1.distr, size=samples[n], prob=.5)
  p2.null.lhood = dbinom(p2.distr, size=samples[n], prob=.5)
  bayes.factor = (p1.lik * p2.lik) / (p1.null.lhood * p2.null.lhood)
  
  vector = c(samples[n], p1.CI25, p1.CI975, p2.CI25, p2.CI975, overlap, bayes.factor)
  d.frame = rbind(d.frame, vector)
}

colnames(d.frame) = c("sample", "P1.025", "P1.975", "P2.025", "P2.975", "overlap",
                      "Bayes.factor")
d.frame
##    sample     P1.025    P1.975    P2.025    P2.975 overlap Bayes.factor
## 1       5 0.09898828 0.7095791 0.1570128 0.7551368       1     2.899103
## 2      10 0.05486064 0.4841377 0.1633643 0.6161963       1     2.784822
## 3      15 0.19707642 0.5726560 0.2440237 0.6133465       1     7.389713
## 4      20 0.19929863 0.5281200 0.1375266 0.4628487       1     1.653472
## 5      25 0.16851715 0.4630446 0.2181250 0.5135268       1     1.434576
## 6      30 0.09826564 0.3515524 0.2699671 0.5376145       1   163.097847
## 7      35 0.12882281 0.3708876 0.2429070 0.4937621       1    16.942615
## 8      40 0.16455471 0.3965447 0.2416017 0.4769485       1     4.312710
## 9      45 0.18199404 0.4023162 0.2499494 0.4720703       1     3.263003
## 10     50 0.25695374 0.4678468 0.2805651 0.4903548       1     6.141210
## 11     55 0.21634231 0.4175371 0.2324333 0.4339924       1     1.264015
## 12     60 0.17040925 0.3593635 0.2747966 0.4672944       1    35.798214
## 13     65 0.15989810 0.3397359 0.2967413 0.4808912       1   605.682527
## 14     70 0.19421688 0.3711392 0.3352559 0.5101784       1 12517.110129
## 15     75 0.22271046 0.3952216 0.3011895 0.4728670       1    19.542895
## 16     80 0.18777631 0.3524628 0.2628875 0.4304742       1    23.553636
## 17     85 0.14787084 0.3022144 0.2768999 0.4393534       1  6657.270859
## 18     90 0.20512116 0.3615458 0.2891548 0.4468102       1    28.023998
## 19     95 0.21753758 0.3704594 0.2958124 0.4491502       1    21.294395
## 20    100 0.24251033 0.3924257 0.3426000 0.4896886       1 12331.250458
#Plotting the Bayes factor as a function of n
par(mfrow=c(1,1))
plot(samples, d.frame[,"Bayes.factor"], ylab="Bayes factor",
     main="Bayes factor as function of sample size")

#Plotting CI widths as a function of n
par(mfrow=c(1,2))
plot(samples, (d.frame[,3] - d.frame[,2]), ylab="CI width",
     main=paste("CI width as a function\nof sample size,\nProb=", p1))
plot(samples, (d.frame[,5] - d.frame[,4]), ylab="CI width",
     main=paste("CI width as a function\nof sample size,\nProb=", p2))

Using grid approximation we are unable to distinguish between the data sets: the 95% credible intervals overlap at every sample size from 5 to 100. It appears we need to increase the sample size to roughly n > 160 before the two sets actually separate, so the next run extends the grid of sample sizes.
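As a cross-check on that estimate (a sketch of my own): under the uniform prior the posterior has the closed form Beta(k + 1, n - k + 1), so the interval endpoints can be computed directly. Here I plug in the idealized expected counts k = .4n and k = .6n; actual noisy draws should separate later and more erratically.

ns = seq(5, 300, by=5)
separated = sapply(ns, function(n) {
  upper1 = qbeta(.975, .4*n + 1, .6*n + 1)  #97.5% bound for the p = .4 set
  lower2 = qbeta(.025, .6*n + 1, .4*n + 1)  #2.5% bound for the p = .6 set
  upper1 < lower2
})
min(ns[separated])  #smallest n at which the idealized intervals separate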

set.seed(0)
#True parameter values for the two data sets
p1 = .4
p2 = .6
#Vector of the sample sizes we will try
samples = seq(from=105, to=200, by=5)
#Grid of candidate parameter values
parameters = seq(from=0, to=1, by=.01)

#Uniform prior over the parameter grid (every candidate equally likely)
uform.prior = rep(1/length(parameters), length(parameters))
d.frame = data.frame()

##:-----Main loop------:###
for (n in 1:20) {
  #One simulated success count per data set at sample size samples[n]
  p1.distr = rbinom(1, samples[n], p1)
  p2.distr = rbinom(1, samples[n], p2)
  
  #Likelihood of each observed count across the whole parameter grid
  p1.likelihood = dbinom(p1.distr, size=samples[n], prob=parameters)
  p2.likelihood = dbinom(p2.distr, size=samples[n], prob=parameters)
  
  #Calculate non-normalized posteriors
  p1.nnposterior = p1.likelihood * uform.prior
  p2.nnposterior = p2.likelihood * uform.prior
  
  #Calculate normalized posteriors
  p1.normpost = p1.nnposterior / sum(p1.nnposterior)
  p2.normpost = p2.nnposterior / sum(p2.nnposterior)

  #95% credible intervals from the conjugate Beta(k + 1, n - k + 1) posterior
  p1.CI25 = qbeta(.025, p1.distr + 1, samples[n] - p1.distr + 1)
  p1.CI975 = qbeta(.975, p1.distr + 1, samples[n] - p1.distr + 1)
  p2.CI25 = qbeta(.025, p2.distr + 1, samples[n] - p2.distr + 1)
  p2.CI975 = qbeta(.975, p2.distr + 1, samples[n] - p2.distr + 1)
  
  #Overlap between the 95% credible intervals
  overlap = ((p1.CI975 - p2.CI25) > 0)
  
  #Bayes factor: likelihood at the posterior mode vs. the p = .5 null
  p1.MLE = parameters[which.max(p1.normpost)]
  p2.MLE = parameters[which.max(p2.normpost)]
  p1.lik = dbinom(p1.distr, size=samples[n], prob=p1.MLE)
  p2.lik = dbinom(p2.distr, size=samples[n], prob=p2.MLE)
  p1.null.lhood = dbinom(p1.distr, size=samples[n], prob=.5)
  p2.null.lhood = dbinom(p2.distr, size=samples[n], prob=.5)
  bayes.factor = (p1.lik * p2.lik) / (p1.null.lhood * p2.null.lhood)
  
  vector = c(samples[n], p1.CI25, p1.CI975, p2.CI25, p2.CI975, overlap, bayes.factor)
  d.frame = rbind(d.frame, vector)
}
colnames(d.frame) = c("sample", "P1.025", "P1.975", "P2.025", "P2.975", "overlap",
                      "Bayes.factor")
d.frame
##    sample    P1.025    P1.975    P2.025    P2.975 overlap Bayes.factor
## 1     105 0.2295700 0.3754814 0.3072437 0.4528726       1 3.712182e+01
## 2     110 0.2083865 0.3498197 0.3015326 0.4441086       1 1.382927e+02
## 3     115 0.2145181 0.3531611 0.3066148 0.4459594       1 1.567078e+02
## 4     120 0.2516253 0.3886524 0.3080168 0.4444502       1 1.459165e+01
## 5     125 0.2330232 0.3667870 0.3216955 0.4549086       1 3.481044e+02
## 6     130 0.2032833 0.3328079 0.3075431 0.4387617       1 1.445784e+03
## 7     135 0.2450368 0.3740842 0.3450721 0.4722482       1 4.461881e+04
## 8     140 0.2386307 0.3651579 0.3156630 0.4419509       1 1.066802e+02
## 9     145 0.2223996 0.3460420 0.3084586 0.4328207       1 2.442639e+02
## 10    150 0.2267766 0.3485177 0.3122946 0.4344985       1 2.830447e+02
## 11    155 0.2433198 0.3636662 0.3497047 0.4684140       1 4.177212e+05
## 12    160 0.2554612 0.3741841 0.3118423 0.4302533       1 2.152787e+01
## 13    165 0.1918985 0.3056354 0.3176608 0.4341333       0 6.410437e+05
## 14    170 0.2716701 0.3870312 0.3207980 0.4354839       1 4.845964e+01
## 15    175 0.2195224 0.3317501 0.3192889 0.4324027       1 7.030573e+03
## 16    180 0.2344646 0.3457770 0.3329215 0.4440353       1 1.626753e+04
## 17    185 0.1990285 0.3068143 0.3207852 0.4308149       0 1.158573e+06
## 18    190 0.2459530 0.3546445 0.3131122 0.4219052       1 8.505993e+01
## 19    195 0.2337678 0.3406345 0.3035374 0.4111292       1 1.565788e+02
## 20    200 0.2114604 0.3158518 0.3286684 0.4343398       0 1.033325e+06

NOTE: I had a bit of trouble dealing with the concept of likelihood using this method. I understand that in grid approximation we assign a likelihood value to each possible parameter value (in our case from 0:1 by .01). However, in order to calculate our bayes factor we need a single likelihood value from each distribution so I ended up taking the MLE of our parameters to calculate the bayes factor. Was this the correct approach or am I missing something?
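One alternative I tried sketching afterwards (my own code, not necessarily what the assignment intends): a Bayes factor compares marginal likelihoods, i.e., the likelihood averaged over the prior, rather than the likelihood maximized over the parameter. On a grid with a uniform prior that average is just the mean of the grid likelihoods; marginal.bf is a made-up helper name.

#Sketch: Bayes factor via marginal likelihood instead of the MLE plug-in.
#Alternative hypothesis: p unknown, uniform over the grid.
#Null hypothesis: the point p = .5.
marginal.bf = function(k, size, parameters) {
  likelihoods = dbinom(k, size=size, prob=parameters)
  m.alt  = sum(likelihoods / length(parameters))  #average over uniform prior
  m.null = dbinom(k, size=size, prob=.5)
  m.alt / m.null
}
#e.g. 40 successes out of 100 trials:
marginal.bf(40, 100, seq(0, 1, by=.01))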

Question 2: Influence of sample size on inferential strength – Rejection Sampling

samples = seq(from=5, to=100, by=5)
d.frame = data.frame()
n.samples = 500
par(mfrow=c(2, 2))
for (n in 1:20) {
  p1.prob = .4
  p2.prob = .6
  p1.accepted = p2.accepted = 0
  p1.samples = p2.samples = rep(NA, n.samples)
  
  #Propose p from the uniform prior; accept it only if a simulated data set
  #reproduces the expected count exactly
  while (p1.accepted < n.samples) {
    p1 = runif(1,0,1)
    p1.sim = rbinom(n=1, size=samples[n], prob=p1)
    
    if (p1.sim == p1.prob * samples[n]) {
      p1.accepted = p1.accepted + 1
      p1.samples[p1.accepted] = p1
    } 
  }
  #Same exact-match rejection sampler for p2
  while (p2.accepted < n.samples) {
    p2 = runif(1,0,1)
    p2.sim = rbinom(n=1, size=samples[n], prob=p2)
    
    if (p2.sim == p2.prob * samples[n]) {
      p2.accepted = p2.accepted + 1
      p2.samples[p2.accepted] = p2
    } 
  }
  
  ###:-------Plots--------:###
  if (n %% 10 == 0) {
    #p1 plot
    plot(density(p1.samples), xlab="Parameter",
         ylab="Density",
         main=paste(round(p1, digits=2), "n.size= ", samples[n]))
    abline(v=quantile(p1.samples, probs=c(.025)),
           col='red')
    abline(v=quantile(p1.samples, probs=c(.975)),
           col='red')
    #abline(v=mean(p1.samples), col='blue')
    #p2 plot
    plot(density(p2.samples), xlab="Parameter",
         ylab="Density",
         main=paste(round(p2, digits=2), "n.size= ", samples[n]))
    abline(v=quantile(p2.samples, probs=c(.025)),
           col='red')
    abline(v=quantile(p2.samples, probs=c(.975)),
           col='red')
    #abline(v=mean(p2.samples), col='blue')
  }
  ###:-------Plots--------:###
  
  #Bayes factor: p1/p2 here are the last accepted proposals (single posterior
  #draws), and p1.sim/p2.sim equal the expected counts by the acceptance rule
  p1.null.lhood = dbinom(p1.sim, samples[n], prob=.5)
  p2.null.lhood = dbinom(p2.sim, samples[n], prob=.5)
  p1.likelihood = dbinom(p1.sim, samples[n], prob=p1)
  p2.likelihood = dbinom(p2.sim, samples[n], prob=p2)
  bayes.factor = (p1.likelihood * p2.likelihood) / (p1.null.lhood * p2.null.lhood)
  
  #Quantiles at 2.5% and 97.5% for a 95% credible interval
  p1CI.25 = quantile(p1.samples, probs=.025)
  p1CI.975 = quantile(p1.samples, probs=.975)
  p2CI.25 = quantile(p2.samples, probs=.025)
  p2CI.975 = quantile(p2.samples, probs=.975)
  
  #Quantile overlap to see if we can distinguish between
  #datasets
  overlap = ((p1CI.975 - p2CI.25) > 0)
  
  #Populating data frame
  vector = c(samples[n], p1CI.25, p1CI.975, p2CI.25,
             p2CI.975, overlap, bayes.factor)
  d.frame = rbind(d.frame, vector)
}

#Column labels for data frame
colnames(d.frame) = c("sample", "P1.025", "P1.975", "P2.025",
                      "P2.975", "overlap", "Bayes.factor")
d.frame
##    sample    P1.025    P1.975    P2.025    P2.975 overlap Bayes.factor
## 1       5 0.1291102 0.7857864 0.2220404 0.8856304       1    0.4917228
## 2      10 0.1620728 0.6889589 0.3203843 0.8256095       1    1.2937046
## 3      15 0.2128824 0.6435298 0.3581028 0.8084680       1    1.1515769
## 4      20 0.2193480 0.6298250 0.3954446 0.7798549       1    1.5880051
## 5      25 0.2317759 0.5896698 0.4054500 0.7721763       1    2.4143511
## 6      30 0.2597831 0.5830676 0.4234181 0.7659713       1    2.5331596
## 7      35 0.2634492 0.5614818 0.4590484 0.7440260       1    2.2895313
## 8      40 0.2541241 0.5543758 0.4576196 0.7427679       1    2.9284359
## 9      45 0.2593899 0.5502291 0.4402083 0.7317515       1    2.5831662
## 10     50 0.2685979 0.5397735 0.4615442 0.7131894       1    0.1237692
## 11     55 0.2799953 0.5342934 0.4764592 0.7190079       1    0.3801470
## 12     60 0.2981440 0.5145099 0.4890773 0.7162903       1   11.0684043
## 13     65 0.2835528 0.5224112 0.4763791 0.7117003       1    7.4254024
## 14     70 0.2879036 0.5187014 0.4871545 0.7109635       1    2.9257058
## 15     75 0.2910675 0.5046246 0.4815780 0.7010502       1    2.6611683
## 16     80 0.3038460 0.5198291 0.4950653 0.6962858       1    5.3329765
## 17     85 0.3033787 0.4945157 0.4884171 0.6913218       1    4.9014133
## 18     90 0.3127235 0.5046555 0.4884161 0.7026945       1   16.3632238
## 19     95 0.3099607 0.5009861 0.5023043 0.6926595       0   24.0665262
## 20    100 0.3081390 0.4951913 0.4977037 0.6930728       0    5.3575931
#Plotting the Bayes factor as a function of n
par(mfrow=c(1,1))
plot(samples, d.frame[,"Bayes.factor"], ylab="Bayes factor",
     main="Bayes factor as function of sample size")

par(mfrow=c(1,2))
#Plotting CI widths as a function of n
plot(samples, (d.frame[,3] - d.frame[,2]), ylab="CI width",
     main=paste("CI width as a function\nof sample size,\nProb=", p1.prob))
plot(samples, (d.frame[,5] - d.frame[,4]), ylab="CI width",
     main=paste("CI width as a function\nof sample size,\nProb=", p2.prob))

This finding suggests that rejection sampling distinguishes the two data sets sooner than grid approximation did in the runs above: here the 95% intervals stop overlapping around n > 95.
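One caveat on efficiency (a side calculation of my own): both methods target the same posterior, and the exact-match sampler gets computationally expensive fast. With a uniform prior, the chance that a proposed p reproduces the observed count exactly integrates to 1/(n + 1), so filling n.samples accepted draws takes on the order of n.samples * (n + 1) proposals.

#Empirical check of the 1 / (n + 1) acceptance rate at n = 100, k = 40
set.seed(1)
proposals = replicate(20000, rbinom(1, size=100, prob=runif(1)))
mean(proposals == 40)  #empirical acceptance rate
1 / (100 + 1)          #theoretical rate, about 0.0099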

Question 3: Non-uniform priors

samples = seq(from=5, to=100, by=5)
d.frame = data.frame()
n.samples = 500
par(mfrow=c(2, 2))

#Our prior: a mixture over four coin types. The weights are written out here
#for reference; the loop below implements them as hard-coded thresholds on a
#uniform draw.
probFair = .95   #fair coin, p = .5
probHeads = .02  #two-headed coin, p = 1
probTails = .02  #two-tailed coin, p = 0
#remaining .01: p drawn uniformly from [0, 1]

for (n in 1:20) {
  p1.prob = .4
  p2.prob = .6
  p1.accepted = p2.accepted = 0
  p1.samples = p2.samples = rep(NA, n.samples)
  
  while (p1.accepted < n.samples) {
    #Draw a coin type from the mixture prior
    p1.prior = runif(1,0,1)
    if (p1.prior <= .95) {
      p1.prior = .5             #fair 95% of the time
    } else if (p1.prior > .95 && p1.prior <= .97) {
      p1.prior = 1              #two-sided heads 2% of the time
    } else if (p1.prior > .97 && p1.prior <= .99) {
      p1.prior = 0              #two-sided tails 2% of the time
    } else {
      p1.prior = runif(1,0,1)   #uniform for the remaining 1%
    }
    
    p1.sim = rbinom(n=1, size=samples[n], prob=p1.prior)
    if (p1.sim == p1.prob * samples[n]) {
      p1.accepted = p1.accepted + 1
      p1.samples[p1.accepted] = p1.prior
    } 
  }
  while (p2.accepted < n.samples) {
    #Initial prior
    p2.prior = runif(1,0,1)
    #Fair 95% of the time
    if (p2.prior <= .95) {
      p2.prior = .5
    }
    #Two-sided Heads 2% of the time
    else if (p2.prior > .95 && p2.prior <= .97) {
      p2.prior = 1
    }
    #Two-sided Tails 2% of the time
    else if (p2.prior > .97 && p2.prior <= .99) {
      p2.prior = 0
    }
    #Uniform otherwise
    else {
      p2.prior = runif(1,0,1)
    }
    
    p2.sim = rbinom(n=1, size=samples[n], prob=p2.prior)
    if (p2.sim == p2.prob * samples[n]) {
      p2.accepted = p2.accepted + 1
      p2.samples[p2.accepted] = p2.prior
    } 
  }
  
  ###:-------Plots--------:###
  if (n %% 10 == 0) {
    #p1 plot
    plot(density(p1.samples), xlab="Parameter",
         ylab="Density",
         main=paste(round(p1.prior, digits=2), "n.size= ", samples[n]))
    abline(v=quantile(p1.samples, probs=c(.025)),
           col='red')
    abline(v=quantile(p1.samples, probs=c(.975)),
           col='red')
    #abline(v=mean(p1.samples), col='blue')
    #p2 plot
    plot(density(p2.samples), xlab="Parameter",
         ylab="Density",
         main=paste(round(p2.prior, digits=2), "n.size= ", samples[n]))
    abline(v=quantile(p2.samples, probs=c(.025)),
           col='red')
    abline(v=quantile(p2.samples, probs=c(.975)),
           col='red')
    #abline(v=mean(p2.samples), col='blue')
  }
  ###:-------Plots--------:###
  
  #Bayes factor: likelihood at the generating values p1.prob/p2.prob vs. the
  #p = .5 null (p1.sim/p2.sim equal the expected counts by the acceptance rule)
  p1.null.lhood = dbinom(p1.sim, samples[n], prob=.5)
  p2.null.lhood = dbinom(p2.sim, samples[n], prob=.5)
  p1.likelihood = dbinom(p1.sim, samples[n], prob=p1.prob)
  p2.likelihood = dbinom(p2.sim, samples[n], prob=p2.prob)
  bayes.factor = (p1.likelihood * p2.likelihood) / (p1.null.lhood * p2.null.lhood)
  
  #Quantiles at 2.5% and 97.5% for a 95% credible interval
  p1CI.25 = quantile(p1.samples, probs=.025)
  p1CI.975 = quantile(p1.samples, probs=.975)
  p2CI.25 = quantile(p2.samples, probs=.025)
  p2CI.975 = quantile(p2.samples, probs=.975)
  
  #Quantile overlap to see if we can distinguish between
  #datasets
  overlap = ((p1CI.975 - p2CI.25) > 0)
  
  #Populating data frame
  vector = c(samples[n], p1CI.25, p1CI.975, p2CI.25,
             p2CI.975, overlap, bayes.factor)
  d.frame = rbind(d.frame, vector)
}

#Column labels for data frame
colnames(d.frame) = c("sample", "P1.025", "P1.975", "P2.025",
                      "P2.975", "overlap", "bayes.factor")
d.frame
##    sample P1.025 P1.975 P2.025 P2.975 overlap bayes.factor
## 1       5    0.5    0.5    0.5    0.5       0     1.087548
## 2      10    0.5    0.5    0.5    0.5       0     1.182761
## 3      15    0.5    0.5    0.5    0.5       0     1.286309
## 4      20    0.5    0.5    0.5    0.5       0     1.398923
## 5      25    0.5    0.5    0.5    0.5       0     1.521397
## 6      30    0.5    0.5    0.5    0.5       0     1.654592
## 7      35    0.5    0.5    0.5    0.5       0     1.799448
## 8      40    0.5    0.5    0.5    0.5       0     1.956987
## 9      45    0.5    0.5    0.5    0.5       0     2.128317
## 10     50    0.5    0.5    0.5    0.5       0     2.314648
## 11     55    0.5    0.5    0.5    0.5       0     2.517291
## 12     60    0.5    0.5    0.5    0.5       0     2.737675
## 13     65    0.5    0.5    0.5    0.5       0     2.977353
## 14     70    0.5    0.5    0.5    0.5       0     3.238015
## 15     75    0.5    0.5    0.5    0.5       0     3.521497
## 16     80    0.5    0.5    0.5    0.5       0     3.829797
## 17     85    0.5    0.5    0.5    0.5       0     4.165089
## 18     90    0.5    0.5    0.5    0.5       0     4.529735
## 19     95    0.5    0.5    0.5    0.5       0     4.926304
## 20    100    0.5    0.5    0.5    0.5       0     5.357593
#Plotting the Bayes factor as a function of n
par(mfrow=c(1,1))
plot(samples, d.frame[,"bayes.factor"], ylab="Bayes factor",
     main="Bayes factor as function of sample size")

par(mfrow=c(1,2))
#Plotting CI widths as a function of n
plot(samples, (d.frame[,3] - d.frame[,2]), ylab="CI width",
     main=paste("CI width as a function\nof sample size,\nProb=", p1.prob))
plot(samples, (d.frame[,5] - d.frame[,4]), ylab="CI width",
     main=paste("CI width as a function\nof sample size,\nProb=", p2.prob))

I’m struggling with this problem. Conceptually, this is my understanding: in the previous two examples we distinguished between the data sets by comparing the overlap of the 95% CIs for our parameters. We assumed uniform priors, meaning every candidate parameter value we examined was equally likely. The idea in Problem 3 is that we might instead come in with prior expectations about the coins we’re seeing (i.e., that they are fair, weighted, or something else), so the vast majority of the time (95%) we expect the coin under examination to be fair, with a success proportion of .5.

Here’s where I’m confused: in rejection sampling we draw a candidate parameter (now from the mixture prior rather than from runif() alone), simulate trials with rbinom() at that candidate, and accept it only if the simulated number of successes matches what we’d expect from our original two parameters (p1 = .4, p2 = .6). So 95% of the time we simulate trials with prob = .5, and such a candidate is accepted whenever the simulated count happens to match what we would have expected at sample size n under p1 or p2.

Intuitively, I believe the answer to your question is: if we went into this study with the prior expectation that the coins were fair, it should take us longer (i.e., more samples) to distinguish between them and to discover that they are not fair. However, I’m now unable to distinguish between the two data sets at all, because every accepted parameter is .5 (I’m aware this is incorrect). I’d like to understand this problem; perhaps we can set a time to walk through my thinking and how you’d advise approaching it in R?
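A closed-form cross-check that might explain what I’m seeing (my own sketch, using the mixture weights above): after observing k heads in n flips, each prior component’s posterior weight is its prior weight times the probability that component assigns to the data. The two-sided coins assign zero probability to any mixed count, and the uniform component’s marginal probability integrates to 1/(n + 1), so the fair component dominates until n gets quite large, which would explain why every accepted sample sits at .5.

#Posterior weight of each prior component after observing k heads in n flips
component.posterior = function(k, n) {
  prior = c(fair=.95, heads=.02, tails=.02, uniform=.01)
  #Marginal probability of the data under each component
  marg = c(fair    = dbinom(k, n, prob=.5),
           heads   = dbinom(k, n, prob=1),   #zero unless k == n
           tails   = dbinom(k, n, prob=0),   #zero unless k == 0
           uniform = 1 / (n + 1))            #dbinom integrated over p
  w = prior * marg
  w / sum(w)
}
component.posterior(2, 5)      #fair coin dominates at small n
component.posterior(40, 100)   #...and still dominates at n = 100
component.posterior(160, 400)  #uniform component finally wins out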