Bayesian statistics Duke Univ.

klinares — Mar 20, 2017, 1:13 PM

####################################
### bayes coursera class 3/10/17 ###
####################################


# bayes rule


# 86 of 512(all 30-49) of 30-49 year olds use online sites. N=1738
# 86/512=17%
# Bayes rule = p(a/b) = P(A U B) / P(B)
# P(use online site | 30-49 yrs) = P(Use of site & 30-49) / P(30-49)
86/512

[1] 0.1679688

(86/1738) / (512/1738)

[1] 0.1679688

# HIV example
# elisa test, sensitivity 93%, specificity 99%, 1.48 in 1000 HIV pos
sens = .93
spec = .99
HIV_rate = 1.48/1000 # HIV rate
No_HIV = 1 - 1.48/1000
True_pos = sens      # P(+ | hiv)
Fal_neg = 1 - sens # P(- | hiv)
Fal_Pos = 1 - spec # P(+ | hiv)
Fal_neg = spec       # P(- | hiv)

# posterior prob
# what is the probability that patient has HIV given pos test
# P(HIV | +) = P(HIV & +) / P(+)
(HIV_rate*True_pos) / ((HIV_rate*True_pos) + (No_HIV*Fal_Pos))

[1] 0.1211449

# P(No_HIV | +)
Prob_1st = (No_HIV*True_pos* Fal_Pos) / ((No_HIV*True_pos* Fal_Pos) + 
                                (HIV_rate*True_pos))
# patient that tested pos has a 12% of having HIV


# Updating with posterior
# P(HIV | 2nd +) 2nd test reveals positive outcome
# updating priors from posteriors of previous test
Sec_HIV_Rate = Prob_1st
Sec_No_HIV = 1-Sec_HIV_Rate
# sensitivity and specificity remains constant
# P(HIV | 2nd +)
Prob_2nd = (Sec_HIV_Rate*True_pos) / ((Sec_HIV_Rate*True_pos) + 
                             (Sec_No_HIV*Fal_Pos))
# patient that tested pos twice has a 93% chance of having HIV

# third test
Thi_HIV_rate = Prob_2nd
Thi_No_HIV = 1-Thi_HIV_rate
Prob_3rd = (Thi_HIV_rate*True_pos) / ((Thi_HIV_rate*True_pos) + 
                             (Thi_No_HIV*Fal_Pos))

####### Notes #######

# Frequentist def of prob: relative freq in a large # of trials
  # confidence intervals: proportion of random samples in size N 
   # from the same population that produce CI that contain the 
   # true pop parameter   
      # interpretation; pop parameter is fixed rather than variable.

# Bayes def of prob: Equating prob of event to prob of another event
  # credible interval: describes the parameter not as fixed but 
  # with a prob dist. ex. Posterior distribution yield a 95% CI
  # that contains the pop parameter distribution.

# def of P-value = prob of the observed or more extreme outcome
  # given that the null hypothesis is true.


# inference proportion

# ex.
# Pregnancies, treatment new birth control (N=40 20control 20 treat)
   # 16 of 20 in control got preg
   # 4 of 20 in treat got preg

# ex. of frequentist approach
 # pvalue=p(k<=4) probability that 4 or less preg will occur
sum(dbinom(0:4, size=20, p=.5))

[1] 0.005908966

#pvalue= p(k<=16)
sum(dbinom(0:16, size=20, p=.5))

[1] 0.9987116

# treatment more effective than control

# ex. of bayes approach
# hyp P(preg) = can be 0 to 1
# consider 9 models, 10% to 90%
# specify prior, reflect state of belief
 # ex. of prior
   # Model  10%  20%  30%  40%  50%  60%  70%  80%  90%
   # prior  .06 .06  .06  .06  .052  .06  .06  .06 .06
  # P(data | model) = P(k=4 | n=20, p)
p <- seq(from=0.1, to=0.9, by=0.1)
prior <- c(rep(.06, 4), 0.52, rep(0.06, 4))
likelihood <- dbinom(4, size=20, prob=p)

format(likelihood, scientific=F) # gets pvalues

[1] "0.0897788281498716756" "0.2181994019461005185" "0.1304209743738706517"
[4] "0.0349907904041583395" "0.0046205520629882708" "0.0002696861504765946"
[7] "0.0000050075583315124" "0.0000000130056978432" "0.0000000000003178804"

# bayes rule to calculate posterior prob P(data | model)

# P(data | model) = P(data | model) * P(model) / P(data) =
   # likelihood * prior / sum (likelihood * prior)

numerator <- likelihood * prior
denominator <- sum(numerator) # p(data|model)*p(model) or likelihood prior
format(sum(numerator), scientific=F)

[1] "0.03082257"

posterior <- numerator/denominator
sum(posterior) # should add up to 1

[1] 1

data = data.frame(p, prior,
             format(likelihood, scientific=F),
             format(numerator, scientific=F),
       format(posterior, scientific=F)) 

# look at results, Model 2 w/ prior of .06 posterior .42 has highest post
  # model 2 = 4 preg of 20, most likely model based posterior data
View(data)
paste(p, prior, likelihood, numerator, posterior) # another way

[1] "0.1 0.06 0.0897788281498717 0.0053867296889923 0.174765758805409"       
[2] "0.2 0.06 0.218199401946101 0.013091964116766 0.424752526156148"         
[3] "0.3 0.06 0.130420974373871 0.00782525846243224 0.253880798182627"       
[4] "0.4 0.06 0.0349907904041583 0.0020994474242495 0.0681139658670461"      
[5] "0.5 0.52 0.00462055206298827 0.0024026870727539 0.0779521998848127"     
[6] "0.6 0.06 0.000269686150476595 1.61811690285957e-05 0.000524977945230834"
[7] "0.7 0.06 5.00755833151243e-06 3.00453499890746e-07 9.74784088413576e-06"
[8] "0.8 0.06 1.30056978431999e-08 7.80341870591997e-10 2.53172234389869e-08"
[9] "0.9 0.06 3.17880449999999e-13 1.90728269999999e-14 6.18794199016665e-13"

# if we add up posteriors of the 1st 4 models (.17 + .42 + .25 + .06)=.9, 
   # there is a probability of .9 that treatment is 90% effective



################################################### 
# ex of M & M, models tested P(Yellow)=10% or 20% #
###################################################

# frequentist inference: 

#hyp, H0 < 10%; HA > 10%, p=.05
# test this model, P(k>=1 | n=5, p=.10) sample of 5 M&M colors.
   # test statistic is the number of Yellow M&Ms in sample
     # 1 - .90^5 = .41, p>.05 (p value observed an extreme outcome if Hyp is true)
# failed to reject Null, S(0, 1, 1, 1, 1, 1) = 5 successes
# prob of 5 successes, not greater than 10%
# P(k>=1 | n=5, p=.10) = 1-p(k=0 | n=5, p=.10) = 1 - .90^5
1-.90^5 # compliment of no successess, .90 is p(no success)

[1] 0.40951

    # compliment of success being 10%
# using the binom test
model1 = binom.test(1,5,.10, alternative="two.sided")
# other sample sizes
model2 = binom.test(2,10,.10, alternative="two.sided")
model3 = binom.test(3,15,.10, alternative="two.sided")
model4 = binom.test(4,20,.10, alternative="two.sided")
freq_prob_10per <- rbind(model1$p.value, model2$p.value, model3$p.value, model4$p.value)
n <- seq(5,20, by=5)
k <- seq(1:4)
cbind(n, k, freq_prob_10per)

      n k          
[1,]  5 1 0.4095100
[2,] 10 2 0.2639011
[3,] 15 3 0.1840611
[4,] 20 4 0.1329533

Freq_results = data.frame(n, k, freq_prob_10per)
View(Freq_results)


# Bayes inference:

# prior = .5 or both hypotheses
prior = .5
# hyp=H1 = 10%; H2 = 20%

# model1 # observed data: k=1, n=5

# likelihood = p(k=1|H1) = (5/1)*.10*.90^4 = .33 
likemod1h1 = dbinom(1, size=5, prob=0.1)
# likelihood = p(k=1|H2) = (5/1)*.20*.80^4 = .41
likemod1h2 = dbinom(1, size=5, prob=0.2)
# posterior(H1|k=1) = P(H1)*p(k=1|H1) / (p(k=1) 
# posterior(H1|k=1) = P(H2)*p(k=1|H2) / (p(k=1)

postmod1h1 = (prior * likemod1h1 ) /
  ((prior * likemod1h1) + (prior * likemod1h2)) 
postmod1h2 = (prior * likemod1h2 ) /
  ((prior * likemod1h2) + (prior * likemod1h1))

print(cbind(postmod1h1, postmod1h2))

     postmod1h1 postmod1h2
[1,]  0.4447231  0.5552769

# w/ equal priors and small N, posteriors are similar, H2 is winner.

######
# other models of n samples and k successes
######

# model2 # observed data: k=2, n=10

likemod2h1 = dbinom(2, size=10, prob=0.1)
likemod2h2 = dbinom(2, size=10, prob=0.2)

postmod2h1 = (prior * likemod2h1 ) /
  ((prior * likemod2h1) + (prior * likemod2h2)) 
postmod2h2 = (prior * likemod2h2 ) /
  ((prior * likemod2h2) + (prior * likemod2h1))

print(cbind(postmod2h1, postmod2h2))

     postmod2h1 postmod2h2
[1,]  0.3907811  0.6092189

# model3 # observed data: k=2, n=10

likemod3h1 = dbinom(3, size=15, prob=0.1)
likemod3h2 = dbinom(3, size=15, prob=0.2)

postmod3h1 = (prior * likemod3h1 ) /
  ((prior * likemod3h1) + (prior * likemod3h2)) 
postmod3h2 = (prior * likemod3h2 ) /
  ((prior * likemod3h2) + (prior * likemod3h1))

print(cbind(postmod3h1, postmod3h2))

     postmod3h1 postmod3h2
[1,]   0.339383   0.660617

# model4 # observed data: k=2, n=10

likemod4h1 = dbinom(4, size=20, prob=0.1)
likemod4h2 = dbinom(4, size=20, prob=0.2)

postmod4h1 = (prior * likemod4h1 ) /
  ((prior * likemod4h1) + (prior * likemod4h2)) 
postmod4h2 = (prior * likemod4h2 ) /
  ((prior * likemod4h2) + (prior * likemod4h1))

print(cbind(postmod4h1, postmod4h2))

     postmod4h1 postmod4h2
[1,]  0.2915103  0.7084897

posterior_10per <- c(postmod1h1, postmod2h1, postmod3h1, postmod4h1)
posterior_20per <- c(postmod1h2, postmod2h2, postmod3h2, postmod4h2)
cbind(posterior_10per, posterior_20per)

     posterior_10per posterior_20per
[1,]       0.4447231       0.5552769
[2,]       0.3907811       0.6092189
[3,]       0.3393830       0.6606170
[4,]       0.2915103       0.7084897

# attach posterior results to frequentist results
final_results = data.frame(n, k, freq_prob_10per, 
                           posterior_10per, posterior_20per)
print(final_results)

   n k freq_prob_10per posterior_10per posterior_20per
1  5 1       0.4095100       0.4447231       0.5552769
2 10 2       0.2639011       0.3907811       0.6092189
3 15 3       0.1840611       0.3393830       0.6606170
4 20 4       0.1329533       0.2915103       0.7084897

################################################