klinares — Mar 20, 2017, 1:13 PM
####################################
### bayes coursera class 3/10/17 ###
####################################
# bayes rule
# 86 of 512(all 30-49) of 30-49 year olds use online sites. N=1738
# 86/512=17%
# Bayes rule = p(a/b) = P(A U B) / P(B)
# P(use online site | 30-49 yrs) = P(Use of site & 30-49) / P(30-49)
86/512
[1] 0.1679688
(86/1738) / (512/1738)
[1] 0.1679688
# HIV example
# elisa test, sensitivity 93%, specificity 99%, 1.48 in 1000 HIV pos
sens = .93
spec = .99
HIV_rate = 1.48/1000 # HIV rate
No_HIV = 1 - 1.48/1000
True_pos = sens # P(+ | hiv)
Fal_neg = 1 - sens # P(- | hiv)
Fal_Pos = 1 - spec # P(+ | hiv)
Fal_neg = spec # P(- | hiv)
# posterior prob
# what is the probability that patient has HIV given pos test
# P(HIV | +) = P(HIV & +) / P(+)
(HIV_rate*True_pos) / ((HIV_rate*True_pos) + (No_HIV*Fal_Pos))
[1] 0.1211449
# P(No_HIV | +)
Prob_1st = (No_HIV*True_pos* Fal_Pos) / ((No_HIV*True_pos* Fal_Pos) +
(HIV_rate*True_pos))
# patient that tested pos has a 12% of having HIV
# Updating with posterior
# P(HIV | 2nd +) 2nd test reveals positive outcome
# updating priors from posteriors of previous test
Sec_HIV_Rate = Prob_1st
Sec_No_HIV = 1-Sec_HIV_Rate
# sensitivity and specificity remains constant
# P(HIV | 2nd +)
Prob_2nd = (Sec_HIV_Rate*True_pos) / ((Sec_HIV_Rate*True_pos) +
(Sec_No_HIV*Fal_Pos))
# patient that tested pos twice has a 93% chance of having HIV
# third test
Thi_HIV_rate = Prob_2nd
Thi_No_HIV = 1-Thi_HIV_rate
Prob_3rd = (Thi_HIV_rate*True_pos) / ((Thi_HIV_rate*True_pos) +
(Thi_No_HIV*Fal_Pos))
####### Notes #######
# Frequentist def of prob: relative freq in a large # of trials
# confidence intervals: proportion of random samples in size N
# from the same population that produce CI that contain the
# true pop parameter
# interpretation; pop parameter is fixed rather than variable.
# Bayes def of prob: Equating prob of event to prob of another event
# credible interval: describes the parameter not as fixed but
# with a prob dist. ex. Posterior distribution yield a 95% CI
# that contains the pop parameter distribution.
# def of P-value = prob of the observed or more extreme outcome
# given that the null hypothesis is true.
# inference proportion
# ex.
# Pregnancies, treatment new birth control (N=40 20control 20 treat)
# 16 of 20 in control got preg
# 4 of 20 in treat got preg
# ex. of frequentist approach
# pvalue=p(k<=4) probability that 4 or less preg will occur
sum(dbinom(0:4, size=20, p=.5))
[1] 0.005908966
#pvalue= p(k<=16)
sum(dbinom(0:16, size=20, p=.5))
[1] 0.9987116
# treatment more effective than control
# ex. of bayes approach
# hyp P(preg) = can be 0 to 1
# consider 9 models, 10% to 90%
# specify prior, reflect state of belief
# ex. of prior
# Model 10% 20% 30% 40% 50% 60% 70% 80% 90%
# prior .06 .06 .06 .06 .052 .06 .06 .06 .06
# P(data | model) = P(k=4 | n=20, p)
p <- seq(from=0.1, to=0.9, by=0.1)
prior <- c(rep(.06, 4), 0.52, rep(0.06, 4))
likelihood <- dbinom(4, size=20, prob=p)
format(likelihood, scientific=F) # gets pvalues
[1] "0.0897788281498716756" "0.2181994019461005185" "0.1304209743738706517"
[4] "0.0349907904041583395" "0.0046205520629882708" "0.0002696861504765946"
[7] "0.0000050075583315124" "0.0000000130056978432" "0.0000000000003178804"
# bayes rule to calculate posterior prob P(data | model)
# P(data | model) = P(data | model) * P(model) / P(data) =
# likelihood * prior / sum (likelihood * prior)
numerator <- likelihood * prior
denominator <- sum(numerator) # p(data|model)*p(model) or likelihood prior
format(sum(numerator), scientific=F)
[1] "0.03082257"
posterior <- numerator/denominator
sum(posterior) # should add up to 1
[1] 1
data = data.frame(p, prior,
format(likelihood, scientific=F),
format(numerator, scientific=F),
format(posterior, scientific=F))
# look at results, Model 2 w/ prior of .06 posterior .42 has highest post
# model 2 = 4 preg of 20, most likely model based posterior data
View(data)
paste(p, prior, likelihood, numerator, posterior) # another way
[1] "0.1 0.06 0.0897788281498717 0.0053867296889923 0.174765758805409"
[2] "0.2 0.06 0.218199401946101 0.013091964116766 0.424752526156148"
[3] "0.3 0.06 0.130420974373871 0.00782525846243224 0.253880798182627"
[4] "0.4 0.06 0.0349907904041583 0.0020994474242495 0.0681139658670461"
[5] "0.5 0.52 0.00462055206298827 0.0024026870727539 0.0779521998848127"
[6] "0.6 0.06 0.000269686150476595 1.61811690285957e-05 0.000524977945230834"
[7] "0.7 0.06 5.00755833151243e-06 3.00453499890746e-07 9.74784088413576e-06"
[8] "0.8 0.06 1.30056978431999e-08 7.80341870591997e-10 2.53172234389869e-08"
[9] "0.9 0.06 3.17880449999999e-13 1.90728269999999e-14 6.18794199016665e-13"
# if we add up posteriors of the 1st 4 models (.17 + .42 + .25 + .06)=.9,
# there is a probability of .9 that treatment is 90% effective
###################################################
# ex of M & M, models tested P(Yellow)=10% or 20% #
###################################################
# frequentist inference:
#hyp, H0 < 10%; HA > 10%, p=.05
# test this model, P(k>=1 | n=5, p=.10) sample of 5 M&M colors.
# test statistic is the number of Yellow M&Ms in sample
# 1 - .90^5 = .41, p>.05 (p value observed an extreme outcome if Hyp is true)
# failed to reject Null, S(0, 1, 1, 1, 1, 1) = 5 successes
# prob of 5 successes, not greater than 10%
# P(k>=1 | n=5, p=.10) = 1-p(k=0 | n=5, p=.10) = 1 - .90^5
1-.90^5 # compliment of no successess, .90 is p(no success)
[1] 0.40951
# compliment of success being 10%
# using the binom test
model1 = binom.test(1,5,.10, alternative="two.sided")
# other sample sizes
model2 = binom.test(2,10,.10, alternative="two.sided")
model3 = binom.test(3,15,.10, alternative="two.sided")
model4 = binom.test(4,20,.10, alternative="two.sided")
freq_prob_10per <- rbind(model1$p.value, model2$p.value, model3$p.value, model4$p.value)
n <- seq(5,20, by=5)
k <- seq(1:4)
cbind(n, k, freq_prob_10per)
n k
[1,] 5 1 0.4095100
[2,] 10 2 0.2639011
[3,] 15 3 0.1840611
[4,] 20 4 0.1329533
Freq_results = data.frame(n, k, freq_prob_10per)
View(Freq_results)
# Bayes inference:
# prior = .5 or both hypotheses
prior = .5
# hyp=H1 = 10%; H2 = 20%
# model1 # observed data: k=1, n=5
# likelihood = p(k=1|H1) = (5/1)*.10*.90^4 = .33
likemod1h1 = dbinom(1, size=5, prob=0.1)
# likelihood = p(k=1|H2) = (5/1)*.20*.80^4 = .41
likemod1h2 = dbinom(1, size=5, prob=0.2)
# posterior(H1|k=1) = P(H1)*p(k=1|H1) / (p(k=1)
# posterior(H1|k=1) = P(H2)*p(k=1|H2) / (p(k=1)
postmod1h1 = (prior * likemod1h1 ) /
((prior * likemod1h1) + (prior * likemod1h2))
postmod1h2 = (prior * likemod1h2 ) /
((prior * likemod1h2) + (prior * likemod1h1))
print(cbind(postmod1h1, postmod1h2))
postmod1h1 postmod1h2
[1,] 0.4447231 0.5552769
# w/ equal priors and small N, posteriors are similar, H2 is winner.
######
# other models of n samples and k successes
######
# model2 # observed data: k=2, n=10
likemod2h1 = dbinom(2, size=10, prob=0.1)
likemod2h2 = dbinom(2, size=10, prob=0.2)
postmod2h1 = (prior * likemod2h1 ) /
((prior * likemod2h1) + (prior * likemod2h2))
postmod2h2 = (prior * likemod2h2 ) /
((prior * likemod2h2) + (prior * likemod2h1))
print(cbind(postmod2h1, postmod2h2))
postmod2h1 postmod2h2
[1,] 0.3907811 0.6092189
# model3 # observed data: k=2, n=10
likemod3h1 = dbinom(3, size=15, prob=0.1)
likemod3h2 = dbinom(3, size=15, prob=0.2)
postmod3h1 = (prior * likemod3h1 ) /
((prior * likemod3h1) + (prior * likemod3h2))
postmod3h2 = (prior * likemod3h2 ) /
((prior * likemod3h2) + (prior * likemod3h1))
print(cbind(postmod3h1, postmod3h2))
postmod3h1 postmod3h2
[1,] 0.339383 0.660617
# model4 # observed data: k=2, n=10
likemod4h1 = dbinom(4, size=20, prob=0.1)
likemod4h2 = dbinom(4, size=20, prob=0.2)
postmod4h1 = (prior * likemod4h1 ) /
((prior * likemod4h1) + (prior * likemod4h2))
postmod4h2 = (prior * likemod4h2 ) /
((prior * likemod4h2) + (prior * likemod4h1))
print(cbind(postmod4h1, postmod4h2))
postmod4h1 postmod4h2
[1,] 0.2915103 0.7084897
posterior_10per <- c(postmod1h1, postmod2h1, postmod3h1, postmod4h1)
posterior_20per <- c(postmod1h2, postmod2h2, postmod3h2, postmod4h2)
cbind(posterior_10per, posterior_20per)
posterior_10per posterior_20per
[1,] 0.4447231 0.5552769
[2,] 0.3907811 0.6092189
[3,] 0.3393830 0.6606170
[4,] 0.2915103 0.7084897
# attach posterior results to frequentist results
final_results = data.frame(n, k, freq_prob_10per,
posterior_10per, posterior_20per)
print(final_results)
n k freq_prob_10per posterior_10per posterior_20per
1 5 1 0.4095100 0.4447231 0.5552769
2 10 2 0.2639011 0.3907811 0.6092189
3 15 3 0.1840611 0.3393830 0.6606170
4 20 4 0.1329533 0.2915103 0.7084897
################################################