library(purrr)
## Warning: package 'purrr' was built under R version 4.0.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats)
library(ggplot2)
library(ggjoy)
## Warning: package 'ggjoy' was built under R version 4.0.3
## Loading required package: ggridges
## Warning: package 'ggridges' was built under R version 4.0.3
## The ggjoy package has been deprecated. Please switch over to the
## ggridges package, which provides the same functionality. Porting
## guidelines can be found here:
## https://github.com/clauswilke/ggjoy/blob/master/README.md
library(lattice)
The role of probability distributions in bayesian data analysis is to represent uncertainty, and the role of bayesian inference is to updated probability distributions to reflect what has been learned from data
prop_model <-
function (data = c(), prior_prop = c(1, 1), n_draws = 10000,
show_plot = TRUE)
{
data <- as.logical(data)
proportion_success <- c(0, seq(0, 1, length.out = 100), 1)
data_indices <- round(seq(0, length(data), length.out = min(length(data) +
1, 20)))
post_curves <- map_dfr(data_indices, function(i) {
value <- ifelse(i == 0, "Prior", ifelse(data[i], "Success",
"Failure"))
label <- paste0("n=", i)
probability <- dbeta(proportion_success, prior_prop[1] +
sum(data[seq_len(i)]), prior_prop[2] + sum(!data[seq_len(i)]))
probability <- probability/max(probability)
data_frame(value, label, proportion_success, probability)
})
post_curves$label <- fct_rev(factor(post_curves$label, levels = paste0("n=",
data_indices)))
post_curves$value <- factor(post_curves$value, levels = c("Prior",
"Success", "Failure"))
p <- ggplot(post_curves, aes(x = proportion_success, y = label,
height = probability, fill = value)) + geom_joy(stat = "identity",
color = "white", alpha = 0.8, panel_scaling = TRUE, size = 1) +
scale_y_discrete("", expand = c(0.01, 0)) + scale_x_continuous("Underlying proportion of success") +
scale_fill_manual(values = hcl(120 * 2:0 + 15, 100, 65),
name = "", drop = FALSE, labels = c("Prior ", "Success ",
"Failure ")) + theme_light(base_size = 18) +
theme(legend.position = "top")
if (show_plot) {
print(p)
}
invisible(rbeta(n_draws, prior_prop[1] + sum(data), prior_prop[2] +
sum(!data)))
}
data <- c(1, 0, 0, 1, 0, 0 , 1)
prop_model(data)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Prior is a probablity distribution that represents what model knows before seeing the data
Posterior is a probability distribution that represents what model knows having seen the data
When fittin a bayesian model, the end result is always a posterior over some wuantity or parameter of interest. The porterior represents how uncertain the model is of the underlying value
data = c(1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0)
# Extract and explore the posterior
posterior <- prop_model(data)
head(posterior)
## [1] 0.3404846 0.1467915 0.1887210 0.3275088 0.3072024 0.1830295
hist(posterior, breaks = 30, xlim = c(0, 1), col = "palegreen4")
data = c(1, 0, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0)
posterior <- prop_model(data)
hist(posterior, breaks = 30, xlim = c(0, 1), col = "palegreen4")
# Calculate the median
median(posterior)
## [1] 0.1861541
# Calculate the credible interval
quantile(posterior, c(0.05, 0.95))
## 5% 95%
## 0.06115933 0.38694088
# Calculate the probability
sum(posterior > 0.07) / length(posterior)
## [1] 0.9301
# The generative zombie drug model
# Parameters
prop_success <- 0.42
n_zombies <- 100
# Simulating data
data <- c()
for(zombie in 1:n_zombies) {
data[zombie] <- runif(1, min = 0, max = 1) < prop_success
}
# Count cured
data <- sum( as.numeric(data) )
data
## [1] 47
To get more visitors to your website you are considering paying for an ad to be shown 100 times on a popular social media site. According to the social media site, their ads get clicked on 10% of the time.
# Fill in the parameters
n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- rbinom(n_samples, size = n_ads_shown,
prob = proportion_clicks)
# Visualize n_visitors
hist(n_visitors)
Represents how uncertain the model is about the parameters before seeing any data, adn ideally, this uncertainty should reflect our uncertainty
n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n_samples, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_samples, size = n_ads_shown, prob = proportion_clicks)
# Visualize proportion clicks
hist(proportion_clicks)
# Visualize n_visitors
hist(n_visitors)
# Create the prior data frame
prior <- data.frame(proportion_clicks, n_visitors)
# Examine the prior data frame
head(prior)
## proportion_clicks n_visitors
## 1 0.14092493 23
## 2 0.04463831 3
## 3 0.04127417 2
## 4 0.06175349 7
## 5 0.11637832 11
## 6 0.02619338 5
# Create the posterior data frame
posterior <- prior[prior$n_visitors == 13, ]
# Visualize posterior proportion clicks
hist(posterior$proportion_clicks)
In the last exercise, you updated the probability distribution over the underlying proportions of clicks (proportion_clicks) using new data. Now we want to use this updated proportion_clicks to predict how many visitors we would get if we reran the ad campaign.
# Assign posterior to a new variable called prior
prior <- posterior
# Take a look at the first rows in prior
head(prior)
## proportion_clicks n_visitors
## 25 0.1427897 13
## 38 0.1559213 13
## 78 0.1362270 13
## 80 0.1007337 13
## 83 0.1528802 13
## 99 0.1092731 13
# Replace prior$n_visitors with a new sample and visualize the result
n_samples <- nrow(prior)
n_ads_shown <- 100
prior$n_visitors <- rbinom(n_samples, size = n_ads_shown, prob = prior$proportion_clicks)
hist(prior$n_visitors)
# Calculate the probability that you will get 5 or more visitors
sum(prior$n_visitors >= 5) / length(prior$n_visitors)
## [1] 0.988343
The Beta distribution is a useful probability distribution when you want model uncertainty over a parameter bounded between 0 and 1. Here you’ll explore how the two parameters of the Beta distribution determine its shape.
# Explore using the rbeta function
beta_sample <- rbeta(n = 1000000, shape1 = 1, shape2 = 1)
# Visualize the results
hist(beta_sample)
# Explore using the rbeta function
beta_sample <- rbeta(n = 1000000, shape1 = 100, shape2 = 100)
# Visualize the results
hist(beta_sample)
# Explore using the rbeta function
beta_sample <- rbeta(n = 1000000, shape1 = 100, shape2 = 20)
# Visualize the results
hist(beta_sample)
n_draws <- 100000
n_ads_shown <- 100
# Change the prior on proportion_clicks
proportion_clicks <-
rbeta(n_draws, shape1 = 5, shape2 = 95)
n_visitors <-
rbinom(n_draws, size = n_ads_shown,
prob = proportion_clicks)
prior <-
data.frame(proportion_clicks, n_visitors)
posterior <-
prior[prior$n_visitors == 13, ]
# This plots the prior and the posterior in the same plot
par(mfcol = c(2, 1))
hist(prior$proportion_clicks,
xlim = c(0, 0.25))
hist(posterior$proportion_clicks,
xlim = c(0, 0.25))
Let’s fit the binomial model to both the video ad data (13 out of 100 clicked) and the new text ad data (6 out of a 100 clicked).
# Define parameters
n_draws <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n_draws, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_draws, size = n_ads_shown,
prob = proportion_clicks)
prior <- data.frame(proportion_clicks, n_visitors)
# Create the posteriors for video and text ads
posterior_video <- prior[prior$n_visitors == 13, ]
posterior_text <- prior[prior$n_visitors == 6, ]
# Visualize the posteriors
hist(posterior_video$proportion_clicks, xlim = c(0, 0.25))
hist(posterior_text$proportion_clicks, xlim = c(0, 0.25))
When you take the result of a statistical analysis and port-process it to make it more about what you really care about, in order yo make informed decisions
Each visitor spends USD 2.53 on average, a video ad costs USD 0.25 and a text ad costs USD 0.05. Let’s figure out the probable profit when using video ads and text ads
visitor_spend <- 2.53
video_cost <- 0.25
text_cost <- 0.05
# Add the column posterior$video_profit
posterior$video_profit <- posterior$video_prop * visitor_spend - video_cost
# Add the column posterior$text_profit
posterior$text_profit <- posterior$text_prop * visitor_spend - text_cost
# Visualize the video_profit and text_profit columns
hist(posterior$video_profit)
hist(posterior$text_profit)
# Add the column posterior$profit_diff
posterior$profit_diff <- posterior$video_profit - posterior$text_profit
# Visualize posterior$profit_diff
hist(posterior$profit_diff)
# Calculate a "best guess" for the difference in profits
median(posterior$profit_diff)
# Calculate the probability that text ads are better than video ads
mean(posterior$profit_diff < 0)
The Poisson distribution simulates a process where the outcome is a number of occurrences per day/year/area/unit/etc. Before using it in a Bayesian model, let’s explore it!
# Simulate from a Poisson distribution and visualize the result
x <- rpois(n = 10000, lambda = 11.5)
hist(x)
# Calculate the probability of break-even
mean(x >= 15)
## [1] 0.1807
When you put up a banner on your friend’s site you got 19 clicks in a day, how many daily clicks should you expect this banner to generate on average? Now, modify your model, one piece at a time, to calculate this.
# Change this model so that it uses a Poisson distribution
n_draws <- 100000
mean_clicks <- runif(n_draws, min = 0, max = 80)
n_visitors <- rpois(n = n_draws, mean_clicks)
prior <- data.frame(mean_clicks, n_visitors)
posterior <- prior[prior$n_visitors == 13, ]
hist(prior$mean_clicks)
hist(posterior$mean_clicks)
# Rewrite this code so that it uses dbinom instead of rbinom
n_ads_shown <- 100
proportion_clicks <- 0.1
prob_13_visitors <- dbinom(13,
size = n_ads_shown, prob = proportion_clicks)
prob_13_visitors
## [1] 0.07430209
# Change the code according to the instructions
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- seq(0, 100)
prob <- dbinom(n_visitors,
size = n_ads_shown, prob = proportion_clicks)
prob
## [1] 2.656140e-05 2.951267e-04 1.623197e-03 5.891602e-03 1.587460e-02
## [6] 3.386580e-02 5.957873e-02 8.889525e-02 1.148230e-01 1.304163e-01
## [11] 1.318653e-01 1.198776e-01 9.878801e-02 7.430209e-02 5.130383e-02
## [16] 3.268244e-02 1.929172e-02 1.059153e-02 5.426525e-03 2.602193e-03
## [21] 1.170987e-03 4.956559e-04 1.977617e-04 7.451890e-05 2.656461e-05
## [26] 8.972934e-06 2.875940e-06 8.758007e-07 2.537042e-07 6.998736e-08
## [31] 1.840408e-08 4.617512e-09 1.106279e-09 2.532895e-10 5.545880e-11
## [36] 1.161994e-11 2.331161e-12 4.480309e-13 8.253201e-14 1.457830e-14
## [41] 2.470212e-15 4.016606e-16 6.269305e-17 9.395858e-18 1.352434e-18
## [46] 1.870032e-19 2.484342e-20 3.171501e-21 3.890962e-22 4.587982e-23
## [51] 5.199713e-24 5.664175e-25 5.930440e-26 5.967739e-27 5.771270e-28
## [56] 5.363200e-29 4.788572e-30 4.107157e-31 3.383290e-32 2.676049e-33
## [61] 2.031815e-34 1.480375e-35 1.034671e-36 6.934301e-38 4.454325e-39
## [66] 2.741123e-40 1.615140e-41 9.106926e-43 4.910597e-44 2.530420e-45
## [71] 1.245128e-46 5.845669e-48 2.616117e-49 1.114936e-50 4.520010e-52
## [76] 1.741041e-53 6.363454e-55 2.203794e-56 7.220406e-58 2.234162e-59
## [81] 6.516307e-61 1.787738e-62 4.602579e-64 1.109055e-65 2.493907e-67
## [86] 5.216014e-69 1.010856e-70 1.807405e-72 2.966699e-74 4.444493e-76
## [91] 6.035732e-78 7.369636e-80 8.010474e-82 7.656367e-84 6.335055e-86
## [96] 4.445653e-88 2.572716e-90 1.178793e-92 4.009500e-95 9.000000e-98
## [101] 1.000000e-100
plot(n_visitors, prob, type = "h")
Out of a 100 impressions of the text ad, 6 out of a 100 clicked and visited your site.
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- seq(0, 100, by = 1)
pars <- expand.grid(proportion_clicks = proportion_clicks,
n_visitors = n_visitors)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2)
pars$likelihood <- dbinom(pars$n_visitors,
size = n_ads_shown, prob = pars$proportion_clicks)
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
# Condition on the data
pars <- pars[pars$n_visitors == 6, ]
# Normalize again
pars$probability <- pars$probability / sum(pars$probability)
# Plot the posterior pars$probability
plot(pars$proportion_clicks, pars$probability, type = "h")
# Simplify the code below by directly conditioning on the data
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- 6
pars <- expand.grid(proportion_clicks = proportion_clicks)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2)
pars$likelihood <- dbinom(n_visitors,
size = n_ads_shown, prob = pars$proportion_clicks)
# BAYES THEOREM!!!!!!!!!!!!!!!!!!!!!!!!
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
plot(pars$proportion_clicks, pars$probability, type = "h")
\[ P(\theta | D) = \frac{P(D | \theta) \cdot P(\theta)}{\sum P(D | \theta) P(\theta)} \]
There are many more algorithms to fit bayesian models
# Assign mu and sigma
mu <- 3500
sigma <- 600
weight_distr <- rnorm(n = 100000, mean = mu, sd = sigma)
hist(weight_distr, 60, xlim = c(0, 6000), col = "lightgreen")
# Create weight
weight <- seq(0, 6000, by = 100)
# Calculate likelihood
likelihood <- dnorm(weight, mu, sigma)
# Plot the distribution of weight
plot(weight, likelihood, type = "h")
temp <- c(19, 23, 20, 17, 23)
mu <- seq(8, 30, by = 0.5)
sigma <- seq(0.1, 10, by = 0.3)
pars <- expand.grid(mu = mu, sigma = sigma)
pars$mu_prior <- dnorm(pars$mu, mean = 18, sd = 5)
pars$sigma_prior <- dunif(pars$sigma, min = 0, max = 10)
pars$prior <- pars$mu_prior * pars$sigma_prior
for(i in 1:nrow(pars)) {
likelihoods <- dnorm(temp, pars$mu[i], pars$sigma[i])
pars$likelihood[i] <- prod(likelihoods)
}
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
levelplot(probability ~ mu * sigma, data = pars)
sample_indices <- sample(1:nrow(pars), size = 10000,
replace= TRUE, prob = pars$probability)
pars_sample <- pars[sample_indices, c("mu", "sigma")]
hist(pars_sample$mu, 30)
# probability of distribution over the mean temperature
quantile(pars_sample$mu, c(0.25, 0.95))
## 25% 95%
## 19.5 22.5
pred_temp <- rnorm(10000, mean = pars_sample$mu, sd = pars_sample$sigma)
hist(pred_temp, 30)
# probabiliy fo temperature will be above 18
sum(pred_temp >= 18) / length(pred_temp )
## [1] 0.7305
# The IQ of zombies on a regular diet and a brain based diet.
iq_brains <- c(44, 52, 42, 66, 53, 42, 55, 57, 56, 51)
iq_regular <- c(55, 44, 34, 18, 51, 40, 40, 49, 48, 46)
# Calculate the mean difference in IQ between the two groups
mean(iq_brains) - mean(iq_regular)
# Fit the BEST model to the data from both groups
library(BEST)
best_posterior <- BESTmcmc(iq_brains, iq_regular)
# Plot the model result
plot(best_posterior)