library(purrr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forcats)
library(ggplot2)
library(ggridges)
library(lattice)

Bayesian inference

Bayesian data analysis

  • The use of Bayesian inference to learn from data
  • Can be used for hypothesis testing, linear regression, and more
  • Is flexible and allows you to construct problem-specific models

Probability

  • A statement about certainty / uncertainty

The role of probability distributions in Bayesian data analysis is to represent uncertainty, and the role of Bayesian inference is to update probability distributions to reflect what has been learned from data.

A Bayesian model for the proportion of success

prop_model <- function(data = c(), prior_prop = c(1, 1), n_draws = 10000,
                       show_plot = TRUE) {
  data <- as.logical(data)
  # Grid over the underlying proportion of success
  proportion_success <- c(0, seq(0, 1, length.out = 100), 1)
  # At most 20 snapshots of the posterior, starting from the prior (n = 0)
  data_indices <- round(seq(0, length(data),
                            length.out = min(length(data) + 1, 20)))
  post_curves <- map_dfr(data_indices, function(i) {
    value <- ifelse(i == 0, "Prior",
                    ifelse(data[i], "Success", "Failure"))
    label <- paste0("n=", i)
    # Beta posterior after seeing the first i data points
    probability <- dbeta(proportion_success,
                         prior_prop[1] + sum(data[seq_len(i)]),
                         prior_prop[2] + sum(!data[seq_len(i)]))
    probability <- probability / max(probability)
    tibble(value, label, proportion_success, probability)
  })
  post_curves$label <- fct_rev(factor(post_curves$label,
                                      levels = paste0("n=", data_indices)))
  post_curves$value <- factor(post_curves$value,
                              levels = c("Prior", "Success", "Failure"))
  p <- ggplot(post_curves, aes(x = proportion_success, y = label,
                               height = probability, fill = value)) +
    geom_density_ridges(stat = "identity", color = "white", alpha = 0.8,
                        panel_scaling = TRUE, size = 1) +
    scale_y_discrete("", expand = c(0.01, 0)) +
    scale_x_continuous("Underlying proportion of success") +
    scale_fill_manual(values = hcl(120 * 2:0 + 15, 100, 65), name = "",
                      drop = FALSE,
                      labels = c("Prior   ", "Success   ", "Failure   ")) +
    theme_light(base_size = 18) +
    theme(legend.position = "top")
  if (show_plot) {
    print(p)
  }
  # Invisibly return draws from the final posterior
  invisible(rbeta(n_draws, prior_prop[1] + sum(data),
                  prior_prop[2] + sum(!data)))
}


data <- c(1, 0, 0, 1, 0, 0, 1)
prop_model(data)

Priors and posteriors

When fitting a Bayesian model, the end result is always a posterior over some quantity or parameter of interest. The posterior represents how uncertain the model is about the underlying value.

data <- c(1, 0, 0, 1, 0, 0,
          0, 0, 0, 0, 0, 0, 0)

# Extract and explore the posterior
posterior <- prop_model(data)

head(posterior)
## [1] 0.3404846 0.1467915 0.1887210 0.3275088 0.3072024 0.1830295
hist(posterior, breaks = 30, xlim = c(0, 1), col = "palegreen4")

# Calculate the median
median(posterior)
## [1] 0.1861541
# Calculate a 90% credible interval
quantile(posterior, c(0.05, 0.95))
##         5%        95% 
## 0.06115933 0.38694088
# Calculate the probability that the underlying proportion is above 7%
sum(posterior > 0.07) / length(posterior)
## [1] 0.9301

How Bayesian inference actually works

# The generative zombie drug model

# Parameters
prop_success <- 0.42
n_zombies <- 100

# Simulating data
data <- c()
for(zombie in 1:n_zombies) {
  data[zombie] <- runif(1, min = 0, max = 1) < prop_success
}

# Count cured
data <- sum(as.numeric(data))
data
## [1] 47
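
The loop above draws one Bernoulli outcome per zombie and then sums the cures. As a minimal equivalent sketch, the same generative model is a single binomial draw with rbinom:

# One binomial draw is the sum of n_zombies Bernoulli trials
data <- rbinom(n = 1, size = n_zombies, prob = prop_success)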

Website example

To get more visitors to your website, you are considering paying for an ad to be shown 100 times on a popular social media site. According to the social media site, their ads get clicked on 10% of the time.

# Fill in the parameters
n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- rbinom(n_samples, size = n_ads_shown, 
                     prob = proportion_clicks)
# Visualize n_visitors
hist(n_visitors)
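
As a quick added sanity check, the simulated average should sit close to n_ads_shown * proportion_clicks = 10:

# The mean click count across simulations should be about 10
mean(n_visitors)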

Prior probability distribution

Represents how uncertain the model is about the parameters before seeing any data; ideally, this uncertainty should reflect our own prior uncertainty.

n_samples <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n_samples, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_samples, size = n_ads_shown, prob = proportion_clicks)

# Visualize proportion clicks
hist(proportion_clicks)

# Visualize n_visitors
hist(n_visitors)

  • Bayesian inference is conditioning on data in order to learn about parameter values

# Create the prior data frame
prior <- data.frame(proportion_clicks, n_visitors)

# Examine the prior data frame
head(prior)
##   proportion_clicks n_visitors
## 1        0.14092493         23
## 2        0.04463831          3
## 3        0.04127417          2
## 4        0.06175349          7
## 5        0.11637832         11
## 6        0.02619338          5
# Create the posterior data frame
posterior <- prior[prior$n_visitors == 13, ]

# Visualize posterior proportion clicks
hist(posterior$proportion_clicks)

In the last exercise, you updated the probability distribution over the underlying proportions of clicks (proportion_clicks) using new data. Now we want to use this updated proportion_clicks to predict how many visitors we would get if we reran the ad campaign.

# Assign posterior to a new variable called prior
prior <- posterior

# Take a look at the first rows in prior
head(prior)
##    proportion_clicks n_visitors
## 25         0.1427897         13
## 38         0.1559213         13
## 78         0.1362270         13
## 80         0.1007337         13
## 83         0.1528802         13
## 99         0.1092731         13
# Replace prior$n_visitors with a new sample and visualize the result
n_samples <- nrow(prior)
n_ads_shown <- 100
prior$n_visitors <- rbinom(n_samples, size = n_ads_shown, prob = prior$proportion_clicks)
hist(prior$n_visitors)

# Calculate the probability that you will get 5 or more visitors
sum(prior$n_visitors >= 5) / length(prior$n_visitors)
## [1] 0.988343

Bayesian data analysis

Bayes is flexible

  1. You can include information sources in addition to the data
  • Background information
  • Expert opinion
  • Common knowledge
  2. You can make any comparisons between groups or datasets
  3. You can use the result of a Bayesian analysis to do decision analysis
  4. You can change the underlying statistical model

The Beta distribution is a useful probability distribution when you want to model uncertainty over a parameter bounded between 0 and 1. Here you’ll explore how the two parameters of the Beta distribution determine its shape.

# Explore using the rbeta function
beta_sample <- rbeta(n = 1000000, shape1 = 1, shape2 = 1)

# Visualize the results
hist(beta_sample)

# Explore using the rbeta function
beta_sample <- rbeta(n = 1000000, shape1 = 100, shape2 = 100)

# Visualize the results
hist(beta_sample)

# Explore using the rbeta function
beta_sample <- rbeta(n = 1000000, shape1 = 100, shape2 = 20)

# Visualize the results
hist(beta_sample)
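
The three samples above can also be compared through their exact density curves; a small added sketch using dbeta:

# Overlay the Beta densities for the three shape settings explored above
x <- seq(0, 1, length.out = 200)
plot(x, dbeta(x, 1, 1), type = "l", ylim = c(0, 12),
     xlab = "proportion", ylab = "density")
lines(x, dbeta(x, 100, 100), col = "blue")
lines(x, dbeta(x, 100, 20), col = "red")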

n_draws <- 100000
n_ads_shown <- 100

# Change the prior on proportion_clicks
proportion_clicks <- 
  rbeta(n_draws, shape1 = 5, shape2 = 95)
n_visitors <- 
  rbinom(n_draws, size = n_ads_shown, 
         prob = proportion_clicks)
prior <- 
  data.frame(proportion_clicks, n_visitors)
posterior <- 
  prior[prior$n_visitors == 13, ]

# This plots the prior and the posterior in the same plot
par(mfcol = c(2, 1))
hist(prior$proportion_clicks, 
     xlim = c(0, 0.25))
hist(posterior$proportion_clicks, 
     xlim = c(0, 0.25))

Comparisons

Let’s fit the binomial model to both the video ad data (13 out of 100 clicked) and the new text ad data (6 out of 100 clicked).

# Define parameters
n_draws <- 100000
n_ads_shown <- 100
proportion_clicks <- runif(n_draws, min = 0.0, max = 0.2)
n_visitors <- rbinom(n = n_draws, size = n_ads_shown, 
                     prob = proportion_clicks)
prior <- data.frame(proportion_clicks, n_visitors)

# Create the posteriors for video and text ads
posterior_video <- prior[prior$n_visitors == 13, ]
posterior_text <- prior[prior$n_visitors == 6, ]

# Visualize the posteriors
hist(posterior_video$proportion_clicks, xlim = c(0, 0.25))

hist(posterior_text$proportion_clicks, xlim = c(0, 0.25))

Decision analysis

When you take the result of a statistical analysis and post-process it to make it more about what you really care about, in order to make informed decisions.

Decision analysis example

Each visitor spends USD 2.53 on average, a video ad costs USD 0.25 and a text ad costs USD 0.05. Let’s figure out the probable profit when using video ads and text ads.
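
The profit calculations below assume a data frame posterior with columns video_prop and text_prop holding paired draws from the two posteriors; a minimal sketch to assemble it from posterior_video and posterior_text above:

# Pair up draws from the two posteriors; sampling with replacement makes
# both columns the same length
n_pairs <- 4000
posterior <- data.frame(
  video_prop = sample(posterior_video$proportion_clicks, n_pairs, replace = TRUE),
  text_prop  = sample(posterior_text$proportion_clicks,  n_pairs, replace = TRUE))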

visitor_spend <- 2.53
video_cost <- 0.25
text_cost <- 0.05

# Add the column posterior$video_profit
posterior$video_profit <- posterior$video_prop * visitor_spend - video_cost

# Add the column posterior$text_profit
posterior$text_profit <- posterior$text_prop * visitor_spend - text_cost

# Visualize the video_profit and text_profit columns
hist(posterior$video_profit)
hist(posterior$text_profit)
# Add the column posterior$profit_diff
posterior$profit_diff <- posterior$video_profit - posterior$text_profit

# Visualize posterior$profit_diff
hist(posterior$profit_diff)

# Calculate a "best guess" for the difference in profits
median(posterior$profit_diff)

# Calculate the probability that text ads are better than video ads
mean(posterior$profit_diff < 0)

Poisson distribution

The Poisson distribution simulates a process where the outcome is a number of occurrences per day/year/area/unit/etc. Before using it in a Bayesian model, let’s explore it!

# Simulate from a Poisson distribution and visualize the result
x <- rpois(n = 10000, lambda = 11.5)
hist(x)

# Calculate the probability of break-even (at least 15 occurrences)
mean(x >= 15)
## [1] 0.1807
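
The simulated estimate can be cross-checked exactly with the Poisson distribution function (an added aside):

# Exact P(x >= 15) for lambda = 11.5, cf. the simulated estimate above
1 - ppois(14, lambda = 11.5)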

Clicks per day instead of clicks per ad

When you put up a banner on your friend’s site, you got 19 clicks in a day. How many daily clicks should you expect this banner to generate on average? Now, modify your model, one piece at a time, to calculate this.

# Change this model so that it uses a Poisson distribution
n_draws <- 100000
mean_clicks <- runif(n_draws, min = 0, max = 80)
n_visitors <- rpois(n = n_draws, lambda = mean_clicks)

prior <- data.frame(mean_clicks, n_visitors)
posterior <- prior[prior$n_visitors == 19, ]


hist(prior$mean_clicks)

hist(posterior$mean_clicks)

Probability theory

  • P(n_visitors = 13) is a probability
  • P(n_visitors) is a probability distribution
  • P(n_visitors = 13 | prop_clicks = 10%) is a conditional probability
  • P(n_visitors | prop_clicks = 10%) is a conditional probability distribution

# Rewrite this code so that it uses dbinom instead of rbinom
n_ads_shown <- 100
proportion_clicks <- 0.1
prob_13_visitors <- dbinom(13, 
    size = n_ads_shown, prob = proportion_clicks)
prob_13_visitors
## [1] 0.07430209
# Change the code according to the instructions
n_ads_shown <- 100
proportion_clicks <- 0.1
n_visitors <- seq(0, 100)
prob <- dbinom(n_visitors, 
    size = n_ads_shown, prob = proportion_clicks)
prob
##   [1]  2.656140e-05  2.951267e-04  1.623197e-03  5.891602e-03  1.587460e-02
##   [6]  3.386580e-02  5.957873e-02  8.889525e-02  1.148230e-01  1.304163e-01
##  [11]  1.318653e-01  1.198776e-01  9.878801e-02  7.430209e-02  5.130383e-02
##  [16]  3.268244e-02  1.929172e-02  1.059153e-02  5.426525e-03  2.602193e-03
##  [21]  1.170987e-03  4.956559e-04  1.977617e-04  7.451890e-05  2.656461e-05
##  [26]  8.972934e-06  2.875940e-06  8.758007e-07  2.537042e-07  6.998736e-08
##  [31]  1.840408e-08  4.617512e-09  1.106279e-09  2.532895e-10  5.545880e-11
##  [36]  1.161994e-11  2.331161e-12  4.480309e-13  8.253201e-14  1.457830e-14
##  [41]  2.470212e-15  4.016606e-16  6.269305e-17  9.395858e-18  1.352434e-18
##  [46]  1.870032e-19  2.484342e-20  3.171501e-21  3.890962e-22  4.587982e-23
##  [51]  5.199713e-24  5.664175e-25  5.930440e-26  5.967739e-27  5.771270e-28
##  [56]  5.363200e-29  4.788572e-30  4.107157e-31  3.383290e-32  2.676049e-33
##  [61]  2.031815e-34  1.480375e-35  1.034671e-36  6.934301e-38  4.454325e-39
##  [66]  2.741123e-40  1.615140e-41  9.106926e-43  4.910597e-44  2.530420e-45
##  [71]  1.245128e-46  5.845669e-48  2.616117e-49  1.114936e-50  4.520010e-52
##  [76]  1.741041e-53  6.363454e-55  2.203794e-56  7.220406e-58  2.234162e-59
##  [81]  6.516307e-61  1.787738e-62  4.602579e-64  1.109055e-65  2.493907e-67
##  [86]  5.216014e-69  1.010856e-70  1.807405e-72  2.966699e-74  4.444493e-76
##  [91]  6.035732e-78  7.369636e-80  8.010474e-82  7.656367e-84  6.335055e-86
##  [96]  4.445653e-88  2.572716e-90  1.178793e-92  4.009500e-95  9.000000e-98
## [101] 1.000000e-100
plot(n_visitors, prob, type = "h")

Bayesian calculation

Out of 100 impressions of the text ad, 6 people clicked and visited your site.

n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- seq(0, 100, by = 1)
pars <- expand.grid(proportion_clicks = proportion_clicks,
                    n_visitors = n_visitors)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2)
pars$likelihood <- dbinom(pars$n_visitors, 
    size = n_ads_shown, prob = pars$proportion_clicks)
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
# Condition on the data 
pars <- pars[pars$n_visitors == 6, ]
# Normalize again
pars$probability <- pars$probability / sum(pars$probability)
# Plot the posterior pars$probability
plot(pars$proportion_clicks, pars$probability, type = "h")

# Simplify the code below by directly conditioning on the data
n_ads_shown <- 100
proportion_clicks <- seq(0, 1, by = 0.01)
n_visitors <- 6
pars <- expand.grid(proportion_clicks = proportion_clicks)
pars$prior <- dunif(pars$proportion_clicks, min = 0, max = 0.2) 
pars$likelihood <- dbinom(n_visitors, 
    size = n_ads_shown, prob = pars$proportion_clicks)

# Bayes' theorem: prior times likelihood, then normalize
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)
plot(pars$proportion_clicks, pars$probability, type = "h")

Bayes theorem

  • The posterior: the probability of different parameter values given some data \(P(\theta | D)\)
  • The likelihood: the (relative) probability of the data given different parameter values \(P(D | \theta)\)
  • The prior: the probability of different parameter values before seeing the data \(P(\theta)\)
  • The total sum of the likelihood weighted by the prior, \(\sum P(D | \theta) P(\theta)\), which normalizes the posterior

\[ P(\theta | D) = \frac{P(D | \theta) \cdot P(\theta)}{\sum P(D | \theta) P(\theta)} \]

Grid approximation

  • Define a grid over all the parameter combinations you need to evaluate
  • An approximation, as it’s often impossible to try all parameter combinations (see the sketch below)

There are many more algorithms to fit Bayesian models.
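
As a toy sketch of the first step, expand.grid builds the grid of parameter combinations (the same call the water temperature model below uses):

# A tiny grid over two hypothetical parameters
pars_grid <- expand.grid(mu = c(1, 2, 3), sigma = c(0.5, 1.0))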

# Assign mu and sigma
mu <- 3500
sigma <- 600

weight_distr <- rnorm(n = 100000, mean = mu, sd = sigma)
hist(weight_distr, 60, xlim = c(0, 6000), col = "lightgreen")

# Create weight
weight <- seq(0, 6000, by = 100)

# Calculate likelihood
likelihood <- dnorm(weight, mu, sigma)

# Plot the likelihood across the weight grid
plot(weight, likelihood, type = "h")

Bayesian model of water temperature

  • \(\mu \sim Normal(mean:18, sd:5)\)
  • \(\sigma \sim Uniform(min: 0, max: 10)\)
  • \(temp_{i} \sim Normal(\mu, \sigma)\)
  • \(temp = 19, 23, 20, 17, 23\)

temp <- c(19, 23, 20, 17, 23)
mu <- seq(8, 30, by = 0.5)
sigma <- seq(0.1, 10, by = 0.3)
pars <- expand.grid(mu = mu, sigma = sigma)
pars$mu_prior <- dnorm(pars$mu, mean = 18, sd = 5)
pars$sigma_prior <- dunif(pars$sigma, min = 0, max = 10)
pars$prior <- pars$mu_prior * pars$sigma_prior
# The likelihood of all data points for each parameter combination
for (i in 1:nrow(pars)) {
  likelihoods <- dnorm(temp, pars$mu[i], pars$sigma[i])
  pars$likelihood[i] <- prod(likelihoods)
}
pars$probability <- pars$likelihood * pars$prior
pars$probability <- pars$probability / sum(pars$probability)


levelplot(probability ~ mu * sigma, data = pars)

sample_indices <- sample(1:nrow(pars), size = 10000,
                         replace = TRUE, prob = pars$probability)
pars_sample <- pars[sample_indices, c("mu", "sigma")]

hist(pars_sample$mu, 30)

# Quantiles of the posterior distribution over the mean temperature
quantile(pars_sample$mu, c(0.25, 0.95))
##  25%  95% 
## 19.5 22.5
pred_temp <- rnorm(10000, mean = pars_sample$mu, sd = pars_sample$sigma)
hist(pred_temp, 30)

# Probability that the temperature will be 18 or above
sum(pred_temp >= 18) / length(pred_temp)
## [1] 0.7305

BEST

  • Assumes the data come from a t-distribution
  • t-distribution: similar to the normal distribution, but with an extra parameter, the degrees of freedom, which governs how likely the t-distribution is to generate outliers far from its center
  • When the degrees of freedom are > 30, the t-distribution is close to the normal distribution; when they are low, it generates more and more outliers (see the sketch below)
  • Uses the Markov chain Monte Carlo (MCMC) method
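
A quick added sketch (not from the original notes) of how the degrees of freedom govern outliers, using draws from rt:

# Low df produces far heavier tails than high df (which is near-normal)
few_df  <- rt(n = 10000, df = 2)
many_df <- rt(n = 10000, df = 100)
quantile(few_df,  c(0.001, 0.999))
quantile(many_df, c(0.001, 0.999))
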
# The IQ of zombies on a regular diet and a brain based diet.
iq_brains <- c(44, 52, 42, 66, 53, 42, 55, 57, 56, 51)
iq_regular <- c(55, 44, 34, 18, 51, 40, 40, 49, 48, 46)

# Calculate the mean difference in IQ between the two groups
mean(iq_brains) - mean(iq_regular)

# Fit the BEST model to the data from both groups
library(BEST)
best_posterior <- BESTmcmc(iq_brains, iq_regular)

# Plot the model result
plot(best_posterior)