library(tidyverse)
library(ggthemes)
sf_df <- read.csv("sf2016.csv")

Problem 1: The two-sample bootstrap

Writing the function

bootstrap <- function(x, y, stat, B, ratio = FALSE){
  n_x <- length(x)
  n_y <- length(y)
  stats1 <- numeric(length = B)
  stats2 <- numeric(length = B)
  for(b in 1:B) {
    boot_sample1 <- sample(x, 
                          size = n_x, 
                          replace = TRUE)
    stats1[b] <- stat(boot_sample1)
  
    boot_sample2 <- sample(y,
                           size = n_y, 
                           replace = TRUE)
    stats2[b] <- stat(boot_sample2)
  }
  if(ratio == FALSE){
    stats1 - stats2
  } else {
    stats1/stats2
  } 
}
pp_sample <-
  sf_df %>%
    filter(OrganizationGroup == "Public Protection") %>%
    sample_n(size = 500) %>%
    select(TotalSalary)
ch_sample <-
  sf_df %>%
    filter(OrganizationGroup == "Community Health") %>%
    sample_n(size = 500) %>%
    select(TotalSalary)

Running the two-sample boostrap

diff <- bootstrap(pp_sample$TotalSalary, ch_sample$TotalSalary, mean, 500, ratio = FALSE)
ggplot(data = data.frame(difference = diff), aes(x = difference))+
  geom_histogram(binwidth = 900, fill = "blue", alpha = 0.4)+
  labs(title = "Difference between average salaries", subtitle = "Community Health and Public Protection", x = "Average difference ($)", y = "Count")+
  geom_vline(xintercept = mean(diff))+
  theme_economist_white()

ggplot(data = data.frame(difference = diff), aes(x = difference))+
  geom_density(fill = "green", alpha = 0.4)+
  geom_vline(xintercept = mean(diff))+
  labs(title = "Difference between average salaries", subtitle = "Community Health and Public Protection", x = "Average difference ($)", y = "Density")

The bootstrap distribution shows the average difference between public protection and community health jobs. In diff, pp_sample is the x term and ch_sample is the y. This means that bootstrap subtracts the community health salaries from the public protection salaries. Since the differences are positive, public protection has a higher average total salary than community health. The distribution looks roughly normal.

ci_92 <- quantile(diff, probs = c(0.04, 0.96))
ci_92
##       4%      96% 
## 26722.64 40421.71

92% of the mean differences are between the 4% and 96% values above. If more samples were taken and this interval was was calculated, 92% of the intervals would contain the mean. In other words, there is a 92% chance that this interval contains the population mean.


Problem 2: The Monty Hall problem

Writing the functions

monty_hall <- function(switch = TRUE){
  doors <- c("W", "L1", "L2")
  pick1 <- sample(doors, 1)
  
  if(switch == FALSE & pick1 == "W"){
    return(1)
  } else if(switch == TRUE & pick1 == "W") {
    return(0)
  } else if(switch == FALSE & pick1 == "L1") {
    return(0)
  } else if(switch == TRUE & pick1 == "L1") {
    return(1)
  } else if(switch == FALSE & pick1 == "L2") {
    return(0)
  } else {
    return(1)
  } 
}
sim <- function(nsim, switch = TRUE) {
  result <- replicate(nsim, monty_hall(switch))
  result
}

Simulation results

sim_true <- sim(10000, TRUE)
head(sim_true, 20)
##  [1] 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 1
mean(sim_true)
## [1] 0.6637
sim_false <- sim(10000, FALSE)
head(sim_false, 20)
##  [1] 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0
mean(sim_false)
## [1] 0.3397

The simulations show that the probability of winning given switching is two-thirds and the probability of winning given not switching is one-third. The simulations prove that switching will improve your chances of winning.