Problem 1: The two-sample bootstrap

library(tidyverse)
sf2016 <- read.csv("https://github.com/cmsc205/notes/raw/master/day18/sf2016.csv")

Writing the function

# Put the code for your bootstrap2 function in this code chunk
bootstrap2 <- function(x, y, stat, B, ratio){
  n <- length(x)
  m <- length(y)
  stats <- numeric(length = B)
  stats1 <- numeric(length = B)
  
  for(i in 1:B){
    boot_sample_x <- sample(x, size = n, replace = TRUE)
    boot_sample_y <- sample(y, size = m, replace = TRUE)
    
    stats[i] <- stat(boot_sample_x)
    stats1[i] <- stat(boot_sample_y)
  }
  
    if(ratio == FALSE){
      boot_sample_x - boot_sample_y
    } else{
      boot_sample_x / boot_sample_y
      }
  
  
}

Running the two-sample boostrap

# Put the code to create the two-sample bootstrap here
##tables with all values to make pop_mean_diff
publicT <- sf2016 %>%
  filter(OrganizationGroup == "Public Protection") %>%
  select(OrganizationGroup, TotalSalary) 
  
commT <- sf2016 %>%
  filter(OrganizationGroup == "Community Health") %>%
  select(OrganizationGroup, TotalSalary)
  
##tables with sample of 500 values 
public <- sf2016 %>%
  filter(OrganizationGroup == "Public Protection") %>%
  select(OrganizationGroup, TotalSalary) %>%
  sample_n(size = 500)

comm <- sf2016 %>%
  filter(OrganizationGroup == "Community Health") %>%
  select(OrganizationGroup, TotalSalary) %>%
  sample_n(size = 500)
##vector of avg mean differences 
mean_diff <- bootstrap2(public$TotalSalary, comm$TotalSalary, mean, 10000, ratio = FALSE)

ggplot(data = data.frame(diff = mean_diff), aes(x = diff)) +
  geom_histogram() +
  labs(x = "Average Difference in Salary")

Briefly describe the bootstrap distribution here.

The bootstrap distribution seems pretty normal i.e. gaussian. This means that it is a pretty good approximation of the population.

# Put the code used to create your confidence interval here
pop_mean_diff <- mean(publicT$TotalSalary - commT$TotalSalary)

ci <- quantile(mean_diff, probs = c(.04, .96))
ci

##        4%       96% 
## -115256.3  178167.4

ggplot(data = data.frame(diff = mean_diff), aes(x = diff)) +
  geom_histogram() +
  labs(x = "Average Difference in Salary") +
  geom_segment(x = ci[1], y = 0, xend = ci[2], yend = 0, color = "gold", size = 2) +
  geom_point(x = pop_mean_diff, y = 0, color = "red")

Provide an interpretation of the interval here.

I can say that, with 92% confidence, the mean of the difference in salary is between the two values given by the confidence interval. Those values will change every time I run the bootstrap function, but my confidence of 92% will not. I have calculated the mean average salary difference of the population and placed it as a red point on the histogram. The CI is represented by a yellow bar. I can say with 92% confidence that the red point will be in between the yellow bar.

Problem 2: The Monty Hall problem

Writing the functions

# Put the code for your monty_hall function in this code chunk
monty_hall <- function(switch = TRUE){
  win <- 0
  doors <- 1:3
  prize <- sample(1:3, size = 1)
  player_guess <- sample(1:3, size =1)
  
  if(player_guess != prize){
    monty_door <- doors[-c(prize, player_guess)]
  }else {
    monty_door <- sample(doors[-c(prize, player_guess)], size = 1)
  }
  
  if(switch == TRUE){
    new_guess <- doors[-c(monty_door, player_guess)]
    if(new_guess == prize){
      #print(paste("You chose door", new_guess, "the winning door was door", prize, "congrats!"))
      win <- win +1
    }else{
      #print(paste("You lose.  You chose door", new_guess, "the winning door was door", prize))
    }
    
  }else{
    if(player_guess == prize){
      #print(paste("You chose door", player_guess, "the winning door was door", prize, "congrats!"))
      win <- win + 1
      
    }else{
      #print(paste("You lose.  You chose door", player_guess, "the winning door was door", prize))
      
    }
  }
  win
}

  


monty_hall()

## [1] 1

#Put the code for your sim function in this code chunk
sim1 <- function(nsim, switch = TRUE){
  replicate(n = nsim, expr = monty_hall(switch))
}

Simulation results

# Put the code used to run the simulations here
# This should be rather short, since you are using your functions
switch <- sum(sim1(10000))
stay <- sum(sim1(10000, FALSE))
print(paste("Win count when choosing to switch:", switch, "/ Proportion:", switch/10000))

## [1] "Win count when choosing to switch: 6686 / Proportion: 0.6686"

print(paste("Win count when choosing to stay:", stay, "/ Proportion:", stay/10000))

## [1] "Win count when choosing to stay: 3355 / Proportion: 0.3355"

Briefly describe your simulation results here.

It looks like a simulation of 10,000, while switching 100% of the time, yields around 6600-6800 wins. The proportion of switching 100% of the time is around 2/3. For not switching at all, the simulation yields 3200-3400 wins– thus, the proportion when not switching at all is around 1/3. The best way to succeed is to switch 100% of the time, seeing as you have a 2/3 chance of winning.

#Alt method, did first.  Realized wasn't efficient.
sim <- function(nsim, switch = TRUE){
  win <- 0
  for(i in 1:nsim){
    doors <- 1:3
    prize <- sample(1:3, size = 1)
    player_guess <- sample(1:3, size =1)
    
    if(player_guess != prize){
      monty_door <- doors[-c(prize, player_guess)]
    }else {
      monty_door <- sample(doors[-c(prize, player_guess)], size = 1)
    }
    
    if(switch == TRUE){
      new_guess <- doors[-c(monty_door, player_guess)]
      if(new_guess == prize){
        #print(paste("You chose door", new_guess, "the winning door was door", prize, "congrats!"))
        win <- win + 1
      }else{
        #print(paste("You lose.  You chose door", new_guess, "the winning door was door", prize))
      }
      
    }else{
      if(player_guess == prize){
        #print(paste("You chose door", player_guess, "the winning door was door", prize, "congrats!"))
        win <- win + 1
      }else{
        #print(paste("You lose.  You chose door", player_guess, "the winning door was door", prize))
        
      }
    }
  }
  print(paste("Win count is", win))
  print(paste("Proportion of wins to times played is", win/nsim))
}

sim(10000, TRUE)

## [1] "Win count is 6693"
## [1] "Proportion of wins to times played is 0.6693"

sim(10000, FALSE)

## [1] "Win count is 3340"
## [1] "Proportion of wins to times played is 0.334"