| title: “Homework 4” |
| author: “Anthony M. Ortiz” |
| date: “Due Wednesday, May 24 by 9:50 a.m.” |
| output: github_document |
library(tidyverse)
sf2016 <- read.csv("https://github.com/cmsc205/notes/raw/master/day18/sf2016.csv")
# Put the code for your bootstrap2 function in this code chunk
bootstrap2 <- function(x, y, stat, B, ratio){
n <- length(x)
m <- length(y)
stats <- numeric(length = B)
stats1 <- numeric(length = B)
for(i in 1:B){
boot_sample_x <- sample(x, size = n, replace = TRUE)
boot_sample_y <- sample(y, size = m, replace = TRUE)
stats[i] <- stat(boot_sample_x)
stats1[i] <- stat(boot_sample_y)
}
if(ratio == FALSE){
boot_sample_x - boot_sample_y
} else{
boot_sample_x / boot_sample_y
}
}
# Put the code to create the two-sample bootstrap here
##tables with all values to make pop_mean_diff
publicT <- sf2016 %>%
filter(OrganizationGroup == "Public Protection") %>%
select(OrganizationGroup, TotalSalary)
commT <- sf2016 %>%
filter(OrganizationGroup == "Community Health") %>%
select(OrganizationGroup, TotalSalary)
##tables with sample of 500 values
public <- sf2016 %>%
filter(OrganizationGroup == "Public Protection") %>%
select(OrganizationGroup, TotalSalary) %>%
sample_n(size = 500)
comm <- sf2016 %>%
filter(OrganizationGroup == "Community Health") %>%
select(OrganizationGroup, TotalSalary) %>%
sample_n(size = 500)
##vector of avg mean differences
mean_diff <- bootstrap2(public$TotalSalary, comm$TotalSalary, mean, 10000, ratio = FALSE)
ggplot(data = data.frame(diff = mean_diff), aes(x = diff)) +
geom_histogram() +
labs(x = "Average Difference in Salary")
Briefly describe the bootstrap distribution here.
The bootstrap distribution seems pretty normal i.e. gaussian. This means that it is a pretty good approximation of the population.
# Put the code used to create your confidence interval here
pop_mean_diff <- mean(publicT$TotalSalary - commT$TotalSalary)
ci <- quantile(mean_diff, probs = c(.04, .96))
ci
## 4% 96%
## -115256.3 178167.4
ggplot(data = data.frame(diff = mean_diff), aes(x = diff)) +
geom_histogram() +
labs(x = "Average Difference in Salary") +
geom_segment(x = ci[1], y = 0, xend = ci[2], yend = 0, color = "gold", size = 2) +
geom_point(x = pop_mean_diff, y = 0, color = "red")
Provide an interpretation of the interval here.
I can say that, with 92% confidence, the mean of the difference in salary is between the two values given by the confidence interval. Those values will change every time I run the bootstrap function, but my confidence of 92% will not. I have calculated the mean average salary difference of the population and placed it as a red point on the histogram. The CI is represented by a yellow bar. I can say with 92% confidence that the red point will be in between the yellow bar.
# Put the code for your monty_hall function in this code chunk
monty_hall <- function(switch = TRUE){
win <- 0
doors <- 1:3
prize <- sample(1:3, size = 1)
player_guess <- sample(1:3, size =1)
if(player_guess != prize){
monty_door <- doors[-c(prize, player_guess)]
}else {
monty_door <- sample(doors[-c(prize, player_guess)], size = 1)
}
if(switch == TRUE){
new_guess <- doors[-c(monty_door, player_guess)]
if(new_guess == prize){
#print(paste("You chose door", new_guess, "the winning door was door", prize, "congrats!"))
win <- win +1
}else{
#print(paste("You lose. You chose door", new_guess, "the winning door was door", prize))
}
}else{
if(player_guess == prize){
#print(paste("You chose door", player_guess, "the winning door was door", prize, "congrats!"))
win <- win + 1
}else{
#print(paste("You lose. You chose door", player_guess, "the winning door was door", prize))
}
}
win
}
monty_hall()
## [1] 1
#Put the code for your sim function in this code chunk
sim1 <- function(nsim, switch = TRUE){
replicate(n = nsim, expr = monty_hall(switch))
}
# Put the code used to run the simulations here
# This should be rather short, since you are using your functions
switch <- sum(sim1(10000))
stay <- sum(sim1(10000, FALSE))
print(paste("Win count when choosing to switch:", switch, "/ Proportion:", switch/10000))
## [1] "Win count when choosing to switch: 6686 / Proportion: 0.6686"
print(paste("Win count when choosing to stay:", stay, "/ Proportion:", stay/10000))
## [1] "Win count when choosing to stay: 3355 / Proportion: 0.3355"
Briefly describe your simulation results here.
It looks like a simulation of 10,000, while switching 100% of the time, yields around 6600-6800 wins. The proportion of switching 100% of the time is around 2/3. For not switching at all, the simulation yields 3200-3400 wins– thus, the proportion when not switching at all is around 1/3. The best way to succeed is to switch 100% of the time, seeing as you have a 2/3 chance of winning.
#Alt method, did first. Realized wasn't efficient.
sim <- function(nsim, switch = TRUE){
win <- 0
for(i in 1:nsim){
doors <- 1:3
prize <- sample(1:3, size = 1)
player_guess <- sample(1:3, size =1)
if(player_guess != prize){
monty_door <- doors[-c(prize, player_guess)]
}else {
monty_door <- sample(doors[-c(prize, player_guess)], size = 1)
}
if(switch == TRUE){
new_guess <- doors[-c(monty_door, player_guess)]
if(new_guess == prize){
#print(paste("You chose door", new_guess, "the winning door was door", prize, "congrats!"))
win <- win + 1
}else{
#print(paste("You lose. You chose door", new_guess, "the winning door was door", prize))
}
}else{
if(player_guess == prize){
#print(paste("You chose door", player_guess, "the winning door was door", prize, "congrats!"))
win <- win + 1
}else{
#print(paste("You lose. You chose door", player_guess, "the winning door was door", prize))
}
}
}
print(paste("Win count is", win))
print(paste("Proportion of wins to times played is", win/nsim))
}
sim(10000, TRUE)
## [1] "Win count is 6693"
## [1] "Proportion of wins to times played is 0.6693"
sim(10000, FALSE)
## [1] "Win count is 3340"
## [1] "Proportion of wins to times played is 0.334"