library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
georgia <- c(rep("Biden", 2473707), rep("Trump", 2461779))
alaska <- c(rep("Biden", 153778), rep("Trump", 189951))
proportions(table(georgia))
## georgia
## Biden Trump
## 0.5012084 0.4987916
proportions(table(alaska))
## alaska
## Biden Trump
## 0.4473815 0.5526185
Sample of 1000 votes
set.seed(2020)
survey_a1 <- sample(alaska,1000,replace=F)
table(survey_a1) %>% as.data.frame %>%
ggplot(aes(x=survey_a1, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a1))*100,"%")) +
theme_minimal() +
ggtitle("Results of our first survey in Alaska (N = 1,000)")
prop.test(table(survey_a1))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_a1), null probability 0.5
## X-squared = 16.129, df = 1, p-value = 5.917e-05
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4050739 0.4674239
## sample estimates:
## p
## 0.436
A Second Survey
survey_a2 <- sample(alaska,1000,replace=F)
table(survey_a2) %>% as.data.frame %>%
ggplot(aes(x=survey_a2, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a2))*100,"%")) +
theme_minimal() +
ggtitle("Results of our second survey in Alaska (N = 1,000)")
Simulate R Code
draw_sample <- function(x,n=1000){
s <- sample(x, n, F)
proportions(table(s))
}
set.seed(2020)
alaska_sim <- replicate(1000,draw_sample(alaska))
alaska_sim[1:2,1:20]
##
## s [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## Biden 0.436 0.442 0.444 0.443 0.42 0.451 0.45 0.461 0.457 0.457 0.448 0.458
## Trump 0.564 0.558 0.556 0.557 0.58 0.549 0.55 0.539 0.543 0.543 0.552 0.542
##
## s [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## Biden 0.449 0.451 0.442 0.434 0.433 0.442 0.435 0.466
## Trump 0.551 0.549 0.558 0.566 0.567 0.558 0.565 0.534
Plot the results from 1000 surveys
dat <- data.frame(t(alaska_sim))
qplot() + theme_minimal() +
geom_histogram(aes(x=dat$Biden),fill="blue", alpha=.5) +
geom_histogram(aes(x=dat$Trump), fill="red", alpha=.5) +
geom_vline(xintercept = mean(dat$Biden), color="blue") +
geom_vline(xintercept = mean(dat$Trump), color="red") +
ggtitle("1,000 surveys in Alaska of 1,000 voters each",
subtitle="Histograms of simulated values for Trump (red) and Biden (blue) percentages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
c(mean(dat$Biden), mean(dat$Trump))
## [1] 0.44751 0.55249
Plot the normal distribution
vote_share <- .564
N <- 1000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- (mean - 5*sd):(mean + 5*sd)
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="red") +
geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
ggtitle("Normal distribution with N = 1,000",
subtitle= paste0("Vertical lines = 95% interval around the mean: ",
round(x[which.min(abs(p-0.025))]/N*100,1), "% to ",
round(x[which.min(abs(p-0.975))]/N*100,1),"%"))
Larger Samples
N <- 10000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- x*10
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="red") +
geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
ggtitle("Normal distribution with N = 1,000",
subtitle= paste0("Vertical lines = 95% interval around the mean: ",
round(x[which.min(abs(p-0.025))]/N*100,1), "% to ",
round(x[which.min(abs(p-0.975))]/N*100,1),"%"))
Standard Error
dat <- data.frame(t(alaska_sim))
sd(dat$Trump)
## [1] 0.01563952
se = sqrt(0.564*0.436/1000)
c(.546-2*se, .564+2*se)
## [1] 0.5146373 0.5953627
Null Hypothesis
N <- 1000
null_hypothesis <- 0.5
our_result <- 0.564
se = sqrt(our_result*(1-our_result)/N)
x <- seq((null_hypothesis - 5*se),(null_hypothesis + 5*se),by=.001)
norm1 <- dnorm(x,null_hypothesis,se)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="black") +
geom_area(aes(x=x,y=norm1),fill="gray",alpha=.1) +
geom_vline(xintercept = our_result, color="red") +
geom_label(aes(x=our_result,y=max(norm1), label = paste0("Our data:\n",our_result*100,"%"))) +
ggtitle("Null hypothesis: True Trump votes = Biden votes",
subtitle="vs. the result of our survey")
P- Value
p_value <- 1 - pnorm(our_result, mean=0.5, sd=se)
p_value
## [1] 2.239347e-05
Test for significance
binom.test(564,1000, p=0.5, alternative="greater")
##
## Exact binomial test
##
## data: 564 and 1000
## number of successes = 564, number of trials = 1000, p-value = 2.895e-05
## alternative hypothesis: true probability of success is greater than 0.5
## 95 percent confidence interval:
## 0.5375913 1.0000000
## sample estimates:
## probability of success
## 0.564
A survey in Georgia
set.seed(2020)
survey_g1 <- sample(georgia,1000,replace=F)
table(survey_g1) %>% as.data.frame %>%
ggplot(aes(x=survey_g1, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_g1))*100,"%")) +
theme_minimal() +
ggtitle("Results of our first survey in Georgia (N = 1,000)")
prop.test(table(survey_g1))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g1), null probability 0.5
## X-squared = 0.169, df = 1, p-value = 0.681
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4755473 0.5383982
## sample estimates:
## p
## 0.507
Increase the sample size and ask 100,000 voters
set.seed(2020)
survey_g2 <- sample(georgia,100000,replace=F)
table(survey_g2) %>% as.data.frame %>%
ggplot(aes(x=survey_g2, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_g2))*100,"%")) +
theme_minimal() +
ggtitle("Results of our second survey in Georgia (N = 100,000)")
prop.test(table(survey_g2))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g2), null probability 0.5
## X-squared = 0.17161, df = 1, p-value = 0.6787
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4975561 0.5037639
## sample estimates:
## p
## 0.50066
ask half of all voters, i.e. 3 Million people
survey_g4 <- sample(georgia,3000000,replace=F)
table(survey_g4) %>% as.data.frame %>%
ggplot(aes(x=survey_g4, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(round(proportions(table(survey_g4))*100,4),"%")) +
theme_minimal() +
ggtitle("Results of our third survey in Georgia (N = 3,000,000)")
prop.test(table(survey_g4))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g4), null probability 0.5
## X-squared = 12.793, df = 1, p-value = 0.000348
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.5004667 0.5015986
## sample estimates:
## p
## 0.5010327