library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
georgia <- c(rep("Biden", 2473707), rep("Trump", 2461779))
alaska <- c(rep("Biden", 153778), rep("Trump", 189951))
proportions(table(georgia))
## georgia
##     Biden     Trump 
## 0.5012084 0.4987916
proportions(table(alaska))
## alaska
##     Biden     Trump 
## 0.4473815 0.5526185

Sample of 1000 votes

set.seed(2020) 
survey_a1 <- sample(alaska,1000,replace=F)
table(survey_a1) %>% as.data.frame %>%
  ggplot(aes(x=survey_a1, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(proportions(table(survey_a1))*100,"%")) +
  theme_minimal() + 
  ggtitle("Results of our first survey in Alaska (N = 1,000)")

prop.test(table(survey_a1))
## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_a1), null probability 0.5
## X-squared = 16.129, df = 1, p-value = 5.917e-05
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4050739 0.4674239
## sample estimates:
##     p 
## 0.436

A Second Survey

survey_a2 <- sample(alaska,1000,replace=F)
table(survey_a2) %>% as.data.frame %>%
  ggplot(aes(x=survey_a2, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(proportions(table(survey_a2))*100,"%")) +
  theme_minimal() + 
  ggtitle("Results of our second survey in Alaska (N = 1,000)")

Simulate R Code

draw_sample <- function(x,n=1000){
  s <- sample(x, n, F)
  proportions(table(s))
}
set.seed(2020)
alaska_sim <- replicate(1000,draw_sample(alaska))
alaska_sim[1:2,1:20]
##        
## s        [,1]  [,2]  [,3]  [,4] [,5]  [,6] [,7]  [,8]  [,9] [,10] [,11] [,12]
##   Biden 0.436 0.442 0.444 0.443 0.42 0.451 0.45 0.461 0.457 0.457 0.448 0.458
##   Trump 0.564 0.558 0.556 0.557 0.58 0.549 0.55 0.539 0.543 0.543 0.552 0.542
##        
## s       [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
##   Biden 0.449 0.451 0.442 0.434 0.433 0.442 0.435 0.466
##   Trump 0.551 0.549 0.558 0.566 0.567 0.558 0.565 0.534

Plot the results from 1000 surveys

dat <- data.frame(t(alaska_sim))
qplot() + theme_minimal() + 
  geom_histogram(aes(x=dat$Biden),fill="blue", alpha=.5) + 
  geom_histogram(aes(x=dat$Trump), fill="red", alpha=.5) +
  geom_vline(xintercept = mean(dat$Biden), color="blue") +
  geom_vline(xintercept = mean(dat$Trump), color="red") +
  ggtitle("1,000 surveys in Alaska of 1,000 voters each", 
          subtitle="Histograms of simulated values for Trump (red) and Biden (blue) percentages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

c(mean(dat$Biden), mean(dat$Trump))
## [1] 0.44751 0.55249

Plot the normal distribution

vote_share <- .564
N <- 1000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- (mean - 5*sd):(mean + 5*sd)
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
  geom_line(aes(x=x,y=norm1), color="red") +
  geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
  geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
  geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
  ggtitle("Normal distribution with N = 1,000",
          subtitle= paste0("Vertical lines = 95% interval around the mean: ",
                           round(x[which.min(abs(p-0.025))]/N*100,1), "% to ", 
                           round(x[which.min(abs(p-0.975))]/N*100,1),"%"))

Larger Samples

N <- 10000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- x*10
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
  geom_line(aes(x=x,y=norm1), color="red") +
  geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
  geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
  geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
  ggtitle("Normal distribution with N = 1,000",
          subtitle= paste0("Vertical lines = 95% interval around the mean: ",
                           round(x[which.min(abs(p-0.025))]/N*100,1), "% to ", 
                           round(x[which.min(abs(p-0.975))]/N*100,1),"%"))

Standard Error

dat <- data.frame(t(alaska_sim))
sd(dat$Trump)
## [1] 0.01563952
se = sqrt(0.564*0.436/1000)
c(.546-2*se, .564+2*se)
## [1] 0.5146373 0.5953627

Null Hypothesis

N <- 1000
null_hypothesis <- 0.5
our_result <- 0.564
se = sqrt(our_result*(1-our_result)/N)

x <- seq((null_hypothesis - 5*se),(null_hypothesis + 5*se),by=.001)
norm1 <- dnorm(x,null_hypothesis,se)

qplot() + theme_minimal() +
  geom_line(aes(x=x,y=norm1), color="black") +
  geom_area(aes(x=x,y=norm1),fill="gray",alpha=.1) +
  geom_vline(xintercept = our_result, color="red") +
  geom_label(aes(x=our_result,y=max(norm1), label = paste0("Our data:\n",our_result*100,"%"))) +
  ggtitle("Null hypothesis: True Trump votes = Biden votes",
          subtitle="vs. the result of our survey")

P- Value

p_value <- 1 - pnorm(our_result, mean=0.5, sd=se)
p_value
## [1] 2.239347e-05

Test for significance

binom.test(564,1000, p=0.5, alternative="greater")
## 
##  Exact binomial test
## 
## data:  564 and 1000
## number of successes = 564, number of trials = 1000, p-value = 2.895e-05
## alternative hypothesis: true probability of success is greater than 0.5
## 95 percent confidence interval:
##  0.5375913 1.0000000
## sample estimates:
## probability of success 
##                  0.564

A survey in Georgia

set.seed(2020) 

survey_g1 <- sample(georgia,1000,replace=F)

table(survey_g1) %>% as.data.frame %>%
  ggplot(aes(x=survey_g1, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(proportions(table(survey_g1))*100,"%")) +
  theme_minimal() + 
  ggtitle("Results of our first survey in Georgia (N = 1,000)")

prop.test(table(survey_g1))
## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_g1), null probability 0.5
## X-squared = 0.169, df = 1, p-value = 0.681
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4755473 0.5383982
## sample estimates:
##     p 
## 0.507

Increase the sample size and ask 100,000 voters

set.seed(2020)
survey_g2 <- sample(georgia,100000,replace=F)

table(survey_g2) %>% as.data.frame %>%
  ggplot(aes(x=survey_g2, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(proportions(table(survey_g2))*100,"%")) +
  theme_minimal() + 
  ggtitle("Results of our second survey in Georgia (N = 100,000)")

prop.test(table(survey_g2))
## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_g2), null probability 0.5
## X-squared = 0.17161, df = 1, p-value = 0.6787
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4975561 0.5037639
## sample estimates:
##       p 
## 0.50066

ask half of all voters, i.e. 3 Million people

survey_g4 <- sample(georgia,3000000,replace=F)
table(survey_g4) %>% as.data.frame %>%
  ggplot(aes(x=survey_g4, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(round(proportions(table(survey_g4))*100,4),"%")) +
  theme_minimal() + 
  ggtitle("Results of our third survey in Georgia (N = 3,000,000)")

prop.test(table(survey_g4))
## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_g4), null probability 0.5
## X-squared = 12.793, df = 1, p-value = 0.000348
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.5004667 0.5015986
## sample estimates:
##         p 
## 0.5010327