library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
georgia <- c(rep("Biden", 2473707), rep("Trump", 2461779))
alaska <- c(rep("Biden", 153778), rep("Trump", 189951))
proportions(table(georgia))
## georgia
## Biden Trump
## 0.5012084 0.4987916
proportions(table(alaska))
## alaska
## Biden Trump
## 0.4473815 0.5526185
A sample of 1,000 voters
survey_a1 <- sample(alaska,1000,replace=F)
table(survey_a1) %>% as.data.frame %>%
ggplot(aes(x=survey_a1, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a1))*100,"%")) +
theme_minimal() +
ggtitle("Results of our first survey in Alaska (N = 1,000)")
prop.test(table(survey_a1))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_a1), null probability 0.5
## X-squared = 1.521, df = 1, p-value = 0.2175
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4486740 0.5114816
## sample estimates:
## p
## 0.48
Repeating a survey many times
survey_a2 <- sample(alaska,1000,replace=F)
table(survey_a2) %>% as.data.frame %>%
ggplot(aes(x=survey_a2, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a2))*100,"%")) +
theme_minimal() +
ggtitle("Results of our second survey in Alaska (N = 1,000)")
You can simulate this with R the following way:
draw_sample <- function(x,n=1000){
s <- sample(x, n, F)
proportions(table(s))
}
set.seed(2020)
alaska_sim <- replicate(1000,draw_sample(alaska))
alaska_sim[1:2,1:20]
##
## s [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## Biden 0.436 0.442 0.444 0.443 0.42 0.451 0.45 0.461 0.457 0.457 0.448 0.458
## Trump 0.564 0.558 0.556 0.557 0.58 0.549 0.55 0.539 0.543 0.543 0.552 0.542
##
## s [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## Biden 0.449 0.451 0.442 0.434 0.433 0.442 0.435 0.466
## Trump 0.551 0.549 0.558 0.566 0.567 0.558 0.565 0.534
Let’s plot the results from all our 1,000 surveys:
dat <- data.frame(t(alaska_sim))
qplot() + theme_minimal() +
geom_histogram(aes(x=dat$Biden),fill="blue", alpha=.5) +
geom_histogram(aes(x=dat$Trump), fill="red", alpha=.5) +
geom_vline(xintercept = mean(dat$Biden), color="blue") +
geom_vline(xintercept = mean(dat$Trump), color="red") +
ggtitle("1,000 surveys in Alaska of 1,000 voters each",
subtitle="Histograms of simulated values for Trump (red) and Biden (blue) percentages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
c(mean(dat$Biden), mean(dat$Trump))
## [1] 0.44751 0.55249
Here you see a normal distribution with N = 1,000 around the Alaska vote (from our first survey) for Trump: Figure 4: Normal distribution with 95% interval for N = 1,000 Code for this plot:
vote_share <- .564
N <- 1000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- (mean - 5*sd):(mean + 5*sd)
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="red") +
geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
ggtitle("Normal distribution with N = 1,000",
subtitle= paste0("Vertical lines = 95% interval around the mean: ",
round(x[which.min(abs(p-0.025))]/N*100,1), "% to ",
round(x[which.min(abs(p-0.975))]/N*100,1),"%"))
N <- 10000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- x*10
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="red") +
geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
ggtitle("Normal distribution with N = 1,000",
subtitle= paste0("Vertical lines = 95% interval around the mean: ",
round(x[which.min(abs(p-0.025))]/N*100,1), "% to ",
round(x[which.min(abs(p-0.975))]/N*100,1),"%"))
If we know the distribution of all our 1,000 surveys (because we simulated them ourselves), we can easily get the standard error just from the standard deviation of the distribution of all our repeated experiments. Let’s try this:
dat <- data.frame(t(alaska_sim))
sd(dat$Trump)
## [1] 0.01563952
sqrt(0.564*0.436/1000)
## [1] 0.01568133
An important rule of thumb about the normal distribution as shown in Figures 4 and 5 is that the limits of the 95% confidence interval are ca.: mean value +/- 2 * standard error.
Try it out:
se = sqrt(0.564*0.436/1000)
c(.546-2*se, .564+2*se)
## [1] 0.5146373 0.5953627
This is the essence of the statistical significance test: If the true effect is zero, what is the probability of obtaining my data from a random sample of size N?
This probability is the infamous p-value. Formally, we want to know: p(D|H0), i.e. the probability (“p-value”) of our data (D) [or a more extreme result] given the null hypothesis (H0).
N <- 1000
null_hypothesis <- 0.5
our_result <- 0.564
se = sqrt(our_result*(1-our_result)/N)
x <- seq((null_hypothesis - 5*se),(null_hypothesis + 5*se),by=.001)
norm1 <- dnorm(x,null_hypothesis,se)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="black") +
geom_area(aes(x=x,y=norm1),fill="gray",alpha=.1) +
geom_vline(xintercept = our_result, color="red") +
geom_label(aes(x=our_result,y=max(norm1), label = paste0("Our data:\n",our_result*100,"%"))) +
ggtitle("Null hypothesis: True Trump votes = Biden votes",
subtitle="vs. the result of our survey")
### How you can quickly test for significance in R
binom.test(564,1000, p=0.5, alternative="greater")
##
## Exact binomial test
##
## data: 564 and 1000
## number of successes = 564, number of trials = 1000, p-value = 2.895e-05
## alternative hypothesis: true probability of success is greater than 0.5
## 95 percent confidence interval:
## 0.5375913 1.0000000
## sample estimates:
## probability of success
## 0.564
In economics, social sciences, market research, etc., you are often inferring from samples of people to the general population (i.e., voter surveys, household panels, customer feedback, etc.). See our example in this post. These are the prototypical applications of significance tests.
In medicine, psychology, or biostatistics, you often have groups of people for experiments such as randomized control trials for a new drug, and again, due to the random assignment of participants into experimental/control groups, and due to the fact that you often want to make an inferential statement about the total population, statistical significance tests are reasonable. The null hypothesis would be that a treatment/drug/etc. does not work at all, and outcomes (e.g., number of deaths) in all groups are equal.
Now let’s do the same thing with the Georgia election results:
set.seed(2020)
survey_g1 <- sample(georgia,1000,replace=F)
table(survey_g1) %>% as.data.frame %>%
ggplot(aes(x=survey_g1, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_g1))*100,"%")) +
theme_minimal() +
ggtitle("Results of our first survey in Georgia (N = 1,000)")
prop.test(table(survey_g1))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g1), null probability 0.5
## X-squared = 0.169, df = 1, p-value = 0.681
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4755473 0.5383982
## sample estimates:
## p
## 0.507
If we get an insignificant effect, there can be two reasons:
We increase the sample size and ask 100,000 voters, instead of only 1,000, at the Georgia exit polls:
set.seed(2020)
survey_g2 <- sample(georgia,100000,replace=F)
table(survey_g2) %>% as.data.frame %>%
ggplot(aes(x=survey_g2, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_g2))*100,"%")) +
theme_minimal() +
ggtitle("Results of our second survey in Georgia (N = 100,000)")
prop.test(table(survey_g2))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g2), null probability 0.5
## X-squared = 0.17161, df = 1, p-value = 0.6787
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4975561 0.5037639
## sample estimates:
## p
## 0.50066
We have unlimited funds and so we decide to ask half of all voters, i.e. 3 Million people:
survey_g4 <- sample(georgia,3000000,replace=F)
table(survey_g4) %>% as.data.frame %>%
ggplot(aes(x=survey_g4, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(round(proportions(table(survey_g4))*100,4),"%")) +
theme_minimal() +
ggtitle("Results of our third survey in Georgia (N = 3,000,000)")
prop.test(table(survey_g4))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g4), null probability 0.5
## X-squared = 12.793, df = 1, p-value = 0.000348
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.5004667 0.5015986
## sample estimates:
## p
## 0.5010327