IC-RBloggers_Stat_Significance

# Setup
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

getwd()

## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Montgomery_College/Data_Science_101/Data_101_Fall_2022/Homework_10_Due_14Nov22"

# Step 1

georgia <- c(rep("Biden", 2473707), rep("Trump", 2461779))
alaska <- c(rep("Biden", 153778), rep("Trump", 189951))
proportions(table(georgia))

## georgia
##     Biden     Trump 
## 0.5012084 0.4987916

proportions(table(alaska))

## alaska
##     Biden     Trump 
## 0.4473815 0.5526185

# Step 2 Simulate Alaskan Exit Poll
set.seed(2020)
survey_a1 <- sample(alaska,1000,replace=F)
table(survey_a1) %>% as.data.frame %>%
ggplot(aes(x=survey_a1, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a1))*100,"%")) +
theme_minimal() +
ggtitle("Results of our first survey in Alaska (N = 1,000)")

prop.test(table(survey_a1))

## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_a1), null probability 0.5
## X-squared = 16.129, df = 1, p-value = 5.917e-05
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4050739 0.4674239
## sample estimates:
##     p 
## 0.436

# Step 3 2nd Simulated Alaska Poll

survey_a2 <- sample(alaska,1000,replace=F)
table(survey_a2) %>% as.data.frame %>%
ggplot(aes(x=survey_a2, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a2))*100,"%")) +
theme_minimal() +
ggtitle("Results of our second survey in Alaska (N = 1,000)")

# Step 4 Replicated Alaska Survey

draw_sample <- function(x,n=1000){
s <- sample(x, n, F)
proportions(table(s))
}
set.seed(2020)
alaska_sim <- replicate(1000,draw_sample(alaska))
alaska_sim[1:2,1:20]

##        
## s        [,1]  [,2]  [,3]  [,4] [,5]  [,6] [,7]  [,8]  [,9] [,10] [,11] [,12]
##   Biden 0.436 0.442 0.444 0.443 0.42 0.451 0.45 0.461 0.457 0.457 0.448 0.458
##   Trump 0.564 0.558 0.556 0.557 0.58 0.549 0.55 0.539 0.543 0.543 0.552 0.542
##        
## s       [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
##   Biden 0.449 0.451 0.442 0.434 0.433 0.442 0.435 0.466
##   Trump 0.551 0.549 0.558 0.566 0.567 0.558 0.565 0.534

 # Step 5 Plot the results
dat <- data.frame(t(alaska_sim))
qplot() + theme_minimal() +
geom_histogram(aes(x=dat$Biden),fill="blue", alpha=.5) +
geom_histogram(aes(x=dat$Trump), fill="red", alpha=.5) +
geom_vline(xintercept = mean(dat$Biden), color="blue") +
geom_vline(xintercept = mean(dat$Trump), color="red") +
ggtitle("1,000 surveys in Alaska of 1,000 voters each",
subtitle="Histograms of simulated values for Trump (red) and Biden (blue) percentages")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

c(mean(dat$Biden), mean(dat$Trump))

## [1] 0.44751 0.55249

# Step 6 Normal Distribution Plot
vote_share <- .564
N <- 1000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- (mean - 5*sd):(mean + 5*sd)
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
  geom_line(aes(x=x,y=norm1), color="red") +
  geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
  geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
  geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
  ggtitle("Normal distribution with N = 1,000",
          subtitle= paste0("Vertical lines = 95% interval around the mean: ",
                           round(x[which.min(abs(p-0.025))]/N*100,1), "% to ", 
                           round(x[which.min(abs(p-0.975))]/N*100,1),"%"))

# Step 7 Increase n to 10,000 
## Something is wrong w/ this code. I copied it exactly. The percentages for the vertical lines are way off; the header has 1000 instead of 10,000.
N <- 10000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- x*10
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
  geom_line(aes(x=x,y=norm1), color="red") +
  geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
  geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
  geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
  ggtitle("Normal distribution with N = 1,000",
          subtitle= paste0("Vertical lines = 95% interval around the mean: ",
                           round(x[which.min(abs(p-0.025))]/N*100,1), "% to ", 
                           round(x[which.min(abs(p-0.975))]/N*100,1),"%"))

# Step 8 Standard Error Calculations

dat <- data.frame(t(alaska_sim))
sd(dat$Trump)

## [1] 0.01563952

sqrt(0.564*0.436/1000)

## [1] 0.01568133

se = sqrt(0.564*0.436/1000)
c(.546-2*se, .564+2*se)

## [1] 0.5146373 0.5953627

# Step 9

N <- 1000
null_hypothesis <- 0.5
our_result <- 0.564
se = sqrt(our_result*(1-our_result)/N)
x <- seq((null_hypothesis - 5*se),(null_hypothesis + 5*se),by=.001)
norm1 <- dnorm(x,null_hypothesis,se)
qplot() + theme_minimal() +
  geom_line(aes(x=x,y=norm1), color="black") +
  geom_area(aes(x=x,y=norm1),fill="gray",alpha=.1) +
  geom_vline(xintercept = our_result, color="red") +
  geom_label(aes(x=our_result,y=max(norm1), label = paste0("Our data:\n",our_result*100,"%"))) +
  ggtitle("Null hypothesis: True Trump votes = Biden votes",
          subtitle="vs. the result of our survey")

p_value <- 1 - pnorm(our_result, mean=0.5, sd=se)
p_value

## [1] 2.239347e-05

# Step 10 Quickly Test for signficance in R

binom.test(564,1000, p=0.5, alternative="greater")

## 
##  Exact binomial test
## 
## data:  564 and 1000
## number of successes = 564, number of trials = 1000, p-value = 2.895e-05
## alternative hypothesis: true probability of success is greater than 0.5
## 95 percent confidence interval:
##  0.5375913 1.0000000
## sample estimates:
## probability of success 
##                  0.564

# Step 11 Back to Georgia
set.seed(2020) 
survey_g1 <- sample(georgia,1000,replace=F)
table(survey_g1) %>% as.data.frame %>%
  ggplot(aes(x=survey_g1, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(proportions(table(survey_g1))*100,"%")) +
  theme_minimal() + 
  ggtitle("Results of our first survey in Georgia (N = 1,000)")

prop.test(table(survey_g1))

## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_g1), null probability 0.5
## X-squared = 0.169, df = 1, p-value = 0.681
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4755473 0.5383982
## sample estimates:
##     p 
## 0.507

# Step 12 Much More Georgia
set.seed(2020)
survey_g2 <- sample(georgia,100000,replace=F)
table(survey_g2) %>% as.data.frame %>%
  ggplot(aes(x=survey_g2, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(proportions(table(survey_g2))*100,"%")) +
  theme_minimal() + 
  ggtitle("Results of our second survey in Georgia (N = 100,000)")

prop.test(table(survey_g2))

## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_g2), null probability 0.5
## X-squared = 0.17161, df = 1, p-value = 0.6787
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4975561 0.5037639
## sample estimates:
##       p 
## 0.50066

# Step 13 Even more Georgia
survey_g4 <- sample(georgia,3000000,replace=F)
table(survey_g4) %>% as.data.frame %>%
  ggplot(aes(x=survey_g4, y = Freq, label = Freq)) +
  geom_col(fill=c("blue", "red")) + 
  geom_label(label = paste(round(proportions(table(survey_g4))*100,4),"%")) +
  theme_minimal() + 
  ggtitle("Results of our third survey in Georgia (N = 3,000,000)")

prop.test(table(survey_g4))

## 
##  1-sample proportions test with continuity correction
## 
## data:  table(survey_g4), null probability 0.5
## X-squared = 12.793, df = 1, p-value = 0.000348
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.5004667 0.5015986
## sample estimates:
##         p 
## 0.5010327

IC-RBloggers_Stat_Significance_rja

Jerome

2022-11-11