# Setup
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
getwd()
## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Montgomery_College/Data_Science_101/Data_101_Fall_2022/Homework_10_Due_14Nov22"
# Step 1
georgia <- c(rep("Biden", 2473707), rep("Trump", 2461779))
alaska <- c(rep("Biden", 153778), rep("Trump", 189951))
proportions(table(georgia))
## georgia
## Biden Trump
## 0.5012084 0.4987916
proportions(table(alaska))
## alaska
## Biden Trump
## 0.4473815 0.5526185
# Step 2 Simulate Alaskan Exit Poll
set.seed(2020)
survey_a1 <- sample(alaska,1000,replace=F)
table(survey_a1) %>% as.data.frame %>%
ggplot(aes(x=survey_a1, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a1))*100,"%")) +
theme_minimal() +
ggtitle("Results of our first survey in Alaska (N = 1,000)")

prop.test(table(survey_a1))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_a1), null probability 0.5
## X-squared = 16.129, df = 1, p-value = 5.917e-05
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4050739 0.4674239
## sample estimates:
## p
## 0.436
# Step 3 2nd Simulated Alaska Poll
survey_a2 <- sample(alaska,1000,replace=F)
table(survey_a2) %>% as.data.frame %>%
ggplot(aes(x=survey_a2, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_a2))*100,"%")) +
theme_minimal() +
ggtitle("Results of our second survey in Alaska (N = 1,000)")

# Step 4 Replicated Alaska Survey
draw_sample <- function(x,n=1000){
s <- sample(x, n, F)
proportions(table(s))
}
set.seed(2020)
alaska_sim <- replicate(1000,draw_sample(alaska))
alaska_sim[1:2,1:20]
##
## s [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## Biden 0.436 0.442 0.444 0.443 0.42 0.451 0.45 0.461 0.457 0.457 0.448 0.458
## Trump 0.564 0.558 0.556 0.557 0.58 0.549 0.55 0.539 0.543 0.543 0.552 0.542
##
## s [,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## Biden 0.449 0.451 0.442 0.434 0.433 0.442 0.435 0.466
## Trump 0.551 0.549 0.558 0.566 0.567 0.558 0.565 0.534
# Step 5 Plot the results
dat <- data.frame(t(alaska_sim))
qplot() + theme_minimal() +
geom_histogram(aes(x=dat$Biden),fill="blue", alpha=.5) +
geom_histogram(aes(x=dat$Trump), fill="red", alpha=.5) +
geom_vline(xintercept = mean(dat$Biden), color="blue") +
geom_vline(xintercept = mean(dat$Trump), color="red") +
ggtitle("1,000 surveys in Alaska of 1,000 voters each",
subtitle="Histograms of simulated values for Trump (red) and Biden (blue) percentages")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

c(mean(dat$Biden), mean(dat$Trump))
## [1] 0.44751 0.55249
# Step 6 Normal Distribution Plot
vote_share <- .564
N <- 1000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- (mean - 5*sd):(mean + 5*sd)
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="red") +
geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
ggtitle("Normal distribution with N = 1,000",
subtitle= paste0("Vertical lines = 95% interval around the mean: ",
round(x[which.min(abs(p-0.025))]/N*100,1), "% to ",
round(x[which.min(abs(p-0.975))]/N*100,1),"%"))

# Step 7 Increase n to 10,000
## Something is wrong w/ this code. I copied it exactly. The percentages for the vertical lines are way off; the header has 1000 instead of 10,000.
N <- 10000
mean <- vote_share*N
sd <- sqrt(vote_share*(1-vote_share)*N)
x <- x*10
norm1 <- dnorm(x,mean,sd)
p <- pnorm(x,mean,sd)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="red") +
geom_area(aes(x=x,y=norm1),fill="red",alpha=.1) +
geom_vline(xintercept = x[which.min(abs(p-0.025))], color="red") +
geom_vline(xintercept = x[which.min(abs(p-0.975))], color="red") +
ggtitle("Normal distribution with N = 1,000",
subtitle= paste0("Vertical lines = 95% interval around the mean: ",
round(x[which.min(abs(p-0.025))]/N*100,1), "% to ",
round(x[which.min(abs(p-0.975))]/N*100,1),"%"))

# Step 8 Standard Error Calculations
dat <- data.frame(t(alaska_sim))
sd(dat$Trump)
## [1] 0.01563952
sqrt(0.564*0.436/1000)
## [1] 0.01568133
se = sqrt(0.564*0.436/1000)
c(.546-2*se, .564+2*se)
## [1] 0.5146373 0.5953627
# Step 9
N <- 1000
null_hypothesis <- 0.5
our_result <- 0.564
se = sqrt(our_result*(1-our_result)/N)
x <- seq((null_hypothesis - 5*se),(null_hypothesis + 5*se),by=.001)
norm1 <- dnorm(x,null_hypothesis,se)
qplot() + theme_minimal() +
geom_line(aes(x=x,y=norm1), color="black") +
geom_area(aes(x=x,y=norm1),fill="gray",alpha=.1) +
geom_vline(xintercept = our_result, color="red") +
geom_label(aes(x=our_result,y=max(norm1), label = paste0("Our data:\n",our_result*100,"%"))) +
ggtitle("Null hypothesis: True Trump votes = Biden votes",
subtitle="vs. the result of our survey")

p_value <- 1 - pnorm(our_result, mean=0.5, sd=se)
p_value
## [1] 2.239347e-05
# Step 10 Quickly Test for signficance in R
binom.test(564,1000, p=0.5, alternative="greater")
##
## Exact binomial test
##
## data: 564 and 1000
## number of successes = 564, number of trials = 1000, p-value = 2.895e-05
## alternative hypothesis: true probability of success is greater than 0.5
## 95 percent confidence interval:
## 0.5375913 1.0000000
## sample estimates:
## probability of success
## 0.564
# Step 11 Back to Georgia
set.seed(2020)
survey_g1 <- sample(georgia,1000,replace=F)
table(survey_g1) %>% as.data.frame %>%
ggplot(aes(x=survey_g1, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_g1))*100,"%")) +
theme_minimal() +
ggtitle("Results of our first survey in Georgia (N = 1,000)")

prop.test(table(survey_g1))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g1), null probability 0.5
## X-squared = 0.169, df = 1, p-value = 0.681
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4755473 0.5383982
## sample estimates:
## p
## 0.507
# Step 12 Much More Georgia
set.seed(2020)
survey_g2 <- sample(georgia,100000,replace=F)
table(survey_g2) %>% as.data.frame %>%
ggplot(aes(x=survey_g2, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(proportions(table(survey_g2))*100,"%")) +
theme_minimal() +
ggtitle("Results of our second survey in Georgia (N = 100,000)")

prop.test(table(survey_g2))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g2), null probability 0.5
## X-squared = 0.17161, df = 1, p-value = 0.6787
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4975561 0.5037639
## sample estimates:
## p
## 0.50066
# Step 13 Even more Georgia
survey_g4 <- sample(georgia,3000000,replace=F)
table(survey_g4) %>% as.data.frame %>%
ggplot(aes(x=survey_g4, y = Freq, label = Freq)) +
geom_col(fill=c("blue", "red")) +
geom_label(label = paste(round(proportions(table(survey_g4))*100,4),"%")) +
theme_minimal() +
ggtitle("Results of our third survey in Georgia (N = 3,000,000)")

prop.test(table(survey_g4))
##
## 1-sample proportions test with continuity correction
##
## data: table(survey_g4), null probability 0.5
## X-squared = 12.793, df = 1, p-value = 0.000348
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.5004667 0.5015986
## sample estimates:
## p
## 0.5010327