# Load libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
# Load datasets
load(file="Pres2016.PV.Rdata")
load(file="Pres2016.StatePolls.Rdata")
load(file="Pres2020.PV.Rdata")
load(file="Pres2020.StatePolls.Rdata")
# Declare election date variables
election.day20 <- as.Date("11/3/2020", "%m/%d/%Y")
election.day16 <- as.Date("11/8/2016", "%m/%d/%Y")
# Process 2020
Pres2020.PV <- Pres2020.PV %>%
mutate(EndDate = as.Date(Pres2020.PV$EndDate, "%m/%d/%Y"),
StartDate = as.Date(Pres2020.PV$StartDate, "%m/%d/%Y"),
DaysToED = as.numeric(election.day20 - EndDate),
margin = Biden - Trump)
# Process 2016 Data
Pres2016.PV <- Pres2016.PV %>%
mutate(EndDate = as.Date(Pres2016.PV$EndDate, "%m/%d/%Y"),
StartDate = as.Date(Pres2016.PV$StartDate, "%m/%d/%Y"),
DaysToED = as.numeric(election.day16 - EndDate),
margin = Clinton - Trump)
# Generate the plot for 2020
BTNational <- ggplot(Pres2020.PV) +
geom_point(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", alpha = .4) +
geom_point(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", alpha = .4) +
geom_smooth(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", se = F) +
geom_smooth(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", se = F) +
labs(title = "% Two-Candidate Share in 2020 National Polls Over Time \n Biden vs Trump") +
labs(y = "Two-Candidate Pct. Support") +
labs(x = "Poll Ending Date") +
scale_x_date(date_breaks = "2 week", date_labels = "%b %d") +
scale_y_continuous(breaks=seq(30,70, by=5))
BTNational
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Generate the plot for 2016
CTNational <- ggplot(Pres2016.PV) +
geom_point(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", alpha = .4) +
geom_point(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", alpha = .4) +
geom_smooth(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", se = F) +
geom_smooth(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", se = F) +
labs(title = "% Two-Candidate Share in 2016 National Polls Over Time \n Clinton vs Trump") +
labs(y = "Two-Candidate Pct. Support") +
labs(x = "Poll Ending Date") +
scale_x_date(date_breaks = "2 week", date_labels = "%b %d") +
scale_y_continuous(breaks=seq(30,70, by=5))
CTNational
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Plot 2020 poll results (last 100 days)
BTNat_100day <- Pres2020.PV %>%
filter(DaysToED <= 100)
BTNat_100day_plot <- ggplot(BTNat_100day) +
geom_point(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", alpha = .4)+
geom_point(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", alpha = .4) +
geom_smooth(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", se = F) +
geom_smooth(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", se = F) +
labs(title = "% Two-Candidate Share in 2020 National Polls Over Time \n Biden vs Trump (100 days prior)") +
labs(y = "Two-Candidate Pct. Support") +
labs(x = "Poll Ending Date") +
scale_x_date(date_breaks = "2 week", date_labels = "%b %d") +
scale_y_continuous(breaks=seq(30,70, by=5))
BTNat_100day_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
CTNat_100day <- Pres2016.PV %>%
filter(DaysToED <= 100)
CTNat_100day_plot <- ggplot(CTNat_100day) +
geom_point(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", alpha = .4) +
geom_point(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", alpha = .4) +
geom_smooth(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", se = F) +
geom_smooth(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", se = F) +
labs(title = "% Two-Candidate Share in 2016 National Polls Over Time \n Clinton vs Trump (100 days prior)") +
labs(y = "Two-Candidate Pct. Support") +
labs(x = "Poll Ending Date") +
scale_x_date(date_breaks = "2 week", date_labels = "%b %d") +
scale_y_continuous(breaks=seq(30,70, by=5))
CTNat_100day_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Pres2016.StatePolls <- Pres2016.StatePolls %>%
mutate(ClintonNorm = Clinton/(Clinton+Trump),
TrumpNorm = 1-ClintonNorm)
ElectoralCollegeVotes <- NULL
B <- 100
for(i in 1:B){
dat <- sample_n(Pres2016.StatePolls,
nrow(Pres2016.StatePolls),
replace = TRUE)
ElectoralCollegeVotes <- dat %>%
group_by(StateName) %>%
summarize(ClintonProbWin1 = mean(Clinton > Trump),
ClintonProbWin2 = mean(Clinton/100),
ClintonProbWin3 = mean(ClintonNorm),
EV = mean(EV)) %>%
mutate(ClintonEV1 = ClintonProbWin1*EV,
ClintonEV2 = ClintonProbWin2*EV,
ClintonEV3 = ClintonProbWin3*EV,
TrumpEV1 = EV - ClintonEV1,
TrumpEV2 = EV - ClintonEV2,
TrumpEV3 = EV - ClintonEV3) %>%
summarize(ClintonECV1 = sum(ClintonEV1),
ClintonECV2 = sum(ClintonEV2),
ClintonECV3 = sum(ClintonEV3),
TrumpECV1 = sum(TrumpEV1),
TrumpECV2 = sum(TrumpEV2),
TrumpECV3 = sum(TrumpEV3)) %>%
bind_rows(ElectoralCollegeVotes)
}
ElectoralCollegeVotes %>%
summarize(ClintonWin1 = mean(ClintonECV1 >270))
## # A tibble: 1 × 1
## ClintonWin1
## <dbl>
## 1 1
ElectoralCollegeVotes %>%
summarize(ClintonWin2 = mean(ClintonECV2 > 270))
## # A tibble: 1 × 1
## ClintonWin2
## <dbl>
## 1 0
ElectoralCollegeVotes %>%
summarize(ClintonWin3 = mean(ClintonECV3 > 270))
## # A tibble: 1 × 1
## ClintonWin3
## <dbl>
## 1 1
#Part 3: Calculate the error in polls by state#
StatePollError_2016 <- Pres2016.StatePolls %>%
group_by(state) %>%
summarize(meanPredicted_Clinton_2016 = mean(Clinton),
meanPredicted_Trump_2016 = mean(Trump),
meanError_Clinton_2016 = DemCertVote - meanPredicted_Clinton_2016,
meanError_Trump_2016 = RepCertVote - meanPredicted_Trump_2016) %>%
mutate(meanError_abs_Clinton_2016 = abs(meanError_Clinton_2016),
meanError_abs_Trump_2016 = abs(meanError_Trump_2016)) %>%
ungroup()
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
StatePollError_2016 <- unique(StatePollError_2016)
# Print top 5 states with the largest error for Clinton (absolute value)
StatePollError_2016_Clinton <- StatePollError_2016 %>% arrange(-meanError_abs_Clinton_2016)
writeLines("Top 5 states with the most error for Clinton in 2016: ")
## Top 5 states with the most error for Clinton in 2016:
print(StatePollError_2016_Clinton, n=5)
## # A tibble: 50 × 7
## state meanPredicted_C… meanPredicted_T… meanError_Clint… meanError_Trump…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Hawaii 48.9 27.2 13.3 2.78
## 2 Vermont 50.9 21.2 10.2 11.4
## 3 Rhode Island 45.9 31.0 9.54 8.82
## 4 Delaware 44.2 32.2 9.16 9.68
## 5 Massachusetts 52.8 27.8 8.04 5.67
## # … with 45 more rows, and 2 more variables: meanError_abs_Clinton_2016 <dbl>,
## # meanError_abs_Trump_2016 <dbl>
# Print top 5 states with the largest error for Trump (absolute value)
StatePollError_2016_Trump <- StatePollError_2016 %>% arrange(-meanError_abs_Trump_2016)
writeLines("Top 5 states with the most error for Trump in 2016: ")
## Top 5 states with the most error for Trump in 2016:
print(StatePollError_2016_Trump, n=5)
## # A tibble: 50 × 7
## state meanPredicted_C… meanPredicted_T… meanError_Clint… meanError_Trump…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Wyoming 21.9 53.7 0.572 16.4
## 2 South Dakota 30.7 45.2 1.04 16.3
## 3 West Virginia 30.3 52.9 -3.81 15.8
## 4 North Dakota 26.6 49.0 1.23 15.1
## 5 Oklahoma 30.3 50.5 -1.42 14.8
## # … with 45 more rows, and 2 more variables: meanError_abs_Clinton_2016 <dbl>,
## # meanError_abs_Trump_2016 <dbl>
StatePollError_2020 <- Pres2020.StatePolls %>%
group_by(State) %>%
summarize(meanPredicted_Biden_2020 = mean(Biden),
meanPredicted_Trump_2020 = mean(Trump),
meanError_Biden_2020 = BidenCertVote - meanPredicted_Biden_2020,
meanError_Trump_2020 = TrumpCertVote - meanPredicted_Trump_2020) %>%
mutate(meanError_abs_Biden_2020 = abs(meanError_Biden_2020),
meanError_abs_Trump_2020 = abs(meanError_Trump_2020)) %>%
ungroup()
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
StatePollError_2020 <- unique(StatePollError_2020)
# Print top 5 states with the largest error for Biden (absolute value)
StatePollError_2020_Biden <- StatePollError_2020 %>% arrange(-meanError_abs_Biden_2020)
writeLines("Top 5 states with the most error for Biden in 2020: ")
## Top 5 states with the most error for Biden in 2020:
print(StatePollError_2020_Biden, n=5)
## # A tibble: 50 × 7
## State meanPredicted_Bid… meanPredicted_Tru… meanError_Biden_… meanError_Trump…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 RI 65.4 33.6 -6.40 5.4
## 2 WY 32.8 64.3 -5.83 5.67
## 3 ND 37.7 58.3 -5.71 6.71
## 4 NE 44.4 53.8 -5.4 5.2
## 5 WV 35.4 61.5 -5.38 7.5
## # … with 45 more rows, and 2 more variables: meanError_abs_Biden_2020 <dbl>,
## # meanError_abs_Trump_2020 <dbl>
# Print top 5 states with the largest error for Trump (absolute value)
StatePollError_2020_Trump <- StatePollError_2020 %>% arrange(-meanError_abs_Trump_2020)
writeLines("Top 5 states with the most error for Trump in 2020: ")
## Top 5 states with the most error for Trump in 2020:
print(StatePollError_2020_Trump, n=5)
## # A tibble: 50 × 7
## State meanPredicted_Bid… meanPredicted_Tru… meanError_Biden_… meanError_Trump…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 WV 35.4 61.5 -5.38 7.5
## 2 UT 39.9 50.9 -1.93 7.14
## 3 ND 37.7 58.3 -5.71 6.71
## 4 KY 39.1 55.8 -3.11 6.17
## 5 SD 41.1 55.9 -5.14 6.14
## # … with 45 more rows, and 2 more variables: meanError_abs_Biden_2020 <dbl>,
## # meanError_abs_Trump_2020 <dbl>
# Compare means of error in 2016
meanError_allState_2016 <- StatePollError_2016 %>%
summarize(meanError_allState_Clinton_2016 = mean(meanError_Clinton_2016),
meanError_allState_Trump_2016 = mean(meanError_Trump_2016))
print(meanError_allState_2016)
## # A tibble: 1 × 2
## meanError_allState_Clinton_2016 meanError_allState_Trump_2016
## <dbl> <dbl>
## 1 4.05 9.66
# Compare means of error in 2020
meanError_allState_2020 <- StatePollError_2020 %>%
summarize(meanError_allState_Biden_2020 = mean(meanError_Biden_2020),
meanError_allState_Trump_2020 = mean(meanError_Trump_2020))
print(meanError_allState_2020)
## # A tibble: 1 × 2
## meanError_allState_Biden_2020 meanError_allState_Trump_2020
## <dbl> <dbl>
## 1 -1.17 4.06
#Analyze the effect of sample size on polling errors for 2016#
Pres2016_State_Error_SampleN <- Pres2016.StatePolls %>%
mutate(error_Clinton_2016 = DemCertVote - Clinton,
error_Trump_2016 = RepCertVote - Trump)
#Analyze the effect of sample size on polling errors for 2020
Pres2020_State_Error_SampleN <- Pres2020.StatePolls %>%
mutate(error_Biden_2020 = BidenCertVote - Biden,
error_Trump_2020 = TrumpCertVote - Trump)
ErrorVsSampleN_2016 <- ggplot(Pres2016_State_Error_SampleN) +
geom_point(aes(x=samplesize, y=abs(error_Clinton_2016)), color='blue', alpha=.04) +
geom_point(aes(x=samplesize, y=abs(error_Trump_2016)), color='red', alpha=.04) +
geom_smooth(aes(x=samplesize, y=abs(error_Clinton_2016)), color='blue', se = F) +
geom_smooth(aes(x=samplesize, y=abs(error_Trump_2016)), color='red', se = F) +
labs(title = "Polling Error (%) vs. Sample Size [2016]") +
labs(x = "Sample Size") +
labs(y = "Absolute Value of Polling Error (%)")
scale_x_log10()
## <ScaleContinuousPosition>
## Range:
## Limits: 0 -- 1
ErrorVsSampleN_2016
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
ErrorVsSampleN_2020 <- ggplot(Pres2020_State_Error_SampleN) +
geom_point(aes(x=SampleSize, y=abs(error_Biden_2020)), color='blue', alpha=.04) +
geom_point(aes(x=SampleSize, y=abs(error_Trump_2020)), color='red', alpha=.04) +
geom_smooth(aes(x=SampleSize, y=abs(error_Biden_2020)), color='blue', se = F) +
geom_smooth(aes(x=SampleSize, y=abs(error_Trump_2020)), color='red', se = F) +
labs(title = "Polling Error (%) vs. Sample Size [2020]") +
labs(x = "Sample Size") +
labs(y = "Absolute Value of Polling Error (%)") +
scale_x_log10()
ErrorVsSampleN_2020
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 64 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 64 rows containing non-finite values (stat_smooth).
## Warning: Removed 64 rows containing missing values (geom_point).
## Warning: Removed 64 rows containing missing values (geom_point).
#Analyze the effect of duration of poll on error#
# Calculate Days in Field for 2016
Pres2016_State_Error_SampleN_DaysinField <- Pres2016_State_Error_SampleN %>%
mutate(DaysInField = as.integer(EndDate - StartDate))
ErrorVsDuration_2016 <- ggplot(Pres2016_State_Error_SampleN_DaysinField) +
geom_point(aes(x=DaysInField, y=abs(error_Clinton_2016)), color='blue', alpha=.04) +
geom_point(aes(x=DaysInField, y=abs(error_Trump_2016)), color='red', alpha=.04) +
geom_smooth(aes(x=DaysInField, y=abs(error_Clinton_2016)), color='blue', alpha=.04) +
geom_smooth(aes(x=DaysInField, y=abs(error_Trump_2016)), color='red', alpha=.04) +
labs(title = "Polling Errors (%) vs. Poll Duration [2016]") +
labs(x = "Poll Duration") +
labs(y = "Absolute Value of Polling Error (%)") +
scale_x_log10()
ErrorVsDuration_2016
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 65 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 65 rows containing non-finite values (stat_smooth).
ErrorVsDuration_2020 <- ggplot(Pres2020_State_Error_SampleN) +
geom_point(aes(x=DaysinField, y=abs(error_Trump_2020)), color='red', alpha=.04) +
geom_smooth(aes(x=DaysinField, y=abs(error_Biden_2020)), color='blue', se = F) +
geom_smooth(aes(x=DaysinField, y=abs(error_Trump_2020)), color='red', se = F) +
labs(title = "Polling Errors (%) vs. Poll Duration [2016]") +
labs(x = "Poll Duration") +
labs(y = "Absolute Value of Polling Error (%)") +
scale_x_log10()
ErrorVsDuration_2020
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'