# Load libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)

# Load datasets
load(file="Pres2016.PV.Rdata")
load(file="Pres2016.StatePolls.Rdata")
load(file="Pres2020.PV.Rdata")
load(file="Pres2020.StatePolls.Rdata")
# Declare election date variables
election.day20 <- as.Date("11/3/2020", "%m/%d/%Y")   
election.day16 <- as.Date("11/8/2016", "%m/%d/%Y")
# Process 2020
Pres2020.PV <- Pres2020.PV %>%
  mutate(EndDate = as.Date(Pres2020.PV$EndDate, "%m/%d/%Y"),
         StartDate = as.Date(Pres2020.PV$StartDate, "%m/%d/%Y"), 
         DaysToED = as.numeric(election.day20 - EndDate), 
         margin = Biden - Trump)
# Process 2016 Data
Pres2016.PV <- Pres2016.PV %>%
                mutate(EndDate = as.Date(Pres2016.PV$EndDate, "%m/%d/%Y"), 
                      StartDate = as.Date(Pres2016.PV$StartDate, "%m/%d/%Y"),
                      DaysToED = as.numeric(election.day16 - EndDate),
                      margin = Clinton - Trump)
# Generate the plot for 2020
BTNational <- ggplot(Pres2020.PV) +
  geom_point(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", alpha = .4) +
  geom_point(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", alpha = .4) + 
  geom_smooth(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", se = F) + 
  geom_smooth(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", se = F) + 
  labs(title = "% Two-Candidate Share in 2020 National Polls Over Time \n Biden vs Trump") +
  labs(y = "Two-Candidate Pct. Support") + 
  labs(x = "Poll Ending Date") + 
  scale_x_date(date_breaks = "2 week", date_labels = "%b %d") + 
  scale_y_continuous(breaks=seq(30,70, by=5))
  
BTNational
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Generate the plot for 2016
CTNational <- ggplot(Pres2016.PV) +
  geom_point(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", alpha = .4) +
  geom_point(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", alpha = .4) + 
  geom_smooth(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", se = F) + 
  geom_smooth(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", se = F) + 
  labs(title = "% Two-Candidate Share in 2016 National Polls Over Time \n Clinton vs Trump") +
  labs(y = "Two-Candidate Pct. Support") + 
  labs(x = "Poll Ending Date") + 
  scale_x_date(date_breaks = "2 week", date_labels = "%b %d") + 
  scale_y_continuous(breaks=seq(30,70, by=5))
CTNational
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Plot 2020 poll results (last 100 days)
BTNat_100day <- Pres2020.PV %>%
  filter(DaysToED <= 100)
BTNat_100day_plot <- ggplot(BTNat_100day) + 
  geom_point(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", alpha = .4)+
  geom_point(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", alpha = .4) + 
  geom_smooth(aes(x=EndDate, y = Biden/(Biden+Trump)*100), color = "blue", se = F) + 
  geom_smooth(aes(x=EndDate, y = Trump/(Biden+Trump)*100), color = "red", se = F) + 
  labs(title = "% Two-Candidate Share in 2020 National Polls Over Time \n Biden vs Trump (100 days prior)") +
  labs(y = "Two-Candidate Pct. Support") + 
  labs(x = "Poll Ending Date") + 
  scale_x_date(date_breaks = "2 week", date_labels = "%b %d") + 
  scale_y_continuous(breaks=seq(30,70, by=5))
BTNat_100day_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

CTNat_100day <- Pres2016.PV %>%
  filter(DaysToED <= 100)
CTNat_100day_plot <- ggplot(CTNat_100day) +
  geom_point(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", alpha = .4) +
  geom_point(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", alpha = .4) + 
  geom_smooth(aes(x=EndDate, y = Clinton/(Clinton+Trump)*100), color = "blue", se = F) + 
  geom_smooth(aes(x=EndDate, y = Trump/(Clinton+Trump)*100), color = "red", se = F) + 
  labs(title = "% Two-Candidate Share in 2016 National Polls Over Time \n Clinton vs Trump (100 days prior)") +
  labs(y = "Two-Candidate Pct. Support") + 
  labs(x = "Poll Ending Date") + 
  scale_x_date(date_breaks = "2 week", date_labels = "%b %d") + 
  scale_y_continuous(breaks=seq(30,70, by=5))
CTNat_100day_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Pres2016.StatePolls <- Pres2016.StatePolls %>%
  mutate(ClintonNorm = Clinton/(Clinton+Trump),
         TrumpNorm = 1-ClintonNorm)
ElectoralCollegeVotes <- NULL
B <- 100
for(i in 1:B){
  dat <- sample_n(Pres2016.StatePolls,
                  nrow(Pres2016.StatePolls),
                  replace = TRUE)
  ElectoralCollegeVotes <- dat %>%
    group_by(StateName) %>%
    summarize(ClintonProbWin1 = mean(Clinton > Trump),
              ClintonProbWin2 = mean(Clinton/100),
              ClintonProbWin3 = mean(ClintonNorm),
              EV = mean(EV)) %>%
    mutate(ClintonEV1 = ClintonProbWin1*EV,
          ClintonEV2 = ClintonProbWin2*EV,
          ClintonEV3 = ClintonProbWin3*EV,
          TrumpEV1 = EV - ClintonEV1,
          TrumpEV2 = EV - ClintonEV2,
          TrumpEV3 = EV - ClintonEV3) %>%
    summarize(ClintonECV1 = sum(ClintonEV1),
              ClintonECV2 = sum(ClintonEV2),
              ClintonECV3 = sum(ClintonEV3),
              TrumpECV1 = sum(TrumpEV1),
              TrumpECV2 = sum(TrumpEV2),
              TrumpECV3 = sum(TrumpEV3)) %>%
  bind_rows(ElectoralCollegeVotes)
  }
 ElectoralCollegeVotes %>%
  summarize(ClintonWin1 = mean(ClintonECV1 >270))
## # A tibble: 1 × 1
##   ClintonWin1
##         <dbl>
## 1           1
ElectoralCollegeVotes %>%
  summarize(ClintonWin2 = mean(ClintonECV2 > 270))
## # A tibble: 1 × 1
##   ClintonWin2
##         <dbl>
## 1           0
ElectoralCollegeVotes %>%
  summarize(ClintonWin3 = mean(ClintonECV3 > 270))
## # A tibble: 1 × 1
##   ClintonWin3
##         <dbl>
## 1           1

#Part 3: Calculate the error in polls by state#

StatePollError_2016 <- Pres2016.StatePolls %>%
  group_by(state) %>%
  summarize(meanPredicted_Clinton_2016 = mean(Clinton),
            meanPredicted_Trump_2016 = mean(Trump),
            meanError_Clinton_2016 = DemCertVote - meanPredicted_Clinton_2016,
            meanError_Trump_2016 = RepCertVote - meanPredicted_Trump_2016) %>%
  mutate(meanError_abs_Clinton_2016 = abs(meanError_Clinton_2016),
         meanError_abs_Trump_2016 = abs(meanError_Trump_2016)) %>%
  ungroup()
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
StatePollError_2016 <- unique(StatePollError_2016)
# Print top 5 states with the largest error for Clinton (absolute value)
StatePollError_2016_Clinton <- StatePollError_2016 %>% arrange(-meanError_abs_Clinton_2016)
writeLines("Top 5 states with the most error for Clinton in 2016: ")
## Top 5 states with the most error for Clinton in 2016:
print(StatePollError_2016_Clinton, n=5)
## # A tibble: 50 × 7
##   state         meanPredicted_C… meanPredicted_T… meanError_Clint… meanError_Trump…
##   <chr>                    <dbl>            <dbl>            <dbl>            <dbl>
## 1 Hawaii                    48.9             27.2            13.3              2.78
## 2 Vermont                   50.9             21.2            10.2             11.4 
## 3 Rhode Island              45.9             31.0             9.54             8.82
## 4 Delaware                  44.2             32.2             9.16             9.68
## 5 Massachusetts             52.8             27.8             8.04             5.67
## # … with 45 more rows, and 2 more variables: meanError_abs_Clinton_2016 <dbl>,
## #   meanError_abs_Trump_2016 <dbl>
# Print top 5 states with the largest error for Trump (absolute value)
StatePollError_2016_Trump <- StatePollError_2016 %>% arrange(-meanError_abs_Trump_2016)
writeLines("Top 5 states with the most error for Trump in 2016: ")
## Top 5 states with the most error for Trump in 2016:
print(StatePollError_2016_Trump, n=5)
## # A tibble: 50 × 7
##   state         meanPredicted_C… meanPredicted_T… meanError_Clint… meanError_Trump…
##   <chr>                    <dbl>            <dbl>            <dbl>            <dbl>
## 1 Wyoming                   21.9             53.7            0.572             16.4
## 2 South Dakota              30.7             45.2            1.04              16.3
## 3 West Virginia             30.3             52.9           -3.81              15.8
## 4 North Dakota              26.6             49.0            1.23              15.1
## 5 Oklahoma                  30.3             50.5           -1.42              14.8
## # … with 45 more rows, and 2 more variables: meanError_abs_Clinton_2016 <dbl>,
## #   meanError_abs_Trump_2016 <dbl>
StatePollError_2020 <- Pres2020.StatePolls %>%
  group_by(State) %>%
  summarize(meanPredicted_Biden_2020 = mean(Biden),
            meanPredicted_Trump_2020 = mean(Trump),
            meanError_Biden_2020 = BidenCertVote - meanPredicted_Biden_2020,
            meanError_Trump_2020 = TrumpCertVote - meanPredicted_Trump_2020) %>%
  mutate(meanError_abs_Biden_2020 = abs(meanError_Biden_2020),
         meanError_abs_Trump_2020 = abs(meanError_Trump_2020)) %>%
  ungroup()
## `summarise()` has grouped output by 'State'. You can override using the `.groups` argument.
StatePollError_2020 <- unique(StatePollError_2020)
# Print top 5 states with the largest error for Biden (absolute value)
StatePollError_2020_Biden <- StatePollError_2020 %>% arrange(-meanError_abs_Biden_2020)
writeLines("Top 5 states with the most error for Biden in 2020: ")
## Top 5 states with the most error for Biden in 2020:
print(StatePollError_2020_Biden, n=5)
## # A tibble: 50 × 7
##   State meanPredicted_Bid… meanPredicted_Tru… meanError_Biden_… meanError_Trump…
##   <chr>              <dbl>              <dbl>             <dbl>            <dbl>
## 1 RI                  65.4               33.6             -6.40             5.4 
## 2 WY                  32.8               64.3             -5.83             5.67
## 3 ND                  37.7               58.3             -5.71             6.71
## 4 NE                  44.4               53.8             -5.4              5.2 
## 5 WV                  35.4               61.5             -5.38             7.5 
## # … with 45 more rows, and 2 more variables: meanError_abs_Biden_2020 <dbl>,
## #   meanError_abs_Trump_2020 <dbl>
# Print top 5 states with the largest error for Trump (absolute value)
StatePollError_2020_Trump <- StatePollError_2020 %>% arrange(-meanError_abs_Trump_2020)
writeLines("Top 5 states with the most error for Trump in 2020: ")
## Top 5 states with the most error for Trump in 2020:
print(StatePollError_2020_Trump, n=5)
## # A tibble: 50 × 7
##   State meanPredicted_Bid… meanPredicted_Tru… meanError_Biden_… meanError_Trump…
##   <chr>              <dbl>              <dbl>             <dbl>            <dbl>
## 1 WV                  35.4               61.5             -5.38             7.5 
## 2 UT                  39.9               50.9             -1.93             7.14
## 3 ND                  37.7               58.3             -5.71             6.71
## 4 KY                  39.1               55.8             -3.11             6.17
## 5 SD                  41.1               55.9             -5.14             6.14
## # … with 45 more rows, and 2 more variables: meanError_abs_Biden_2020 <dbl>,
## #   meanError_abs_Trump_2020 <dbl>
# Compare means of error in 2016
meanError_allState_2016 <- StatePollError_2016 %>%
  summarize(meanError_allState_Clinton_2016 = mean(meanError_Clinton_2016),
            meanError_allState_Trump_2016 = mean(meanError_Trump_2016))
print(meanError_allState_2016)
## # A tibble: 1 × 2
##   meanError_allState_Clinton_2016 meanError_allState_Trump_2016
##                             <dbl>                         <dbl>
## 1                            4.05                          9.66
# Compare means of error in 2020
meanError_allState_2020 <- StatePollError_2020 %>%
  summarize(meanError_allState_Biden_2020 = mean(meanError_Biden_2020),
            meanError_allState_Trump_2020 = mean(meanError_Trump_2020))
print(meanError_allState_2020)
## # A tibble: 1 × 2
##   meanError_allState_Biden_2020 meanError_allState_Trump_2020
##                           <dbl>                         <dbl>
## 1                         -1.17                          4.06

#Analyze the effect of sample size on polling errors for 2016#

Pres2016_State_Error_SampleN <- Pres2016.StatePolls %>%
  mutate(error_Clinton_2016 = DemCertVote - Clinton,
         error_Trump_2016 = RepCertVote - Trump)

#Analyze the effect of sample size on polling errors for 2020

Pres2020_State_Error_SampleN <- Pres2020.StatePolls %>%
  mutate(error_Biden_2020 = BidenCertVote - Biden,
         error_Trump_2020 = TrumpCertVote - Trump)
ErrorVsSampleN_2016 <- ggplot(Pres2016_State_Error_SampleN) + 
  geom_point(aes(x=samplesize, y=abs(error_Clinton_2016)), color='blue', alpha=.04) + 
  geom_point(aes(x=samplesize, y=abs(error_Trump_2016)), color='red', alpha=.04) + 
  geom_smooth(aes(x=samplesize, y=abs(error_Clinton_2016)), color='blue', se = F) + 
  geom_smooth(aes(x=samplesize, y=abs(error_Trump_2016)), color='red', se = F) + 
  labs(title = "Polling Error (%) vs. Sample Size [2016]") + 
  labs(x = "Sample Size") + 
  labs(y = "Absolute Value of Polling Error (%)")
  scale_x_log10()
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1
ErrorVsSampleN_2016
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

ErrorVsSampleN_2020 <- ggplot(Pres2020_State_Error_SampleN) + 
  geom_point(aes(x=SampleSize, y=abs(error_Biden_2020)), color='blue', alpha=.04) + 
  geom_point(aes(x=SampleSize, y=abs(error_Trump_2020)), color='red', alpha=.04) + 
  geom_smooth(aes(x=SampleSize, y=abs(error_Biden_2020)), color='blue', se = F) + 
  geom_smooth(aes(x=SampleSize, y=abs(error_Trump_2020)), color='red', se = F) + 
  labs(title = "Polling Error (%) vs. Sample Size [2020]") + 
  labs(x = "Sample Size") + 
  labs(y = "Absolute Value of Polling Error (%)") + 
  scale_x_log10()
ErrorVsSampleN_2020
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 64 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 64 rows containing non-finite values (stat_smooth).
## Warning: Removed 64 rows containing missing values (geom_point).

## Warning: Removed 64 rows containing missing values (geom_point).

#Analyze the effect of duration of poll on error#

# Calculate Days in Field for 2016
Pres2016_State_Error_SampleN_DaysinField <- Pres2016_State_Error_SampleN %>%
  mutate(DaysInField = as.integer(EndDate - StartDate))
ErrorVsDuration_2016 <- ggplot(Pres2016_State_Error_SampleN_DaysinField) +
  geom_point(aes(x=DaysInField, y=abs(error_Clinton_2016)), color='blue', alpha=.04) + 
  geom_point(aes(x=DaysInField, y=abs(error_Trump_2016)), color='red', alpha=.04) + 
  geom_smooth(aes(x=DaysInField, y=abs(error_Clinton_2016)), color='blue', alpha=.04) +
  geom_smooth(aes(x=DaysInField, y=abs(error_Trump_2016)), color='red', alpha=.04) + 
  labs(title = "Polling Errors (%) vs. Poll Duration [2016]") + 
  labs(x = "Poll Duration") + 
  labs(y = "Absolute Value of Polling Error (%)") + 
  scale_x_log10()
ErrorVsDuration_2016
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous x-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 65 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 65 rows containing non-finite values (stat_smooth).

ErrorVsDuration_2020 <- ggplot(Pres2020_State_Error_SampleN) +
  geom_point(aes(x=DaysinField, y=abs(error_Trump_2020)), color='red', alpha=.04) +
  geom_smooth(aes(x=DaysinField, y=abs(error_Biden_2020)), color='blue', se = F) + 
  geom_smooth(aes(x=DaysinField, y=abs(error_Trump_2020)), color='red', se = F) +
  labs(title = "Polling Errors (%) vs. Poll Duration [2016]") + 
  labs(x = "Poll Duration") + 
  labs(y = "Absolute Value of Polling Error (%)") + 
  scale_x_log10()
ErrorVsDuration_2020
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'