Objective: Choose a dataset from FiveThirtyEight to demonstrate how
the tidyverse packages can be used to manipulate data. I chose a dataset
that tracks Democratic and Republican generic-ballot polling averages
starting in April 2017.
library(dplyr)
library(ggplot2)
# Location of the generic-ballot averages CSV.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this file exists.
file_path <- '/Users/williamberritt/Downloads/congress-generic-ballot/generic_ballot_averages.csv'
# Load the polling averages into a data frame
political_df <- read.csv(file_path)
# Split the data into one data frame per party with dplyr::filter().
# filter() evaluates `candidate` inside political_df, so the column is
# referenced by its bare name rather than as political_df$candidate.
republicans <- filter(political_df, candidate == "Republicans")
democrats <- filter(political_df, candidate == "Democrats")
# Sanity check before joining: compare the shape (rows, columns) of each
# party's subset
dim(democrats)
## [1] 1993 7
dim(republicans)
## [1] 1993 7
# Number of distinct dates per subset; when it equals the row count, that
# subset holds exactly one row per date
n_distinct(democrats[["date"]])
## [1] 1993
n_distinct(republicans[["date"]])
## [1] 1993
# Pair every Republican row with the Democrat row that shares its date,
# election and cycle, then relabel the joined columns
rep_dem <- republicans %>%
  left_join(democrats, by = c("date", "election", "cycle")) %>%
  # Names are assigned by position: the four Republican columns, the three
  # join keys, then the four Democrat columns
  setNames(c(
    "republicans", "r_ballot_average_estimate", "r_ballot_lo_estimate",
    "r_ballot_hi_estimate", "date", "election_date", "cycle",
    "democrats", "d_ballot_average_estimate", "d_ballot_lo_estimate",
    "d_ballot_hi_estimate"
  ))
# Keep only the estimate columns and the date/election keys, ordered so the
# two parties' matching estimates sit side by side
clean_df <- rep_dem %>%
  select(
    r_ballot_average_estimate, d_ballot_average_estimate,
    r_ballot_lo_estimate, d_ballot_lo_estimate,
    r_ballot_hi_estimate, d_ballot_hi_estimate,
    date, election_date, cycle
  )
# Preview the first ten rows
clean_df %>% head(10)
## r_ballot_average_estimate d_ballot_average_estimate r_ballot_lo_estimate
## 1 39.54969 43.94449 34.93867
## 2 39.59254 43.74965 34.98343
## 3 39.58794 43.74553 34.97840
## 4 39.58223 43.74052 34.97213
## 5 38.84026 43.41034 34.27031
## 6 38.83375 43.40546 34.26327
## 7 39.77268 44.31843 35.31767
## 8 39.76940 44.31649 35.31392
## 9 39.76566 44.31416 35.30986
## 10 39.76336 44.31274 35.30714
## d_ballot_lo_estimate r_ballot_hi_estimate d_ballot_hi_estimate date
## 1 39.33347 44.16071 48.55550 2017-04-15
## 2 39.14054 44.20165 48.35876 2017-04-16
## 3 39.13599 44.19749 48.35508 2017-04-17
## 4 39.13042 44.19233 48.35062 2017-04-18
## 5 38.84039 43.41022 47.98030 2017-04-19
## 6 38.83498 43.40423 47.97594 2017-04-20
## 7 39.86341 44.22770 48.77344 2017-04-21
## 8 39.86101 44.22487 48.77196 2017-04-22
## 9 39.85835 44.22146 48.76996 2017-04-23
## 10 39.85652 44.21959 48.76897 2017-04-24
## election_date cycle
## 1 2018-11-06 2018
## 2 2018-11-06 2018
## 3 2018-11-06 2018
## 4 2018-11-06 2018
## 5 2018-11-06 2018
## 6 2018-11-06 2018
## 7 2018-11-06 2018
## 8 2018-11-06 2018
## 9 2018-11-06 2018
## 10 2018-11-06 2018
# Convert date columns to Date type so they plot on a date axis
clean_df$date <- as.Date(clean_df$date)
clean_df$election_date <- as.Date(clean_df$election_date)
# One row per election day. Mapping geom_vline() over clean_df itself would
# draw an identical line for every data row, so the marker layer gets its own
# de-duplicated table. Rows with a missing election_date are dropped here
# explicitly instead of being discarded by ggplot2 with a warning.
election_days <- clean_df %>%
  distinct(election_date) %>%
  filter(!is.na(election_date))
# Plot Republican and Democrat average estimates over time, with a dashed
# vertical marker on each election date
ggplot(clean_df, aes(x = date)) +
  geom_line(aes(y = r_ballot_average_estimate, color = "Republican")) +
  geom_line(aes(y = d_ballot_average_estimate, color = "Democrat")) +
  geom_vline(
    data = election_days,
    aes(xintercept = as.numeric(election_date)),
    linetype = "dashed",
    # Neutral grey keeps the election markers distinct from the party lines
    color = "grey40"
  ) +
  # Explicit party colours: the default palette assigns hues alphabetically,
  # which would give "Democrat" the reddish one
  scale_color_manual(values = c(Democrat = "blue", Republican = "red")) +
  labs(
    title = "Average Estimates Over Time",
    x = "Date",
    y = "Average Estimate",
    color = "Party"
  ) +
  theme_minimal()
