Objective: Choose a dataset from FiveThirtyEight to demonstrate how the tidyverse packages can be used to manipulate data. I chose the generic congressional ballot dataset, which tracks Democratic and Republican polling averages over time, beginning in April 2017 (the 2018 election cycle).

library(dplyr)
library(ggplot2)

# Locate the generic-ballot CSV. The path defaults to the original download
# location; set the GENERIC_BALLOT_CSV environment variable to point at a copy
# stored elsewhere without having to edit this script.
file_path <- Sys.getenv(
  "GENERIC_BALLOT_CSV",
  unset = "/Users/williamberritt/Downloads/congress-generic-ballot/generic_ballot_averages.csv"
)

# Fail early with an actionable message if the file is not where we expect it
if (!file.exists(file_path)) {
  stop(
    "Polling data not found at '", file_path, "'. ",
    "Set GENERIC_BALLOT_CSV to the location of generic_ballot_averages.csv.",
    call. = FALSE
  )
}

# Read data from CSV file
political_df <- read.csv(file_path)

# Split the polling averages by party. dplyr::filter() evaluates `candidate`
# inside the data frame, so the `political_df$` prefix is not needed.
republicans <- filter(political_df, candidate == "Republicans")
democrats <- filter(political_df, candidate == "Democrats")

# Check dimensions of Democrats and Republicans data
dim(democrats)
## [1] 1993    7
dim(republicans)
## [1] 1993    7
# Count distinct dates for Democrats and Republicans; a count equal to the row
# count above means each party has exactly one row per date
n_distinct(democrats$date)
## [1] 1993
n_distinct(republicans$date)
## [1] 1993
# Pair every Republican row with the Democratic row that shares the same
# date, election and cycle
rep_dem <- republicans %>%
  left_join(democrats, by = c("date", "election", "cycle"))

# Relabel all eleven joined columns so each one carries its party prefix.
# NOTE(review): the new names are applied by position, so this assumes the
# joined columns arrive in exactly this order -- confirm against the CSV header.
colnames(rep_dem) <- c(
  "republicans",
  "r_ballot_average_estimate",
  "r_ballot_lo_estimate",
  "r_ballot_hi_estimate",
  "date",
  "election_date",
  "cycle",
  "democrats",
  "d_ballot_average_estimate",
  "d_ballot_lo_estimate",
  "d_ballot_hi_estimate"
)

# Keep only the estimate and date columns, ordered so that each Republican
# measure sits next to its Democratic counterpart
clean_df <- rep_dem %>%
  select(
    r_ballot_average_estimate,
    d_ballot_average_estimate,
    r_ballot_lo_estimate,
    d_ballot_lo_estimate,
    r_ballot_hi_estimate,
    d_ballot_hi_estimate,
    date,
    election_date,
    cycle
  )
head(clean_df, n = 10)
##    r_ballot_average_estimate d_ballot_average_estimate r_ballot_lo_estimate
## 1                   39.54969                  43.94449             34.93867
## 2                   39.59254                  43.74965             34.98343
## 3                   39.58794                  43.74553             34.97840
## 4                   39.58223                  43.74052             34.97213
## 5                   38.84026                  43.41034             34.27031
## 6                   38.83375                  43.40546             34.26327
## 7                   39.77268                  44.31843             35.31767
## 8                   39.76940                  44.31649             35.31392
## 9                   39.76566                  44.31416             35.30986
## 10                  39.76336                  44.31274             35.30714
##    d_ballot_lo_estimate r_ballot_hi_estimate d_ballot_hi_estimate       date
## 1              39.33347             44.16071             48.55550 2017-04-15
## 2              39.14054             44.20165             48.35876 2017-04-16
## 3              39.13599             44.19749             48.35508 2017-04-17
## 4              39.13042             44.19233             48.35062 2017-04-18
## 5              38.84039             43.41022             47.98030 2017-04-19
## 6              38.83498             43.40423             47.97594 2017-04-20
## 7              39.86341             44.22770             48.77344 2017-04-21
## 8              39.86101             44.22487             48.77196 2017-04-22
## 9              39.85835             44.22146             48.76996 2017-04-23
## 10             39.85652             44.21959             48.76897 2017-04-24
##    election_date cycle
## 1     2018-11-06  2018
## 2     2018-11-06  2018
## 3     2018-11-06  2018
## 4     2018-11-06  2018
## 5     2018-11-06  2018
## 6     2018-11-06  2018
## 7     2018-11-06  2018
## 8     2018-11-06  2018
## 9     2018-11-06  2018
## 10    2018-11-06  2018
# Coerce both date columns to class Date in a single step so they can be
# plotted on a date axis
clean_df <- mutate(
  clean_df,
  date = as.Date(date),
  election_date = as.Date(election_date)
)

# Plot Republican and Democrat average estimates over time with election date
# annotations.
#
# Election markers are drawn once per distinct election date rather than once
# per data row. Rows with a missing election date are dropped here explicitly;
# previously ggplot2 discarded them itself and emitted a
# "Removed 253 rows containing missing values (`geom_vline()`)" warning.
election_days <- unique(clean_df$election_date[!is.na(clean_df$election_date)])

ggplot(clean_df, aes(x = date)) +
  geom_line(aes(y = r_ballot_average_estimate, color = "Republican")) +
  geom_line(aes(y = d_ballot_average_estimate, color = "Democrat")) +
  # Neutral colour so the election markers are not mistaken for a party series
  geom_vline(
    xintercept = as.numeric(election_days),
    linetype = "dashed",
    color = "grey30"
  ) +
  # Pin the conventional party colours; the default palette assigns colours
  # alphabetically, which would draw the Democratic series in red
  scale_color_manual(values = c(Democrat = "blue", Republican = "red")) +
  labs(title = "Average Estimates Over Time",
       x = "Date",
       y = "Average Estimate",
       color = "Party") +
  theme_minimal()

Conclusion: Using the dplyr and ggplot2 packages from the tidyverse, I was able to reshape the raw polling data into a cleaner data frame that is better prepared for analysis, and to visualize the Democratic and Republican polling averages over time.