Project4

library(tidyverse)
library(httr)  
library(jsonlite)
library(fuzzyjoin)

Intro

The goal of this notebook is to create a list of candidates that we’ll use for further analysis. I’ll use data from the Federal Election Committee (FEC) to provide a baseline list of candidates registered for elections for the House of Representatives in the 2022 midterms. The FEC data has most of the key features we need: name, party, state and district. However, I’ll eventually want to scrape tweets for these candidates, so I need Twitter handles. I’ll therefore grab a few additional datasets to map in Twitter handles wherever possible.

Query FEC

The FEC provides an API (https://api.open.fec.gov/developers/) to query data.

year = 2022
fec_key <- read_lines('data/fec_api.txt')

query_params <- list(election_year = year,
                     office = 'H',
                     api_key = fec_key)
url <- modify_url('https://api.open.fec.gov/', 
                  path = '/v1/candidates/search/',
                  query = query_params)

initial_response <- GET(url)
parsed_response <- fromJSON(content(initial_response, "text", encoding="UTF-8"), 
                            simplifyVector = FALSE)
total_pages <- parsed_response$pagination$pages

The response is a heavily nested JSON, so I need to loop through each page and extract the actual data.

response <- list()

for (i in 1:total_pages) {
  query_params <- list(page = i,
                       election_year = year,
                       office = 'H',
                       api_key = fec_key)
  url <- modify_url('https://api.open.fec.gov/', 
                    path = '/v1/candidates/search/',
                    query = query_params)
  initial_response <- GET(url)
  parsed_response <- fromJSON(content(initial_response, 'text', encoding='UTF-8'), 
                              simplifyVector = FALSE)
  response[[i]] <- parsed_response
  Sys.sleep(0.5)
}

I can now loop through the data from each page and add the required features to a dataframe.

candidates <- map(response, function(x) x$results) %>%
    unlist(recursive = F) %>%
    tibble(
      fec_id = map_chr(., 'candidate_id', .default = NA),
      name = map_chr(., 'name', .default = NA),
      state = map_chr(., 'state', .default = NA),
      district = map_chr(., 'district', .default = NA),
      party = map_chr(., 'party_full', .default = NA),
      office = map_chr(., 'office_full', .default = NA),
      incumbent_challenge = map_chr(., 'incumbent_challenge_full', .default = NA),
      candidate_status = map_chr(., 'candidate_status', .default = NA),
    ) %>%
  select(-.)

candidates %>% head()

## # A tibble: 6 × 8
##   fec_id  name  state district party office incumbent_challenge candidate_status
##   <chr>   <chr> <chr> <chr>    <chr> <chr>  <chr>               <chr>           
## 1 H2CO07… AADL… CO    07       REPU… House  Open seat           P               
## 2 H2UT03… AALD… UT    03       REPU… House  Challenger          C               
## 3 H2CA30… AAZA… CA    32       DEMO… House  Challenger          N               
## 4 H2MN07… ABAH… MN    07       DEMO… House  Challenger          P               
## 5 H2NY12… ABDE… NY    12       DEMO… House  Challenger          P               
## 6 H2MN05… ABDI… MN    05       REPU… House  Challenger          P

Congress Bio, Poliwoops, and Ballotpedia Twitter IDs

To map in Twitter handles / IDs, I’ll use three datasets, the third of which I scraped myself in notebook 1.

The @unitedstates project (members of Congress only): https://github.com/unitedstates/congress-legislators
Politwoops (members of Congress only): https://www.propublica.org/datastore/dataset/politicians-tracked-by-politwoops
Ballotpedia (Member and candidates): https://ballotpedia.org/List_of_congressional_candidates_in_the_2022_elections

congress_current <- read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
congress_history <- read_csv('https://theunitedstates.io/congress-legislators/legislators-historical.csv')

congress <- rbind(congress_current, congress_history) %>%  
  mutate(fec_ids = str_split(fec_ids, ",")) %>%
  unnest_longer(fec_ids, keep_empty = TRUE)

poliwoops <- read_csv('https://s3.amazonaws.com/pp-projects-static/politwoops/active_accounts.csv')

ballotpedia <- read_csv('data/ballotpedia_scrape2022.csv')

ballotpedia <- ballotpedia[!duplicated(ballotpedia),] %>%
  mutate(district = if_else(is.na(district), '00', sprintf("%02d", district)),
         state = str_replace_all(state, '_', ' '),
         twitter_name = str_extract_all(twitter, '(?<=twitter.com/)\\w+(?=\')'),
         ballotpedia_id = str_extract(ballotpedia, '(?<=ballotpedia.org/).*') %>%
           str_replace_all('_', ' '))

I’ll perform a simple join using several unique keys: the FEC ID, the Bioguide ID, the Ballotpedia ID, and the GovTrack ID.

candidates <- candidates %>%
  ## Join with IDs from congress data
  left_join(
    select(congress, bioguide_id, fec_ids, ballotpedia_id, govtrack_id, twitter_id),
    by = c('fec_id' = 'fec_ids'), keep = FALSE) %>%
  ## Join with IDs from poliwoops data based on bioguide ID
  left_join(
    select(poliwoops, bioguide_id, twitter_id) %>%
      filter(!is.na(bioguide_id)),
    by = 'bioguide_id', suffix = c('','_poliw_bio'), keep = FALSE) %>%
  ## Join with IDs from poliwoops data based on fec ID
  left_join(
    select(poliwoops, fec_candidate_id, twitter_id) %>%
      filter(!is.na(fec_candidate_id)), 
    by = c('fec_id' = 'fec_candidate_id'), suffix = c('','_poliw_fec'), keep = FALSE) %>%
  ## Join with IDs from ballotpedia
  left_join(
    select(ballotpedia, twitter_name, ballotpedia_id)) %>%
  rename(twitter_name_match = twitter_name)

## Warning in left_join(., select(poliwoops, fec_candidate_id, twitter_id) %>% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 18 of `x` matches multiple rows in `y`.
## ℹ Row 599 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Joining with `by = join_by(ballotpedia_id)`

candidates <- candidates %>%
  mutate(across(setdiff(everything(), one_of('twitter_name_match')), as.character))

candidates %>% head()

## # A tibble: 6 × 15
##   fec_id  name  state district party office incumbent_challenge candidate_status
##   <chr>   <chr> <chr> <chr>    <chr> <chr>  <chr>               <chr>           
## 1 H2CO07… AADL… CO    07       REPU… House  Open seat           P               
## 2 H2UT03… AALD… UT    03       REPU… House  Challenger          C               
## 3 H2CA30… AAZA… CA    32       DEMO… House  Challenger          N               
## 4 H2MN07… ABAH… MN    07       DEMO… House  Challenger          P               
## 5 H2NY12… ABDE… NY    12       DEMO… House  Challenger          P               
## 6 H2MN05… ABDI… MN    05       REPU… House  Challenger          P               
## # ℹ 7 more variables: bioguide_id <chr>, ballotpedia_id <chr>,
## #   govtrack_id <chr>, twitter_id <chr>, twitter_id_poliw_bio <chr>,
## #   twitter_id_poliw_fec <chr>, twitter_name_match <list>

Fuzzy Matching for Ballotpedia Names

Unfortuantely, the Ballotpedia IDs mapped in from @unitedstates resulted in very few matches. So, to supplement this, I’ll use fuzzy matching based on names. I’ll set the threshold quite high, which could lead to some erroneous matches. However, I’ll also use party, state and district fields as “helpers” to narrow down potential matches and support accurate mappings.

https://www.rdocumentation.org/packages/stringdist/versions/0.9.10/topics/stringdist-metrics

candidates <- candidates %>%
  mutate(name = str_replace(name, '(?<=WALLACE)\\.', ','),
         party_simple = case_when(party == 'REPUBLICAN PARTY' ~ 'Republican',
                                  party == 'DEMOCRATIC PARTY' ~ 'Democrat',
                                  TRUE ~ 'Other'),
         key = case_when(!str_detect(name,',') ~ name,
                          TRUE ~ str_c(str_extract(name, '(?<=, ).*'), ' ',
                                       str_extract(name, '.*(?=,)'))) %>%
           str_remove_all(str_c(
             '(MS\\.|MRS\\.|MR\\.|DR\\.|JR\\.|JR|SR\\.|III|II|',
             ' [A-Z]\\.(?= )| [A-Z](?= )|É|[A-Z]+`|[0-9])')) %>%
           str_remove_all('[[:punct:]]') %>%
           str_replace_all('  ', ' ') %>%
           trimws() %>%
           tolower(),
         key = str_c(str_extract(key, '^[A-Za-z]+'), ' ',
                 str_extract(key, ' [A-Za-z]+$')))

ballotpedia <- ballotpedia %>%
  mutate(state = state.abb[match(state,state.name)],
         party_simple = case_when(party == 'Republican Party' ~ 'Republican',
                                  party == 'Democratic Party' ~ 'Democrat',
                                  TRUE ~ 'Other'),
         key = str_remove_all(name, str_c('(III|II|Jr\\.|Sr\\.|',
                                          ' [A-Z]\\.(?= )| [A-Z](?= )|é)')) %>%
           str_remove_all('[[:punct:]]') %>%
           str_replace_all('  ', ' ') %>%
           trimws() %>%
           tolower(),
         key = str_c(str_extract(key, '^[A-Za-z]+'), ' ',
                 str_extract(key, ' [A-Za-z]+$')))

candidates[is.na(candidates)] <- ''
ballotpedia[is.na(ballotpedia)] <- ''

## Fuzzy join to grab twitter handles from ballotpedia
fuzzy <- stringdist_join(
  select(candidates, key, state, party_simple,
         district, fec_id),
  select(ballotpedia, key, state, party_simple,
         district, twitter_name, ballotpedia),
  by = 'key',
  mode = 'inner',
  method = 'lv',
  max_dist = 5,
  ignore_case = TRUE,
  distance_col = 'distance') %>%
  filter(state.x == state.y,
         party_simple.x == party_simple.y,
         district.x == district.y)

candidates <- candidates %>%
  left_join(select(fuzzy, fec_id, twitter_name, ballotpedia),
            by = 'fec_id', keep = FALSE) %>%
  rename(twitter_name_fuzzy = twitter_name)

## Warning in left_join(., select(fuzzy, fec_id, twitter_name, ballotpedia), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 18 of `x` matches multiple rows in `y`.
## ℹ Row 9 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

Cleanup

All of these joins created quite a bit of duplication, and we have Twitter IDs / handles spread across several columns in different formats, with potential duplication. So, I’ll pivot individuals values into seperate columns, then unite them into a single comma-separated string, then finally pivot the names / IDs longer.

## Convert IDs/Names to list, then unnest to long format
candidates <- candidates %>%
  unnest_wider(twitter_name_match, names_sep = '_') %>%
  unnest_wider(twitter_name_fuzzy, names_sep = '_') %>%
  unite(twitter_name, contains('twitter_name'), 
        sep = ',', na.rm = TRUE) %>%
  unite(twitter_id, contains('twitter_id'), 
        sep = ',', na.rm = TRUE) %>%
  mutate(twitter_name = str_split(twitter_name, ","),
         twitter_id = str_split(twitter_id, ",")) %>%
  unnest_longer(twitter_name, keep_empty = TRUE) %>%
  unnest_longer(twitter_id, keep_empty = TRUE) %>%
  mutate(twitter_name = tolower(twitter_name)) 

## Final removal of any dupes
candidates <- candidates[!duplicated(candidates), ]

candidates %>%
  filter(twitter_name != '' | twitter_id != '') %>%
  select(name) %>% unique() %>% nrow()

## [1] 1658

candidates

## # A tibble: 5,096 × 16
##    fec_id name  state district party office incumbent_challenge candidate_status
##    <chr>  <chr> <chr> <chr>    <chr> <chr>  <chr>               <chr>           
##  1 H2CO0… AADL… CO    07       REPU… House  Open seat           P               
##  2 H2UT0… AALD… UT    03       REPU… House  Challenger          C               
##  3 H2CA3… AAZA… CA    32       DEMO… House  Challenger          N               
##  4 H2MN0… ABAH… MN    07       DEMO… House  Challenger          P               
##  5 H2NY1… ABDE… NY    12       DEMO… House  Challenger          P               
##  6 H2MN0… ABDI… MN    05       REPU… House  Challenger          P               
##  7 H2OH0… ABDU… OH    09       DEMO… House  Challenger          N               
##  8 H0CA5… ABED… CA    50       REPU… House  Challenger          C               
##  9 H2NY0… ABEL… NY    04       DEMO… House  Challenger          N               
## 10 H2HI0… ABKI… HI    01       NONP… House  Challenger          N               
## # ℹ 5,086 more rows
## # ℹ 8 more variables: bioguide_id <chr>, ballotpedia_id <chr>,
## #   govtrack_id <chr>, twitter_id <chr>, twitter_name <chr>,
## #   party_simple <chr>, key <chr>, ballotpedia <chr>

I performed a close read of twitter names mapped to multiple candidates during the fuzzy join, as these rows most likely involve some erronous matches. After manually confirming the accuracy of many of these, I’ll perform some targeted clean up to address the errors.

candidates[!is.na(candidates$twitter_name) &
             duplicated(candidates$twitter_name), ] %>%
  arrange(twitter_name)

## # A tibble: 3,034 × 16
##    fec_id name  state district party office incumbent_challenge candidate_status
##    <chr>  <chr> <chr> <chr>    <chr> <chr>  <chr>               <chr>           
##  1 H2MN0… ABAH… MN    07       DEMO… House  Challenger          P               
##  2 H2NY1… ABDE… NY    12       DEMO… House  Challenger          P               
##  3 H2MN0… ABDI… MN    05       REPU… House  Challenger          P               
##  4 H2OH0… ABDU… OH    09       DEMO… House  Challenger          N               
##  5 H0CA5… ABED… CA    50       REPU… House  Challenger          C               
##  6 H2NY0… ABEL… NY    04       DEMO… House  Challenger          N               
##  7 H2HI0… ABKI… HI    01       NONP… House  Challenger          N               
##  8 H2MN0… ABRA… MN    01       OTHER House  Challenger          N               
##  9 H2NH0… ACCI… NH    01       REPU… House  Challenger          P               
## 10 H2CA1… ACEV… CA    18       DEMO… House  Challenger          P               
## # ℹ 3,024 more rows
## # ℹ 8 more variables: bioguide_id <chr>, ballotpedia_id <chr>,
## #   govtrack_id <chr>, twitter_id <chr>, twitter_name <chr>,
## #   party_simple <chr>, key <chr>, ballotpedia <chr>

candidates <- candidates %>%
  filter(!(name == 'ROGERS, MICHAEL' &
             ballotpedia == 'https://ballotpedia.org/Michael_Joiner'),
         !(name == 'LE, TUAN A' & twitter_name == 'thuy_lowe'),
         !(name == 'LOWE, THUY' & twitter_name == 'tuanleusa'),
         !(name == 'GIMENEZ, CARLOS' & twitter_name == 'carlosgarin'),
         !(name == 'SKRBIC, LJUBO DR MD'),
         !(name == 'REYNOLDS, CONRAD EARL'),
         !(name == 'DOYLE, MICHAEL' & party_simple == 'Republican'),
         !(name == 'MARTINEZ GARIN, CARLOS A MR' &
             (twitter_name == 'repcarlos' | twitter_name == 'carlosgimenezfl'))
  )

candidates[candidates$name == 'HARMAN, MATTHEW S',
           'twitter_name'] <- 'harman4congress'
candidates[candidates$name == 'HARMAN, MATTHEW S',
           'ballotpedia'] <- 'https://ballotpedia.org/Matt_Harman'

candidates[candidates$name == 'DUNN, TOM',
           'twitter_name'] <- NA
candidates[candidates$name == 'DUNN, TOM',
           'ballotpedia'] <- 'https://ballotpedia.org/Tom_Dunn'

candidates[candidates$name == 'BEATTY, JOHN',
           'twitter_name'] <- 'beattyforus'
candidates[candidates$name == 'BEATTY, JOHN',
           'ballotpedia'] <- 'https://ballotpedia.org/John_Beatty_(Virginia)'

candidates[candidates$name == 'YOUNG, TOM',
           'twitter_name'] <- NA
candidates[candidates$name == 'YOUNG, TOM',
           'ballotpedia'] <- NA

candidates <- candidates %>%
  mutate(across(everything(), as.character))

With the final datatset ready to pass on to a Jupyter notebook for scraping, I’ll export the candidates dataframe.

write_csv(candidates, paste0('data/candidates',year,'.csv'))