library(tidyverse)
library(httr)
library(jsonlite)
library(fuzzyjoin)
The goal of this notebook is to create a list of candidates that we’ll use for further analysis. I’ll use data from the Federal Election Committee (FEC) to provide a baseline list of candidates registered for elections for the House of Representatives in the 2022 midterms. The FEC data has most of the key features we need: name, party, state and district. However, I’ll eventually want to scrape tweets for these candidates, so I need Twitter handles. I’ll therefore grab a few additional datasets to map in Twitter handles wherever possible.
The FEC provides an API (https://api.open.fec.gov/developers/) to query data.
year = 2022
fec_key <- read_lines('data/fec_api.txt')
query_params <- list(election_year = year,
office = 'H',
api_key = fec_key)
url <- modify_url('https://api.open.fec.gov/',
path = '/v1/candidates/search/',
query = query_params)
initial_response <- GET(url)
parsed_response <- fromJSON(content(initial_response, "text", encoding="UTF-8"),
simplifyVector = FALSE)
total_pages <- parsed_response$pagination$pages
The response is a heavily nested JSON, so I need to loop through each page and extract the actual data.
response <- list()
for (i in 1:total_pages) {
query_params <- list(page = i,
election_year = year,
office = 'H',
api_key = fec_key)
url <- modify_url('https://api.open.fec.gov/',
path = '/v1/candidates/search/',
query = query_params)
initial_response <- GET(url)
parsed_response <- fromJSON(content(initial_response, 'text', encoding='UTF-8'),
simplifyVector = FALSE)
response[[i]] <- parsed_response
Sys.sleep(0.5)
}
I can now loop through the data from each page and add the required features to a dataframe.
candidates <- map(response, function(x) x$results) %>%
unlist(recursive = F) %>%
tibble(
fec_id = map_chr(., 'candidate_id', .default = NA),
name = map_chr(., 'name', .default = NA),
state = map_chr(., 'state', .default = NA),
district = map_chr(., 'district', .default = NA),
party = map_chr(., 'party_full', .default = NA),
office = map_chr(., 'office_full', .default = NA),
incumbent_challenge = map_chr(., 'incumbent_challenge_full', .default = NA),
candidate_status = map_chr(., 'candidate_status', .default = NA),
) %>%
select(-.)
candidates %>% head()
## # A tibble: 6 × 8
## fec_id name state district party office incumbent_challenge candidate_status
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 H2CO07… AADL… CO 07 REPU… House Open seat P
## 2 H2UT03… AALD… UT 03 REPU… House Challenger C
## 3 H2CA30… AAZA… CA 32 DEMO… House Challenger N
## 4 H2MN07… ABAH… MN 07 DEMO… House Challenger P
## 5 H2NY12… ABDE… NY 12 DEMO… House Challenger P
## 6 H2MN05… ABDI… MN 05 REPU… House Challenger P
To map in Twitter handles / IDs, I’ll use three datasets, the third of which I scraped myself in notebook 1.
The @unitedstates project (members of
Congress only): https://github.com/unitedstates/congress-legislators
Politwoops (members of Congress only): https://www.propublica.org/datastore/dataset/politicians-tracked-by-politwoops
Ballotpedia (Member and candidates): https://ballotpedia.org/List_of_congressional_candidates_in_the_2022_elections
congress_current <- read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
congress_history <- read_csv('https://theunitedstates.io/congress-legislators/legislators-historical.csv')
congress <- rbind(congress_current, congress_history) %>%
mutate(fec_ids = str_split(fec_ids, ",")) %>%
unnest_longer(fec_ids, keep_empty = TRUE)
poliwoops <- read_csv('https://s3.amazonaws.com/pp-projects-static/politwoops/active_accounts.csv')
ballotpedia <- read_csv('data/ballotpedia_scrape2022.csv')
ballotpedia <- ballotpedia[!duplicated(ballotpedia),] %>%
mutate(district = if_else(is.na(district), '00', sprintf("%02d", district)),
state = str_replace_all(state, '_', ' '),
twitter_name = str_extract_all(twitter, '(?<=twitter.com/)\\w+(?=\')'),
ballotpedia_id = str_extract(ballotpedia, '(?<=ballotpedia.org/).*') %>%
str_replace_all('_', ' '))
I’ll perform a simple join using several unique keys: the FEC ID, the Bioguide ID, the Ballotpedia ID, and the GovTrack ID.
candidates <- candidates %>%
## Join with IDs from congress data
left_join(
select(congress, bioguide_id, fec_ids, ballotpedia_id, govtrack_id, twitter_id),
by = c('fec_id' = 'fec_ids'), keep = FALSE) %>%
## Join with IDs from poliwoops data based on bioguide ID
left_join(
select(poliwoops, bioguide_id, twitter_id) %>%
filter(!is.na(bioguide_id)),
by = 'bioguide_id', suffix = c('','_poliw_bio'), keep = FALSE) %>%
## Join with IDs from poliwoops data based on fec ID
left_join(
select(poliwoops, fec_candidate_id, twitter_id) %>%
filter(!is.na(fec_candidate_id)),
by = c('fec_id' = 'fec_candidate_id'), suffix = c('','_poliw_fec'), keep = FALSE) %>%
## Join with IDs from ballotpedia
left_join(
select(ballotpedia, twitter_name, ballotpedia_id)) %>%
rename(twitter_name_match = twitter_name)
## Warning in left_join(., select(poliwoops, fec_candidate_id, twitter_id) %>% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 18 of `x` matches multiple rows in `y`.
## ℹ Row 599 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Joining with `by = join_by(ballotpedia_id)`
candidates <- candidates %>%
mutate(across(setdiff(everything(), one_of('twitter_name_match')), as.character))
candidates %>% head()
## # A tibble: 6 × 15
## fec_id name state district party office incumbent_challenge candidate_status
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 H2CO07… AADL… CO 07 REPU… House Open seat P
## 2 H2UT03… AALD… UT 03 REPU… House Challenger C
## 3 H2CA30… AAZA… CA 32 DEMO… House Challenger N
## 4 H2MN07… ABAH… MN 07 DEMO… House Challenger P
## 5 H2NY12… ABDE… NY 12 DEMO… House Challenger P
## 6 H2MN05… ABDI… MN 05 REPU… House Challenger P
## # ℹ 7 more variables: bioguide_id <chr>, ballotpedia_id <chr>,
## # govtrack_id <chr>, twitter_id <chr>, twitter_id_poliw_bio <chr>,
## # twitter_id_poliw_fec <chr>, twitter_name_match <list>
Unfortuantely, the Ballotpedia IDs mapped in from @unitedstates resulted in very few matches. So, to supplement this, I’ll use fuzzy matching based on names. I’ll set the threshold quite high, which could lead to some erroneous matches. However, I’ll also use party, state and district fields as “helpers” to narrow down potential matches and support accurate mappings.
https://www.rdocumentation.org/packages/stringdist/versions/0.9.10/topics/stringdist-metrics
candidates <- candidates %>%
mutate(name = str_replace(name, '(?<=WALLACE)\\.', ','),
party_simple = case_when(party == 'REPUBLICAN PARTY' ~ 'Republican',
party == 'DEMOCRATIC PARTY' ~ 'Democrat',
TRUE ~ 'Other'),
key = case_when(!str_detect(name,',') ~ name,
TRUE ~ str_c(str_extract(name, '(?<=, ).*'), ' ',
str_extract(name, '.*(?=,)'))) %>%
str_remove_all(str_c(
'(MS\\.|MRS\\.|MR\\.|DR\\.|JR\\.|JR|SR\\.|III|II|',
' [A-Z]\\.(?= )| [A-Z](?= )|É|[A-Z]+`|[0-9])')) %>%
str_remove_all('[[:punct:]]') %>%
str_replace_all(' ', ' ') %>%
trimws() %>%
tolower(),
key = str_c(str_extract(key, '^[A-Za-z]+'), ' ',
str_extract(key, ' [A-Za-z]+$')))
ballotpedia <- ballotpedia %>%
mutate(state = state.abb[match(state,state.name)],
party_simple = case_when(party == 'Republican Party' ~ 'Republican',
party == 'Democratic Party' ~ 'Democrat',
TRUE ~ 'Other'),
key = str_remove_all(name, str_c('(III|II|Jr\\.|Sr\\.|',
' [A-Z]\\.(?= )| [A-Z](?= )|é)')) %>%
str_remove_all('[[:punct:]]') %>%
str_replace_all(' ', ' ') %>%
trimws() %>%
tolower(),
key = str_c(str_extract(key, '^[A-Za-z]+'), ' ',
str_extract(key, ' [A-Za-z]+$')))
candidates[is.na(candidates)] <- ''
ballotpedia[is.na(ballotpedia)] <- ''
## Fuzzy join to grab twitter handles from ballotpedia
fuzzy <- stringdist_join(
select(candidates, key, state, party_simple,
district, fec_id),
select(ballotpedia, key, state, party_simple,
district, twitter_name, ballotpedia),
by = 'key',
mode = 'inner',
method = 'lv',
max_dist = 5,
ignore_case = TRUE,
distance_col = 'distance') %>%
filter(state.x == state.y,
party_simple.x == party_simple.y,
district.x == district.y)
candidates <- candidates %>%
left_join(select(fuzzy, fec_id, twitter_name, ballotpedia),
by = 'fec_id', keep = FALSE) %>%
rename(twitter_name_fuzzy = twitter_name)
## Warning in left_join(., select(fuzzy, fec_id, twitter_name, ballotpedia), : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 18 of `x` matches multiple rows in `y`.
## ℹ Row 9 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
All of these joins created quite a bit of duplication, and we have Twitter IDs / handles spread across several columns in different formats, with potential duplication. So, I’ll pivot individuals values into seperate columns, then unite them into a single comma-separated string, then finally pivot the names / IDs longer.
## Convert IDs/Names to list, then unnest to long format
candidates <- candidates %>%
unnest_wider(twitter_name_match, names_sep = '_') %>%
unnest_wider(twitter_name_fuzzy, names_sep = '_') %>%
unite(twitter_name, contains('twitter_name'),
sep = ',', na.rm = TRUE) %>%
unite(twitter_id, contains('twitter_id'),
sep = ',', na.rm = TRUE) %>%
mutate(twitter_name = str_split(twitter_name, ","),
twitter_id = str_split(twitter_id, ",")) %>%
unnest_longer(twitter_name, keep_empty = TRUE) %>%
unnest_longer(twitter_id, keep_empty = TRUE) %>%
mutate(twitter_name = tolower(twitter_name))
## Final removal of any dupes
candidates <- candidates[!duplicated(candidates), ]
candidates %>%
filter(twitter_name != '' | twitter_id != '') %>%
select(name) %>% unique() %>% nrow()
## [1] 1658
candidates
## # A tibble: 5,096 × 16
## fec_id name state district party office incumbent_challenge candidate_status
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 H2CO0… AADL… CO 07 REPU… House Open seat P
## 2 H2UT0… AALD… UT 03 REPU… House Challenger C
## 3 H2CA3… AAZA… CA 32 DEMO… House Challenger N
## 4 H2MN0… ABAH… MN 07 DEMO… House Challenger P
## 5 H2NY1… ABDE… NY 12 DEMO… House Challenger P
## 6 H2MN0… ABDI… MN 05 REPU… House Challenger P
## 7 H2OH0… ABDU… OH 09 DEMO… House Challenger N
## 8 H0CA5… ABED… CA 50 REPU… House Challenger C
## 9 H2NY0… ABEL… NY 04 DEMO… House Challenger N
## 10 H2HI0… ABKI… HI 01 NONP… House Challenger N
## # ℹ 5,086 more rows
## # ℹ 8 more variables: bioguide_id <chr>, ballotpedia_id <chr>,
## # govtrack_id <chr>, twitter_id <chr>, twitter_name <chr>,
## # party_simple <chr>, key <chr>, ballotpedia <chr>
I performed a close read of twitter names mapped to multiple candidates during the fuzzy join, as these rows most likely involve some erronous matches. After manually confirming the accuracy of many of these, I’ll perform some targeted clean up to address the errors.
candidates[!is.na(candidates$twitter_name) &
duplicated(candidates$twitter_name), ] %>%
arrange(twitter_name)
## # A tibble: 3,034 × 16
## fec_id name state district party office incumbent_challenge candidate_status
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 H2MN0… ABAH… MN 07 DEMO… House Challenger P
## 2 H2NY1… ABDE… NY 12 DEMO… House Challenger P
## 3 H2MN0… ABDI… MN 05 REPU… House Challenger P
## 4 H2OH0… ABDU… OH 09 DEMO… House Challenger N
## 5 H0CA5… ABED… CA 50 REPU… House Challenger C
## 6 H2NY0… ABEL… NY 04 DEMO… House Challenger N
## 7 H2HI0… ABKI… HI 01 NONP… House Challenger N
## 8 H2MN0… ABRA… MN 01 OTHER House Challenger N
## 9 H2NH0… ACCI… NH 01 REPU… House Challenger P
## 10 H2CA1… ACEV… CA 18 DEMO… House Challenger P
## # ℹ 3,024 more rows
## # ℹ 8 more variables: bioguide_id <chr>, ballotpedia_id <chr>,
## # govtrack_id <chr>, twitter_id <chr>, twitter_name <chr>,
## # party_simple <chr>, key <chr>, ballotpedia <chr>
candidates <- candidates %>%
filter(!(name == 'ROGERS, MICHAEL' &
ballotpedia == 'https://ballotpedia.org/Michael_Joiner'),
!(name == 'LE, TUAN A' & twitter_name == 'thuy_lowe'),
!(name == 'LOWE, THUY' & twitter_name == 'tuanleusa'),
!(name == 'GIMENEZ, CARLOS' & twitter_name == 'carlosgarin'),
!(name == 'SKRBIC, LJUBO DR MD'),
!(name == 'REYNOLDS, CONRAD EARL'),
!(name == 'DOYLE, MICHAEL' & party_simple == 'Republican'),
!(name == 'MARTINEZ GARIN, CARLOS A MR' &
(twitter_name == 'repcarlos' | twitter_name == 'carlosgimenezfl'))
)
candidates[candidates$name == 'HARMAN, MATTHEW S',
'twitter_name'] <- 'harman4congress'
candidates[candidates$name == 'HARMAN, MATTHEW S',
'ballotpedia'] <- 'https://ballotpedia.org/Matt_Harman'
candidates[candidates$name == 'DUNN, TOM',
'twitter_name'] <- NA
candidates[candidates$name == 'DUNN, TOM',
'ballotpedia'] <- 'https://ballotpedia.org/Tom_Dunn'
candidates[candidates$name == 'BEATTY, JOHN',
'twitter_name'] <- 'beattyforus'
candidates[candidates$name == 'BEATTY, JOHN',
'ballotpedia'] <- 'https://ballotpedia.org/John_Beatty_(Virginia)'
candidates[candidates$name == 'YOUNG, TOM',
'twitter_name'] <- NA
candidates[candidates$name == 'YOUNG, TOM',
'ballotpedia'] <- NA
candidates <- candidates %>%
mutate(across(everything(), as.character))
With the final datatset ready to pass on to a Jupyter notebook for
scraping, I’ll export the candidates dataframe.
write_csv(candidates, paste0('data/candidates',year,'.csv'))