607-wk6-project2-USSenate

Data Cleaning - Steps:

Remove “disclaimer” notes at the bottom
Remove Row 1 and make years the column headers
get “outcome” as its own variable
get “Incumbent/ Challenger” as its own variable
get “party” as its own variable
get “n” and “amount” as separate values for each observation

# Use slice to remove row 1 and rows 48-end
rawdata <- rawdata %>% slice(2:47)

# Make years the column headers, name col 1, and drop years row and unnamed columns
# Remove blank rows and trim whitespace
rawdata[[1, 1]] <- "Col1"
names(rawdata) <- rawdata[1,]

## Warning: The `value` argument of `names<-` can't be empty as of tibble 3.0.0.
## Columns 3, 6, 8, 10, and 13 must be named.

## Warning: The `value` argument of `names<-` must be a character vector as of
## tibble 3.0.0.

rawdata <- rawdata %>% select(!c(3,6,8,10,13))
rawdata <- rawdata %>% slice(3:46) %>% filter(`Col1` != "" | `2018` != "")
rawdata$Col1<- rawdata$Col1 %>% str_trim(side = c("both"))

Before we can comfortably pivot this data, we need to ensure that all of the multiple variables currently housed in column 1 are given their own column, and that redundant data (such as total columns) is removed…

# set factor levels for outcomes and incumbent/ challenger
outcome_levels <- c("Incumbent w/ >= 60%", "Incumbent w/ < 60%", "Challenger")
inc_chal_levels <- c("Incumbents", "Challengers")

# fill missing values
rawdata <- rawdata %>% fill(Col1, .direction = c("down"))

# add an "outcome" column
rawdata <- rawdata %>% mutate(
  Outcome = c(1:39))

# populate the "outcome" column with the appropriate factor level
rawdata$Outcome <- rawdata$Outcome %>% lapply(function(i) {
  if (i %in% 1:13){
      rawdata$Outcome[[i]] = outcome_levels[[1]]
    } else if (i %in% 14:26){
      rawdata$Outcome[[i]] = outcome_levels[[2]]
    } else if (i %in% 27:39){
      rawdata$Outcome[[i]] = outcome_levels[[3]]
    }}) %>% 
  factor(levels = outcome_levels)

# remove rows housing only a value for "outcome"
rawdata <- rawdata %>% filter(!is.na(`2018`))

# add a column for "incumbent/ challenger" and populate
rawdata <- rawdata %>% mutate(
  `Inc/Chal` = ifelse(
    rawdata$`Col1` == "Incumbents", "Incumbents", ifelse(
    rawdata$`Col1` == "Challengers", "Challengers", NA)))

# change "incumbent/ challenger" to type "factor" and set the levels
rawdata$`Inc/Chal` <- rawdata$`Inc/Chal` %>% factor(levels = inc_chal_levels)

# fill the "incumbent/ challenger" column
rawdata <- rawdata %>% fill(`Inc/Chal`, .direction = c("down"))

# retain only those rows for which col1 value is "democrats" or "republicans" and rename the column to "Party"
rawdata <- rawdata %>% filter(`Col1` == "Democrats" | `Col1` == "Republicans")
rawdata <- rename(rawdata, "Party" = `Col1`)

Since we have two sets of values which we’ll ultimately want in our long tibble (“amount spent” and “n” or “number of candidates”), we’ll accomplish the pivot and our last cleaning task by separating the two types of values out into separate tables. We’ll then join them back together once both are clean and in long format.

# get the even numbered rows where the messy "n" values are stored
split_data_n <- rawdata %>% filter((row(rawdata) %% 2) == 0)

# remove all information from "n" except their integer value
split_data_n[2:24] <- split_data_n[2:24] %>% lapply(function(i){
  str_match(i, "(?<==)[:digit:]{1,2}") %>% .[,1] %>% as.integer()
  })

# get the values for "years"
years <- names(split_data_n[2:24])

# pivot the "n" dataframe to a long format
split_data_n <- split_data_n %>% pivot_longer(cols=years, names_to="Year", values_to="N")

## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(years)` instead of `years` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

# get the odd numbered rows where the messy dollar values are stored
split_data_dol <- rawdata %>% filter((row(rawdata) %% 2) == 1)

# remove all commas from the dollar values
split_data_dol[2:24] <- split_data_dol[2:24] %>% lapply(function(i){
  str_replace_all(i, ",", "") %>% as.integer()
  })

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

## Warning in str_replace_all(i, ",", "") %>% as.integer(): NAs introduced by
## coercion

# pivot the "dollars" dataframe to a long format 
split_data_dol <- split_data_dol %>% pivot_longer(cols=years, names_to="Year", values_to = "Total Spent")

# join the split, long-format data tables back together 
clean_data <- inner_join(split_data_n, split_data_dol)

## Joining, by = c("Party", "Outcome", "Inc/Chal", "Year")

607-wk6-project2-USSenate

Tyler Frankenberg

3/13/2021

Introduction

Data Cleaning - Steps: