Workshop

Get Yer Data

Scrape the Alexander County, NC voter file from the North Carolina State Board of Elections website and load the file into R. (Alexander is county #2.)

library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Download the county #2 (Alexander County) voter file archive, extract it,
# and read the extracted text file into a tibble named voters.NC.
working.dir <- "D:/DropBox/Dropbox/Election Science/Class Workshop 1"
url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter2.zip"
# file.path() joins the components with "/", producing the same path string
# that the manual paste() did.
dest.file <- file.path(working.dir, "ncvoter2.zip")

# Request a binary transfer explicitly: the target is a zip archive and must
# be written byte-for-byte.
download.file(url, dest.file, mode = "wb")

# Extract into the working directory, replacing any previously extracted copy.
unzip(dest.file, exdir = working.dir, overwrite = TRUE)

voter.file <- file.path(working.dir, "ncvoter2.txt")

# The extract is tab-delimited and its first row holds the column names.
# Column types are guessed by readr; mail_addr4 is guessed as logical, so a
# row that carries address text in that column is reported as a parsing
# failure.
# NOTE(review): consider col_types = cols(mail_addr4 = col_character()) if the
# fourth mailing-address line is ever needed.
voters.NC <- read_tsv(voter.file, col_names = TRUE)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   county_id = col_double(),
##   absent_ind = col_logical(),
##   name_prefx_cd = col_logical(),
##   zip_code = col_double(),
##   mail_addr4 = col_logical(),
##   mail_zipcode = col_double(),
##   full_phone_number = col_double(),
##   birth_age = col_double(),
##   municipality_abbrv = col_logical(),
##   ward_abbrv = col_logical(),
##   ward_desc = col_logical(),
##   nc_senate_abbrv = col_double(),
##   county_commiss_abbrv = col_logical(),
##   county_commiss_desc = col_logical(),
##   township_abbrv = col_logical(),
##   township_desc = col_logical(),
##   fire_dist_abbrv = col_logical(),
##   fire_dist_desc = col_logical(),
##   water_dist_abbrv = col_logical(),
##   water_dist_desc = col_logical()
##   # ... with 11 more columns
## )
## See spec(...) for full column specifications.
## Warning: 1 parsing failure.
##  row        col           expected       actual                                                                file
## 9698 mail_addr4 1/0/T/F/TRUE/FALSE APO AE 09368 'D:/DropBox/Dropbox/Election Science/Class Workshop 1/ncvoter2.txt'

A Frequency Distribution

Create a frequency distribution of voter_status_desc.

This is one way

# Two-step frequency table: attach the grouping first, then tally each group.
voters_by_status.NC <- group_by(.data = voters.NC, voter_status_desc)
summarise(
  .data = voters_by_status.NC,
  voters = n() # number of rows in the current voter_status_desc group
)
## # A tibble: 5 x 2
##   voter_status_desc voters
##   <chr>              <int>
## 1 ACTIVE             20962
## 2 DENIED               205
## 3 INACTIVE            2278
## 4 REMOVED             2903
## 5 TEMPORARY              2

This is another way using count.

# count() does the grouping and the tally in a single call; the tally column
# is named `n`.
count(x = voters.NC, voter_status_desc)
## # A tibble: 5 x 2
##   voter_status_desc     n
##   <chr>             <int>
## 1 ACTIVE            20962
## 2 DENIED              205
## 3 INACTIVE           2278
## 4 REMOVED            2903
## 5 TEMPORARY             2

Other summary functions in R

Compute the min, max, and median birth_age within each voter_status_desc.

Write this code twice: once without pipes and once with pipes.

Without pipes

# No pipes: keep the grouped table in a variable, then pass it to summarise()
# to get the youngest, oldest, and median age within each status.
voters_by_status.NC <- group_by(.data = voters.NC, voter_status_desc)
summarise(
  .data = voters_by_status.NC,
  voters_min = min(birth_age),
  voters_max = max(birth_age),
  voters_median = median(birth_age)
)
## # A tibble: 5 x 4
##   voter_status_desc voters_min voters_max voters_median
##   <chr>                  <dbl>      <dbl>         <dbl>
## 1 ACTIVE                    18        119          54  
## 2 DENIED                    18         97          40  
## 3 INACTIVE                  19        119          43  
## 4 REMOVED                   19        119          74  
## 5 TEMPORARY                 20         39          29.5

With pipes

# Same summary as a single pipeline: each step feeds its result to the next,
# so no intermediate grouped object is needed.
voters_by_status.NC <-
  voters.NC %>%
  group_by(voter_status_desc) %>%
  summarise(
    voters_min = min(birth_age),
    voters_max = max(birth_age),
    voters_median = median(birth_age)
  )
# Evaluate the name on its own line to display the summary table.
voters_by_status.NC
## # A tibble: 5 x 4
##   voter_status_desc voters_min voters_max voters_median
##   <chr>                  <dbl>      <dbl>         <dbl>
## 1 ACTIVE                    18        119          54  
## 2 DENIED                    18         97          40  
## 3 INACTIVE                  19        119          43  
## 4 REMOVED                   19        119          74  
## 5 TEMPORARY                 20         39          29.5

Select cases

Are there really people 119 years old? What is going on here?

(Answer: a birth date of January 1, 1900 is a missing data code, which is why these voters appear to be 119 years old.)

Recalculate the min, max, and median, excluding persons age 119 using filter().

# Keep only the records whose birth_age is not 119, then repeat the
# per-status age summary on the remaining rows.
voters_valid_age.NC <- filter(.data = voters.NC, birth_age != 119)
voters_valid_age_by_status.NC <- group_by(
  .data = voters_valid_age.NC,
  voter_status_desc
)
summarise(
  .data = voters_valid_age_by_status.NC,
  voters_min = min(birth_age),
  voters_max = max(birth_age),
  voters_median = median(birth_age)
)
## # A tibble: 5 x 4
##   voter_status_desc voters_min voters_max voters_median
##   <chr>                  <dbl>      <dbl>         <dbl>
## 1 ACTIVE                    18        106          54  
## 2 DENIED                    18         97          40  
## 3 INACTIVE                  19        102          43  
## 4 REMOVED                   19        108          74  
## 5 TEMPORARY                 20         39          29.5

Using filter() is not the best way to handle this situation since you might want to keep the records with birth_age of 119 for other analyses.

Bonus: Setting missing values equal to NA

Set the birth_age for persons age 119 equal to NA and calculate the min, max, and median.

# Keep every record, but add birth_age_valid: a copy of birth_age in which
# the value 119 is replaced by NA (na_if() turns matching values into NA).
voters_valid_age.NC <- mutate(
  voters.NC,
  birth_age_valid = na_if(birth_age, 119)
)
voters_valid_age_by_status.NC <- group_by(voters_valid_age.NC, voter_status_desc)
# na.rm = TRUE makes each summary skip the NA ages; without it any group
# containing an NA would return NA.
summarize(
  voters_valid_age_by_status.NC,
  voters_min = min(birth_age_valid, na.rm = TRUE),
  voters_max = max(birth_age_valid, na.rm = TRUE),
  voters_median = median(birth_age_valid, na.rm = TRUE)
)
## # A tibble: 5 x 4
##   voter_status_desc voters_min voters_max voters_median
##   <chr>                  <dbl>      <dbl>         <dbl>
## 1 ACTIVE                    18        106          54  
## 2 DENIED                    18         97          40  
## 3 INACTIVE                  19        102          43  
## 4 REMOVED                   19        108          74  
## 5 TEMPORARY                 20         39          29.5