Scrape the Alexander County, NC voter file from the North Carolina State Board of Elections website and load the file into R. (Alexander is county #2.)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Download the Alexander County voter file archive, extract it into the
# working directory, and read the tab-delimited text file into a tibble.
working.dir <- "D:/DropBox/Dropbox/Election Science/Class Workshop 1"
url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter2.zip"
dest.file <- file.path(working.dir, "ncvoter2.zip")
# The archive is binary, so request a binary transfer explicitly.
download.file(url, dest.file, mode = "wb")
unzip(dest.file, exdir = working.dir, overwrite = TRUE)
voter.file <- file.path(working.dir, "ncvoter2.txt")
# mail_addr4 is empty in the rows readr inspects for type guessing, so it is
# guessed as logical and a later text value (an APO address) is dropped as a
# parsing failure. Reading it as character keeps that value; every other
# column is still guessed.
voters.NC <- read_tsv(
  voter.file,
  col_names = TRUE,
  col_types = cols(mail_addr4 = col_character())
)
## Parsed with column specification:
## cols(
## .default = col_character(),
## county_id = col_double(),
## absent_ind = col_logical(),
## name_prefx_cd = col_logical(),
## zip_code = col_double(),
## mail_addr4 = col_logical(),
## mail_zipcode = col_double(),
## full_phone_number = col_double(),
## birth_age = col_double(),
## municipality_abbrv = col_logical(),
## ward_abbrv = col_logical(),
## ward_desc = col_logical(),
## nc_senate_abbrv = col_double(),
## county_commiss_abbrv = col_logical(),
## county_commiss_desc = col_logical(),
## township_abbrv = col_logical(),
## township_desc = col_logical(),
## fire_dist_abbrv = col_logical(),
## fire_dist_desc = col_logical(),
## water_dist_abbrv = col_logical(),
## water_dist_desc = col_logical()
## # ... with 11 more columns
## )
## See spec(...) for full column specifications.
## Warning: 1 parsing failure.
## row col expected actual file
## 9698 mail_addr4 1/0/T/F/TRUE/FALSE APO AE 09368 'D:/DropBox/Dropbox/Election Science/Class Workshop 1/ncvoter2.txt'
Create a frequency distribution of voter_status_desc.
This is one way
# Two-step frequency table: group the voter rows by status, then count the
# rows that fall in each group.
voters_by_status.NC <- group_by(.data = voters.NC, voter_status_desc)
summarise(
  .data = voters_by_status.NC,
  voters = n()
)
## # A tibble: 5 x 2
## voter_status_desc voters
## <chr> <int>
## 1 ACTIVE 20962
## 2 DENIED 205
## 3 INACTIVE 2278
## 4 REMOVED 2903
## 5 TEMPORARY 2
This is another way using count.
# count() groups by the given column and tallies the rows in a single call;
# the tally column is named n.
count(
  x = voters.NC,
  voter_status_desc
)
## # A tibble: 5 x 2
## voter_status_desc n
## <chr> <int>
## 1 ACTIVE 20962
## 2 DENIED 205
## 3 INACTIVE 2278
## 4 REMOVED 2903
## 5 TEMPORARY 2
Compute the min, max, and median birth_age within each voter_status_desc.
Write this code without and with using pipes.
Without pipes
# Nested-call version: store the grouped data first, then summarise
# birth_age within each voter status.
voters_by_status.NC <- group_by(.data = voters.NC, voter_status_desc)
summarise(
  .data = voters_by_status.NC,
  voters_min = min(birth_age),
  voters_max = max(birth_age),
  voters_median = median(birth_age)
)
## # A tibble: 5 x 4
## voter_status_desc voters_min voters_max voters_median
## <chr> <dbl> <dbl> <dbl>
## 1 ACTIVE 18 119 54
## 2 DENIED 18 97 40
## 3 INACTIVE 19 119 43
## 4 REMOVED 19 119 74
## 5 TEMPORARY 20 39 29.5
With pipes
# Piped version: the same grouping and summary expressed as one chain, with
# the summary table stored and then printed.
voters_by_status.NC <- voters.NC %>%
  group_by(voter_status_desc) %>%
  summarise(
    voters_min = min(birth_age),
    voters_max = max(birth_age),
    voters_median = median(birth_age)
  )
voters_by_status.NC
## # A tibble: 5 x 4
## voter_status_desc voters_min voters_max voters_median
## <chr> <dbl> <dbl> <dbl>
## 1 ACTIVE 18 119 54
## 2 DENIED 18 97 40
## 3 INACTIVE 19 119 43
## 4 REMOVED 19 119 74
## 5 TEMPORARY 20 39 29.5
Are there really people 119 years old? What is going on here?
(Answer: a birth date of January 1, 1900 is a missing data code, which shows up as an age of 119.)
Recalculate the min, max, and median, excluding persons age 119 using filter().
# Keep only the rows whose birth_age is not the value 119, then repeat the
# per-status age summary on the remaining rows.
voters_valid_age.NC <- filter(.data = voters.NC, birth_age != 119)
voters_valid_age_by_status.NC <- group_by(
  .data = voters_valid_age.NC,
  voter_status_desc
)
summarise(
  .data = voters_valid_age_by_status.NC,
  voters_min = min(birth_age),
  voters_max = max(birth_age),
  voters_median = median(birth_age)
)
## # A tibble: 5 x 4
## voter_status_desc voters_min voters_max voters_median
## <chr> <dbl> <dbl> <dbl>
## 1 ACTIVE 18 106 54
## 2 DENIED 18 97 40
## 3 INACTIVE 19 102 43
## 4 REMOVED 19 108 74
## 5 TEMPORARY 20 39 29.5
Using filter() is not the best way to handle this situation since you might want to keep the records with birth_age of 119 for other analyses.
Set the birth_age for persons age 119 equal to NA and calculate the min, max, and median.
# Recode age 119 to NA in a new column, so every row and the original
# birth_age values are kept, then summarise while skipping the NA values.
voters_valid_age.NC <- mutate(
  voters.NC,
  birth_age_valid = na_if(birth_age, 119)
)
voters_valid_age_by_status.NC <- group_by(voters_valid_age.NC, voter_status_desc)
summarize(
  voters_valid_age_by_status.NC,
  voters_min = min(birth_age_valid, na.rm = TRUE),
  voters_max = max(birth_age_valid, na.rm = TRUE),
  voters_median = median(birth_age_valid, na.rm = TRUE)
)
## # A tibble: 5 x 4
## voter_status_desc voters_min voters_max voters_median
## <chr> <dbl> <dbl> <dbl>
## 1 ACTIVE 18 106 54
## 2 DENIED 18 97 40
## 3 INACTIVE 19 102 43
## 4 REMOVED 19 108 74
## 5 TEMPORARY 20 39 29.5