As usual, we start by scraping example voter file data for Alamance County, NC from the North Carolina State Board of Elections website and loading the data into R.
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Download the ncvoter1 archive from the NCSBE bucket, unpack it into the
# working directory, and read the tab-separated voter file into `voters.NC`.
working.dir <- "D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis"
url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter1.zip"
dest.file <- file.path(working.dir, "ncvoter1.zip")
# A zip archive is binary data; ask for a binary transfer explicitly instead
# of relying on download.file() inferring it from the URL extension.
download.file(url, dest.file, mode = "wb")
unzip(dest.file, exdir = working.dir, overwrite = TRUE)
voter.file <- file.path(working.dir, "ncvoter1.txt")
# The extra mailing-address lines are empty in the rows readr samples for type
# guessing, so they were guessed as logical and the foreign addresses that do
# appear further down were turned into NA with a parsing warning. Reading them
# as text keeps those values. All other columns are still guessed as before.
voters.NC <- read_tsv(
  voter.file,
  col_names = TRUE,
  col_types = cols(
    mail_addr3 = col_character(),
    mail_addr4 = col_character()
  )
)
## Parsed with column specification:
## cols(
## .default = col_character(),
## county_id = col_double(),
## absent_ind = col_logical(),
## name_prefx_cd = col_logical(),
## zip_code = col_double(),
## mail_addr3 = col_logical(),
## mail_addr4 = col_logical(),
## birth_age = col_double(),
## ward_abbrv = col_logical(),
## ward_desc = col_logical(),
## nc_senate_abbrv = col_double(),
## county_commiss_abbrv = col_logical(),
## county_commiss_desc = col_logical(),
## township_abbrv = col_logical(),
## township_desc = col_logical(),
## school_dist_abbrv = col_logical(),
## school_dist_desc = col_logical(),
## fire_dist_abbrv = col_logical(),
## fire_dist_desc = col_logical(),
## water_dist_abbrv = col_logical(),
## water_dist_desc = col_logical()
## # ... with 10 more columns
## )
## See spec(...) for full column specifications.
## Warning: 15 parsing failures.
## row col expected actual file
## 4015 mail_addr3 1/0/T/F/TRUE/FALSE CH-2900 PORRENTRUY 'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## 4015 mail_addr4 1/0/T/F/TRUE/FALSE SWITZERLAND 'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## 5663 mail_addr4 1/0/T/F/TRUE/FALSE 28213 BREMEN, GERMANY 'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## 22600 mail_addr3 1/0/T/F/TRUE/FALSE UKYO-KU, KYOTO, JAPAN 'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## 22600 mail_addr4 1/0/T/F/TRUE/FALSE 616-8184 'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## ..... .......... .................. ..................... ..................................................................................
## See problems(...) for more details.
# Tabulate the number of registered voters in each party code.
count(voters.NC, party_cd)
## # A tibble: 6 x 2
## party_cd n
## <chr> <int>
## 1 CST 28
## 2 DEM 42706
## 3 GRE 14
## 4 LIB 558
## 5 REP 36602
## 6 UNA 33601
# Bar chart of voter counts by party code.
ggplot(voters.NC, aes(x = party_cd)) +
  geom_bar()

# Histogram of voter age in 10-year bins.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 10)

# The same 10-year age bins as a table of counts.
count(voters.NC, cut_width(birth_age, 10))
## # A tibble: 11 x 2
## `cut_width(birth_age, 10)` n
## <fct> <int>
## 1 [15,25] 13154
## 2 (25,35] 17209
## 3 (35,45] 15157
## 4 (45,55] 18770
## 5 (55,65] 19195
## 6 (65,75] 15127
## 7 (75,85] 9110
## 8 (85,95] 4794
## 9 (95,105] 959
## 10 (105,115] 31
## 11 (115,125] 3
# Keep only voters registered as Republicans.
# NOTE: the variable name `rep` shadows the name of base::rep(); it is kept
# because the plots below refer to it.
rep <- filter(voters.NC, party_cd == "REP")

# Age histogram for Republicans, 5-year bins.
ggplot(rep, aes(x = birth_age)) +
  geom_histogram(binwidth = 5)

# Age frequency polygons for Republicans, one line per race code.
ggplot(rep) +
  geom_freqpoly(aes(x = birth_age, colour = race_code), binwidth = 5)

# Zoom the y-axis of the all-voter age histogram to counts of 0-20 so that
# sparsely populated bins become visible.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 5) +
  coord_cartesian(ylim = c(0, 20))

# List party and age for voters recorded as older than 110, sorted by party.
unusual <- voters.NC %>%
  select(party_cd, birth_age) %>%
  filter(birth_age > 110) %>%
  arrange(party_cd)
unusual
## # A tibble: 6 x 2
## party_cd birth_age
## <chr> <dbl>
## 1 DEM 115
## 2 DEM 119
## 3 DEM 119
## 4 DEM 113
## 5 REP 119
## 6 UNA 111
# Age frequency polygons, one line per party code.
ggplot(voters.NC) +
  geom_freqpoly(aes(x = birth_age, colour = party_cd), binwidth = 5)

# Boxplot of age for each party code.
ggplot(voters.NC) +
  geom_boxplot(aes(x = party_cd, y = birth_age))

# Same boxplot with the parties ordered by their median age.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot()

# Ordered boxplot again, flipped so the boxes run horizontally.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot() +
  coord_flip()

# Point size shows how many voters share each party / race code pair.
ggplot(voters.NC, aes(x = party_cd, y = race_code)) +
  geom_count()

# The same party / race code counts drawn as a heat map.
ggplot(count(voters.NC, party_cd, race_code)) +
  geom_tile(aes(x = party_cd, y = race_code, fill = n))
# Return the last `n` characters of each element of `x`.
#
# x: vector of strings (anything nchar() and substr() accept).
# n: number of trailing characters to keep.
# Returns a character vector of the same length as `x`.
substrRight <- function(x, n) {
  len <- nchar(x)
  substr(x, start = len - n + 1, stop = len)
}
# Registration year: the last four characters of `registr_dt`, converted to
# an integer. Left as text, ggplot treats the year as a discrete category
# (one axis label per distinct year, no gaps for missing years), which
# defeats the two-continuous-variable plots below.
# NOTE(review): assumes registr_dt ends in a four-digit year - confirm.
voters.NC$reg_year <- as.integer(substrRight(voters.NC$registr_dt, 4))

# Raw scatterplot of age against registration year.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_point()

# Same scatterplot with heavy transparency to reduce overplotting.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_point(alpha = 1 / 100)

# 2-D binned counts of registration year by age.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_bin2d()

# Bin registration year into 10-year groups and draw one age boxplot per group.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_boxplot(aes(group = cut_width(reg_year, 10)))