Get Yer Data

As usual, we start by downloading example voter file data for Alamance County, NC, from the North Carolina State Board of Elections website and loading the data into R.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Download the zipped voter file, unpack it into the working directory, and
# read the tab-delimited text file into a tibble.
working.dir <- "C:/Users/McDonald/Dropbox/Election Science/Class Exploratory Data Analysis"
url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter1.zip"
dest.file <- file.path(working.dir, "ncvoter1.zip")

# mode = "wb" requests a binary transfer explicitly so the archive is never
# subjected to text-mode line-ending translation on Windows.
download.file(url, dest.file, mode = "wb")

unzip(dest.file, exdir = working.dir, overwrite = TRUE)

voter.file <- file.path(working.dir, "ncvoter1.txt")

voters.NC <- read_tsv(voter.file, col_names = TRUE)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   county_id = col_integer(),
##   zip_code = col_integer(),
##   birth_age = col_integer(),
##   nc_senate_abbrv = col_integer(),
##   dist_1_abbrv = col_integer(),
##   birth_year = col_integer()
## )
## See spec(...) for full column specifications.

Histogram by Age

# Tabulate the number of registered voters in each party.
count(voters.NC, party_cd)
## # A tibble: 6 x 2
##   party_cd     n
##   <chr>    <int>
## 1 CST         30
## 2 DEM      42787
## 3 GRE         15
## 4 LIB        563
## 5 REP      36732
## 6 UNA      33811
# Bar chart of voter counts by party code.
ggplot(voters.NC, aes(x = party_cd)) +
  geom_bar()

# Histogram of birth_age using bins 5 units wide.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 5)

# Tabulate voters by birth_age intervals of width 10.
count(voters.NC, cut_width(birth_age, 10))
## # A tibble: 11 x 2
##    `cut_width(birth_age, 10)`     n
##    <fct>                      <int>
##  1 [15,25]                    13270
##  2 (25,35]                    17285
##  3 (35,45]                    15193
##  4 (45,55]                    18765
##  5 (55,65]                    19255
##  6 (65,75]                    15192
##  7 (75,85]                     9152
##  8 (85,95]                     4808
##  9 (95,105]                     983
## 10 (105,115]                     32
## 11 (115,125]                      3
# Subset to voters whose party code is "UNA". The object is named for what it
# holds (not `rep`, which would suggest Republicans and mask base::rep()).
voters.UNA <- voters.NC %>%
  filter(party_cd == "UNA")

# Histogram of birth_age for the "UNA" subset, 5-unit bins.
ggplot(data = voters.UNA) +
  geom_histogram(mapping = aes(x = birth_age), binwidth = 5)

# Same distribution drawn as frequency polygons, one line per race_code.
ggplot(data = voters.UNA, mapping = aes(x = birth_age, colour = race_code)) +
  geom_freqpoly(binwidth = 5)

# Same birth_age histogram, with the y-axis zoomed to 0-20 so that sparsely
# populated bins become visible. coord_cartesian() zooms without dropping data.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 5) +
  coord_cartesian(ylim = c(0, 20))

# Pull out records with birth_age above 110, keeping only party and age,
# ordered by party code, then print them.
unusual <- voters.NC %>%
  select(party_cd, birth_age) %>%
  filter(birth_age > 110) %>%
  arrange(party_cd)
unusual
## # A tibble: 6 x 2
##   party_cd birth_age
##   <chr>        <int>
## 1 DEM            115
## 2 DEM            119
## 3 DEM            119
## 4 DEM            113
## 5 REP            119
## 6 UNA            111
# Frequency polygons of birth_age (5-unit bins), one coloured line per party.
ggplot(voters.NC) +
  geom_freqpoly(aes(x = birth_age, colour = party_cd), binwidth = 5)

# Boxplots of birth_age for each party code.
ggplot(voters.NC) +
  geom_boxplot(aes(x = party_cd, y = birth_age))

# Boxplots of birth_age by party, with parties ordered by their median age.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot()

# Median-ordered boxplots of birth_age by party, flipped so the boxes run
# horizontally.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot() +
  coord_flip()

# Count plot of party_cd against race_code; point size reflects the number of
# voters in each combination.
ggplot(voters.NC, aes(x = party_cd, y = race_code)) +
  geom_count()

# Heatmap of party_cd against race_code: count each combination, then fill
# each tile by that count.
voters.NC %>%
  count(party_cd, race_code) %>%
  ggplot(aes(x = party_cd, y = race_code, fill = n)) +
  geom_tile()

# Return the last `n` characters of each element of `x`.
#
# x: character vector.
# n: number of trailing characters to keep.
# Returns a character vector the same length as `x`; elements shorter than
# `n` characters are returned whole.
substrRight <- function(x, n) {
  len <- nchar(x)
  substr(x, start = len - n + 1, stop = len)
}

# Registration year = last four characters of registr_dt, converted to integer
# so plots treat it as a continuous axis instead of one discrete level per
# year. Assumes registr_dt ends in a four-digit year (e.g. MM/DD/YYYY) --
# confirm against the voter-file layout.
voters.NC$reg_year <- as.integer(substrRight(voters.NC$registr_dt, 4))

# Scatterplot of birth_age against registration year.
ggplot(data = voters.NC) +
  geom_point(mapping = aes(x = reg_year, y = birth_age))

# Same scatterplot with heavy transparency (alpha = 1/100) to reduce
# overplotting.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_point(alpha = 1 / 100)

# 2-D binned counts of reg_year against birth_age.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_bin2d()

# Boxplots of birth_age grouped into reg_year intervals of width 10.
ggplot(voters.NC) +
  geom_boxplot(
    aes(x = reg_year, y = birth_age, group = cut_width(reg_year, 10))
  )