Exploratory Data Analysis

Get Yer Data

As usual, we start by downloading example voter file data for Alamance County, NC, from the North Carolina State Board of Elections website and loading the data into R.

library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Download the zipped voter file, unpack it into the working directory,
# and read the tab-delimited text file into `voters.NC`.
working.dir <- "D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis"
url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter1.zip"
dest.file <- file.path(working.dir, "ncvoter1.zip")

# mode = "wb" requests a binary transfer so the zip archive is not
# altered by text-mode line-ending translation on Windows.
download.file(url, dest.file, mode = "wb")

unzip(dest.file, exdir = working.dir, overwrite = TRUE)

voter.file <- file.path(working.dir, "ncvoter1.txt")

# NOTE(review): column types are guessed by readr. The parsing failures
# it reports show mail_addr3 / mail_addr4 being guessed as logical, so
# foreign mailing-address lines are read as NA. Pass `col_types` (e.g.
# col_character() for those columns) if the mailing address is needed.
voters.NC <- read_tsv(voter.file, col_names = TRUE)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   county_id = col_double(),
##   absent_ind = col_logical(),
##   name_prefx_cd = col_logical(),
##   zip_code = col_double(),
##   mail_addr3 = col_logical(),
##   mail_addr4 = col_logical(),
##   birth_age = col_double(),
##   ward_abbrv = col_logical(),
##   ward_desc = col_logical(),
##   nc_senate_abbrv = col_double(),
##   county_commiss_abbrv = col_logical(),
##   county_commiss_desc = col_logical(),
##   township_abbrv = col_logical(),
##   township_desc = col_logical(),
##   school_dist_abbrv = col_logical(),
##   school_dist_desc = col_logical(),
##   fire_dist_abbrv = col_logical(),
##   fire_dist_desc = col_logical(),
##   water_dist_abbrv = col_logical(),
##   water_dist_desc = col_logical()
##   # ... with 10 more columns
## )

## See spec(...) for full column specifications.

## Warning: 15 parsing failures.
##   row        col           expected                actual                                                                               file
##  4015 mail_addr3 1/0/T/F/TRUE/FALSE CH-2900 PORRENTRUY    'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
##  4015 mail_addr4 1/0/T/F/TRUE/FALSE SWITZERLAND           'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
##  5663 mail_addr4 1/0/T/F/TRUE/FALSE 28213 BREMEN, GERMANY 'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## 22600 mail_addr3 1/0/T/F/TRUE/FALSE UKYO-KU, KYOTO, JAPAN 'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## 22600 mail_addr4 1/0/T/F/TRUE/FALSE 616-8184              'D:/DropBox/Dropbox/Election Science/Class Exploratory Data Analysis/ncvoter1.txt'
## ..... .......... .................. ..................... ..................................................................................
## See problems(...) for more details.

Histogram by Age

# Number of registered voters in each party.
count(voters.NC, party_cd)

## # A tibble: 6 x 2
##   party_cd     n
##   <chr>    <int>
## 1 CST         28
## 2 DEM      42706
## 3 GRE         14
## 4 LIB        558
## 5 REP      36602
## 6 UNA      33601

# Bar chart of voter counts by party registration.
voters.NC %>%
  ggplot(aes(x = party_cd)) +
  geom_bar()

# Age distribution of all voters in 10-year bins.
voters.NC %>%
  ggplot(aes(x = birth_age)) +
  geom_histogram(binwidth = 10)

# Tabulate the same 10-year age bins that the histogram draws.
count(voters.NC, cut_width(birth_age, 10))

## # A tibble: 11 x 2
##    `cut_width(birth_age, 10)`     n
##    <fct>                      <int>
##  1 [15,25]                    13154
##  2 (25,35]                    17209
##  3 (35,45]                    15157
##  4 (45,55]                    18770
##  5 (55,65]                    19195
##  6 (65,75]                    15127
##  7 (75,85]                     9110
##  8 (85,95]                     4794
##  9 (95,105]                     959
## 10 (105,115]                     31
## 11 (115,125]                      3

# Keep only voters registered as Republicans.
# NOTE(review): `rep` masks the name of base::rep(); the name is kept
# because later chunks refer to it.
rep <- filter(voters.NC, party_cd == "REP")

# Age distribution of Republican voters in 5-year bins.
rep %>%
  ggplot(aes(x = birth_age)) +
  geom_histogram(binwidth = 5)

# Overlaid age frequency polygons for Republican voters, one line per
# race code.
ggplot(rep) +
  geom_freqpoly(
    mapping = aes(x = birth_age, colour = race_code),
    binwidth = 5
  )

# Same 5-year histogram, zoomed to counts of 0-20 so that sparsely
# populated age bins become visible. coord_cartesian() zooms without
# dropping any data from the bins.
voters.NC %>%
  ggplot(aes(x = birth_age)) +
  geom_histogram(binwidth = 5) +
  coord_cartesian(ylim = c(0, 20))

# Voters with a recorded age above 110, listed by party.
over_110 <- filter(voters.NC, birth_age > 110)
unusual <- arrange(select(over_110, party_cd, birth_age), party_cd)
rm(over_110)
unusual

## # A tibble: 6 x 2
##   party_cd birth_age
##   <chr>        <dbl>
## 1 DEM            115
## 2 DEM            119
## 3 DEM            119
## 4 DEM            113
## 5 REP            119
## 6 UNA            111

# Age frequency polygons for all voters, one line per party.
ggplot(voters.NC) +
  geom_freqpoly(
    mapping = aes(x = birth_age, colour = party_cd),
    binwidth = 5
  )

# Boxplot of age within each party.
ggplot(voters.NC) +
  geom_boxplot(mapping = aes(x = party_cd, y = birth_age))

# Boxplot of age by party, with parties ordered by their median age.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot()

# Median-ordered boxplot again, flipped so that the parties run down
# the vertical axis.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot() +
  coord_flip()

# Point size shows how many voters fall in each party / race pairing.
voters.NC %>%
  ggplot(aes(x = party_cd, y = race_code)) +
  geom_count()

# Heat map of party by race: count each pairing first, then fill each
# tile by that count.
voters.NC %>%
  count(party_cd, race_code) %>%
  ggplot(aes(x = party_cd, y = race_code, fill = n)) +
  geom_tile()

# Return the last `n` characters of each string in `x`.
#
# x: character vector.
# n: number of trailing characters to keep.
#
# Returns a character vector the same length as `x`. Strings shorter
# than `n` come back whole, because substr() clamps the start position.
substrRight <- function(x, n) {
  len <- nchar(x)
  substr(x, start = len - n + 1, stop = len)
}

# Registration year is the last four characters of the registration date.
# Convert it to integer so the plots below treat the year as a continuous
# axis and cut_width() can bin it, instead of showing one discrete
# category per distinct year string.
voters.NC$reg_year <- as.integer(substrRight(voters.NC$registr_dt, 4))

# Scatterplot of age against registration year.
voters.NC %>%
  ggplot(aes(x = reg_year, y = birth_age)) +
  geom_point()

# Same scatterplot with heavy transparency, so dense regions show up
# darker than isolated points.
voters.NC %>%
  ggplot(aes(x = reg_year, y = birth_age)) +
  geom_point(alpha = 1/100)

# Two-dimensional binned counts of age by registration year.
voters.NC %>%
  ggplot(aes(x = reg_year, y = birth_age)) +
  geom_bin2d()

# Boxplots of age with registration year grouped into bins of width 10.
ggplot(voters.NC) +
  geom_boxplot(
    mapping = aes(
      x = reg_year,
      y = birth_age,
      group = cut_width(reg_year, 10)
    )
  )

Exploratory Data Analysis

Michael McDonald

September 15, 2019

Get Yer Data

Histogram by Age