Exploratory Data Analysis

Get Yer Data

As usual, we start by scraping example voter file data for Alamance County, NC from the North Carolina State Board of Elections website and loading the data into R.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Download the zipped voter file, extract it into the working directory,
# and read the tab-delimited text file into the tibble `voters.NC`.
working.dir <- "C:/Users/McDonald/Dropbox/Election Science/Class Exploratory Data Analysis"
url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter1.zip"
dest.file <- file.path(working.dir, "ncvoter1.zip")

# mode = "wb" requests a binary transfer so the zip archive is not altered
# by text-mode line-ending translation on Windows.
download.file(url, dest.file, mode = "wb")

unzip(dest.file, exdir = working.dir, overwrite = TRUE)

voter.file <- file.path(working.dir, "ncvoter1.txt")

# The first row of the file supplies the column names.
voters.NC <- read_tsv(voter.file, col_names = TRUE)

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   county_id = col_integer(),
##   zip_code = col_integer(),
##   birth_age = col_integer(),
##   nc_senate_abbrv = col_integer(),
##   dist_1_abbrv = col_integer(),
##   birth_year = col_integer()
## )

## See spec(...) for full column specifications.

Histogram by Age

# Number of voter records for each party code.
count(voters.NC, party_cd)

## # A tibble: 6 x 2
##   party_cd     n
##   <chr>    <int>
## 1 CST         30
## 2 DEM      42787
## 3 GRE         15
## 4 LIB        563
## 5 REP      36732
## 6 UNA      33811

# Bar chart: one bar per party code, bar height = number of records.
ggplot(voters.NC, aes(x = party_cd)) +
  geom_bar()

# Histogram of voter age in 5-year bins.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 5)

# The same distribution as a table, using 10-year-wide age bins.
count(voters.NC, cut_width(birth_age, 10))

## # A tibble: 11 x 2
##    `cut_width(birth_age, 10)`     n
##    <fct>                      <int>
##  1 [15,25]                    13270
##  2 (25,35]                    17285
##  3 (35,45]                    15193
##  4 (45,55]                    18765
##  5 (55,65]                    19255
##  6 (65,75]                    15192
##  7 (75,85]                     9152
##  8 (85,95]                     4808
##  9 (95,105]                     983
## 10 (105,115]                     32
## 11 (115,125]                      3

# Subset of records whose party code is "UNA". The subset is named after
# the filter it applies; the previous name `rep` did not match the "UNA"
# filter and also shadowed base::rep.
# NOTE(review): if a "REP" subset was intended instead, change the filter
# value rather than the name.
una_voters <- filter(voters.NC, party_cd == "UNA")

# Age histogram (5-year bins) for the subset.
ggplot(una_voters, aes(x = birth_age)) +
  geom_histogram(binwidth = 5)

# Age frequency polygons for the subset, one coloured line per race code.
ggplot(una_voters, aes(x = birth_age, colour = race_code)) +
  geom_freqpoly(binwidth = 5)

# Same 5-year age histogram, zoomed to counts of 0-20 on the y-axis so that
# sparsely populated bins become visible. coord_cartesian() zooms the view
# without dropping any data from the bins.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 5) +
  coord_cartesian(ylim = c(0, 20))

# List the records with a reported age above 110, keeping only party and
# age, ordered by party code.
unusual <- voters.NC %>%
  select(party_cd, birth_age) %>%
  filter(birth_age > 110) %>%
  arrange(party_cd)
unusual

## # A tibble: 6 x 2
##   party_cd birth_age
##   <chr>        <int>
## 1 DEM            115
## 2 DEM            119
## 3 DEM            119
## 4 DEM            113
## 5 REP            119
## 6 UNA            111

# Age frequency polygons (5-year bins), one coloured line per party code.
ggplot(voters.NC) +
  geom_freqpoly(mapping = aes(x = birth_age, colour = party_cd), binwidth = 5)

# Boxplot of age for each party code.
ggplot(voters.NC) +
  geom_boxplot(mapping = aes(x = party_cd, y = birth_age))

# Same boxplot with the parties ordered by their median age.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot()

# Ordered boxplot again, flipped so the party codes run along the vertical axis.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot() +
  coord_flip()

# Count plot: point size shows how many records share each
# party / race combination.
ggplot(voters.NC, aes(x = party_cd, y = race_code)) +
  geom_count()

# Heat map: tally records for every party / race combination, then fill
# each tile according to that tally (`n`).
ggplot(
  data = count(voters.NC, party_cd, race_code),
  mapping = aes(x = party_cd, y = race_code, fill = n)
) +
  geom_tile()

# Return the last `n` characters of each string in `x`.
# Strings shorter than `n` are returned whole.
substrRight <- function(x, n){
  len <- nchar(x)
  substr(x, start = len - n + 1, stop = len)
}

# Take the last four characters of the registration date as the
# registration year and store it as an integer, so that plots and
# cut_width() treat it as a number instead of a set of text categories.
# Assumes registr_dt ends in a four-digit year (e.g. MM/DD/YYYY) - confirm
# against the file layout; values that are not numeric become NA.
voters.NC$reg_year <- as.integer(substrRight(voters.NC$registr_dt, 4))

# Scatterplot of age against registration year.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_point()

# Same scatterplot with nearly transparent points to reduce overplotting.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_point(alpha = 1/100)

# 2-D binned counts of the same two variables.
ggplot(voters.NC, aes(x = reg_year, y = birth_age)) +
  geom_bin2d()

# Boxplots of age, one box per 10-wide bin of registration year.
ggplot(
  voters.NC,
  aes(x = reg_year, y = birth_age, group = cut_width(reg_year, 10))
) +
  geom_boxplot()

Exploratory Data Analysis

Michael McDonald

September 15, 2019

Get Yer Data

Histogram by Age