As usual, we start by scraping example voter file data for Alamance County, NC, from the North Carolina State Board of Elections website, and load the data into R.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Download the voter file archive, unpack it into the working directory, and
# read the tab-delimited text file into `voters.NC`.
working.dir <- "C:/Users/McDonald/Dropbox/Election Science/Class Exploratory Data Analysis"
url <- "https://s3.amazonaws.com/dl.ncsbe.gov/data/ncvoter1.zip"
# file.path() joins path components instead of pasting "/" by hand.
dest.file <- file.path(working.dir, "ncvoter1.zip")
# mode = "wb" requests a binary transfer so the zip archive is written intact
# regardless of platform defaults.
download.file(url, dest.file, mode = "wb")
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
unzip(dest.file, exdir = working.dir, overwrite = TRUE)
voter.file <- file.path(working.dir, "ncvoter1.txt")
# The first row of the file supplies the column names.
voters.NC <- read_tsv(voter.file, col_names = TRUE)
## Parsed with column specification:
## cols(
## .default = col_character(),
## county_id = col_integer(),
## zip_code = col_integer(),
## birth_age = col_integer(),
## nc_senate_abbrv = col_integer(),
## dist_1_abbrv = col_integer(),
## birth_year = col_integer()
## )
## See spec(...) for full column specifications.
# Tabulate the number of records for each value of party_cd.
count(voters.NC, party_cd)
## # A tibble: 6 x 2
## party_cd n
## <chr> <int>
## 1 CST 30
## 2 DEM 42787
## 3 GRE 15
## 4 LIB 563
## 5 REP 36732
## 6 UNA 33811
# Bar chart: number of records per party_cd value.
ggplot(voters.NC, aes(x = party_cd)) +
  geom_bar()

# Histogram of birth_age using 5-unit-wide bins.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 5)
# Same distribution as a table: bin birth_age into width-10 intervals and
# count the records falling in each interval.
count(voters.NC, cut_width(birth_age, 10))
## # A tibble: 11 x 2
## `cut_width(birth_age, 10)` n
## <fct> <int>
## 1 [15,25] 13270
## 2 (25,35] 17285
## 3 (35,45] 15193
## 4 (45,55] 18765
## 5 (55,65] 19255
## 6 (65,75] 15192
## 7 (75,85] 9152
## 8 (85,95] 4808
## 9 (95,105] 983
## 10 (105,115] 32
## 11 (115,125] 3
# Subset to registered Republicans. The object is named `rep`, so the filter
# must select party code "REP"; the previous filter selected "UNA", which
# made every plot below describe unaffiliated voters instead.
# NOTE(review): confirm "REP" (not "UNA") is the intended subgroup.
rep <- voters.NC %>%
  filter(party_cd == "REP")

# Age distribution of the subset, 5-unit-wide bins.
ggplot(data = rep) +
  geom_histogram(mapping = aes(x = birth_age), binwidth = 5)

# Same distribution drawn as one frequency polygon per race_code value.
ggplot(data = rep, mapping = aes(x = birth_age, colour = race_code)) +
  geom_freqpoly(binwidth = 5)
# Histogram of birth_age with the y-axis view limited to 0-20 so that
# sparsely populated bins become visible. coord_cartesian() only zooms the
# view; no rows are removed before binning.
ggplot(voters.NC, aes(x = birth_age)) +
  geom_histogram(binwidth = 5) +
  coord_cartesian(ylim = c(0, 20))
# Records with birth_age above 110, reduced to party and age and sorted by
# party code, then printed.
unusual <- voters.NC %>%
  select(party_cd, birth_age) %>%
  filter(birth_age > 110) %>%
  arrange(party_cd)
unusual
## # A tibble: 6 x 2
## party_cd birth_age
## <chr> <int>
## 1 DEM 115
## 2 DEM 119
## 3 DEM 119
## 4 DEM 113
## 5 REP 119
## 6 UNA 111
# Frequency polygons of birth_age (5-unit bins), one coloured line per party.
ggplot(voters.NC, aes(x = birth_age, colour = party_cd)) +
  geom_freqpoly(binwidth = 5)

# Boxplot of birth_age for each party, in default party order.
ggplot(voters.NC) +
  geom_boxplot(aes(x = party_cd, y = birth_age))

# Same boxplot with parties ordered by their median birth_age.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot()

# Ordered boxplot again, with the axes flipped so the boxes run horizontally.
ggplot(
  voters.NC,
  aes(x = reorder(party_cd, birth_age, FUN = median), y = birth_age)
) +
  geom_boxplot() +
  coord_flip()
# Point per party_cd / race_code combination, sized by the number of records.
ggplot(voters.NC, aes(x = party_cd, y = race_code)) +
  geom_count()

# Same cross-tabulation as a heat map: count each combination first, then
# fill each tile by its count `n`.
count(voters.NC, party_cd, race_code) %>%
  ggplot(aes(x = party_cd, y = race_code, fill = n)) +
  geom_tile()
# Return the trailing `n` characters of each string in `x`.
#
# x: character vector.
# n: number of characters to keep, counted from the end of each string.
substrRight <- function(x, n) {
  len <- nchar(x)
  first <- len - n + 1
  substr(x, start = first, stop = len)
}
# Derive the registration year from the last four characters of registr_dt.
# Assumes registr_dt ends in a 4-digit year (e.g. MM/DD/YYYY) - TODO confirm.
# The value is converted to integer: left as character, reg_year would be
# treated as a discrete axis in the plots below and cut_width() would fail,
# because it needs numeric input to form width-10 intervals.
voters.NC$reg_year <- as.integer(substrRight(voters.NC$registr_dt, 4))

# Scatterplot of age against registration year.
ggplot(data = voters.NC) +
  geom_point(mapping = aes(x = reg_year, y = birth_age))

# Same scatterplot with nearly transparent points to reveal overplotting.
ggplot(data = voters.NC) +
  geom_point(mapping = aes(x = reg_year, y = birth_age), alpha = 1 / 100)

# 2-D binned counts of the same two variables.
ggplot(data = voters.NC) +
  geom_bin2d(mapping = aes(x = reg_year, y = birth_age))

# Boxplots of age within 10-year-wide registration-year intervals.
ggplot(data = voters.NC, mapping = aes(x = reg_year, y = birth_age)) +
  geom_boxplot(mapping = aes(group = cut_width(reg_year, 10)))