library(dplyr)
library(readr)
library(stringr)
library(tidyr)
library(readxl)
# download area and population info from Census
download.file("http://www2.census.gov/prod2/statcomp/usac/excel/LND01.xls", "LND01.xls")
download.file("http://www2.census.gov/prod2/statcomp/usac/excel/POP01.xls", "POP01.xls")
# according to metadata, this is Land Area in 2010 and resident population in 2010
us_county_area <- read_excel("LND01.xls") %>%
transmute(CountyCode = as.character(as.integer(STCOU)),
Area = LND110210D)
us_county_population <- read_excel("POP01.xls") %>%
transmute(CountyCode = as.character(as.integer(STCOU)),
Population = POP010210D)
election_url <- "https://raw.githubusercontent.com/Prooffreader/election_2016_data/master/data/presidential_general_election_2016_by_county.csv"
county_data <- read_csv(election_url) %>%
group_by(CountyCode = as.character(fips)) %>%
mutate(TotalVotes = sum(votes)) %>%
ungroup() %>%
mutate(name = str_replace(name, ".\\. ", "")) %>%
filter(name %in% c("Trump", "Clinton", "Johnson", "Stein")) %>%
transmute(County = str_replace(geo_name, " County", ""),
State = state,
CountyCode = as.character(fips),
Candidate = name,
Percent = vote_pct / 100,
TotalVotes) %>%
spread(Candidate, Percent, fill = 0) %>%
inner_join(us_county_population, by = "CountyCode") %>%
inner_join(us_county_area, by = "CountyCode")
county_data
## # A tibble: 3,111 × 10
## County State CountyCode TotalVotes Clinton Johnson Stein Trump
## <chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Autauga Alabama 1001 24661 0.240 0.022 0.004 0.734
## 2 Baldwin Alabama 1003 94090 0.196 0.026 0.005 0.774
## 3 Barbour Alabama 1005 10390 0.467 0.009 0.002 0.523
## 4 Bibb Alabama 1007 8748 0.214 0.014 0.002 0.770
## 5 Blount Alabama 1009 25384 0.085 0.013 0.004 0.899
## 6 Bullock Alabama 1011 4701 0.751 0.005 0.002 0.242
## 7 Butler Alabama 1013 8685 0.428 0.007 0.001 0.563
## 8 Calhoun Alabama 1015 47376 0.279 0.024 0.006 0.692
## 9 Chambers Alabama 1017 13778 0.418 0.012 0.003 0.566
## 10 Cherokee Alabama 1019 10503 0.145 0.014 0.002 0.839
## # ... with 3,101 more rows, and 2 more variables: Population <dbl>,
## # Area <dbl>
write_csv(county_data, "county_election_2016.csv")
library(ggplot2)
library(scales)
theme_set(theme_bw())
ggplot(county_data, aes(Population / Area, Trump)) +
geom_point() +
scale_x_log10() +
scale_y_continuous(labels = percent_format()) +
xlab("Population density (ppl / square mile)") +
ylab("% of votes going to Trump") +
geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) +
geom_smooth(method = "lm") +
ggtitle("Population density vs Trump voters by county")
