Healy Chapter 7 Part 1

Harold Nelson

10/29/2018

library(tidyverse)

## ── Attaching packages ── tidyverse 1.2.1 ──

## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ── Conflicts ───── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(socviz)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggmap)

## 
## Attaching package: 'ggmap'

## The following object is masked from 'package:plotly':
## 
##     wind

library(maps)

## 
## Attaching package: 'maps'

## The following object is masked from 'package:purrr':
## 
##     map

Look at the election dataframe from socviz.

glimpse(election)

## Observations: 51
## Variables: 22
## $ state        <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Cali...
## $ st           <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "...
## $ fips         <dbl> 1, 2, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16, 17, ...
## $ total_vote   <dbl> 2123372, 318608, 2604657, 1130635, 14237884, 2780...
## $ vote_margin  <dbl> 588708, 46933, 91234, 304378, 4269978, 136386, 22...
## $ winner       <chr> "Trump", "Trump", "Trump", "Trump", "Clinton", "C...
## $ party        <chr> "Republican", "Republican", "Republican", "Republ...
## $ pct_margin   <dbl> 27.73, 14.73, 3.50, 26.92, 29.99, 4.91, 13.64, 11...
## $ r_points     <dbl> 27.72, 14.73, 3.50, 26.92, -29.99, -4.91, -13.64,...
## $ d_points     <dbl> -27.72, -14.73, -3.50, -26.92, 29.99, 4.91, 13.64...
## $ pct_clinton  <dbl> 34.36, 36.55, 44.58, 33.65, 61.48, 48.16, 54.57, ...
## $ pct_trump    <dbl> 62.08, 51.28, 48.08, 60.57, 31.49, 43.25, 40.93, ...
## $ pct_johnson  <dbl> 2.09, 5.88, 4.08, 2.64, 3.36, 5.18, 2.96, 3.33, 1...
## $ pct_other    <dbl> 1.46, 6.29, 3.25, 3.13, 3.66, 3.41, 1.55, 1.88, 3...
## $ clinton_vote <dbl> 729547, 116454, 1161167, 380494, 8753788, 1338870...
## $ trump_vote   <dbl> 1318255, 163387, 1252401, 684872, 4483810, 120248...
## $ johnson_vote <dbl> 44467, 18725, 106327, 29829, 478500, 144121, 4867...
## $ other_vote   <dbl> 31103, 20042, 84762, 35440, 521786, 94772, 25457,...
## $ ev_dem       <dbl> 9, 3, 11, 6, 0, 0, 0, 0, 0, 29, 16, 0, 4, 0, 11, ...
## $ ev_rep       <dbl> 0, 0, 0, 0, 55, 9, 7, 3, 3, 0, 0, 3, 0, 20, 0, 0,...
## $ ev_oth       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0...
## $ census       <chr> "South", "West", "West", "South", "West", "West",...

Exercise

Look at Healy’s graphic using facet_wrap and separate dotplots for each region. Fiddle with fig.height and fig.width to get something you’re pleased with.

Answer

party_colors <- c("#2E74C0", "#CB454A") 

p0 <- ggplot(data = subset(election, st %nin% "DC"),
             mapping = aes(x = r_points,
                           y = reorder(state, r_points),
                           color = party))

p1 <- p0 + geom_vline(xintercept = 0, color = "gray30") +
    geom_point(size = 2)

p2 <- p1 + scale_color_manual(values = party_colors)

p3 <- p2 + scale_x_continuous(breaks = c(-30, -20, -10, 0, 10, 20, 30, 40),
                              labels = c("30\n (Clinton)", "20", "10", "0",
                                         "10", "20", "30", "40\n(Trump)"))

p3 + facet_wrap(~ census, ncol=1, scales="free_y") +
    guides(color=FALSE) + labs(x = "Point Margin", y = "") +
    theme(axis.text=element_text(size=8))

Exercise

The first ggplot object won’t show anything since it has no geom. What does p1 add?

Answer

p1

Exercise

What does p2 add?

Answer

p2

Exercise

What about p3?

Answer

p3

Exercise

What happens if we remove the free y scaling?

Answer

p3 + facet_wrap(~ census, ncol=1) +
    guides(color=FALSE) + labs(x = "Point Margin", y = "") +
    theme(axis.text=element_text(size=8))

Get the data for US state maps and look at the part for WA.

us_states <- map_data("state")
head(us_states)

##        long      lat group order  region subregion
## 1 -87.46201 30.38968     1     1 alabama      <NA>
## 2 -87.48493 30.37249     1     2 alabama      <NA>
## 3 -87.52503 30.37249     1     3 alabama      <NA>
## 4 -87.53076 30.33239     1     4 alabama      <NA>
## 5 -87.57087 30.32665     1     5 alabama      <NA>
## 6 -87.58806 30.32665     1     6 alabama      <NA>

WA_map = filter(us_states,region == "washington")
DT::datatable(WA_map)

table(WA_map$subregion)

## 
##    lopez island            main    orcas island san juan island 
##              17             448              19              17 
##  whidbey island 
##              44

Draw the US map

Note that the map is created by using the points to draw polygons in the designated order.

p <- ggplot(data = us_states,
            mapping = aes(x = long, y = lat,
                          group = group))

p + geom_polygon(fill = "white", color = "black")

Draw the map of Washington

p <- ggplot(data = WA_map,
            mapping = aes(x = long, y = lat,
                          group = group))

p + geom_polygon(fill = "white", color = "black")

Use the Albers projection to redraw the Washington map.

Map fill to subregion instead of region. Note the tilt and apparent spherical surface.

p <- ggplot(data = WA_map,
            mapping = aes(x = long, y = lat,
                          group = group, fill = subregion))

p + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) +
    guides(fill = FALSE)

Merge the election data with the map data.

Look at the merged dataframe with DT::datatable. Note the redundancy in the election data.

election$region <- tolower(election$state)
us_states_elec <- left_join(us_states, election)

## Joining, by = "region"

DT::datatable(us_states_elec)

## Warning in instance$preRenderHook(instance): It seems your data is too
## big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Create a Choropleth

Using the joined file of map and election data we can map fill to any of the election variables. First we want Healy’s map theme, which is in the appendix. Try the graphic without theme_map().

theme_map <- function(base_size=9, base_family="") {
    require(grid)
    theme_bw(base_size=base_size, base_family=base_family) %+replace%
        theme(axis.line=element_blank(),
              axis.text=element_blank(),
              axis.ticks=element_blank(),
              axis.title=element_blank(),
              panel.background=element_blank(),
              panel.border=element_blank(),
              panel.grid=element_blank(),
              panel.spacing=unit(0, "lines"),
              plot.background=element_blank(),
              legend.justification = c(0,0),
              legend.position = c(0,0)
              )
}

p0 <- ggplot(data = us_states_elec,
             mapping = aes(x = long, y = lat,
                           group = group, fill = party))
p1 <- p0 + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 
p2 <- p1 + scale_fill_manual(values = party_colors) +
    labs(title = "Election Results 2016", fill = NULL)
p2 + theme_map()

## Loading required package: grid

How would this look without theme_map?

p2

Healy shows several different versions of the election map with different options for color. The last one is probably the best.

p0 <- ggplot(data = subset(us_states_elec,
                           region %nin% "district of columbia"),
             aes(x = long, y = lat, group = group, fill = d_points))

p1 <- p0 + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 

p2 <- p1 + scale_fill_gradient2(low = "red",
                                mid = scales::muted("purple"),
                                high = "blue") +
    labs(title = "Winning margins") 
p2 + theme_map() + labs(fill = "Percent")

Note the impact of leaving DC in.

p0 <- ggplot(data = us_states_elec,
             aes(x = long, y = lat, group = group, fill = d_points))

p1 <- p0 + geom_polygon(color = "gray90", size = 0.1) +
    coord_map(projection = "albers", lat0 = 39, lat1 = 45) 

p2 <- p1 + scale_fill_gradient2(low = "red",
                                mid = scales::muted("purple"),
                                high = "blue") +
    labs(title = "Winning margins") 
p2 + theme_map() + labs(fill = "Percent")

Use glimpse to look at county_map and county_data, which are in the socviz package.

glimpse(county_map)

## Observations: 191,382
## Variables: 7
## $ long  <dbl> 1225889, 1235324, 1244873, 1244129, 1272010, 1276797, 12...
## $ lat   <dbl> -1275020, -1274008, -1272331, -1267515, -1262889, -12955...
## $ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1...
## $ hole  <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
## $ piece <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ group <fct> 0500000US01001.1, 0500000US01001.1, 0500000US01001.1, 05...
## $ id    <chr> "01001", "01001", "01001", "01001", "01001", "01001", "0...

Note that there are almost 200,000 entries and 7 variables.

glimpse(county_data)

## Observations: 3,195
## Variables: 32
## $ id               <chr> "0", "01000", "01001", "01003", "01005", "010...
## $ name             <chr> NA, "1", "Autauga County", "Baldwin County", ...
## $ state            <fct> NA, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, A...
## $ census_region    <fct> NA, South, South, South, South, South, South,...
## $ pop_dens         <fct> [   50,  100), [   50,  100), [   50,  100), ...
## $ pop_dens4        <fct> [ 45,  118), [ 45,  118), [ 45,  118), [118,7...
## $ pop_dens6        <fct> [ 82,  215), [ 82,  215), [ 82,  215), [ 82, ...
## $ pct_black        <fct> [10.0,15.0), [25.0,50.0), [15.0,25.0), [ 5.0,...
## $ pop              <int> 318857056, 4849377, 55395, 200111, 26887, 225...
## $ female           <dbl> 50.8, 51.5, 51.5, 51.2, 46.5, 46.0, 50.6, 45....
## $ white            <dbl> 77.7, 69.8, 78.1, 87.3, 50.2, 76.3, 96.0, 27....
## $ black            <dbl> 13.2, 26.6, 18.4, 9.5, 47.6, 22.1, 1.8, 69.9,...
## $ travel_time      <dbl> 25.5, 24.2, 26.2, 25.9, 24.6, 27.6, 33.9, 26....
## $ land_area        <dbl> 3531905.43, 50645.33, 594.44, 1589.78, 884.88...
## $ hh_income        <int> 53046, 43253, 53682, 50221, 32911, 36447, 441...
## $ su_gun4          <fct> NA, NA, [11,54], [11,54], [ 5, 8), [11,54], [...
## $ su_gun6          <fct> NA, NA, [10,12), [10,12), [ 7, 8), [10,12), [...
## $ fips             <dbl> 0, 1000, 1001, 1003, 1005, 1007, 1009, 1011, ...
## $ votes_dem_2016   <int> NA, NA, 5908, 18409, 4848, 1874, 2150, 3530, ...
## $ votes_gop_2016   <int> NA, NA, 18110, 72780, 5431, 6733, 22808, 1139...
## $ total_votes_2016 <int> NA, NA, 24661, 94090, 10390, 8748, 25384, 470...
## $ per_dem_2016     <dbl> NA, NA, 0.23956855, 0.19565310, 0.46660250, 0...
## $ per_gop_2016     <dbl> NA, NA, 0.7343579, 0.7735147, 0.5227141, 0.76...
## $ diff_2016        <int> NA, NA, 12202, 54371, 583, 4859, 20658, 2391,...
## $ per_dem_2012     <dbl> NA, NA, 0.2657577, 0.2156657, 0.5125229, 0.26...
## $ per_gop_2012     <dbl> NA, NA, 0.7263374, 0.7738975, 0.4833755, 0.73...
## $ diff_2012        <int> NA, NA, 11012, 47443, 334, 3931, 17780, 2808,...
## $ winner           <chr> NA, NA, "Trump", "Trump", "Trump", "Trump", "...
## $ partywinner16    <chr> NA, NA, "Republican", "Republican", "Republic...
## $ winner12         <chr> NA, NA, "Romney", "Romney", "Obama", "Romney"...
## $ partywinner12    <chr> NA, NA, "Republican", "Republican", "Democrat...
## $ flipped          <chr> NA, NA, "No", "No", "Yes", "No", "No", "No", ...

Now join these two dataframes, keeping just the subset of variables from county_data selected by Healy. Use glimpse to examine the result.

county_full <- left_join(county_map, county_data, by = "id")
glimpse(county_full)

## Observations: 191,382
## Variables: 38
## $ long             <dbl> 1225889, 1235324, 1244873, 1244129, 1272010, ...
## $ lat              <dbl> -1275020, -1274008, -1272331, -1267515, -1262...
## $ order            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
## $ hole             <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ piece            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ group            <fct> 0500000US01001.1, 0500000US01001.1, 0500000US...
## $ id               <chr> "01001", "01001", "01001", "01001", "01001", ...
## $ name             <chr> "Autauga County", "Autauga County", "Autauga ...
## $ state            <fct> AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, A...
## $ census_region    <fct> South, South, South, South, South, South, Sou...
## $ pop_dens         <fct> [   50,  100), [   50,  100), [   50,  100), ...
## $ pop_dens4        <fct> [ 45,  118), [ 45,  118), [ 45,  118), [ 45, ...
## $ pop_dens6        <fct> [ 82,  215), [ 82,  215), [ 82,  215), [ 82, ...
## $ pct_black        <fct> [15.0,25.0), [15.0,25.0), [15.0,25.0), [15.0,...
## $ pop              <int> 55395, 55395, 55395, 55395, 55395, 55395, 553...
## $ female           <dbl> 51.5, 51.5, 51.5, 51.5, 51.5, 51.5, 51.5, 51....
## $ white            <dbl> 78.1, 78.1, 78.1, 78.1, 78.1, 78.1, 78.1, 78....
## $ black            <dbl> 18.4, 18.4, 18.4, 18.4, 18.4, 18.4, 18.4, 18....
## $ travel_time      <dbl> 26.2, 26.2, 26.2, 26.2, 26.2, 26.2, 26.2, 26....
## $ land_area        <dbl> 594.44, 594.44, 594.44, 594.44, 594.44, 594.4...
## $ hh_income        <int> 53682, 53682, 53682, 53682, 53682, 53682, 536...
## $ su_gun4          <fct> [11,54], [11,54], [11,54], [11,54], [11,54], ...
## $ su_gun6          <fct> [10,12), [10,12), [10,12), [10,12), [10,12), ...
## $ fips             <dbl> 1001, 1001, 1001, 1001, 1001, 1001, 1001, 100...
## $ votes_dem_2016   <int> 5908, 5908, 5908, 5908, 5908, 5908, 5908, 590...
## $ votes_gop_2016   <int> 18110, 18110, 18110, 18110, 18110, 18110, 181...
## $ total_votes_2016 <int> 24661, 24661, 24661, 24661, 24661, 24661, 246...
## $ per_dem_2016     <dbl> 0.2395685, 0.2395685, 0.2395685, 0.2395685, 0...
## $ per_gop_2016     <dbl> 0.7343579, 0.7343579, 0.7343579, 0.7343579, 0...
## $ diff_2016        <int> 12202, 12202, 12202, 12202, 12202, 12202, 122...
## $ per_dem_2012     <dbl> 0.2657577, 0.2657577, 0.2657577, 0.2657577, 0...
## $ per_gop_2012     <dbl> 0.7263374, 0.7263374, 0.7263374, 0.7263374, 0...
## $ diff_2012        <int> 11012, 11012, 11012, 11012, 11012, 11012, 110...
## $ winner           <chr> "Trump", "Trump", "Trump", "Trump", "Trump", ...
## $ partywinner16    <chr> "Republican", "Republican", "Republican", "Re...
## $ winner12         <chr> "Romney", "Romney", "Romney", "Romney", "Romn...
## $ partywinner12    <chr> "Republican", "Republican", "Republican", "Re...
## $ flipped          <chr> "No", "No", "No", "No", "No", "No", "No", "No...

Note that we have all of the records from county_map and each one of these records has been augmented by 31 variables from the county_data dataframe. Every point used to draw the map of a county carries all of the data for the entire county!

Let’s create the choropleth example from Healy.

p <- ggplot(data = county_full,
            mapping = aes(x = long, y = lat,
                          fill = pop_dens, 
                          group = group))

p1 <- p + geom_polygon(color = "gray90", size = 0.05) + coord_equal()

p2 <- p1 + scale_fill_brewer(palette="Blues",
                             labels = c("0-10", "10-50", "50-100", "100-500",
                                        "500-1,000", "1,000-5,000", ">5,000"))

p2 + labs(fill = "Population per\nsquare mile") +
    theme_map() +
    guides(fill = guide_legend(nrow = 1)) + 
    theme(legend.position = "bottom")

Create WA_full and redo the map with it.

WA_full = filter(county_full,state == "WA")
glimpse(WA_full)

## Observations: 3,599
## Variables: 38
## $ long             <dbl> -1372940, -1437163, -1458316, -1453530, -1438...
## $ lat              <dbl> 350122.3, 365353.5, 370534.7, 389522.1, 38574...
## $ order            <int> 176820, 176821, 176822, 176823, 176824, 17682...
## $ hole             <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
## $ piece            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ group            <fct> 0500000US53001.1, 0500000US53001.1, 0500000US...
## $ id               <chr> "53001", "53001", "53001", "53001", "53001", ...
## $ name             <chr> "Adams County", "Adams County", "Adams County...
## $ state            <fct> WA, WA, WA, WA, WA, WA, WA, WA, WA, WA, WA, W...
## $ census_region    <fct> West, West, West, West, West, West, West, Wes...
## $ pop_dens         <fct> [    0,   10), [    0,   10), [    0,   10), ...
## $ pop_dens4        <fct> [  0,   17), [  0,   17), [  0,   17), [  0, ...
## $ pop_dens6        <fct> [  9,   25), [  9,   25), [  9,   25), [  9, ...
## $ pct_black        <fct> [ 0.0, 2.0), [ 0.0, 2.0), [ 0.0, 2.0), [ 0.0,...
## $ pop              <int> 19179, 19179, 19179, 19179, 19179, 19179, 191...
## $ female           <dbl> 49.2, 49.2, 49.2, 49.2, 49.2, 49.2, 49.2, 49....
## $ white            <dbl> 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 9...
## $ black            <dbl> 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, ...
## $ travel_time      <dbl> 17.3, 17.3, 17.3, 17.3, 17.3, 17.3, 17.3, 17....
## $ land_area        <dbl> 1924.98, 1924.98, 1924.98, 1924.98, 1924.98, ...
## $ hh_income        <int> 43926, 43926, 43926, 43926, 43926, 43926, 439...
## $ su_gun4          <fct> [ 5, 8), [ 5, 8), [ 5, 8), [ 5, 8), [ 5, 8), ...
## $ su_gun6          <fct> [ 4, 7), [ 4, 7), [ 4, 7), [ 4, 7), [ 4, 7), ...
## $ fips             <dbl> 53001, 53001, 53001, 53001, 53001, 53001, 530...
## $ votes_dem_2016   <int> 1118, 1118, 1118, 1118, 1118, 1118, 1118, 111...
## $ votes_gop_2016   <int> 2740, 2740, 2740, 2740, 2740, 2740, 2740, 274...
## $ total_votes_2016 <int> 4081, 4081, 4081, 4081, 4081, 4081, 4081, 408...
## $ per_dem_2016     <dbl> 0.2739525, 0.2739525, 0.2739525, 0.2739525, 0...
## $ per_gop_2016     <dbl> 0.6714041, 0.6714041, 0.6714041, 0.6714041, 0...
## $ diff_2016        <int> 1622, 1622, 1622, 1622, 1622, 1622, 1622, 162...
## $ per_dem_2012     <dbl> 0.3220735, 0.3220735, 0.3220735, 0.3220735, 0...
## $ per_gop_2012     <dbl> 0.6609305, 0.6609305, 0.6609305, 0.6609305, 0...
## $ diff_2012        <int> 1595, 1595, 1595, 1595, 1595, 1595, 1595, 159...
## $ winner           <chr> "Trump", "Trump", "Trump", "Trump", "Trump", ...
## $ partywinner16    <chr> "Republican", "Republican", "Republican", "Re...
## $ winner12         <chr> "Romney", "Romney", "Romney", "Romney", "Romn...
## $ partywinner12    <chr> "Republican", "Republican", "Republican", "Re...
## $ flipped          <chr> "No", "No", "No", "No", "No", "No", "No", "No...

WA_full = filter(county_full,state == "WA")
p <- ggplot(data = WA_full,
            mapping = aes(x = long, y = lat,
                          fill = pop_dens, 
                          group = group))

p1 <- p + geom_polygon(color = "gray90", size = 0.05) + coord_equal()

p2 <- p1 + scale_fill_brewer(palette="Blues",
                             labels = c("0-10", "10-50", "50-100", "100-500",
                                        "500-1,000", "1,000-5,000", ">5,000"))

p2 + labs(fill = "Population per\nsquare mile") +
    theme_map() +
    guides(fill = guide_legend(nrow = 1)) + 
    theme(legend.position = "bottom")

Exercise

Do a choropleth for some other variable and Washington’s counties. Do this for another state you’re familiar with.