Exercises with County Data

Harold Nelson

9/27/2021

Setup

Make the packages tidyverse and socviz available. This assumes you have already installed them on your computer.

Answer

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(socviz)

Examine File

Do a glimpse() of the file county_data from the socviz package. Also look in the package documentation.

Answer

glimpse(county_data)
## Rows: 3,195
## Columns: 32
## $ id               <chr> "0", "01000", "01001", "01003", "01005", "01007", "01…
## $ name             <chr> NA, "1", "Autauga County", "Baldwin County", "Barbour…
## $ state            <fct> NA, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, A…
## $ census_region    <fct> NA, South, South, South, South, South, South, South, …
## $ pop_dens         <fct> "[   50,  100)", "[   50,  100)", "[   50,  100)", "[…
## $ pop_dens4        <fct> "[ 45,  118)", "[ 45,  118)", "[ 45,  118)", "[118,71…
## $ pop_dens6        <fct> "[ 82,  215)", "[ 82,  215)", "[ 82,  215)", "[ 82,  …
## $ pct_black        <fct> "[10.0,15.0)", "[25.0,50.0)", "[15.0,25.0)", "[ 5.0,1…
## $ pop              <int> 318857056, 4849377, 55395, 200111, 26887, 22506, 5771…
## $ female           <dbl> 50.8, 51.5, 51.5, 51.2, 46.5, 46.0, 50.6, 45.2, 53.4,…
## $ white            <dbl> 77.7, 69.8, 78.1, 87.3, 50.2, 76.3, 96.0, 27.2, 54.3,…
## $ black            <dbl> 13.2, 26.6, 18.4, 9.5, 47.6, 22.1, 1.8, 69.9, 43.6, 2…
## $ travel_time      <dbl> 25.5, 24.2, 26.2, 25.9, 24.6, 27.6, 33.9, 26.9, 24.0,…
## $ land_area        <dbl> 3531905.43, 50645.33, 594.44, 1589.78, 884.88, 622.58…
## $ hh_income        <int> 53046, 43253, 53682, 50221, 32911, 36447, 44145, 3203…
## $ su_gun4          <fct> NA, NA, "[11,54]", "[11,54]", "[ 5, 8)", "[11,54]", "…
## $ su_gun6          <fct> NA, NA, "[10,12)", "[10,12)", "[ 7, 8)", "[10,12)", "…
## $ fips             <dbl> 0, 1000, 1001, 1003, 1005, 1007, 1009, 1011, 1013, 10…
## $ votes_dem_2016   <int> NA, NA, 5908, 18409, 4848, 1874, 2150, 3530, 3716, 13…
## $ votes_gop_2016   <int> NA, NA, 18110, 72780, 5431, 6733, 22808, 1139, 4891, …
## $ total_votes_2016 <int> NA, NA, 24661, 94090, 10390, 8748, 25384, 4701, 8685,…
## $ per_dem_2016     <dbl> NA, NA, 0.23956855, 0.19565310, 0.46660250, 0.2142203…
## $ per_gop_2016     <dbl> NA, NA, 0.7343579, 0.7735147, 0.5227141, 0.7696616, 0…
## $ diff_2016        <int> NA, NA, 12202, 54371, 583, 4859, 20658, 2391, 1175, 1…
## $ per_dem_2012     <dbl> NA, NA, 0.2657577, 0.2156657, 0.5125229, 0.2621857, 0…
## $ per_gop_2012     <dbl> NA, NA, 0.7263374, 0.7738975, 0.4833755, 0.7306638, 0…
## $ diff_2012        <int> NA, NA, 11012, 47443, 334, 3931, 17780, 2808, 714, 14…
## $ winner           <chr> NA, NA, "Trump", "Trump", "Trump", "Trump", "Trump", …
## $ partywinner16    <chr> NA, NA, "Republican", "Republican", "Republican", "Re…
## $ winner12         <chr> NA, NA, "Romney", "Romney", "Obama", "Romney", "Romne…
## $ partywinner12    <chr> NA, NA, "Republican", "Republican", "Democrat", "Repu…
## $ flipped          <chr> NA, NA, "No", "No", "Yes", "No", "No", "No", "No", "N…

Read about the FIPS codes at https://www.census.gov/library/reference/code-lists/ansi/ansi-codes-for-states.html

Get Counties

Using the FIPS codes, filter the dataframe to counties, which contains only the counties of the 50 states.

Answer

counties = county_data %>% 
  mutate(numeric_fips = as.numeric(id)) %>% 
  filter(numeric_fips %% 1000 != 0 &
         numeric_fips %/% 1000 != 11) %>% 
  mutate(state = factor(state))
table(counties$state)
## 
##  AK  AL  AR  AZ  CA  CO  CT  DE  FL  GA  HI  IA  ID  IL  IN  KS  KY  LA  MA  MD 
##  29  67  75  15  58  64   8   3  67 159   5  99  44 102  92 105 120  64  14  24 
##  ME  MI  MN  MO  MS  MT  NC  ND  NE  NH  NJ  NM  NV  NY  OH  OK  OR  PA  RI  SC 
##  16  83  87 115  82  56 100  53  93  10  21  33  17  62  88  77  36  67   5  46 
##  SD  TN  TX  UT  VA  VT  WA  WI  WV  WY 
##  66  95 254  29 134  14  39  72  55  23

Exercise

Examine the variable pop in counties. Use summary().

Answer

summary(counties$pop)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0    11025    25702   101272    67690 10116705

Exercise

Counties with zero population are strange. What are they.

Answer

counties %>% 
  select(state, name, pop) %>% 
  filter(pop == 0) 
##   state         name pop
## 1    VA Bedford city   0

Exercise

Restrict counties to have non-zero populations.

Answer

counties = counties %>% 
  filter(pop > 0)
summary(counties)
##       id                name               state        census_region 
##  Length:3141        Length:3141        TX     : 254   Midwest  :1055  
##  Class :character   Class :character   GA     : 159   Northeast: 217  
##  Mode  :character   Mode  :character   VA     : 133   South    :1421  
##                                        KY     : 120   West     : 448  
##                                        MO     : 115                   
##                                        KS     : 105                   
##                                        (Other):2255                   
##           pop_dens          pop_dens4         pop_dens6         pct_black   
##  [    0,   10): 575   [  0,   17):793   [  0,    9):529   [ 0.0, 2.0):1470  
##  [   10,   50):1117   [ 17,   45):791   [  9,   25):527   [ 2.0, 5.0): 512  
##  [   50,  100): 568   [ 45,  118):782   [ 25,   45):528   [ 5.0,10.0): 349  
##  [  100,  500): 624   [118,71672]:775   [ 45,   82):523   [10.0,15.0): 181  
##  [  500, 1000): 109                     [ 82,  215):517   [15.0,25.0): 221  
##  [ 1000, 5000): 130                     [215,71672]:517   [25.0,50.0): 312  
##  [ 5000,71672]:  18                                       [50.0,85.3]:  96  
##       pop               female          white           black       
##  Min.   :      86   Min.   :29.90   Min.   : 4.90   Min.   : 0.000  
##  1st Qu.:   11027   1st Qu.:49.50   1st Qu.:80.70   1st Qu.: 0.700  
##  Median :   25713   Median :50.40   Median :92.20   Median : 2.300  
##  Mean   :  101305   Mean   :49.93   Mean   :85.28   Mean   : 9.164  
##  3rd Qu.:   67694   3rd Qu.:51.00   3rd Qu.:96.10   3rd Qu.:10.700  
##  Max.   :10116705   Max.   :57.00   Max.   :99.10   Max.   :85.300  
##                                                                     
##   travel_time      land_area          hh_income         su_gun4   
##  Min.   : 4.40   Min.   :     2.0   Min.   : 19986   [ 0, 5):784  
##  1st Qu.:19.20   1st Qu.:   431.1   1st Qu.: 38187   [ 5, 8):787  
##  Median :22.80   Median :   616.4   Median : 44157   [ 8,11):786  
##  Mean   :22.97   Mean   :  1124.4   Mean   : 45934   [11,54]:784  
##  3rd Qu.:26.40   3rd Qu.:   924.0   3rd Qu.: 51164                
##  Max.   :44.20   Max.   :145504.8   Max.   :122238                
##                                                                   
##     su_gun6         fips       votes_dem_2016    votes_gop_2016  
##  [ 0, 4):522   Min.   : 1001   Min.   :      4   Min.   :    57  
##  [ 4, 7):524   1st Qu.:18179   1st Qu.:   1175   1st Qu.:  3238  
##  [ 7, 8):525   Median :29177   Median :   3194   Median :  7262  
##  [ 8,10):524   Mean   :30390   Mean   :  20616   Mean   : 20626  
##  [10,12):523   3rd Qu.:45081   3rd Qu.:  10031   3rd Qu.: 18134  
##  [12,54]:523   Max.   :56045   Max.   :1893770   Max.   :620285  
##                                NA's   :1         NA's   :1       
##  total_votes_2016   per_dem_2016      per_gop_2016       diff_2016      
##  Min.   :     64   Min.   :0.03145   Min.   :0.08322   Min.   :      2  
##  1st Qu.:   4868   1st Qu.:0.20537   1st Qu.:0.54615   1st Qu.:   1608  
##  Median :  11140   Median :0.28630   Median :0.66542   Median :   3797  
##  Mean   :  43493   Mean   :0.31743   Mean   :0.63535   Mean   :  11784  
##  3rd Qu.:  29773   3rd Qu.:0.39794   3rd Qu.:0.75028   3rd Qu.:   8810  
##  Max.   :2652072   Max.   :0.89329   Max.   :0.95273   Max.   :1273485  
##  NA's   :1         NA's   :1         NA's   :1         NA's   :1        
##   per_dem_2012      per_gop_2012       diff_2012         winner         
##  Min.   :0.03448   Min.   :0.06006   Min.   :     1   Length:3141       
##  1st Qu.:0.27797   1st Qu.:0.50762   1st Qu.:  1048   Class :character  
##  Median :0.37135   Median :0.60942   Median :  2565   Mode  :character  
##  Mean   :0.38477   Mean   :0.59800   Mean   :  9503                     
##  3rd Qu.:0.47518   3rd Qu.:0.70327   3rd Qu.:  6468                     
##  Max.   :0.93355   Max.   :0.95862   Max.   :972564                     
##  NA's   :30        NA's   :30        NA's   :30                         
##  partywinner16        winner12         partywinner12        flipped         
##  Length:3141        Length:3141        Length:3141        Length:3141       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   numeric_fips  
##  Min.   : 1001  
##  1st Qu.:18179  
##  Median :29177  
##  Mean   :30390  
##  3rd Qu.:45081  
##  Max.   :56045  
## 

Exercise

Do a histogram of the pop variable.

Answer

counties %>% 
  ggplot(aes(x = pop)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise

This is very difficult use logs instead.

counties %>% 
  ggplot(aes(x = pop)) +
  geom_histogram() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise

Facet the histogram by census region

counties %>% 
  ggplot(aes(x = pop)) +
  geom_histogram() +
  scale_x_log10() +
  facet_wrap(~census_region,ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise

Free the scale on the y-axis.

Answer

counties %>% 
  ggplot(aes(x = pop)) +
  geom_histogram() +
  scale_x_log10() +
  facet_wrap(~census_region,
             ncol = 1,
             scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Exercise

Use geom_density() instead of geom_histogram().

counties %>% 
  ggplot(aes(x = pop)) +
  geom_density() +
  scale_x_log10() +
  facet_wrap(~census_region,
             ncol = 1,
             scales = "free_y")

Exercise

Create a new dataframe PNW from counties. Take the counties from Washington, Oregon and Idaho.

Answer

PNW = counties %>% 
  filter(state == "WA" | state == "OR" | state == "ID")
glimpse(PNW)
## Rows: 119
## Columns: 33
## $ id               <chr> "16001", "16003", "16005", "16007", "16009", "16011",…
## $ name             <chr> "Ada County", "Adams County", "Bannock County", "Bear…
## $ state            <fct> ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, I…
## $ census_region    <fct> West, West, West, West, West, West, West, West, West,…
## $ pop_dens         <fct> "[  100,  500)", "[    0,   10)", "[   50,  100)", "[…
## $ pop_dens4        <fct> "[118,71672]", "[  0,   17)", "[ 45,  118)", "[  0,  …
## $ pop_dens6        <fct> "[215,71672]", "[  0,    9)", "[ 45,   82)", "[  0,  …
## $ pct_black        <fct> "[ 0.0, 2.0)", "[ 0.0, 2.0)", "[ 0.0, 2.0)", "[ 0.0, …
## $ pop              <int> 426236, 3861, 83347, 5957, 9118, 45269, 21482, 6824, …
## $ female           <dbl> 50.0, 48.9, 50.2, 50.3, 49.1, 49.7, 49.1, 48.7, 50.0,…
## $ white            <dbl> 92.5, 96.6, 91.9, 97.3, 86.7, 89.5, 95.7, 95.4, 95.9,…
## $ black            <dbl> 1.3, 0.2, 0.8, 0.3, 0.3, 0.5, 0.3, 0.4, 0.3, 0.7, 0.4…
## $ travel_time      <dbl> 19.6, 19.3, 18.8, 20.0, 21.7, 21.1, 16.9, 31.5, 23.3,…
## $ land_area        <dbl> 1052.58, 1363.06, 1111.99, 974.79, 776.62, 2093.98, 2…
## $ hh_income        <int> 55210, 35434, 43534, 44964, 39049, 47941, 64042, 4105…
## $ su_gun4          <fct> "[ 5, 8)", "[11,54]", "[11,54]", "[11,54]", "[11,54]"…
## $ su_gun6          <fct> "[ 7, 8)", "[12,54]", "[10,12)", "[12,54]", "[12,54]"…
## $ fips             <dbl> 16001, 16003, 16005, 16007, 16009, 16011, 16013, 1601…
## $ votes_dem_2016   <int> 75676, 415, 10342, 255, 780, 2924, 6416, 777, 5140, 8…
## $ votes_gop_2016   <int> 93748, 1556, 17180, 2203, 3101, 10907, 3340, 2673, 10…
## $ total_votes_2016 <int> 195587, 2183, 33405, 2928, 4192, 16625, 10723, 3812, …
## $ per_dem_2016     <dbl> 0.38691733, 0.19010536, 0.30959437, 0.08709016, 0.186…
## $ per_gop_2016     <dbl> 0.4793161, 0.7127806, 0.5142943, 0.7523907, 0.7397424…
## $ diff_2016        <int> 18072, 1141, 6838, 1948, 2321, 7983, 3076, 1896, 5080…
## $ per_dem_2012     <dbl> 0.42715997, 0.28009709, 0.37475893, 0.10701630, 0.299…
## $ per_gop_2012     <dbl> 0.5404217, 0.6859223, 0.5958593, 0.8819986, 0.6683831…
## $ diff_2012        <int> 20428, 836, 7796, 2187, 1432, 9618, 2053, 1331, 4867,…
## $ winner           <chr> "Trump", "Trump", "Trump", "Trump", "Trump", "Trump",…
## $ partywinner16    <chr> "Republican", "Republican", "Republican", "Republican…
## $ winner12         <chr> "Romney", "Romney", "Romney", "Romney", "Romney", "Ro…
## $ partywinner12    <chr> "Republican", "Republican", "Republican", "Republican…
## $ flipped          <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",…
## $ numeric_fips     <dbl> 16001, 16003, 16005, 16007, 16009, 16011, 16013, 1601…

Exercise

Run the table command on PNW to see how the states compare on the number of counties.

table(PNW$state)
## 
## AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT 
##  0  0  0  0  0  0  0  0  0  0  0  0 44  0  0  0  0  0  0  0  0  0  0  0  0  0 
## NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 
##  0  0  0  0  0  0  0  0  0  0 36  0  0  0  0  0  0  0  0  0 39  0  0  0

Exercise

Fix PNW to get rid of the states that don’t exist. Then rerun the table.

PNW = PNW %>% 
  mutate(state = factor(state))
table(PNW$state)
## 
## ID OR WA 
## 44 36 39

Exercise

Make a density plot of the populations of the counties in these state.

PNW %>% 
  ggplot(aes(x = pop)) +
  geom_density()

We have another right skew problem. Fix it.

Answer

PNW %>% 
  ggplot(aes(x = pop)) +
  geom_density() + 
  scale_x_log10()

Exercise

Do a facet by state.

PNW %>% 
  ggplot(aes(x = pop)) +
  geom_density() + 
  geom_rug() +
  scale_x_log10() +
  facet_wrap(~state,ncol = 1)