Setup

Get the socviz package and tidyverse. Build the counties dataframe as in the previous set of exercises.

library(socviz)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

counties = county_data %>% 
  mutate(numeric_fips = as.numeric(id)) %>% 
  filter(numeric_fips %% 1000 != 0 &
         numeric_fips %/% 1000 != 11) %>% 
  mutate(state = factor(state)) %>% 
  filter( pop > 0 )

Trim

Let’s reduce the number of variables in counties and add a continuous variable density.

Answer

counties = counties %>% 
  mutate(density = pop/land_area) %>%
  select(fips,name,census_region,state,pop,land_area, density)

Make Long and Narrow

Convert counties to counties_long by collapsing pop, density, and land_area into a variable measure.

Answer

counties_long = counties %>% 
  pivot_longer(cols = pop:density, names_to = "measure", values_to = "value")

head(counties_long)

## # A tibble: 6 x 6
##    fips name           census_region state measure      value
##   <dbl> <chr>          <fct>         <fct> <chr>        <dbl>
## 1  1001 Autauga County South         AL    pop        55395  
## 2  1001 Autauga County South         AL    land_area    594. 
## 3  1001 Autauga County South         AL    density       93.2
## 4  1003 Baldwin County South         AL    pop       200111  
## 5  1003 Baldwin County South         AL    land_area   1590. 
## 6  1003 Baldwin County South         AL    density      126.

Graph

Use facet_grid() to show how the regions compare for these measures.

Answer

counties_long %>% 
  ggplot(aes(x = value)) +
  geom_density() + 
  geom_rug() + 
  scale_x_log10() +
  facet_grid(census_region ~ measure, scales = "free_x")

Smaller

That is a bit busy and possibly hard to read. Use filter to compare just the West and the Northeast for land area.

Answer

counties_long %>% 
  filter(census_region %in% c("West","Northeast") &
           measure == "land_area") %>% 
  ggplot(aes(x = value)) +
  geom_density() + 
  geom_rug() + 
  scale_x_log10() +
  facet_grid(census_region ~ measure)

Summarize

Let’s use summarize to compare the land areas of counties in the regions. Capture min, median, and max values.

Answer

counties_long %>% 
  filter(measure == "land_area") %>% 
  group_by(census_region) %>% 
  summarize(min = min(value),
            median = median(value),
            max = max(value)) %>% 
  ungroup() %>% 
  arrange(median) %>% 
  ggplot(aes(y = census_region)) +
     geom_point(aes(x = min),color = "blue") +
     geom_point(aes(x = median),color = "black") +
     geom_point(aes(x = max),color = "red") +
  ggtitle("Comparison of Land Areas")

Fix the Compression

Use a log scale on the x-axis

Answer

counties_long %>% 
  filter(measure == "land_area") %>% 
  group_by(census_region) %>% 
  summarize(min = min(value),
            median = median(value),
            max = max(value)) %>% 
  ungroup() %>% 
  arrange(median) %>% 
  ggplot(aes(y = census_region)) +
     geom_point(aes(x = min),color = "blue") +
     geom_point(aes(x = median),color = "black") +
     geom_point(aes(x = max),color = "red") +
  ggtitle("Comparison of Land Areas") +
  scale_x_log10()

Joining the Uninsured Rate

The following code will create a dataframe containing the uninsured rate by county.

URL <- "https://healthinequality.org/dl/health_ineq_online_table_12.csv"
characteristicsData <- read.csv(URL)
uninsured_rate = characteristicsData %>% 
  select(cty,puninsured2010,county_name,statename)
str(uninsured_rate)

## 'data.frame':    3138 obs. of  4 variables:
##  $ cty           : int  1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
##  $ puninsured2010: num  13.6 19.1 18.5 17.7 19.3 ...
##  $ county_name   : chr  "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ statename     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...

Now I want to see the distribution of county uninsured rates by census_region.

Get Regions Table

Trim the counties dataframe down to just the identifying variables.

Answer

regions = counties %>% 
  select(fips,census_region, state, name)
head(regions)

##   fips census_region state           name
## 1 1001         South    AL Autauga County
## 2 1003         South    AL Baldwin County
## 3 1005         South    AL Barbour County
## 4 1007         South    AL    Bibb County
## 5 1009         South    AL  Blount County
## 6 1011         South    AL Bullock County

joined = regions %>% 
  left_join(uninsured_rate,by = c("fips" = "cty"))

head(joined)

##   fips census_region state           name puninsured2010 county_name statename
## 1 1001         South    AL Autauga County       13.60128     Autauga   Alabama
## 2 1003         South    AL Baldwin County       19.08533     Baldwin   Alabama
## 3 1005         South    AL Barbour County       18.51381     Barbour   Alabama
## 4 1007         South    AL    Bibb County       17.71883        Bibb   Alabama
## 5 1009         South    AL  Blount County       19.28041      Blount   Alabama
## 6 1011         South    AL Bullock County       16.24321     Bullock   Alabama

Graph

Use facet_wrap() to see the distribution of the uninsured rate by region.

Answer

joined %>% 
  ggplot(aes(x = puninsured2010)) +
  geom_density() +
  geom_rug() +
  facet_wrap(~census_region,ncol = 1)

## Warning: Removed 10 rows containing non-finite values (stat_density).

Pivot Join Summarize

Setup

Trim

Answer

Make Long and Narrow

Answer

Graph

Answer

Smaller

Answer

Summarize

Answer

Fix the Compression

Answer

Joining the Uninsured Rate

Get Regions Table

Answer

Graph

Answer