Setup

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Restore the saved objects from the .Rdata file; later steps expect it to
# provide `county_clean`.
load(file = "county_clean.Rdata")
# Read the state centroid table (columns State, Latitude, Longitude).
State_Centroids <- read_csv(file = "State Centroids.csv")

## Rows: 50 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): State
## dbl (2): Latitude, Longitude
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Examine county_clean

Solution

# Compact overview: one line per column showing its type and first values.
county_clean %>%
  glimpse()

## Rows: 3,135
## Columns: 14
## $ name              <fct> Autauga County, Baldwin County, Barbour County, Bibb…
## $ state             <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama…
## $ pop2000           <dbl> 43671, 140415, 29038, 20826, 51024, 11714, 21399, 11…
## $ pop2010           <dbl> 54571, 182265, 27457, 22915, 57322, 10914, 20947, 11…
## $ pop2017           <int> 55504, 212628, 25270, 22668, 58013, 10309, 19825, 11…
## $ pop_change        <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28, -2.69, -1.51, …
## $ poverty           <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.8…
## $ homeownership     <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 69.0, 70.7, 71.4…
## $ multi_unit        <dbl> 7.2, 22.6, 11.1, 6.6, 3.7, 9.9, 13.7, 14.3, 8.7, 4.3…
## $ unemployment_rate <dbl> 3.86, 3.99, 5.90, 4.39, 4.02, 4.93, 5.49, 4.93, 4.08…
## $ metro             <fct> yes, yes, no, yes, yes, no, no, yes, no, no, yes, no…
## $ median_edu        <fct> some_college, some_college, hs_diploma, hs_diploma, …
## $ per_capita_income <dbl> 27841.70, 27779.85, 17891.73, 20572.05, 21367.39, 15…
## $ median_hh_income  <int> 55317, 52562, 33368, 43404, 47412, 29655, 36326, 436…

# Column-wise summaries: level counts for factors, quartiles and mean for
# numeric columns.
county_clean %>%
  summary()

##                 name           state         pop2000           pop2010       
##  Washington County:  30   Texas   : 254   Min.   :     67   Min.   :     82  
##  Jefferson County :  25   Georgia : 159   1st Qu.:  11260   1st Qu.:  11154  
##  Franklin County  :  24   Virginia: 133   Median :  24663   Median :  25910  
##  Jackson County   :  23   Kentucky: 120   Mean   :  89757   Mean   :  98455  
##  Lincoln County   :  23   Missouri: 115   3rd Qu.:  61896   3rd Qu.:  67016  
##  Madison County   :  19   Kansas  : 105   Max.   :9519338   Max.   :9818605  
##  (Other)          :2991   (Other) :2249                                      
##     pop2017           pop_change          poverty      homeownership 
##  Min.   :     134   Min.   :-33.6300   Min.   : 2.40   Min.   :20.7  
##  1st Qu.:   11004   1st Qu.: -1.9750   1st Qu.:11.35   1st Qu.:69.5  
##  Median :   25862   Median : -0.0700   Median :15.20   Median :74.6  
##  Mean   :  103867   Mean   :  0.5292   Mean   :15.98   Mean   :73.3  
##  3rd Qu.:   67756   3rd Qu.:  2.3700   3rd Qu.:19.40   3rd Qu.:78.4  
##  Max.   :10163507   Max.   : 37.1900   Max.   :52.00   Max.   :91.3  
##                                                                      
##    multi_unit    unemployment_rate metro             median_edu  
##  Min.   : 0.00   Min.   : 1.620    no :1971   below_hs    :   2  
##  1st Qu.: 6.10   1st Qu.: 3.520    yes:1164   hs_diploma  :1397  
##  Median : 9.70   Median : 4.360               some_college:1691  
##  Mean   :12.32   Mean   : 4.607               bachelors   :  45  
##  3rd Qu.:15.90   3rd Qu.: 5.350                                  
##  Max.   :98.50   Max.   :19.070                                  
##                                                                  
##  per_capita_income median_hh_income
##  Min.   :10467     Min.   : 19264  
##  1st Qu.:21765     1st Qu.: 41124  
##  Median :25442     Median : 48038  
##  Mean   :26074     Mean   : 49739  
##  3rd Qu.:29250     3rd Qu.: 55751  
##  Max.   :69533     Max.   :129588  
##

Get a Subset of Variables

Create county_small containing the variables from the start of the dataframe (name) through homeownership.

Solution

# Keep the contiguous run of columns from `name` through `homeownership`.
county_small <- select(county_clean, name:homeownership)
county_small %>%
  glimpse()

## Rows: 3,135
## Columns: 8
## $ name          <fct> Autauga County, Baldwin County, Barbour County, Bibb Cou…
## $ state         <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama, Al…
## $ pop2000       <dbl> 43671, 140415, 29038, 20826, 51024, 11714, 21399, 112249…
## $ pop2010       <dbl> 54571, 182265, 27457, 22915, 57322, 10914, 20947, 118572…
## $ pop2017       <int> 55504, 212628, 25270, 22668, 58013, 10309, 19825, 114728…
## $ pop_change    <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28, -2.69, -1.51, -1.2…
## $ poverty       <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 24.4, 18.6, 18.8, 16…
## $ homeownership <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 69.0, 70.7, 71.4, 77…

Get Rid of…

Alaska, Hawaii, and District of Columbia.

Create a new version of county_small with these entities eliminated.

Solution

# Drop every county belonging to Alaska, Hawaii, or the District of Columbia.
# Filtering rows does not drop factor levels, so `state` still reports all
# of its original levels in the str() output.
county_small <- county_small %>%
  filter(
    !(state %in% c("Alaska", "Hawaii", "District of Columbia"))
  )
str(county_small)

## tibble [3,105 × 8] (S3: tbl_df/tbl/data.frame)
##  $ name         : Factor w/ 1877 levels "Abbeville County",..: 83 90 101 150 165 226 236 249 297 319 ...
##  $ state        : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ pop2000      : num [1:3105] 43671 140415 29038 20826 51024 ...
##  $ pop2010      : num [1:3105] 54571 182265 27457 22915 57322 ...
##  $ pop2017      : int [1:3105] 55504 212628 25270 22668 58013 10309 19825 114728 33713 25857 ...
##  $ pop_change   : num [1:3105] 1.48 9.19 -6.22 0.73 0.68 -2.28 -2.69 -1.51 -1.2 -0.6 ...
##  $ poverty      : num [1:3105] 13.7 11.8 27.2 15.2 15.6 28.5 24.4 18.6 18.8 16.1 ...
##  $ homeownership: num [1:3105] 77.5 76.7 68 82.9 82 76.9 69 70.7 71.4 77.5 ...

Summarize by State.

Create a dataframe named states with one record per state. In addition to the name of the state, the dataframe should contain (based on pop2017) the total state population and the mean and median of the county populations. Arrange the dataframe in descending order of the mean county population.

Solution

# One row per state: total, mean, and median of the 2017 county populations,
# ordered from the largest mean county population to the smallest.
states <- county_small %>%
  group_by(state) %>%
  summarise(
    total_pop = sum(pop2017),
    mean_pop = mean(pop2017),
    median_pop = median(pop2017)
  ) %>%
  ungroup() %>%
  arrange(desc(mean_pop))

# Show the first rows, i.e. the states with the largest mean county population.
states %>%
  head()

## # A tibble: 6 × 4
##   state         total_pop mean_pop median_pop
##   <fct>             <int>    <dbl>      <dbl>
## 1 California     39536653  681666.    185908.
## 2 Massachusetts   6859819  489987.    492480 
## 3 Arizona         7016270  467751.    124756 
## 4 Connecticut     3588184  448523     225605 
## 5 New Jersey      9005644  428840.    448596 
## 6 Delaware         961939  320646.    225322

# Inspect the structure (dimensions and column types) of the summary table.
states %>%
  str()

## tibble [48 × 4] (S3: tbl_df/tbl/data.frame)
##  $ state     : Factor w/ 51 levels "Alabama","Alaska",..: 5 22 3 7 31 8 33 10 21 40 ...
##  $ total_pop : int [1:48] 39536653 6859819 7016270 3588184 9005644 961939 19849399 20984400 6052177 1059639 ...
##  $ mean_pop  : num [1:48] 681666 489987 467751 448523 428840 ...
##  $ median_pop: num [1:48] 185908 492480 124756 225605 448596 ...

Make it longer

Convert the data to a long format by collapsing the three population summary columns (total_pop, mean_pop, and median_pop) into a single column named ‘value’, with the name of the original column in an adjoining column named ‘measure’. Name the new dataframe ‘states_long’.

Solution

# Stack the three *_pop summary columns: the former column name goes to
# `measure`, and its number goes to `value`.
states_long <- states %>%
  pivot_longer(
    cols = ends_with("_pop"),
    names_to = "measure",
    values_to = "value"
  )
head(states_long, n = 12)

## # A tibble: 12 × 3
##    state         measure        value
##    <fct>         <chr>          <dbl>
##  1 California    total_pop  39536653 
##  2 California    mean_pop     681666.
##  3 California    median_pop   185908.
##  4 Massachusetts total_pop   6859819 
##  5 Massachusetts mean_pop     489987.
##  6 Massachusetts median_pop   492480 
##  7 Arizona       total_pop   7016270 
##  8 Arizona       mean_pop     467751.
##  9 Arizona       median_pop   124756 
## 10 Connecticut   total_pop   3588184 
## 11 Connecticut   mean_pop     448523 
## 12 Connecticut   median_pop   225605

Add Geographic Information

The dataframe State_Centroids contains the coordinates of the centroid of each state. Left join it to states. Note that the variable ‘state’ in states is spelled ‘State’ in State_Centroids. Rename the variable in states to simplify the join. Call the joined dataframe ‘with_centroids’.

Solution

# Rename the key so both tables share a `State` column, then attach each
# state's centroid coordinates. The join key is given explicitly so the join
# does not depend on whichever column names the two tables happen to share,
# and so dplyr does not print its "Joining, by = ..." message.
with_centroids <- states %>%
  rename(State = state) %>%
  left_join(State_Centroids, by = "State")

# Inspect the joined table; `State` is reported as character after the join,
# and Latitude/Longitude are the columns contributed by State_Centroids.
with_centroids %>%
  str()

## tibble [48 × 6] (S3: tbl_df/tbl/data.frame)
##  $ State     : chr [1:48] "California" "Massachusetts" "Arizona" "Connecticut" ...
##  $ total_pop : int [1:48] 39536653 6859819 7016270 3588184 9005644 961939 19849399 20984400 6052177 1059639 ...
##  $ mean_pop  : num [1:48] 681666 489987 467751 448523 428840 ...
##  $ median_pop: num [1:48] 185908 492480 124756 225605 448596 ...
##  $ Latitude  : num [1:48] 36.1 42.2 33.7 41.6 40.3 ...
##  $ Longitude : num [1:48] -119.7 -71.5 -111.4 -72.8 -74.5 ...

Chapter 3 Extra

Setup

Examine county_clean

Solution

Get a Subset of Variables

Solution

Get Rid of…

Solution

Summarize by State.

Solution

Make it longer

Solution

Add Geographic Information

Solution