1) Import the data set using a Tiderverse function. How many rows and columns are in the data set?

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
ACS <- read_csv('homework3/acs_2015_county_data_revised.csv')
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   state = col_character(),
##   county = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
dim(ACS)
## [1] 3142   35

There are 3142 rows and 35 columns.

2) Do any data types need changed? Show any code to change variable types and show code/output and show a glimpse() command after you’re finished.

glimpse(ACS)
## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, …
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama",…
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bu…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1166…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, …
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374,…
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6,…
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, …
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4,…
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3,…
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,…
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, …
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, …
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, …
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, …
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, …
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 1…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, …
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, …
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2,…
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1,…
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4,…
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9,…
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, …
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 13…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, …
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, …
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1,…
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5,…
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, …

The variable census_id can be changed to a character as it’s a unique id. We can organize the data bit better with that being a character value.

The variable state can be changed into a factor so we can graph and organize the data better.

ACS$census_id <- as.character(ACS$census_id)
ACS$state <- as.factor(ACS$state)
glimpse(ACS)
## Rows: 3,142
## Columns: 35
## $ census_id      <chr> "1001", "1003", "1005", "1007", "1009", "1011", "1013"…
## $ state          <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama, …
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bu…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 1166…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, …
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374,…
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6,…
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, …
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4,…
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3,…
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,…
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, …
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, …
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, …
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, …
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, …
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 1…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, …
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, …
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2,…
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1,…
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4,…
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9,…
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, …
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 13…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, …
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, …
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1,…
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5,…
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, …

3) Are there any missing values? How will you handle missing values? Will you impute missing values with, for example, a mean or median value for the entire column, or will you remove the entire observation? Give a rationale for your decision and show any code/output to handle missing values.

colSums(is.na(ACS))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

There are only two missing values, one in child_poverty and one in income.

For the NA value in child_poverty I’m going to replace NA with 0 as the population of the county is small so there may be a chance that there are no children in that county.

For the NA value in income I’m going to multiply the income_per_cap by two and replace it with that number, as it seems that the income for most places is about double the income_per_cap on average.

ACS$child_poverty <- ACS$child_poverty %>%
  replace_na(0)
ACS$income <- ACS$income %>% 
  replace_na(51754)
sum(is.na(ACS))
## [1] 0

4) Use the summary() function to examine any unusual values. Are there any? If so, how will you handle these unusual values? Show any code/output to handle unusual values.

summary(ACS)
##   census_id              state         county            total_pop       
##  Length:3142        Texas   : 254   Length:3142        Min.   :      85  
##  Class :character   Georgia : 159   Class :character   1st Qu.:   11028  
##  Mode  :character   Virginia: 133   Mode  :character   Median :   25768  
##                     Kentucky: 120                      Mean   :  100737  
##                     Missouri: 115                      3rd Qu.:   67552  
##                     Kansas  : 105                      Max.   :10038388  
##                     (Other) :2256                                        
##       men              women            hispanic          white      
##  Min.   :     42   Min.   :     43   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5546   1st Qu.:   5466   1st Qu.: 1.900   1st Qu.:65.60  
##  Median :  12826   Median :  12907   Median : 3.700   Median :84.60  
##  Mean   :  49565   Mean   :  51171   Mean   : 8.826   Mean   :77.28  
##  3rd Qu.:  33319   3rd Qu.:  34122   3rd Qu.: 9.000   3rd Qu.:93.30  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##                                                                      
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.879   Mean   : 1.766   Mean   : 1.258   Mean   : 0.08475  
##  3rd Qu.:10.175   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :35.30000  
##                                                                       
##     citizen            income       income_per_cap     poverty    
##  Min.   :     80   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8254   1st Qu.: 38826   1st Qu.:20471   1st Qu.:12.0  
##  Median :  19434   Median : 45112   Median :23577   Median :16.0  
##  Mean   :  70804   Mean   : 46832   Mean   :24338   Mean   :16.7  
##  3rd Qu.:  50728   3rd Qu.: 52249   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##                                                                   
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.28   Mean   :31.04   Mean   :18.26   Mean   :22.13  
##  3rd Qu.:29.48   3rd Qu.:34.40   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##                                                                 
##   construction     production        drive          carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.20   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.53   1st Qu.:76.60   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.60   Median : 9.90  
##  Mean   :12.74   Mean   :15.82   Mean   :79.08   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.60   3rd Qu.:11.88  
##  Max.   :40.30   Max.   :55.60   Max.   :94.60   Max.   :29.90  
##                                                                 
##     transit             walk         other_transp     work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.900   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.300   Median : 4.000  
##  Mean   : 0.9675   Mean   : 3.307   Mean   : 1.614   Mean   : 4.697  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.900   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.100   Max.   :37.200  
##                                                                      
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :     62   Min.   :25.00   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4524   1st Qu.:70.90   1st Qu.:13.10  
##  Median :22.90   Median :  10644   Median :75.80   Median :16.10  
##  Mean   :23.15   Mean   :  46387   Mean   :74.44   Mean   :17.35  
##  3rd Qu.:26.60   3rd Qu.:  29254   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##                                                                   
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.921   Mean   :0.2915   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400  
## 

There are many instances of the mean being higher than the median, but that’s mostly caused by heavily populated counties skewing the data a bit. There are many low-mid populated counties compared to high populated counties, but the high populated counties have such higher numbers compared to the rest of the data that it tends to skew much of the data.

For example, total_pop is heavily skewed because the mean is significantly higher than the median. A few other variables that do this (and are similar to population) are men, women, hispanic, black, citizen, and employed.

I will be leaving the values in, but may use the median instead of the mean for pulling data from the variables listed above.

5) How many counties have more women than man?

women_greater <- ACS %>% 
  filter(ACS$women > ACS$men)
count(women_greater)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  1985

There are 1,985 counties with more women than men.

6) How many counties have an unemployment rate lower than 10%?

unemployment_10 <- ACS %>% 
  filter(ACS$unemployment < 10.0)
count(unemployment_10)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  2420

There are 2,420 counties with an unemployment rate lower than 10%

7) What are the top 10 counties with the highest mean commute? Show the census ID, county name, state, and the mean_commute in your final answer (sorted by mean_commute)

highest_commute <- ACS %>% 
  select(census_id, county, state, mean_commute)

highest_commute %>% 
  head(arrange(desc(mean_commute)), n = 10)
## # A tibble: 10 x 4
##    census_id county   state   mean_commute
##    <chr>     <chr>    <fct>          <dbl>
##  1 1001      Autauga  Alabama         26.5
##  2 1003      Baldwin  Alabama         26.4
##  3 1005      Barbour  Alabama         24.1
##  4 1007      Bibb     Alabama         28.8
##  5 1009      Blount   Alabama         34.9
##  6 1011      Bullock  Alabama         27.5
##  7 1013      Butler   Alabama         24.6
##  8 1015      Calhoun  Alabama         24.1
##  9 1017      Chambers Alabama         25.1
## 10 1019      Cherokee Alabama         27.4

8) Create a new variable that calculates the percentage of women for each county and then find the top 10 counties with the lowest percentages. Show the census ID, county name, state, and the percentage in your final answer (sorted by ascending percentage)

ACS <- ACS %>% 
  mutate(percentage_of_women = women/total_pop)

women_percent <- ACS %>% 
  select(census_id, county, state, percentage_of_women)

women_percent <- women_percent %>% 
  arrange(percentage_of_women)

head(women_percent, 10)
## # A tibble: 10 x 4
##    census_id county                 state        percentage_of_women
##    <chr>     <chr>                  <fct>                      <dbl>
##  1 42053     Forest                 Pennsylvania               0.268
##  2 8011      Bent                   Colorado                   0.314
##  3 51183     Sussex                 Virginia                   0.315
##  4 13309     Wheeler                Georgia                    0.321
##  5 6035      Lassen                 California                 0.332
##  6 48095     Concho                 Texas                      0.333
##  7 13053     Chattahoochee          Georgia                    0.334
##  8 2013      Aleutians East Borough Alaska                     0.335
##  9 22125     West Feliciana         Louisiana                  0.336
## 10 32027     Pershing               Nevada                     0.337

9) Create a new variable that calculates the sum of all race percentage variables.

ACS <- ACS %>% 
  mutate(race_total = ACS$hispanic + ACS$white + ACS$black + ACS$native + ACS$asian + ACS$pacific)

a) What are the top 10 counties with the lowest sum of these race percentage variables?

ACS %>% 
  select(census_id, state, county, race_total) %>% 
  arrange(race_total) %>% 
  top_n(10, ACS$race_total)
## # A tibble: 11 x 4
##    census_id state          county    race_total
##    <chr>     <fct>          <chr>          <dbl>
##  1 21213     Kentucky       Simpson         98.3
##  2 50019     Vermont        Orleans         98.5
##  3 54069     West Virginia  Ohio            98.5
##  4 27153     Minnesota      Todd            98.5
##  5 37005     North Carolina Alleghany       99.1
##  6 34019     New Jersey     Hunterdon       99.1
##  7 37115     North Carolina Madison         99.1
##  8 13181     Georgia        Lincoln         99.2
##  9 13207     Georgia        Monroe          99.2
## 10 48193     Texas          Hamilton        99.2
## 11 1105      Alabama        Perry           99.3

b) Which state, on average, has the lowest sum of these race percentage variables?

North Carolina

c) Do any counties have a sum greater than 100%?

I do not see any counties with a sum greater than 100%

d) How many states have a sum that equals exactly to 100%?

ACS %>% 
  filter(ACS$race_total == 100) %>% 
  count()
## # A tibble: 1 x 1
##       n
##   <int>
## 1    27

27 total counties with exactly 100%

10) using the carpool variable,

a) Use the dplyr::min_rank() function to create a new variable called carpool_rank where the highest ranked county is the county with the highest carpool value. Read the documentation carefully for the ranking function.

b) Find the 10 highest ranked counties for carpooling. Show the census ID, county name, state, carpool value, and carpool_rank in your final answer.

ACS$carpool_rank <- min_rank(ACS$carpool)

ACS %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(carpool_rank) %>% 
  top_n(-10, carpool_rank)
## # A tibble: 11 x 5
##    census_id county      state        carpool carpool_rank
##    <chr>     <chr>       <fct>          <dbl>        <int>
##  1 48261     Kenedy      Texas            0              1
##  2 48269     King        Texas            0              1
##  3 48235     Irion       Texas            0.9            3
##  4 31183     Wheeler     Nebraska         1.3            4
##  5 36061     New York    New York         1.9            5
##  6 13309     Wheeler     Georgia          2.3            6
##  7 38029     Emmons      North Dakota     2.3            6
##  8 30019     Daniels     Montana          2.6            8
##  9 31057     Dundy       Nebraska         2.6            8
## 10 46069     Hyde        South Dakota     2.8           10
## 11 51720     Norton city Virginia         2.8           10

c) Find the 10 lowest ranked counties for carpooling. Show the same variables in your final answer.

ACS %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(desc(carpool_rank)) %>% 
  top_n(10, carpool_rank)
## # A tibble: 10 x 5
##    census_id county   state    carpool carpool_rank
##    <chr>     <chr>    <fct>      <dbl>        <int>
##  1 13061     Clay     Georgia     29.9         3142
##  2 18087     LaGrange Indiana     27           3141
##  3 13165     Jenkins  Georgia     25.3         3140
##  4 5133      Sevier   Arkansas    24.4         3139
##  5 20175     Seward   Kansas      23.4         3138
##  6 48079     Cochran  Texas       22.8         3137
##  7 48247     Jim Hogg Texas       22.6         3136
##  8 48393     Roberts  Texas       22.4         3135
##  9 39075     Holmes   Ohio        21.8         3134
## 10 21197     Powell   Kentucky    21.6         3133

d) On average, what state is the best ranked for carpooling?

e) What are the top 5 states for carpooling?

ACS %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  group_by(state) %>% 
  summarize(avg_carpool = mean(carpool)) %>% 
  top_n(5, avg_carpool)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 2
##   state    avg_carpool
##   <fct>          <dbl>
## 1 Alaska          12.1
## 2 Arkansas        11.9
## 3 Nevada          11.7
## 4 Texas           11.8
## 5 Utah            11.9