Importing Data and Data Cleaning

Question 01

The dataset contains 3,142 rows and 35 columns.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
acs <- read_csv("acs_2015_county_data_revised.csv")
## Rows: 3142 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(acs)
## [1] 3142   35
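As the import message suggests, the column specification can be declared up front, which makes the import reproducible and silences the message. A minimal sketch based on the specification reported above (note that it already imports census_id as character, anticipating the conversion in Question 02):

acs <- read_csv(
  "acs_2015_county_data_revised.csv",
  col_types = cols(
    census_id = col_character(),  # treat IDs as text from the start
    state     = col_character(),
    county    = col_character(),
    .default  = col_double()      # every remaining column is numeric
  )
)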

Question 02

Most variables already had appropriate numeric types. I converted state to a factor (it is categorical) and census_id to character, since FIPS codes are identifiers rather than quantities.

glimpse(acs)
## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
acs <- acs %>%
  mutate(
    state = as_factor(state),       
    census_id = as.character(census_id)
  )

glimpse(acs)
## Rows: 3,142
## Columns: 35
## $ census_id      <chr> "1001", "1003", "1005", "1007", "1009", "1011", "1013",…
## $ state          <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama, A…
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
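One caveat worth flagging: county FIPS codes are five digits, and importing them as numbers drops leading zeros (Autauga County's canonical code is 01001, shown above as 1001). If census_id will ever be joined against other census files, stringr's str_pad() can restore the canonical form; a sketch:

acs <- acs %>%
  mutate(census_id = str_pad(census_id, width = 5, pad = "0"))  # "1001" -> "01001"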

Question 03

I found that only two variables had missing values:

  • income: 1 missing value

  • child_poverty: 1 missing value

colSums(is.na(acs))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0
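Before dropping them, the two affected rows can be pulled up for inspection (a quick sketch; output omitted):

acs %>%
  filter(is.na(income) | is.na(child_poverty)) %>%
  select(census_id, state, county, income, child_poverty)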

I removed these two observations. Since they account for less than 0.1% of the dataset (2 out of 3,142 rows), the data loss is negligible.

acs_clean <- acs %>% drop_na()

colSums(is.na(acs_clean))
##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              0              0 
##        poverty  child_poverty   professional        service         office 
##              0              0              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

Question 04

I used the summary() function to screen every variable for unusual or invalid values.

summary(acs)
##   census_id              state         county            total_pop       
##  Length:3142        Texas   : 254   Length:3142        Min.   :      85  
##  Class :character   Georgia : 159   Class :character   1st Qu.:   11028  
##  Mode  :character   Virginia: 133   Mode  :character   Median :   25768  
##                     Kentucky: 120                      Mean   :  100737  
##                     Missouri: 115                      3rd Qu.:   67552  
##                     Kansas  : 105                      Max.   :10038388  
##                     (Other) :2256                                        
##       men              women            hispanic          white      
##  Min.   :     42   Min.   :     43   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5546   1st Qu.:   5466   1st Qu.: 1.900   1st Qu.:65.60  
##  Median :  12826   Median :  12907   Median : 3.700   Median :84.60  
##  Mean   :  49565   Mean   :  51172   Mean   : 8.826   Mean   :77.28  
##  3rd Qu.:  33319   3rd Qu.:  34122   3rd Qu.: 9.000   3rd Qu.:93.30  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##                                                                      
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.879   Mean   : 1.766   Mean   : 1.258   Mean   : 0.08475  
##  3rd Qu.:10.175   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :35.30000  
##                                                                       
##     citizen            income       income_per_cap     poverty    
##  Min.   :     80   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8254   1st Qu.: 38826   1st Qu.:20471   1st Qu.:12.0  
##  Median :  19434   Median : 45111   Median :23577   Median :16.0  
##  Mean   :  70804   Mean   : 46830   Mean   :24338   Mean   :16.7  
##  3rd Qu.:  50728   3rd Qu.: 52250   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##                    NA's   :1                                      
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.04   Mean   :18.26   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.40   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##  NA's   :1                                                      
##   construction     production        drive          carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.20   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.53   1st Qu.:76.60   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.60   Median : 9.90  
##  Mean   :12.74   Mean   :15.82   Mean   :79.08   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.60   3rd Qu.:11.88  
##  Max.   :40.30   Max.   :55.60   Max.   :94.60   Max.   :29.90  
##                                                                 
##     transit             walk         other_transp     work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.900   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.300   Median : 4.000  
##  Mean   : 0.9675   Mean   : 3.307   Mean   : 1.614   Mean   : 4.697  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.900   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.100   Max.   :37.200  
##                                                                      
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :     62   Min.   :25.00   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4524   1st Qu.:70.90   1st Qu.:13.10  
##  Median :22.90   Median :  10644   Median :75.80   Median :16.10  
##  Mean   :23.15   Mean   :  46387   Mean   :74.44   Mean   :17.35  
##  3rd Qu.:26.60   3rd Qu.:  29254   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##                                                                   
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.921   Mean   :0.2915   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400  
## 

I did not find any invalid values: every variable that cannot logically be negative has a non-negative minimum. No corrections were necessary, so I kept all observations as they are.
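The eyeball check can also be done programmatically. The sketch below counts negative entries in every numeric column and returns an empty tibble when there are none:

acs %>%
  summarise(across(where(is.numeric), ~ sum(.x < 0, na.rm = TRUE))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "n_negative") %>%
  filter(n_negative > 0)  # zero rows = no negative values anywhere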

Data Manipulation and Insights

Question 05

There are 1,985 counties with more women than men.

more_women <- acs %>%
  filter(women > men) %>%
  nrow()

more_women
## [1] 1985
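The same count falls out of a one-liner, since each TRUE sums as 1:

sum(acs$women > acs$men)  # also 1,985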

Question 06

There are 2,420 counties with an unemployment rate lower than 10%.

low_unemp <- acs %>%
  filter(unemployment < 10) %>%
  nrow()

low_unemp
## [1] 2420

Question 07

top10 <- acs %>% 
  select(census_id, county, state, mean_commute) %>% 
  arrange(desc(mean_commute)) %>% 
  top_n(10, wt = mean_commute)

top10
## # A tibble: 10 × 4
##    census_id county       state         mean_commute
##    <chr>     <chr>        <fct>                <dbl>
##  1 42103     Pike         Pennsylvania          44  
##  2 36005     Bronx        New York              43  
##  3 24017     Charles      Maryland              42.8
##  4 51187     Warren       Virginia              42.7
##  5 36081     Queens       New York              42.6
##  6 36085     Richmond     New York              42.6
##  7 51193     Westmoreland Virginia              42.5
##  8 8093      Park         Colorado              42.4
##  9 36047     Kings        New York              41.7
## 10 54015     Clay         West Virginia         41.4
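top_n() still works but is superseded in current dplyr; slice_max() is the modern equivalent, and it also sorts the result, making the separate arrange() unnecessary. An equivalent sketch (slice_min() plays the same role where top_n() is called with a negative n below):

top10 <- acs %>%
  select(census_id, county, state, mean_commute) %>%
  slice_max(mean_commute, n = 10)  # sorted from highest to lowest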

Question 08

Create a new variable that calculates the percentage of women in each county:

\[ \text{percent\_women} = \frac{\text{women}}{\text{total\_pop}} \times 100 \]

acs <- acs %>% 
  mutate(percent_women = (women/total_pop) * 100 )

names(acs)
##  [1] "census_id"      "state"          "county"         "total_pop"     
##  [5] "men"            "women"          "hispanic"       "white"         
##  [9] "black"          "native"         "asian"          "pacific"       
## [13] "citizen"        "income"         "income_per_cap" "poverty"       
## [17] "child_poverty"  "professional"   "service"        "office"        
## [21] "construction"   "production"     "drive"          "carpool"       
## [25] "transit"        "walk"           "other_transp"   "work_at_home"  
## [29] "mean_commute"   "employed"       "private_work"   "public_work"   
## [33] "self_employed"  "family_work"    "unemployment"   "percent_women"
lowest10 <- acs %>%
  select(census_id, county, state, percent_women) %>%
  top_n(-10, wt = percent_women) %>%
  arrange(percent_women)

lowest10
## # A tibble: 10 × 4
##    census_id county                 state        percent_women
##    <chr>     <chr>                  <fct>                <dbl>
##  1 42053     Forest                 Pennsylvania          26.8
##  2 8011      Bent                   Colorado              31.4
##  3 51183     Sussex                 Virginia              31.5
##  4 13309     Wheeler                Georgia               32.1
##  5 6035      Lassen                 California            33.2
##  6 48095     Concho                 Texas                 33.3
##  7 13053     Chattahoochee          Georgia               33.4
##  8 2013      Aleutians East Borough Alaska                33.5
##  9 22125     West Feliciana         Louisiana             33.6
## 10 32027     Pershing               Nevada                33.7

Question 09

Create a new variable, sum_race, that adds up the six race percentage variables.

acs <- acs %>%
  mutate(sum_race = hispanic + white + black + native + asian + pacific)

summary(acs$sum_race)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    76.4    97.7    98.4    98.1    99.0   100.1
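Because the six race columns sit next to each other (see the glimpse in Question 02), the same sum can be written without naming each column; a sketch that assumes that column order:

acs %>%
  mutate(sum_race = rowSums(across(hispanic:pacific)))  # identical result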

9-a

The ten counties with the lowest race-percentage sums:

lowest10_race <- acs %>% 
  select(census_id, county, state, sum_race) %>% 
  top_n(-10, wt = sum_race) %>% 
  arrange(sum_race)

lowest10_race
## # A tibble: 10 × 4
##    census_id county                   state     sum_race
##    <chr>     <chr>                    <fct>        <dbl>
##  1 15001     Hawaii                   Hawaii        76.4
##  2 15009     Maui                     Hawaii        79.2
##  3 40097     Mayes                    Oklahoma      79.7
##  4 15003     Honolulu                 Hawaii        81.5
##  5 40123     Pontotoc                 Oklahoma      82.8
##  6 47061     Grundy                   Tennessee     83  
##  7 2282      Yakutat City and Borough Alaska        83.4
##  8 40069     Johnston                 Oklahoma      84  
##  9 15007     Kauai                    Hawaii        84.1
## 10 40003     Alfalfa                  Oklahoma      85.1

9-b

Summarize this sum at the state level by averaging sum_race across each state's counties.

sum_race_state <- acs %>% 
  group_by(state) %>% 
  summarise(avg_sum_race = mean(sum_race, na.rm = TRUE)) %>% 
  arrange(avg_sum_race)

sum_race_state
## # A tibble: 51 × 2
##    state                avg_sum_race
##    <fct>                       <dbl>
##  1 Hawaii                       84  
##  2 Alaska                       92.7
##  3 Oklahoma                     92.8
##  4 Washington                   96.7
##  5 California                   96.9
##  6 Oregon                       97.1
##  7 Delaware                     97.3
##  8 Massachusetts                97.5
##  9 Maryland                     97.6
## 10 District of Columbia         97.6
## # ℹ 41 more rows

Find the state with the lowest average:

lowest_state <- sum_race_state %>% 
  top_n(-1, wt = avg_sum_race)

lowest_state
## # A tibble: 1 × 2
##   state  avg_sum_race
##   <fct>         <dbl>
## 1 Hawaii           84

9-c

There are 11 counties where the sum is greater than 100%.

over100 <- acs %>% 
  filter(sum_race > 100) %>% 
  select(county, sum_race)

nrow(over100)
## [1] 11
over100
## # A tibble: 11 × 2
##    county    sum_race
##    <chr>        <dbl>
##  1 Claiborne     100 
##  2 Gosper        100.
##  3 Hooker        100.
##  4 Nance         100.
##  5 Bailey        100.
##  6 Duval         100 
##  7 Edwards       100.
##  8 Kenedy        100 
##  9 Kent          100 
## 10 Presidio      100 
## 11 Beaver        100
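All eleven sums exceed 100, even where the printout shows 100 or 100.: tibbles round to three significant figures by default, so values just above 100 are displayed rounded. Raising pillar.sigfig reveals the actual margins; a sketch:

options(pillar.sigfig = 6)  # print six significant figures in tibbles
over100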

9-d

There are no states whose average sum is exactly 100%.

equal100 <- sum_race_state %>% 
  filter(avg_sum_race == 100)

nrow(equal100)
## [1] 0
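A caveat: after chained floating-point arithmetic, exact == comparisons are fragile; dplyr's near() compares within a small tolerance and is the safer idiom for this kind of check. A sketch:

sum_race_state %>%
  filter(near(avg_sum_race, 100))  # tolerance-based equality test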

Question 10

10-a

Create a new variable, carpool_rank, that ranks counties from the highest carpool percentage (rank 1) downward.

acs <- acs %>% 
  mutate(carpool_rank = min_rank(desc(carpool)))

names(acs)
##  [1] "census_id"      "state"          "county"         "total_pop"     
##  [5] "men"            "women"          "hispanic"       "white"         
##  [9] "black"          "native"         "asian"          "pacific"       
## [13] "citizen"        "income"         "income_per_cap" "poverty"       
## [17] "child_poverty"  "professional"   "service"        "office"        
## [21] "construction"   "production"     "drive"          "carpool"       
## [25] "transit"        "walk"           "other_transp"   "work_at_home"  
## [29] "mean_commute"   "employed"       "private_work"   "public_work"   
## [33] "self_employed"  "family_work"    "unemployment"   "percent_women" 
## [37] "sum_race"       "carpool_rank"

10-b

Top 10

acs %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(carpool_rank) %>% 
  head(10)
## # A tibble: 10 × 5
##    census_id county   state    carpool carpool_rank
##    <chr>     <chr>    <fct>      <dbl>        <int>
##  1 13061     Clay     Georgia     29.9            1
##  2 18087     LaGrange Indiana     27              2
##  3 13165     Jenkins  Georgia     25.3            3
##  4 5133      Sevier   Arkansas    24.4            4
##  5 20175     Seward   Kansas      23.4            5
##  6 48079     Cochran  Texas       22.8            6
##  7 48247     Jim Hogg Texas       22.6            7
##  8 48393     Roberts  Texas       22.4            8
##  9 39075     Holmes   Ohio        21.8            9
## 10 21197     Powell   Kentucky    21.6           10

10-c

Lowest 10

lowest10_carpool <- acs %>% 
  select(census_id, county, state, carpool, carpool_rank) %>% 
  arrange(desc(carpool_rank)) %>% 
  head(10)

lowest10_carpool
## # A tibble: 10 × 5
##    census_id county   state        carpool carpool_rank
##    <chr>     <chr>    <fct>          <dbl>        <int>
##  1 48261     Kenedy   Texas            0           3141
##  2 48269     King     Texas            0           3141
##  3 48235     Irion    Texas            0.9         3140
##  4 31183     Wheeler  Nebraska         1.3         3139
##  5 36061     New York New York         1.9         3138
##  6 13309     Wheeler  Georgia          2.3         3136
##  7 38029     Emmons   North Dakota     2.3         3136
##  8 30019     Daniels  Montana          2.6         3134
##  9 31057     Dundy    Nebraska         2.6         3134
## 10 46069     Hyde     South Dakota     2.8         3132

10-d

Arizona is the best-ranked state for carpooling on average (it has the lowest mean carpool_rank across its counties).

carpool_rank_state <- acs %>% 
  group_by(state) %>% 
  summarise(avg_carpool_rank = mean(carpool_rank, na.rm = TRUE)) %>% 
  arrange(avg_carpool_rank)

head(carpool_rank_state, 3)
## # A tibble: 3 × 2
##   state    avg_carpool_rank
##   <fct>               <dbl>
## 1 Arizona              971.
## 2 Utah                1019.
## 3 Arkansas            1055.
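Averaging ranks weights every county equally regardless of population; an alternative (a sketch, answering a slightly different question) is to average the carpool percentages themselves, which can reorder the states:

acs %>%
  group_by(state) %>%
  summarise(avg_carpool = mean(carpool, na.rm = TRUE)) %>%
  arrange(desc(avg_carpool)) %>%
  head(3)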

10-e

The five best states for carpooling on average are Arizona, Utah, Arkansas, Hawaii, and Alaska.

carpool_rank_state <- acs %>% 
  group_by(state) %>% 
  summarise(avg_carpool_rank = mean(carpool_rank, na.rm = TRUE)) %>% 
  arrange(avg_carpool_rank)

head(carpool_rank_state, 5)
## # A tibble: 5 × 2
##   state    avg_carpool_rank
##   <fct>               <dbl>
## 1 Arizona              971.
## 2 Utah                1019.
## 3 Arkansas            1055.
## 4 Hawaii              1072.
## 5 Alaska              1087.