Importing Data and Data Cleaning

# Load required libraries
library(tidyverse)

Load Data

setwd("C:/Users/scott/OneDrive - University of Cincinnati/BANA/Data Wrangling (BANA 7025)/week 4/homework3")
acs <- read_csv("acs_2015_county_data_revised.csv")

Inspect Data Types

Data types appear to be appropriate as imported.

glimpse(acs)

## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~

Missing Values

There are only 2 missing values. These values/observations will not be removed/imputed at this point, but NA handling for calculations will be required.

sapply(acs, function(x) sum(is.na(x)))

##      census_id          state         county      total_pop            men 
##              0              0              0              0              0 
##          women       hispanic          white          black         native 
##              0              0              0              0              0 
##          asian        pacific        citizen         income income_per_cap 
##              0              0              0              1              0 
##        poverty  child_poverty   professional        service         office 
##              0              1              0              0              0 
##   construction     production          drive        carpool        transit 
##              0              0              0              0              0 
##           walk   other_transp   work_at_home   mean_commute       employed 
##              0              0              0              0              0 
##   private_work    public_work  self_employed    family_work   unemployment 
##              0              0              0              0              0

sum(is.na(acs))

## [1] 2

General Inspection of Data

There are no obvious errors in the data, but a significant number of the variables have large outliers. Examples of this include pacific, transit, and work_at_home. This is to be expected as there are some counties that have concentrations of specific ethnicities or influences that are bore out in this data.

summary(acs)

##    census_id        state              county            total_pop       
##  Min.   : 1001   Length:3142        Length:3142        Min.   :      85  
##  1st Qu.:18178   Class :character   Class :character   1st Qu.:   11028  
##  Median :29176   Mode  :character   Mode  :character   Median :   25768  
##  Mean   :30384                                         Mean   :  100737  
##  3rd Qu.:45081                                         3rd Qu.:   67552  
##  Max.   :56045                                         Max.   :10038388  
##                                                                          
##       men              women            hispanic          white      
##  Min.   :     42   Min.   :     43   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5546   1st Qu.:   5466   1st Qu.: 1.900   1st Qu.:65.60  
##  Median :  12826   Median :  12907   Median : 3.700   Median :84.60  
##  Mean   :  49565   Mean   :  51171   Mean   : 8.826   Mean   :77.28  
##  3rd Qu.:  33319   3rd Qu.:  34122   3rd Qu.: 9.000   3rd Qu.:93.30  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##                                                                      
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.879   Mean   : 1.766   Mean   : 1.258   Mean   : 0.08475  
##  3rd Qu.:10.175   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :35.30000  
##                                                                       
##     citizen            income       income_per_cap     poverty    
##  Min.   :     80   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8254   1st Qu.: 38826   1st Qu.:20471   1st Qu.:12.0  
##  Median :  19434   Median : 45111   Median :23577   Median :16.0  
##  Mean   :  70804   Mean   : 46830   Mean   :24338   Mean   :16.7  
##  3rd Qu.:  50728   3rd Qu.: 52250   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##                    NA's   :1                                      
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.04   Mean   :18.26   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.40   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##  NA's   :1                                                      
##   construction     production        drive          carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.20   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.53   1st Qu.:76.60   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.60   Median : 9.90  
##  Mean   :12.74   Mean   :15.82   Mean   :79.08   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.60   3rd Qu.:11.88  
##  Max.   :40.30   Max.   :55.60   Max.   :94.60   Max.   :29.90  
##                                                                 
##     transit             walk         other_transp     work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.900   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.300   Median : 4.000  
##  Mean   : 0.9675   Mean   : 3.307   Mean   : 1.614   Mean   : 4.697  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.900   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.100   Max.   :37.200  
##                                                                      
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :     62   Min.   :25.00   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4524   1st Qu.:70.90   1st Qu.:13.10  
##  Median :22.90   Median :  10644   Median :75.80   Median :16.10  
##  Mean   :23.15   Mean   :  46387   Mean   :74.44   Mean   :17.35  
##  3rd Qu.:26.60   3rd Qu.:  29254   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##                                                                   
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.921   Mean   :0.2915   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400  
##

Data Manipulation and Insights

How many counties have more women than men?

There are 1,985 counties where there are more women than men.

acs %>% 
  select(census_id, state, county, men, women, total_pop) %>% 
  filter(women > men)

## # A tibble: 1,985 x 6
##    census_id state   county     men women total_pop
##        <dbl> <chr>   <chr>    <dbl> <dbl>     <dbl>
##  1      1001 Alabama Autauga  26745 28476     55221
##  2      1003 Alabama Baldwin  95314 99807    195121
##  3      1009 Alabama Blount   28512 29198     57710
##  4      1013 Alabama Butler    9502 10852     20354
##  5      1015 Alabama Calhoun  56274 60374    116648
##  6      1017 Alabama Chambers 16258 17821     34079
##  7      1019 Alabama Cherokee 12975 13033     26008
##  8      1021 Alabama Chilton  21619 22200     43819
##  9      1023 Alabama Choctaw   6382  7013     13395
## 10      1025 Alabama Clarke   11834 13236     25070
## # ... with 1,975 more rows

How mnay counties have an unemployment rate lower than 10%?

2,420 counties have an unemployment rate lower than 10%.

acs %>% 
  select(census_id, state, county, unemployment) %>% 
  filter(unemployment < 10)

## # A tibble: 2,420 x 4
##    census_id state   county   unemployment
##        <dbl> <chr>   <chr>           <dbl>
##  1      1001 Alabama Autauga           7.6
##  2      1003 Alabama Baldwin           7.5
##  3      1007 Alabama Bibb              8.3
##  4      1009 Alabama Blount            7.7
##  5      1017 Alabama Chambers          8.9
##  6      1019 Alabama Cherokee          7.9
##  7      1021 Alabama Chilton           9.1
##  8      1027 Alabama Clay              9.4
##  9      1029 Alabama Cleburne          8.3
## 10      1031 Alabama Coffee            7.1
## # ... with 2,410 more rows

Top 10 counties with the highest mean commute?

acs %>% 
  select(census_id, county, state, mean_commute) %>%
  top_n(10, mean_commute)

## # A tibble: 10 x 4
##    census_id county       state         mean_commute
##        <dbl> <chr>        <chr>                <dbl>
##  1      8093 Park         Colorado              42.4
##  2     24017 Charles      Maryland              42.8
##  3     36005 Bronx        New York              43  
##  4     36047 Kings        New York              41.7
##  5     36081 Queens       New York              42.6
##  6     36085 Richmond     New York              42.6
##  7     42103 Pike         Pennsylvania          44  
##  8     51187 Warren       Virginia              42.7
##  9     51193 Westmoreland Virginia              42.5
## 10     54015 Clay         West Virginia         41.4

Create variables of percentages of women and select bottom 10

acs %>% 
  mutate(pct_women = women / total_pop * 100) %>% 
  select(census_id, county, state, pct_women) %>% 
  top_n(-10, pct_women)

## # A tibble: 10 x 4
##    census_id county                 state        pct_women
##        <dbl> <chr>                  <chr>            <dbl>
##  1      2013 Aleutians East Borough Alaska            33.5
##  2      6035 Lassen                 California        33.2
##  3      8011 Bent                   Colorado          31.4
##  4     13053 Chattahoochee          Georgia           33.4
##  5     13309 Wheeler                Georgia           32.1
##  6     22125 West Feliciana         Louisiana         33.6
##  7     32027 Pershing               Nevada            33.7
##  8     42053 Forest                 Pennsylvania      26.8
##  9     48095 Concho                 Texas             33.3
## 10     51183 Sussex                 Virginia          31.5

Create a variable that calculates the sum of all race percentages

acs <- acs %>% 
  mutate(sum_races = hispanic + white + black + native + asian + pacific)

Top 10 counties with the lowest sum_races

acs %>%
  select(census_id, county, state, sum_races) %>% 
  arrange(sum_races) %>% 
  top_n(-10, sum_races)

## # A tibble: 10 x 4
##    census_id county                   state     sum_races
##        <dbl> <chr>                    <chr>         <dbl>
##  1     15001 Hawaii                   Hawaii         76.4
##  2     15009 Maui                     Hawaii         79.2
##  3     40097 Mayes                    Oklahoma       79.7
##  4     15003 Honolulu                 Hawaii         81.5
##  5     40123 Pontotoc                 Oklahoma       82.8
##  6     47061 Grundy                   Tennessee      83  
##  7      2282 Yakutat City and Borough Alaska         83.4
##  8     40069 Johnston                 Oklahoma       84  
##  9     15007 Kauai                    Hawaii         84.1
## 10     40003 Alfalfa                  Oklahoma       85.1

Which state, on average, has the lowest sum_races?

acs %>% 
  group_by(state) %>%
  summarise(state_avg = mean(sum_races)) %>% 
  top_n(-1, state_avg)

## # A tibble: 1 x 2
##   state  state_avg
##   <chr>      <dbl>
## 1 Hawaii        84

Do any counties have a sum greater than 100%?

Yes, 11 counties are over 100%.

acs %>% 
  select(census_id, county, state, sum_races) %>%
  filter(sum_races > 100)

## # A tibble: 11 x 4
##    census_id county    state       sum_races
##        <dbl> <chr>     <chr>           <dbl>
##  1     28021 Claiborne Mississippi      100 
##  2     31073 Gosper    Nebraska         100.
##  3     31091 Hooker    Nebraska         100.
##  4     31125 Nance     Nebraska         100.
##  5     48017 Bailey    Texas            100.
##  6     48131 Duval     Texas            100 
##  7     48137 Edwards   Texas            100.
##  8     48261 Kenedy    Texas            100 
##  9     48263 Kent      Texas            100 
## 10     48377 Presidio  Texas            100 
## 11     49001 Beaver    Utah             100

How many states have a sum that equals exactly 100%?

No, there are not any states that equal 100%.

acs %>% 
  group_by(state) %>%
  summarise(state_avg = mean(sum_races)) %>% 
  filter(state_avg == 100)

## # A tibble: 0 x 2
## # ... with 2 variables: state <chr>, state_avg <dbl>

Carpool Variable

acs <- acs %>% 
  mutate(carpool_rank = min_rank(desc(carpool)))

What are the 10 highest ranked counties for carpooling?

acs %>% 
  select(census_id, county, state, carpool, carpool_rank) %>%
  arrange(carpool_rank) %>% 
  top_n(-10, carpool_rank)

## # A tibble: 10 x 5
##    census_id county   state    carpool carpool_rank
##        <dbl> <chr>    <chr>      <dbl>        <int>
##  1     13061 Clay     Georgia     29.9            1
##  2     18087 LaGrange Indiana     27              2
##  3     13165 Jenkins  Georgia     25.3            3
##  4      5133 Sevier   Arkansas    24.4            4
##  5     20175 Seward   Kansas      23.4            5
##  6     48079 Cochran  Texas       22.8            6
##  7     48247 Jim Hogg Texas       22.6            7
##  8     48393 Roberts  Texas       22.4            8
##  9     39075 Holmes   Ohio        21.8            9
## 10     21197 Powell   Kentucky    21.6           10

What are the 10 lowest ranked counties for carpooling?

acs %>% 
  select(census_id, county, state, carpool, carpool_rank) %>%
  arrange(desc(carpool_rank)) %>% 
  top_n(10, carpool_rank)

## # A tibble: 11 x 5
##    census_id county      state        carpool carpool_rank
##        <dbl> <chr>       <chr>          <dbl>        <int>
##  1     48261 Kenedy      Texas            0           3141
##  2     48269 King        Texas            0           3141
##  3     48235 Irion       Texas            0.9         3140
##  4     31183 Wheeler     Nebraska         1.3         3139
##  5     36061 New York    New York         1.9         3138
##  6     13309 Wheeler     Georgia          2.3         3136
##  7     38029 Emmons      North Dakota     2.3         3136
##  8     30019 Daniels     Montana          2.6         3134
##  9     31057 Dundy       Nebraska         2.6         3134
## 10     46069 Hyde        South Dakota     2.8         3132
## 11     51720 Norton city Virginia         2.8         3132

What is the best state for carpooling?

acs %>% 
  group_by(state) %>% 
  summarise(avg_carpool_rank = mean(carpool_rank)) %>% 
  top_n(-1, avg_carpool_rank)

## # A tibble: 1 x 2
##   state   avg_carpool_rank
##   <chr>              <dbl>
## 1 Arizona             971.

What are the top 5 states for carpooling?

acs %>% 
  group_by(state) %>% 
  summarise(avg_carpool = mean(carpool)) %>% 
  arrange(avg_carpool) %>% 
  top_n(-5, avg_carpool)

## # A tibble: 5 x 2
##   state                avg_carpool
##   <chr>                      <dbl>
## 1 District of Columbia        5.7 
## 2 Massachusetts               7.75
## 3 Connecticut                 7.94
## 4 Rhode Island                8   
## 5 New Jersey                  8.10

Analysis of the American Community Survey (ACS)

Scott Steehler