Importing and Data Cleaning

1. Get data

# Working directory
setwd("C:/Users/dullecd/Desktop/Classes/Data Wrangling/Week 4/homework3")
# Load packages
library(tidyverse)
# 1. Get data
df <- as.tibble(read_csv("acs_2015_county_data_revised.csv"))

2. Data types

It does not appear that any data types need to be changed

glimpse(df)
## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017...
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...

3. Check for missing values

summary(df)
##    census_id        state              county            total_pop       
##  Min.   : 1001   Length:3142        Length:3142        Min.   :      85  
##  1st Qu.:18178   Class :character   Class :character   1st Qu.:   11028  
##  Median :29176   Mode  :character   Mode  :character   Median :   25768  
##  Mean   :30384                                         Mean   :  100737  
##  3rd Qu.:45081                                         3rd Qu.:   67552  
##  Max.   :56045                                         Max.   :10038388  
##                                                                          
##       men              women            hispanic          white      
##  Min.   :     42   Min.   :     43   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5546   1st Qu.:   5466   1st Qu.: 1.900   1st Qu.:65.60  
##  Median :  12826   Median :  12907   Median : 3.700   Median :84.60  
##  Mean   :  49565   Mean   :  51171   Mean   : 8.826   Mean   :77.28  
##  3rd Qu.:  33319   3rd Qu.:  34122   3rd Qu.: 9.000   3rd Qu.:93.30  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##                                                                      
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.879   Mean   : 1.766   Mean   : 1.258   Mean   : 0.08475  
##  3rd Qu.:10.175   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :35.30000  
##                                                                       
##     citizen            income       income_per_cap     poverty    
##  Min.   :     80   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8254   1st Qu.: 38826   1st Qu.:20471   1st Qu.:12.0  
##  Median :  19434   Median : 45111   Median :23577   Median :16.0  
##  Mean   :  70804   Mean   : 46830   Mean   :24338   Mean   :16.7  
##  3rd Qu.:  50728   3rd Qu.: 52250   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##                    NA's   :1                                      
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.04   Mean   :18.26   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.40   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##  NA's   :1                                                      
##   construction     production        drive          carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.20   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.53   1st Qu.:76.60   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.60   Median : 9.90  
##  Mean   :12.74   Mean   :15.82   Mean   :79.08   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.60   3rd Qu.:11.88  
##  Max.   :40.30   Max.   :55.60   Max.   :94.60   Max.   :29.90  
##                                                                 
##     transit             walk         other_transp     work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.900   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.300   Median : 4.000  
##  Mean   : 0.9675   Mean   : 3.307   Mean   : 1.614   Mean   : 4.697  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.900   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.100   Max.   :37.200  
##                                                                      
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :     62   Min.   :25.00   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4524   1st Qu.:70.90   1st Qu.:13.10  
##  Median :22.90   Median :  10644   Median :75.80   Median :16.10  
##  Mean   :23.15   Mean   :  46387   Mean   :74.44   Mean   :17.35  
##  3rd Qu.:26.60   3rd Qu.:  29254   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##                                                                   
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.921   Mean   :0.2915   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400  
## 
df <- df %>% filter(!is.na(income),!is.na(child_poverty))

There is one missing value in the income column, and one missing value in the child poverty column. I have decided to simply remove these due to the fact there are so few. In other words, these two observations make up 2/3,142 or roughly 0.06% of the values. Because of this it is highly unlikely that these will make any substantial difference on any subsequent analysis. The information lost by excluding these is negligible.

4. Examine summary of data

I did this in step 3 because it is an easy way to check for NAs

summary(df)

I checked the summary statistics for each variable while at the same time reading the description from the data dictionary. There do not seem to be any unusual values. There seems to be a large range for some variables like population, but that is not necessarily unusual. Also, all variables that report percentages are between 0 and 100 which makes sense.

Data Manipulation and Insights

5. How many counties have more women than men?

df %>% mutate(mw=ifelse(women>men,1,0)) %>% .$mw %>% sum()
## [1] 1984

There are 1,985 counties that have more women than men. This is roughly 63% of all counties that have more women than men.

6. How many counties have an unemployment rate lower than 10%

df %>% filter(unemployment<10) %>% nrow()
## [1] 2419

There are 2,419 counties that have an unemployment rate lower than 10%. This is approximately 77% of all counties.

7. Show top 10 counties with highest mean commute

df %>% top_n(.,10,mean_commute) %>% arrange(-mean_commute) %>%
  select(census_id,county,state,mean_commute)
## # A tibble: 10 x 4
##    census_id county       state         mean_commute
##        <dbl> <chr>        <chr>                <dbl>
##  1     42103 Pike         Pennsylvania          44  
##  2     36005 Bronx        New York              43  
##  3     24017 Charles      Maryland              42.8
##  4     51187 Warren       Virginia              42.7
##  5     36081 Queens       New York              42.6
##  6     36085 Richmond     New York              42.6
##  7     51193 Westmoreland Virginia              42.5
##  8      8093 Park         Colorado              42.4
##  9     36047 Kings        New York              41.7
## 10     54015 Clay         West Virginia         41.4

8. Calculate percentage of women for each county and find top 10

df %>% mutate(perc_wom=women/total_pop) %>%
  arrange(perc_wom) %>% select(census_id,county,state,perc_wom) %>% head(10)
## # A tibble: 10 x 4
##    census_id county                 state        perc_wom
##        <dbl> <chr>                  <chr>           <dbl>
##  1     42053 Forest                 Pennsylvania    0.268
##  2      8011 Bent                   Colorado        0.314
##  3     51183 Sussex                 Virginia        0.315
##  4     13309 Wheeler                Georgia         0.321
##  5      6035 Lassen                 California      0.332
##  6     48095 Concho                 Texas           0.333
##  7     13053 Chattahoochee          Georgia         0.334
##  8      2013 Aleutians East Borough Alaska          0.335
##  9     22125 West Feliciana         Louisiana       0.336
## 10     32027 Pershing               Nevada          0.337

9. Calculate the sum of all race percentage variables

  1. Top 10 counties with lowest sum of the race percentage variables
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
  arrange(sum_rp) %>% select(county,state,sum_rp) %>% head(10)
## # A tibble: 10 x 3
##    county                   state     sum_rp
##    <chr>                    <chr>      <dbl>
##  1 Hawaii                   Hawaii      76.4
##  2 Maui                     Hawaii      79.2
##  3 Mayes                    Oklahoma    79.7
##  4 Honolulu                 Hawaii      81.5
##  5 Pontotoc                 Oklahoma    82.8
##  6 Grundy                   Tennessee   83. 
##  7 Yakutat City and Borough Alaska      83.4
##  8 Johnston                 Oklahoma    84  
##  9 Kauai                    Hawaii      84.1
## 10 Alfalfa                  Oklahoma    85.1
  1. State with lowest average sum of the race percentage variables
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
  group_by(state) %>% summarise(mean_sum_rp=mean(sum_rp)) %>% arrange(mean_sum_rp) %>%   head(1)
## # A tibble: 1 x 2
##   state  mean_sum_rp
##   <chr>        <dbl>
## 1 Hawaii        80.3
  1. Counties that have sum of race percentage variables greater than 100%
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
  filter(sum_rp>100) %>% select(county,state,sum_rp)
## # A tibble: 11 x 3
##    county    state       sum_rp
##    <chr>     <chr>        <dbl>
##  1 Claiborne Mississippi   100.
##  2 Gosper    Nebraska      100.
##  3 Hooker    Nebraska      100.
##  4 Nance     Nebraska      100.
##  5 Bailey    Texas         100.
##  6 Duval     Texas         100.
##  7 Edwards   Texas         100.
##  8 Kenedy    Texas         100.
##  9 Kent      Texas         100.
## 10 Presidio  Texas         100.
## 11 Beaver    Utah          100.
  1. States with sum of race percentage exactly equal to 100%
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
  group_by(state) %>% summarise(mean_sum_rp=mean(sum_rp))
## # A tibble: 51 x 2
##    state                mean_sum_rp
##    <chr>                      <dbl>
##  1 Alabama                     98.5
##  2 Alaska                      92.7
##  3 Arizona                     98.3
##  4 Arkansas                    98.4
##  5 California                  96.9
##  6 Colorado                    98.3
##  7 Connecticut                 97.8
##  8 Delaware                    97.3
##  9 District of Columbia        97.6
## 10 Florida                     98.1
## # ... with 41 more rows

There are no states with exactly 100% sum of race percentage variables

10. Using the carpool variable

  1. Create carpool_rank
df$carpool_rank <- min_rank(-df$carpool)
  1. 10 highest ranked counties for carpooling
df %>% arrange(carpool_rank) %>% select(census_id,county,state,carpool,carpool_rank)
## # A tibble: 3,140 x 5
##    census_id county   state    carpool carpool_rank
##        <dbl> <chr>    <chr>      <dbl>        <int>
##  1     13061 Clay     Georgia     29.9            1
##  2     18087 LaGrange Indiana     27              2
##  3     13165 Jenkins  Georgia     25.3            3
##  4      5133 Sevier   Arkansas    24.4            4
##  5     20175 Seward   Kansas      23.4            5
##  6     48079 Cochran  Texas       22.8            6
##  7     48247 Jim Hogg Texas       22.6            7
##  8     48393 Roberts  Texas       22.4            8
##  9     39075 Holmes   Ohio        21.8            9
## 10     21197 Powell   Kentucky    21.6           10
## # ... with 3,130 more rows
  1. 10 lowest ranked counties for carpooling
df %>% arrange(-carpool_rank) %>% select(census_id,county,state,carpool,carpool_rank)
## # A tibble: 3,140 x 5
##    census_id county   state        carpool carpool_rank
##        <dbl> <chr>    <chr>          <dbl>        <int>
##  1     48261 Kenedy   Texas            0           3139
##  2     48269 King     Texas            0           3139
##  3     48235 Irion    Texas            0.9         3138
##  4     31183 Wheeler  Nebraska         1.3         3137
##  5     36061 New York New York         1.9         3136
##  6     13309 Wheeler  Georgia          2.3         3134
##  7     38029 Emmons   North Dakota     2.3         3134
##  8     30019 Daniels  Montana          2.6         3132
##  9     31057 Dundy    Nebraska         2.6         3132
## 10     46069 Hyde     South Dakota     2.8         3130
## # ... with 3,130 more rows
  1. Best state ranked by carpooling
df %>% group_by(state) %>% summarise(carpool_state=mean(carpool)) %>% mutate(carpool_state_rank=min_rank(-carpool_state)) %>% arrange(carpool_state_rank)
## # A tibble: 51 x 3
##    state    carpool_state carpool_state_rank
##    <chr>            <dbl>              <int>
##  1 Hawaii            12.8                  1
##  2 Alaska            12.1                  2
##  3 Arkansas          11.9                  3
##  4 Utah              11.9                  4
##  5 Texas             11.8                  5
##  6 Nevada            11.7                  6
##  7 Arizona           11.6                  7
##  8 Missouri          11.5                  8
##  9 Wyoming           11.5                  9
## 10 Idaho             11.4                 10
## # ... with 41 more rows

Hawaii is the best state ranked by carpooling

  1. Top 5 states for carpooling
df %>% group_by(state) %>% summarise(carpool_state=mean(carpool)) %>% mutate(carpool_state_rank=min_rank(-carpool_state)) %>% arrange(carpool_state_rank) %>%
  head(5)
## # A tibble: 5 x 3
##   state    carpool_state carpool_state_rank
##   <chr>            <dbl>              <int>
## 1 Hawaii            12.8                  1
## 2 Alaska            12.1                  2
## 3 Arkansas          11.9                  3
## 4 Utah              11.9                  4
## 5 Texas             11.8                  5