# Working directory
setwd("C:/Users/dullecd/Desktop/Classes/Data Wrangling/Week 4/homework3")
# Load packages
library(tidyverse)
# 1. Get data
df <- as.tibble(read_csv("acs_2015_county_data_revised.csv"))
It does not appear that any data types need to be changed
glimpse(df)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017...
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...
summary(df)
## census_id state county total_pop
## Min. : 1001 Length:3142 Length:3142 Min. : 85
## 1st Qu.:18178 Class :character Class :character 1st Qu.: 11028
## Median :29176 Mode :character Mode :character Median : 25768
## Mean :30384 Mean : 100737
## 3rd Qu.:45081 3rd Qu.: 67552
## Max. :56045 Max. :10038388
##
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51171 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
##
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52250 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## NA's :1
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## NA's :1
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
df <- df %>% filter(!is.na(income),!is.na(child_poverty))
There is one missing value in the income column, and one missing value in the child poverty column. I have decided to simply remove these due to the fact there are so few. In other words, these two observations make up 2/3,142 or roughly 0.06% of the values. Because of this it is highly unlikely that these will make any substantial difference on any subsequent analysis. The information lost by excluding these is negligible.
I did this in step 3 because it is an easy way to check for NAs
summary(df)
I checked the summary statistics for each variable while at the same time reading the description from the data dictionary. There do not seem to be any unusual values. There seems to be a large range for some variables like population, but that is not necessarily unusual. Also, all variables that report percentages are between 0 and 100 which makes sense.
df %>% mutate(mw=ifelse(women>men,1,0)) %>% .$mw %>% sum()
## [1] 1984
There are 1,985 counties that have more women than men. This is roughly 63% of all counties that have more women than men.
df %>% filter(unemployment<10) %>% nrow()
## [1] 2419
There are 2,419 counties that have an unemployment rate lower than 10%. This is approximately 77% of all counties.
df %>% top_n(.,10,mean_commute) %>% arrange(-mean_commute) %>%
select(census_id,county,state,mean_commute)
## # A tibble: 10 x 4
## census_id county state mean_commute
## <dbl> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
df %>% mutate(perc_wom=women/total_pop) %>%
arrange(perc_wom) %>% select(census_id,county,state,perc_wom) %>% head(10)
## # A tibble: 10 x 4
## census_id county state perc_wom
## <dbl> <chr> <chr> <dbl>
## 1 42053 Forest Pennsylvania 0.268
## 2 8011 Bent Colorado 0.314
## 3 51183 Sussex Virginia 0.315
## 4 13309 Wheeler Georgia 0.321
## 5 6035 Lassen California 0.332
## 6 48095 Concho Texas 0.333
## 7 13053 Chattahoochee Georgia 0.334
## 8 2013 Aleutians East Borough Alaska 0.335
## 9 22125 West Feliciana Louisiana 0.336
## 10 32027 Pershing Nevada 0.337
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
arrange(sum_rp) %>% select(county,state,sum_rp) %>% head(10)
## # A tibble: 10 x 3
## county state sum_rp
## <chr> <chr> <dbl>
## 1 Hawaii Hawaii 76.4
## 2 Maui Hawaii 79.2
## 3 Mayes Oklahoma 79.7
## 4 Honolulu Hawaii 81.5
## 5 Pontotoc Oklahoma 82.8
## 6 Grundy Tennessee 83.
## 7 Yakutat City and Borough Alaska 83.4
## 8 Johnston Oklahoma 84
## 9 Kauai Hawaii 84.1
## 10 Alfalfa Oklahoma 85.1
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
group_by(state) %>% summarise(mean_sum_rp=mean(sum_rp)) %>% arrange(mean_sum_rp) %>% head(1)
## # A tibble: 1 x 2
## state mean_sum_rp
## <chr> <dbl>
## 1 Hawaii 80.3
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
filter(sum_rp>100) %>% select(county,state,sum_rp)
## # A tibble: 11 x 3
## county state sum_rp
## <chr> <chr> <dbl>
## 1 Claiborne Mississippi 100.
## 2 Gosper Nebraska 100.
## 3 Hooker Nebraska 100.
## 4 Nance Nebraska 100.
## 5 Bailey Texas 100.
## 6 Duval Texas 100.
## 7 Edwards Texas 100.
## 8 Kenedy Texas 100.
## 9 Kent Texas 100.
## 10 Presidio Texas 100.
## 11 Beaver Utah 100.
df %>% mutate(sum_rp=hispanic+white+black+native+asian+pacific) %>%
group_by(state) %>% summarise(mean_sum_rp=mean(sum_rp))
## # A tibble: 51 x 2
## state mean_sum_rp
## <chr> <dbl>
## 1 Alabama 98.5
## 2 Alaska 92.7
## 3 Arizona 98.3
## 4 Arkansas 98.4
## 5 California 96.9
## 6 Colorado 98.3
## 7 Connecticut 97.8
## 8 Delaware 97.3
## 9 District of Columbia 97.6
## 10 Florida 98.1
## # ... with 41 more rows
There are no states with exactly 100% sum of race percentage variables
df$carpool_rank <- min_rank(-df$carpool)
df %>% arrange(carpool_rank) %>% select(census_id,county,state,carpool,carpool_rank)
## # A tibble: 3,140 x 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
## # ... with 3,130 more rows
df %>% arrange(-carpool_rank) %>% select(census_id,county,state,carpool,carpool_rank)
## # A tibble: 3,140 x 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3139
## 2 48269 King Texas 0 3139
## 3 48235 Irion Texas 0.9 3138
## 4 31183 Wheeler Nebraska 1.3 3137
## 5 36061 New York New York 1.9 3136
## 6 13309 Wheeler Georgia 2.3 3134
## 7 38029 Emmons North Dakota 2.3 3134
## 8 30019 Daniels Montana 2.6 3132
## 9 31057 Dundy Nebraska 2.6 3132
## 10 46069 Hyde South Dakota 2.8 3130
## # ... with 3,130 more rows
df %>% group_by(state) %>% summarise(carpool_state=mean(carpool)) %>% mutate(carpool_state_rank=min_rank(-carpool_state)) %>% arrange(carpool_state_rank)
## # A tibble: 51 x 3
## state carpool_state carpool_state_rank
## <chr> <dbl> <int>
## 1 Hawaii 12.8 1
## 2 Alaska 12.1 2
## 3 Arkansas 11.9 3
## 4 Utah 11.9 4
## 5 Texas 11.8 5
## 6 Nevada 11.7 6
## 7 Arizona 11.6 7
## 8 Missouri 11.5 8
## 9 Wyoming 11.5 9
## 10 Idaho 11.4 10
## # ... with 41 more rows
Hawaii is the best state ranked by carpooling
df %>% group_by(state) %>% summarise(carpool_state=mean(carpool)) %>% mutate(carpool_state_rank=min_rank(-carpool_state)) %>% arrange(carpool_state_rank) %>%
head(5)
## # A tibble: 5 x 3
## state carpool_state carpool_state_rank
## <chr> <dbl> <int>
## 1 Hawaii 12.8 1
## 2 Alaska 12.1 2
## 3 Arkansas 11.9 3
## 4 Utah 11.9 4
## 5 Texas 11.8 5