# Load required libraries
library(tidyverse)
setwd("C:/Users/scott/OneDrive - University of Cincinnati/BANA/Data Wrangling (BANA 7025)/week 4/homework3")
acs <- read_csv("acs_2015_county_data_revised.csv")
Data types appear to be appropriate as imported.
glimpse(acs)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~
There are only 2 missing values. These values/observations will not be removed/imputed at this point, but NA handling for calculations will be required.
sapply(acs, function(x) sum(is.na(x)))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
sum(is.na(acs))
## [1] 2
There are no obvious errors in the data, but a significant number of the variables have large outliers. Examples of this include pacific, transit, and work_at_home. This is to be expected as there are some counties that have concentrations of specific ethnicities or influences that are bore out in this data.
summary(acs)
## census_id state county total_pop
## Min. : 1001 Length:3142 Length:3142 Min. : 85
## 1st Qu.:18178 Class :character Class :character 1st Qu.: 11028
## Median :29176 Mode :character Mode :character Median : 25768
## Mean :30384 Mean : 100737
## 3rd Qu.:45081 3rd Qu.: 67552
## Max. :56045 Max. :10038388
##
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51171 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
##
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52250 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## NA's :1
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## NA's :1
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
There are 1,985 counties where there are more women than men.
acs %>%
select(census_id, state, county, men, women, total_pop) %>%
filter(women > men)
## # A tibble: 1,985 x 6
## census_id state county men women total_pop
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1001 Alabama Autauga 26745 28476 55221
## 2 1003 Alabama Baldwin 95314 99807 195121
## 3 1009 Alabama Blount 28512 29198 57710
## 4 1013 Alabama Butler 9502 10852 20354
## 5 1015 Alabama Calhoun 56274 60374 116648
## 6 1017 Alabama Chambers 16258 17821 34079
## 7 1019 Alabama Cherokee 12975 13033 26008
## 8 1021 Alabama Chilton 21619 22200 43819
## 9 1023 Alabama Choctaw 6382 7013 13395
## 10 1025 Alabama Clarke 11834 13236 25070
## # ... with 1,975 more rows
2,420 counties have an unemployment rate lower than 10%.
acs %>%
select(census_id, state, county, unemployment) %>%
filter(unemployment < 10)
## # A tibble: 2,420 x 4
## census_id state county unemployment
## <dbl> <chr> <chr> <dbl>
## 1 1001 Alabama Autauga 7.6
## 2 1003 Alabama Baldwin 7.5
## 3 1007 Alabama Bibb 8.3
## 4 1009 Alabama Blount 7.7
## 5 1017 Alabama Chambers 8.9
## 6 1019 Alabama Cherokee 7.9
## 7 1021 Alabama Chilton 9.1
## 8 1027 Alabama Clay 9.4
## 9 1029 Alabama Cleburne 8.3
## 10 1031 Alabama Coffee 7.1
## # ... with 2,410 more rows
acs %>%
select(census_id, county, state, mean_commute) %>%
top_n(10, mean_commute)
## # A tibble: 10 x 4
## census_id county state mean_commute
## <dbl> <chr> <chr> <dbl>
## 1 8093 Park Colorado 42.4
## 2 24017 Charles Maryland 42.8
## 3 36005 Bronx New York 43
## 4 36047 Kings New York 41.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 42103 Pike Pennsylvania 44
## 8 51187 Warren Virginia 42.7
## 9 51193 Westmoreland Virginia 42.5
## 10 54015 Clay West Virginia 41.4
acs %>%
mutate(pct_women = women / total_pop * 100) %>%
select(census_id, county, state, pct_women) %>%
top_n(-10, pct_women)
## # A tibble: 10 x 4
## census_id county state pct_women
## <dbl> <chr> <chr> <dbl>
## 1 2013 Aleutians East Borough Alaska 33.5
## 2 6035 Lassen California 33.2
## 3 8011 Bent Colorado 31.4
## 4 13053 Chattahoochee Georgia 33.4
## 5 13309 Wheeler Georgia 32.1
## 6 22125 West Feliciana Louisiana 33.6
## 7 32027 Pershing Nevada 33.7
## 8 42053 Forest Pennsylvania 26.8
## 9 48095 Concho Texas 33.3
## 10 51183 Sussex Virginia 31.5
acs <- acs %>%
mutate(sum_races = hispanic + white + black + native + asian + pacific)
acs %>%
select(census_id, county, state, sum_races) %>%
arrange(sum_races) %>%
top_n(-10, sum_races)
## # A tibble: 10 x 4
## census_id county state sum_races
## <dbl> <chr> <chr> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
acs %>%
group_by(state) %>%
summarise(state_avg = mean(sum_races)) %>%
top_n(-1, state_avg)
## # A tibble: 1 x 2
## state state_avg
## <chr> <dbl>
## 1 Hawaii 84
Yes, 11 counties are over 100%.
acs %>%
select(census_id, county, state, sum_races) %>%
filter(sum_races > 100)
## # A tibble: 11 x 4
## census_id county state sum_races
## <dbl> <chr> <chr> <dbl>
## 1 28021 Claiborne Mississippi 100
## 2 31073 Gosper Nebraska 100.
## 3 31091 Hooker Nebraska 100.
## 4 31125 Nance Nebraska 100.
## 5 48017 Bailey Texas 100.
## 6 48131 Duval Texas 100
## 7 48137 Edwards Texas 100.
## 8 48261 Kenedy Texas 100
## 9 48263 Kent Texas 100
## 10 48377 Presidio Texas 100
## 11 49001 Beaver Utah 100
No, there are not any states that equal 100%.
acs %>%
group_by(state) %>%
summarise(state_avg = mean(sum_races)) %>%
filter(state_avg == 100)
## # A tibble: 0 x 2
## # ... with 2 variables: state <chr>, state_avg <dbl>
acs <- acs %>%
mutate(carpool_rank = min_rank(desc(carpool)))
acs %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-10, carpool_rank)
## # A tibble: 10 x 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
acs %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(desc(carpool_rank)) %>%
top_n(10, carpool_rank)
## # A tibble: 11 x 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3141
## 2 48269 King Texas 0 3141
## 3 48235 Irion Texas 0.9 3140
## 4 31183 Wheeler Nebraska 1.3 3139
## 5 36061 New York New York 1.9 3138
## 6 13309 Wheeler Georgia 2.3 3136
## 7 38029 Emmons North Dakota 2.3 3136
## 8 30019 Daniels Montana 2.6 3134
## 9 31057 Dundy Nebraska 2.6 3134
## 10 46069 Hyde South Dakota 2.8 3132
## 11 51720 Norton city Virginia 2.8 3132
acs %>%
group_by(state) %>%
summarise(avg_carpool_rank = mean(carpool_rank)) %>%
top_n(-1, avg_carpool_rank)
## # A tibble: 1 x 2
## state avg_carpool_rank
## <chr> <dbl>
## 1 Arizona 971.
acs %>%
group_by(state) %>%
summarise(avg_carpool = mean(carpool)) %>%
arrange(avg_carpool) %>%
top_n(-5, avg_carpool)
## # A tibble: 5 x 2
## state avg_carpool
## <chr> <dbl>
## 1 District of Columbia 5.7
## 2 Massachusetts 7.75
## 3 Connecticut 7.94
## 4 Rhode Island 8
## 5 New Jersey 8.10