library(tidyverse)
library(lubridate)
library(glue)
setwd("C:/Users/Adam Deuber/OneDrive/UC/BANA Masters/Data Wrangling/Data Wrangling/Week 4/Week 4")
county_data <- readr::read_csv("acs_2015_county_data_revised.csv")
head(county_data)
## # A tibble: 6 x 35
## census_id state county total_pop men women hispanic white black native asian
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ 55221 26745 28476 2.6 75.8 18.5 0.4 1
## 2 1003 Alab~ Baldw~ 195121 95314 99807 4.5 83.1 9.5 0.6 0.7
## 3 1005 Alab~ Barbo~ 26932 14497 12435 4.6 46.2 46.7 0.2 0.4
## 4 1007 Alab~ Bibb 22604 12073 10531 2.2 74.5 21.4 0.4 0.1
## 5 1009 Alab~ Blount 57710 28512 29198 8.6 87.9 1.5 0.3 0.1
## 6 1011 Alab~ Bullo~ 10678 5660 5018 4.4 22.2 70.7 1.2 0.2
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>
nrow(county_data)
## [1] 3142
ncol(county_data)
## [1] 35
There are 3142 rows and 35 columns.
glimpse(county_data)
## Observations: 3,142
## Variables: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017...
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...
county_data$census_id <- as.character(county_data$census_id)
county_data$state <- as.factor(county_data$state)
A census ID does not require statistical breakdowns. As such, I switched it to a character to allow it to be a label for all of the observations.
Additionally, I changed state to be a factor as it is a categorical variable and the observations can be collapsed into individual states.
colSums(is.na(county_data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
county_data <- drop_na(county_data)
colSums(is.na(county_data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 0 0
## poverty child_poverty professional service office
## 0 0 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
There were only two observations with NA values. One was in income, and the other was in child_poverty. With just two rows in a dataset of over 3000, I decided to remove these from my data as seen in the code.
summary(county_data)
## census_id state county total_pop
## Length:3140 Texas : 253 Length:3140 Min. : 267
## Class :character Georgia : 159 Class :character 1st Qu.: 11036
## Mode :character Virginia: 133 Mode :character Median : 25793
## Kentucky: 120 Mean : 100801
## Missouri: 115 3rd Qu.: 67620
## Kansas : 105 Max. :10038388
## (Other) :2255
## men women hispanic white
## Min. : 136 Min. : 131 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5551 1st Qu.: 5488 1st Qu.: 1.900 1st Qu.:65.67
## Median : 12838 Median : 12916 Median : 3.700 Median :84.65
## Mean : 49597 Mean : 51204 Mean : 8.819 Mean :77.31
## 3rd Qu.: 33328 3rd Qu.: 34123 3rd Qu.: 9.000 3rd Qu.:93.33
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.885 Mean : 1.763 Mean : 1.253 Mean : 0.07357
## 3rd Qu.:10.200 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :11.10000
##
## citizen income income_per_cap poverty
## Min. : 199 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8276 1st Qu.: 38826 1st Qu.:20470 1st Qu.:12.0
## Median : 19454 Median : 45095 Median :23575 Median :16.0
## Mean : 70849 Mean : 46824 Mean :24331 Mean :16.7
## 3rd Qu.: 50795 3rd Qu.: 52248 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
##
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.05 Mean :18.25 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.42 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
##
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.2 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.50 1st Qu.:76.6 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.6 Median : 9.90
## Mean :12.75 Mean :15.82 Mean :79.1 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.6 3rd Qu.:11.90
## Max. :40.30 Max. :55.60 Max. :94.6 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.90 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.30 Median : 4.000
## Mean : 0.9681 Mean : 3.294 Mean : 1.61 Mean : 4.694
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.90 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.10 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 166 Min. :29.50 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4532 1st Qu.:70.90 1st Qu.:13.07
## Median :22.90 Median : 10657 Median :75.85 Median :16.10
## Mean :23.15 Mean : 46416 Mean :74.45 Mean :17.33
## 3rd Qu.:26.60 3rd Qu.: 29272 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.922 Mean :0.2917 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
The employed variable is listed as a percentage in the data dictionary. However, the minimum value is 166 and maximum is 4,635,465. This is not between 0 to 100 and is reflective as a percentage.
Additionally, outliers look likely in several columns. Further analysis would showcase these findings.
county_data %>%
filter(women > men) %>%
nrow()
## [1] 1984
There are 1984 counties with more women than men.
county_data %>%
filter(unemployment < 10.0) %>%
nrow()
## [1] 2419
There are 2419 counties with an unemployment rate under than 10%.
county_data %>%
select(census_id, county, state, mean_commute) %>%
arrange(desc(mean_commute)) %>%
top_n(10)
## # A tibble: 10 x 4
## census_id county state mean_commute
## <chr> <chr> <fct> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
Please see the output for the table with the top 10 counties with the highest mean commute.
county_data %>%
mutate(women_pct = women/total_pop) %>%
select(census_id, county, state, women_pct) %>%
arrange(women_pct) %>%
top_n(-10)
## # A tibble: 10 x 4
## census_id county state women_pct
## <chr> <chr> <fct> <dbl>
## 1 42053 Forest Pennsylvania 0.268
## 2 8011 Bent Colorado 0.314
## 3 51183 Sussex Virginia 0.315
## 4 13309 Wheeler Georgia 0.321
## 5 6035 Lassen California 0.332
## 6 48095 Concho Texas 0.333
## 7 13053 Chattahoochee Georgia 0.334
## 8 2013 Aleutians East Borough Alaska 0.335
## 9 22125 West Feliciana Louisiana 0.336
## 10 32027 Pershing Nevada 0.337
Please see the output for the table with the top 10 counties with the lowest women percentage.
county_data <- mutate(county_data, race_tot = hispanic + white + black + native + asian + pacific)
county_data %>%
select(race_tot, everything()) %>%
arrange(race_tot) %>%
top_n(10)
## # A tibble: 11 x 36
## race_tot census_id state county total_pop men women hispanic white black
## <dbl> <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 89 46121 Sout~ Todd 9942 4862 5080 3.7 10 0.3
## 2 93.8 2158 Alas~ Kusil~ 7914 4200 3714 1 4.5 0.4
## 3 97.6 1035 Alab~ Conec~ 12865 6176 6689 1.6 51 44.7
## 4 98.2 46041 Sout~ Dewey 5579 2718 2861 0.9 21.7 0
## 5 98.4 46102 Sout~ Oglal~ 14153 6809 7344 1 4.8 0.5
## 6 98.4 8025 Colo~ Crowl~ 5551 3145 2406 30.2 61.1 5.3
## 7 98.9 46031 Sout~ Corson 4149 2123 2026 0.6 30.9 0.3
## 8 99 46137 Sout~ Zieba~ 2833 1391 1442 3.7 23.7 0.3
## 9 99.4 28119 Miss~ Quitm~ 7761 3751 4010 0.9 28.3 70.2
## 10 99.8 45005 Sout~ Allen~ 9838 5225 4613 2.8 23.2 73.8
## 11 99.9 28053 Miss~ Humph~ 8984 4278 4706 2.6 22 75.1
## # ... with 26 more variables: native <dbl>, asian <dbl>, pacific <dbl>,
## # citizen <dbl>, income <dbl>, income_per_cap <dbl>, poverty <dbl>,
## # child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## # construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## # transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## # self_employed <dbl>, family_work <dbl>, unemployment <dbl>
Please see the above table with the lowest ten counties by the new race_tot variable.
county_data %>%
select(race_tot, everything()) %>%
group_by(state) %>%
summarise(avg = mean(race_tot)) %>%
arrange(avg) %>%
top_n(1)
## # A tibble: 1 x 2
## state avg
## <fct> <dbl>
## 1 Mississippi 99.2
Mississippi, on average, has the lowest sum of these race percentage variables.
county_data %>%
filter(race_tot > 100) %>%
nrow()
## [1] 11
11 counties have a sum greater than 100%.
county_data %>%
filter(race_tot == 100) %>%
nrow()
## [1] 27
27 counties have a sum equal to 100%.
county_data$carpool_rank = min_rank(-(county_data$carpool))
county_data %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-10, wt=carpool_rank)
## # A tibble: 10 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <fct> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
Please see the above table for the 10 highest ranked counties for carpooling.
county_data %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(10, wt=carpool_rank)
## # A tibble: 11 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <fct> <dbl> <int>
## 1 46069 Hyde South Dakota 2.8 3130
## 2 51720 Norton city Virginia 2.8 3130
## 3 30019 Daniels Montana 2.6 3132
## 4 31057 Dundy Nebraska 2.6 3132
## 5 13309 Wheeler Georgia 2.3 3134
## 6 38029 Emmons North Dakota 2.3 3134
## 7 36061 New York New York 1.9 3136
## 8 31183 Wheeler Nebraska 1.3 3137
## 9 48235 Irion Texas 0.9 3138
## 10 48261 Kenedy Texas 0 3139
## 11 48269 King Texas 0 3139
Please see the above table for the 10 lowest ranked counties for carpooling.
county_data %>%
group_by(state) %>%
summarise(averagerank = mean(carpool_rank)) %>%
arrange(averagerank) %>%
top_n(-1)
## # A tibble: 1 x 2
## state averagerank
## <fct> <dbl>
## 1 Hawaii 651.
Hawaii is the best ranked for carpooling.
county_data %>%
group_by(state) %>%
summarise(averagerank = mean(carpool_rank)) %>%
arrange(averagerank) %>%
top_n(-5)
## # A tibble: 5 x 2
## state averagerank
## <fct> <dbl>
## 1 Hawaii 651.
## 2 Arizona 970.
## 3 Utah 1019.
## 4 Arkansas 1054.
## 5 Alaska 1086.
Please see the above table for the top 5 states for carpooling.