library(tidyverse)
library(dplyr)
library(ggplot2)
library(plotly)
library(corrplot)
# Code to import the data
acs <- read.csv("C:/Users/king.nm/OneDrive - Procter and Gamble/UC/MS Business Analytics/Classes/Summer 2021/Data Wrangling BANA 7025 - Jun2021/Final Project/acs_2015_county_data_revised.csv")
# Checking the data types
str(acs)
## 'data.frame': 3142 obs. of 35 variables:
## $ census_id : int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ total_pop : int 55221 195121 26932 22604 57710 10678 20354 116648 34079 26008 ...
## $ men : int 26745 95314 14497 12073 28512 5660 9502 56274 16258 12975 ...
## $ women : int 28476 99807 12435 10531 29198 5018 10852 60374 17821 13033 ...
## $ hispanic : num 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num 0 0 0 0 0 0 0 0 0 0 ...
## $ citizen : int 40725 147695 20714 17495 42345 8057 15581 88612 26462 20600 ...
## $ income : int 51281 50254 32964 38678 45813 31938 32229 41703 34177 36296 ...
## $ income_per_cap: int 24974 27317 16824 18431 20532 17580 18390 21374 21071 21811 ...
## $ poverty : num 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : int 23986 85953 8597 8294 22189 3865 7813 47401 13689 10155 ...
## $ private_work : num 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
# Glimpse command -- comparing glimpse vs. str (str looks cleaner)
glimpse(acs)
## Rows: 3,142
## Columns: 35
## $ census_id <int> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop <int> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men <int> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women <int> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen <int> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income <int> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <int> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed <int> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~
Need to change variables “state” and “county” from character to factor
# Changing the variables "state" and "county" to factor
acs$state <- as.factor(acs$state)
acs$county <- as.factor(acs$county)
# Checking whether variables changed to factor
str(acs)
## 'data.frame': 3142 obs. of 35 variables:
## $ census_id : int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
## $ state : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ county : Factor w/ 1851 levels "Abbeville","Acadia",..: 82 89 100 149 164 225 235 246 293 315 ...
## $ total_pop : int 55221 195121 26932 22604 57710 10678 20354 116648 34079 26008 ...
## $ men : int 26745 95314 14497 12073 28512 5660 9502 56274 16258 12975 ...
## $ women : int 28476 99807 12435 10531 29198 5018 10852 60374 17821 13033 ...
## $ hispanic : num 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num 0 0 0 0 0 0 0 0 0 0 ...
## $ citizen : int 40725 147695 20714 17495 42345 8057 15581 88612 26462 20600 ...
## $ income : int 51281 50254 32964 38678 45813 31938 32229 41703 34177 36296 ...
## $ income_per_cap: int 24974 27317 16824 18431 20532 17580 18390 21374 21071 21811 ...
## $ poverty : num 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : int 23986 85953 8597 8294 22189 3865 7813 47401 13689 10155 ...
## $ private_work : num 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
# Checking for missing values
sum(is.na(acs))
## [1] 2
There are two missing values
# Checking where the missing values are (which variables have missing values)
colSums(is.na(acs))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
There’s one missing value for each variable “income” and “child_poverty”. Since there are only 2 missing values for the whole dataset, I’ll eliminate the rows that have missing values
# Handling missing values
acs1 <- na.omit(acs)
# Checking whether the rows with missing values were eliminated
count(acs1)
## n
## 1 3140
Notes:
# Looking for unusual values
summary(acs1)
## census_id state county total_pop
## Min. : 1001 Texas : 253 Washington: 31 Min. : 267
## 1st Qu.:18179 Georgia : 159 Jefferson : 26 1st Qu.: 11036
## Median :29176 Virginia: 133 Franklin : 25 Median : 25793
## Mean :30383 Kentucky: 120 Jackson : 24 Mean : 100801
## 3rd Qu.:45080 Missouri: 115 Lincoln : 24 3rd Qu.: 67620
## Max. :56045 Kansas : 105 Madison : 20 Max. :10038388
## (Other) :2255 (Other) :2990
## men women hispanic white
## Min. : 136 Min. : 131 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5551 1st Qu.: 5488 1st Qu.: 1.900 1st Qu.:65.67
## Median : 12838 Median : 12916 Median : 3.700 Median :84.65
## Mean : 49597 Mean : 51204 Mean : 8.819 Mean :77.31
## 3rd Qu.: 33328 3rd Qu.: 34123 3rd Qu.: 9.000 3rd Qu.:93.33
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.885 Mean : 1.763 Mean : 1.253 Mean : 0.07357
## 3rd Qu.:10.200 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :11.10000
##
## citizen income income_per_cap poverty
## Min. : 199 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8276 1st Qu.: 38826 1st Qu.:20470 1st Qu.:12.0
## Median : 19454 Median : 45095 Median :23575 Median :16.0
## Mean : 70849 Mean : 46824 Mean :24331 Mean :16.7
## 3rd Qu.: 50795 3rd Qu.: 52248 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
##
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.05 Mean :18.25 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.42 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
##
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.2 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.50 1st Qu.:76.6 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.6 Median : 9.90
## Mean :12.75 Mean :15.82 Mean :79.1 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.6 3rd Qu.:11.90
## Max. :40.30 Max. :55.60 Max. :94.6 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.90 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.30 Median : 4.000
## Mean : 0.9681 Mean : 3.294 Mean : 1.61 Mean : 4.694
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.90 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.10 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 166 Min. :29.50 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4532 1st Qu.:70.90 1st Qu.:13.07
## Median :22.90 Median : 10657 Median :75.85 Median :16.10
## Mean :23.15 Mean : 46416 Mean :74.45 Mean :17.33
## 3rd Qu.:26.60 3rd Qu.: 29272 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.922 Mean :0.2917 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
There are unusual values for the variables “pacific” (potential outlier), “transit” (potential outlier) and “family_work” (potential outlier)
# Code to remove outliers once further analysis is done to confirm that there are outliers in the dataset
acs2 <- subset(acs1, acs1$transit < 50)
# Checking whether outliers were removed
boxplot(acs2$transit)
# Code to identify counties with more women than men
acs1 %>%
select(women, men) %>%
mutate(more_wom = (women - men) > 0) %>%
summarize(
total = sum(more_wom, na.rm = TRUE))
## total
## 1 1984
# Code to identify counties with an unemployment rate lower than 10%
acs1 %>%
select(unemployment) %>%
mutate(unempl_lower_10 = unemployment < 10) %>%
summarize(
total_unempl_lower_10 = sum(unempl_lower_10, na.rm = TRUE))
## total_unempl_lower_10
## 1 2419
# Another code to identify counties with an unemployment rate lower than 10%
length(acs1$unemployment[acs1$unemployment < 10])
## [1] 2419
Notes:
# Code for the top 10 counties with the highest mean commute
# I used the function "slice_max" instead of "top_n()" as suggested by RStudio as "top_n()" has been deprecated
acs1 %>%
select(census_id, county, state, mean_commute) %>%
slice_max(order_by = mean_commute, n = 10)
## census_id county state mean_commute
## 1 42103 Pike Pennsylvania 44.0
## 2 36005 Bronx New York 43.0
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
# Code for question #8 above
acs1 %>%
select(census_id, county, state, total_pop, women) %>%
mutate(perc_women = women / total_pop) %>%
slice_min(order_by = perc_women, n = 10)
## census_id county state total_pop women perc_women
## 1 42053 Forest Pennsylvania 7581 2030 0.2677747
## 2 8011 Bent Colorado 5895 1849 0.3136556
## 3 51183 Sussex Virginia 11864 3734 0.3147336
## 4 13309 Wheeler Georgia 7956 2554 0.3210156
## 5 6035 Lassen California 32645 10827 0.3316588
## 6 48095 Concho Texas 4086 1360 0.3328439
## 7 13053 Chattahoochee Georgia 11914 3974 0.3335572
## 8 2013 Aleutians East Borough Alaska 3304 1106 0.3347458
## 9 22125 West Feliciana Louisiana 15415 5187 0.3364904
## 10 32027 Pershing Nevada 6722 2267 0.3372508
# Code for question #8 above
acs1 %>%
mutate(perc_women = women / total_pop) %>%
slice_min(order_by = perc_women, n = 10) %>%
select(census_id, county, state, perc_women)
## census_id county state perc_women
## 1 42053 Forest Pennsylvania 0.2677747
## 2 8011 Bent Colorado 0.3136556
## 3 51183 Sussex Virginia 0.3147336
## 4 13309 Wheeler Georgia 0.3210156
## 5 6035 Lassen California 0.3316588
## 6 48095 Concho Texas 0.3328439
## 7 13053 Chattahoochee Georgia 0.3335572
## 8 2013 Aleutians East Borough Alaska 0.3347458
## 9 22125 West Feliciana Louisiana 0.3364904
## 10 32027 Pershing Nevada 0.3372508
# Code to create new variable to calculate the sum of all race percentages and checking to see whether the variable was created
acs1 %>%
mutate(sum_perc_all_races = (hispanic + white + black + native + asian + pacific) / total_pop) %>%
select(sum_perc_all_races) %>%
summarise(sum_perc_all_races_mean = mean(sum_perc_all_races))
## sum_perc_all_races_mean
## 1 0.009044451
# Code
acs1 %>%
mutate(sum_perc_all_races = (hispanic + white + black + native + asian + pacific)) %>%
select(county, sum_perc_all_races) %>%
slice_min(sum_perc_all_races, n = 10)
## county sum_perc_all_races
## 1 Hawaii 76.4
## 2 Maui 79.2
## 3 Mayes 79.7
## 4 Honolulu 81.5
## 5 Pontotoc 82.8
## 6 Grundy 83.0
## 7 Yakutat City and Borough 83.4
## 8 Johnston 84.0
## 9 Kauai 84.1
## 10 Alfalfa 85.1
# Code
acs1 %>%
mutate(sum_perc_all_races = hispanic + white + black + native + asian + pacific) %>%
select(county, sum_perc_all_races, state) %>%
group_by(state) %>%
summarise(sum_perc_all_races_mean_state = mean(sum_perc_all_races)) %>%
slice_min(sum_perc_all_races_mean_state, n = 1)
## # A tibble: 1 x 2
## state sum_perc_all_races_mean_state
## <fct> <dbl>
## 1 Hawaii 80.3
acs1 %>%
mutate(sum_perc_all_races = (hispanic + white + black + native + asian + pacific)) %>%
select(county, sum_perc_all_races) %>%
slice_max(sum_perc_all_races, n = 10)
## county sum_perc_all_races
## 1 Gosper 100.1
## 2 Hooker 100.1
## 3 Bailey 100.1
## 4 Edwards 100.1
## 5 Nance 100.1
## 6 Claiborne 100.0
## 7 Duval 100.0
## 8 Kenedy 100.0
## 9 Kent 100.0
## 10 Presidio 100.0
## 11 Beaver 100.0
acs1 %>%
mutate(sum_perc_all_races = (hispanic + white + black + native + asian + pacific)) %>%
select(county, sum_perc_all_races) %>%
count(sum_perc_all_races > 100)
## sum_perc_all_races > 100 n
## 1 FALSE 3129
## 2 TRUE 11
acs1 %>%
mutate(sum_perc_all_races = (hispanic + white + black + native + asian + pacific)) %>%
select(state, sum_perc_all_races) %>%
count(sum_perc_all_races == 100)
## sum_perc_all_races == 100 n
## 1 FALSE 3113
## 2 TRUE 27
# Create variable carpool_rank
acs1 %>%
mutate(carpool_rank = min_rank(carpool)) %>%
select(carpool_rank, county, carpool) %>%
slice_max(carpool_rank, n = 10)
## carpool_rank county carpool
## 1 3140 Clay 29.9
## 2 3139 LaGrange 27.0
## 3 3138 Jenkins 25.3
## 4 3137 Sevier 24.4
## 5 3136 Seward 23.4
## 6 3135 Cochran 22.8
## 7 3134 Jim Hogg 22.6
## 8 3133 Roberts 22.4
## 9 3132 Holmes 21.8
## 10 3131 Powell 21.6
# Code for top 10 highest ranked carpool values
acs1 %>%
mutate(carpool_rank = min_rank(carpool)) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
slice_max(carpool_rank, n = 10)
## census_id county state carpool carpool_rank
## 1 13061 Clay Georgia 29.9 3140
## 2 18087 LaGrange Indiana 27.0 3139
## 3 13165 Jenkins Georgia 25.3 3138
## 4 5133 Sevier Arkansas 24.4 3137
## 5 20175 Seward Kansas 23.4 3136
## 6 48079 Cochran Texas 22.8 3135
## 7 48247 Jim Hogg Texas 22.6 3134
## 8 48393 Roberts Texas 22.4 3133
## 9 39075 Holmes Ohio 21.8 3132
## 10 21197 Powell Kentucky 21.6 3131
# Code for 10 lowest ranked carpool values
acs1 %>%
mutate(carpool_rank = min_rank(carpool)) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
slice_min(carpool_rank, n = 10)
## census_id county state carpool carpool_rank
## 1 48261 Kenedy Texas 0.0 1
## 2 48269 King Texas 0.0 1
## 3 48235 Irion Texas 0.9 3
## 4 31183 Wheeler Nebraska 1.3 4
## 5 36061 New York New York 1.9 5
## 6 13309 Wheeler Georgia 2.3 6
## 7 38029 Emmons North Dakota 2.3 6
## 8 30019 Daniels Montana 2.6 8
## 9 31057 Dundy Nebraska 2.6 8
## 10 46069 Hyde South Dakota 2.8 10
## 11 51720 Norton city Virginia 2.8 10
# Code state with best carpool values
acs1 %>%
group_by(state) %>%
summarise(carpool_state = mean(carpool)) %>%
slice_max(carpool_state, n = 1)
## # A tibble: 1 x 2
## state carpool_state
## <fct> <dbl>
## 1 Hawaii 12.8
# Code for top 5 states with best carpool values
acs1 %>%
group_by(state) %>%
summarise(carpool_state = mean(carpool)) %>%
slice_max(carpool_state, n = 5)
## # A tibble: 5 x 2
## state carpool_state
## <fct> <dbl>
## 1 Hawaii 12.8
## 2 Alaska 12.1
## 3 Arkansas 11.9
## 4 Utah 11.9
## 5 Texas 11.8