tidyverse function and NOT with a Base R function. How many rows and columns are in the data set?library(magrittr)
library(tidyverse)
acs <- read_csv("acs_2015_county_data_revised.csv")
dim(acs)
## [1] 3142 35
There are 3142 row and 35 columns in acs_2015_county_data_revised.csv data.
glimpse()command after you’re finished.str(acs)
In the initial data, census_id was numeric; state and county were character.
acs$census_id <- as.character(acs$census_id)
acs$state <- as.factor(acs$state)
acs$county <- as.factor(acs$county)
glimpse(acs)
## Observations: 3,142
## Variables: 35
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "101...
## $ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama...
## $ county <fct> Autauga, Baldwin, Barbour, Bibb, Blount, Bullock, Bu...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...
census_id has been changed to character. state and contry has been changed to factor.
colSums(is.na(acs))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
Most variables have no missing values except income and child_poverty. Both income and child_poverty includes one missing value.
I would not remove any entire obervations due to a missing value in income or child_poverty because those observations include other valuable demographic information. In addition, I prefer not to impute missing values because it can mislead. When I report mean or median of income and child_poverty, I will make a comment on the name of counties that were not included in the descriptive statistics.
However, if my boss asks me to do and there is no other option, I would impute missing values with median values Since income and child_poverty are right skewed.
acs %>% ggplot(mapping = aes(x=income)) + geom_histogram()
acs %>% ggplot(mapping = aes(x=child_poverty)) + geom_histogram()
And, the code below displays how to impute missing values with median.
acs_new <- acs
acs_new$income[is.na(acs_new$income)] <- median(acs$income, na.rm=TRUE)
acs_new$child_poverty[is.na(acs_new$child_poverty)] <- median(acs$child_poverty, na.rm=TRUE)
colSums(is.na(acs_new))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 0 0
## poverty child_poverty professional service office
## 0 0 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
As it is shown above, acs_new has no missing values after imputation.
summary() function to examine any unusual values. Are there any? If so, how will you handle these unusual values? Show any code/output to handle unusual values.summary function will suffice for this homework.summary(acs)
## census_id state county total_pop
## Length:3142 Texas : 254 Washington: 31 Min. : 85
## Class :character Georgia : 159 Jefferson : 26 1st Qu.: 11028
## Mode :character Virginia: 133 Franklin : 25 Median : 25768
## Kentucky: 120 Jackson : 24 Mean : 100737
## Missouri: 115 Lincoln : 24 3rd Qu.: 67552
## Kansas : 105 Madison : 20 Max. :10038388
## (Other) :2256 (Other) :2992
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51171 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
##
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52250 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## NA's :1
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## NA's :1
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
In the variable description,employed is defined as “percentage employed, ages 16+”. However, summary(acs) shows that employed has unusual values for percentage. It seems that employed is prepresenting count of the employed, not the percentage employed.
new_acs <- acs %>% mutate(employed_new=employed/total_pop*100)
glimpse(new_acs)
## Observations: 3,142
## Variables: 36
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "101...
## $ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama...
## $ county <fct> Autauga, Baldwin, Barbour, Bibb, Blount, Bullock, Bu...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...
## $ employed_new <dbl> 43.43637, 44.05113, 31.92113, 36.69262, 38.44914, 36...
summary(new_acs)
## census_id state county total_pop
## Length:3142 Texas : 254 Washington: 31 Min. : 85
## Class :character Georgia : 159 Jefferson : 26 1st Qu.: 11028
## Mode :character Virginia: 133 Franklin : 25 Median : 25768
## Kentucky: 120 Jackson : 24 Mean : 100737
## Missouri: 115 Lincoln : 24 3rd Qu.: 67552
## Kansas : 105 Madison : 20 Max. :10038388
## (Other) :2256 (Other) :2992
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51171 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
##
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52250 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## NA's :1
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## NA's :1
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment employed_new
## Min. : 0.000 Min. :0.0000 Min. : 0.000 Min. :16.57
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500 1st Qu.:39.01
## Median : 6.900 Median :0.2000 Median : 7.500 Median :43.67
## Mean : 7.921 Mean :0.2915 Mean : 7.815 Mean :43.37
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700 3rd Qu.:48.20
## Max. :36.600 Max. :9.8000 Max. :29.400 Max. :76.24
##
In the new_acs data, the new variable employed_new has been created using employed and total_pop.
acs %>% mutate(more_women_county=women > men) %>%
summarize(total_more_women_county = sum(more_women_county))
## # A tibble: 1 x 1
## total_more_women_county
## <int>
## 1 1985
There are 1,985 counties with more women than men.
acs %>% mutate(unemploy_county = unemployment< 10) %>%
summarize(total_unemploy_county = sum(unemploy_county))
## # A tibble: 1 x 1
## total_unemploy_county
## <int>
## 1 2420
There are 2,420 counties with unemployment rate lower than 10%.
dplyr::top_n() function. Read the documentation for this function.acs %>% arrange(desc(mean_commute)) %>%
select(census_id, county, state, mean_commute) %>%
top_n(n=10, wt=mean_commute)
## # A tibble: 10 x 4
## census_id county state mean_commute
## <chr> <fct> <fct> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
Pike in Pennsylvania is the county with the highest mean commmute, followed by Bronx, Charles, Warren, Queens, Richmond, Westmoreland, Park, Kings, and clay.
acs %>% mutate(pct_women =women/total_pop*100) %>%
arrange(pct_women) %>%
select(census_id, county, state, pct_women) %>%
top_n(n=-10, wt=pct_women)
## # A tibble: 10 x 4
## census_id county state pct_women
## <chr> <fct> <fct> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
The new variable ‘pct_women’ was created for the percentage of women for each county. Forest in Pennsylvania is the county with the lowest percentage of women, followed by Bent, Sussex, Wheeler, Lassen, Concho, Chattahoochee, Aleutians East Borough, West Feliciana, and Pershing.
acs %>% mutate(sum_race = hispanic + white + black + native + asian + pacific) %>%
arrange(sum_race) %>%
select(census_id, county, state, sum_race) %>%
top_n(n=-10, wt=sum_race)
## # A tibble: 10 x 4
## census_id county state sum_race
## <chr> <fct> <fct> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83.
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
Hawaii is the county with the lowest sum of race percentage (in other words, with most racial minorities), followed by Maui, Mayes, Honolulu, Pontotoc, Grundy, Yakutat City and Borough, Johnston, Kauai, and Alfalfa.
acs %>% group_by(state) %>%
mutate(sum_race_pct = hispanic + white + black + native + asian + pacific) %>%
summarize(avg_sum_race = mean(sum_race_pct)) %>%
arrange(avg_sum_race) %>%
top_n(n=-1, wt=avg_sum_race)
## # A tibble: 1 x 2
## state avg_sum_race
## <fct> <dbl>
## 1 Hawaii 84
Among US states, Hawaii, On averageis, is the lowest sum of race percentage of 84% .
acs_race <- acs %>% mutate(race_pct = hispanic + white + black + native + asian + pacific )
acs_race$race_pct <- format(round(acs_race$race_pct, 10), nsmall=10)
acs_race %>% mutate(pct_gt100 = race_pct > 100) %>%
filter(pct_gt100 == TRUE) %>%
select(census_id, county, state, race_pct, pct_gt100)
## # A tibble: 43 x 5
## census_id county state race_pct pct_gt100
## <chr> <fct> <fct> <chr> <lgl>
## 1 1065 Hale Alabama 100.0000000000 TRUE
## 2 1131 Wilcox Alabama 100.0000000000 TRUE
## 3 13201 Miller Georgia 100.0000000000 TRUE
## 4 13307 Webster Georgia 100.0000000000 TRUE
## 5 20199 Wallace Kansas 100.0000000000 TRUE
## 6 21031 Butler Kentucky 100.0000000000 TRUE
## 7 28021 Claiborne Mississippi 100.0000000000 TRUE
## 8 28125 Sharkey Mississippi 100.0000000000 TRUE
## 9 30019 Daniels Montana 100.0000000000 TRUE
## 10 30069 Petroleum Montana 100.0000000000 TRUE
## # ... with 33 more rows
Note: I am not sure why the list includes counties with race_pct=100. Thus, I had to rerun code below to extract counties with sum over 100%.
acs_race %>% mutate(pct_gt100 = race_pct >= 100.1) %>%
filter(pct_gt100 == TRUE) %>%
select(census_id, county, state, race_pct, pct_gt100)
## # A tibble: 5 x 5
## census_id county state race_pct pct_gt100
## <chr> <fct> <fct> <chr> <lgl>
## 1 31073 Gosper Nebraska 100.1000000000 TRUE
## 2 31091 Hooker Nebraska 100.1000000000 TRUE
## 3 31125 Nance Nebraska 100.1000000000 TRUE
## 4 48017 Bailey Texas 100.1000000000 TRUE
## 5 48137 Edwards Texas 100.1000000000 TRUE
5 Counties (Gosper, Hooker, Nance, Bailey, and edwards) have a sum 100.1%.
acs_race %>% mutate(pct_100 = (race_pct < 100.1 & race_pct >=100.0)) %>%
filter(pct_100 == TRUE) %>%
select(census_id, state, race_pct, pct_100)
## # A tibble: 38 x 4
## census_id state race_pct pct_100
## <chr> <fct> <chr> <lgl>
## 1 1065 Alabama 100.0000000000 TRUE
## 2 1131 Alabama 100.0000000000 TRUE
## 3 13201 Georgia 100.0000000000 TRUE
## 4 13307 Georgia 100.0000000000 TRUE
## 5 20199 Kansas 100.0000000000 TRUE
## 6 21031 Kentucky 100.0000000000 TRUE
## 7 28021 Mississippi 100.0000000000 TRUE
## 8 28125 Mississippi 100.0000000000 TRUE
## 9 30019 Montana 100.0000000000 TRUE
## 10 30069 Montana 100.0000000000 TRUE
## # ... with 28 more rows
There are 38 counties have a racial percentage sum of exactly 100%. To see how many unique states there exist,
acs_race %>% mutate(pct_100 = (race_pct < 100.1 & race_pct >=100.0)) %>%
filter(pct_100 == TRUE) %>%
select(census_id, state, race_pct, pct_100) %>%
distinct(state)
## # A tibble: 16 x 1
## state
## <fct>
## 1 Alabama
## 2 Georgia
## 3 Kansas
## 4 Kentucky
## 5 Mississippi
## 6 Montana
## 7 Nebraska
## 8 Nevada
## 9 New Mexico
## 10 North Carolina
## 11 North Dakota
## 12 Oregon
## 13 South Dakota
## 14 Texas
## 15 Utah
## 16 West Virginia
There are 16 states having a sum that equals exactly to 100%.
dplyr::min_rank() function to create a new variable called carpool_rank where the highest ranked county (rank = 1) is the county with the highest carpool value. Read the documentation carefully for the ranking function.acs$carpool_rank <- min_rank(desc(acs$carpool))
glimpse(acs)
## Observations: 3,142
## Variables: 36
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "101...
## $ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama...
## $ county <fct> Autauga, Baldwin, Barbour, Bibb, Blount, Bullock, Bu...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...
## $ carpool_rank <int> 2157, 2157, 1103, 391, 986, 204, 621, 1811, 759, 696...
carpool_rank has been created in acs data.
acs %>% arrange(carpool_rank) %>% select(census_id , county, state, carpool, carpool_rank) %>%
top_n(n=-10, wt=carpool_rank)
## # A tibble: 10 x 5
## census_id county state carpool carpool_rank
## <chr> <fct> <fct> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
Clay in Georgia is the highest ranked county (rank=1) for carpooling, followed by LaGrange, Jenkins, Sevier, Seward, Cochran, Jim Hogg, Roberts, Holmes, and Powell.
acs %>% arrange(desc(carpool_rank)) %>% select(census_id , county, state, carpool, carpool_rank) %>%
top_n(n=10, wt=carpool_rank)
## # A tibble: 11 x 5
## census_id county state carpool carpool_rank
## <chr> <fct> <fct> <dbl> <int>
## 1 48261 Kenedy Texas 0 3141
## 2 48269 King Texas 0 3141
## 3 48235 Irion Texas 0.9 3140
## 4 31183 Wheeler Nebraska 1.3 3139
## 5 36061 New York New York 1.9 3138
## 6 13309 Wheeler Georgia 2.3 3136
## 7 38029 Emmons North Dakota 2.3 3136
## 8 30019 Daniels Montana 2.6 3134
## 9 31057 Dundy Nebraska 2.6 3134
## 10 46069 Hyde South Dakota 2.8 3132
## 11 51720 Norton city Virginia 2.8 3132
Kenedy and King in Texas are two worst carpooling counties, followed by Irion, Wheeler(Nebraska), New York, Wheeler(Georgia), Emmons, Daniels, Dundy, Hyde, and Norton City.
acs %>% group_by(state) %>%
summarize(state_avg_carpool = mean(carpool)) %>%
mutate(state_avg_rank = min_rank(desc(state_avg_carpool))) %>%
arrange(state_avg_rank) %>%
top_n(n=-1, wt=state_avg_rank)
## # A tibble: 1 x 3
## state state_avg_carpool state_avg_rank
## <fct> <dbl> <int>
## 1 Alaska 12.1 1
On average, Alaska is the best ranked state for carpooling.
acs %>% group_by(state) %>%
summarize(state_avg_carpool = mean(carpool)) %>%
mutate(state_avg_rank = min_rank(desc(state_avg_carpool))) %>%
arrange(state_avg_rank) %>%
top_n(n=-5, wt=state_avg_rank)
## # A tibble: 5 x 3
## state state_avg_carpool state_avg_rank
## <fct> <dbl> <int>
## 1 Alaska 12.1 1
## 2 Arkansas 11.9 2
## 3 Utah 11.9 3
## 4 Texas 11.8 4
## 5 Nevada 11.7 5
Alaska, Arkansas, Utah, Texas, and Nevada are the top 5 states for carpooling.