#import necessary packages
library(lubridate)
library(glue)
library(tidyverse)
library(dplyr)
library(tidyr)
The data for this assignment is from an annual survey called the American Community Survey (ACS) that is collected by the United States Census Bureau every year. It shows five year estimates for variables in the dataset for each politically designated county in the United States.
#import the data
data <- read_csv("acs_2015_county_data_revised.csv", col_names = TRUE)
There are 3,142 rows and 35 columns in this data set.
#check rows
nrow(data)
## [1] 3142
#check columns
ncol(data)
## [1] 35
The majority of variable types are numeric and represent population totals or percentages. State and county are character variables. The only variable that I think needs to be changed is the census_id. It is originally a numeric double type, but since it doesn’t represent a number, it is just a unique identifier, I will change it to character type.
#change census_id to character type
data$census_id <- as.character(data$census_id)
Below is a snapshot of the dataset.
glimpse(data)
## Observations: 3,142
## Variables: 35
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "101...
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama...
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...
There are only two missing values in this data set: one in child_poverty and one in income. Because this is such a low number (i.e. less than 1% of the data), I will remove these two observations.
#check missing values per column
colSums(is.na(data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
After removing these two observations, there are no more missing values in the data set.
#remove rows with missing values
data <- data[complete.cases(data), ]
#check for missing values again
colSums(is.na(data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 0 0
## poverty child_poverty professional service office
## 0 0 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
The only variable that seems to have strange values is employed. According to the data dictionary, this variable is supposed to be the percentage of people employed, ages 16+. However, looking at the summary for this variable, the numbers are not in terms of percents but instead appear to be the total number of people ages 16+ employed.
summary(data)
## census_id state county total_pop
## Length:3140 Length:3140 Length:3140 Min. : 267
## Class :character Class :character Class :character 1st Qu.: 11036
## Mode :character Mode :character Mode :character Median : 25793
## Mean : 100801
## 3rd Qu.: 67620
## Max. :10038388
## men women hispanic white
## Min. : 136 Min. : 131 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5551 1st Qu.: 5488 1st Qu.: 1.900 1st Qu.:65.67
## Median : 12838 Median : 12916 Median : 3.700 Median :84.65
## Mean : 49597 Mean : 51204 Mean : 8.819 Mean :77.31
## 3rd Qu.: 33328 3rd Qu.: 34123 3rd Qu.: 9.000 3rd Qu.:93.33
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.885 Mean : 1.763 Mean : 1.253 Mean : 0.07357
## 3rd Qu.:10.200 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :11.10000
## citizen income income_per_cap poverty
## Min. : 199 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8276 1st Qu.: 38826 1st Qu.:20470 1st Qu.:12.0
## Median : 19454 Median : 45095 Median :23575 Median :16.0
## Mean : 70849 Mean : 46824 Mean :24331 Mean :16.7
## 3rd Qu.: 50795 3rd Qu.: 52248 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.05 Mean :18.25 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.42 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.2 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.50 1st Qu.:76.6 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.6 Median : 9.90
## Mean :12.75 Mean :15.82 Mean :79.1 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.6 3rd Qu.:11.90
## Max. :40.30 Max. :55.60 Max. :94.6 Max. :29.90
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.90 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.30 Median : 4.000
## Mean : 0.9681 Mean : 3.294 Mean : 1.61 Mean : 4.694
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.90 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.10 Max. :37.200
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 166 Min. :29.50 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4532 1st Qu.:70.90 1st Qu.:13.07
## Median :22.90 Median : 10657 Median :75.85 Median :16.10
## Mean :23.15 Mean : 46416 Mean :74.45 Mean :17.33
## 3rd Qu.:26.60 3rd Qu.: 29272 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.922 Mean :0.2917 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
I will change this variable to a percantage by dividing the current number in the column by the population of that county.Now the values for employed fall between 0 and 100 like the other percentage columns.
data$employed <- ((data$employed/data$total_pop)*100)
summary(data$employed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.57 39.01 43.66 43.36 48.19 76.24
data %>%
group_by(county) %>%
select(county, women, men) %>%
filter(women>men)
## # A tibble: 1,984 x 3
## # Groups: county [1,248]
## county women men
## <chr> <dbl> <dbl>
## 1 Autauga 28476 26745
## 2 Baldwin 99807 95314
## 3 Blount 29198 28512
## 4 Butler 10852 9502
## 5 Calhoun 60374 56274
## 6 Chambers 17821 16258
## 7 Cherokee 13033 12975
## 8 Chilton 22200 21619
## 9 Choctaw 7013 6382
## 10 Clarke 13236 11834
## # ... with 1,974 more rows
data %>%
group_by(county) %>%
select(county, unemployment) %>%
filter(unemployment<10)
## # A tibble: 2,419 x 2
## # Groups: county [1,482]
## county unemployment
## <chr> <dbl>
## 1 Autauga 7.6
## 2 Baldwin 7.5
## 3 Bibb 8.3
## 4 Blount 7.7
## 5 Chambers 8.9
## 6 Cherokee 7.9
## 7 Chilton 9.1
## 8 Clay 9.4
## 9 Cleburne 8.3
## 10 Coffee 7.1
## # ... with 2,409 more rows
top_n(data,10,mean_commute) %>%
select(census_id, county, state, mean_commute) %>%
arrange(desc(mean_commute))
## # A tibble: 10 x 4
## census_id county state mean_commute
## <chr> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
data %>% mutate(
women_pct=women/total_pop*100) %>%
top_n(-10,women_pct) %>%
select(census_id, county, state, women_pct) %>%
arrange(women_pct)
## # A tibble: 10 x 4
## census_id county state women_pct
## <chr> <chr> <chr> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
data %>% mutate(
race_sum=hispanic+white+black+native+asian+pacific) %>%
top_n(-10,race_sum) %>%
select(census_id, county, state, race_sum) %>%
arrange(race_sum)
## # A tibble: 10 x 4
## census_id county state race_sum
## <chr> <chr> <chr> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83.
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
On average, Hawaii has the lowest sum of the race percentage variables.
data %>% mutate(
race_sum=hispanic+white+black+native+asian+pacific) %>%
group_by(state) %>%
summarize(race_sum_mean=mean(race_sum)) %>%
select(state, race_sum_mean) %>%
arrange(race_sum_mean)
## # A tibble: 51 x 2
## state race_sum_mean
## <chr> <dbl>
## 1 Hawaii 80.3
## 2 Alaska 92.7
## 3 Oklahoma 92.8
## 4 Washington 96.7
## 5 California 96.9
## 6 Oregon 97.1
## 7 Delaware 97.3
## 8 Massachusetts 97.5
## 9 Maryland 97.6
## 10 District of Columbia 97.6
## # ... with 41 more rows
There are 5 counties in the data set that have a sum of these race percentage variables that are 100.1%. They are Gosper, Hooker, Bailey, Edwards, and Nance counties.
data %>% mutate(
race_sum=hispanic+white+black+native+asian+pacific) %>%
filter(race_sum>100.0) %>%
select(census_id, county, state, race_sum) %>%
arrange(desc(race_sum))
## # A tibble: 11 x 4
## census_id county state race_sum
## <chr> <chr> <chr> <dbl>
## 1 31073 Gosper Nebraska 100.
## 2 31091 Hooker Nebraska 100.
## 3 48017 Bailey Texas 100.
## 4 48137 Edwards Texas 100.
## 5 31125 Nance Nebraska 100.
## 6 28021 Claiborne Mississippi 100.
## 7 48131 Duval Texas 100.
## 8 48261 Kenedy Texas 100.
## 9 48263 Kent Texas 100.
## 10 48377 Presidio Texas 100.
## 11 49001 Beaver Utah 100.
There are 13 states that have a sum that equals exactly 100%.
data %>% mutate(
race_sum=hispanic+white+black+native+asian+pacific) %>%
group_by(state, race_sum) %>%
filter(race_sum==100) %>%
select(state, race_sum)
## # A tibble: 27 x 2
## # Groups: state, race_sum [13]
## state race_sum
## <chr> <dbl>
## 1 Alabama 100
## 2 Alabama 100
## 3 Georgia 100
## 4 Georgia 100
## 5 Kansas 100
## 6 Kentucky 100
## 7 Mississippi 100
## 8 Montana 100
## 9 Montana 100
## 10 Montana 100
## # ... with 17 more rows
data %>%
mutate(carpool_rank = min_rank(-carpool)) %>%
top_n(-10, carpool_rank) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank)
## # A tibble: 10 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
The 10 lowest ranked counties for carpooling are listed in this table below.
data %>%
mutate(carpool_rank = min_rank(-carpool)) %>%
top_n(10, carpool_rank) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(desc(carpool_rank))
## # A tibble: 11 x 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3139
## 2 48269 King Texas 0 3139
## 3 48235 Irion Texas 0.9 3138
## 4 31183 Wheeler Nebraska 1.3 3137
## 5 36061 New York New York 1.9 3136
## 6 13309 Wheeler Georgia 2.3 3134
## 7 38029 Emmons North Dakota 2.3 3134
## 8 30019 Daniels Montana 2.6 3132
## 9 31057 Dundy Nebraska 2.6 3132
## 10 46069 Hyde South Dakota 2.8 3130
## 11 51720 Norton city Virginia 2.8 3130
On average Hawaii is the state ranked the best for carpooling. and the top five states for carpooling are Hawaii, Alaska, Arkansas, Utah, and Texas.
data %>%
group_by(state) %>%
summarise(mean_carpool = mean(carpool)) %>%
mutate(carpool_rank=min_rank(-mean_carpool)) %>%
arrange(desc(mean_carpool))
## # A tibble: 51 x 3
## state mean_carpool carpool_rank
## <chr> <dbl> <int>
## 1 Hawaii 12.8 1
## 2 Alaska 12.1 2
## 3 Arkansas 11.9 3
## 4 Utah 11.9 4
## 5 Texas 11.8 5
## 6 Nevada 11.7 6
## 7 Arizona 11.6 7
## 8 Missouri 11.5 8
## 9 Wyoming 11.5 9
## 10 Idaho 11.4 10
## # ... with 41 more rows