library(tidyverse)
library(readr)
library(dplyr)
library(kableExtra)
library(readr)
acs_data <- read_csv("~/001 - DATA WRANGLING R/Week 4/homework3/acs_2015_county_data_revised.csv")
Currently there are two character variables, state and county, and the rest are numeric. I would not change variable types because they appear to be in good shape.
# Use glimpse to get a better view of the variables and type of variable.
glimpse(acs_data)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~
Two variables each have one NA - income and child_poverty. Our sample size is 3,142 observations. Since the NA affects only a min of 1 obs and a max of 2 observations, I could drop those rows using drop_na(). The sample size will still be large enough to be meaningful for THIS assignment. For this assignment, I choose not to remove these two observations. Instead, I will exclude the NA values when doing calculations.
# To view the number of NAs in each column.
colSums(is.na(acs_data))
I used the summary function to look for unusual values. The codebook for variable Employed describes the variable as a percentage employed over ages 16+, however, the values appear to be total numbers and not a percentage. Each value will need to be divided by the total population to convert to percentage.
# To find summary statistics and look for unusual values.
summary(acs_data)
# To convert the employed column into a percentage to match codebook
acs_data %>%
mutate(pct_employed = employed/total_pop * 100) %>%
summarise(pct_employed)
# To count the number of counties with more women then men.
acs_data %>%
count(women > men)
# To count the number of counties with unemployment under 10%
acs_data %>%
count(acs_data$unemployment < 10)
# ?top_n # To read the documentation
highest_commute <- acs_data %>%
dplyr::select(census_id, county, state, mean_commute) %>% # Select Variables
dplyr::top_n(10, mean_commute) %>% # top 10 according to mean_commute
dplyr::arrange(desc(mean_commute)) # Arrange in the top 10 order.
highest_commute %>%
knitr::kable(caption = "Table 1: Counties with Highest Mean Commute in Minutes") %>%
kableExtra::kable_styling(bootstrap_options = "striped")
| census_id | county | state | mean_commute |
|---|---|---|---|
| 42103 | Pike | Pennsylvania | 44.0 |
| 36005 | Bronx | New York | 43.0 |
| 24017 | Charles | Maryland | 42.8 |
| 51187 | Warren | Virginia | 42.7 |
| 36081 | Queens | New York | 42.6 |
| 36085 | Richmond | New York | 42.6 |
| 51193 | Westmoreland | Virginia | 42.5 |
| 8093 | Park | Colorado | 42.4 |
| 36047 | Kings | New York | 41.7 |
| 54015 | Clay | West Virginia | 41.4 |
# Create new variable for the percentage of women per county.
women_pct <- acs_data %>%
mutate(pct_women = women/total_pop * 100) %>% #### New Variable
dplyr::select(census_id, county, state, pct_women) %>% #### To select variables
dplyr::arrange(pct_women) %>% #### Identify the variable to arrange
dplyr::top_n(-10, pct_women) ##### Take top 10 but in descending order
women_pct %>% #### This is for styling the table
knitr::kable(caption = "Table 2: Counties with Lowest Percent of Women",
digits = 2) %>% #### Reduced to 2 digits
kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
| census_id | county | state | pct_women |
|---|---|---|---|
| 42053 | Forest | Pennsylvania | 26.78 |
| 8011 | Bent | Colorado | 31.37 |
| 51183 | Sussex | Virginia | 31.47 |
| 13309 | Wheeler | Georgia | 32.10 |
| 6035 | Lassen | California | 33.17 |
| 48095 | Concho | Texas | 33.28 |
| 13053 | Chattahoochee | Georgia | 33.36 |
| 2013 | Aleutians East Borough | Alaska | 33.47 |
| 22125 | West Feliciana | Louisiana | 33.65 |
| 32027 | Pershing | Nevada | 33.73 |
#### 9 a
# Create new variable for the race percent total per county.
pct_race <- acs_data %>% #### New Variable
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
dplyr::select(census_id, county, state, race_pct_sum) %>% #### To select variables
dplyr::arrange(race_pct_sum) %>% #### Identify the variable to arrange
dplyr::top_n(-10, race_pct_sum) ##### Take top 10 but in descending order
pct_race %>% #### This is for styling the table
knitr::kable(caption = "Table 9 a: Counties with Lowest Percent of Race Total",
digits = 2) %>% #### Reduced to 2 digits
kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
| census_id | county | state | race_pct_sum |
|---|---|---|---|
| 15001 | Hawaii | Hawaii | 76.4 |
| 15009 | Maui | Hawaii | 79.2 |
| 40097 | Mayes | Oklahoma | 79.7 |
| 15003 | Honolulu | Hawaii | 81.5 |
| 40123 | Pontotoc | Oklahoma | 82.8 |
| 47061 | Grundy | Tennessee | 83.0 |
| 2282 | Yakutat City and Borough | Alaska | 83.4 |
| 40069 | Johnston | Oklahoma | 84.0 |
| 15007 | Kauai | Hawaii | 84.1 |
| 40003 | Alfalfa | Oklahoma | 85.1 |
#### 9 b
# Group by state.
low_state_mean <- acs_data %>% #### New Variable
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
dplyr::group_by(state) %>%
summarize(mean_race_pct_sum = mean(race_pct_sum, na.rm = TRUE)) %>%
dplyr::select(state, mean_race_pct_sum) %>% #### To select variables
dplyr::arrange(mean_race_pct_sum) %>% #### Identify the variable to arrange
dplyr::top_n(-3, mean_race_pct_sum) ##### Take top 3 but in descending order
low_state_mean %>% #### This is for styling the table
knitr::kable(caption = "Table 9 b: State with Lowest Percent of Mean Race Variables",
digits = 2) %>% #### Reduced to 2 digits
kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
| state | mean_race_pct_sum |
|---|---|
| Hawaii | 84.00 |
| Alaska | 92.71 |
| Oklahoma | 92.82 |
#### 9 c
pct_race <- acs_data %>% #### New Variable
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
dplyr::select(census_id, county, state, race_pct_sum) %>% #### To select variables
dplyr::filter(race_pct_sum > 100) %>%
dplyr::arrange(race_pct_sum) %>% #### Identify the variable to arrange
dplyr::top_n(10, race_pct_sum) ##### Take top 10
pct_race %>% #### This is for styling the table
knitr::kable(caption = "Table 9 c: Counties with Percentages over 100%",
digits = 4) %>% #### Reduced to 2 digits
kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
| census_id | county | state | race_pct_sum |
|---|---|---|---|
| 28021 | Claiborne | Mississippi | 100.0 |
| 48131 | Duval | Texas | 100.0 |
| 48261 | Kenedy | Texas | 100.0 |
| 48263 | Kent | Texas | 100.0 |
| 48377 | Presidio | Texas | 100.0 |
| 49001 | Beaver | Utah | 100.0 |
| 31125 | Nance | Nebraska | 100.1 |
| 31091 | Hooker | Nebraska | 100.1 |
| 48017 | Bailey | Texas | 100.1 |
| 48137 | Edwards | Texas | 100.1 |
| 31073 | Gosper | Nebraska | 100.1 |
#### 9 d
# Group by state.
pct_race <- acs_data %>% #### New Variable
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
dplyr::select(census_id, county, state, race_pct_sum) %>% #### To select variables
dplyr::filter(race_pct_sum == 100) %>%
dplyr::arrange(race_pct_sum) %>% #### Identify the variable to arrange
dplyr::count(race_pct_sum) ##### Count States equal to 100
pct_race %>% #### This is for styling the table
knitr::kable(caption = "Table 9 d: State Count with Race Percentages Equal to 100%",
digits = 4) %>% #### Reduced to 2 digits
kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
| race_pct_sum | n |
|---|---|
| 100 | 27 |
carpool_rank. The highest rank is Clay, Georgia.carpool_rank showing the highest rank for carpool counties.acs_data %>%
mutate(carpool_rank = rank(desc(carpool))) %>% # To make new variable
dplyr::select(census_id, county, state, carpool, carpool_rank) %>%
dplyr::arrange(carpool_rank) %>% # Ranking of Counties
dplyr::top_n(-10, carpool_rank) %>% # Descending order
knitr::kable(caption = "Table 10-a and b: Counties with Highest Carpool Rank") %>%
kableExtra::kable_styling(bootstrap_options = "striped")
| census_id | county | state | carpool | carpool_rank |
|---|---|---|---|---|
| 13061 | Clay | Georgia | 29.9 | 1 |
| 18087 | LaGrange | Indiana | 27.0 | 2 |
| 13165 | Jenkins | Georgia | 25.3 | 3 |
| 5133 | Sevier | Arkansas | 24.4 | 4 |
| 20175 | Seward | Kansas | 23.4 | 5 |
| 48079 | Cochran | Texas | 22.8 | 6 |
| 48247 | Jim Hogg | Texas | 22.6 | 7 |
| 48393 | Roberts | Texas | 22.4 | 8 |
| 39075 | Holmes | Ohio | 21.8 | 9 |
| 21197 | Powell | Kentucky | 21.6 | 10 |
carpool_rank. The lowest rank carpool counties are Hyde, South Dakota and Norton City, Virginia.acs_data %>%
mutate(carpool_rank = rank(desc(carpool))) %>% # To make new variable
dplyr::select(census_id, county, state, carpool, carpool_rank) %>%
dplyr::arrange(carpool_rank) %>% # Ranking of Counties
dplyr::top_n(10, carpool_rank) %>% # Descending order
knitr::kable(caption = "Table 10-c: Counties with Lowest Carpool Rank") %>%
kableExtra::kable_styling(bootstrap_options = "striped")
| census_id | county | state | carpool | carpool_rank |
|---|---|---|---|---|
| 46069 | Hyde | South Dakota | 2.8 | 3132.5 |
| 51720 | Norton city | Virginia | 2.8 | 3132.5 |
| 30019 | Daniels | Montana | 2.6 | 3134.5 |
| 31057 | Dundy | Nebraska | 2.6 | 3134.5 |
| 13309 | Wheeler | Georgia | 2.3 | 3136.5 |
| 38029 | Emmons | North Dakota | 2.3 | 3136.5 |
| 36061 | New York | New York | 1.9 | 3138.0 |
| 31183 | Wheeler | Nebraska | 1.3 | 3139.0 |
| 48235 | Irion | Texas | 0.9 | 3140.0 |
| 48261 | Kenedy | Texas | 0.0 | 3141.5 |
| 48269 | King | Texas | 0.0 | 3141.5 |
carpool_rank is District of Columbia.carpool_rank.acs_data %>%
mutate(carpool_rank = rank(carpool)) %>% # To make new variable
dplyr::group_by(state) %>% # Group by state
dplyr::summarize(mean_carpool_rank = mean(carpool_rank, na.rm = TRUE)) %>%
dplyr::arrange(mean_carpool_rank) %>% # Get average and arrange
dplyr::top_n(-5, mean_carpool_rank) %>% # Descending order top 5 states
knitr::kable(caption = "Table 10-e: States with Highest Average Carpool Rank",
digits = 2) %>%
kableExtra::kable_styling(bootstrap_options = "striped")
| state | mean_carpool_rank |
|---|---|
| District of Columbia | 76.50 |
| Massachusetts | 557.11 |
| Connecticut | 624.81 |
| Rhode Island | 689.60 |
| New Jersey | 705.00 |