Import Dataset and setup, read codebook

1) There are 3,142 rows and 35 variables.
library(tidyverse)
library(readr)
library(dplyr)
library(kableExtra)

library(readr)
acs_data <- read_csv("~/001 - DATA WRANGLING R/Week 4/homework3/acs_2015_county_data_revised.csv")
2) Glimpse

Currently there are two character variables, state and county, and the rest are numeric. I would not change variable types because they appear to be in good shape.

# Use glimpse to get a better view of the variables and type of variable.
glimpse(acs_data)
## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~
3) Check for missing values.

Two variables each have one NA - income and child_poverty. Our sample size is 3,142 observations. Since the NA affects only a min of 1 obs and a max of 2 observations, I could drop those rows using drop_na(). The sample size will still be large enough to be meaningful for THIS assignment. For this assignment, I choose not to remove these two observations. Instead, I will exclude the NA values when doing calculations.

# To view the number of NAs in each column.
colSums(is.na(acs_data))
4) Summary

I used the summary function to look for unusual values. The codebook for variable Employed describes the variable as a percentage employed over ages 16+, however, the values appear to be total numbers and not a percentage. Each value will need to be divided by the total population to convert to percentage.

# To find summary statistics and look for unusual values.
summary(acs_data)

# To convert the employed column into a percentage to match codebook
acs_data %>% 
  mutate(pct_employed = employed/total_pop * 100) %>% 
  summarise(pct_employed)

Data Manipulation and Insights

5) There are 1,985 counties with more women then men.
# To count the number of counties with more women then men.
acs_data %>% 
  count(women > men)
6) There are 2,420 counties with with unemployment under 10%.
# To count the number of counties with unemployment under 10%
acs_data %>% 
  count(acs_data$unemployment < 10)
7) Top 10 counties with the highest mean commute. Sorted.
# ?top_n # To read the documentation

highest_commute <- acs_data %>% 
  dplyr::select(census_id, county, state, mean_commute) %>% # Select Variables
  dplyr::top_n(10, mean_commute) %>%  # top 10 according to mean_commute
  dplyr::arrange(desc(mean_commute)) # Arrange in the top 10 order.
highest_commute %>% 
  knitr::kable(caption = "Table 1: Counties with Highest Mean Commute in Minutes") %>% 
  kableExtra::kable_styling(bootstrap_options = "striped")
Table 1: Counties with Highest Mean Commute in Minutes
census_id county state mean_commute
42103 Pike Pennsylvania 44.0
36005 Bronx New York 43.0
24017 Charles Maryland 42.8
51187 Warren Virginia 42.7
36081 Queens New York 42.6
36085 Richmond New York 42.6
51193 Westmoreland Virginia 42.5
8093 Park Colorado 42.4
36047 Kings New York 41.7
54015 Clay West Virginia 41.4
8) Percentage of women per county. The top 10 with lowest percentages.
# Create new variable for the percentage of women per county.

women_pct <- acs_data %>% 
  mutate(pct_women = women/total_pop * 100) %>% #### New Variable
  dplyr::select(census_id, county, state, pct_women) %>% #### To select variables
  dplyr::arrange(pct_women) %>% #### Identify the variable to arrange
  dplyr::top_n(-10, pct_women) ##### Take top 10 but in descending order
women_pct %>% #### This is for styling the table
  knitr::kable(caption = "Table 2: Counties with Lowest Percent of Women", 
               digits = 2) %>% #### Reduced to 2 digits
  kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
Table 2: Counties with Lowest Percent of Women
census_id county state pct_women
42053 Forest Pennsylvania 26.78
8011 Bent Colorado 31.37
51183 Sussex Virginia 31.47
13309 Wheeler Georgia 32.10
6035 Lassen California 33.17
48095 Concho Texas 33.28
13053 Chattahoochee Georgia 33.36
2013 Aleutians East Borough Alaska 33.47
22125 West Feliciana Louisiana 33.65
32027 Pershing Nevada 33.73
9 a) Race Percentages Counties with lowest percent of race total are noted in table 9 a.
#### 9 a
# Create new variable for the race percent total per county.

pct_race <- acs_data %>% #### New Variable
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>% 
  dplyr::select(census_id, county, state, race_pct_sum) %>% #### To select variables
  dplyr::arrange(race_pct_sum) %>% #### Identify the variable to arrange
  dplyr::top_n(-10, race_pct_sum) ##### Take top 10 but in descending order
pct_race %>% #### This is for styling the table
  knitr::kable(caption = "Table 9 a: Counties with Lowest Percent of Race Total", 
               digits = 2) %>% #### Reduced to 2 digits
  kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
Table 9 a: Counties with Lowest Percent of Race Total
census_id county state race_pct_sum
15001 Hawaii Hawaii 76.4
15009 Maui Hawaii 79.2
40097 Mayes Oklahoma 79.7
15003 Honolulu Hawaii 81.5
40123 Pontotoc Oklahoma 82.8
47061 Grundy Tennessee 83.0
2282 Yakutat City and Borough Alaska 83.4
40069 Johnston Oklahoma 84.0
15007 Kauai Hawaii 84.1
40003 Alfalfa Oklahoma 85.1
9 b) Hawaii is the state with the lowest mean of race percent variables.
#### 9 b
# Group by state.

low_state_mean <- acs_data %>% #### New Variable
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>% 
  dplyr::group_by(state) %>% 
  summarize(mean_race_pct_sum = mean(race_pct_sum, na.rm = TRUE)) %>% 
  dplyr::select(state, mean_race_pct_sum) %>% #### To select variables
  dplyr::arrange(mean_race_pct_sum) %>% #### Identify the variable to arrange
  dplyr::top_n(-3, mean_race_pct_sum) ##### Take top 3 but in descending order
low_state_mean %>% #### This is for styling the table
  knitr::kable(caption = "Table 9 b: State with Lowest Percent of Mean Race Variables", 
               digits = 2) %>% #### Reduced to 2 digits
  kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
Table 9 b: State with Lowest Percent of Mean Race Variables
state mean_race_pct_sum
Hawaii 84.00
Alaska 92.71
Oklahoma 92.82
9 c) Counties with percentages over 100%: Five (5) counties listed below have 100.1 percentages of race totals.
#### 9 c

pct_race <- acs_data %>% #### New Variable
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>% 
  dplyr::select(census_id, county, state, race_pct_sum) %>% #### To select variables
  dplyr::filter(race_pct_sum > 100) %>% 
  dplyr::arrange(race_pct_sum) %>% #### Identify the variable to arrange
  dplyr::top_n(10, race_pct_sum) ##### Take top 10 
pct_race %>% #### This is for styling the table
  knitr::kable(caption = "Table 9 c: Counties with Percentages over 100%", 
               digits = 4) %>% #### Reduced to 2 digits
  kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
Table 9 c: Counties with Percentages over 100%
census_id county state race_pct_sum
28021 Claiborne Mississippi 100.0
48131 Duval Texas 100.0
48261 Kenedy Texas 100.0
48263 Kent Texas 100.0
48377 Presidio Texas 100.0
49001 Beaver Utah 100.0
31125 Nance Nebraska 100.1
31091 Hooker Nebraska 100.1
48017 Bailey Texas 100.1
48137 Edwards Texas 100.1
31073 Gosper Nebraska 100.1
9 d) There are 27 states with percentage totals that equal exactly 100.
#### 9 d
# Group by state.

pct_race <- acs_data %>% #### New Variable
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>% 
  dplyr::select(census_id, county, state, race_pct_sum) %>% #### To select variables
  dplyr::filter(race_pct_sum == 100) %>% 
  dplyr::arrange(race_pct_sum) %>% #### Identify the variable to arrange
  dplyr::count(race_pct_sum) ##### Count States equal to 100 
pct_race %>% #### This is for styling the table
  knitr::kable(caption = "Table 9 d: State Count with Race Percentages Equal to  100%", 
               digits = 4) %>% #### Reduced to 2 digits
  kableExtra::kable_styling(bootstrap_options = "striped") #### Adds row stripes
Table 9 d: State Count with Race Percentages Equal to 100%
race_pct_sum n
100 27
10- a) Create new variable carpool_rank. The highest rank is Clay, Georgia.
10- b) Create table with carpool_rank showing the highest rank for carpool counties.
acs_data %>% 
  mutate(carpool_rank = rank(desc(carpool))) %>% # To make new variable
  dplyr::select(census_id, county, state, carpool, carpool_rank) %>%
  dplyr::arrange(carpool_rank) %>%  # Ranking of Counties
  dplyr::top_n(-10, carpool_rank) %>% # Descending order
knitr::kable(caption = "Table 10-a and b: Counties with Highest Carpool Rank") %>%
  kableExtra::kable_styling(bootstrap_options = "striped")
Table 10-a and b: Counties with Highest Carpool Rank
census_id county state carpool carpool_rank
13061 Clay Georgia 29.9 1
18087 LaGrange Indiana 27.0 2
13165 Jenkins Georgia 25.3 3
5133 Sevier Arkansas 24.4 4
20175 Seward Kansas 23.4 5
48079 Cochran Texas 22.8 6
48247 Jim Hogg Texas 22.6 7
48393 Roberts Texas 22.4 8
39075 Holmes Ohio 21.8 9
21197 Powell Kentucky 21.6 10
10-c) Lowest rank counties for carpool_rank. The lowest rank carpool counties are Hyde, South Dakota and Norton City, Virginia.
acs_data %>% 
  mutate(carpool_rank = rank(desc(carpool))) %>% # To make new variable
  dplyr::select(census_id, county, state, carpool, carpool_rank) %>%
  dplyr::arrange(carpool_rank) %>%  # Ranking of Counties
  dplyr::top_n(10, carpool_rank) %>% # Descending order
knitr::kable(caption = "Table 10-c: Counties with Lowest Carpool Rank") %>%
  kableExtra::kable_styling(bootstrap_options = "striped")
Table 10-c: Counties with Lowest Carpool Rank
census_id county state carpool carpool_rank
46069 Hyde South Dakota 2.8 3132.5
51720 Norton city Virginia 2.8 3132.5
30019 Daniels Montana 2.6 3134.5
31057 Dundy Nebraska 2.6 3134.5
13309 Wheeler Georgia 2.3 3136.5
38029 Emmons North Dakota 2.3 3136.5
36061 New York New York 1.9 3138.0
31183 Wheeler Nebraska 1.3 3139.0
48235 Irion Texas 0.9 3140.0
48261 Kenedy Texas 0.0 3141.5
48269 King Texas 0.0 3141.5
10-d) On average the state with the best carpool_rank is District of Columbia.
10-e) Top five states with highest average carpool_rank.
acs_data %>% 
  mutate(carpool_rank = rank(carpool)) %>% # To make new variable
  dplyr::group_by(state) %>% # Group by state
  dplyr::summarize(mean_carpool_rank = mean(carpool_rank, na.rm = TRUE)) %>% 
  dplyr::arrange(mean_carpool_rank) %>%  # Get average and arrange
  dplyr::top_n(-5, mean_carpool_rank) %>% # Descending order top 5 states
knitr::kable(caption = "Table 10-e: States with Highest Average Carpool Rank",
             digits = 2) %>%
  kableExtra::kable_styling(bootstrap_options = "striped")
Table 10-e: States with Highest Average Carpool Rank
state mean_carpool_rank
District of Columbia 76.50
Massachusetts 557.11
Connecticut 624.81
Rhode Island 689.60
New Jersey 705.00