Question 1 - Importing the Data

There are 3,142 rows and 35 columns in this data set.

Question 2 - Variable Changes

These variables are mostly numeric except state and county, and the rest are numeric. These variable types represent the data well and do not need to by changed.
glimpse(acs_2015_county_data_revised)
## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~

Question 3 - Missing Values

This data set is large and there are only 2 values that classify as NA in income and child_poverty columns. In this case it is best to leave them and adjust later if needed during calculations as they will not cause issues. I do not want to fill these NAs with a mean or median in order to keep true to the original data as much as possible.
colSums(is.na(acs_2015_county_data_revised))

Question 4 - Unusual Values

When looking at employment it makes sense for it to match other columns around it and also become a percentage to match.
summary(acs_2015_county_data_revised)
acs_2015_county_data_revised %>% 
  mutate(pct_employed = employed/total_pop * 100) %>%
  summarise(pct_employed)

Question 5 - Counties Gender Percentages

There are 1,985 counties with more women than men.
acs_2015_county_data_revised %>%
  count(acs_2015_county_data_revised$women > acs_2015_county_data_revised$men)

Question 6 - Unemployment Rates

There are 2,420 counties with unemployment under 10%.
acs_2015_county_data_revised %>%
  count(acs_2015_county_data_revised < 10)

Question 7 - Mean Commute

Pennsylvania has the longest mean commute at 44 minutes.
high_commute <- acs_2015_county_data_revised %>%
  select(census_id, county, state, mean_commute) %>%
  top_n(10, mean_commute) %>%
  arrange(desc(mean_commute))
high_commute %>%
  knitr::kable(caption = "States with the Highest Commute (Min)") %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
States with the Highest Commute (Min)
census_id county state mean_commute
42103 Pike Pennsylvania 44.0
36005 Bronx New York 43.0
24017 Charles Maryland 42.8
51187 Warren Virginia 42.7
36081 Queens New York 42.6
36085 Richmond New York 42.6
51193 Westmoreland Virginia 42.5
8093 Park Colorado 42.4
36047 Kings New York 41.7
54015 Clay West Virginia 41.4

Question 8 - Percentage of Women

Forest County, Pennsylvania has the lowest percentage of women.
women_pct <- acs_2015_county_data_revised %>%
  mutate(pct_women = women/total_pop * 100) %>%
  select(census_id, county, state, pct_women) %>%
  top_n(-10, pct_women) %>%
  arrange(pct_women)
women_pct %>%
  knitr::kable(caption = "Counties with the Lowest Percentage of Women", 
               digits=2) %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
Counties with the Lowest Percentage of Women
census_id county state pct_women
42053 Forest Pennsylvania 26.78
8011 Bent Colorado 31.37
51183 Sussex Virginia 31.47
13309 Wheeler Georgia 32.10
6035 Lassen California 33.17
48095 Concho Texas 33.28
13053 Chattahoochee Georgia 33.36
2013 Aleutians East Borough Alaska 33.47
22125 West Feliciana Louisiana 33.65
32027 Pershing Nevada 33.73

Question 9 - Race Percentages

9a) Top 10 Counties with Lowest Sum of Race Percentage Variables
Hawaii, Hawaii has the lowest sum of race percentage variables.
pct_race <- acs_2015_county_data_revised %>%
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
  select(census_id, county, state, race_pct_sum) %>%
  top_n(-10, race_pct_sum) %>%
  arrange(race_pct_sum) 
pct_race %>%
  knitr::kable(caption = "Top 10 Counties with Lowest Sum of Race Percentage Variables", 
               digits=2) %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
Top 10 Counties with Lowest Sum of Race Percentage Variables
census_id county state race_pct_sum
15001 Hawaii Hawaii 76.4
15009 Maui Hawaii 79.2
40097 Mayes Oklahoma 79.7
15003 Honolulu Hawaii 81.5
40123 Pontotoc Oklahoma 82.8
47061 Grundy Tennessee 83.0
2282 Yakutat City and Borough Alaska 83.4
40069 Johnston Oklahoma 84.0
15007 Kauai Hawaii 84.1
40003 Alfalfa Oklahoma 85.1
9b) State with Lowest Sum of Race Percentage Variables
Hawaii has the lowest sum of race percentage variables.
low_state_mean <- acs_2015_county_data_revised %>%
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
  group_by(state) %>%
  summarize(mean_race_pct_sum = mean(race_pct_sum, na.rm = TRUE)) %>%
  select(state, mean_race_pct_sum) %>%
  arrange(mean_race_pct_sum) %>%
  top_n(-3, mean_race_pct_sum)
low_state_mean %>%
  knitr::kable(caption = "State with Lowest Sum of Race Percentage Variables",
               digits = 2) %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
State with Lowest Sum of Race Percentage Variables
state mean_race_pct_sum
Hawaii 84.00
Alaska 92.71
Oklahoma 92.82
9c) Counties with Race Percentage Varibles That Sum Over 100%
While a few counties are just a bit over 100% none are significant, as this is most likely rounding error. All are less than 100.15%.
pct_race <- acs_2015_county_data_revised %>%
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
  select(census_id, county, state, race_pct_sum)  %>%
  filter(race_pct_sum > 100.01) %>%
  arrange(race_pct_sum) %>%
  top_n(10, race_pct_sum)
pct_race %>%
  knitr::kable(caption = "Counties with Race Percentage Varibles That Sum Over 100%",
               digits = 2) %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
Counties with Race Percentage Varibles That Sum Over 100%
census_id county state race_pct_sum
31125 Nance Nebraska 100.1
31091 Hooker Nebraska 100.1
48017 Bailey Texas 100.1
48137 Edwards Texas 100.1
31073 Gosper Nebraska 100.1
9d) States that Sum Exactly 100%
There are 27 states that sum exactly to 100%.
pct_race <- acs_2015_county_data_revised %>%
  mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
  select(census_id, county, state, race_pct_sum)  %>%
  filter(race_pct_sum == 100.00) %>%
  arrange(race_pct_sum) %>%
  count(race_pct_sum)
pct_race %>%
  knitr::kable(caption = "Table 9d",
               digits = 2) %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
Table 9d
race_pct_sum n
100 27

Question 10 - Carpool

10a) A new variable carpool_rank was created.
10b) Highest Ranked Counties for Carpooling
Clay, Georgia is ranked the highest for carpooling at 29.9%.
acs_2015_county_data_revised %>% 
  mutate(carpool_rank = rank(desc(carpool))) %>%
  select(census_id, county, state, carpool, carpool_rank) %>%
  arrange(carpool_rank) %>%  
  top_n(-10, carpool_rank) %>% 
knitr::kable(caption = "Highest Ranked Counties for Carpooling") %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
Highest Ranked Counties for Carpooling
census_id county state carpool carpool_rank
13061 Clay Georgia 29.9 1
18087 LaGrange Indiana 27.0 2
13165 Jenkins Georgia 25.3 3
5133 Sevier Arkansas 24.4 4
20175 Seward Kansas 23.4 5
48079 Cochran Texas 22.8 6
48247 Jim Hogg Texas 22.6 7
48393 Roberts Texas 22.4 8
39075 Holmes Ohio 21.8 9
21197 Powell Kentucky 21.6 10
10 c) Lowest Ranked Counties for Carpooling
Hyde, South Dakota and Norton City, Virginia are ranked lowest for carpooling at 2.8%. These areas may be more rural where it is difficult to share rides.
acs_2015_county_data_revised %>% 
  mutate(carpool_rank = rank(desc(carpool))) %>% 
  select(census_id, county, state, carpool, carpool_rank) %>%
  arrange(carpool_rank) %>% 
  top_n(10, carpool_rank) %>% 
knitr::kable(caption = "Lowest Ranked Counties for Carpooling") %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
Lowest Ranked Counties for Carpooling
census_id county state carpool carpool_rank
46069 Hyde South Dakota 2.8 3132.5
51720 Norton city Virginia 2.8 3132.5
30019 Daniels Montana 2.6 3134.5
31057 Dundy Nebraska 2.6 3134.5
13309 Wheeler Georgia 2.3 3136.5
38029 Emmons North Dakota 2.3 3136.5
36061 New York New York 1.9 3138.0
31183 Wheeler Nebraska 1.3 3139.0
48235 Irion Texas 0.9 3140.0
48261 Kenedy Texas 0.0 3141.5
48269 King Texas 0.0 3141.5
10d) State on Average that is Best for Carpooling
DC is on average the best for carpooling.
10e) States on Average that are Best for Carpooling
acs_2015_county_data_revised %>% 
  mutate(carpool_rank = rank(carpool)) %>%
  group_by(state) %>%
  summarize(mean_carpool_rank = mean(carpool_rank, na.rm = TRUE)) %>% 
  arrange(mean_carpool_rank) %>%
  top_n(-5, mean_carpool_rank) %>%
knitr::kable(caption = "States on Average that are Best for Carpooling",
             digits = 2) %>%
  kableExtra::kable_styling(bootstrap_options = "basic")
States on Average that are Best for Carpooling
state mean_carpool_rank
District of Columbia 76.50
Massachusetts 557.11
Connecticut 624.81
Rhode Island 689.60
New Jersey 705.00