state and county, and the rest are numeric. These variable types represent the data well and do not need to by changed.glimpse(acs_2015_county_data_revised)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1~
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", ~
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul~
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664~
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1~
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, ~
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, ~
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9~
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8~
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, ~
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, ~
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,~
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,~
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,~
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1~
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3~
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2~
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1~
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1~
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13~
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2~
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8~
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1~
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, ~
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, ~
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, ~
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, ~
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2~
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136~
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7~
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1~
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, ~
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, ~
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9~
income and child_poverty columns. In this case it is best to leave them and adjust later if needed during calculations as they will not cause issues. I do not want to fill these NAs with a mean or median in order to keep true to the original data as much as possible.colSums(is.na(acs_2015_county_data_revised))
employment it makes sense for it to match other columns around it and also become a percentage to match.summary(acs_2015_county_data_revised)
acs_2015_county_data_revised %>%
mutate(pct_employed = employed/total_pop * 100) %>%
summarise(pct_employed)
acs_2015_county_data_revised %>%
count(acs_2015_county_data_revised$women > acs_2015_county_data_revised$men)
acs_2015_county_data_revised %>%
count(acs_2015_county_data_revised < 10)
high_commute <- acs_2015_county_data_revised %>%
select(census_id, county, state, mean_commute) %>%
top_n(10, mean_commute) %>%
arrange(desc(mean_commute))
high_commute %>%
knitr::kable(caption = "States with the Highest Commute (Min)") %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| census_id | county | state | mean_commute |
|---|---|---|---|
| 42103 | Pike | Pennsylvania | 44.0 |
| 36005 | Bronx | New York | 43.0 |
| 24017 | Charles | Maryland | 42.8 |
| 51187 | Warren | Virginia | 42.7 |
| 36081 | Queens | New York | 42.6 |
| 36085 | Richmond | New York | 42.6 |
| 51193 | Westmoreland | Virginia | 42.5 |
| 8093 | Park | Colorado | 42.4 |
| 36047 | Kings | New York | 41.7 |
| 54015 | Clay | West Virginia | 41.4 |
women_pct <- acs_2015_county_data_revised %>%
mutate(pct_women = women/total_pop * 100) %>%
select(census_id, county, state, pct_women) %>%
top_n(-10, pct_women) %>%
arrange(pct_women)
women_pct %>%
knitr::kable(caption = "Counties with the Lowest Percentage of Women",
digits=2) %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| census_id | county | state | pct_women |
|---|---|---|---|
| 42053 | Forest | Pennsylvania | 26.78 |
| 8011 | Bent | Colorado | 31.37 |
| 51183 | Sussex | Virginia | 31.47 |
| 13309 | Wheeler | Georgia | 32.10 |
| 6035 | Lassen | California | 33.17 |
| 48095 | Concho | Texas | 33.28 |
| 13053 | Chattahoochee | Georgia | 33.36 |
| 2013 | Aleutians East Borough | Alaska | 33.47 |
| 22125 | West Feliciana | Louisiana | 33.65 |
| 32027 | Pershing | Nevada | 33.73 |
pct_race <- acs_2015_county_data_revised %>%
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
select(census_id, county, state, race_pct_sum) %>%
top_n(-10, race_pct_sum) %>%
arrange(race_pct_sum)
pct_race %>%
knitr::kable(caption = "Top 10 Counties with Lowest Sum of Race Percentage Variables",
digits=2) %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| census_id | county | state | race_pct_sum |
|---|---|---|---|
| 15001 | Hawaii | Hawaii | 76.4 |
| 15009 | Maui | Hawaii | 79.2 |
| 40097 | Mayes | Oklahoma | 79.7 |
| 15003 | Honolulu | Hawaii | 81.5 |
| 40123 | Pontotoc | Oklahoma | 82.8 |
| 47061 | Grundy | Tennessee | 83.0 |
| 2282 | Yakutat City and Borough | Alaska | 83.4 |
| 40069 | Johnston | Oklahoma | 84.0 |
| 15007 | Kauai | Hawaii | 84.1 |
| 40003 | Alfalfa | Oklahoma | 85.1 |
low_state_mean <- acs_2015_county_data_revised %>%
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
group_by(state) %>%
summarize(mean_race_pct_sum = mean(race_pct_sum, na.rm = TRUE)) %>%
select(state, mean_race_pct_sum) %>%
arrange(mean_race_pct_sum) %>%
top_n(-3, mean_race_pct_sum)
low_state_mean %>%
knitr::kable(caption = "State with Lowest Sum of Race Percentage Variables",
digits = 2) %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| state | mean_race_pct_sum |
|---|---|
| Hawaii | 84.00 |
| Alaska | 92.71 |
| Oklahoma | 92.82 |
pct_race <- acs_2015_county_data_revised %>%
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
select(census_id, county, state, race_pct_sum) %>%
filter(race_pct_sum > 100.01) %>%
arrange(race_pct_sum) %>%
top_n(10, race_pct_sum)
pct_race %>%
knitr::kable(caption = "Counties with Race Percentage Varibles That Sum Over 100%",
digits = 2) %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| census_id | county | state | race_pct_sum |
|---|---|---|---|
| 31125 | Nance | Nebraska | 100.1 |
| 31091 | Hooker | Nebraska | 100.1 |
| 48017 | Bailey | Texas | 100.1 |
| 48137 | Edwards | Texas | 100.1 |
| 31073 | Gosper | Nebraska | 100.1 |
pct_race <- acs_2015_county_data_revised %>%
mutate(race_pct_sum = hispanic + white + black + native + asian + pacific) %>%
select(census_id, county, state, race_pct_sum) %>%
filter(race_pct_sum == 100.00) %>%
arrange(race_pct_sum) %>%
count(race_pct_sum)
pct_race %>%
knitr::kable(caption = "Table 9d",
digits = 2) %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| race_pct_sum | n |
|---|---|
| 100 | 27 |
carpool_rank was created.acs_2015_county_data_revised %>%
mutate(carpool_rank = rank(desc(carpool))) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-10, carpool_rank) %>%
knitr::kable(caption = "Highest Ranked Counties for Carpooling") %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| census_id | county | state | carpool | carpool_rank |
|---|---|---|---|---|
| 13061 | Clay | Georgia | 29.9 | 1 |
| 18087 | LaGrange | Indiana | 27.0 | 2 |
| 13165 | Jenkins | Georgia | 25.3 | 3 |
| 5133 | Sevier | Arkansas | 24.4 | 4 |
| 20175 | Seward | Kansas | 23.4 | 5 |
| 48079 | Cochran | Texas | 22.8 | 6 |
| 48247 | Jim Hogg | Texas | 22.6 | 7 |
| 48393 | Roberts | Texas | 22.4 | 8 |
| 39075 | Holmes | Ohio | 21.8 | 9 |
| 21197 | Powell | Kentucky | 21.6 | 10 |
acs_2015_county_data_revised %>%
mutate(carpool_rank = rank(carpool)) %>%
group_by(state) %>%
summarize(mean_carpool_rank = mean(carpool_rank, na.rm = TRUE)) %>%
arrange(mean_carpool_rank) %>%
top_n(-5, mean_carpool_rank) %>%
knitr::kable(caption = "States on Average that are Best for Carpooling",
digits = 2) %>%
kableExtra::kable_styling(bootstrap_options = "basic")
| state | mean_carpool_rank |
|---|---|
| District of Columbia | 76.50 |
| Massachusetts | 557.11 |
| Connecticut | 624.81 |
| Rhode Island | 689.60 |
| New Jersey | 705.00 |