library(tidyverse)
library(magrittr)
library(lubridate)
library(glue)
library(dplyr)
acs <- read_csv("acs_2015_county_data_revised.csv")
## Rows: 3142 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(acs)
## [1] 3142
ncol(acs)
## [1] 35
There are 3,142 rows and 35 columns in the dataset
str(acs)
## spc_tbl_ [3,142 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ census_id : num [1:3142] 1001 1003 1005 1007 1009 ...
## $ state : chr [1:3142] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr [1:3142] "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ total_pop : num [1:3142] 55221 195121 26932 22604 57710 ...
## $ men : num [1:3142] 26745 95314 14497 12073 28512 ...
## $ women : num [1:3142] 28476 99807 12435 10531 29198 ...
## $ hispanic : num [1:3142] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num [1:3142] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num [1:3142] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num [1:3142] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num [1:3142] 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num [1:3142] 0 0 0 0 0 0 0 0 0 0 ...
## $ citizen : num [1:3142] 40725 147695 20714 17495 42345 ...
## $ income : num [1:3142] 51281 50254 32964 38678 45813 ...
## $ income_per_cap: num [1:3142] 24974 27317 16824 18431 20532 ...
## $ poverty : num [1:3142] 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num [1:3142] 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num [1:3142] 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num [1:3142] 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num [1:3142] 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num [1:3142] 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num [1:3142] 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num [1:3142] 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num [1:3142] 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num [1:3142] 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num [1:3142] 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num [1:3142] 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num [1:3142] 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num [1:3142] 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : num [1:3142] 23986 85953 8597 8294 22189 ...
## $ private_work : num [1:3142] 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num [1:3142] 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num [1:3142] 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num [1:3142] 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num [1:3142] 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
## - attr(*, "spec")=
## .. cols(
## .. census_id = col_double(),
## .. state = col_character(),
## .. county = col_character(),
## .. total_pop = col_double(),
## .. men = col_double(),
## .. women = col_double(),
## .. hispanic = col_double(),
## .. white = col_double(),
## .. black = col_double(),
## .. native = col_double(),
## .. asian = col_double(),
## .. pacific = col_double(),
## .. citizen = col_double(),
## .. income = col_double(),
## .. income_per_cap = col_double(),
## .. poverty = col_double(),
## .. child_poverty = col_double(),
## .. professional = col_double(),
## .. service = col_double(),
## .. office = col_double(),
## .. construction = col_double(),
## .. production = col_double(),
## .. drive = col_double(),
## .. carpool = col_double(),
## .. transit = col_double(),
## .. walk = col_double(),
## .. other_transp = col_double(),
## .. work_at_home = col_double(),
## .. mean_commute = col_double(),
## .. employed = col_double(),
## .. private_work = col_double(),
## .. public_work = col_double(),
## .. self_employed = col_double(),
## .. family_work = col_double(),
## .. unemployment = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
acs$census_id <- as.factor(acs$census_id)
acs$men <- as.integer(acs$men)
acs$women <- as.integer(acs$women)
acs$citizen <- as.integer(acs$citizen)
These variables should be whole numbers since you cannot have half a person.
glimpse(acs)
## Rows: 3,142
## Columns: 35
## $ census_id <fct> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men <int> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women <int> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen <int> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
colSums(is.na(acs))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
summary(acs)
## census_id state county total_pop
## 1001 : 1 Length:3142 Length:3142 Min. : 85
## 1003 : 1 Class :character Class :character 1st Qu.: 11028
## 1005 : 1 Mode :character Mode :character Median : 25768
## 1007 : 1 Mean : 100737
## 1009 : 1 3rd Qu.: 67552
## 1011 : 1 Max. :10038388
## (Other):3136
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51172 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
##
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52250 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## NA's :1
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## NA's :1
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
After using the summary function, I did not find any unusual variables in this dataset so I left them all as is.
acs$women_men <- ifelse(acs$men > acs$women, "Men",
ifelse(acs$men < acs$women, "Women", "Equal"))
result <- acs[, c("county", "women_men")]
print(result)
## # A tibble: 3,142 × 2
## county women_men
## <chr> <chr>
## 1 Autauga Women
## 2 Baldwin Women
## 3 Barbour Men
## 4 Bibb Men
## 5 Blount Women
## 6 Bullock Men
## 7 Butler Women
## 8 Calhoun Women
## 9 Chambers Women
## 10 Cherokee Women
## # ℹ 3,132 more rows
The code above creates a variable that labels if there are more men or more women in a county. Then prints by county if there are more men or women.
num_counties <- acs %>%
filter(unemployment < 10) %>%
summarise(total = n())
print(num_counties)
## # A tibble: 1 × 1
## total
## <int>
## 1 2420
The code above creates a vector that filters out the counties where unemployment is less than 10% and then prints that number. There are 2,420 counties that have unemployment rates less than 10%.
top10_commute <- acs %>%
top_n(n = 10, wt = mean_commute) %>%
arrange(desc(mean_commute)) %>%
select(census_id, county, state, mean_commute)
print(top10_commute)
## # A tibble: 10 × 4
## census_id county state mean_commute
## <fct> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
the code above creates a vector with the top ten counties with the highest mean it then selects the census_id, county, state, and mean commute of those top ten counties. Then that is printed and displayed.
create % women
acs_women_percentage <- acs %>%
mutate(pct_women = women / (women + men) * 100)
get top 10 counties with the lowest % women
lowest10 <- acs_women_percentage %>%
arrange(pct_women) %>%
select(census_id, county, state, pct_women) %>%
head(10)
print(lowest10)
## # A tibble: 10 × 4
## census_id county state pct_women
## <fct> <chr> <chr> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
The code above creates percentage of women, then arrange the percent of women and selects the census id, county, state, and percentage of women columns and then prints the counties with the lowest percentage of women.
##9
acs_nine <- acs %>%
mutate(race_sum = hispanic + white + black + native + asian + pacific)
The code above creates a variable called race_sum that calculates the total of the races in this dataset.
lowest10_counties <- acs_nine %>%
arrange(race_sum) %>%
select(census_id, county, state, race_sum) %>%
head(10)
print(lowest10_counties)
## # A tibble: 10 × 4
## census_id county state race_sum
## <fct> <chr> <chr> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
The code above arranges the new column I created and selects the columns census_id, county, state, and race sum. It then displays the counties with the lowest sum of race percentage.
state_lowest_avg <- acs_nine %>%
group_by(state) %>%
summarise(avg_race_sum = mean(race_sum, na.rm = TRUE)) %>%
arrange(avg_race_sum) %>%
slice(1)
print(state_lowest_avg)
## # A tibble: 1 × 2
## state avg_race_sum
## <chr> <dbl>
## 1 Hawaii 84
The code above displays which state has the lowest race sum out of all the states. The state with the lowest is Hawaii.
any_over_100 <- any(acs_nine$race_sum > 100, na.rm = TRUE)
print(any_over_100)
## [1] TRUE
The code above finds any counties that have a race sum of over 100% and then prints them.
num_states_exact_100 <- acs_nine %>%
group_by(state) %>%
filter(race_sum == 100) %>%
summarise(counties_equal_100 = n()) %>%
filter(counties_equal_100 > 0) %>%
nrow()
print(num_states_exact_100)
## [1] 13
The code above groups by state and filters the data to show only instances where race sum is equal to 100. Next it prints the number which is 13 states.
acs_ten <- acs %>%
mutate(carpool_rank = min_rank(desc(carpool)))
This code creates the carpool_rank variable
top10_counties <- acs_ten %>%
arrange(carpool_rank) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
head(10)
top10_counties
## # A tibble: 10 × 5
## census_id county state carpool carpool_rank
## <fct> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
bottom10_counties <- acs_ten %>%
arrange(desc(carpool_rank)) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
head(10)
bottom10_counties
## # A tibble: 10 × 5
## census_id county state carpool carpool_rank
## <fct> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3141
## 2 48269 King Texas 0 3141
## 3 48235 Irion Texas 0.9 3140
## 4 31183 Wheeler Nebraska 1.3 3139
## 5 36061 New York New York 1.9 3138
## 6 13309 Wheeler Georgia 2.3 3136
## 7 38029 Emmons North Dakota 2.3 3136
## 8 30019 Daniels Montana 2.6 3134
## 9 31057 Dundy Nebraska 2.6 3134
## 10 46069 Hyde South Dakota 2.8 3132
state_avg_rank <- acs_ten %>%
group_by(state) %>%
summarise(avg_rank = mean(carpool_rank, na.rm = TRUE)) %>%
arrange(avg_rank)
state_avg_rank
## # A tibble: 51 × 2
## state avg_rank
## <chr> <dbl>
## 1 Arizona 971.
## 2 Utah 1019.
## 3 Arkansas 1055.
## 4 Hawaii 1072.
## 5 Alaska 1087.
## 6 Nevada 1100.
## 7 Texas 1106.
## 8 Wyoming 1107.
## 9 California 1122.
## 10 Missouri 1133.
## # ℹ 41 more rows
top5_states <- state_avg_rank %>%
head(5)
top5_states
## # A tibble: 5 × 2
## state avg_rank
## <chr> <dbl>
## 1 Arizona 971.
## 2 Utah 1019.
## 3 Arkansas 1055.
## 4 Hawaii 1072.
## 5 Alaska 1087.