setwd("C:/Users/This PC/Downloads/MSBA learning Data Wragling/Week 6 data files/Individual assignment 3")
library(tidyverse)
acs_2015 <- read_csv("acs_2015_county_data_revised.csv")
The dataset contains 3,142 rows, with each row representing a county, and 35 columns, each corresponding to a distinct variable collected in the survey.
dim(acs_2015)
## [1] 3142 35
str(acs_2015)
spc_tbl_ [3,142 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame) $ census_id : num [1:3142] 1001 1003 1005 1007 1009 ... $ state : chr [1:3142] "Alabama" "Alabama" "Alabama" "Alabama" ... $ county : chr [1:3142] "Autauga" "Baldwin" "Barbour" "Bibb" ... $ total_pop : num [1:3142] 55221 195121 26932 22604 57710 ... $ men : num [1:3142] 26745 95314 14497 12073 28512 ... $ women : num [1:3142] 28476 99807 12435 10531 29198 ... $ hispanic : num [1:3142] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ... $ white : num [1:3142] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ... $ black : num [1:3142] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ... $ native : num [1:3142] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ... $ asian : num [1:3142] 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ... $ pacific : num [1:3142] 0 0 0 0 0 0 0 0 0 0 ... $ citizen : num [1:3142] 40725 147695 20714 17495 42345 ... $ income : num [1:3142] 51281 50254 32964 38678 45813 ... $ income_per_cap: num [1:3142] 24974 27317 16824 18431 20532 ... $ poverty : num [1:3142] 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ... $ child_poverty : num [1:3142] 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ... $ professional : num [1:3142] 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ... $ service : num [1:3142] 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ... $ office : num [1:3142] 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ... $ construction : num [1:3142] 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ... $ production : num [1:3142] 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ... $ drive : num [1:3142] 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ... $ carpool : num [1:3142] 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ... $ transit : num [1:3142] 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ... $ walk : num [1:3142] 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ... $ other_transp : num [1:3142] 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ... $ work_at_home : num [1:3142] 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ... $ mean_commute : num [1:3142] 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ... $ employed : num [1:3142] 23986 85953 8597 8294 22189 ... $ private_work : num [1:3142] 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ... $ public_work : num [1:3142] 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ... $ self_employed : num [1:3142] 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ... $ family_work : num [1:3142] 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ... $ unemployment : num [1:3142] 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ... - attr(*, "spec")= .. cols( .. census_id = col_double(), .. state = col_character(), .. county = col_character(), .. total_pop = col_double(), .. men = col_double(), .. women = col_double(), .. hispanic = col_double(), .. white = col_double(), .. black = col_double(), .. native = col_double(), .. asian = col_double(), .. pacific = col_double(), .. citizen = col_double(), .. income = col_double(), .. income_per_cap = col_double(), .. poverty = col_double(), .. child_poverty = col_double(), .. professional = col_double(), .. service = col_double(), .. office = col_double(), .. construction = col_double(), .. production = col_double(), .. drive = col_double(), .. carpool = col_double(), .. transit = col_double(), .. walk = col_double(), .. other_transp = col_double(), .. work_at_home = col_double(), .. mean_commute = col_double(), .. employed = col_double(), .. private_work = col_double(), .. public_work = col_double(), .. self_employed = col_double(), .. family_work = col_double(), .. unemployment = col_double() .. ) - attr(*, "problems")=<externalptr>
total_pop, men,
women, citizen, income,
income_per_cap, and employed were converted
from “dbl” to “integer” because they represent
whole-number counts. Using integers makes the data
cleaner, easier to read, and avoids unnecessary decimal points during
analysis.census_id was converted from
“dbl” to “character” because it’s a unique
identifier, not a numeric value for calculation. Keeping it as
a character prevents rounding errors and ensures it’s treated as a label
when filtering or joining data.acs_2015 <- acs_2015 %>%
mutate(
census_id = as.character(census_id),
total_pop = as.integer(total_pop),
men = as.integer(men),
women = as.integer(women),
citizen = as.integer(citizen),
income = as.integer(income),
income_per_cap = as.integer(income_per_cap),
employed = as.integer(employed)
)
glimpse(acs_2015)
Rows: 3,142 Columns: 35 $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "1013",… $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", … $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul… $ total_pop <int> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664… $ men <int> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1… $ women <int> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, … $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, … $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9… $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8… $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, … $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, … $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, … $ citizen <int> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,… $ income <int> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,… $ income_per_cap <int> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,… $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1… $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3… $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2… $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1… $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1… $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13… $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2… $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8… $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1… $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, … $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, … $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, … $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, … $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2… $ employed <int> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136… $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7… $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1… $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, … $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, … $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
colSums(is.na(acs_2015))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
income and one in child_poverty. Since both
are key variables and only one value is missing in each, I chose to
impute using the median of the column.acs_2015 <- acs_2015 %>%
mutate(
income = ifelse(is.na(income), median(income, na.rm = TRUE), income),
child_poverty = ifelse(is.na(child_poverty), median(child_poverty, na.rm = TRUE), child_poverty)
)
colSums(is.na(acs_2015))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 0 0
## poverty child_poverty professional service office
## 0 0 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
total_pop,
men, women, citizen, and employed
using the summary() function. These variables represent
population counts and employment rates, which are central to my
analysis. During this inspection, I noticed that some values were
unusually high compared to the rest of the
dataset.summary(acs_2015)
census_id state county total_pop
Length:3142 Length:3142 Length:3142 Min. : 85
Class :character Class :character Class :character 1st Qu.: 11028
Mode :character Mode :character Mode :character Median : 25768
Mean : 100737
3rd Qu.: 67552
Max. :10038388
men women hispanic white
Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
Median : 12826 Median : 12907 Median : 3.700 Median :84.60
Mean : 49565 Mean : 51172 Mean : 8.826 Mean :77.28
3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
black native asian pacific
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
citizen income income_per_cap poverty
Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
Median : 19434 Median : 45111 Median :23577 Median :16.0
Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
3rd Qu.: 50728 3rd Qu.: 52249 3rd Qu.:27138 3rd Qu.:20.3
Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
child_poverty professional service office
Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
Median :22.50 Median :30.00 Median :18.00 Median :22.40
Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
3rd Qu.:29.48 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
construction production drive carpool
Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
Median :12.20 Median :15.40 Median :80.60 Median : 9.90
Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
transit walk other_transp work_at_home
Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
mean_commute employed private_work public_work
Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
Median :22.90 Median : 10644 Median :75.80 Median :16.10
Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
self_employed family_work unemployment
Min. : 0.000 Min. :0.0000 Min. : 0.000
1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
Median : 6.900 Median :0.2000 Median : 7.500
Mean : 7.921 Mean :0.2915 Mean : 7.815
3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
Max. :36.600 Max. :9.8000 Max. :29.400
acs_2015_2 <- acs_2015 %>%
mutate(
total_pop = ifelse(total_pop < (quantile(total_pop, 0.25) - 15 * IQR(total_pop)) |
total_pop > (quantile(total_pop, 0.75) + 15 * IQR(total_pop)), NA, total_pop),
men = ifelse(men < (quantile(men, 0.25) - 15 * IQR(men)) |
men > (quantile(men, 0.75) + 15 * IQR(men)), NA, men),
women = ifelse(women < (quantile(women, 0.25) - 15 * IQR(women)) |
women > (quantile(women, 0.75) + 15 * IQR(women)), NA, women),
citizen = ifelse(citizen < (quantile(citizen, 0.25) - 15 * IQR(citizen)) |
citizen > (quantile(citizen, 0.75) + 15 * IQR(citizen)), NA, citizen),
employed = ifelse(employed < (quantile(employed, 0.25) - 15 * IQR(employed)) |
employed > (quantile(employed, 0.75) + 15 * IQR(employed)), NA, employed)
) %>%
drop_na(total_pop, men, women, citizen, employed)
summary(acs_2015_2)
census_id state county total_pop
Length:3073 Length:3073 Length:3073 Min. : 85
Class :character Class :character Class :character 1st Qu.: 10765
Mode :character Mode :character Mode :character Median : 25044
Mean : 66681
3rd Qu.: 62296
Max. :865736
men women hispanic white
Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
1st Qu.: 5420 1st Qu.: 5387 1st Qu.: 1.800 1st Qu.:66.60
Median : 12439 Median : 12439 Median : 3.600 Median :85.10
Mean : 32912 Mean : 33769 Mean : 8.552 Mean :77.88
3rd Qu.: 30828 3rd Qu.: 31290 3rd Qu.: 8.600 3rd Qu.:93.50
Max. :444547 Max. :424978 Max. :98.700 Max. :99.80
black native asian pacific
Min. : 0.000 Min. : 0.0 Min. : 0.000 Min. : 0.00000
1st Qu.: 0.500 1st Qu.: 0.1 1st Qu.: 0.200 1st Qu.: 0.00000
Median : 1.900 Median : 0.3 Median : 0.500 Median : 0.00000
Mean : 8.718 Mean : 1.8 Mean : 1.085 Mean : 0.08096
3rd Qu.: 9.400 3rd Qu.: 0.6 3rd Qu.: 1.100 3rd Qu.: 0.00000
Max. :85.900 Max. :92.1 Max. :36.800 Max. :35.30000
citizen income income_per_cap poverty
Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.40
1st Qu.: 8087 1st Qu.: 38678 1st Qu.:20412 1st Qu.:12.00
Median : 18899 Median : 44866 Median :23473 Median :16.00
Mean : 48482 Mean : 46467 Mean :24141 Mean :16.75
3rd Qu.: 46464 3rd Qu.: 51976 3rd Qu.:26968 3rd Qu.:20.30
Max. :596879 Max. :123453 Max. :65600 Max. :53.30
child_poverty professional service office
Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
1st Qu.:16.10 1st Qu.:26.60 1st Qu.:15.90 1st Qu.:20.10
Median :22.50 Median :29.90 Median :18.00 Median :22.30
Mean :23.35 Mean :30.83 Mean :18.26 Mean :22.08
3rd Qu.:29.60 3rd Qu.:34.10 3rd Qu.:20.20 3rd Qu.:24.20
Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
construction production drive carpool
Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
1st Qu.:10.00 1st Qu.:11.70 1st Qu.:76.70 1st Qu.: 8.50
Median :12.30 Median :15.60 Median :80.70 Median :10.00
Mean :12.87 Mean :15.96 Mean :79.26 Mean :10.36
3rd Qu.:15.00 3rd Qu.:19.50 3rd Qu.:83.70 3rd Qu.:11.90
Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
transit walk other_transp work_at_home
Min. : 0.0000 Min. : 0.00 Min. : 0.000 Min. : 0.000
1st Qu.: 0.1000 1st Qu.: 1.40 1st Qu.: 0.900 1st Qu.: 2.800
Median : 0.3000 Median : 2.40 Median : 1.300 Median : 4.000
Mean : 0.7708 Mean : 3.31 Mean : 1.606 Mean : 4.698
3rd Qu.: 0.8000 3rd Qu.: 4.00 3rd Qu.: 1.900 3rd Qu.: 5.700
Max. :41.0000 Max. :71.20 Max. :39.100 Max. :37.200
mean_commute employed private_work public_work
Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
1st Qu.:19.30 1st Qu.: 4416 1st Qu.:70.80 1st Qu.:13.10
Median :22.80 Median : 10273 Median :75.70 Median :16.20
Mean :23.03 Mean : 30237 Mean :74.29 Mean :17.45
3rd Qu.:26.50 3rd Qu.: 26689 3rd Qu.:79.60 3rd Qu.:20.20
Max. :44.00 Max. :398116 Max. :88.30 Max. :66.20
self_employed family_work unemployment
Min. : 0.000 Min. :0.000 Min. : 0.000
1st Qu.: 5.400 1st Qu.:0.100 1st Qu.: 5.400
Median : 6.900 Median :0.200 Median : 7.500
Mean : 7.971 Mean :0.295 Mean : 7.798
3rd Qu.: 9.400 3rd Qu.:0.300 3rd Qu.: 9.700
Max. :36.600 Max. :9.800 Max. :29.400
I found that 1,923 out of 3,073
counties have more women than men, which represents
62.6% of the total. I compared the women
and men columns directly, counting how many counties meet
the condition.
acs_2015_2 %>%
summarise(
total_counties = n(),
more_women_counties = sum(women > men),
ratio = round(100 * more_women_counties / total_counties, 2)
)
## # A tibble: 1 × 3
## total_counties more_women_counties ratio
## <int> <int> <dbl>
## 1 3073 1923 62.6
Out of 3,073 counties, 2,369 have an unemployment rate below 10%, which accounts for 77.1% of the total. I counted all counties, filter those with low unemployment, and compute the percentage.
acs_2015_2 %>%
summarise(
total_counties = n(),
low_unemployment_countries = sum(unemployment < 10),
ratio = round(100 * low_unemployment_countries / total_counties, 2)
)
## # A tibble: 1 × 3
## total_counties low_unemployment_countries ratio
## <int> <int> <dbl>
## 1 3073 2369 77.1
I selected the relevant columns, sorted all counties by descending commute time, and extracted the top 10.
acs_2015_2 %>%
select(census_id, county, state, mean_commute) %>%
arrange(desc(mean_commute)) %>%
top_n(10, mean_commute)
## # A tibble: 10 × 4
## census_id county state mean_commute
## <chr> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 24017 Charles Maryland 42.8
## 3 51187 Warren Virginia 42.7
## 4 36085 Richmond New York 42.6
## 5 51193 Westmoreland Virginia 42.5
## 6 8093 Park Colorado 42.4
## 7 54015 Clay West Virginia 41.4
## 8 17013 Calhoun Illinois 41.1
## 9 28041 Greene Mississippi 41
## 10 48407 San Jacinto Texas 40.9
I created a new variable called percent_of_women by
dividing the number of women by the total population and multiplying by
100. Then I selected the relevant columns and
sorted the data in ascending order to find the 10
counties with the lowest percentage of women.
acs_2015_2 %>%
mutate(percent_of_women = women / total_pop * 100) %>%
select(census_id, county, state, percent_of_women) %>%
arrange(percent_of_women) %>%
slice_head(n = 10)
## # A tibble: 10 × 4
## census_id county state percent_of_women
## <chr> <chr> <chr> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
I created a new variable called race_percent_total by
summing the percentage values of six racial
groups.
acs_2015_2 <- acs_2015_2 %>%
mutate(race_percent_total = rowSums(select(
., hispanic, white, black, native, asian, pacific)))
summary(acs_2015_2$race_percent_total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 76.40 97.80 98.50 98.12 99.00 100.10
I selected the relevant columns and
sorted all counties by ascending
race_percent_total to find the 10 counties with the lowest
combined racial percentages.
acs_2015_2 %>%
select(census_id, county, state, race_percent_total) %>%
arrange(race_percent_total) %>%
slice_head(n = 10)
## # A tibble: 10 × 4
## census_id county state race_percent_total
## <chr> <chr> <chr> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 40123 Pontotoc Oklahoma 82.8
## 5 47061 Grundy Tennessee 83
## 6 2282 Yakutat City and Borough Alaska 83.4
## 7 40069 Johnston Oklahoma 84
## 8 15007 Kauai Hawaii 84.1
## 9 40003 Alfalfa Oklahoma 85.1
## 10 40135 Sequoyah Oklahoma 86.4
I grouped the data by state, calculated the
average race_percent_total, and
filtered for the state with the lowest average.
acs_2015_2 %>%
group_by(state) %>%
summarise(avg_race_percent = mean(race_percent_total, na.rm = TRUE)) %>%
filter(avg_race_percent == min(avg_race_percent))
## # A tibble: 1 × 2
## state avg_race_percent
## <chr> <dbl>
## 1 Hawaii 84.6
I filtered for counties where the
total race percentage exceeds 100%,
which may indicate overlapping racial identities, rounding artifacts, or
data entry inconsistencies. To make these subtle overages visible, I
used round(..., 4) to display the
values with four decimal places while preserving their numeric
type.
acs_2015_2 %>%
select(census_id, county, state, race_percent_total) %>%
filter(race_percent_total > 100) %>%
mutate(race_percent_display = format(race_percent_total, digits = 4))
## # A tibble: 5 × 5
## census_id county state race_percent_total race_percent_display
## <chr> <chr> <chr> <dbl> <chr>
## 1 31073 Gosper Nebraska 100. 100.1
## 2 31091 Hooker Nebraska 100. 100.1
## 3 31125 Nance Nebraska 100. 100.1
## 4 48017 Bailey Texas 100. 100.1
## 5 48137 Edwards Texas 100. 100.1
race_percent_total for each one.
Then I filtered for states where the average equals
exactly 100%.count() to determine how many
such states exist, and I found 0. If any states had hit
exactly 100%, they likely would have been statistical
outliers.acs_2015_2 %>%
group_by(state) %>%
summarise(avg_race_percent = mean(race_percent_total, na.rm = TRUE)) %>%
filter(avg_race_percent == 100) %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 0
carpool_rank to find the 10 highest ranked
counties for carpooling:carpool_rank
using min_rank(desc(carpool)). This ranks counties
based on carpool rates, with rank 1 assigned to the
county with the highest carpool value. Although the
rank increases numerically, the values decrease → so a lower rank means
a higher carpool rate.acs_2015_2 %>%
mutate(carpool_rank = min_rank(desc(carpool))) %>%
filter(carpool_rank <= 10) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank)
## # A tibble: 10 × 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
I filtered for counties with the largest rank values, those with the lowest carpool rates and arranged the results in descending rank order so that the county with the lowest carpool rate appears first.
acs_2015_2 %>%
mutate(carpool_rank = min_rank(desc(carpool))) %>%
filter(carpool_rank >= max(carpool_rank) - 9) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(desc(carpool_rank))
## # A tibble: 11 × 5
## census_id county state carpool carpool_rank
## <chr> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3072
## 2 48269 King Texas 0 3072
## 3 48235 Irion Texas 0.9 3071
## 4 31183 Wheeler Nebraska 1.3 3070
## 5 13309 Wheeler Georgia 2.3 3068
## 6 38029 Emmons North Dakota 2.3 3068
## 7 30019 Daniels Montana 2.6 3066
## 8 31057 Dundy Nebraska 2.6 3066
## 9 46069 Hyde South Dakota 2.8 3064
## 10 51720 Norton city Virginia 2.8 3064
## 11 8057 Jackson Colorado 2.9 3063
round(..., 4) to preserve
numeric precision for filtering and comparison.format(..., digits = 4).acs_2015_2 %>%
mutate(carpool_rank = min_rank(desc(carpool))) %>%
group_by(state) %>%
summarise(
avg_rank = round(mean(carpool_rank, na.rm = TRUE), 4),
avg_rank_display = format(mean(carpool_rank, na.rm = TRUE), digits = 4)
) %>%
arrange(avg_rank) %>%
slice_head(n = 5)
## # A tibble: 5 × 3
## state avg_rank avg_rank_display
## <chr> <dbl> <chr>
## 1 Arizona 909. 908.8
## 2 Utah 1015. 1015
## 3 Arkansas 1040. 1040
## 4 Alaska 1069. 1069
## 5 Nevada 1071. 1071