#install.packages(tidyverse)
library(tidyverse)
setwd("~/Desktop/Grad School/Data Wrangling/Assignment 3")
data <- read_csv("acs_2015_county_data_revised copy.csv")
nrow(data)
## [1] 3142
ncol(data)
## [1] 35
glimpse(data)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
Looking at the variable types from the glimpse(), it doesn’t look like we need to change anything. All the character variables and integer variables are correctly identified in the table data frame.
sum(is.na(data))
## [1] 2
colSums(is.na(data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
data$income[is.na(data$income)] <- median(data$income, na.rm = TRUE)
data$child_poverty[is.na(data$child_poverty)] <- median(data$child_poverty, na.rm = TRUE)
colSums(is.na(data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 0 0
## poverty child_poverty professional service office
## 0 0 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
When I checked for missing values, we see that there are only two. I computed both of those missing values using the median value. This made more sense because I did not want to skew the data by using an average.
summary(data)
## census_id state county total_pop
## Min. : 1001 Length:3142 Length:3142 Min. : 85
## 1st Qu.:18178 Class :character Class :character 1st Qu.: 11028
## Median :29176 Mode :character Mode :character Median : 25768
## Mean :30384 Mean : 100737
## 3rd Qu.:45080 3rd Qu.: 67552
## Max. :56045 Max. :10038388
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51171 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52249 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.48 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
Based on the results from the summary tables for each of the variables, there seems to be no unusual values and nothing needs to be done to deal with them.
count(filter(data, women > men))
## # A tibble: 1 × 1
## n
## <int>
## 1 1985
This code chunk counts the number of rows where there are more women than men. There are 1985 counties that have more women than men.
count(filter(data, unemployment < 0.10))
## # A tibble: 1 × 1
## n
## <int>
## 1 4
There are 4 counties with an unemployment rate less than 10%.
name, state, and the mean_commute in your final answer (sorted by mean_commute).
data %>%
select(census_id, state, mean_commute) %>%
arrange(desc(mean_commute)) %>%
top_n(10)
## # A tibble: 10 × 3
## census_id state mean_commute
## <dbl> <chr> <dbl>
## 1 42103 Pennsylvania 44
## 2 36005 New York 43
## 3 24017 Maryland 42.8
## 4 51187 Virginia 42.7
## 5 36081 New York 42.6
## 6 36085 New York 42.6
## 7 51193 Virginia 42.5
## 8 8093 Colorado 42.4
## 9 36047 New York 41.7
## 10 54015 West Virginia 41.4
the top 10 counties with the lowest percentages. Show the census ID, county name, state, and the percentage in your final answer (sorted by ascending percentage).
data_2 <- data %>%
mutate(percent_women = (women / total_pop) * 100) %>%
select(census_id, county, state, percent_women) %>%
arrange(percent_women) %>%
top_n(10)
data_2
## # A tibble: 10 × 4
## census_id county state percent_women
## <dbl> <chr> <chr> <dbl>
## 1 29117 Livingston Missouri 54.9
## 2 35011 De Baca New Mexico 55.1
## 3 51790 Staunton city Virginia 55.1
## 4 48137 Edwards Texas 55.2
## 5 51091 Highland Virginia 55.3
## 6 51620 Franklin city Virginia 55.5
## 7 28125 Sharkey Mississippi 55.5
## 8 1119 Sumter Alabama 55.7
## 9 13235 Pulaski Georgia 58.0
## 10 51720 Norton city Virginia 59.4
are the “hispanic”, “white”, “black”, “native”, “asian”, and “pacific” variables).
data_3 <- data %>%
mutate(race_sum = rowSums(across(c(hispanic, white, black, native, asian, pacific)), na.rm = TRUE))
variables?
data_3 %>%
arrange() %>%
top_n(10)
## # A tibble: 43 × 36
## census_id state county total_pop men women hispanic white black native
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1065 Alabama Hale 15256 7183 8073 1.1 40 58.2 0.3
## 2 1131 Alabama Wilcox 11235 5376 5859 0.6 27.4 72 0
## 3 13201 Georgia Miller 5936 2837 3099 0.1 68.7 31.1 0.1
## 4 13307 Georgia Webst… 2720 1370 1350 1 43.2 55.8 0
## 5 20199 Kansas Walla… 1595 789 806 3 96.9 0 0
## 6 21031 Kentucky Butler 12835 6450 6385 3.5 95.6 0.7 0.2
## 7 28021 Mississip… Claib… 9330 4329 5001 1.1 13.6 84.2 0.2
## 8 28125 Mississip… Shark… 4805 2139 2666 0.6 27.7 71.3 0
## 9 30019 Montana Danie… 1758 891 867 2.5 96.6 0 0.7
## 10 30069 Montana Petro… 443 232 211 1.4 98.6 0 0
## # ℹ 33 more rows
## # ℹ 26 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>, …
data %>%
mutate(race_sum = rowSums(across(c(hispanic, white, black, native, asian, pacific)), na.rm = TRUE)) %>%
group_by(state) %>%
summarise(avg_race_sum = mean(race_sum, na.rm = TRUE)) %>%
arrange(avg_race_sum) %>%
top_n(1)
## # A tibble: 1 × 2
## state avg_race_sum
## <chr> <dbl>
## 1 Mississippi 99.2
###c. Do any counties have a sum greater than 100%?
data %>%
mutate(race_sum = rowSums(across(c(hispanic, white, black, native, asian, pacific)), na.rm = TRUE)) %>%
filter(race_sum > 100) %>%
arrange(desc(race_sum)) %>%
select(census_id, county, state, race_sum)
## # A tibble: 5 × 4
## census_id county state race_sum
## <dbl> <chr> <chr> <dbl>
## 1 31073 Gosper Nebraska 100.
## 2 31091 Hooker Nebraska 100.
## 3 31125 Nance Nebraska 100.
## 4 48017 Bailey Texas 100.
## 5 48137 Edwards Texas 100.
###d. How many states have a sum that equals exactly to 100%?
data_3 %>%
filter(race_sum == 100) %>%
select(state)
## # A tibble: 38 × 1
## state
## <chr>
## 1 Alabama
## 2 Alabama
## 3 Georgia
## 4 Georgia
## 5 Kansas
## 6 Kentucky
## 7 Mississippi
## 8 Mississippi
## 9 Montana
## 10 Montana
## # ℹ 28 more rows
where the highest ranked county (rank = 1) is the county with the highest carpool value. Read the documentation carefully for the ranking function.
carpool_df <- data %>%
mutate(carpool_rank = min_rank(desc(carpool)))
head(carpool_df, 5)
## # A tibble: 5 × 36
## census_id state county total_pop men women hispanic white black native asian
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… 55221 26745 28476 2.6 75.8 18.5 0.4 1
## 2 1003 Alab… Baldw… 195121 95314 99807 4.5 83.1 9.5 0.6 0.7
## 3 1005 Alab… Barbo… 26932 14497 12435 4.6 46.2 46.7 0.2 0.4
## 4 1007 Alab… Bibb 22604 12073 10531 2.2 74.5 21.4 0.4 0.1
## 5 1009 Alab… Blount 57710 28512 29198 8.6 87.9 1.5 0.3 0.1
## # ℹ 25 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>, carpool_rank <int>
name, state, carpool value, and carpool_rank in your final answer.
carpool_df_top10 <- data %>%
mutate(carpool_rank = min_rank(desc(carpool))) %>%
arrange(carpool_rank) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
top_n(10)
carpool_df_top10
## # A tibble: 11 × 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 46069 Hyde South Dakota 2.8 3132
## 2 51720 Norton city Virginia 2.8 3132
## 3 30019 Daniels Montana 2.6 3134
## 4 31057 Dundy Nebraska 2.6 3134
## 5 13309 Wheeler Georgia 2.3 3136
## 6 38029 Emmons North Dakota 2.3 3136
## 7 36061 New York New York 1.9 3138
## 8 31183 Wheeler Nebraska 1.3 3139
## 9 48235 Irion Texas 0.9 3140
## 10 48261 Kenedy Texas 0 3141
## 11 48269 King Texas 0 3141
#the arrange function is not arranging correctly on carpool_rank
final answer.
carpool_df_bottom10 <- data %>%
mutate(carpool_rank = min_rank(desc(carpool))) %>%
arrange(desc(carpool_rank)) %>%
select(census_id, county, state, carpool, carpool_rank) %>%
top_n(10)
carpool_df_bottom10
## # A tibble: 11 × 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3141
## 2 48269 King Texas 0 3141
## 3 48235 Irion Texas 0.9 3140
## 4 31183 Wheeler Nebraska 1.3 3139
## 5 36061 New York New York 1.9 3138
## 6 13309 Wheeler Georgia 2.3 3136
## 7 38029 Emmons North Dakota 2.3 3136
## 8 30019 Daniels Montana 2.6 3134
## 9 31057 Dundy Nebraska 2.6 3134
## 10 46069 Hyde South Dakota 2.8 3132
## 11 51720 Norton city Virginia 2.8 3132
carpool_df_by_state <- data %>%
group_by(state) %>%
summarise(avg_carpool = mean(carpool, na.rm = TRUE)) %>%
mutate(avg_state_rank = min_rank(desc(avg_carpool))) %>%
arrange(avg_state_rank) %>%
select(state, avg_carpool, avg_state_rank) %>%
top_n(1)
carpool_df_by_state
## # A tibble: 1 × 3
## state avg_carpool avg_state_rank
## <chr> <dbl> <int>
## 1 District of Columbia 5.7 51
# carpool_df_by_state[avg_state_rank == 1] why doesn't this work?
# arrange function is still not arranging correctly
carpool_df_by_state_top5 <- data %>%
group_by(state) %>%
summarise(avg_carpool = mean(carpool, na.rm = TRUE)) %>%
mutate(avg_state_rank = min_rank(desc(avg_carpool))) %>%
arrange(avg_state_rank) %>%
select(state, avg_carpool, avg_state_rank) %>%
top_n(5)
carpool_df_by_state_top5
## # A tibble: 5 × 3
## state avg_carpool avg_state_rank
## <chr> <dbl> <int>
## 1 New Jersey 8.10 47
## 2 Rhode Island 8 48
## 3 Connecticut 7.94 49
## 4 Massachusetts 7.75 50
## 5 District of Columbia 5.7 51