Michael Morelli MSBA660 Assignment 3 Fall 2023 September 28
# message=FALSE, echo=FALSE are optionbal
# knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library("tidyverse")
library("dplyr")
library("tidyr")
How many rows and columns are in the data set?
num_rows_acs_2015_county_data_revised <- nrow(acs_2015_county_data_revised)
paste0("The number of rows is ", num_rows_acs_2015_county_data_revised) %>%
print()
## [1] "The number of rows is 3142"
num_cols_acs_2015_county_data_revised <- ncol(acs_2015_county_data_revised)
paste0("The number of columns is ", num_cols_acs_2015_county_data_revised) %>%
print()
## [1] "The number of columns is 35"
acs_2015_county_data_revised
## # A tibble: 3,142 × 35
## census_id state county total_pop men women hispanic white black native
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alabama Autauga 55221 26745 28476 2.6 75.8 18.5 0.4
## 2 1003 Alabama Baldwin 195121 95314 99807 4.5 83.1 9.5 0.6
## 3 1005 Alabama Barbour 26932 14497 12435 4.6 46.2 46.7 0.2
## 4 1007 Alabama Bibb 22604 12073 10531 2.2 74.5 21.4 0.4
## 5 1009 Alabama Blount 57710 28512 29198 8.6 87.9 1.5 0.3
## 6 1011 Alabama Bullock 10678 5660 5018 4.4 22.2 70.7 1.2
## 7 1013 Alabama Butler 20354 9502 10852 1.2 53.3 43.8 0.1
## 8 1015 Alabama Calhoun 116648 56274 60374 3.5 73 20.3 0.2
## 9 1017 Alabama Chambers 34079 16258 17821 0.4 57.3 40.3 0.2
## 10 1019 Alabama Cherokee 26008 12975 13033 1.5 91.7 4.8 0.6
## # ℹ 3,132 more rows
## # ℹ 25 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>, …
paste0("The number of columns is ", num_cols_acs_2015_county_data_revised) %>%
print()
## [1] "The number of columns is 35"
acs_2015_county_data_revised
## # A tibble: 3,142 × 35
## census_id state county total_pop men women hispanic white black native
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alabama Autauga 55221 26745 28476 2.6 75.8 18.5 0.4
## 2 1003 Alabama Baldwin 195121 95314 99807 4.5 83.1 9.5 0.6
## 3 1005 Alabama Barbour 26932 14497 12435 4.6 46.2 46.7 0.2
## 4 1007 Alabama Bibb 22604 12073 10531 2.2 74.5 21.4 0.4
## 5 1009 Alabama Blount 57710 28512 29198 8.6 87.9 1.5 0.3
## 6 1011 Alabama Bullock 10678 5660 5018 4.4 22.2 70.7 1.2
## 7 1013 Alabama Butler 20354 9502 10852 1.2 53.3 43.8 0.1
## 8 1015 Alabama Calhoun 116648 56274 60374 3.5 73 20.3 0.2
## 9 1017 Alabama Chambers 34079 16258 17821 0.4 57.3 40.3 0.2
## 10 1019 Alabama Cherokee 26008 12975 13033 1.5 91.7 4.8 0.6
## # ℹ 3,132 more rows
## # ℹ 25 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>, …
We can see that none of the data types need to be changed.
glimpse(acs_2015_county_data_revised)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
3.Are there any missing values?
na_number <- sum(colSums(is.na(acs_2015_county_data_revised)))
if (na_number > 0) {
paste0("There are missing values in our data and the number of them is ", na_number, ".") %>%
print() } else {
print("There are no missing values in our data.")
}
## [1] "There are missing values in our data and the number of them is 2."
# Print information where the missing values are and
# informationa about the counties in which they occur
# First, Filter rows with any NA
na_rows <- acs_2015_county_data_revised %>%
filter(if_any(everything(), is.na))
for (row_index in 1:nrow(na_rows)) {
for (col_index in 1:num_cols_acs_2015_county_data_revised) {
if (is.na(na_rows[row_index, col_index])) {
# If the value is NA, print the census_id, state, county, and total_pop
print(na_rows[row_index,1:4])
# Print where the NAs occure
print(na_rows[row_index, col_index])
}
}
}
## # A tibble: 1 × 4
## census_id state county total_pop
## <dbl> <chr> <chr> <dbl>
## 1 15005 Hawaii Kalawao 85
## # A tibble: 1 × 1
## child_poverty
## <dbl>
## 1 NA
## # A tibble: 1 × 4
## census_id state county total_pop
## <dbl> <chr> <chr> <dbl>
## 1 48301 Texas Loving 117
## # A tibble: 1 × 1
## income
## <dbl>
## 1 NA
How will you handle missing values?
Since the only two missing values happen in Kalawao County, Hawaii and Loving County, Texas which are the only two counties in the US with fewer than 120 people, we will remove these two observations.
An aside, according to Wikipedia, Loving County has 680 square miles of land and 64 - 100 people, this is a density of 1 person every ten square miles. It would be like Cincinnati having a population of 7 people. They have a sheriff and two constables in addition to 15 other county officials.
Kalawao County is the smallest county in area in the US at 5.5 square miles. The Borought of Manhattan, NY, also known as NY County, the second smallest, has 26 square miles of land.Kalawao, in contrast to Loving, is administered by another county and the Hawaii Health department.
print("Since the missing values happen in the only two counties in the US
with fewer than 120 people, we will remove the entire observation.")
## [1] "Since the missing values happen in the only two counties in the US \n with fewer than 120 people, we will remove the entire observation."
cleaned_acs_2015_county_data_revised <- acs_2015_county_data_revised %>%
drop_na()
print(cleaned_acs_2015_county_data_revised)
## # A tibble: 3,140 × 35
## census_id state county total_pop men women hispanic white black native
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alabama Autauga 55221 26745 28476 2.6 75.8 18.5 0.4
## 2 1003 Alabama Baldwin 195121 95314 99807 4.5 83.1 9.5 0.6
## 3 1005 Alabama Barbour 26932 14497 12435 4.6 46.2 46.7 0.2
## 4 1007 Alabama Bibb 22604 12073 10531 2.2 74.5 21.4 0.4
## 5 1009 Alabama Blount 57710 28512 29198 8.6 87.9 1.5 0.3
## 6 1011 Alabama Bullock 10678 5660 5018 4.4 22.2 70.7 1.2
## 7 1013 Alabama Butler 20354 9502 10852 1.2 53.3 43.8 0.1
## 8 1015 Alabama Calhoun 116648 56274 60374 3.5 73 20.3 0.2
## 9 1017 Alabama Chambers 34079 16258 17821 0.4 57.3 40.3 0.2
## 10 1019 Alabama Cherokee 26008 12975 13033 1.5 91.7 4.8 0.6
## # ℹ 3,130 more rows
## # ℹ 25 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>, …
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
4.Use the summary() function to examine any unusual values. Are there any? If so, how will you handle these unusual values? Show any code/output to handle unusual values.
summary(cleaned_acs_2015_county_data_revised)
## census_id state county total_pop
## Min. : 1001 Length:3140 Length:3140 Min. : 267
## 1st Qu.:18178 Class :character Class :character 1st Qu.: 11036
## Median :29176 Mode :character Mode :character Median : 25793
## Mean :30383 Mean : 100801
## 3rd Qu.:45080 3rd Qu.: 67620
## Max. :56045 Max. :10038388
## men women hispanic white
## Min. : 136 Min. : 131 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5551 1st Qu.: 5488 1st Qu.: 1.900 1st Qu.:65.67
## Median : 12838 Median : 12916 Median : 3.700 Median :84.65
## Mean : 49597 Mean : 51204 Mean : 8.819 Mean :77.31
## 3rd Qu.: 33328 3rd Qu.: 34123 3rd Qu.: 9.000 3rd Qu.:93.33
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.885 Mean : 1.763 Mean : 1.253 Mean : 0.07357
## 3rd Qu.:10.200 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :11.10000
## citizen income income_per_cap poverty
## Min. : 199 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8276 1st Qu.: 38826 1st Qu.:20470 1st Qu.:12.0
## Median : 19454 Median : 45094 Median :23574 Median :16.0
## Mean : 70849 Mean : 46824 Mean :24331 Mean :16.7
## 3rd Qu.: 50795 3rd Qu.: 52248 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.05 Mean :18.25 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.42 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.2 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.50 1st Qu.:76.6 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.6 Median : 9.90
## Mean :12.75 Mean :15.82 Mean :79.1 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.6 3rd Qu.:11.90
## Max. :40.30 Max. :55.60 Max. :94.6 Max. :29.90
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.90 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.30 Median : 4.000
## Mean : 0.9681 Mean : 3.294 Mean : 1.61 Mean : 4.694
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.90 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.10 Max. :37.200
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 166 Min. :29.50 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4532 1st Qu.:70.90 1st Qu.:13.07
## Median :22.90 Median : 10657 Median :75.85 Median :16.10
## Mean :23.15 Mean : 46416 Mean :74.45 Mean :17.33
## 3rd Qu.:26.60 3rd Qu.: 29272 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.922 Mean :0.2917 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
Are there any negative values?
num_rows_with_negatives <- cleaned_acs_2015_county_data_revised %>%
filter(across(where(is.numeric), ~ any(.x < 0))) %>%
nrow()
## Warning: Using `across()` in `filter()` was deprecated in dplyr 1.0.8.
## ℹ Please use `if_any()` or `if_all()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
if (num_rows_with_negatives == 0) {
print("Since there are no negative values, the summary shows that there are no unusual values in the cleaned data.")
}
## [1] "Since there are no negative values, the summary shows that there are no unusual values in the cleaned data."
5.How many counties have more women than men? Filter for counties with more women than men and count them.
num_counties_women_majority <- acs_2015_county_data_revised %>%
filter(women > men) %>%
nrow()
print(num_counties_women_majority)
## [1] 1985
6.How many counties have an unemployment rate lower than 10%? Filter for counties with unemployment rate less than 10% and count them
num_counties_unemployment_rate_less_than_ten_percent <- acs_2015_county_data_revised %>%
filter(unemployment < 10) %>%
nrow()
print(num_counties_unemployment_rate_less_than_ten_percent)
## [1] 2420
7.What are the top 10 counties with the highest mean commute? Show the census ID, countyname, state, and the mean_commute in your final answer, sorted by mean_commute.
Notes: -Use the variable mean_commute to answer this question.
-Leverage the dplyr::topn() function. Read the documentation for this function.
?top_n
The comments in R Studio state the following.
[Superseded] top_n() has been superseded in favour of slice_min()/slice_max(). While it will not be deprecated in the near future, retirement means that we will only perform critical bug fixes, so we recommend moving to the newer alternatives.
top_n() was superseded because the name was fundamentally confusing as it returned what you might reasonably consider to be the bottom rows. Additionally, the wt variable had a confusing name, and strange default (the last column in the data frame). Unfortunately we could not see an easy way to fix the existing top_n() function without breaking existing code, so we created a new alternative.
top_commute_counties <- acs_2015_county_data_revised %>%
arrange(desc(mean_commute)) %>%
slice_head(n = 10) %>%
select(census_id, county, state, mean_commute)
print(top_commute_counties)
## # A tibble: 10 × 4
## census_id county state mean_commute
## <dbl> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
8.Create a new variable that calculates the percentage of women for each county and then find the top 10 counties with the lowest percentages.
Show the census ID, county name, state,and the percentage in your final answer (sorted by ascending percentage).Perform data manipulation using dplyr
acs_2015_county_data_revised %>%
# Calculate the percentage of women for each county
mutate(percent_women = (women/total_pop) * 100) %>%
# Arrange the data in ascending order by the percentage of women
arrange(percent_women) %>%
# Select the top 10 counties
slice_max(percent_women, n = 10) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, percent_women)
## # A tibble: 10 × 4
## census_id county state percent_women
## <dbl> <chr> <chr> <dbl>
## 1 51720 Norton city Virginia 59.4
## 2 13235 Pulaski Georgia 58.0
## 3 1119 Sumter Alabama 55.7
## 4 28125 Sharkey Mississippi 55.5
## 5 51620 Franklin city Virginia 55.5
## 6 51091 Highland Virginia 55.3
## 7 48137 Edwards Texas 55.2
## 8 51790 Staunton city Virginia 55.1
## 9 35011 De Baca New Mexico 55.1
## 10 29117 Livingston Missouri 54.9
sum_races <- acs_2015_county_data_revised %>%
mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
arrange(sum_of_all_race_variables) %>%
# Select the top 10 counties
slice_max(sum_of_all_race_variables, n = 10) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, sum_of_all_race_variables)
print(sum_races)
## # A tibble: 11 × 4
## census_id county state sum_of_all_race_variables
## <dbl> <chr> <chr> <dbl>
## 1 31073 Gosper Nebraska 100.
## 2 31091 Hooker Nebraska 100.
## 3 48017 Bailey Texas 100.
## 4 48137 Edwards Texas 100.
## 5 31125 Nance Nebraska 100.
## 6 28021 Claiborne Mississippi 100
## 7 48131 Duval Texas 100
## 8 48261 Kenedy Texas 100
## 9 48263 Kent Texas 100
## 10 48377 Presidio Texas 100
## 11 49001 Beaver Utah 100
9b. Which state, on average, has the lowest sum of these race percentage variables?
acs_2015_county_data_revised %>%
mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
arrange(sum_of_all_race_variables) %>%
# Select the top 10 counties
slice_min(sum_of_all_race_variables, n = 10) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, sum_of_all_race_variables)
## # A tibble: 10 × 4
## census_id county state sum_of_all_race_variables
## <dbl> <chr> <chr> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
max_10 <- acs_2015_county_data_revised %>%
mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
arrange(sum_of_all_race_variables) %>%
# Select the top 10 counties
slice_max(sum_of_all_race_variables, n = 10) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, sum_of_all_race_variables)
m <- max(max_10[,4])
if (m > 100) {
paste0("A county has sum_of_all_race_variables > 100 and it equals ", m) %>%
print() } else {
print("All counties have sum_of_all_race_variables less than or equal to 100 percent")
}
## [1] "A county has sum_of_all_race_variables > 100 and it equals 100.1"
9d. How many states have a sum that equals exactly to 100%?
allrows <- acs_2015_county_data_revised %>%
mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
arrange(sum_of_all_race_variables) %>%
# Select the top 10 counties
slice_max(sum_of_all_race_variables, n = nrow(acs_2015_county_data_revised)) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, sum_of_all_race_variables)
sum(allrows[,4] == 100)
## [1] 27
carpool_rank <- acs_2015_county_data_revised %>%
arrange(carpool) %>%
# Select the top 10 counties
slice_max(carpool, n = 1) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, carpool)
paste0("This is the county with the highest car pool percentage.")
## [1] "This is the county with the highest car pool percentage."
carpool_rank
## # A tibble: 1 × 4
## census_id county state carpool
## <dbl> <chr> <chr> <dbl>
## 1 13061 Clay Georgia 29.9
10 b. Find the 10 highest ranked counties for carpooling. Show the census ID, countyname, state, carpool value, and carpool_rank in your final answer.
max_10_carpool <- acs_2015_county_data_revised %>%
arrange(carpool) %>%
# Select the top 10 counties
slice_max(carpool, n = 10) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, carpool)
paste0("These are the ten highest counties in car pool percentage.")
## [1] "These are the ten highest counties in car pool percentage."
max_10_carpool
## # A tibble: 10 × 4
## census_id county state carpool
## <dbl> <chr> <chr> <dbl>
## 1 13061 Clay Georgia 29.9
## 2 18087 LaGrange Indiana 27
## 3 13165 Jenkins Georgia 25.3
## 4 5133 Sevier Arkansas 24.4
## 5 20175 Seward Kansas 23.4
## 6 48079 Cochran Texas 22.8
## 7 48247 Jim Hogg Texas 22.6
## 8 48393 Roberts Texas 22.4
## 9 39075 Holmes Ohio 21.8
## 10 21197 Powell Kentucky 21.6
10c. Find the 10 lowest ranked counties for carpooling. Show the census ID, countyname, state, carpool value, and carpool_rank in your final answer.
min_10_carpool <- acs_2015_county_data_revised %>%
arrange(carpool) %>%
# Select the lowest 10 counties
slice_min(carpool, n = 10) %>%
# Select and rename the final columns for the output"
select(census_id, county, state, carpool)
paste0("These are the ten lowest counties in car pool percentage.")
## [1] "These are the ten lowest counties in car pool percentage."
min_10_carpool
## # A tibble: 11 × 4
## census_id county state carpool
## <dbl> <chr> <chr> <dbl>
## 1 48261 Kenedy Texas 0
## 2 48269 King Texas 0
## 3 48235 Irion Texas 0.9
## 4 31183 Wheeler Nebraska 1.3
## 5 36061 New York New York 1.9
## 6 13309 Wheeler Georgia 2.3
## 7 38029 Emmons North Dakota 2.3
## 8 30019 Daniels Montana 2.6
## 9 31057 Dundy Nebraska 2.6
## 10 46069 Hyde South Dakota 2.8
## 11 51720 Norton city Virginia 2.8
10d. On average, what state is the best ranked for carpooling? Calculate the average carpool time for each state
average_carpool_percent_by_state <-acs_2015_county_data_revised %>%
group_by(state) %>%
summarise(
average_carpool_time = mean(carpool, na.rm = TRUE)
)
average_carpool_percent_by_state
## # A tibble: 51 × 2
## state average_carpool_time
## <chr> <dbl>
## 1 Alabama 10.2
## 2 Alaska 12.1
## 3 Arizona 11.6
## 4 Arkansas 11.9
## 5 California 11.3
## 6 Colorado 11.2
## 7 Connecticut 7.94
## 8 Delaware 8.53
## 9 District of Columbia 5.7
## 10 Florida 11.0
## # ℹ 41 more rows
best_state_carpool <- average_carpool_percent_by_state %>%
# Select the top county
slice_max(average_carpool_time, n = 1) %>%
# Select and rename the final columns for the output"
select(state, average_carpool_time)
best_state_carpool
## # A tibble: 1 × 2
## state average_carpool_time
## <chr> <dbl>
## 1 Alaska 12.1
best_5_states_carpool <- average_carpool_percent_by_state %>%
# Select the top county
slice_max(average_carpool_time, n = 5) %>%
# Select and rename the final columns for the output"
select(state, average_carpool_time)
best_5_states_carpool
## # A tibble: 5 × 2
## state average_carpool_time
## <chr> <dbl>
## 1 Alaska 12.1
## 2 Arkansas 11.9
## 3 Utah 11.9
## 4 Texas 11.8
## 5 Nevada 11.7