Morelli Assignment 3

Michael Morelli MSBA660 Assignment 3 Fall 2023 September 28

#  message=FALSE, echo=FALSE are optionbal
# knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library("tidyverse")
library("dplyr")
library("tidyr")

Importing Data and Data Cleaning

Import the data set using a Tidyverse function and NOT with a Base R function.

How many rows and columns are in the data set?

num_rows_acs_2015_county_data_revised <- nrow(acs_2015_county_data_revised)
paste0("The number of rows is ", num_rows_acs_2015_county_data_revised) %>%
  print()

## [1] "The number of rows is 3142"

num_cols_acs_2015_county_data_revised <- ncol(acs_2015_county_data_revised)
paste0("The number of columns is ", num_cols_acs_2015_county_data_revised) %>%
  print()

## [1] "The number of columns is 35"

acs_2015_county_data_revised

## # A tibble: 3,142 × 35
##    census_id state   county   total_pop   men women hispanic white black native
##        <dbl> <chr>   <chr>        <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl>
##  1      1001 Alabama Autauga      55221 26745 28476      2.6  75.8  18.5    0.4
##  2      1003 Alabama Baldwin     195121 95314 99807      4.5  83.1   9.5    0.6
##  3      1005 Alabama Barbour      26932 14497 12435      4.6  46.2  46.7    0.2
##  4      1007 Alabama Bibb         22604 12073 10531      2.2  74.5  21.4    0.4
##  5      1009 Alabama Blount       57710 28512 29198      8.6  87.9   1.5    0.3
##  6      1011 Alabama Bullock      10678  5660  5018      4.4  22.2  70.7    1.2
##  7      1013 Alabama Butler       20354  9502 10852      1.2  53.3  43.8    0.1
##  8      1015 Alabama Calhoun     116648 56274 60374      3.5  73    20.3    0.2
##  9      1017 Alabama Chambers     34079 16258 17821      0.4  57.3  40.3    0.2
## 10      1019 Alabama Cherokee     26008 12975 13033      1.5  91.7   4.8    0.6
## # ℹ 3,132 more rows
## # ℹ 25 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>, …

paste0("The number of columns is ", num_cols_acs_2015_county_data_revised) %>%
  print()

## [1] "The number of columns is 35"

acs_2015_county_data_revised

## # A tibble: 3,142 × 35
##    census_id state   county   total_pop   men women hispanic white black native
##        <dbl> <chr>   <chr>        <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl>
##  1      1001 Alabama Autauga      55221 26745 28476      2.6  75.8  18.5    0.4
##  2      1003 Alabama Baldwin     195121 95314 99807      4.5  83.1   9.5    0.6
##  3      1005 Alabama Barbour      26932 14497 12435      4.6  46.2  46.7    0.2
##  4      1007 Alabama Bibb         22604 12073 10531      2.2  74.5  21.4    0.4
##  5      1009 Alabama Blount       57710 28512 29198      8.6  87.9   1.5    0.3
##  6      1011 Alabama Bullock      10678  5660  5018      4.4  22.2  70.7    1.2
##  7      1013 Alabama Butler       20354  9502 10852      1.2  53.3  43.8    0.1
##  8      1015 Alabama Calhoun     116648 56274 60374      3.5  73    20.3    0.2
##  9      1017 Alabama Chambers     34079 16258 17821      0.4  57.3  40.3    0.2
## 10      1019 Alabama Cherokee     26008 12975 13033      1.5  91.7   4.8    0.6
## # ℹ 3,132 more rows
## # ℹ 25 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>, …

Do any data types need changed? Show any code to change variable types and show code/output for a glimpse() command after you’re finished. We use glimpse() since this allows us to see a portion of the data and all of the variables.

We can see that none of the data types need to be changed.

glimpse(acs_2015_county_data_revised)

## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…

3.Are there any missing values?

Including Plots

na_number <- sum(colSums(is.na(acs_2015_county_data_revised)))
if (na_number > 0) {
  paste0("There are missing values in our data and the number of them is ", na_number, ".") %>%
    print() } else {
      print("There are no missing values in our data.")
    }

## [1] "There are missing values in our data and the number of them is 2."

# Print information where the missing values are and 
# informationa about the counties in which they occur
# First, Filter rows with any NA
na_rows <- acs_2015_county_data_revised %>%
  filter(if_any(everything(), is.na))
for (row_index in 1:nrow(na_rows)) {
for (col_index in 1:num_cols_acs_2015_county_data_revised) {
  if (is.na(na_rows[row_index, col_index])) {
    # If the value is NA, print the census_id, state, county, and total_pop
    print(na_rows[row_index,1:4])
    # Print where the NAs occure
    print(na_rows[row_index, col_index])
  }
}
}

## # A tibble: 1 × 4
##   census_id state  county  total_pop
##       <dbl> <chr>  <chr>       <dbl>
## 1     15005 Hawaii Kalawao        85
## # A tibble: 1 × 1
##   child_poverty
##           <dbl>
## 1            NA
## # A tibble: 1 × 4
##   census_id state county total_pop
##       <dbl> <chr> <chr>      <dbl>
## 1     48301 Texas Loving       117
## # A tibble: 1 × 1
##   income
##    <dbl>
## 1     NA

How will you handle missing values?

Since the only two missing values happen in Kalawao County, Hawaii and Loving County, Texas which are the only two counties in the US with fewer than 120 people, we will remove these two observations.

An aside, according to Wikipedia, Loving County has 680 square miles of land and 64 - 100 people, this is a density of 1 person every ten square miles. It would be like Cincinnati having a population of 7 people. They have a sheriff and two constables in addition to 15 other county officials.

Kalawao County is the smallest county in area in the US at 5.5 square miles. The Borought of Manhattan, NY, also known as NY County, the second smallest, has 26 square miles of land.Kalawao, in contrast to Loving, is administered by another county and the Hawaii Health department.

print("Since the missing values happen in the only two counties in the US 
      with fewer than 120 people, we will remove the entire observation.")

## [1] "Since the missing values happen in the only two counties in the US \n      with fewer than 120 people, we will remove the entire observation."

cleaned_acs_2015_county_data_revised <- acs_2015_county_data_revised %>%
  drop_na()
print(cleaned_acs_2015_county_data_revised)

## # A tibble: 3,140 × 35
##    census_id state   county   total_pop   men women hispanic white black native
##        <dbl> <chr>   <chr>        <dbl> <dbl> <dbl>    <dbl> <dbl> <dbl>  <dbl>
##  1      1001 Alabama Autauga      55221 26745 28476      2.6  75.8  18.5    0.4
##  2      1003 Alabama Baldwin     195121 95314 99807      4.5  83.1   9.5    0.6
##  3      1005 Alabama Barbour      26932 14497 12435      4.6  46.2  46.7    0.2
##  4      1007 Alabama Bibb         22604 12073 10531      2.2  74.5  21.4    0.4
##  5      1009 Alabama Blount       57710 28512 29198      8.6  87.9   1.5    0.3
##  6      1011 Alabama Bullock      10678  5660  5018      4.4  22.2  70.7    1.2
##  7      1013 Alabama Butler       20354  9502 10852      1.2  53.3  43.8    0.1
##  8      1015 Alabama Calhoun     116648 56274 60374      3.5  73    20.3    0.2
##  9      1017 Alabama Chambers     34079 16258 17821      0.4  57.3  40.3    0.2
## 10      1019 Alabama Cherokee     26008 12975 13033      1.5  91.7   4.8    0.6
## # ℹ 3,130 more rows
## # ℹ 25 more variables: asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## #   income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## #   professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## #   production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## #   other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## #   private_work <dbl>, public_work <dbl>, self_employed <dbl>, …

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

4.Use the summary() function to examine any unusual values. Are there any? If so, how will you handle these unusual values? Show any code/output to handle unusual values.

summary(cleaned_acs_2015_county_data_revised)

##    census_id        state              county            total_pop       
##  Min.   : 1001   Length:3140        Length:3140        Min.   :     267  
##  1st Qu.:18178   Class :character   Class :character   1st Qu.:   11036  
##  Median :29176   Mode  :character   Mode  :character   Median :   25793  
##  Mean   :30383                                         Mean   :  100801  
##  3rd Qu.:45080                                         3rd Qu.:   67620  
##  Max.   :56045                                         Max.   :10038388  
##       men              women            hispanic          white      
##  Min.   :    136   Min.   :    131   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5551   1st Qu.:   5488   1st Qu.: 1.900   1st Qu.:65.67  
##  Median :  12838   Median :  12916   Median : 3.700   Median :84.65  
##  Mean   :  49597   Mean   :  51204   Mean   : 8.819   Mean   :77.31  
##  3rd Qu.:  33328   3rd Qu.:  34123   3rd Qu.: 9.000   3rd Qu.:93.33  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.885   Mean   : 1.763   Mean   : 1.253   Mean   : 0.07357  
##  3rd Qu.:10.200   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :11.10000  
##     citizen            income       income_per_cap     poverty    
##  Min.   :    199   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8276   1st Qu.: 38826   1st Qu.:20470   1st Qu.:12.0  
##  Median :  19454   Median : 45094   Median :23574   Median :16.0  
##  Mean   :  70849   Mean   : 46824   Mean   :24331   Mean   :16.7  
##  3rd Qu.:  50795   3rd Qu.: 52248   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.05   Mean   :18.25   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.42   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##   construction     production        drive         carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.2   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.50   1st Qu.:76.6   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.6   Median : 9.90  
##  Mean   :12.75   Mean   :15.82   Mean   :79.1   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.6   3rd Qu.:11.90  
##  Max.   :40.30   Max.   :55.60   Max.   :94.6   Max.   :29.90  
##     transit             walk         other_transp    work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.90   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.30   Median : 4.000  
##  Mean   : 0.9681   Mean   : 3.294   Mean   : 1.61   Mean   : 4.694  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.90   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.10   Max.   :37.200  
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :    166   Min.   :29.50   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4532   1st Qu.:70.90   1st Qu.:13.07  
##  Median :22.90   Median :  10657   Median :75.85   Median :16.10  
##  Mean   :23.15   Mean   :  46416   Mean   :74.45   Mean   :17.33  
##  3rd Qu.:26.60   3rd Qu.:  29272   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.922   Mean   :0.2917   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400

Are there any negative values?

num_rows_with_negatives <- cleaned_acs_2015_county_data_revised %>%
  filter(across(where(is.numeric), ~ any(.x < 0))) %>%
  nrow()

## Warning: Using `across()` in `filter()` was deprecated in dplyr 1.0.8.
## ℹ Please use `if_any()` or `if_all()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

if (num_rows_with_negatives == 0) {
print("Since there are no negative values, the summary shows that there are no unusual values in the cleaned data.")
}

## [1] "Since there are no negative values, the summary shows that there are no unusual values in the cleaned data."

5.How many counties have more women than men? Filter for counties with more women than men and count them.

  num_counties_women_majority <- acs_2015_county_data_revised %>%
  filter(women > men) %>%
  nrow()
print(num_counties_women_majority)

## [1] 1985

6.How many counties have an unemployment rate lower than 10%? Filter for counties with unemployment rate less than 10% and count them

num_counties_unemployment_rate_less_than_ten_percent <- acs_2015_county_data_revised %>%
  filter(unemployment < 10) %>%
  nrow()
print(num_counties_unemployment_rate_less_than_ten_percent)

## [1] 2420

7.What are the top 10 counties with the highest mean commute? Show the census ID, countyname, state, and the mean_commute in your final answer, sorted by mean_commute.

Notes: -Use the variable mean_commute to answer this question.

-Leverage the dplyr::topn() function. Read the documentation for this function.

?top_n

The comments in R Studio state the following.

[Superseded] top_n() has been superseded in favour of slice_min()/slice_max(). While it will not be deprecated in the near future, retirement means that we will only perform critical bug fixes, so we recommend moving to the newer alternatives.

top_n() was superseded because the name was fundamentally confusing as it returned what you might reasonably consider to be the bottom rows. Additionally, the wt variable had a confusing name, and strange default (the last column in the data frame). Unfortunately we could not see an easy way to fix the existing top_n() function without breaking existing code, so we created a new alternative.

top_commute_counties <- acs_2015_county_data_revised %>%
  arrange(desc(mean_commute)) %>%
  slice_head(n = 10) %>%
  select(census_id, county, state, mean_commute)
print(top_commute_counties)

## # A tibble: 10 × 4
##    census_id county       state         mean_commute
##        <dbl> <chr>        <chr>                <dbl>
##  1     42103 Pike         Pennsylvania          44  
##  2     36005 Bronx        New York              43  
##  3     24017 Charles      Maryland              42.8
##  4     51187 Warren       Virginia              42.7
##  5     36081 Queens       New York              42.6
##  6     36085 Richmond     New York              42.6
##  7     51193 Westmoreland Virginia              42.5
##  8      8093 Park         Colorado              42.4
##  9     36047 Kings        New York              41.7
## 10     54015 Clay         West Virginia         41.4

8.Create a new variable that calculates the percentage of women for each county and then find the top 10 counties with the lowest percentages.

Show the census ID, county name, state,and the percentage in your final answer (sorted by ascending percentage).Perform data manipulation using dplyr

acs_2015_county_data_revised %>%
  # Calculate the percentage of women for each county
  mutate(percent_women = (women/total_pop) * 100) %>%
  # Arrange the data in ascending order by the percentage of women
  arrange(percent_women) %>%
  # Select the top 10 counties
  slice_max(percent_women, n = 10) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, percent_women)

## # A tibble: 10 × 4
##    census_id county        state       percent_women
##        <dbl> <chr>         <chr>               <dbl>
##  1     51720 Norton city   Virginia             59.4
##  2     13235 Pulaski       Georgia              58.0
##  3      1119 Sumter        Alabama              55.7
##  4     28125 Sharkey       Mississippi          55.5
##  5     51620 Franklin city Virginia             55.5
##  6     51091 Highland      Virginia             55.3
##  7     48137 Edwards       Texas                55.2
##  8     51790 Staunton city Virginia             55.1
##  9     35011 De Baca       New Mexico           55.1
## 10     29117 Livingston    Missouri             54.9

Create a new variable that calculates the sum of all racepercentage variables (these columns are the “hispanic”, “white”, “black”, “native”, “asian”, and “pacific” variables).

sum_races <- acs_2015_county_data_revised %>%
  mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
  arrange(sum_of_all_race_variables) %>%
  # Select the top 10 counties
  slice_max(sum_of_all_race_variables, n = 10) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, sum_of_all_race_variables)
print(sum_races)

## # A tibble: 11 × 4
##    census_id county    state       sum_of_all_race_variables
##        <dbl> <chr>     <chr>                           <dbl>
##  1     31073 Gosper    Nebraska                         100.
##  2     31091 Hooker    Nebraska                         100.
##  3     48017 Bailey    Texas                            100.
##  4     48137 Edwards   Texas                            100.
##  5     31125 Nance     Nebraska                         100.
##  6     28021 Claiborne Mississippi                      100 
##  7     48131 Duval     Texas                            100 
##  8     48261 Kenedy    Texas                            100 
##  9     48263 Kent      Texas                            100 
## 10     48377 Presidio  Texas                            100 
## 11     49001 Beaver    Utah                             100

9b. Which state, on average, has the lowest sum of these race percentage variables?

  acs_2015_county_data_revised %>%
  mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
  arrange(sum_of_all_race_variables) %>%
  # Select the top 10 counties
  slice_min(sum_of_all_race_variables, n = 10) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, sum_of_all_race_variables)

## # A tibble: 10 × 4
##    census_id county                   state     sum_of_all_race_variables
##        <dbl> <chr>                    <chr>                         <dbl>
##  1     15001 Hawaii                   Hawaii                         76.4
##  2     15009 Maui                     Hawaii                         79.2
##  3     40097 Mayes                    Oklahoma                       79.7
##  4     15003 Honolulu                 Hawaii                         81.5
##  5     40123 Pontotoc                 Oklahoma                       82.8
##  6     47061 Grundy                   Tennessee                      83  
##  7      2282 Yakutat City and Borough Alaska                         83.4
##  8     40069 Johnston                 Oklahoma                       84  
##  9     15007 Kauai                    Hawaii                         84.1
## 10     40003 Alfalfa                  Oklahoma                       85.1

Do any counties have a sum greater than 100%?

max_10 <- acs_2015_county_data_revised %>%
  mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
  arrange(sum_of_all_race_variables) %>%
  # Select the top 10 counties
  slice_max(sum_of_all_race_variables, n = 10) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, sum_of_all_race_variables) 
m <- max(max_10[,4])
if (m  > 100) {
  paste0("A county has sum_of_all_race_variables > 100  and it equals ", m) %>%
    print()  } else { 
    print("All counties have sum_of_all_race_variables less than  or equal to 100 percent")
      }

## [1] "A county has sum_of_all_race_variables > 100  and it equals 100.1"

9d. How many states have a sum that equals exactly to 100%?

allrows <- acs_2015_county_data_revised %>%
  mutate(sum_of_all_race_variables = hispanic + white + black + native + asian + pacific) %>%
  arrange(sum_of_all_race_variables) %>%
  # Select the top 10 counties
  slice_max(sum_of_all_race_variables, n = nrow(acs_2015_county_data_revised)) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, sum_of_all_race_variables) 
sum(allrows[,4] == 100)

## [1] 27

Using the carpool variable,

Use the function to create a new variable called carpool_rank where the highest ranked county (rank = 1) is the county with the highest carpoolvalue.

carpool_rank <- acs_2015_county_data_revised %>%
  arrange(carpool) %>%
  # Select the top 10 counties
  slice_max(carpool, n = 1) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, carpool) 
paste0("This is the county with the highest car pool percentage.")

## [1] "This is the county with the highest car pool percentage."

carpool_rank

## # A tibble: 1 × 4
##   census_id county state   carpool
##       <dbl> <chr>  <chr>     <dbl>
## 1     13061 Clay   Georgia    29.9

10 b. Find the 10 highest ranked counties for carpooling. Show the census ID, countyname, state, carpool value, and carpool_rank in your final answer.

max_10_carpool <- acs_2015_county_data_revised %>%
  arrange(carpool) %>%
  # Select the top 10 counties
  slice_max(carpool, n = 10) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, carpool) 
paste0("These are the ten highest counties in car pool percentage.")

## [1] "These are the ten highest counties in car pool percentage."

max_10_carpool

## # A tibble: 10 × 4
##    census_id county   state    carpool
##        <dbl> <chr>    <chr>      <dbl>
##  1     13061 Clay     Georgia     29.9
##  2     18087 LaGrange Indiana     27  
##  3     13165 Jenkins  Georgia     25.3
##  4      5133 Sevier   Arkansas    24.4
##  5     20175 Seward   Kansas      23.4
##  6     48079 Cochran  Texas       22.8
##  7     48247 Jim Hogg Texas       22.6
##  8     48393 Roberts  Texas       22.4
##  9     39075 Holmes   Ohio        21.8
## 10     21197 Powell   Kentucky    21.6

10c. Find the 10 lowest ranked counties for carpooling. Show the census ID, countyname, state, carpool value, and carpool_rank in your final answer.

min_10_carpool <- acs_2015_county_data_revised %>%
  arrange(carpool) %>%
  # Select the lowest 10 counties
  slice_min(carpool, n = 10) %>%
  # Select and rename the final columns for the output"
  select(census_id, county, state, carpool) 
paste0("These are the ten lowest counties in car pool percentage.")

## [1] "These are the ten lowest counties in car pool percentage."

min_10_carpool

## # A tibble: 11 × 4
##    census_id county      state        carpool
##        <dbl> <chr>       <chr>          <dbl>
##  1     48261 Kenedy      Texas            0  
##  2     48269 King        Texas            0  
##  3     48235 Irion       Texas            0.9
##  4     31183 Wheeler     Nebraska         1.3
##  5     36061 New York    New York         1.9
##  6     13309 Wheeler     Georgia          2.3
##  7     38029 Emmons      North Dakota     2.3
##  8     30019 Daniels     Montana          2.6
##  9     31057 Dundy       Nebraska         2.6
## 10     46069 Hyde        South Dakota     2.8
## 11     51720 Norton city Virginia         2.8

10d. On average, what state is the best ranked for carpooling? Calculate the average carpool time for each state

average_carpool_percent_by_state <-acs_2015_county_data_revised %>%
  group_by(state) %>%
  summarise(
    average_carpool_time = mean(carpool, na.rm = TRUE)
  ) 
average_carpool_percent_by_state

## # A tibble: 51 × 2
##    state                average_carpool_time
##    <chr>                               <dbl>
##  1 Alabama                             10.2 
##  2 Alaska                              12.1 
##  3 Arizona                             11.6 
##  4 Arkansas                            11.9 
##  5 California                          11.3 
##  6 Colorado                            11.2 
##  7 Connecticut                          7.94
##  8 Delaware                             8.53
##  9 District of Columbia                 5.7 
## 10 Florida                             11.0 
## # ℹ 41 more rows

best_state_carpool <- average_carpool_percent_by_state %>%
  # Select the top county
  slice_max(average_carpool_time, n = 1) %>%
  # Select and rename the final columns for the output"
  select(state, average_carpool_time)
best_state_carpool

## # A tibble: 1 × 2
##   state  average_carpool_time
##   <chr>                 <dbl>
## 1 Alaska                 12.1

What are the top 5 states for carpooling?

best_5_states_carpool <- average_carpool_percent_by_state %>%
  # Select the top county
  slice_max(average_carpool_time, n = 5) %>%
  # Select and rename the final columns for the output"
  select(state, average_carpool_time)
best_5_states_carpool

## # A tibble: 5 × 2
##   state    average_carpool_time
##   <chr>                   <dbl>
## 1 Alaska                   12.1
## 2 Arkansas                 11.9
## 3 Utah                     11.9
## 4 Texas                    11.8
## 5 Nevada                   11.7

Morelli Assignment 3

Michael Morelli

2025-10-03

Importing Data and Data Cleaning

Including Plots