Question 01: Import the data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
acs_data <- read_csv("~/Downloads/acs_2015_county_data_revised.csv")
## Rows: 3142 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(acs_data)
## [1] 3142   35
nrow(acs_data)
## [1] 3142
ncol(acs_data)
## [1] 35

Rationale: Importing the dataset from Base R functions is not allowed, so I need to use “Tidyverse function” instead => library(tidyverse)

Question 02: Change data types

glimpse(acs_data)
## Rows: 3,142
## Columns: 35
## $ census_id      <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state          <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county         <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
acs_data <- acs_data %>%
  mutate(
    census_id = as.character(census_id),  
    state     = as.factor(state),         
    county    = as.factor(county)         
  )
glimpse(acs_data)
## Rows: 3,142
## Columns: 35
## $ census_id      <chr> "1001", "1003", "1005", "1007", "1009", "1011", "1013",…
## $ state          <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama, A…
## $ county         <fct> Autauga, Baldwin, Barbour, Bibb, Blount, Bullock, Butle…
## $ total_pop      <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men            <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women          <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic       <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white          <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black          <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native         <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian          <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific        <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen        <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income         <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty        <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty  <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional   <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service        <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office         <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction   <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production     <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive          <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool        <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit        <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk           <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp   <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home   <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute   <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed       <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work   <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work    <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed  <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work    <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment   <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…

Rationale: glimpse() was used to view the overall structure of the dataset.

Question 03: Missing values

sum(is.na(acs_data))
## [1] 2
acs_data_clean <- tidyr::drop_na(acs_data) # Remove NA values
sum(is.na(acs_data_clean))
## [1] 0

Rationale: sum(is.na())was used to view the total number of NA values in the dataset.

Question 04: Unusual values

summary(acs_data_clean)
##   census_id              state             county       total_pop       
##  Length:3140        Texas   : 253   Washington:  31   Min.   :     267  
##  Class :character   Georgia : 159   Jefferson :  26   1st Qu.:   11036  
##  Mode  :character   Virginia: 133   Franklin  :  25   Median :   25793  
##                     Kentucky: 120   Jackson   :  24   Mean   :  100801  
##                     Missouri: 115   Lincoln   :  24   3rd Qu.:   67620  
##                     Kansas  : 105   Madison   :  20   Max.   :10038388  
##                     (Other) :2255   (Other)   :2990                     
##       men              women            hispanic          white      
##  Min.   :    136   Min.   :    131   Min.   : 0.000   Min.   : 0.90  
##  1st Qu.:   5551   1st Qu.:   5488   1st Qu.: 1.900   1st Qu.:65.67  
##  Median :  12838   Median :  12916   Median : 3.700   Median :84.65  
##  Mean   :  49597   Mean   :  51204   Mean   : 8.819   Mean   :77.31  
##  3rd Qu.:  33328   3rd Qu.:  34123   3rd Qu.: 9.000   3rd Qu.:93.33  
##  Max.   :4945351   Max.   :5093037   Max.   :98.700   Max.   :99.80  
##                                                                      
##      black            native           asian           pacific        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.00000  
##  1st Qu.: 0.600   1st Qu.: 0.100   1st Qu.: 0.200   1st Qu.: 0.00000  
##  Median : 2.100   Median : 0.300   Median : 0.500   Median : 0.00000  
##  Mean   : 8.885   Mean   : 1.763   Mean   : 1.253   Mean   : 0.07357  
##  3rd Qu.:10.200   3rd Qu.: 0.600   3rd Qu.: 1.200   3rd Qu.: 0.00000  
##  Max.   :85.900   Max.   :92.100   Max.   :41.600   Max.   :11.10000  
##                                                                       
##     citizen            income       income_per_cap     poverty    
##  Min.   :    199   Min.   : 19328   Min.   : 8292   Min.   : 1.4  
##  1st Qu.:   8276   1st Qu.: 38826   1st Qu.:20470   1st Qu.:12.0  
##  Median :  19454   Median : 45094   Median :23574   Median :16.0  
##  Mean   :  70849   Mean   : 46824   Mean   :24331   Mean   :16.7  
##  3rd Qu.:  50795   3rd Qu.: 52248   3rd Qu.:27138   3rd Qu.:20.3  
##  Max.   :6046749   Max.   :123453   Max.   :65600   Max.   :53.3  
##                                                                   
##  child_poverty    professional      service          office     
##  Min.   : 0.00   Min.   :13.50   Min.   : 5.00   Min.   : 4.10  
##  1st Qu.:16.10   1st Qu.:26.70   1st Qu.:15.90   1st Qu.:20.20  
##  Median :22.50   Median :30.00   Median :18.00   Median :22.40  
##  Mean   :23.29   Mean   :31.05   Mean   :18.25   Mean   :22.13  
##  3rd Qu.:29.50   3rd Qu.:34.42   3rd Qu.:20.20   3rd Qu.:24.30  
##  Max.   :72.30   Max.   :74.00   Max.   :36.60   Max.   :35.40  
##                                                                 
##   construction     production        drive         carpool     
##  Min.   : 1.70   Min.   : 0.00   Min.   : 5.2   Min.   : 0.00  
##  1st Qu.: 9.80   1st Qu.:11.50   1st Qu.:76.6   1st Qu.: 8.50  
##  Median :12.20   Median :15.40   Median :80.6   Median : 9.90  
##  Mean   :12.75   Mean   :15.82   Mean   :79.1   Mean   :10.33  
##  3rd Qu.:15.00   3rd Qu.:19.40   3rd Qu.:83.6   3rd Qu.:11.90  
##  Max.   :40.30   Max.   :55.60   Max.   :94.6   Max.   :29.90  
##                                                                
##     transit             walk         other_transp    work_at_home   
##  Min.   : 0.0000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.1000   1st Qu.: 1.400   1st Qu.: 0.90   1st Qu.: 2.800  
##  Median : 0.4000   Median : 2.400   Median : 1.30   Median : 4.000  
##  Mean   : 0.9681   Mean   : 3.294   Mean   : 1.61   Mean   : 4.694  
##  3rd Qu.: 0.8000   3rd Qu.: 4.000   3rd Qu.: 1.90   3rd Qu.: 5.700  
##  Max.   :61.7000   Max.   :71.200   Max.   :39.10   Max.   :37.200  
##                                                                     
##   mean_commute      employed        private_work    public_work   
##  Min.   : 4.90   Min.   :    166   Min.   :29.50   Min.   : 5.80  
##  1st Qu.:19.30   1st Qu.:   4532   1st Qu.:70.90   1st Qu.:13.07  
##  Median :22.90   Median :  10657   Median :75.85   Median :16.10  
##  Mean   :23.15   Mean   :  46416   Mean   :74.45   Mean   :17.33  
##  3rd Qu.:26.60   3rd Qu.:  29272   3rd Qu.:79.80   3rd Qu.:20.10  
##  Max.   :44.00   Max.   :4635465   Max.   :88.30   Max.   :66.20  
##                                                                   
##  self_employed     family_work      unemployment   
##  Min.   : 0.000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.: 5.400   1st Qu.:0.1000   1st Qu.: 5.500  
##  Median : 6.900   Median :0.2000   Median : 7.500  
##  Mean   : 7.922   Mean   :0.2917   Mean   : 7.815  
##  3rd Qu.: 9.400   3rd Qu.:0.3000   3rd Qu.: 9.700  
##  Max.   :36.600   Max.   :9.8000   Max.   :29.400  
## 

Rationale:

  1. After observation, no outliers were detected in variables such as census_id, state, county, percent of population (hispanic, white, black, native, asian, pacific), income, poverty, etc.

  2. However, outliers were detected in variables, including: total_pop, men, women, citizen, employed because Max.value are too big for tract level.

# Fix outliers
quantile(acs_data_clean$total_pop, probs = c(0.95, 0.99, 0.999), na.rm = TRUE)
##       95%       99%     99.9% 
##  435870.5 1229322.0 3907631.5
acs_data_clean <- acs_data_clean %>% filter(total_pop < 1000000, men < 1000000, women < 1000000,
                                            citizen < 1000000, employed < 1000000)

Question 05

acs_data_clean %>%
  summarise(more_women_than_men = sum(women > men))
## # A tibble: 1 × 1
##   more_women_than_men
##                 <int>
## 1                1948

Rationale:

=> With the result of 1948, there are 1948 counties where the number of women is more than the number of men.

Question 06

acs_data_clean %>%
  summarise(unemployment_rate_lower_10 = sum(unemployment < 10))
## # A tibble: 1 × 1
##   unemployment_rate_lower_10
##                        <int>
## 1                       2391

Rationale:

=> With the result of 2391, we have 2391 counties where the unemployment rate is below 10%.

Question 07

top_commute <- acs_data_clean %>%
  select(census_id, county, state, mean_commute) %>%   
  arrange(desc(mean_commute)) %>%                      
  top_n(10, mean_commute)
top_commute
## # A tibble: 10 × 4
##    census_id county       state         mean_commute
##    <chr>     <fct>        <fct>                <dbl>
##  1 42103     Pike         Pennsylvania          44  
##  2 24017     Charles      Maryland              42.8
##  3 51187     Warren       Virginia              42.7
##  4 36085     Richmond     New York              42.6
##  5 51193     Westmoreland Virginia              42.5
##  6 8093      Park         Colorado              42.4
##  7 54015     Clay         West Virginia         41.4
##  8 17013     Calhoun      Illinois              41.1
##  9 28041     Greene       Mississippi           41  
## 10 48407     San Jacinto  Texas                 40.9

Rationale:

  1. select(census_id, county, state, mean_commute) is used to take the columns the question needs.

  2. arrange(desc(mean_commute)) is used to sort data by mean_commute in descending order (from highest to lowest).

  3. top_n(10, mean_commute) is used to select the 10 rows with the highest mean_commute values (top 10 counties with the highest mean commute).

Question 08

lowest_women <- acs_data_clean %>%
  mutate(percentage_women = (women / (men + women)) * 100) %>% 
  select(census_id, county, state, percentage_women) %>%         
  arrange(percentage_women) %>%                                  
  slice_head(n = 10)
lowest_women
## # A tibble: 10 × 4
##    census_id county                 state        percentage_women
##    <chr>     <fct>                  <fct>                   <dbl>
##  1 42053     Forest                 Pennsylvania             26.8
##  2 8011      Bent                   Colorado                 31.4
##  3 51183     Sussex                 Virginia                 31.5
##  4 13309     Wheeler                Georgia                  32.1
##  5 6035      Lassen                 California               33.2
##  6 48095     Concho                 Texas                    33.3
##  7 13053     Chattahoochee          Georgia                  33.4
##  8 2013      Aleutians East Borough Alaska                   33.5
##  9 22125     West Feliciana         Louisiana                33.6
## 10 32027     Pershing               Nevada                   33.7

Rationale:

  1. mutate() is used to create a new variable to calculate the percentage of women for each county.

  2. select(census_id, county, state, percentage_women) is used to take the columns the question needs.

  3. arrange(percentage_women) is used to sort data by percentage_women in ascending order (from lowest to highest).

  4. slice_head(n = 10) is used to take the first 10 rows after sorting (top 10 counties with the lowest percentages of women).

Question 09:

09a.

acs_data_clean_race <- acs_data_clean %>%
  mutate(race = hispanic + white + black + native + asian + pacific)
acs_data_clean_race %>%
  select(census_id, county, state, race) %>%
  arrange(race) %>%
  slice_head(n = 10)
## # A tibble: 10 × 4
##    census_id county                   state      race
##    <chr>     <fct>                    <fct>     <dbl>
##  1 15001     Hawaii                   Hawaii     76.4
##  2 15009     Maui                     Hawaii     79.2
##  3 40097     Mayes                    Oklahoma   79.7
##  4 15003     Honolulu                 Hawaii     81.5
##  5 40123     Pontotoc                 Oklahoma   82.8
##  6 47061     Grundy                   Tennessee  83  
##  7 2282      Yakutat City and Borough Alaska     83.4
##  8 40069     Johnston                 Oklahoma   84  
##  9 15007     Kauai                    Hawaii     84.1
## 10 40003     Alfalfa                  Oklahoma   85.1

Rationale:

  1. mutate() is used to create a new variable that stores the sum of these race percentages (new variable named “race”).

  2. select(census_id, county, state, race) is used to take the columns the question needs.

  3. arrange(race) is used to sort data by race in ascending order (from lowest to highest).

  4. slice_head(n = 10) is used to take the first 10 rows after sorting (top 10 counties with the lowest sum of race percentage).

09b.

acs_data_clean_race %>%
  group_by(state) %>%
  summarise(avg_race = mean(race, na.rm = TRUE)) %>%
  arrange(avg_race) %>%
  slice_head(n = 1)
## # A tibble: 1 × 2
##   state  avg_race
##   <fct>     <dbl>
## 1 Hawaii     80.3

Rationale:

  1. group_by(state) is used to group the data by state -> all calculations in the next steps will be done separately for each state instead of the whole dataset.

  2. summarise(avg_race = mean(race, na.rm = TRUE)) is used to create a new column called “avg_race” that contains the average value of the variable “race” for each state.

  3. arrange(avg_race) is used to sort data by avg_race in ascending order (from lowest to highest).

  4. slice_head(n = 1) is used to take the first row from the sorted list.

=> Hawaii is the state, on average, has the lowest sum of these race percentage.

09c.

acs_data_clean_race %>%
  summarise(any_gt_100 = any(race > 100),
            n_gt_100   = sum(race > 100))
## # A tibble: 1 × 2
##   any_gt_100 n_gt_100
##   <lgl>         <int>
## 1 TRUE             11

Rationale:

  1. any(race > 100) is used to check whether there is any county with race > 100.

  2. sum(race > 100) is used to count the number of counties that satisfy this condition.

  3. summarise() is used to wrap the result into a one-row summary table.

=> 11 counties have a sum greater than 100%.

09d.

acs_data_clean_race %>%
  filter(near(race, 100, tol = 1e-9)) %>%
  distinct(state) %>%
  count(name = "states_eq_100")
## # A tibble: 1 × 1
##   states_eq_100
##           <int>
## 1            16

Rationale:

  1. filter(near(race, 100, tol = 1e-9)) is used to select all the counties with race = 100

  2. distinct(state) is used to identify the states that contain at least one such county.

=> 16 counties have a sum that equals exactly to 100%.

Question 10

10a.

acs_data_clean_carpool_rank <- acs_data_clean %>%
  filter(!is.na(carpool)) %>%
  mutate(carpool_rank = min_rank(desc(carpool)))

Rationale:

  1. filter(!is.na(carpool)) is used to remove NA values for accurate ranking.

  2. mutate() is used to create a new variable called carpool_rank.

10b.

acs_data_clean_carpool_rank %>%
  select(census_id, county, state, carpool, carpool_rank) %>%
  arrange(carpool_rank, state, county) %>%
  slice_head(n = 10)
## # A tibble: 10 × 5
##    census_id county   state    carpool carpool_rank
##    <chr>     <fct>    <fct>      <dbl>        <int>
##  1 13061     Clay     Georgia     29.9            1
##  2 18087     LaGrange Indiana     27              2
##  3 13165     Jenkins  Georgia     25.3            3
##  4 5133      Sevier   Arkansas    24.4            4
##  5 20175     Seward   Kansas      23.4            5
##  6 48079     Cochran  Texas       22.8            6
##  7 48247     Jim Hogg Texas       22.6            7
##  8 48393     Roberts  Texas       22.4            8
##  9 39075     Holmes   Ohio        21.8            9
## 10 21197     Powell   Kentucky    21.6           10

Rationale:

  1. select(census_id, county, state, carpool, carpool_rank) is used to take the columns the question needs.

  2. arrange(carpool_rank, state, county) is used to sort data by carpool_rank first, then by state and county alphabetically if there are ties, in ascending order (from lowest to highest).

  3. slice_head(n = 10) is used to take the first 10 rows after sorting (top 10 highest ranked counties for carpooling).

10c.

acs_data_clean_carpool_rank %>%
  select(census_id, county, state, carpool, carpool_rank) %>%
  arrange(desc(carpool_rank), state, county) %>%
  slice_head(n = 10)
## # A tibble: 10 × 5
##    census_id county      state        carpool carpool_rank
##    <chr>     <fct>       <fct>          <dbl>        <int>
##  1 48261     Kenedy      Texas            0           3098
##  2 48269     King        Texas            0           3098
##  3 48235     Irion       Texas            0.9         3097
##  4 31183     Wheeler     Nebraska         1.3         3096
##  5 13309     Wheeler     Georgia          2.3         3094
##  6 38029     Emmons      North Dakota     2.3         3094
##  7 30019     Daniels     Montana          2.6         3092
##  8 31057     Dundy       Nebraska         2.6         3092
##  9 46069     Hyde        South Dakota     2.8         3090
## 10 51720     Norton city Virginia         2.8         3090

Rationale:

  1. select(census_id, county, state, carpool, carpool_rank) is used to take the columns the question needs.

  2. arrange(desc(carpool_rank), state, county) is used to sort data by carpool_rank first, then by state and county alphabetically if there are ties, in descending order (from highest to lowest).

  3. slice_head(n = 10) is used to take the first 10 rows after sorting (top 10 lowest ranked counties for carpooling).

10d.

acs_data_clean_carpool_rank %>%
  group_by(state) %>%
  summarise(mean_rank = mean(carpool_rank)) %>%
  arrange(mean_rank) %>%
  slice_head(n = 1)
## # A tibble: 1 × 2
##   state  mean_rank
##   <fct>      <dbl>
## 1 Hawaii       648

Rationale:

  1. group_by(state) is used to group the data by state -> all calculations in the next steps will be done separately for each state instead of the whole dataset.

  2. mean(carpool_rank) is used to calculate the average (mean) carpool rank across all counties within each state.

  3. summarise(mean_rank = mean(carpool_rank)) is used to collapse these results, creating a single row for each state containing its respective mean rank.

  4. arrange(mean_rank) is used to sort data by mean_rank in ascending order (from lowest to highest).

  5. slice_head(n = 1) is used to take the first row from the sorted list.

=> Hawaii is the state with the best ranking for carpooling.

10e.

acs_data_clean_carpool_rank %>%
  group_by(state) %>%
  summarise(mean_rank = mean(carpool_rank)) %>%
  arrange(mean_rank) %>%
  slice_head(n = 5)
## # A tibble: 5 × 2
##   state    mean_rank
##   <fct>        <dbl>
## 1 Hawaii        648 
## 2 Arizona       957.
## 3 Utah         1020.
## 4 Arkansas     1045.
## 5 Alaska       1076.

Rationale:

  1. group_by(state) is used to group the data by state -> all calculations in the next steps will be done separately for each state instead of the whole dataset.

  2. mean(carpool_rank) is used to calculate the average (mean) carpool rank across all counties within each state.

  3. summarise(mean_rank = mean(carpool_rank)) is used to collapse these results, creating a single row for each state containing its respective mean rank.

  4. arrange(mean_rank) is used to sort data by mean_rank in ascending order (from lowest to highest).

  5. slice_head(n = 5) is used to take the first 5 row from the sorted list.