library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
acs_data <- read_csv("~/Downloads/acs_2015_county_data_revised.csv")
## Rows: 3142 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(acs_data)
## [1] 3142 35
nrow(acs_data)
## [1] 3142
ncol(acs_data)
## [1] 35
Rationale: Importing the dataset with Base R functions is not allowed, so a tidyverse function is used instead => library(tidyverse).
Import the dataset into R: acs_data <- read_csv("~/Downloads/acs_2015_county_data_revised.csv"). read_csv() comes from the readr package, which satisfies the "NOT Base R" requirement.
dim() / nrow() / ncol() report the dimensions of the dataset.
Check rows and columns together: dim(acs_data).
Check the number of rows: nrow(acs_data), which returns 3142 rows.
Check the number of columns: ncol(acs_data), which returns 35 columns.
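As an optional refinement (a sketch, assuming the same file path), the column types could also be declared at import time through read_csv()'s col_types argument, which would make the separate type conversion shown later unnecessary:

acs_data <- read_csv(
  "~/Downloads/acs_2015_county_data_revised.csv",
  col_types = cols(
    census_id = col_character(),   # keep the identifier as text
    state     = col_factor(),      # categorical variable
    county    = col_factor(),      # categorical variable
    .default  = col_double()       # every remaining column is numeric
  )
)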
glimpse(acs_data)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
acs_data <- acs_data %>%
mutate(
census_id = as.character(census_id),
state = as.factor(state),
county = as.factor(county)
)
glimpse(acs_data)
## Rows: 3,142
## Columns: 35
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "1013",…
## $ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama, A…
## $ county <fct> Autauga, Baldwin, Barbour, Bibb, Blount, Bullock, Butle…
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
Rationale: glimpse() was used to view the overall structure of the dataset.
The census identifier (census_id) is an ID rather than a quantity used in calculations -> it should be converted to "character".
State Name (state) and County or county equivalent (county) are categorical variables -> they should be converted to "factor".
The remaining variables are already numeric, which is the correct type -> no conversion is needed.
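As a quick check (a sketch using purrr, which the tidyverse already loads), the classes of the converted columns can be confirmed directly:

acs_data %>%
  select(census_id, state, county) %>%
  map_chr(~ class(.x)[1])   # expected: "character", "factor", "factor"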
sum(is.na(acs_data))
## [1] 2
acs_data_clean <- tidyr::drop_na(acs_data) # Remove NA values
sum(is.na(acs_data_clean))
## [1] 0
Rationale: sum(is.na()) was used to count the total number of NA values in the dataset (2 in this case), and tidyr::drop_na() removed the rows containing them, which the second check confirms (0 remaining).
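Before dropping rows, it can also be useful to see which columns the missing values sit in (a sketch; the column names returned depend on the data):

acs_data %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  filter(n_missing > 0)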
summary(acs_data_clean)
## census_id state county total_pop
## Length:3140 Texas : 253 Washington: 31 Min. : 267
## Class :character Georgia : 159 Jefferson : 26 1st Qu.: 11036
## Mode :character Virginia: 133 Franklin : 25 Median : 25793
## Kentucky: 120 Jackson : 24 Mean : 100801
## Missouri: 115 Lincoln : 24 3rd Qu.: 67620
## Kansas : 105 Madison : 20 Max. :10038388
## (Other) :2255 (Other) :2990
## men women hispanic white
## Min. : 136 Min. : 131 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5551 1st Qu.: 5488 1st Qu.: 1.900 1st Qu.:65.67
## Median : 12838 Median : 12916 Median : 3.700 Median :84.65
## Mean : 49597 Mean : 51204 Mean : 8.819 Mean :77.31
## 3rd Qu.: 33328 3rd Qu.: 34123 3rd Qu.: 9.000 3rd Qu.:93.33
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.885 Mean : 1.763 Mean : 1.253 Mean : 0.07357
## 3rd Qu.:10.200 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :11.10000
##
## citizen income income_per_cap poverty
## Min. : 199 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8276 1st Qu.: 38826 1st Qu.:20470 1st Qu.:12.0
## Median : 19454 Median : 45094 Median :23574 Median :16.0
## Mean : 70849 Mean : 46824 Mean :24331 Mean :16.7
## 3rd Qu.: 50795 3rd Qu.: 52248 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
##
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.05 Mean :18.25 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.42 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
##
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.2 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.50 1st Qu.:76.6 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.6 Median : 9.90
## Mean :12.75 Mean :15.82 Mean :79.1 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.6 3rd Qu.:11.90
## Max. :40.30 Max. :55.60 Max. :94.6 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.90 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.30 Median : 4.000
## Mean : 0.9681 Mean : 3.294 Mean : 1.61 Mean : 4.694
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.90 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.10 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 166 Min. :29.50 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4532 1st Qu.:70.90 1st Qu.:13.07
## Median :22.90 Median : 10657 Median :75.85 Median :16.10
## Mean :23.15 Mean : 46416 Mean :74.45 Mean :17.33
## 3rd Qu.:26.60 3rd Qu.: 29272 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.922 Mean :0.2917 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
Rationale:
Based on the summary, no obvious outliers appear in variables such as census_id, state, county, the population-percentage variables (hispanic, white, black, native, asian, pacific), income, poverty, etc.
However, the count variables total_pop, men, women, citizen and employed do show outliers: their maximum values are several orders of magnitude larger than the typical county, which pulls the means far above the medians.
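Before removing anything, the suspected outliers can be inspected directly (a sketch; the 1,000,000 cutoff mirrors the threshold used in the filter below):

acs_data_clean %>%
  filter(total_pop >= 1e6) %>%                      # counties above the chosen cutoff
  select(census_id, county, state, total_pop) %>%
  arrange(desc(total_pop))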
# Fix outliers
quantile(acs_data_clean$total_pop, probs = c(0.95, 0.99, 0.999), na.rm = TRUE)
## 95% 99% 99.9%
## 435870.5 1229322.0 3907631.5
acs_data_clean <- acs_data_clean %>%
filter(total_pop < 1000000, men < 1000000, women < 1000000,
citizen < 1000000, employed < 1000000)
acs_data_clean %>%
summarise(more_women_than_men = sum(women > men))
## # A tibble: 1 × 1
## more_women_than_men
## <int>
## 1 1948
Rationale:
women > men checks, for each county, whether the number of women exceeds the number of men; the result is a logical vector of TRUE/FALSE values.
sum(women > men) counts the counties that satisfy the condition, because sum() treats TRUE as 1 and FALSE as 0.
summarise() wraps the result in a clean one-row summary table.
=> With a result of 1948, there are 1948 counties where the number of women is greater than the number of men.
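The same pipeline can also report the share of counties, not just the count (a sketch):

acs_data_clean %>%
  summarise(
    more_women_than_men = sum(women > men),
    share_of_counties   = mean(women > men)   # mean of a logical gives the proportion
  )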
acs_data_clean %>%
summarise(unemployment_rate_lower_10 = sum(unemployment < 10))
## # A tibble: 1 × 1
## unemployment_rate_lower_10
## <int>
## 1 2391
Rationale:
unemployment < 10 checks whether each county's unemployment rate is below 10%; the result is a logical vector of TRUE/FALSE values.
sum(unemployment < 10) counts the counties that satisfy the condition.
summarise() wraps the result in a clean one-row summary table.
=> With a result of 2391, there are 2391 counties where the unemployment rate is below 10%.
top_commute <- acs_data_clean %>%
select(census_id, county, state, mean_commute) %>%
arrange(desc(mean_commute)) %>%
top_n(10, mean_commute)
top_commute
## # A tibble: 10 × 4
## census_id county state mean_commute
## <chr> <fct> <fct> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 24017 Charles Maryland 42.8
## 3 51187 Warren Virginia 42.7
## 4 36085 Richmond New York 42.6
## 5 51193 Westmoreland Virginia 42.5
## 6 8093 Park Colorado 42.4
## 7 54015 Clay West Virginia 41.4
## 8 17013 Calhoun Illinois 41.1
## 9 28041 Greene Mississippi 41
## 10 48407 San Jacinto Texas 40.9
Rationale:
select(census_id, county, state, mean_commute) keeps only the columns the question needs.
arrange(desc(mean_commute)) sorts the data by mean_commute in descending order (highest to lowest).
top_n(10, mean_commute) keeps the 10 rows with the highest mean_commute values; since top_n() does not reorder rows, the earlier arrange() is what keeps the output sorted from highest to lowest.
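Note that top_n() is superseded in current dplyr; an equivalent sketch with slice_max(), which both selects and orders the rows, would be:

top_commute <- acs_data_clean %>%
  select(census_id, county, state, mean_commute) %>%
  slice_max(mean_commute, n = 10)   # keeps the 10 highest values, sorted highest first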
lowest_women <- acs_data_clean %>%
mutate(percentage_women = (women / (men + women)) * 100) %>%
select(census_id, county, state, percentage_women) %>%
arrange(percentage_women) %>%
slice_head(n = 10)
lowest_women
## # A tibble: 10 × 4
## census_id county state percentage_women
## <chr> <fct> <fct> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
Rationale:
mutate() creates a new variable, percentage_women, computed as women / (men + women) * 100 for each county.
select(census_id, county, state, percentage_women) keeps only the columns the question needs.
arrange(percentage_women) sorts the data by percentage_women in ascending order (lowest to highest).
slice_head(n = 10) takes the first 10 rows after sorting (the 10 counties with the lowest percentage of women).
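An equivalent sketch that lets slice_min() handle both the ordering and the selection:

acs_data_clean %>%
  mutate(percentage_women = women / (men + women) * 100) %>%
  select(census_id, county, state, percentage_women) %>%
  slice_min(percentage_women, n = 10)   # the 10 lowest percentages, smallest first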
acs_data_clean_race <- acs_data_clean %>%
mutate(race = hispanic + white + black + native + asian + pacific)
acs_data_clean_race %>%
select(census_id, county, state, race) %>%
arrange(race) %>%
slice_head(n = 10)
## # A tibble: 10 × 4
## census_id county state race
## <chr> <fct> <fct> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
Rationale:
mutate() creates a new variable named race that stores the sum of the six race percentages.
select(census_id, county, state, race) keeps only the columns the question needs.
arrange(race) sorts the data by race in ascending order (lowest to highest).
slice_head(n = 10) takes the first 10 rows after sorting (the 10 counties with the lowest sum of race percentages).
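If more race columns were ever added, the same sum could be written more compactly with across() (a sketch producing the same race column):

acs_data_clean %>%
  mutate(race = rowSums(across(c(hispanic, white, black, native, asian, pacific))))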
acs_data_clean_race %>%
group_by(state) %>%
summarise(avg_race = mean(race, na.rm = TRUE)) %>%
arrange(avg_race) %>%
slice_head(n = 1)
## # A tibble: 1 × 2
## state avg_race
## <fct> <dbl>
## 1 Hawaii 80.3
Rationale:
group_by(state) is used to group the data by state -> all calculations in the next steps will be done separately for each state instead of the whole dataset.
summarise(avg_race = mean(race, na.rm = TRUE)) is used to create a new column called “avg_race” that contains the average value of the variable “race” for each state.
arrange(avg_race) is used to sort data by avg_race in ascending order (from lowest to highest).
slice_head(n = 1) is used to take the first row from the sorted list.
=> Hawaii is the state that, on average, has the lowest sum of these race percentages.
acs_data_clean_race %>%
summarise(any_gt_100 = any(race > 100),
n_gt_100 = sum(race > 100))
## # A tibble: 1 × 2
## any_gt_100 n_gt_100
## <lgl> <int>
## 1 TRUE 11
Rationale:
any(race > 100) is used to check whether there is any county with race > 100.
sum(race > 100) is used to count the number of counties that satisfy this condition.
summarise() is used to wrap the result into a one-row summary table.
=> 11 counties have a sum greater than 100%.
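To see which counties those are (a sketch):

acs_data_clean_race %>%
  filter(race > 100) %>%
  select(census_id, county, state, race) %>%
  arrange(desc(race))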
acs_data_clean_race %>%
filter(near(race, 100, tol = 1e-9)) %>%
distinct(state) %>%
count(name = "states_eq_100")
## # A tibble: 1 × 1
## states_eq_100
## <int>
## 1 16
Rationale:
filter(near(race, 100, tol = 1e-9)) selects all the counties whose race sum equals 100 (near() avoids floating-point comparison issues).
distinct(state) keeps the states that contain at least one such county, and count() returns how many there are.
=> 16 states contain at least one county whose race sum equals exactly 100%.
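To list those 16 states rather than just count them (a sketch):

acs_data_clean_race %>%
  filter(near(race, 100, tol = 1e-9)) %>%
  distinct(state) %>%
  arrange(state)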
acs_data_clean_carpool_rank <- acs_data_clean %>%
filter(!is.na(carpool)) %>%
mutate(carpool_rank = min_rank(desc(carpool)))
Rationale:
filter(!is.na(carpool)) removes NA values so the ranking is not distorted.
mutate() creates a new variable called carpool_rank; min_rank(desc(carpool)) assigns rank 1 to the highest carpool share and gives tied values the same (minimum) rank, which is why some rank numbers are skipped in the output below.
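A minimal sketch of how min_rank(desc()) handles ties (toy values, not from the dataset):

min_rank(desc(c(3, 1, 3, 2)))
# returns 1 4 1 3: the two 3s share rank 1, and rank 2 is skipped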
acs_data_clean_carpool_rank %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank, state, county) %>%
slice_head(n = 10)
## # A tibble: 10 × 5
## census_id county state carpool carpool_rank
## <chr> <fct> <fct> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
Rationale:
select(census_id, county, state, carpool, carpool_rank) keeps only the columns the question needs.
arrange(carpool_rank, state, county) sorts the data by carpool_rank in ascending order, breaking ties alphabetically by state and then county.
slice_head(n = 10) takes the first 10 rows after sorting (the 10 highest-ranked counties for carpooling).
acs_data_clean_carpool_rank %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(desc(carpool_rank), state, county) %>%
slice_head(n = 10)
## # A tibble: 10 × 5
## census_id county state carpool carpool_rank
## <chr> <fct> <fct> <dbl> <int>
## 1 48261 Kenedy Texas 0 3098
## 2 48269 King Texas 0 3098
## 3 48235 Irion Texas 0.9 3097
## 4 31183 Wheeler Nebraska 1.3 3096
## 5 13309 Wheeler Georgia 2.3 3094
## 6 38029 Emmons North Dakota 2.3 3094
## 7 30019 Daniels Montana 2.6 3092
## 8 31057 Dundy Nebraska 2.6 3092
## 9 46069 Hyde South Dakota 2.8 3090
## 10 51720 Norton city Virginia 2.8 3090
Rationale:
select(census_id, county, state, carpool, carpool_rank) keeps only the columns the question needs.
arrange(desc(carpool_rank), state, county) sorts the data by carpool_rank in descending order, breaking ties alphabetically by state and then county.
slice_head(n = 10) takes the first 10 rows after sorting (the 10 lowest-ranked counties for carpooling).
acs_data_clean_carpool_rank %>%
group_by(state) %>%
summarise(mean_rank = mean(carpool_rank)) %>%
arrange(mean_rank) %>%
slice_head(n = 1)
## # A tibble: 1 × 2
## state mean_rank
## <fct> <dbl>
## 1 Hawaii 648
Rationale:
group_by(state) is used to group the data by state -> all calculations in the next steps will be done separately for each state instead of the whole dataset.
mean(carpool_rank) is used to calculate the average (mean) carpool rank across all counties within each state.
summarise(mean_rank = mean(carpool_rank)) is used to collapse these results, creating a single row for each state containing its respective mean rank.
arrange(mean_rank) is used to sort data by mean_rank in ascending order (from lowest to highest).
slice_head(n = 1) is used to take the first row from the sorted list.
=> Hawaii is the state with the best ranking for carpooling.
acs_data_clean_carpool_rank %>%
group_by(state) %>%
summarise(mean_rank = mean(carpool_rank)) %>%
arrange(mean_rank) %>%
slice_head(n = 5)
## # A tibble: 5 × 2
## state mean_rank
## <fct> <dbl>
## 1 Hawaii 648
## 2 Arizona 957.
## 3 Utah 1020.
## 4 Arkansas 1045.
## 5 Alaska 1076.
Rationale:
group_by(state) is used to group the data by state -> all calculations in the next steps will be done separately for each state instead of the whole dataset.
mean(carpool_rank) is used to calculate the average (mean) carpool rank across all counties within each state.
summarise(mean_rank = mean(carpool_rank)) is used to collapse these results, creating a single row for each state containing its respective mean rank.
arrange(mean_rank) is used to sort data by mean_rank in ascending order (from lowest to highest).
slice_head(n = 5) is used to take the first 5 rows from the sorted list (the five states with the best average carpool rank).
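As an optional visual check (a sketch; the axis labels are illustrative), the same five states could be plotted:

acs_data_clean_carpool_rank %>%
  group_by(state) %>%
  summarise(mean_rank = mean(carpool_rank)) %>%
  slice_min(mean_rank, n = 5) %>%
  ggplot(aes(x = reorder(state, mean_rank), y = mean_rank)) +
  geom_col() +
  coord_flip() +
  labs(x = "State", y = "Mean carpool rank (lower is better)")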