knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
I loaded the tidyverse in order to import the dataset. Because the file is a csv, I was able to use read_csv() and point to the file name since I had already set the working directory.
Then I used dim(acs_2015_county_data) to identify the number of rows and columns in the dataset. There are 3,142 rows and 35 columns.
setwd("~/MSBA 660/Week 6")
# set working directory
acs_2015_county_data <- read_csv("acs_2015_county_data_revised.csv")
## Rows: 3142 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, county
## dbl (33): census_id, total_pop, men, women, hispanic, white, black, native, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# import using tidyverse
dim(acs_2015_county_data)
# checking number of rows and columns
I did not change any of the data types: the column specification and glimpse() show two chr columns (state and county), and the remaining 33 are dbl columns, which is correct because those variables are counts or percentages.
The data dictionary was used to confirm that each variable matches its column type.
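If a column ever needed a different type than the one readr guessed, the types could be set explicitly at import time. A minimal sketch, assuming the same file name as above:
read_csv("acs_2015_county_data_revised.csv",
         col_types = cols(state = col_character(),
                          county = col_character(),
                          .default = col_double()))
# set state and county as character and every remaining column as double,
# matching the data dictionary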
glimpse(acs_2015_county_data)
## Rows: 3,142
## Columns: 35
## $ census_id <dbl> 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017, 1…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", …
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "Bul…
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11664…
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274, 1…
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 60374, …
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7.6, …
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3, 9…
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, 4.8…
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0.4, …
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0.3, …
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 88612,…
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 41703,…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 21374,…
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6, 1…
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2, 3…
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3, 2…
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5, 1…
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3, 1…
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5, 13…
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4, 2…
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1, 8…
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 12.1…
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0.2, …
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1.1, …
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1.4, …
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1.9, …
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1, 2…
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, 136…
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1, 7…
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1, 1…
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4.1, …
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0.5, …
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9, 9…
I used colSums(is.na(acs_2015_county_data)) to break out, by column, how many missing values were present. This identified two NAs: one in income and one in child_poverty.
For income, the data represent median income, so I used the median of the column to fill in the NA. I chose the median because income is typically not evenly distributed, and the median is not influenced by outliers.
For child_poverty, I used the mean of the column to fill in the NA. I used the mean because child poverty is more uniform and likely to be closer to a normal distribution.
colSums(is.na(acs_2015_county_data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
# find number of missing values
median_income <- median(acs_2015_county_data$income, na.rm = TRUE)
# calculate median of income column
acs_2015_county_data$income[is.na(acs_2015_county_data$income)] <- median_income
# replace the NA in income with the median
summary(acs_2015_county_data$income)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19328 38826 45111 46830 52249 123453
# check the NA is no longer there
mean_child_poverty <- mean(acs_2015_county_data$child_poverty, na.rm = TRUE)
# calculate mean of child_poverty column
acs_2015_county_data$child_poverty[is.na(acs_2015_county_data$child_poverty)] <- mean_child_poverty
# replace NA values in child_poverty with mean
summary(acs_2015_county_data$child_poverty)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 16.10 22.50 23.29 29.48 72.30
# check the NA is no longer there
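For reference, the two replacements above can also be written as a single pipe with tidyr::replace_na(); this sketch produces the same result as the base-R assignments:
acs_2015_county_data <- acs_2015_county_data %>%
  mutate(income = replace_na(income, median(income, na.rm = TRUE)),
         child_poverty = replace_na(child_poverty, mean(child_poverty, na.rm = TRUE)))
# replace the NA in income with the column median and the NA in child_poverty
# with the column mean, in one mutate() call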
The first check for unusual values was to make sure that no column represented as a percentage has a maximum above 100. The first thing that stood out was employed, because the data dictionary identifies it as the percentage of employed people above the age of 16. However, the original data source shows that employed is the number of people employed; it is not a percentage and can legitimately exceed 100, so this column is not unusual despite the initial impression.
The next check was for negative values. This dataset describes people, and you cannot have a negative count of people, so any negative value would be unusual. There were no negative values in the dataset.
summary(acs_2015_county_data)
## census_id state county total_pop
## Min. : 1001 Length:3142 Length:3142 Min. : 85
## 1st Qu.:18178 Class :character Class :character 1st Qu.: 11028
## Median :29176 Mode :character Mode :character Median : 25768
## Mean :30384 Mean : 100737
## 3rd Qu.:45081 3rd Qu.: 67552
## Max. :56045 Max. :10038388
## men women hispanic white
## Min. : 42 Min. : 43 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5546 1st Qu.: 5466 1st Qu.: 1.900 1st Qu.:65.60
## Median : 12826 Median : 12907 Median : 3.700 Median :84.60
## Mean : 49565 Mean : 51172 Mean : 8.826 Mean :77.28
## 3rd Qu.: 33319 3rd Qu.: 34122 3rd Qu.: 9.000 3rd Qu.:93.30
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.879 Mean : 1.766 Mean : 1.258 Mean : 0.08475
## 3rd Qu.:10.175 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :35.30000
## citizen income income_per_cap poverty
## Min. : 80 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8254 1st Qu.: 38826 1st Qu.:20471 1st Qu.:12.0
## Median : 19434 Median : 45111 Median :23577 Median :16.0
## Mean : 70804 Mean : 46830 Mean :24338 Mean :16.7
## 3rd Qu.: 50728 3rd Qu.: 52249 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.04 Mean :18.26 Mean :22.13
## 3rd Qu.:29.48 3rd Qu.:34.40 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.20 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.53 1st Qu.:76.60 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.60 Median : 9.90
## Mean :12.74 Mean :15.82 Mean :79.08 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.60 3rd Qu.:11.88
## Max. :40.30 Max. :55.60 Max. :94.60 Max. :29.90
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.900 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.300 Median : 4.000
## Mean : 0.9675 Mean : 3.307 Mean : 1.614 Mean : 4.697
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.900 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.100 Max. :37.200
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 62 Min. :25.00 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4524 1st Qu.:70.90 1st Qu.:13.10
## Median :22.90 Median : 10644 Median :75.80 Median :16.10
## Mean :23.15 Mean : 46387 Mean :74.44 Mean :17.35
## 3rd Qu.:26.60 3rd Qu.: 29254 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.921 Mean :0.2915 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
sapply(acs_2015_county_data, function(x) max(x, na.rm = TRUE) > 100)
## census_id state county total_pop men
## TRUE TRUE TRUE TRUE TRUE
## women hispanic white black native
## TRUE FALSE FALSE FALSE FALSE
## asian pacific citizen income income_per_cap
## FALSE FALSE TRUE TRUE TRUE
## poverty child_poverty professional service office
## FALSE FALSE FALSE FALSE FALSE
## construction production drive carpool transit
## FALSE FALSE FALSE FALSE FALSE
## walk other_transp work_at_home mean_commute employed
## FALSE FALSE FALSE FALSE TRUE
## private_work public_work self_employed family_work unemployment
## FALSE FALSE FALSE FALSE FALSE
# check which columns have a maximum more than 100
acs_2015_county_data %>%
filter(if_any(where(is.numeric), ~ . < 0))
## # A tibble: 0 × 35
## # ℹ 35 variables: census_id <dbl>, state <chr>, county <chr>, total_pop <dbl>,
## # men <dbl>, women <dbl>, hispanic <dbl>, white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, …
# check for any numeric value below 0
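Because the count and dollar columns (census_id, total_pop, men, women, citizen, income, income_per_cap, employed) are legitimately above 100, a tighter version of the maximum check can drop them first and test only the percentage columns. A sketch of that narrower check:
count_cols <- c("census_id", "total_pop", "men", "women",
                "citizen", "income", "income_per_cap", "employed")
# columns that are counts or dollars rather than percentages
pct_check <- acs_2015_county_data %>%
  select(where(is.numeric), -all_of(count_cols)) %>%
  sapply(function(x) max(x, na.rm = TRUE) > 100)
pct_check[pct_check]
# any TRUE entries left here would flag a percentage column above 100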
To find this I created a data set filtered to observations where the women column is greater than the men column. I then used dim() to count the number of observations (counties).
There are 1,985 counties where the women outnumber the men.
more_women <- acs_2015_county_data %>%
filter(women > men)
# filter the data to only include observations where the number of women is greater than the number of men
dim(more_women)
## [1] 1985 35
# show the number of observations and variables
To find this I created a data set filtered to observations where the unemployment rate is below 10%. No additional calculation was needed because unemployment is already stored as a percentage. I then used dim() to count the number of observations (counties).
There are 2,420 counties where the unemployment rate is below 10%.
unemploy_below_10 <- acs_2015_county_data %>%
filter(unemployment < 10)
# filter the data to only include observations where the unemployment rate is below 10
dim(unemploy_below_10)
## [1] 2420 35
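For reference, both of these counts can also be computed directly in one summarize() call without creating the intermediate data sets; a small sketch:
acs_2015_county_data %>%
  summarize(counties_more_women = sum(women > men),
            counties_unemploy_below_10 = sum(unemployment < 10))
# count the counties meeting each condition in a single step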
I used what we learned in the movie challenge to complete this question. However, per the guidance given in the question, I used top_n() from dplyr instead of head(). top_n() returned the top 10, arrange() orders the output by commute time, and select() displays the columns requested by the question.
The top 10 counties with the highest mean commute are: Pike, Pennsylvania; Bronx, New York; Charles, Maryland; Warren, Virginia; Queens, New York; Richmond, New York; Westmoreland, Virginia; Park, Colorado; Kings, New York; and Clay, West Virginia.
acs_2015_county_data %>%
top_n(10, mean_commute) %>%
arrange(desc(mean_commute)) %>%
select(census_id, county, state, mean_commute)
## # A tibble: 10 × 4
## census_id county state mean_commute
## <dbl> <chr> <chr> <dbl>
## 1 42103 Pike Pennsylvania 44
## 2 36005 Bronx New York 43
## 3 24017 Charles Maryland 42.8
## 4 51187 Warren Virginia 42.7
## 5 36081 Queens New York 42.6
## 6 36085 Richmond New York 42.6
## 7 51193 Westmoreland Virginia 42.5
## 8 8093 Park Colorado 42.4
## 9 36047 Kings New York 41.7
## 10 54015 Clay West Virginia 41.4
#create a table of highest commute
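Note that top_n() is superseded in current dplyr; slice_max() returns the same rows already sorted in descending order, so the arrange() step is not needed. A sketch of the equivalent call:
acs_2015_county_data %>%
  slice_max(order_by = mean_commute, n = 10) %>%
  select(census_id, county, state, mean_commute)
# same top-10 commute table using the newer slice_max() verb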
I took the same approach as the previous question, first creating a women_percentage column with mutate(). slice_min() returns the bottom 10 by that percentage, arrange() orders the output by percentage of women, and select() displays the columns requested by the question.
The counties with the smallest percentages of women are: Forest, Pennsylvania; Bent, Colorado; Sussex, Virginia; Wheeler, Georgia; Lassen, California; Concho, Texas; Chattahoochee, Georgia; Aleutians East Borough, Alaska; West Feliciana, Louisiana; and Pershing, Nevada.
acs_2015_county_data <- acs_2015_county_data %>%
mutate(women_percentage = (women / total_pop) * 100)
#create the new column of percentage of women
acs_2015_county_data %>%
slice_min(n = 10, order_by = women_percentage) %>%
arrange(women_percentage) %>%
select(census_id, county, state, women_percentage)
## # A tibble: 10 × 4
## census_id county state women_percentage
## <dbl> <chr> <chr> <dbl>
## 1 42053 Forest Pennsylvania 26.8
## 2 8011 Bent Colorado 31.4
## 3 51183 Sussex Virginia 31.5
## 4 13309 Wheeler Georgia 32.1
## 5 6035 Lassen California 33.2
## 6 48095 Concho Texas 33.3
## 7 13053 Chattahoochee Georgia 33.4
## 8 2013 Aleutians East Borough Alaska 33.5
## 9 22125 West Feliciana Louisiana 33.6
## 10 32027 Pershing Nevada 33.7
#create a table of the smallest women percentages
What are the top 10 counties with the lowest sum of these race percentage variables? Hawaii, Hawaii; Maui, Hawaii; Mayes, Oklahoma; Honolulu, Hawaii; Pontotoc, Oklahoma; Grundy, Tennessee; Yakutat City and Borough, Alaska; Johnston, Oklahoma; Kauai, Hawaii; and Alfalfa, Oklahoma.
To answer this question I created a new column, race_sum, that sums the pre-existing race percentage columns.
Then I created a tibble that orders the counties in ascending order from the smallest sum to the 10th smallest. This identifies the ten counties with the lowest sum of the race percentage variables.
Which state, on average, has the lowest sum of these race percentage variables? Hawaii is the state with the lowest average sum of the race percentage variables.
To answer this question I created a new object grouped by state, averaging the race sum of each county. I arranged the data by the average race sum so that I could use the head() function to display the lowest value.
Do any counties have a sum greater than 100%? There are 11 counties with a sum above 100.
To find this I created a data set filtered to observations where race_sum is greater than 100, then used dim() to count the number of observations (counties).
How many states have a sum that equals exactly 100%? None.
To find this I filtered the previously created state-level object to show only observations that equal exactly 100. There were none, so when I used dim() to count the number of observations, zero rows were returned.
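The race_sum column can also be built without typing each column name, using rowSums() over the six race percentage columns; a sketch of the equivalent mutate():
acs_2015_county_data %>%
  mutate(race_sum = rowSums(across(c(hispanic, white, black, native, asian, pacific))))
# sum the six race percentage columns row by row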
# A
acs_2015_county_data <- acs_2015_county_data %>%
mutate(race_sum = hispanic + white + black + native + asian + pacific)
# create the sum of races column
acs_2015_county_data %>%
slice_min(n = 10, order_by = race_sum) %>%
arrange(race_sum) %>%
select(census_id, county, state, race_sum)
## # A tibble: 10 × 4
## census_id county state race_sum
## <dbl> <chr> <chr> <dbl>
## 1 15001 Hawaii Hawaii 76.4
## 2 15009 Maui Hawaii 79.2
## 3 40097 Mayes Oklahoma 79.7
## 4 15003 Honolulu Hawaii 81.5
## 5 40123 Pontotoc Oklahoma 82.8
## 6 47061 Grundy Tennessee 83
## 7 2282 Yakutat City and Borough Alaska 83.4
## 8 40069 Johnston Oklahoma 84
## 9 15007 Kauai Hawaii 84.1
## 10 40003 Alfalfa Oklahoma 85.1
#create a table of the smallest totals
# B
state_avg_race_sum <- acs_2015_county_data %>%
group_by(state) %>%
summarize(avg_race_sum = mean(race_sum, na.rm = TRUE)) %>%
arrange(avg_race_sum)
# create an object grouped by state averaging race_sum
head(state_avg_race_sum, 1)
## # A tibble: 1 × 2
## state avg_race_sum
## <chr> <dbl>
## 1 Hawaii 84
#showing the head of the data since already arranged by avg_race_sum
# C
race_sum_above_100 <- acs_2015_county_data %>%
filter(race_sum > 100)
# filter the data to only include observations where the race_sum is above 100
dim(race_sum_above_100)
## [1] 11 37
# summarizes numbers of rows and columns
# D
states_with_exact_100_sum <- acs_2015_county_data %>%
group_by(state) %>%
summarize(avg_race_sum = mean(race_sum, na.rm = TRUE)) %>%
filter(avg_race_sum == 100)
# filter the state data to only include observations where the state race sum is 100.
dim(states_with_exact_100_sum)
## [1] 0 2
# summarizes numbers of rows and columns
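Because race_sum is a sum of rounded decimal percentages, an exact == 100 comparison can miss averages that equal 100 only up to floating-point error. dplyr::near() compares with a tolerance; a sketch of the same check written that way:
acs_2015_county_data %>%
  group_by(state) %>%
  summarize(avg_race_sum = mean(race_sum, na.rm = TRUE)) %>%
  filter(near(avg_race_sum, 100)) %>%
  nrow()
# count states whose average race_sum equals 100 within floating-point tolerance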
See code chunk for using min_rank to create the new variable.
Find the 10 highest ranked counties for carpooling. Show the census ID, county name, state, carpool value, and carpool_rank in your final answer. The counties with the highest rank numbers are: Kenedy, Texas; King, Texas; Irion, Texas; Wheeler, Nebraska; New York, New York; Wheeler, Georgia; Emmons, North Dakota; Daniels, Montana; Dundy, Nebraska; Hyde, South Dakota; and Norton city, Virginia.
To answer this question I used top_n() to create a table showing the required details and arrange(desc()) to order it from the largest rank number down. Ties in carpool_rank bring the table to 11 rows.
Find the 10 lowest ranked counties for carpooling. Show the same variables in your final answer. The counties with the lowest rank numbers (ranks 1 through 10) are: Clay, Georgia; LaGrange, Indiana; Jenkins, Georgia; Sevier, Arkansas; Seward, Kansas; Cochran, Texas; Jim Hogg, Texas; Roberts, Texas; Holmes, Ohio; and Powell, Kentucky.
To answer this question I used slice_min() to create a table showing the required details and arrange() to order it from rank 1 to rank 10.
On average, what state is the best ranked for carpooling? Arizona has the best (smallest) average carpool rank.
To answer this question I averaged the carpool rank, grouping by state. Then I used slice_min() to show a table of the best (in this case smallest) average carpool rank.
What are the top 5 states for carpooling? The top 5 states for carpooling are: Arizona, Utah, Arkansas, Hawaii, and Alaska.
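A quick illustration of how min_rank() behaves, since ties are why the part B table below returns 11 rows instead of 10: tied values share the lowest rank in their group and the following rank is skipped. Because the rank is built from -carpool, rank 1 corresponds to the county with the highest carpool percentage.
min_rank(c(10, 20, 20, 30))
# returns 1 2 2 4: the tied values both get rank 2 and rank 3 is skipped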
# A
acs_2015_county_data <- acs_2015_county_data %>%
mutate(carpool_rank = min_rank(-carpool))
#create the carpool rank using min_rank
# B
acs_2015_county_data %>%
top_n(10, carpool_rank) %>%
arrange(desc(carpool_rank)) %>%
select(census_id, county, state, carpool, carpool_rank)
## # A tibble: 11 × 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 48261 Kenedy Texas 0 3141
## 2 48269 King Texas 0 3141
## 3 48235 Irion Texas 0.9 3140
## 4 31183 Wheeler Nebraska 1.3 3139
## 5 36061 New York New York 1.9 3138
## 6 13309 Wheeler Georgia 2.3 3136
## 7 38029 Emmons North Dakota 2.3 3136
## 8 30019 Daniels Montana 2.6 3134
## 9 31057 Dundy Nebraska 2.6 3134
## 10 46069 Hyde South Dakota 2.8 3132
## 11 51720 Norton city Virginia 2.8 3132
#create a table of the top 10 counties by rank
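If exactly 10 rows were required despite the tie at rank 3132, slice_max() with with_ties = FALSE would return only 10, keeping the first of the tied rows in data order; a sketch:
acs_2015_county_data %>%
  slice_max(order_by = carpool_rank, n = 10, with_ties = FALSE) %>%
  select(census_id, county, state, carpool, carpool_rank)
# same table limited to exactly 10 rows by dropping one of the tied counties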
# C
acs_2015_county_data %>%
slice_min(n = 10, order_by = carpool_rank) %>%
arrange(carpool_rank) %>%
select(census_id, county, state, carpool, carpool_rank)
## # A tibble: 10 × 5
## census_id county state carpool carpool_rank
## <dbl> <chr> <chr> <dbl> <int>
## 1 13061 Clay Georgia 29.9 1
## 2 18087 LaGrange Indiana 27 2
## 3 13165 Jenkins Georgia 25.3 3
## 4 5133 Sevier Arkansas 24.4 4
## 5 20175 Seward Kansas 23.4 5
## 6 48079 Cochran Texas 22.8 6
## 7 48247 Jim Hogg Texas 22.6 7
## 8 48393 Roberts Texas 22.4 8
## 9 39075 Holmes Ohio 21.8 9
## 10 21197 Powell Kentucky 21.6 10
# create a table of the bottom 10 counties by rank
# D
state_avg_carpool_rank <- acs_2015_county_data %>%
group_by(state) %>%
summarize(avg_carpool_rank = mean(carpool_rank)) %>%
arrange(avg_carpool_rank)
# calc the avg carpool rank by state
state_avg_carpool_rank %>%
slice_min(n = 1, order_by = avg_carpool_rank) %>%
arrange(avg_carpool_rank)
## # A tibble: 1 × 2
## state avg_carpool_rank
## <chr> <dbl>
## 1 Arizona 971.
# create a table of the top carpool rank (1)
state_avg_carpool_rank %>%
slice_min(n = 5, order_by = avg_carpool_rank) %>%
arrange(avg_carpool_rank)
## # A tibble: 5 × 2
## state avg_carpool_rank
## <chr> <dbl>
## 1 Arizona 971.
## 2 Utah 1019.
## 3 Arkansas 1055.
## 4 Hawaii 1072.
## 5 Alaska 1087.
# create a table of the top carpool rank (5)