(1) Import the data set using a Tidyverse function and NOT with a Base R function. How many rows and columns are in the data set?
library(tidyverse)
library(lubridate)
library(glue)
library(knitr)
getwd()
## [1] "C:/Users/katie/OneDrive - University of Cincinnati/FS20/Second Half/Data Wrangling (BANA 7025)/Week 4"
# Import the data set using a Tidyverse function and NOT with a Base R function. How many rows and columns are in the data set?
county_data <- readr::read_csv('homework3/acs_2015_county_data_revised.csv')
nrow(county_data)
## [1] 3142
ncol(county_data)
## [1] 35
In the data set, there are 3,142 rows and 35 columns.
(2) Do any data types need to be changed? Show any code to change variable types and show code / output for a glimpse() command after you’re finished.
str(county_data)
## tibble [3,142 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ census_id : num [1:3142] 1001 1003 1005 1007 1009 ...
## $ state : chr [1:3142] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr [1:3142] "Autauga" "Baldwin" "Barbour" "Bibb" ...
## $ total_pop : num [1:3142] 55221 195121 26932 22604 57710 ...
## $ men : num [1:3142] 26745 95314 14497 12073 28512 ...
## $ women : num [1:3142] 28476 99807 12435 10531 29198 ...
## $ hispanic : num [1:3142] 2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
## $ white : num [1:3142] 75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
## $ black : num [1:3142] 18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
## $ native : num [1:3142] 0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
## $ asian : num [1:3142] 1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
## $ pacific : num [1:3142] 0 0 0 0 0 0 0 0 0 0 ...
## $ citizen : num [1:3142] 40725 147695 20714 17495 42345 ...
## $ income : num [1:3142] 51281 50254 32964 38678 45813 ...
## $ income_per_cap: num [1:3142] 24974 27317 16824 18431 20532 ...
## $ poverty : num [1:3142] 12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
## $ child_poverty : num [1:3142] 18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
## $ professional : num [1:3142] 33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
## $ service : num [1:3142] 17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
## $ office : num [1:3142] 24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
## $ construction : num [1:3142] 8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
## $ production : num [1:3142] 17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
## $ drive : num [1:3142] 87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
## $ carpool : num [1:3142] 8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
## $ transit : num [1:3142] 0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
## $ walk : num [1:3142] 0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
## $ other_transp : num [1:3142] 1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
## $ work_at_home : num [1:3142] 1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
## $ mean_commute : num [1:3142] 26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
## $ employed : num [1:3142] 23986 85953 8597 8294 22189 ...
## $ private_work : num [1:3142] 73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
## $ public_work : num [1:3142] 20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
## $ self_employed : num [1:3142] 5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
## $ family_work : num [1:3142] 0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
## $ unemployment : num [1:3142] 7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
## - attr(*, "spec")=
## .. cols(
## .. census_id = col_double(),
## .. state = col_character(),
## .. county = col_character(),
## .. total_pop = col_double(),
## .. men = col_double(),
## .. women = col_double(),
## .. hispanic = col_double(),
## .. white = col_double(),
## .. black = col_double(),
## .. native = col_double(),
## .. asian = col_double(),
## .. pacific = col_double(),
## .. citizen = col_double(),
## .. income = col_double(),
## .. income_per_cap = col_double(),
## .. poverty = col_double(),
## .. child_poverty = col_double(),
## .. professional = col_double(),
## .. service = col_double(),
## .. office = col_double(),
## .. construction = col_double(),
## .. production = col_double(),
## .. drive = col_double(),
## .. carpool = col_double(),
## .. transit = col_double(),
## .. walk = col_double(),
## .. other_transp = col_double(),
## .. work_at_home = col_double(),
## .. mean_commute = col_double(),
## .. employed = col_double(),
## .. private_work = col_double(),
## .. public_work = col_double(),
## .. self_employed = col_double(),
## .. family_work = col_double(),
## .. unemployment = col_double()
## .. )
head(county_data)
## # A tibble: 6 x 35
## census_id state county total_pop men women hispanic white black native asian
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab~ Autau~ 55221 26745 28476 2.6 75.8 18.5 0.4 1
## 2 1003 Alab~ Baldw~ 195121 95314 99807 4.5 83.1 9.5 0.6 0.7
## 3 1005 Alab~ Barbo~ 26932 14497 12435 4.6 46.2 46.7 0.2 0.4
## 4 1007 Alab~ Bibb 22604 12073 10531 2.2 74.5 21.4 0.4 0.1
## 5 1009 Alab~ Blount 57710 28512 29198 8.6 87.9 1.5 0.3 0.1
## 6 1011 Alab~ Bullo~ 10678 5660 5018 4.4 22.2 70.7 1.2 0.2
## # ... with 24 more variables: pacific <dbl>, citizen <dbl>, income <dbl>,
## # income_per_cap <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>, walk <dbl>,
## # other_transp <dbl>, work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>
county_data$census_id <- as.character(county_data$census_id)
county_data$state <- as.factor(county_data$state)
It appears that no data types need to be changed except the census_id and state variables. Because the census_id variable will not need any data visualizations or summary statistics and is unique for each record, I will be changing the data type to a character. Additionally, I am going to change the state variable to a factor data type, as this bucket of information is largely categorical in nature.
Any variable that is a double (dbl) type is also numeric.
glimpse(county_data)
## Rows: 3,142
## Columns: 35
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011", "101...
## $ state <fct> Alabama, Alabama, Alabama, Alabama, Alabama, Alabama...
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount", "...
## $ total_pop <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354, 11...
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 56274...
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, 6037...
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5, 7....
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, 57.3...
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40.3, ...
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6, 0....
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3, 0....
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....
## $ citizen <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581, 886...
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229, 417...
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390, 213...
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, 21.6...
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, 37.2...
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, 23.3...
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, 14.5...
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, 26.3...
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 11.5,...
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, 24.4...
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, 85.1...
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11.9, 1...
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2, 0....
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6, 1....
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7, 1....
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5, 1....
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, 25.1...
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 47401, ...
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, 85.1...
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, 12.1...
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9, 4....
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5, 0....
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9, 7.9...
(3) Are there any missing values? How will you handle missing values? Will you impute a missing value with, for example, a mean or median value for the entire column, or will you remove the entire observation? Give a rationale for your decision and show any code/output to handle missing values.
colSums(is.na(county_data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 1 0
## poverty child_poverty professional service office
## 0 1 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
There are only missing values in the income and child_poverty columns, for a total of two missing values. Because these are only two values out of over 3000, I will simply drop these observations entirely. It would not make sense to impute these values with a mean or median, as the two missing values will not greatly impact the data set.
county_data <- drop_na(county_data)
## Check to see if there are any missing values
colSums(is.na(county_data))
## census_id state county total_pop men
## 0 0 0 0 0
## women hispanic white black native
## 0 0 0 0 0
## asian pacific citizen income income_per_cap
## 0 0 0 0 0
## poverty child_poverty professional service office
## 0 0 0 0 0
## construction production drive carpool transit
## 0 0 0 0 0
## walk other_transp work_at_home mean_commute employed
## 0 0 0 0 0
## private_work public_work self_employed family_work unemployment
## 0 0 0 0 0
nrow(county_data)
## [1] 3140
ncol(county_data)
## [1] 35
As evident in the code chunk above, there are now 3,140 rows and 35 columns.
(4) Use the summary() function to examine any unusual values. Are there any? If so, how will you handle these unusual values? Show any code / output to handle unusual values.
summary(county_data)
## census_id state county total_pop
## Length:3140 Texas : 253 Length:3140 Min. : 267
## Class :character Georgia : 159 Class :character 1st Qu.: 11036
## Mode :character Virginia: 133 Mode :character Median : 25793
## Kentucky: 120 Mean : 100801
## Missouri: 115 3rd Qu.: 67620
## Kansas : 105 Max. :10038388
## (Other) :2255
## men women hispanic white
## Min. : 136 Min. : 131 Min. : 0.000 Min. : 0.90
## 1st Qu.: 5551 1st Qu.: 5488 1st Qu.: 1.900 1st Qu.:65.67
## Median : 12838 Median : 12916 Median : 3.700 Median :84.65
## Mean : 49597 Mean : 51204 Mean : 8.819 Mean :77.31
## 3rd Qu.: 33328 3rd Qu.: 34123 3rd Qu.: 9.000 3rd Qu.:93.33
## Max. :4945351 Max. :5093037 Max. :98.700 Max. :99.80
##
## black native asian pacific
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 0.600 1st Qu.: 0.100 1st Qu.: 0.200 1st Qu.: 0.00000
## Median : 2.100 Median : 0.300 Median : 0.500 Median : 0.00000
## Mean : 8.885 Mean : 1.763 Mean : 1.253 Mean : 0.07357
## 3rd Qu.:10.200 3rd Qu.: 0.600 3rd Qu.: 1.200 3rd Qu.: 0.00000
## Max. :85.900 Max. :92.100 Max. :41.600 Max. :11.10000
##
## citizen income income_per_cap poverty
## Min. : 199 Min. : 19328 Min. : 8292 Min. : 1.4
## 1st Qu.: 8276 1st Qu.: 38826 1st Qu.:20470 1st Qu.:12.0
## Median : 19454 Median : 45095 Median :23575 Median :16.0
## Mean : 70849 Mean : 46824 Mean :24331 Mean :16.7
## 3rd Qu.: 50795 3rd Qu.: 52248 3rd Qu.:27138 3rd Qu.:20.3
## Max. :6046749 Max. :123453 Max. :65600 Max. :53.3
##
## child_poverty professional service office
## Min. : 0.00 Min. :13.50 Min. : 5.00 Min. : 4.10
## 1st Qu.:16.10 1st Qu.:26.70 1st Qu.:15.90 1st Qu.:20.20
## Median :22.50 Median :30.00 Median :18.00 Median :22.40
## Mean :23.29 Mean :31.05 Mean :18.25 Mean :22.13
## 3rd Qu.:29.50 3rd Qu.:34.42 3rd Qu.:20.20 3rd Qu.:24.30
## Max. :72.30 Max. :74.00 Max. :36.60 Max. :35.40
##
## construction production drive carpool
## Min. : 1.70 Min. : 0.00 Min. : 5.2 Min. : 0.00
## 1st Qu.: 9.80 1st Qu.:11.50 1st Qu.:76.6 1st Qu.: 8.50
## Median :12.20 Median :15.40 Median :80.6 Median : 9.90
## Mean :12.75 Mean :15.82 Mean :79.1 Mean :10.33
## 3rd Qu.:15.00 3rd Qu.:19.40 3rd Qu.:83.6 3rd Qu.:11.90
## Max. :40.30 Max. :55.60 Max. :94.6 Max. :29.90
##
## transit walk other_transp work_at_home
## Min. : 0.0000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.1000 1st Qu.: 1.400 1st Qu.: 0.90 1st Qu.: 2.800
## Median : 0.4000 Median : 2.400 Median : 1.30 Median : 4.000
## Mean : 0.9681 Mean : 3.294 Mean : 1.61 Mean : 4.694
## 3rd Qu.: 0.8000 3rd Qu.: 4.000 3rd Qu.: 1.90 3rd Qu.: 5.700
## Max. :61.7000 Max. :71.200 Max. :39.10 Max. :37.200
##
## mean_commute employed private_work public_work
## Min. : 4.90 Min. : 166 Min. :29.50 Min. : 5.80
## 1st Qu.:19.30 1st Qu.: 4532 1st Qu.:70.90 1st Qu.:13.07
## Median :22.90 Median : 10657 Median :75.85 Median :16.10
## Mean :23.15 Mean : 46416 Mean :74.45 Mean :17.33
## 3rd Qu.:26.60 3rd Qu.: 29272 3rd Qu.:79.80 3rd Qu.:20.10
## Max. :44.00 Max. :4635465 Max. :88.30 Max. :66.20
##
## self_employed family_work unemployment
## Min. : 0.000 Min. :0.0000 Min. : 0.000
## 1st Qu.: 5.400 1st Qu.:0.1000 1st Qu.: 5.500
## Median : 6.900 Median :0.2000 Median : 7.500
## Mean : 7.922 Mean :0.2917 Mean : 7.815
## 3rd Qu.: 9.400 3rd Qu.:0.3000 3rd Qu.: 9.700
## Max. :36.600 Max. :9.8000 Max. :29.400
##
Looking at the above summary, there are a few variables that have unusual values. When compared to the data dictionary, the employed variable appears to be incorrectly recorded. This is because the data dictionary defines the variable as, “Percentage employed, ages 16+”. However, the Min to Max ranges from 166 to 4,635,465, which is not between 0 and 100 (as a typical percentage would be). During my analysis, I will ignore this variable and / or drop it.
Additionally, more unusual values appear in the form of outliers. Outliers appear to exist in almost every variable, as seen in examples total_pop, men, women, citizen, etc. These outliers could be proven through plots and visualizations in further analysis.
(5) How many counties have more women than men?
county_data %>%
filter(women > men) %>%
nrow()
## [1] 1984
There are 1,984 counties that have more women than men.
(6) How many counties have an unemployment rate lower than 10%?
county_data %>%
filter(unemployment < 10) %>%
nrow()
## [1] 2419
There are 2,419 counties with an unemployment rate lower than 10%.
(7) What are the top 10 counties with the highest mean commute? Show the census ID, county name, state, and the mean_commute in your final answer (sorted by mean_commute).
kable(county_data %>%
select(census_id, county, state, mean_commute) %>%
arrange(desc(mean_commute)) %>%
top_n(10))
| census_id | county | state | mean_commute |
|---|---|---|---|
| 42103 | Pike | Pennsylvania | 44.0 |
| 36005 | Bronx | New York | 43.0 |
| 24017 | Charles | Maryland | 42.8 |
| 51187 | Warren | Virginia | 42.7 |
| 36081 | Queens | New York | 42.6 |
| 36085 | Richmond | New York | 42.6 |
| 51193 | Westmoreland | Virginia | 42.5 |
| 8093 | Park | Colorado | 42.4 |
| 36047 | Kings | New York | 41.7 |
| 54015 | Clay | West Virginia | 41.4 |
The top ten counties with the highest mean commute are seen in the table above.
(8) Create a new variable that calculates the percentage of women for each county and then find the top 10 counties with the lowest percentages. Show the census ID, county name, state,and the percentage in your final answer (sorted by ascending percentage).
kable(county_data %>%
mutate(percentage_of_women = women/total_pop) %>%
select(census_id, county, state, percentage_of_women) %>%
arrange(percentage_of_women) %>%
top_n(-10))
| census_id | county | state | percentage_of_women |
|---|---|---|---|
| 42053 | Forest | Pennsylvania | 0.2677747 |
| 8011 | Bent | Colorado | 0.3136556 |
| 51183 | Sussex | Virginia | 0.3147336 |
| 13309 | Wheeler | Georgia | 0.3210156 |
| 6035 | Lassen | California | 0.3316588 |
| 48095 | Concho | Texas | 0.3328439 |
| 13053 | Chattahoochee | Georgia | 0.3335572 |
| 2013 | Aleutians East Borough | Alaska | 0.3347458 |
| 22125 | West Feliciana | Louisiana | 0.3364904 |
| 32027 | Pershing | Nevada | 0.3372508 |
The top ten counties with the lowest percentage of women are seen in the table above.
(9) Create a new variable that calculates the sum of all race percentage variables (these columns are the “hispanic”, “white”, “black”, “native”, “asian”, and “pacific” variables).
# The new variable is named "percentage_race_total"
county_data <- mutate(county_data, percentage_race_total = hispanic + white + black + native + asian + pacific)
kable(county_data %>%
select(percentage_race_total, everything()) %>%
arrange(percentage_race_total) %>%
top_n(10))
| percentage_race_total | census_id | state | county | total_pop | men | women | hispanic | white | black | native | asian | pacific | citizen | income | income_per_cap | poverty | child_poverty | professional | service | office | construction | production | drive | carpool | transit | walk | other_transp | work_at_home | mean_commute | employed | private_work | public_work | self_employed | family_work | unemployment |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 89.0 | 46121 | South Dakota | Todd | 9942 | 4862 | 5080 | 3.7 | 10.0 | 0.3 | 74.4 | 0.6 | 0.0 | 5915 | 31128 | 11616 | 46.4 | 56.0 | 35.5 | 25.0 | 22.0 | 10.5 | 7.1 | 58.3 | 18.0 | 1.0 | 6.0 | 5.0 | 11.7 | 16.5 | 2901 | 34.4 | 55.0 | 9.8 | 0.8 | 25.4 |
| 93.8 | 2158 | Alaska | Kusilvak Census Area | 7914 | 4200 | 3714 | 1.0 | 4.5 | 0.4 | 87.4 | 0.4 | 0.1 | 4680 | 38229 | 11569 | 33.3 | 39.7 | 29.9 | 22.8 | 22.4 | 14.6 | 10.3 | 5.2 | 3.3 | 0.1 | 47.6 | 39.1 | 4.7 | 5.9 | 2067 | 45.4 | 53.8 | 0.5 | 0.3 | 28.6 |
| 97.6 | 1035 | Alabama | Conecuh | 12865 | 6176 | 6689 | 1.6 | 51.0 | 44.7 | 0.3 | 0.0 | 0.0 | 9930 | 24900 | 15968 | 33.8 | 41.5 | 19.4 | 21.0 | 20.4 | 11.4 | 27.9 | 93.6 | 4.3 | 0.0 | 0.6 | 0.3 | 1.3 | 29.7 | 3718 | 77.8 | 13.0 | 8.3 | 0.8 | 22.6 |
| 98.2 | 46041 | South Dakota | Dewey | 5579 | 2718 | 2861 | 0.9 | 21.7 | 0.0 | 75.6 | 0.0 | 0.0 | 3644 | 37206 | 16216 | 27.8 | 33.7 | 42.4 | 20.1 | 17.4 | 13.7 | 6.5 | 73.1 | 3.1 | 1.9 | 5.2 | 1.6 | 15.2 | 16.3 | 1900 | 34.9 | 49.2 | 14.7 | 1.2 | 24.6 |
| 98.4 | 46102 | South Dakota | Oglala Lakota | 14153 | 6809 | 7344 | 1.0 | 4.8 | 0.5 | 92.1 | 0.0 | 0.0 | 8903 | 26369 | 9150 | 53.3 | 61.5 | 37.2 | 26.0 | 19.7 | 7.2 | 10.0 | 64.1 | 18.5 | 0.5 | 8.7 | 2.3 | 5.8 | 17.7 | 3226 | 29.5 | 66.2 | 4.3 | 0.0 | 28.7 |
| 98.4 | 8025 | Colorado | Crowley | 5551 | 3145 | 2406 | 30.2 | 61.1 | 5.3 | 0.9 | 0.4 | 0.5 | 4169 | 31151 | 10742 | 33.4 | 40.9 | 24.4 | 28.8 | 22.7 | 11.5 | 12.6 | 79.5 | 10.4 | 0.0 | 7.3 | 0.5 | 2.3 | 21.9 | 1457 | 66.7 | 27.3 | 6.0 | 0.0 | 27.0 |
| 98.9 | 46031 | South Dakota | Corson | 4149 | 2123 | 2026 | 0.6 | 30.9 | 0.3 | 66.8 | 0.3 | 0.0 | 2706 | 31676 | 13848 | 45.6 | 56.9 | 43.0 | 14.9 | 18.7 | 15.8 | 7.6 | 63.0 | 5.9 | 1.5 | 8.9 | 1.5 | 19.0 | 19.7 | 1240 | 36.1 | 40.8 | 21.5 | 1.5 | 29.4 |
| 99.0 | 46137 | South Dakota | Ziebach | 2833 | 1391 | 1442 | 3.7 | 23.7 | 0.3 | 70.6 | 0.7 | 0.0 | 1796 | 35119 | 12877 | 39.5 | 47.2 | 48.7 | 12.9 | 17.5 | 15.8 | 5.1 | 65.3 | 5.7 | 0.7 | 6.6 | 1.7 | 20.0 | 19.6 | 948 | 40.9 | 41.7 | 16.6 | 0.8 | 27.4 |
| 99.4 | 28119 | Mississippi | Quitman | 7761 | 3751 | 4010 | 0.9 | 28.3 | 70.2 | 0.0 | 0.0 | 0.0 | 5751 | 24583 | 14503 | 39.5 | 60.8 | 30.1 | 27.0 | 17.0 | 11.9 | 14.1 | 81.2 | 13.0 | 0.2 | 2.9 | 0.1 | 2.5 | 23.7 | 2324 | 68.1 | 24.4 | 7.5 | 0.0 | 25.3 |
| 99.8 | 45005 | South Carolina | Allendale | 9838 | 5225 | 4613 | 2.8 | 23.2 | 73.8 | 0.0 | 0.0 | 0.0 | 7758 | 25327 | 12199 | 29.0 | 48.1 | 17.2 | 26.0 | 22.6 | 9.0 | 25.3 | 81.4 | 11.3 | 0.0 | 3.4 | 0.5 | 3.4 | 27.5 | 2866 | 73.3 | 21.9 | 4.8 | 0.0 | 22.6 |
| 99.9 | 28053 | Mississippi | Humphreys | 8984 | 4278 | 4706 | 2.6 | 22.0 | 75.1 | 0.1 | 0.1 | 0.0 | 6469 | 23216 | 13503 | 39.7 | 49.1 | 24.7 | 17.6 | 17.3 | 13.6 | 26.9 | 77.5 | 17.8 | 0.0 | 2.6 | 0.7 | 1.5 | 21.4 | 2678 | 74.3 | 18.9 | 6.6 | 0.2 | 26.8 |
The top ten counties with the lowest sum of these race percentage variables are seen above.
kable(county_data %>%
select(percentage_race_total, everything()) %>%
group_by(state) %>%
summarise(avg = mean(percentage_race_total)) %>%
arrange(avg) %>%
top_n(1))
| state | avg |
|---|---|
| Mississippi | 99.18415 |
On average, Mississippi has the lowest sum of these race percentage variables as seen above.
county_data %>%
filter(percentage_race_total > 100) %>%
nrow()
## [1] 11
There are 11 counties that have a sum greater than 100%.
county_data %>%
filter(percentage_race_total == 100) %>%
nrow()
## [1] 27
There are 27 counties that have a sum equal to 100%.
(10) Using the carpool variable,
dplyr::min_rank() function to create a new variable called carpool_rank where the highest ranked county (rank = 1) is the county with the highest carpool value. Read the documentation carefully for the ranking function.county_data$carpool_rank = min_rank(-(county_data$carpool))
kable(county_data %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(-10, wt=carpool_rank))
| census_id | county | state | carpool | carpool_rank |
|---|---|---|---|---|
| 13061 | Clay | Georgia | 29.9 | 1 |
| 18087 | LaGrange | Indiana | 27.0 | 2 |
| 13165 | Jenkins | Georgia | 25.3 | 3 |
| 5133 | Sevier | Arkansas | 24.4 | 4 |
| 20175 | Seward | Kansas | 23.4 | 5 |
| 48079 | Cochran | Texas | 22.8 | 6 |
| 48247 | Jim Hogg | Texas | 22.6 | 7 |
| 48393 | Roberts | Texas | 22.4 | 8 |
| 39075 | Holmes | Ohio | 21.8 | 9 |
| 21197 | Powell | Kentucky | 21.6 | 10 |
The ten highest ranked counties for carpooling can be seen above.
kable(county_data %>%
select(census_id, county, state, carpool, carpool_rank) %>%
arrange(carpool_rank) %>%
top_n(10, wt=carpool_rank))
| census_id | county | state | carpool | carpool_rank |
|---|---|---|---|---|
| 46069 | Hyde | South Dakota | 2.8 | 3130 |
| 51720 | Norton city | Virginia | 2.8 | 3130 |
| 30019 | Daniels | Montana | 2.6 | 3132 |
| 31057 | Dundy | Nebraska | 2.6 | 3132 |
| 13309 | Wheeler | Georgia | 2.3 | 3134 |
| 38029 | Emmons | North Dakota | 2.3 | 3134 |
| 36061 | New York | New York | 1.9 | 3136 |
| 31183 | Wheeler | Nebraska | 1.3 | 3137 |
| 48235 | Irion | Texas | 0.9 | 3138 |
| 48261 | Kenedy | Texas | 0.0 | 3139 |
| 48269 | King | Texas | 0.0 | 3139 |
The ten lowest ranked counties for carpooling can be seen above.
kable(county_data %>%
group_by(state) %>%
summarise(average_rank = mean(carpool_rank)) %>%
arrange(average_rank) %>%
top_n(-1))
| state | average_rank |
|---|---|
| Hawaii | 651.25 |
On average, the state best ranked for carpooling is Hawaii.
kable(county_data %>%
group_by(state) %>%
summarise(average_rank = mean(carpool_rank)) %>%
arrange(average_rank) %>%
top_n(-5))
| state | average_rank |
|---|---|
| Hawaii | 651.250 |
| Arizona | 970.400 |
| Utah | 1018.931 |
| Arkansas | 1054.307 |
| Alaska | 1086.379 |
On average, the best top five states for carpooling can be seen above.